novel

spider

发布日期: 2018-05-11

阅读次数:

爬取到小说的具体内容后，需要进行过滤处理，完整的代码如下：

# -*- coding:UTF-8 -*-
    from bs4 import BeautifulSoup
    import requests

    if __name__ == "__main__":
        # Page to download.
        target = 'http://www.biqukan.com/1_1094/5403177.html'
        req = requests.get(url=target)
        html = req.text
        # Name the parser explicitly: with no parser argument bs4 picks
        # whichever one happens to be installed (and warns about it), so the
        # parse result can differ between machines.
        bf = BeautifulSoup(html, 'html.parser')
        # Collect every <div class="showtxt">; only the first one is printed.
        texts = bf.find_all('div', class_='showtxt')
        if texts:
            # Replace each run of eight non-breaking spaces with a blank line
            # (presumably the site's paragraph indent -- verify on the page).
            print(texts[0].text.replace('\xa0' * 8, '\n\n'))
        else:
            # Report the missing element instead of raising IndexError.
            print('No <div class="showtxt"> found in ' + target)

正文

import json
from bs4 import BeautifulSoup
import requests


def strips(contents):
    """Return the stripped form of every fourth element of ``contents``.

    Args:
        contents: A sliceable sequence whose selected items provide a
            ``strip()`` method.

    Returns:
        A new list with ``item.strip()`` for the items at indices
        0, 4, 8, ... of ``contents``.
    """
    # Taking every second element twice is the same as a stride of four.
    every_fourth = contents[::4]
    return [node.strip() for node in every_fourth]

# Save the data as JSON
def save_json(article, num):
    """Serialize ``article`` to JSON and write it to ``<num>.json``.

    Args:
        article: A JSON-serializable object.
        num: Value used as the file name stem; the file is created in the
            current working directory.

    Non-ASCII characters are written as-is (not ``\\u`` escaped) and the
    file is encoded as UTF-8.
    """
    # Serialize before opening the file so a serialization error does not
    # leave an empty file behind.
    payload = json.dumps(article, ensure_ascii=False)
    file_name = f'{num}.json'
    with open(file_name, mode='w', encoding='utf-8') as out_file:
        out_file.write(payload)

# Fetch the page HTML
def get_one_page(url, timeout=10):
    """Download ``url`` and return its body decoded as UTF-8.

    Args:
        url: Address of the page to fetch.
        timeout: Seconds to wait for the server before giving up. Without
            a timeout ``requests.get`` may block indefinitely on an
            unresponsive server.

    Returns:
        The response body decoded as UTF-8 when the status code is 200,
        otherwise ``None``.

    Raises:
        requests.exceptions.RequestException: On connection failures,
            including a timeout.
        UnicodeDecodeError: If the body is not valid UTF-8.
    """
    # Send a browser User-Agent header with the request.
    headers = {
        "User-Agent": "Mozilla/5.0 (Windows NT 6.1; Win64; x64; rv:2.0b13pre) Gecko/20110307 Firefox/4.0b13pre"
    }
    response = requests.get(url, headers=headers, timeout=timeout)
    if response.status_code != 200:
        return None
    return response.content.decode('utf-8')

def parse_with_bs4(html):
    """Extract the title and content from a page.

    Args:
        html: Page markup, parsed with the ``lxml`` parser.

    Returns:
        A one-element list holding a dict with two keys: ``'title'`` (the
        ``.string`` of the first ``.bookname h1`` match) and ``'content'``
        (the child nodes of the first ``#content`` match after being
        passed through ``strips``).

    Raises:
        IndexError: If either selector matches nothing.
    """
    document = BeautifulSoup(html, 'lxml')
    heading = document.select('.content_read .box_con .bookname h1')[0]
    chapter_title = heading.string
    body_nodes = document.select('.content_read .box_con #content')[0].contents
    return [{'title': chapter_title, 'content': strips(body_nodes)}]

def next_url(html):
    """Build an absolute URL from the fourth ``.bottem1`` link of a page.

    Args:
        html: Page markup, parsed with the ``lxml`` parser.

    Returns:
        ``'http://www.xbiquge.la'`` followed by the ``href`` of the fourth
        link found under ``.bookname .bottem1``.

    Raises:
        IndexError: If fewer than four links match the selector.
    """
    document = BeautifulSoup(html, 'lxml')
    links = document.select('.content_read .box_con .bookname .bottem1 a')
    # Route suffix of the next chapter
    suffix = links[3]['href']
    return 'http://www.xbiquge.la' + suffix

def main():
    """Fetch pages in sequence and save each one as ``<n>.json``.

    Starts at a fixed URL, saves each parsed page under an increasing
    number beginning at 1, and follows the link returned by ``next_url``.
    The loop stops when that link equals ``http://www.xbiquge.la/2/2823/``
    (presumably the book's index page, i.e. no further chapter), or when a
    page cannot be fetched.
    """
    num = 1
    url = "http://www.xbiquge.la/2/2823/1790661.html"
    while True:
        html = get_one_page(url)
        if html is None:
            # get_one_page returns None for any non-200 response; stop here
            # instead of handing None to the HTML parser.
            print('Failed to fetch {}; stopping.'.format(url))
            break
        article = parse_with_bs4(html)
        save_json(article, num)
        url = next_url(html)
        if url == 'http://www.xbiquge.la/2/2823/':
            break
        num += 1

# Run main() only when this file is executed as a script, not when imported.
if __name__ == '__main__':
    main()

转载请注明: 微笑看你 novel

practice——jd

数据库字段创建 create database jddb default character set=utf8; use jddb; create table jd ( id integer auto_increment pri

2018-05-11 spider

sqlalchemy

shop_mo

蘑菇街agent_list.py import random def get_random_agent(): agent_list = [ "Mozilla/4.0 (compatible; MSIE 6.0; Wind

2018-05-09 spider

agent

novel

爬取到小说的具体内容，要进行滤除处理，完整的代码

你的赏识是我前进的动力