novel
爬取到小说的具体内容后，需要进行过滤处理。完整的代码如下：
# -*- coding:UTF-8 -*-
from bs4 import BeautifulSoup
import requests
if __name__ == "__main__":
    # Demo: download one chapter page and print the text of its body <div>.
    target = 'http://www.biqukan.com/1_1094/5403177.html'
    # A timeout keeps the script from hanging forever on a stalled server.
    req = requests.get(url=target, timeout=10)
    html = req.text
    # Name the parser explicitly so the result does not depend on which
    # optional parsers happen to be installed (and to avoid bs4's warning).
    bf = BeautifulSoup(html, 'html.parser')
    texts = bf.find_all('div', class_='showtxt')
    if not texts:
        raise SystemExit("no <div class='showtxt'> found in " + target)
    # Each run of eight non-breaking spaces becomes a blank line
    # (presumably these are the paragraph indents used by the site).
    print(texts[0].text.replace('\xa0' * 8, '\n\n'))
正文
import json
from bs4 import BeautifulSoup
import requests
def strips(contents):
    """Return the stripped text of every fourth item of ``contents``.

    Args:
        contents: A sliceable sequence; every selected item must provide a
            ``strip()`` method (e.g. ``str``).

    Returns:
        A new list holding ``item.strip()`` for the items at indices
        0, 4, 8, ... of ``contents``.
    """
    # Only indices 0, 4, 8, ... are kept; the skipped items are presumably
    # the <br> tags / whitespace between paragraphs -- confirm against the
    # page markup if the site layout changes.
    return [item.strip() for item in contents[::4]]
# Save the scraped data as a JSON file
def save_json(article, num):
    """Write ``article`` as JSON to the file ``<num>.json``.

    Args:
        article: Any JSON-serializable object.
        num: Value whose ``str()`` form, followed by ``.json``, is the
            output path (relative paths resolve against the working
            directory).

    The file is written as UTF-8 and non-ASCII characters are stored as-is
    rather than as ``\\uXXXX`` escapes.
    """
    with open(str(num) + '.json', 'w', encoding='utf-8') as f:
        json.dump(article, f, ensure_ascii=False)
# Fetch the HTML of a page
def get_one_page(url, timeout=10):
    """Download ``url`` and return its body decoded as UTF-8.

    Args:
        url: Address of the page to fetch.
        timeout: Seconds to wait for the server; passed to ``requests.get``
            so a stalled connection raises instead of blocking forever.

    Returns:
        The decoded page text when the response status is 200,
        otherwise ``None``.
    """
    headers = {
        "User-Agent": "Mozilla/5.0 (Windows NT 6.1; Win64; x64; rv:2.0b13pre) Gecko/20110307 Firefox/4.0b13pre"
    }
    response = requests.get(url, headers=headers, timeout=timeout)
    if response.status_code != 200:
        return None
    # Decode the raw bytes as UTF-8 explicitly.
    return response.content.decode('utf-8')
def parse_with_bs4(html):
    """Extract the chapter title and paragraph texts from a chapter page.

    Args:
        html: Markup of the chapter page.

    Returns:
        A one-element list containing a dict with the keys ``'title'``
        (string of the first matching ``<h1>``) and ``'content'`` (the
        result of ``strips`` applied to the children of ``#content``).

    Raises:
        IndexError: If either CSS selector matches nothing in ``html``.
    """
    soup = BeautifulSoup(html, 'lxml')
    title = soup.select('.content_read .box_con .bookname h1')[0].string
    body_nodes = soup.select('.content_read .box_con #content')[0].contents
    return [{'title': title, 'content': strips(body_nodes)}]
def next_url(html):
    """Return the absolute URL taken from the page's fourth navigation link.

    Args:
        html: Markup of the chapter page.

    Returns:
        The link's ``href`` resolved against ``http://www.xbiquge.la``.

    Raises:
        IndexError: If the navigation bar has fewer than four links.
    """
    # Local import keeps this function self-contained.
    from urllib.parse import urljoin

    soup = BeautifulSoup(html, 'lxml')
    # Route suffix of the next chapter: the fourth <a> in the navigation bar.
    href = soup.select('.content_read .box_con .bookname .bottem1 a')[3]['href']
    # urljoin gives the same result as plain concatenation for hrefs that
    # start with "/", and also copes with absolute or slash-less hrefs.
    return urljoin('http://www.xbiquge.la', href)
def main():
    """Crawl chapters one after another and save each as ``<n>.json``.

    Starts at a fixed chapter URL, numbers the output files from 1, and
    follows each page's "next" link until it points back to the book's
    index page or a page cannot be fetched.
    """
    # The last chapter's "next" link leads to the index page; that ends the crawl.
    index_url = 'http://www.xbiquge.la/2/2823/'
    url = "http://www.xbiquge.la/2/2823/1790661.html"
    num = 1
    while url != index_url:
        html = get_one_page(url)
        if html is None:
            # get_one_page returns None for a non-200 response; stop here
            # instead of handing None to the HTML parser.
            print('Failed to fetch ' + url + '; stopping.')
            break
        save_json(parse_with_bs4(html), num)
        url = next_url(html)
        num += 1


if __name__ == '__main__':
    main()