将武侠小说的列表、章节和正文以 JSON 格式保存为本地文件,支持重复运行时的增量爬取。
代码如下:
import requests
from requests.adapters import HTTPAdapter
from requests.packages.urllib3.util.retry import Retry
from lxml import etree
import os
import json
import time
import random
# Root directory (relative to the working directory) that receives every downloaded file.
main_dir = 'tianyabooks.com'

# Retry policy shared by all requests: up to 5 retries with a small back-off,
# also retrying when the server answers with one of the listed 5xx statuses.
retry = Retry(
    total=5,
    backoff_factor=0.1,
    status_forcelist=[500, 502, 503, 504],
)
adapter = HTTPAdapter(max_retries=retry)

# A single session is reused for the whole crawl; the retrying adapter is
# installed for both plain and TLS URLs.
session = requests.Session()
for _scheme in ('http://', 'https://'):
    session.mount(_scheme, adapter)
# Fetch a page and parse it (every request is preceded by a random 1-5 second pause).
def get_html_tree(url, encoding='utf-8'):
    """Download *url* through the shared session and return it parsed as HTML.

    Args:
        url: Address of the page to fetch.
        encoding: Character encoding used to decode the response body.

    Returns:
        The value returned by ``etree.HTML`` for the decoded page text.
    """
    # Throttle every network request with a short random pause.
    sep = random.randint(1, 5)
    print(str(sep) + '秒后请求网址:' + url)
    time.sleep(sep)
    headers = {
        # The HTTP header is spelled "User-Agent" (hyphen). The former
        # "User_Agent" key was sent as an unrelated header, so the browser
        # identity below was never actually presented to the server.
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/112.0.0.0 Safari/537.36'
    }
    response = session.get(url=url, headers=headers, timeout=300)
    # Decode the body with the caller's encoding rather than the one guessed
    # from the response.
    response.encoding = encoding
    return etree.HTML(response.text)
# Fetch one page of the wuxia novel index.
def get_wuxia_article_list(page_index, page_url):
    """Collect the novels linked from one index page and cache them as JSON.

    Args:
        page_index: Page number, used only to name the cache file.
        page_url: Address of the index page to download.

    Returns:
        A list of ``{'article_url', 'article_title'}`` dicts, one per title
        link found on the page. When the page has no title links an empty
        list is returned and no cache file is written.
    """
    page_tree = get_html_tree(page_url)
    title_links = page_tree.xpath("//div[@class='listbox']//ul[@class='e2']//li/a[@class='title']")
    # Nothing on this page: return without creating a cache file.
    if not title_links:
        return []
    class_data = [
        {
            'article_url': link.xpath('./@href')[0],
            # The title may be split over several text nodes; glue them together.
            'article_title': ''.join(link.xpath('.//text()')),
        }
        for link in title_links
    ]
    print(class_data)
    cache_path = './' + main_dir + '/武侠小说/' + 'article_list_page_' + str(page_index) + '.json'
    with open(cache_path, 'w') as fp:
        fp.write(json.dumps(class_data))
    return class_data
# Fetch a novel's landing page: metadata plus its chapter list.
def get_wuxia_article(article_url, article_title):
    """Download a novel's info page and store it as ``article_info.json``.

    The novel's directory (named after *article_title*) is created when it
    does not exist yet.

    Args:
        article_url: Full address of the novel's landing page.
        article_title: Title from the index page, used as the directory name.

    Returns:
        A dict with ``article_url``, ``article_title``, ``article_author``,
        ``description`` and ``chapter_list`` (a list of
        ``{'chapter_url', 'chapter_name'}`` dicts).

    Raises:
        IndexError: If the book container, its ``h1``/``h2`` text, or a
            chapter link's ``href``/text is missing from the page.
    """
    book_dir = './' + main_dir + '/武侠小说/' + article_title
    if not os.path.exists(book_dir):
        os.mkdir(book_dir)
    info_path = book_dir + '/article_info.json'

    page_tree = get_html_tree(article_url)
    book_div = page_tree.xpath("//div[@id='main']//div[@class='book']")[0]
    title = book_div.xpath('./h1/text()')[0]
    author = book_div.xpath('./h2/text()')[0]
    # The description is optional; only its first text node is kept.
    intro_nodes = book_div.xpath("./div[@class='description']/p/text()")
    description = intro_nodes[0] if len(intro_nodes) > 0 else ''
    chapter_list = [
        {
            'chapter_url': link.xpath('./@href')[0],
            'chapter_name': link.xpath('./text()')[0],
        }
        for link in book_div.xpath('./dl/dd/a')
    ]

    json_data = {
        'article_url': article_url,
        'article_title': title,
        'article_author': author,
        'description': description,
        'chapter_list': chapter_list,
    }
    print(json_data)
    with open(info_path, 'w') as fp:
        fp.write(json.dumps(json_data))
    return json_data
# Characters that are not allowed in a Windows file name; the set also contains
# "/", which is the path separator on every platform.
_INVALID_FILENAME_CHARS = str.maketrans('', '', '\\/:*?"<>|')


# Download one chapter and store it as JSON next to the novel's info file.
def get_wuxia_chapter(chapter_url, article_title, chapter_name):
    """Fetch a chapter page and save its paragraphs to ``<chapter_name>.json``.

    Characters that cannot be part of a file name are removed from
    *chapter_name* before it is used (the cleaned name is also what gets
    stored in the JSON). Previously only ``?`` and ``*`` were removed, so a
    title containing e.g. ``/`` or ``:`` made ``open()`` fail and stopped the
    crawl. Names that contained only ``?`` or ``*`` still map to the same file
    as before.

    Args:
        chapter_url: Full address of the chapter page.
        article_title: Novel title; names the directory the file goes into.
        chapter_name: Chapter title as listed on the novel's page.

    Returns:
        A dict with ``chapter_url``, ``chapter_name`` and ``content`` (the
        list of text nodes extracted from the page), or ``None`` when the
        chapter file already exists and nothing was downloaded.
    """
    chapter_name = chapter_name.translate(_INVALID_FILENAME_CHARS)
    chapter_filename = './' + main_dir + '/武侠小说/' + article_title + '/' + chapter_name + '.json'
    # Incremental crawl: a chapter that is already on disk is not fetched again.
    if os.path.exists(chapter_filename):
        print(chapter_filename + '文件已存在,不再重复下载!')
        return
    # Chapter pages are decoded as GB2312, unlike the UTF-8 default used elsewhere.
    # NOTE(review): if saved text shows replacement characters the pages may
    # actually be GBK/GB18030 - confirm against the site before changing.
    html_tree = get_html_tree(chapter_url, 'GB2312')
    content = html_tree.xpath('//table[4]//p/text()')
    json_data = {
        'chapter_url': chapter_url,
        'chapter_name': chapter_name,
        'content': content
    }
    print(json_data)
    with open(chapter_filename, 'w') as fp:
        fp.write(json.dumps(json_data))
    return json_data
# Crawl the wuxia category: index pages -> novels -> chapters.
def get_wuxia(start_page=1, end_page=39):
    """Download every novel listed on the wuxia index pages.

    The crawl is incremental at three levels: a cached index page is read
    from disk instead of the network, a cached ``article_info.json`` is
    reused instead of re-fetching the novel page, and ``get_wuxia_chapter``
    skips chapters whose file already exists.

    A novel whose ``article_info.json`` exists is no longer skipped outright.
    That file is written before any chapter is fetched, so skipping on its
    presence left interrupted novels permanently incomplete. The chapter list
    is now always walked, and only the missing chapters hit the network.

    Args:
        start_page: First index page to process (inclusive).
        end_page: Last index page to process (inclusive). The defaults cover
            pages 1 to 39, the range that used to be hard-coded.
    """
    print('开始获取武侠小说================')
    category_dir = './' + main_dir + '/武侠小说'
    # Also creates the site root when this function is called on its own.
    os.makedirs(category_dir, exist_ok=True)
    base_url = 'https://wx.tianyabooks.com/'
    # Index pages are named list_<n>.html.
    for i in range(start_page, end_page + 1):
        page_url = base_url + 'book/list_' + str(i) + '.html'
        print(page_url)
        page_index_filename = category_dir + '/' + 'article_list_page_' + str(i) + '.json'
        if os.path.exists(page_index_filename):
            with open(page_index_filename, 'r') as f:
                article_list = json.load(f)
            print(page_index_filename + '武侠小说分页文章列表文件存在,直接读取')
        else:
            print(page_index_filename + '从网页获取武侠小说第【' + str(i) + '】页下文章列表')
            article_list = get_wuxia_article_list(i, page_url)
        article_list_len = len(article_list)
        for article_index, article in enumerate(article_list, start=1):
            article_url = article['article_url']
            article_title = article['article_title']
            print('【' + str(article_index) + '/' + str(article_list_len) + '】开始获取【' + article_title + '】文章数据:')
            article_filename = category_dir + '/' + article_title + '/article_info.json'
            article_info = None
            if os.path.exists(article_filename):
                try:
                    with open(article_filename, 'r') as f:
                        article_info = json.load(f)
                except ValueError:
                    # Unreadable (e.g. truncated) cache file: fetch the page again.
                    article_info = None
                else:
                    print(article_filename + '文件已存在,不再重复下载!')
            if article_info is None:
                article_info = get_wuxia_article(base_url + article_url, article_title)
            chapter_list = article_info['chapter_list']
            chapter_list_len = len(chapter_list)
            for chapter_list_index, chapter in enumerate(chapter_list, start=1):
                print('【' + str(chapter_list_index) + '/' + str(chapter_list_len) + '】开始获取小说【' + article_title + '】-【' + chapter['chapter_name'] + '】的章节数据')
                # Chapter links are appended to the novel's own URL.
                get_wuxia_chapter(base_url + article_url + chapter['chapter_url'], article_title, chapter['chapter_name'])
# Script entry point.
if __name__ == '__main__':
    # Make sure the download root for this site exists before crawling.
    site_root = './' + main_dir
    if not os.path.exists(site_root):
        os.mkdir(site_root)
    # Crawl the wuxia category.
    get_wuxia()
|