Python爬取武侠小说

ymsn2023 发表于 2023-8-2 13:00

将武侠小说列表，章节，内容保存成json格式内容，转存成文件，保存到本地。支持重复增量爬取

代码如下
import requests
from requests.adapters import HTTPAdapter
from requests.packages.urllib3.util.retry import Retry

from lxml import etree
import os
import json
import time
import random

# Root directory under which every downloaded file is stored.
main_dir = 'tianyabooks.com'

# One shared session so all requests reuse connections and the retry policy.
session = requests.Session()
# Retry failed requests up to 5 times with a short backoff.
# NOTE(review): the original value of ``status_forcelist`` was lost when the
# code was pasted (it read ``status_forcelist=`` with nothing after it, which
# is a syntax error). The usual transient server errors are assumed here --
# confirm or adjust for the target site.
retry = Retry(total=5, backoff_factor=0.1, status_forcelist=[500, 502, 503, 504])
adapter = HTTPAdapter(max_retries=retry)
session.mount('http://', adapter)
session.mount('https://', adapter)

# Fetch a page and parse it (each request is preceded by a random 1-5 s pause).
def get_html_tree(url, encoding='utf-8'):
    """Download *url* and return it parsed as an lxml HTML tree.

    Args:
        url: Address of the page to fetch.
        encoding: Encoding used to decode the response body.

    Returns:
        Whatever ``etree.HTML`` produces for the decoded response text.
    """
    # Throttle every network request with a random pause.
    sep = random.randint(1, 5)
    print(str(sep) + '秒后请求网址：' + url)
    time.sleep(sep)

    headers = {
        # The header name is spelled with a hyphen. The previous key
        # 'User_Agent' was sent as an unrelated header, so the browser
        # user-agent string never replaced the library default.
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/112.0.0.0 Safari/537.36'
    }

    response = session.get(url=url, headers=headers, timeout=300)
    # Force the caller-supplied encoding before reading ``.text``.
    response.encoding = encoding
    return etree.HTML(response.text)

# Fetch one page of the wuxia novel listing.
def get_wuxia_article_list(page_index, page_url):
    """Scrape one listing page and cache the result as JSON.

    Args:
        page_index: Page number, used only to name the cache file.
        page_url: Address of the listing page.

    Returns:
        A list of ``{'article_url': str, 'article_title': str}`` dicts. The
        list is empty (and no file is written) when the page has no entries.
    """
    html_tree = get_html_tree(page_url)

    # Every novel on the page is an <a class="title"> inside the list box.
    article_a_list = html_tree.xpath('//div[@class=\'listbox\']//ul[@class=\'e2\']//li/a[@class=\'title\']')

    class_data = []
    # Nothing found: return without writing a cache file.
    if not article_a_list:
        return class_data

    for article_a in article_a_list:
        # xpath() returns a list; the caller concatenates the URL with a
        # string, so store the first href rather than the list itself.
        hrefs = article_a.xpath('./@href')
        if not hrefs:
            # A link without an href cannot be followed.
            continue
        class_data.append({
            'article_url': hrefs[0],
            'article_title': ''.join(article_a.xpath('.//text()')),
        })

    print(class_data)

    list_filename = './' + main_dir + '/武侠小说/' + 'article_list_page_' + str(page_index) + '.json'
    with open(list_filename, 'w', encoding='utf-8') as fp:
        fp.write(json.dumps(class_data))

    return class_data

# Return the first item of an xpath() result list, or *default* when empty.
def _first(values, default=''):
    return values[0] if values else default


# Fetch a novel's metadata and its chapter list.
def get_wuxia_article(article_url, article_title):
    """Scrape a novel's index page and save it as ``article_info.json``.

    Args:
        article_url: Address of the novel's index page.
        article_title: Novel title, used as the directory name.

    Returns:
        A dict with ``article_url``, ``article_title``, ``article_author``,
        ``description`` and ``chapter_list`` (a list of dicts with string
        ``chapter_url`` and ``chapter_name``).

    Raises:
        ValueError: If the page does not contain the expected book container.
    """
    article_dirname = './' + main_dir + '/武侠小说/' + article_title
    os.makedirs(article_dirname, exist_ok=True)

    article_filename = article_dirname + '/article_info.json'

    html_tree = get_html_tree(article_url)

    # xpath() returns a list of matches; work on the first matching element.
    book_divs = html_tree.xpath('//div[@id=\'main\']//div[@class=\'book\']')
    if not book_divs:
        # Fail before writing article_info.json so the article is not
        # recorded as fetched.
        raise ValueError('book container not found on page: ' + article_url)
    book_div = book_divs[0]

    title = _first(book_div.xpath('./h1/text()'))
    author = _first(book_div.xpath('./h2/text()'))
    description = _first(book_div.xpath('./div[@class=\'description\']/p/text()'))

    chapter_list = []
    for chapter in book_div.xpath('./dl/dd/a'):
        hrefs = chapter.xpath('./@href')
        if not hrefs:
            # A chapter link without an href cannot be downloaded.
            continue
        # Callers concatenate both values with strings, so store strings.
        chapter_list.append({
            'chapter_url': hrefs[0],
            'chapter_name': _first(chapter.xpath('./text()')),
        })

    json_data = {
        'article_url': article_url,
        'article_title': title,
        'article_author': author,
        'description': description,
        'chapter_list': chapter_list
    }
    print(json_data)

    with open(article_filename, 'w', encoding='utf-8') as fp:
        fp.write(json.dumps(json_data))

    return json_data

# Translation table that deletes characters not allowed in file names
# ('/' on every platform, the rest on Windows).
_ILLEGAL_FILENAME_CHARS = str.maketrans('', '', '\\/:*?"<>|')


# Fetch the text of a single chapter.
def get_wuxia_chapter(chapter_url, article_title, chapter_name):
    """Download one chapter and save it as ``<chapter_name>.json``.

    Args:
        chapter_url: Address of the chapter page.
        article_title: Novel title; names the directory the file goes in.
        chapter_name: Chapter title; used, after sanitising, as the file name.

    Returns:
        A dict with ``chapter_url``, ``chapter_name`` (sanitised) and
        ``content`` (the list of text nodes found), or ``None`` when the
        chapter file already exists and the download is skipped.
    """
    # Strip every character that would make the file name invalid, not only
    # '?' and '*'.
    chapter_name = chapter_name.translate(_ILLEGAL_FILENAME_CHARS)
    chapter_filename = './' + main_dir + '/武侠小说/' + article_title + '/' + chapter_name + '.json'
    if os.path.exists(chapter_filename):
        print(chapter_filename + '文件已存在，不再重复下载！')
        return

    # Chapter pages are decoded as GB2312 instead of the default UTF-8.
    html_tree = get_html_tree(chapter_url, 'GB2312')

    content = html_tree.xpath('//table//p/text()')

    json_data = {
        'chapter_url': chapter_url,
        'chapter_name': chapter_name,
        'content': content
    }
    print(json_data)

    with open(chapter_filename, 'w', encoding='utf-8') as fp:
        fp.write(json.dumps(json_data))

    return json_data

# Crawl every wuxia novel.
def get_wuxia():
    """Crawl listing pages 1-39, then every novel and chapter they link to.

    Listing pages and novel index pages already saved as JSON are read from
    disk instead of being requested again. Chapters are always passed to
    ``get_wuxia_chapter``, so a crawl interrupted part-way through a novel
    picks up the missing chapters on the next run.
    """
    print('开始获取武侠小说================')

    base_dir = './' + main_dir + '/武侠小说'
    os.makedirs(base_dir, exist_ok=True)

    # Listing pages are numbered list_1.html, list_2.html, ...
    for i in range(1, 40):
        page_url = 'https://wx.tianyabooks.com/book/list_' + str(i) + '.html'
        print(page_url)

        page_index_filename = base_dir + '/' + 'article_list_page_' + str(i) + '.json'
        if os.path.exists(page_index_filename):
            with open(page_index_filename, 'r', encoding='utf-8') as f:
                article_list = json.load(f)
            print(page_index_filename + '武侠小说分页文章列表文件存在，直接读取')
        else:
            print(page_index_filename + '从网页获取武侠小说第【' + str(i) + '】页下文章列表')
            article_list = get_wuxia_article_list(i, page_url)

        # Walk the novels listed on this page.
        article_list_len = len(article_list)
        for article_index, article in enumerate(article_list, start=1):
            article_url = article['article_url']
            article_title = article['article_title']
            print('【' + str(article_index) + '/' + str(article_list_len) + '】开始获取【' + article_title + '】文章数据:')

            article_filename = base_dir + '/' + article_title + '/article_info.json'
            if os.path.exists(article_filename):
                print(article_filename + '文件已存在，不再重复下载！')
                # Reuse the saved index rather than skipping the novel:
                # article_info.json is written before any chapter, so its
                # presence does not mean every chapter was downloaded.
                with open(article_filename, 'r', encoding='utf-8') as f:
                    article_info = json.load(f)
            else:
                article_info = get_wuxia_article('https://wx.tianyabooks.com/' + article_url, article_title)

            chapter_list = article_info['chapter_list']
            chapter_list_len = len(chapter_list)
            for chapter_list_index, chapter in enumerate(chapter_list, start=1):
                print('【' + str(chapter_list_index) + '/' + str(chapter_list_len) + '】开始获取小说【' + article_title + '】-【' + chapter['chapter_name'] + '】的章节数据')

                get_wuxia_chapter('https://wx.tianyabooks.com/' + article_url + chapter['chapter_url'], article_title, chapter['chapter_name'])

# Script entry point.
if __name__ == '__main__':
    # Make sure the storage directory for this site exists before crawling.
    os.makedirs('./' + main_dir, exist_ok=True)

    # Crawl the wuxia novels.
    get_wuxia()

xz91168 发表于 2023-8-3 11:18

C:\Users\Administrator\AppData\Local\Microsoft\WindowsApps\python3.11.exe C:\Users\Administrator\PycharmProjects\pythonProject\main.py
Traceback (most recent call last):
File "C:\Users\Administrator\PycharmProjects\pythonProject\main.py", line 1, in <module>
import requests
ModuleNotFoundError: No module named 'requests'

进程已结束,退出代码1

kkkkkkkkn 发表于 2023-8-4 17:57

xz91168 发表于 2023-8-3 11:18
C:\Users\Administrator\AppData\Local\Microsoft\WindowsApps\python3.11.exe C:\Users\Administrator\Pyc ...

您没有安装 requests 模块（可在命令行执行 pip install requests 安装；这段代码还用到了 lxml，同样需要 pip install lxml）。requests 是一个非常流行的用于发送 HTTP 请求的库，它提供了简洁而友好的 API，使得发送 HTTP 请求变得非常容易。

OfficeDK 发表于 2023-8-2 15:58

好，学习了！！！

rjqg2023 发表于 2023-8-2 15:58

放入python里直接运行就行吗？

坐久落花多 发表于 2023-8-2 18:00

前两天想找同学帮忙做个爬取某网站的内容的，结果他说可能有违法风险，咋回事？实在是不懂啊。。。

吖力锅 发表于 2023-8-2 22:39

哇哇。这么多行代码的吗

RedWolfT 发表于 2023-8-3 11:50

学习了，感谢分享！

echoaku 发表于 2023-8-3 14:11

不错，支持一下

jrwapj 发表于 2023-8-3 16:47

不错，支持，学习了

lzaiz24 发表于 2023-8-3 17:50

刚好学了点爬虫{:1_918:}

页: [1] 2

吾爱破解 - 52pojie.cn's Archiver

Python爬取武侠小说