[Web Scraper] A novel-scraping example
I recently read 《万古至尊》 by 太一生水 and found it a good read, so here is a recommendation. I picked a random site that hosts it and wrote a scraper to download it.
Below is the scraper source code.
If some chapters fail to download because of network issues, just run the script again: it only fetches the chapters that are still missing, so there is no need to worry about wasting time on duplicate downloads.
# -*- coding:utf-8 -*-
import requests
from lxml import etree
import os
from multiprocessing.dummy import Pool

def create_path(file_path):
    if not os.path.exists(file_path):
        os.makedirs(file_path)

headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/128.0.0.0 Safari/537.36 Edg/128.0.0.0'
}

book_url = 'https://www.bqvvxg8.cc/wenzhang/1/1424/' + 'index.html'
book_detail_content = requests.get(url=book_url, headers=headers)
book_detail_content.encoding = 'gbk'
book_detail_content = book_detail_content.text
book_detail_tree = etree.HTML(book_detail_content)
# xpath() returns a list, so take the first match
book_name = book_detail_tree.xpath('//div[@class="book"]/div[@class="info"]/h2/text()')[0]
create_path('./' + book_name)
chapter_dd_list = book_detail_tree.xpath('//div[@class="listmain"]/dl/dd')

def down_chapter(dd):
    chapter_url = 'https://www.bqvvxg8.cc/' + dd.xpath('./a/@href')[0]
    # Replace '?' with the full-width '？' so the title is a valid Windows file name
    chapter_title = dd.xpath('./a/text()')[0].replace('?', '？')
    chapter_txt_path = './' + book_name + '/' + chapter_title + '.txt'
    if not os.path.exists(chapter_txt_path):
        chapter_content = requests.get(url=chapter_url, headers=headers).text
        chapter_tree = etree.HTML(chapter_content)
        chapter_text = chapter_tree.xpath('//*[@id="content"]/text()')
        # Save the chapter, skipping the first line and the last three lines (site boilerplate)
        with open(chapter_txt_path, 'a', encoding='UTF-8') as file:
            file.write(chapter_title + '\n')
            for i in range(1, len(chapter_text) - 3):
                file.write(chapter_text[i] + '\n')
        print(chapter_title, "downloaded")

pool = Pool(20)
pool.map(down_chapter, chapter_dd_list)
pool.close()
pool.join()

icer233 posted on 2024-11-20 20:15
You can use free proxies found online.
# -*- coding:utf-8 -*-
import requests
from lxml import etree
import os
from multiprocessing.dummy import Pool
import random

def create_path(file_path):
    if not os.path.exists(file_path):
        os.makedirs(file_path)

# Proxy list (local placeholders; fetch or update this list as needed)
# requests picks the entry whose key matches the URL scheme, so https targets need an "https" key too
proxy_list = [
    {"http": "http://127.0.0.1:7890", "https": "http://127.0.0.1:7890"},
    {"http": "http://127.0.0.1:7891", "https": "http://127.0.0.1:7891"},
]

headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/128.0.0.0 Safari/537.36 Edg/128.0.0.0'
}

book_url = 'https://www.bqvvxg8.cc/wenzhang/1/1424/' + 'index.html'
# Fetch the book's main page using a random proxy
proxy = random.choice(proxy_list)
book_detail_content = requests.get(url=book_url, headers=headers, proxies=proxy)
book_detail_content.encoding = 'gbk'
book_detail_content = book_detail_content.text
book_detail_tree = etree.HTML(book_detail_content)
# xpath() returns a list, so take the first match
book_name = book_detail_tree.xpath('//div[@class="book"]/div[@class="info"]/h2/text()')[0]
create_path('./' + book_name)
chapter_dd_list = book_detail_tree.xpath('//div[@class="listmain"]/dl/dd')

def down_chapter(dd):
    chapter_url = 'https://www.bqvvxg8.cc/' + dd.xpath('./a/@href')[0]
    # Replace '?' with the full-width '？' so the title is a valid Windows file name
    chapter_title = dd.xpath('./a/text()')[0].replace('?', '？')
    chapter_txt_path = './' + book_name + '/' + chapter_title + '.txt'
    if not os.path.exists(chapter_txt_path):
        # Fetch the chapter content using a random proxy
        proxy = random.choice(proxy_list)
        chapter_content = requests.get(url=chapter_url, headers=headers, proxies=proxy).text
        chapter_tree = etree.HTML(chapter_content)
        chapter_text = chapter_tree.xpath('//*[@id="content"]/text()')
        # Save the chapter content, skipping the first line and the last three lines (site boilerplate)
        with open(chapter_txt_path, 'a', encoding='UTF-8') as file:
            file.write(chapter_title + "\n")
            for i in range(1, len(chapter_text) - 3):
                file.write(chapter_text[i] + "\n")
        print(chapter_title, "downloaded")

# Use multithreading to download chapters
pool = Pool(20)
pool.map(down_chapter, chapter_dd_list)
pool.close()
pool.join()
Tried it: when the request rate gets too high it throws errors, probably because the server is rate-limiting or simply can't keep up. Adding error handling that waits 5 seconds and retries fixes it.
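The retry suggestion in that last reply can be implemented with a small wrapper around requests.get. Here is a minimal sketch; the name fetch_with_retry and the retries/delay defaults are my own assumptions, not part of the original scripts.

import time
import requests

def fetch_with_retry(url, headers, retries=3, delay=5):
    # Hypothetical helper: try the GET up to `retries` times, sleeping `delay` seconds between attempts
    for attempt in range(retries):
        try:
            resp = requests.get(url=url, headers=headers, timeout=10)
            resp.raise_for_status()
            return resp
        except requests.RequestException as err:
            print(url, "failed:", err, "- retrying in", delay, "s")
            time.sleep(delay)
    # Still failing after all retries; caller can skip this chapter and pick it up on the next run
    return None

Inside down_chapter you would call fetch_with_retry(chapter_url, headers) instead of the bare requests.get and skip the chapter when it returns None; the existing "re-run to fill in missing chapters" behaviour then covers it on the next run.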