本帖最后由 xiaohanxxx 于 2019-9-20 09:48 编辑
[Python] 纯文本查看 复制代码
import requests
from lxml import etree
import time
import re
import os
import threading
# Revised get_url() (2019-09-20)
def get_url(category_count=11):
    """Yield (url, lastpage) for each novel category on quanshuwang.com.

    Args:
        category_count: number of category listings to scan (default 11,
            matching the site's categories and the original behaviour).

    Yields:
        tuple[str, int]: the first-page URL of a category and the number of
        its last listing page, read from the pagination widget.
    """
    for i in range(1, category_count + 1):
        url = 'http://www.quanshuwang.com/list/' + str(i) + '_1.html'
        r = requests.get(url)
        r.encoding = 'gbk'  # the site serves GBK-encoded pages
        html = etree.HTML(r.text)
        last_text = ''.join(html.xpath('//a[@class="last"]/text()')).strip()
        # Bug fix: int('') raised ValueError when the server returned a blank
        # page and the xpath matched nothing; skip that category instead.
        if not last_text.isdigit():
            continue
        yield url, int(last_text)
# Collect the novel links from every listing page of every category.
def get_url_page():
    """Yield, per listing page, the list of novel detail-page URLs on it.

    Walks each category from get_url() through all of its pagination pages
    and yields the novel links extracted from each page.
    """
    for base_url, lastpage in get_url():
        # Hoisted out of the page loop: the '<category>_' prefix is the same
        # for every page of one category.
        prefix = ''.join(re.findall(r'(.*\d_)', base_url))
        # Bug fix: range(1, lastpage) silently skipped the final listing
        # page; range(1, lastpage + 1) covers all of them.
        for page in range(1, lastpage + 1):
            page_url = prefix + str(page) + '.html'  # rebuild the page URL
            r = requests.get(page_url)
            r.encoding = 'gbk'  # the site serves GBK-encoded pages
            html = etree.HTML(r.text)
            # All novel detail-page links on this listing page.
            yield html.xpath('//a[@class="clearfix stitle"]/@href')
def get_url_page_book(url, max_attempts=5):
    """Download one novel's metadata and all chapters and save them to disk.

    Fetches the detail page at *url*, resolves the table of contents, then
    writes every chapter to F:/python/xiaoshuo/<title>/<chapter>/text.txt.
    The whole crawl is retried because the site sometimes returns blank
    pages on transient network errors.

    Args:
        url: detail-page URL of a single novel.
        max_attempts: how many full-crawl retries to allow (default 5,
            preserving the original behaviour).
    """
    base_dir = 'F:/python/xiaoshuo'  # root folder for all downloaded novels

    def crawl_page(attempts):
        # One full crawl attempt; any exception triggers a retry upstream.
        print(url, "执行第", attempts, "次")
        r = requests.get(url)
        r.encoding = 'gbk'  # the site serves GBK-encoded pages
        html = etree.HTML(r.text)
        # Known issue: the server occasionally returns a blank page; the
        # xpaths then yield nothing and the retry loop requests it again.
        href = ''.join(html.xpath('//div[@class="detail"]/a/@href'))  # TOC URL
        title = ''.join(html.xpath('//h1/text()'))  # novel title
        # Strip characters that are illegal in Windows file names.
        title = re.sub(r"[\\/:*?<>|!\.\"]", '', title)
        intro = ''.join(html.xpath('//*[@id="waa"]/text()'))  # synopsis (unused)
        novel_dir = base_dir + '/' + title

        r1 = requests.get(href)  # fetch the table-of-contents page
        r1.encoding = 'gbk'
        toc = etree.HTML(r1.text)
        chapter_urls = toc.xpath('//div[@class="clearfix dirconone"]/li/a/@href')
        chapter_titles = toc.xpath('//div[@class="clearfix dirconone"]/li/a/text()')

        if not os.path.exists(novel_dir):
            os.mkdir(novel_dir)  # create this novel's folder
        for idx, chapter_url in enumerate(chapter_urls):
            r2 = requests.get(chapter_url)
            r2.encoding = 'gbk'
            chapter_html = etree.HTML(r2.text)
            try:
                content = ''.join(chapter_html.xpath('//*[@id="content"]/text()'))
            except AttributeError:
                # etree.HTML returned None for an empty response body.
                print(chapter_url, "读取内容失败")
                # Bug fix: the original fell through here and then raised
                # NameError writing an undefined 'content'.
                continue
            # Strip illegal filename characters from the chapter title.
            safe_title = re.sub(r"[\\/:*?<>|!\"]", '', chapter_titles[idx])
            chapter_dir = novel_dir + '/' + safe_title
            if not os.path.exists(chapter_dir):
                os.mkdir(chapter_dir)
            # 'with' guarantees the handle is closed even if a write fails
            # (the original leaked the handle on error).
            with open(chapter_dir + '/' + 'text.txt', 'w', encoding='utf-8') as fh:
                fh.write(content)

    # Retry the whole crawl up to max_attempts times; a blank or malformed
    # page raises inside crawl_page and lands here.
    for attempts in range(1, max_attempts + 1):
        try:
            crawl_page(attempts)
            return  # success
        except Exception:
            # Narrowed from a bare 'except' so KeyboardInterrupt/SystemExit
            # still propagate and the script can be stopped.
            print("失败重试...")
if __name__ == '__main__':
    # Spawn one worker thread per novel link, throttled so the site is not
    # hammered, then wait for every worker to finish.
    workers = []
    for page_links in get_url_page():  # one list of links per listing page
        for link in page_links:
            worker = threading.Thread(target=get_url_page_book, args=(link,))
            worker.start()
            time.sleep(1)  # pause between thread launches
            workers.append(worker)
        time.sleep(90)  # long pause between listing pages
    for worker in workers:
        worker.join()