python小说全站爬虫，渣渣程序需改进

xiaohanxxx 发表于 2019-9-19 13:40

本帖最后由 xiaohanxxx 于 2019-9-20 09:48 编辑

import requests
from lxml import etree
import time
import re
import os
import threading

# Revised on 9.20: get_url() builds the category URLs in a loop.
def get_url():
    """Yield ``(first_page_url, last_page)`` for each site category.

    Categories 1 through 11 of quanshuwang.com are requested in order. For
    each one the first listing page is fetched, decoded as GBK, and the text
    of the ``<a class="last">`` element is parsed as an integer (presumably
    the pager's "last page" number -- confirm against the live markup).

    A category whose page cannot be fetched or parsed is reported and
    skipped, so one bad response does not end the whole crawl.

    Yields:
        tuple[str, int]: the category's first listing URL and the parsed
        page count.
    """
    for category in range(1, 12):
        url = 'http://www.quanshuwang.com/list/' + str(category) + '_1.html'
        try:
            # Without a timeout a stalled connection would block forever.
            r = requests.get(url, timeout=10)
        except requests.RequestException as exc:
            print(url, "请求失败", exc)
            continue
        r.encoding = 'gbk'
        html = etree.HTML(r.text)
        if html is None:
            # NOTE(review): a blank response appears to parse to None.
            print(url, "读取内容失败")
            continue
        last_text = ''.join(html.xpath('//a[@class="last"]/text()')).strip()
        try:
            lastpage = int(last_text)
        except ValueError:
            # Pager link missing or not numeric: page count unknown.
            print(url, "读取内容失败")
            continue
        yield url, lastpage

# Collect the novel links found on every listing page.
def get_url_page():
    """Yield the list of novel URLs found on each listing page.

    Consumes the ``(first_page_url, last_page)`` pairs produced by
    ``get_url()`` and walks pages ``1..last_page`` (inclusive) of every
    category. Each page is fetched, decoded as GBK, and the ``href`` values
    of its ``<a class="clearfix stitle">`` elements are yielded as one list.

    Pages that cannot be fetched or parsed are reported and skipped.

    Yields:
        list[str]: the novel URLs on one listing page (may be empty).
    """
    for first_url, lastpage in get_url():
        # Keep everything up to and including "<category>_"; the page number
        # and ".html" are appended per page below.
        prefix = ''.join(re.findall(r'(.*\d_)', first_url))
        # +1 so that the last page itself is included.
        for page in range(1, lastpage + 1):
            url = prefix + str(page) + '.html'
            try:
                # Without a timeout a stalled connection would block forever.
                r = requests.get(url, timeout=10)
            except requests.RequestException as exc:
                print(url, "请求失败", exc)
                continue
            r.encoding = 'gbk'
            html = etree.HTML(r.text)
            if html is None:
                # NOTE(review): a blank response appears to parse to None.
                print(url, "读取内容失败")
                continue
            # The URL of every novel on this listing page.
            href = html.xpath('//a[@class="clearfix stitle"]/@href')
            yield href

def get_url_page_book(url):
    """Download every chapter of the novel at *url* to disk.

    The novel's landing page gives the title and the link to its table of
    contents; each chapter listed there is saved as
    ``F:/python/xiaoshuo/<title>/<chapter title>/text.txt`` (UTF-8).
    Chapters whose folder already exists are treated as downloaded and are
    not fetched again, so a retry only fetches what is still missing.

    The whole crawl is attempted up to five times; any exception raised by
    an attempt is printed and triggers the next attempt.

    Args:
        url: URL of the novel's landing page.

    Returns:
        None. Failures are reported on stdout, not raised.
    """
    def crawl_page(attempt):
        """Run one full crawl attempt; raise on any fetch/parse problem."""
        print(url, "执行第", attempt, "次")
        # Without a timeout a stalled connection would block forever.
        r = requests.get(url, timeout=10)
        r.encoding = 'gbk'
        html = etree.HTML(r.text)
        # Known issue: a request may come back as a blank page. That is a
        # network problem, so the caller simply retries several times.
        href = ''.join(html.xpath('//div[@class="detail"]/a/@href'))  # table-of-contents URL
        title = ''.join(html.xpath('//h1/text()'))  # novel title
        # Strip characters that are not allowed in the folder name.
        title = re.sub(r"[\\/:*?<>|!\.\"]", '', title)
        path = 'F:/python/xiaoshuo'  # root folder for all novels
        read_path = path + '/' + title

        r1 = requests.get(href, timeout=10)  # table-of-contents page
        r1.encoding = 'gbk'
        html_page = etree.HTML(r1.text)
        list_href = html_page.xpath('//div[@class="clearfix dirconone"]/li/a/@href')  # chapter links
        list_title = html_page.xpath('//div[@class="clearfix dirconone"]/li/a/text()')  # chapter titles
        # makedirs also creates the root folder and tolerates another thread
        # having created the novel folder first.
        os.makedirs(read_path, exist_ok=True)

        for chapter_href, chapter_title in zip(list_href, list_title):
            # Strip characters that are not allowed in the folder name.
            chapter_dir = re.sub(r"[\\/:*?<>|!\"]", '', chapter_title)
            list_read_path = read_path + '/' + chapter_dir
            if os.path.exists(list_read_path):
                # Already saved by an earlier attempt: skip the download.
                continue
            r2 = requests.get(chapter_href, timeout=10)
            r2.encoding = 'gbk'
            html_content = etree.HTML(r2.text)
            try:
                content = ''.join(html_content.xpath('//*[@id="content"]/text()'))
            except AttributeError:
                # Parsing gave no element tree; leave the chapter for later.
                print(chapter_href, "读取内容失败")
                continue
            os.mkdir(list_read_path)
            with open(list_read_path + '/' + 'text.txt', 'w', encoding='utf-8') as file:
                file.write(content)

    # Retry up to five times when fetching or parsing fails.
    for attempt in range(1, 6):
        try:
            crawl_page(attempt)
        except Exception as exc:
            # Show the cause instead of swallowing it silently.
            print("失败重试...", exc)
        else:
            break

if __name__ == '__main__':
    # Imported here because only the script entry point needs the pool.
    from concurrent.futures import ThreadPoolExecutor

    # Upper bound on simultaneous downloads. One thread per novel with no
    # limit piles up threads faster than they finish on a full-site crawl.
    max_workers = 8

    with ThreadPoolExecutor(max_workers=max_workers) as pool:
        # Each item is the list of novel URLs found on one listing page.
        for page_links in get_url_page():
            # Draining the map waits for every novel on this page to finish
            # before the next listing page is requested, replacing the fixed
            # 90-second pause with an actual completion wait.
            list(pool.map(get_url_page_book, page_links))

xiaohanxxx 发表于 2019-9-19 15:43

zhuce129 发表于 2019-9-19 15:31
请教，为什么不用return而用yield？这样有什么区别吗？

程序的控制流程不同：
如果用return，每个函数都要先把数据全部采集完，再一次性交给下一个函数使用；
如果用yield，函数返回的是一个生成器对象，下一级函数每需要一个数据时，才让上一级函数继续运行并产出一个结果。

xiaohanxxx 发表于 2019-9-19 14:23

狼本善发表于 2019-9-19 14:11
get_url()函数可以简化一下，这里的字符串是有规律的，可以用个循环解决。另外，可以尝试使用下Beautiful S ...

可以的，做全站爬的话就是担心BS效率的问题，但是具体效率怎么样我也不知道，哈哈

ewmwxb 发表于 2019-9-19 13:49

高手呀学习了

到最後受了傷 发表于 2019-9-19 13:54

太感谢，学习了

非笑发表于 2019-9-19 14:06

的确是高手，有没有成品的

javaxue 发表于 2019-9-19 14:11

请问这个运行环境怎么搭建

狼本善 发表于 2019-9-19 14:11

get_url()函数可以简化一下，这里的字符串是有规律的，可以用个循环解决。另外，可以尝试使用下Beautiful Soup这个库

yanmingming 发表于 2019-9-19 14:15

继续努力{:301_993:}

xiaohanxxx 发表于 2019-9-19 14:18

javaxue 发表于 2019-9-19 14:11
请问这个运行环境怎么搭建

1、新建一个小说的存放路径：F:/python/xiaoshuo
2、安装上面用到的第三方python库（requests、lxml），例如：pip install requests lxml
3、运行程序
有关线程方面的问题没有解决，奈何技术限制{:301_972:}

xiaohanxxx 发表于 2019-9-19 14:20

非笑发表于 2019-9-19 14:06
的确是高手，有没有成品的

目前没有，还有些问题没有解决，现在只能将就用{:301_998:}

页: [1] 2 3

吾爱破解 - 52pojie.cn's Archiver

python小说全站爬虫，渣渣程序需改进