通用在线小说阅读爬虫

ccwuax 发表于 2021-8-20 14:50

看了很多Python的爬虫例子，就试着学习，发现大多数小说网站的爬虫其实大同小异，就想试着写一个通用的。不过想起来容易做起来难，不同网站对于爬虫的处理还是有些许不同，于是折中一下，改成不同网站使用不同配置。我自己用百度搜了在线小说，已经写了3个配置，其他网站，看懂代码的朋友应该可以自己把配置加上去！
顺便说一下爬取思路：首先根据文章目录取得章节标题文字和相应的url，再爬取内容存入文件即可，利用xpath定位。没有使用多线程、协程，一来主要是为了学习、提供思路，二来我自己并不看小说，而且大规模多线程会对服务器造成压力。
直接贴源码：
from lxml import etree
from os import makedirs,path,getcwd, system
import requests
from time import sleep
from urllib3.exceptions import InsecureRequestWarning
# Silence urllib3's InsecureRequestWarning; get_file_content() requests pages
# with verify=False, which would otherwise warn on every call.
requests.packages.urllib3.disable_warnings(InsecureRequestWarning)

is_prox = False
# Proxy mapping handed to every request; None means "no proxy".
# Example value: {"http": "http://127.0.0.1:10809"}
proxies = None
# One shared session object used for all page downloads.
mysession = requests.Session()
# Running counter printed in front of each content-page download.
ser_num = 0
# Whether gethtml() lets requests follow redirects; config() may switch it on.
allow_redirects = False

# Per-site settings; all start empty and are filled in by config().
rooturl = ""
cata_url = ""
title_xpath = ""
url_xpath = ""
filename_xpaht = ""
content_xpath = ""
def isconnect_enable(_url):
    """Report whether ``_url`` answers a plain GET request.

    The request goes through the module-level ``proxies`` setting. Any
    response counts as "reachable"; the HTTP status code is not inspected.

    Args:
        _url: Address to probe.

    Returns:
        True if the request completed, False if requests raised a
        ``RequestException`` (connection error, timeout, malformed URL, ...).
    """
    try:
        # A timeout keeps the probe from hanging forever on a silent host.
        requests.get(_url, proxies=proxies, timeout=10)
    except requests.exceptions.RequestException:
        # Only request failures mean "unreachable"; Ctrl-C and programming
        # errors are no longer swallowed as they were by the bare except.
        print("网站无法连接")
        return False
    print("网站可以连通")
    return True
def gethtml(_url):
    """Download ``_url`` with the shared session and return the page as text.

    The request uses the module-level ``proxies`` and ``allow_redirects``
    settings and a desktop-browser User-Agent. The response is decoded with
    the encoding that requests detects from the body (``apparent_encoding``).

    A failed request is retried after a short pause; once the retry budget
    (5 retries after the first attempt) is used up the program terminates.

    Args:
        _url: Address of the page to download.

    Returns:
        The decoded response body.

    Raises:
        SystemExit: With status 1 when every attempt failed.
    """
    header = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/92.0.4515.131 Safari/537.36",
    }
    retries_left = 5
    while True:
        try:
            print("正在获取HTML,网址:", _url, end="")
            # The timeout turns a stalled connection into a RequestException
            # so that the retry logic below actually gets a chance to run.
            html = mysession.get(
                _url,
                headers=header,
                proxies=proxies,
                allow_redirects=allow_redirects,
                timeout=15,
            )
            # Decode with the detected charset rather than the header default.
            html.encoding = html.apparent_encoding
            mysession.close()
            return html.text
        except requests.exceptions.RequestException:
            if retries_left <= 0:
                print("\n获取HTML失败，程序退出！")
                # Non-zero status: this is a failure, and SystemExit does not
                # depend on the interactive-only exit() helper.
                raise SystemExit(1) from None
            sleep(0.1)
            print(".", end="")
            retries_left -= 1
def get_title_titleurl(_url, title_xpath, url_xpath, filename_xpath):
    """Extract the chapter list and the book name from a table-of-contents page.

    Args:
        _url: Address of the table-of-contents page.
        title_xpath: XPath selecting the chapter title strings (should end in
            ``text()``).
        url_xpath: XPath selecting the chapter links (should end in ``@href``).
            The values are returned as found on the page; relative links must
            be completed by the caller.
        filename_xpath: XPath selecting the book title, used as the file name.

    Returns:
        A ``(title_dict, filename)`` tuple. ``title_dict`` maps each stripped
        chapter title to its link, in page order (a repeated title keeps the
        last link). ``filename`` is the stripped first match of
        ``filename_xpath``, or an empty string when nothing matched.
    """
    html = gethtml(_url)
    tree = etree.HTML(html, etree.HTMLParser())

    # XPath node-set queries return a list; use its first entry instead of
    # the list's repr (which produced names such as "['Title']").
    name_result = tree.xpath(filename_xpath)
    if isinstance(name_result, list):
        filename = str(name_result[0]).strip() if name_result else ""
    else:
        filename = str(name_result).strip()

    title_list = tree.xpath(title_xpath)
    url_list = tree.xpath(url_xpath)
    # zip() pairs titles with links and stops at the shorter list, so a page
    # with unequal counts cannot raise IndexError.
    title_dict = {
        str(title).strip(): link for title, link in zip(title_list, url_list)
    }
    return title_dict, filename
def get_file_content(url, content_xpath):
    """Download one chapter page and return the text selected by an XPath.

    Prints the running download counter ``ser_num`` (and increments it), then
    fetches ``url`` through the shared session without certificate
    verification. The response is always decoded as UTF-8.

    Args:
        url: Address of the chapter page.
        content_xpath: XPath selecting the text fragments of the chapter.

    Returns:
        All selected fragments concatenated, with ``\\r\\n`` replaced by
        ``\\n``.
    """
    global ser_num
    print(ser_num, "\t正在获取网页内容...", end="")
    ser_num += 1

    request_headers = {
        'User-Agent': r"Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/92.0.4515.131 Safari/537.36",
    }
    page = mysession.get(url, proxies=proxies, headers=request_headers, verify=False)
    mysession.close()
    page.encoding = 'utf-8'

    # Only a 200 response is announced; other statuses are parsed silently.
    if page.status_code == 200:
        print("获取成功!", end="\t")

    fragments = etree.HTML(page.text).xpath(content_xpath)
    # Normalise Windows line endings.
    return "".join(fragments).replace("\r\n", "\n")
def write_to_file(_dirs, _filename, _content, chapter="", _extfn=".txt", mode="w", _prefix="", _fileurl=""):
    """Write text to ``<_dirs>/<_prefix><_filename><_extfn>`` as UTF-8.

    The target directory is created when it does not exist yet. A progress
    line naming the directory and file (plus the chapter, when given) is
    printed before writing.

    Args:
        _dirs: Directory that receives the file.
        _filename: File name without prefix and extension.
        _content: Text to write.
        chapter: Optional chapter title; only used in the progress line.
        _extfn: File extension including the leading dot.
        mode: Mode passed to ``open`` (e.g. ``"w"`` or ``"a"``).
        _prefix: Text placed in front of the file name.
        _fileurl: When non-empty, a line with this source address is written
            before the content.
    """
    if chapter != "":
        print("写入->文件夹:{0}\t文件名:{1}".format(_dirs, _prefix + _filename), "\t" + chapter)
    else:
        print("写入->文件夹:{0}\t文件名:{1}".format(_dirs, _prefix + _filename))

    # exist_ok avoids the check-then-create race of a separate exists() test.
    makedirs(_dirs, exist_ok=True)

    # Join with the platform separator and honour the requested extension
    # (previously ".txt" and a backslash were hard-coded).
    target = path.join(_dirs, _prefix + _filename + _extfn)
    with open(target, mode, encoding="utf-8") as myfile:
        if _fileurl != "":
            myfile.write("本文网址:" + str(_fileurl) + "\n")
        myfile.write(_content)
def config(site_choice, bookid):
    """Load the scraping settings of one supported site into module globals.

    Sets ``rooturl``, ``cata_url``, ``title_xpath``, ``url_xpath``,
    ``filename_xpaht`` and ``content_xpath``; the last site additionally
    switches ``allow_redirects`` on. A ``site_choice`` that matches no known
    site leaves every global untouched.

    Args:
        site_choice: Site name, one of "纵横", "17k" or "起点".
        bookid: Book identifier inserted into the table-of-contents address.
    """
    global rooturl, cata_url, title_xpath, url_xpath, filename_xpaht, content_xpath, allow_redirects

    # Each entry: (site name, settings). "root" is the prefix put in front of
    # chapter links, "catalog" the table-of-contents address template.
    site_profiles = (
        ("纵横", {
            "root": '',
            "catalog": 'http://book.zongheng.com/showchapter/{bookid}.html',
            "title": '/html/body/div/div/div/div/ul/li/a/text()',
            "url": '/html/body/div/div/div/div/ul/li/a/@href',
            "filename": '/html/body/div/div/h1/text()',
            "content": '//*[@id="readerFt"]/div/div//text()',
        }),
        ("17k", {
            "root": 'http://www.17k.com',
            "catalog": 'http://www.17k.com/list/{bookid}.html',
            "title": '/html/body/div/dl/dd/a/span/text()',
            "url": '/html/body/div/dl/dd/a/@href',
            "filename": '/html/body/div/h1/text()',
            "content": '/html/body/div/div/div/div/div/p//text()',
        }),
        ("起点", {
            "redirects": True,
            "root": 'http:',
            "catalog": 'http://book.qidian.com/info/{bookid}/#Catalog',
            "title": '//*[@id="j-catalogWrap"]/div/div/ul/li/a/text()',
            "url": '//*[@id="j-catalogWrap"]/div/div/ul/li/a/@href',
            "filename": '/html/body/div/div/div/div/h1/em/text()',
            "content": '//div/div/div/div/p/text()',
        }),
    )

    for site_name, profile in site_profiles:
        if site_choice == site_name:
            if "redirects" in profile:
                allow_redirects = profile["redirects"]
            rooturl = profile["root"]
            cata_url = profile["catalog"].format(bookid=bookid)
            title_xpath = profile["title"]
            url_xpath = profile["url"]
            filename_xpaht = profile["filename"]
            content_xpath = profile["content"]
if __name__ == "__main__":
    # Interactive entry point: pick a site, enter a book id, then download
    # every chapter into one text file named after the book.
    site_list = ["纵横", "17k", "起点"]
    for i, site_name in enumerate(site_list):
        print("\t", i, "\t", site_name)

    # The menu shows indices, so the answer must be converted to a site name.
    site = input("请选择网站：")
    try:
        site_index = int(site)
        if not 0 <= site_index < len(site_list):
            raise IndexError(site_index)
    except (ValueError, IndexError):
        raise SystemExit("无效的网站序号: " + site) from None
    site = site_list[site_index]

    bookid = input("请输入文章ID:")
    config(site, str(bookid))

    # Directory that receives the novel file.
    path_dir = path.join(".", "mytemp")
    title_url_dict, filename = get_title_titleurl(cata_url, title_xpath, url_xpath, filename_xpaht)
    print("\t小说或文章标题:《" + filename + "》")

    for title, chapter_url in title_url_dict.items():
        # Chapter links may be relative; rooturl supplies the missing prefix.
        target_url = rooturl + chapter_url
        content = get_file_content(target_url, content_xpath)
        write_to_file(
            path_dir,
            filename,
            "*" * 100 + "\n" + title + "\n" + content + "\n",
            chapter=title,
            mode="a",
            _fileurl=target_url,
        )
        # Pause between chapters to keep the load on the server low.
        sleep(1)
    print("程序运行结束!!!")

页: [1]

吾爱破解 - 52pojie.cn's Archiver

通用在线小说阅读爬虫