A Generic Web Novel Scraper
After reading a lot of Python crawler examples and trying to learn from them, I noticed that most web-novel scrapers are more or less the same, so I tried to write a generic one. That turned out to be easier said than done: sites differ in small ways in how they handle crawlers. As a compromise, the scraper is configuration-driven, with one configuration per site. I searched Baidu for online-novel sites and have written three configurations so far; once you understand the code, you should be able to add your own.
The crawling approach, briefly: from the table of contents, first grab the chapter titles and their URLs, then fetch each chapter's text and write it to a file. Everything is located with XPath. There is no multithreading and no coroutines, for two reasons: this is mainly a learning exercise meant to show the idea, and I don't read novels myself, while heavy multithreading would put real load on the servers.
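To make the XPath part concrete, here is a minimal, standalone sketch of how titles and URLs are pulled out of a page. The HTML fragment and the xpaths are invented for illustration; the real ones live in config() in the source below:

from lxml import etree

demo_html = '''<ul class="chapters">
<li><a href="/read/1.html">Chapter 1</a></li>
<li><a href="/read/2.html">Chapter 2</a></li>
</ul>'''
tree = etree.HTML(demo_html)
titles = tree.xpath('//ul[@class="chapters"]/li/a/text()')  # ['Chapter 1', 'Chapter 2']
urls = tree.xpath('//ul[@class="chapters"]/li/a/@href')     # ['/read/1.html', '/read/2.html']
print(dict(zip(titles, urls)))  # {'Chapter 1': '/read/1.html', 'Chapter 2': '/read/2.html'}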
Here is the full source:
from lxml import etree
from os import makedirs, path
import requests
from time import sleep
from urllib3.exceptions import InsecureRequestWarning
requests.packages.urllib3.disable_warnings(InsecureRequestWarning)  # suppress the insecure-request warning
proxies = None  # e.g. {"http": "http://127.0.0.1:10809"} to go through a proxy
mysession = requests.Session()  # session object reused for all requests
ser_num = 0  # running counter for console output
allow_redirects = False
rooturl = cata_url = title_xpath = url_xpath = filename_xpath = content_xpath = ""
def isconnect_enable(_url):  # check whether the site is reachable at all
    try:
        requests.get(_url, proxies=proxies)
        print("Site is reachable")
        return True
    except requests.exceptions.RequestException:
        print("Site is unreachable")
        return False
def gethtml(_url):  # fetch the raw HTML of a page by URL
    header = {"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/92.0.4515.131 Safari/537.36",
              }
    try_count = 5  # number of retries
    while True:
        try:
            print("Fetching HTML from:", _url, end="")
            html = mysession.get(_url, headers=header, proxies=proxies, allow_redirects=allow_redirects)
            html.encoding = html.apparent_encoding  # let requests guess the correct encoding
            mysession.close()
            return html.text
        except requests.exceptions.RequestException:
            if try_count <= 0:
                print("\nFailed to fetch HTML, exiting!")
                exit(0)
            sleep(0.1)
            print(".", end="")
            try_count -= 1
def get_title_titleurl(_url, title_xpath, url_xpath, filename_xpath):  # extract chapter titles and URLs from the catalog page; returns a dict and the novel title (file name)
    '''_url: catalog page URL
    title_xpath: xpath of the chapter titles (remember to end it with text())
    url_xpath: xpath of the chapter URLs (remember to end it with @href)
    filename_xpath: xpath of the novel title
    '''
    html = gethtml(_url)  # fetch the catalog page source
    tree = etree.HTML(html, etree.HTMLParser())
    filename = "".join(tree.xpath(filename_xpath)).strip()
    title_dict = {}  # dict to return: chapter title -> chapter URL
    title_list = tree.xpath(title_xpath)
    url_list = tree.xpath(url_xpath)  # note: assumed absolute; if the site uses relative paths, the root URL is prepended later
    for i in range(len(title_list)):
        title_dict[str(title_list[i]).strip()] = url_list[i]
    return title_dict, filename  # return the dict and the novel title (file name)
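# The returned dict maps chapter titles to chapter URLs; the values below are
# made-up examples of what a run against a real catalog page might yield:
#   title_dict = {'Chapter 1': 'http://example.com/read/1.html', ...}
#   filename   = 'Some Novel Name'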
def get_file_content(url, content_xpath):  # fetch the body text of one chapter page
    userAgent = r"Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/92.0.4515.131 Safari/537.36"
    header = {'User-Agent': userAgent}
    global ser_num
    print(ser_num, "\tFetching chapter content...", end="")
    ser_num += 1
    responseRes = mysession.get(url, proxies=proxies, headers=header, verify=False)
    mysession.close()  # close the connection
    responseRes.encoding = 'utf-8'
    if responseRes.status_code == 200:
        print("OK!", end="\t")
    tree = etree.HTML(responseRes.text)
    str_content = tree.xpath(content_xpath)
    str_content = "".join(str_content).replace("\r\n", "\n")  # drop redundant blank lines
    return str_content
def write_to_file(_dirs, _filename, _content, chapter="", _extfn=".txt", mode="w", _prefix="", _fileurl=""):  # write content to a file
    '''
    _dirs: target directory
    _prefix: file name prefix
    _filename: file name
    _content: file content
    _extfn: file extension
    mode: write mode
    _fileurl: source URL of the text
    '''
    if chapter != "":
        print("Writing -> dir: {0}\tfile: {1}".format(_dirs, _prefix + _filename), "\t" + chapter)
    else:
        print("Writing -> dir: {0}\tfile: {1}".format(_dirs, _prefix + _filename))
    if not path.exists(_dirs):
        makedirs(_dirs)
    with open(_dirs + "\\" + _prefix + _filename + _extfn, mode, encoding="utf-8") as myfile:
        if _fileurl != "":
            myfile.write("Source URL: " + str(_fileurl) + "\n")
        myfile.write(_content)
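# With the arguments used in __main__ below, each chapter ends up appended to
# <path_dir>\<novel name>.txt as a block of the form:
#   Source URL: <chapter url>
#   ********... (a 100-star divider)
#   <chapter title>
#   <chapter body>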
def config(site_choice, bookid):  # per-site settings: catalog URL and the four xpaths
    global rooturl, cata_url, title_xpath, url_xpath, filename_xpath, content_xpath, allow_redirects
    if site_choice == "Zongheng":
        rooturl = r''  # root to prepend when chapter URLs are relative
        cata_url = f'http://book.zongheng.com/showchapter/{bookid}.html'  # catalog page URL
        title_xpath = '/html/body/div/div/div/div/ul/li/a/text()'  # xpath of the chapter titles
        url_xpath = '/html/body/div/div/div/div/ul/li/a/@href'  # xpath of the chapter URLs
        filename_xpath = '/html/body/div/div/h1/text()'  # xpath of the novel title, used as the file name
        content_xpath = '//*[@id="readerFt"]/div/div//text()'  # xpath of the chapter body text
    if site_choice == "17k":
        rooturl = r'http://www.17k.com'  # chapter URLs here are relative, so prepend the root
        cata_url = f'http://www.17k.com/list/{bookid}.html'
        title_xpath = '/html/body/div/dl/dd/a/span/text()'
        url_xpath = '/html/body/div/dl/dd/a/@href'
        filename_xpath = '/html/body/div/h1/text()'
        content_xpath = '/html/body/div/div/div/div/div/p//text()'
    if site_choice == "Qidian":
        allow_redirects = True
        rooturl = r'http:'  # chapter URLs here are protocol-relative (//...), so only the scheme is prepended
        cata_url = f'http://book.qidian.com/info/{bookid}/#Catalog'
        title_xpath = '//*[@id="j-catalogWrap"]/div/div/ul/li/a/text()'
        url_xpath = '//*[@id="j-catalogWrap"]/div/div/ul/li/a/@href'
        filename_xpath = '/html/body/div/div/div/div/h1/em/text()'
        content_xpath = '//div/div/div/div/p/text()'
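# To add another site, append a branch shaped like the hypothetical template
# below; the domain and every xpath are placeholders that must be read off the
# target site's catalog and chapter pages (e.g. with the browser dev tools):
#
# if site_choice == "somesite":
#     rooturl = r'http://www.example.com'
#     cata_url = f'http://www.example.com/book/{bookid}/'
#     title_xpath = '//ul[@class="chapter-list"]/li/a/text()'
#     url_xpath = '//ul[@class="chapter-list"]/li/a/@href'
#     filename_xpath = '//h1/text()'
#     content_xpath = '//div[@class="read-content"]//text()'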
if __name__ == "__main__":
    site_list = ["Zongheng", "17k", "Qidian"]
    for i in range(len(site_list)):
        print("\t", i, "\t", site_list[i])
    site = input("Choose a site by number: ")
    site = site_list[int(site)]
    bookid = input("Enter the book ID: ")
    config(site, str(bookid))
    path_dir = ".\\mytemp"  # directory for the downloaded novel
    title_url_dict, filename = get_title_titleurl(cata_url, title_xpath, url_xpath, filename_xpath)
    print("\tNovel title: 《" + filename + "》")
    for title in title_url_dict:
        target_url = rooturl + title_url_dict[title]  # chapter page URL
        content = get_file_content(target_url, content_xpath)
        write_to_file(path_dir, filename, "*" * 100 + "\n" + title + "\n" + content + "\n", chapter=title, mode="a", _fileurl=target_url)
        sleep(1)  # be polite to the server
    print("All done!")