I posted a scraper with much the same functionality a while back, but it was fairly rough. I've since pruned and replaced code where needed and split it into modules; the final version is below:
Execution order (a sketch of the data passed between the steps follows this list):
1. main.py: read the table-of-contents URL from the user
2. chapterList.py: parse all chapter titles and chapter URLs
2.1. query.py: check whether this novel has already been saved
3. chapter.py: parse the content of each chapter
4. save.py: save the parsed data
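For orientation, here's a minimal sketch of the data each step hands to the next, reconstructed from the code below (the sample values are invented):
[Python]
# Shape of the data flowing through the pipeline (sample values are invented)

# chapterList.py -> chapter.py: chapters still to be downloaded
store_all = [
    {"title": "Chapter 1", "urlList": "http://www.xbiquge.la/book/1.html"},
]

# chapter.py -> save.py: downloaded chapters; this list is also written to
# <book_name>.js, which query.py reads to resume an interrupted run
store_save = [
    {"title": "Chapter 1", "store": "\nChapter text..."},
]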
1. main.py:
[Python]
import chapterList

def main():
    url = input("Enter the novel's table-of-contents page URL: ")
    chapterList.classify(url)
    input("Finished saving! Press Enter to exit")

if __name__ == '__main__':
    main()
2. chapterList.py
[Python]
# Parse the whole book's chapter titles and chapter URLs
from lxml import etree
import requests
import chapter
import query

HEADER = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/78.0.3904.97 Safari/537.36'
}
# Parsing rules, keyed by site name
rules = {
    "xbiquge": {
        "title": '//div[@id="list"]//dd//text()',
        "urlList": '//div[@id="list"]//dd/a/@href',
        "book_name": '//div[@id="info"]//h1/text()',
        "realm": "http://www.xbiquge.la/",
        "code": "utf-8"
    },
    "dawen": {
        "title": '//ul[@id="listsss"]//a/text()',
        "urlList": '//li[@id="chapter"]//a/@href',
        "book_name": '//div[@class="top"]//h3//text()',
        "realm": "",
        "code": "utf-8"
    },
    "biquge": {
        "title": '//div[@id="list"]//dd//text()',
        "urlList": '//div[@id="list"]//dd/a/@href',
        "book_name": '//div[@id="info"]//h1/text()',
        "realm": "",
        "code": "utf-8"
    }
}
# Identify the site from the URL and pick the matching parsing rule
def classify(url):
    # An empty realm means the site uses chapter links relative to the TOC page
    for web in rules:
        if rules[web]["realm"] == "":
            rules[web]["realm"] = url
    webName = url.split(".")[1]
    if webName in rules:
        return analysis(webName, url)
    print("Invalid link, or this site isn't supported yet -- leave feedback in the thread and I'll add it!")
# Parse the TOC page for chapter titles and chapter links
def analysis(webName, url):
    # Note: headers must be passed as a keyword argument, not positionally
    html = etree.HTML(requests.get(url, headers=HEADER).content.decode(rules[webName]['code']))
    title = html.xpath(rules[webName]['title'])
    urlList = html.xpath(rules[webName]['urlList'])
    book_name = html.xpath(rules[webName]['book_name'])[0]
    # Check whether this novel has been saved before
    query_all = query.query(book_name, title, urlList)
    store_old = []
    if query_all:
        title = query_all["title"]
        urlList = query_all["urlList"]
        store_old = query_all["store"]
    print("Novel: %s" % book_name)
    print("Total chapters: %d" % (len(title) + len(store_old)))
    print("Already saved: %d" % len(store_old))
    if input("Start saving? (y/n): ") == "n":
        return
    print("----- saving -----")
    realm = rules[webName]['realm']
    store_all = []
    for i in range(len(title)):
        store_all.append({
            "title": title[i],
            "urlList": realm + urlList[i]
        })
    chapter.classify(book_name, webName, store_all, store_old)
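classify() keys on the second dot-separated segment of the URL (url.split(".")[1]), so supporting a new site only means adding an entry to rules. A sketch with an invented site key and placeholder XPaths -- you'd have to write them against the real page structure:
[Python]
# Hypothetical entry: the key must equal url.split(".")[1] for the new site
rules["example"] = {
    "title": '//div[@id="list"]//dd//text()',    # chapter titles on the TOC page
    "urlList": '//div[@id="list"]//dd/a/@href',  # chapter hrefs on the TOC page
    "book_name": '//div[@id="info"]//h1/text()', # the novel's title
    "realm": "",        # leave empty if the hrefs are relative to the TOC URL
    "code": "utf-8"     # encoding used to decode the response body
}
A matching "example" entry with a "chapter" XPath would also be needed in chapter.py's rules dict below.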
2.1. query.py
[Python]
# Check whether this novel was saved before; if so, resume from the recorded position
import json
import os

def query(book_name, title, urlList):
    if os.path.isfile("%s.js" % book_name):
        with open('%s.js' % book_name, 'r', encoding='utf-8') as fp:
            store = json.load(fp)
        # Find the last already-saved title in the fresh TOC and trim everything up to it
        index = 0
        for i in store:
            for y in range(len(title)):
                if i['title'] == title[y]:
                    index = y + 1
        del title[:index]
        del urlList[:index]
        query_all = {
            "title": title,
            "urlList": urlList,
            "store": store
        }
        return query_all
    return False
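A quick worked example of the resume trim, with invented chapter titles: the loop records the position just past the last already-saved title, and del drops everything before it:
[Python]
# Worked example of the trim logic (titles are invented)
store = [{"title": "Chapter 3", "store": "..."}]   # resume record: last saved chapter
title = ["Chapter 1", "Chapter 2", "Chapter 3", "Chapter 4", "Chapter 5"]

index = 0
for i in store:
    for y in range(len(title)):
        if i['title'] == title[y]:
            index = y + 1          # position just past the matched title

del title[:index]
print(title)                       # ['Chapter 4', 'Chapter 5']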
3. chapter.py
[Python]
# Parse the content of each chapter
from lxml import etree
import requests
import save

HEADER = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/78.0.3904.97 Safari/537.36'
}

store_save = []

def classify(book_name, webName, store, store_old):
    global store_save
    store_save = store_old
    for i in range(len(store)):
        title = store[i]['title']
        urlList = store[i]['urlList']
        analysis(webName, book_name, title, urlList)
    save.save(book_name, store_save)

# Parsing rules, keyed by site name
rules = {
    "xbiquge": {
        "chapter": '//div[@id="content"]/text()',
        "code": "utf-8"
    },
    "dawen": {
        "chapter": '//div[@class="art_con"]/dd[not(@data-id="999")]//text()',
        "code": "utf-8"
    },
    "biquge": {
        "chapter": '//div[@id="content"]/text()',
        "code": "utf-8"
    }
}

# Parse one chapter's content
def analysis(webName, book_name, title, urlList):
    # Note: headers must be passed as a keyword argument, not positionally
    html = etree.HTML(requests.get(urlList, headers=HEADER).content.decode(rules[webName]['code']))
    chapter = html.xpath(rules[webName]['chapter'])
    if chapter == []:
        # An empty result usually means the request failed; retry the same chapter
        analysis(webName, book_name, title, urlList)
        return
    chapter_cont = ''
    for i in chapter:
        # Skip single-character fragments (stray whitespace/formatting)
        if len(i) >= 2:
            chapter_cont = chapter_cont + "\n" + i.strip()
    store_save.append({
        "title": title,
        "store": chapter_cont
    })
    save.save(book_name, store_save)
    print(title + " ---- saved")
    return store_save
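One caveat: analysis() retries a failed fetch by calling itself, so a page that never yields content recurses until Python's recursion limit. A loop-based sketch of just the fetch-and-retry part, reusing chapter.py's imports, HEADER, and rules -- fetch_chapter and the max_tries cap are my own invention, not part of the program above:
[Python]
# Sketch: the same retry as a loop, capped so a dead page can't recurse forever
def fetch_chapter(webName, urlList, max_tries=5):
    for _ in range(max_tries):
        html = etree.HTML(requests.get(urlList, headers=HEADER)
                              .content.decode(rules[webName]['code']))
        chapter = html.xpath(rules[webName]['chapter'])
        if chapter:
            return chapter
    return []  # give up after max_tries instead of recursing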
4. save.py
[Python]
# Save the parsed chapter titles and contents
import json

def save(book_name, store):
    # JSON copy doubles as the resume record read by query.py
    with open("%s.js" % book_name, "w", encoding='utf-8') as fp:
        json.dump(store, fp, ensure_ascii=False)
    # Plain-text copy for reading
    with open("%s.txt" % book_name, "w", encoding='utf-8') as fp:
        for i in store:
            fp.write(i["title"])
            fp.write(i["store"] + '\n')
Finally, here's the thread with the packaged program: https://www.52pojie.cn/forum.php?mod=viewthread&tid=1265020&page=1#pid34083564