# Forum post header: last edited by lihu5841314 on 2021-6-2 22:58
import os
import re
import time
from urllib import parse

import requests
from lxml import etree
# Site root used to resolve the relative book links found on listing pages.
BASE_URL = "https://www.taiuu.com"
# Listing page URL; the placeholder is the 1-based page number.
LIST_URL_TEMPLATE = "https://www.taiuu.com/book/quanbu/default-0-0-0-0-0-0-{}.html"
# Directory that receives one .txt file per book.
BOOKS_DIR = "./books"
# Encoding applied to every downloaded page. The original script set this on
# the listing and book pages only; chapter pages are assumed to match.
PAGE_ENCODING = "gb2312"
# Seconds to wait on any single HTTP request before giving up.
REQUEST_TIMEOUT = 200
HEADERS = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/87.0.4280.88 Safari/537.36"
}

# Characters that are not allowed in file names on common file systems.
_INVALID_FILE_CHARS = re.compile(r'[\\/:*?"<>|]')


def clean_chapter_text(fragments):
    """Turn the text nodes of a chapter into clean, newline-separated text.

    ``xpath('...//text()')`` returns a *list* of strings, which is why calling
    ``re.sub`` or ``.split()`` directly on the result fails.  Each fragment is
    cleaned on its own: ``str.split()`` with no argument splits on every
    whitespace character (including ``\\r``, ``\\n`` and ``\\xa0``), so
    re-joining with a single space trims the fragment and collapses runs.

    Args:
        fragments: iterable of strings, e.g. the result of an lxml
            ``//text()`` query.

    Returns:
        The non-empty cleaned fragments joined with ``"\\n"``; an empty
        string when there is nothing left.
    """
    cleaned = (" ".join(fragment.split()) for fragment in fragments)
    return "\n".join(line for line in cleaned if line)


def safe_file_name(name):
    """Return *name* with characters that are illegal in file names replaced.

    Args:
        name: raw title taken from the page.

    Returns:
        The title with each of ``\\ / : * ? " < > |`` replaced by ``_`` and
        surrounding whitespace removed, or ``"untitled"`` if nothing remains.
    """
    cleaned = _INVALID_FILE_CHARS.sub("_", name).strip()
    return cleaned or "untitled"


def fetch_tree(url):
    """Download *url* and parse it into an lxml HTML tree.

    Args:
        url: absolute URL of the page to download.

    Returns:
        The parsed tree, or ``None`` when the request fails, the server
        answers with an error status, or the body is empty.  Failures are
        reported on stdout so that one bad page does not abort the run.
    """
    try:
        response = requests.get(url=url, headers=HEADERS, timeout=REQUEST_TIMEOUT)
        response.raise_for_status()
    except requests.RequestException as err:
        print("request failed:", url, err)
        return None
    response.encoding = PAGE_ENCODING
    html = response.text
    if not html.strip():
        # Nothing to parse; avoid handing an empty document to lxml.
        return None
    return etree.HTML(html)


def download_book(book_url):
    """Download every chapter of one book into ``BOOKS_DIR/<title>.txt``.

    The file is opened once in write mode, so re-running the script replaces
    the book instead of appending duplicate chapters.  Each chapter is stored
    as its title line, the cleaned text, and a blank separator line.

    Args:
        book_url: absolute URL of the book's detail page.
    """
    book_tree = fetch_tree(book_url)
    if book_tree is None:
        return
    titles = book_tree.xpath('//div[@class="book_info"]//img/@title')
    if not titles:
        print("no book title found, skipping:", book_url)
        return
    book_name = safe_file_name(titles[0]) + ".txt"
    path = os.path.join(BOOKS_DIR, book_name)

    # An explicit UTF-8 encoding keeps the write independent of the platform
    # default (e.g. gbk on a Chinese-locale Windows).
    with open(path, "w", encoding="utf-8") as book_file:
        for li in book_tree.xpath('//div[@class="book_list"]/ul/li'):
            hrefs = li.xpath('./a/@href')
            if not hrefs:
                continue
            link_texts = li.xpath('./a/text()')
            chapter_title = link_texts[0].strip() if link_texts else hrefs[0]
            # Chapter links are relative to the book page, not the site root.
            chapter_tree = fetch_tree(parse.urljoin(book_url, hrefs[0]))
            if chapter_tree is None:
                continue
            chapter_text = clean_chapter_text(
                chapter_tree.xpath('//div[@id="htmlContent"]//text()')
            )
            book_file.write(chapter_title + "\n" + chapter_text + "\n\n")
            print(chapter_title, "下载完毕")
    print(book_name, "下载完毕")


def main():
    """Walk listing pages 1-14 and download every book linked from them."""
    os.makedirs(BOOKS_DIR, exist_ok=True)
    for page in range(1, 15):
        list_tree = fetch_tree(LIST_URL_TEMPLATE.format(page))
        if list_tree is None:
            continue
        for dl in list_tree.xpath('//div[@class="sitebox"]/dl'):
            hrefs = dl.xpath('./dt/a/@href')
            if not hrefs:
                continue
            download_book(parse.urljoin(BASE_URL, hrefs[0]))


if __name__ == "__main__":
    main()