"""
Search request notes for the xbiquge.la novel downloader:

POST https://www.xbiquge.la/modules/article/waps.php
searchkey: 三寸人间
Referer: https://www.xbiquge.la/xuanhuanxiaoshuo/
"""
import os
from multiprocessing.dummy import Pool
from urllib.parse import urljoin

import requests
from bs4 import BeautifulSoup
from lxml import etree
# Site root (scheme + host); also sent as the Referer header below.
url1 = "https://www.xbiquge.la"

# Search endpoint; the script entry point passes it to get_book_url().
url = "https://www.xbiquge.la/modules/article/waps.php"

# HTTP headers attached to every request issued by this script.
headers = {
    "user-agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.77 Safari/537.36",
    "Referer": url1,
}
def get_book_url(url, name):
    """Search the site for *name* and return the URL of the first result.

    Args:
        url: Search endpoint; it is sent a POST with a ``searchkey`` form field.
        name: Search keyword (the prompt asks for a book title or author name).

    Returns:
        The ``href`` of the first ``<a>`` inside the first ``<td class="even">``
        cell of the result page.

    Raises:
        LookupError: If the result page contains no such link.
        requests.RequestException: If the request fails, times out, or the
            server answers with an HTTP error status.
    """
    # A timeout keeps a stalled connection from blocking the script forever.
    resp = requests.post(
        url=url,
        headers=headers,
        data={"searchkey": name},
        timeout=30,
    )
    resp.raise_for_status()
    # Let requests guess the charset from the body instead of the HTTP header.
    resp.encoding = resp.apparent_encoding

    soup = BeautifulSoup(resp.text, "lxml")
    cell = soup.find("td", attrs={"class": "even"})
    link = cell.find("a") if cell is not None else None
    book_url = link.get("href") if link is not None else None
    if not book_url:
        # Without this guard an empty result page surfaces as an opaque
        # AttributeError on None.
        raise LookupError(f"no search result found for {name!r}")
    return book_url
def get_menu_url(url):
    """Fetch a book's index page and list its chapters.

    Args:
        url: URL of the page holding the chapter list (a ``<div id="list">``
            with one ``<dd><a href=...>`` entry per chapter).

    Returns:
        A list of dicts, in page order, each with the keys ``"menu_url"``
        (absolute chapter URL) and ``"menu_name"`` (link text).

    Raises:
        LookupError: If the page has no ``<div id="list">`` element.
        requests.RequestException: If the request fails, times out, or the
            server answers with an HTTP error status.
    """
    resp = requests.get(url=url, headers=headers, timeout=30)
    resp.raise_for_status()
    resp.encoding = resp.apparent_encoding

    soup = BeautifulSoup(resp.text, "lxml")
    listing = soup.find("div", attrs={"id": "list"})
    if listing is None:
        raise LookupError(f"no chapter list found at {url!r}")

    dic_list = []
    for dd in listing.find_all("dd"):
        link = dd.find("a")
        href = link.get("href") if link is not None else None
        if not href:
            # Skip entries that carry no usable link instead of crashing.
            continue
        dic_list.append({
            # Chapter links on the page are incomplete (relative); resolve them
            # against the page URL so relative and absolute hrefs both work.
            "menu_url": urljoin(url, href),
            "menu_name": link.text,
        })
    return dic_list
# Maps characters that cannot appear in a file name (Windows reserved
# characters, which include both path separators) to "_".
_UNSAFE_FILENAME_CHARS = str.maketrans({ch: "_" for ch in '\\/:*?"<>|'})


def _safe_filename(title):
    """Return *title* made usable as a file name (reserved characters -> "_")."""
    return title.translate(_UNSAFE_FILENAME_CHARS).strip() or "_"


def book_down(dic):
    """Download one chapter and save it as ``<chapter name>.txt``.

    The file is written relative to the current working directory.

    Args:
        dic: Mapping with the keys ``"menu_url"`` (chapter page URL) and
            ``"menu_name"`` (chapter title, used as the file name).

    Returns:
        None. A chapter whose request fails is reported on stdout and skipped.
    """
    url = dic["menu_url"]
    # Chapter titles may contain characters such as "?" or "/" that would make
    # open() fail or write outside the intended directory.
    menu_name = _safe_filename(dic["menu_name"]) + ".txt"

    try:
        resp = requests.get(url=url, headers=headers, timeout=30)
        resp.raise_for_status()
    except requests.RequestException as exc:
        # Report and skip: one bad chapter should not abort the whole batch.
        print(menu_name, "下载失败", exc)
        return
    resp.encoding = resp.apparent_encoding

    # The text is extracted with XPath (the original author switched to it
    # after failing to get the BeautifulSoup version working).
    tree = etree.HTML(resp.text)
    # etree.HTML returns None for an empty document.
    fragments = tree.xpath('//div[@id="content"]//text()') if tree is not None else []
    content = "".join(piece.strip() for piece in fragments)
    # Keep only the text before the first "-----" marker
    # (presumably a site footer/advert follows it -- TODO confirm).
    content = content.split("-----")[0]

    with open(menu_name, "w", encoding="utf_8") as f:
        f.write(content)
    print(menu_name, "下载完成")
def main(name):
    """Download every chapter of the first search hit for *name*.

    Chapters are fetched by a pool of 4 worker threads and stored in a
    directory called *name* under the current working directory.

    Args:
        name: Search keyword; also used as the output directory name.
    """
    # Resolve the book before touching the file system, so a failed search
    # does not leave an empty directory behind.
    book_url = get_book_url(url, name)
    dic_list = get_menu_url(book_url)

    # The original check was inverted (mkdir only when the path already
    # existed); makedirs(exist_ok=True) creates it when missing and is a no-op
    # otherwise.
    book_dir = os.path.abspath(name)
    os.makedirs(book_dir, exist_ok=True)

    # book_down() writes to paths relative to the working directory, so switch
    # into the book directory while the workers run and restore it afterwards.
    previous_dir = os.getcwd()
    os.chdir(book_dir)
    try:
        print("进程池启动")
        # multiprocessing.dummy.Pool is a thread pool; the with-block releases
        # its worker threads once map() has returned.
        with Pool(4) as pool:
            pool.map(book_down, dic_list)
    finally:
        os.chdir(previous_dir)
    # Author's note: basic downloading works but is slow; an asyncio
    # (single-threaded coroutine) version is planned as practice.
    print(name, "下载完成")
# Script entry point: ask for a book title or author name, then download it.
if __name__ == '__main__':
    name = input("请输入需要下载的小说或者作者名字:").strip()
    if not name:
        # An empty keyword would submit a blank search; stop with a message.
        raise SystemExit("未输入小说或者作者名字")
    main(name)