# Source: forum post, last edited by lihu5841314 on 2021-7-7 20:26.
# (Forum paste header "[Asm] 纯文本查看 复制代码" removed from code path.)
"""
目标网站https://www.xbiquge.la/
#tqdm 进度条库
#parsel 解析库可以使用css 和xpath
"""
import requests,os,datetime,random
import parsel
from tqdm import tqdm
import pandas as pd
import aiohttp
import aiofiles
import asyncio
# Browser-like headers sent with every request so the site serves normal pages.
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.106 Safari/537.36',
    # Fixed: the original value 'https: // www.xbiquge.la /' contained stray
    # spaces, which made the Referer header an invalid URL.
    'Referer': 'https://www.xbiquge.la/',
}
def get_response(html_url):
    """GET *html_url* with the shared browser headers and return the Response."""
    response = requests.get(url=html_url, headers=headers, timeout=200)
    # Let requests sniff the real charset so Chinese pages decode correctly.
    response.encoding = response.apparent_encoding
    return response
async def Chapter_content(dic):  # 提取单页的内容 (download one chapter page)
    """Download a single chapter and write it to ``<novel_name1>/<index>.txt``.

    Parameters
    ----------
    dic : dict
        ``{'index': 1-based chapter number, 'chapter_url': absolute page URL}``.

    Reads the module-level global ``novel_name1`` (set in ``main()``) for the
    output directory. Retries the request up to 4 times on non-200 responses.
    """
    html_url = dic['chapter_url']
    # Zero-pad to 5 digits so lexicographic file order == chapter order.
    index = str(dic['index']).zfill(5)
    page_text = None
    async with aiohttp.ClientSession() as session:
        for _ in range(4):
            # The original wrapped session.get() in an extra ``await``; the
            # async context manager already handles that.
            async with session.get(url=html_url, headers=headers) as resp:
                if resp.status != 200:
                    # randrange(5) could yield 0 (no delay at all); always
                    # back off at least 1 second before retrying.
                    await asyncio.sleep(random.randrange(1, 6))
                    continue
                page_text = await resp.text(encoding="utf-8")
                break
    if page_text is None:
        # All retries failed. The original fell through and crashed with a
        # NameError on the unbound response variable; skip the chapter instead.
        print(f"下载失败,跳过: {html_url}")
        return
    selector = parsel.Selector(page_text)
    content = selector.css('#content::text').extract()
    chapter_name = selector.xpath('//div[@class="box_con"]/div[2]/h1/text()').extract_first()
    chapter_content = "".join(x.strip() + '\n' for x in content)
    path = os.path.join(novel_name1, index + ".txt")  # join, not string "+"
    async with aiofiles.open(path, 'w', encoding="utf-8") as f:
        if chapter_name is not None:  # ``is not None`` instead of ``!= None``
            await f.write(chapter_name)
            await f.write("\n")
        await f.write(chapter_content)
        await f.write("\n")
def marge():
    """Merge every chapter file in ``./<novel_name1>/`` into one
    ``<novel_name1>.txt`` inside that directory, then delete the chapter files.

    Fixes over the original:
    - pure-Python concatenation replaces the Windows-only ``copy /b`` shell
      command, which also broke on file names containing spaces;
    - no permanent ``os.chdir()``: the original never restored the cwd, so on
      the next iteration of main()'s loop each new novel directory was
      created *inside* the previous one.

    Reads the module-level global ``novel_name1`` (set in ``main()``).
    """
    chapter_dir = novel_name1
    # Sorted + zero-padded names keep the chapters in reading order.
    chapter_files = sorted(os.listdir(chapter_dir))
    merged_path = os.path.join(chapter_dir, novel_name1 + ".txt")
    with open(merged_path, 'w', encoding="utf-8") as out:
        for name in chapter_files:
            with open(os.path.join(chapter_dir, name), encoding="utf-8") as src:
                out.write(src.read())
    print("+".join(chapter_files))
    # Remove the per-chapter files, keeping only the merged <novel_name1>.txt.
    for name in chapter_files:
        if not name.startswith(novel_name1):
            os.remove(os.path.join(chapter_dir, name))
def novel_chapter_memu(html_url):  # 分析章节的来源 小说目录 (parse the chapter menu)
    """Return a list of ``{'index', 'chapter_url'}`` dicts, one per chapter
    link found on the novel's table-of-contents page (1-based index)."""
    page = get_response(html_url)
    selector = parsel.Selector(page.text)
    hrefs = selector.css('#list dd a::attr(href)').getall()
    # Site links are relative; prefix the host to make them absolute.
    return [
        {'index': position, 'chapter_url': 'https://www.xbiquge.la' + href}
        for position, href in enumerate(hrefs, start=1)
    ]
def search():
    """Interactively search the site and return ``(novel_url, novel_name)``.

    Loops until the user enters a keyword that yields results AND picks a
    valid row number. POSTs the keyword to the site's search endpoint and
    displays the matches as a pandas DataFrame.
    """
    # Request Method: POST, Referer: https://www.xbiquge.la/
    serch_url = 'https://www.xbiquge.la/modules/article/waps.php'
    while True:
        name = input('请输入需要下载的小说名字或者作者名字:')
        print(f'正在查询小说{name.center(50, "-")}')  # int(50) was a no-op
        data = {'searchkey': name}
        resp = requests.post(url=serch_url, headers=headers, data=data)
        resp.encoding = resp.apparent_encoding
        selector = parsel.Selector(resp.text)
        tr_lis = selector.xpath('//div[@id="content"]//tr')[1:]  # skip table header
        dics = []
        for tr in tr_lis:
            dics.append({
                'novel_name': tr.xpath('./td[@class="even"]/a/text()').extract_first(),
                'author_name': tr.xpath('./td[3]/text()').extract_first(),
                'novel_url': tr.xpath('./td[@class="even"]/a/@href').extract_first(),
            })
        df = pd.DataFrame(dics)
        if df.empty:
            print("没有找到,请重新输入")
            continue
        print(df)
        # Original crashed with ValueError on non-numeric input and IndexError
        # on out-of-range selections; re-prompt instead.
        try:
            num = int(input("请选择需要下载的小说:"))
            row = df.iloc[num]
        except (ValueError, IndexError):
            print("输入无效,请重新选择")
            continue
        # Select by column name: iloc[num][2] relied on dict insertion order
        # and positional Series indexing (deprecated in modern pandas).
        return row['novel_url'], row['novel_name']
def main():
    """Drive one full download cycle and repeat until the user declines:
    search -> fetch chapter menu -> download all chapters concurrently ->
    merge into a single file."""
    while True:
        start = datetime.datetime.now()
        # Chapter_content() and marge() read this global for the output dir.
        global novel_name1
        memu_url, novel_name1 = search()
        # exist_ok avoids the check-then-create race of the original
        # os.path.exists() + os.makedirs() pair.
        os.makedirs(novel_name1, exist_ok=True)
        dics = novel_chapter_memu(memu_url)
        # NOTE(review): tqdm here only tracks task *creation*, which is
        # effectively instant — it does not show download progress.
        tasks = [asyncio.ensure_future(Chapter_content(dic)) for dic in tqdm(dics)]
        loop.run_until_complete(asyncio.wait(tasks))
        marge()
        print('耗时', datetime.datetime.now() - start, "秒")
        # Accept lowercase/padded answers too; the original only matched 'Y'.
        if input("是否需要继续下载Y/N:").strip().upper() != 'Y':
            break
if __name__ == '__main__':
    # Create the event loop up front as a module-level global so main() can
    # reuse it via loop.run_until_complete() across its download iterations.
    # NOTE(review): asyncio.get_event_loop() outside a running loop is
    # deprecated since Python 3.10; asyncio.new_event_loop() is the modern form.
    loop = asyncio.get_event_loop()
    main()
# Revision note from the forum post: "加循环 和随机延时 解决问题" — i.e. a retry
# loop and random delay were added to fix failing requests. A second copy of
# the script follows below (forum paste header removed from the code path).
"""
目标网站https://www.xbiquge.la/
#tqdm 进度条库
#parsel 解析库可以使用css 和xpath
"""
import requests,os,datetime
import parsel
from tqdm import tqdm
import pandas as pd
import aiohttp
import aiofiles
import asyncio
# Browser-like headers attached to every request to the site.
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.106 Safari/537.36',
    # Fixed: the original 'https: // www.xbiquge.la /' had stray spaces and
    # was therefore not a valid Referer URL.
    'Referer': 'https://www.xbiquge.la/',
}
def get_response(html_url):
    """Fetch *html_url* and return the Response, charset auto-detected."""
    r = requests.get(url=html_url, headers=headers, timeout=200)
    r.encoding = r.apparent_encoding  # sniff charset so Chinese text decodes
    return r
async def Chapter_content(dic):  # 提取单页的内容 (download one chapter page)
    """Download one chapter, save it as ``<novel_name1>/<index>.txt``, then
    sleep 1 second as a crude rate limit.

    Parameters
    ----------
    dic : dict
        ``{'index': 1-based chapter number, 'chapter_url': absolute page URL}``.

    Reads the module-level global ``novel_name1`` (set in ``main()``) for the
    output directory.
    """
    html_url = dic['chapter_url']
    # Zero-pad to 5 digits so lexicographic file order == chapter order.
    index = str(dic['index']).zfill(5)
    async with aiohttp.ClientSession() as session:
        # The extra ``await`` before session.get() in the original was
        # redundant; the async context manager handles awaiting.
        async with session.get(url=html_url, headers=headers) as resp:
            page_text = await resp.text(encoding="utf-8")
    selector = parsel.Selector(page_text)
    content = selector.css('#content::text').extract()
    chapter_name = selector.xpath('//div[@class="box_con"]/div[2]/h1/text()').extract_first()
    chapter_content = "".join(x.strip() + '\n' for x in content)
    path = os.path.join(novel_name1, index + ".txt")  # join, not string "+"
    async with aiofiles.open(path, 'w', encoding="utf-8") as f:
        if chapter_name is not None:  # ``is not None`` instead of ``!= None``
            await f.write(chapter_name)
            await f.write("\n")
        await f.write(chapter_content)
        await f.write("\n")
    await asyncio.sleep(1)  # throttle so the site does not block us
def marge():
    """Concatenate all chapter files in ``./<novel_name1>/`` into one
    ``<novel_name1>.txt`` in that directory and delete the chapter files.

    Fixes over the original:
    - replaces the Windows-only ``os.system('copy /b ...')`` (which also broke
      on file names with spaces) with portable pure-Python concatenation;
    - drops the permanent ``os.chdir()`` that made every subsequent download
      in main()'s loop nest inside the previous novel's directory.

    Reads the module-level global ``novel_name1`` (set in ``main()``).
    """
    chapter_dir = novel_name1
    names = sorted(os.listdir(chapter_dir))  # padded names sort in reading order
    target = os.path.join(chapter_dir, novel_name1 + ".txt")
    with open(target, 'w', encoding="utf-8") as merged:
        for name in names:
            with open(os.path.join(chapter_dir, name), encoding="utf-8") as part:
                merged.write(part.read())
    print("+".join(names))
    # Keep only the merged <novel_name1>.txt; drop the per-chapter files.
    for name in names:
        if not name.startswith(novel_name1):
            os.remove(os.path.join(chapter_dir, name))
def novel_chapter_memu(html_url):  # 分析章节的来源 小说目录 (parse chapter menu)
    """Build and return the chapter worklist for a novel.

    Returns a list of ``{'index', 'chapter_url'}`` dicts, one per link on the
    table-of-contents page, with a 1-based index."""
    selector = parsel.Selector(get_response(html_url).text)
    relative_links = selector.css('#list dd a::attr(href)').getall()
    worklist = []
    for number, link in enumerate(relative_links, start=1):
        # Links on the page are site-relative; make them absolute.
        worklist.append({
            'index': number,
            'chapter_url': 'https://www.xbiquge.la' + link,
        })
    return worklist
def search():
    """Prompt for a keyword, search the site, and return
    ``(novel_url, novel_name)`` for the row the user picks.

    Re-prompts on empty result sets and on invalid selections.
    """
    # Request Method: POST, Referer: https://www.xbiquge.la/
    serch_url = 'https://www.xbiquge.la/modules/article/waps.php'
    while True:
        name = input('请输入需要下载的小说名字或者作者名字:')
        print(f'正在查询小说{name.center(50, "-")}')  # int(50) was a no-op
        data = {'searchkey': name}
        resp = requests.post(url=serch_url, headers=headers, data=data)
        resp.encoding = resp.apparent_encoding
        selector = parsel.Selector(resp.text)
        tr_lis = selector.xpath('//div[@id="content"]//tr')[1:]  # skip header row
        dics = []
        for tr in tr_lis:
            dics.append({
                'novel_name': tr.xpath('./td[@class="even"]/a/text()').extract_first(),
                'author_name': tr.xpath('./td[3]/text()').extract_first(),
                'novel_url': tr.xpath('./td[@class="even"]/a/@href').extract_first(),
            })
        df = pd.DataFrame(dics)
        if df.empty:
            print("没有找到,请重新输入")
            continue
        print(df)
        # The original crashed with ValueError on non-numeric input and
        # IndexError on out-of-range row numbers; re-prompt instead.
        try:
            num = int(input("请选择需要下载的小说:"))
            row = df.iloc[num]
        except (ValueError, IndexError):
            print("输入无效,请重新选择")
            continue
        # Column-name access replaces the fragile positional iloc[num][2]
        # (positional Series indexing is deprecated in modern pandas).
        return row['novel_url'], row['novel_name']
def main():
    """Run download cycles until the user declines: search -> chapter menu ->
    concurrent chapter downloads -> merge into one file."""
    while True:
        start = datetime.datetime.now()
        # Chapter_content() and marge() read this global for the output dir.
        global novel_name1
        memu_url, novel_name1 = search()
        # exist_ok avoids the check-then-create race of the original
        # os.path.exists() + os.makedirs() pair.
        os.makedirs(novel_name1, exist_ok=True)
        dics = novel_chapter_memu(memu_url)
        # NOTE(review): tqdm only tracks task *creation* here (instant); it
        # does not reflect actual download progress.
        tasks = [asyncio.ensure_future(Chapter_content(dic)) for dic in tqdm(dics)]
        loop.run_until_complete(asyncio.wait(tasks))
        marge()
        print('耗时', datetime.datetime.now() - start, "秒")
        # Accept lowercase/padded answers; the original only matched 'Y'.
        if input("是否需要继续下载Y/N:").strip().upper() != 'Y':
            break
if __name__ == '__main__':
    # The loop is a module-level global reused by main() for every
    # loop.run_until_complete() call.
    # NOTE(review): asyncio.get_event_loop() outside a running loop is
    # deprecated since Python 3.10; prefer asyncio.new_event_loop().
    loop = asyncio.get_event_loop()
    main()
# (end of forum paste)