本帖最后由 luoshiyong123 于 2020-5-2 00:20 编辑
V1.0
--- 抓取笔趣阁网站上的小说,用 xpath 解析 HTML ---
--- 已知问题:后面的章节目录不见了,尴尬 ---
[Python] 纯文本查看 复制代码 import requests
import unicodedata
from lxml import etree
headers = {
'User-Agent':'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:75.0) Gecko/20100101 Firefox/75.0'
}
num = 10 #下载多少章
first = 'https://www.bqg5.cc/1_1273/'
numfirst = 669621
last = '.html'
for i in range(0,num):
url = first+str(numfirst+3*i)+last
res = requests.get(url = url,headers=headers)
if res.status_code==200:
print('请求成功!')
html = etree.HTML(res.text)
data = html.xpath('/html/body/div[@id="wrapper"]/div[@class="content_read"]/div[@class="box_con"]/div[@id="content"]/p/text()')
mystr = '\n'.join(data)
print(mystr)
if i==0:
with open('C:/Users/lsy/Desktop/1.txt','w') as fp:
fp.write(mystr)
else:
with open('C:/Users/lsy/Desktop/1.txt','a') as fp:
fp.write(mystr)
V1.1
经过大神们的回复,目前已修复“章节不存在”的 bug;但仍有两个问题:下载速度不够快,以及下到 500 多章后程序无响应。
[Python] 纯文本查看 复制代码 import requests
import unicodedata
from lxml import etree
headers = {
'User-Agent':'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:75.0) Gecko/20100101 Firefox/75.0'
}
mulu_url = 'https://www.bqg5.cc/5_5157/'
main_url='https://www.bqg5.cc'
rep = requests.get(url = mulu_url,headers=headers)
if rep.status_code==200:
print('请求目录页成功!')
mulu_html = etree.HTML(rep.text)
mulu_array= mulu_html.xpath('//div[@class="box_con"]//dl/dd/a/@href')
num = len(mulu_array) #章节一共有多少章
for i in range(9,num):
#rint('i='+str(i))
xiazai_url = main_url+mulu_array[i]
#print(xiazai_url)
res = requests.get(url = xiazai_url,headers=headers)
if res.status_code==200:
print('请求成功!')
print('正在下载',i-8)
html = etree.HTML(res.text)
data = html.xpath('/html/body/div[@id="wrapper"]/div[@class="content_read"]/div[@class="box_con"]/div[@id="content"]/p/text()')
mystr = '\n'.join(data)
#print(mystr)
if i==9:
with open('C:/Users/lsy/Desktop/蛊真人1.txt','w') as fp:
fp.write(mystr)
else:
with open('C:/Users/lsy/Desktop/蛊真人1.txt','a') as fp:
fp.write(mystr)
else:
print('请求失败!'+xiazai_url) |