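# 本帖最后由 nightcat 于 2020-8-30 21:21 编辑  (forum note: post last edited by nightcat on 2020-8-30 21:21)
# [Python] 纯文本查看 复制代码  (forum code-box header: "view as plain text / copy code")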
from urllib.parse import urljoin

import lxml.etree
import requests
from fake_useragent import UserAgent
from pyquery import PyQuery as pq
# Chrome user-agent string obtained from fake_useragent, computed once at import time.
# NOTE(review): verify_ssl=False presumably relaxes certificate checks while the
# library loads its browser data — confirm against the installed fake_useragent version.
ua = UserAgent(verify_ssl=False).chrome
# Request headers passed to requests.get by start_url and content.
header = {'user-agent': ua}
def start_url(url, timeout=30):
    """Download one chapter and append all of its text to ``<title>.txt``.

    The page at ``url`` supplies the title (``#content h1``) and the first
    piece of text (``#cont-text``).  Every ``href`` matched by
    ``//*[@id="content"]/div[3]/ul/li/a/@href`` (presumably the links to the
    chapter's remaining pages) is then fetched through ``content`` and its
    text appended to the same file via ``download_file``.

    Args:
        url: Address of the chapter's first page.
        timeout: Seconds to wait for the first-page request before
            ``requests`` raises a timeout error instead of blocking forever.
    """
    # Literal that is stripped from the extracted text; it looks like an
    # inline script call that leaks into the element's text content.
    script_residue = r'DaoCaoRen.getCode("ui-content");'

    response = requests.get(url, headers=header, timeout=timeout)
    selector = lxml.etree.HTML(response.text)

    # Parse the document once and reuse it for both CSS queries.
    page = pq(response.content)
    title = page("#content h1").text()
    body = page('#cont-text').text().replace(script_residue, '')

    # The title comes from the remote page; keep path separators out of the
    # file name so it cannot point into another directory.
    safe_title = title.replace('/', '_').replace('\\', '_')
    file = f'{safe_title}.txt'
    download_file(file, body, title)

    for href in selector.xpath('//*[@id="content"]/div[3]/ul/li/a/@href'):
        # Resolve the link against the page it was found on instead of a
        # hard-coded book directory, so other books and absolute links work.
        new_url = urljoin(url, str(href))
        new_body = content(new_url).replace(script_residue, '')
        download_file(file, conttext=new_body)
def content(url, timeout=30):
    """Fetch ``url`` and return the text of its ``#cont-text`` element.

    Args:
        url: Address of the page to download.
        timeout: Seconds to wait for the server before ``requests`` raises
            a timeout error instead of blocking forever.

    Returns:
        The text PyQuery extracts from the ``#cont-text`` selection.
    """
    response = requests.get(url, headers=header, timeout=timeout)
    return pq(response.content)('#cont-text').text()
def download_file(file, conttext, title=None):
    """Append text to ``file``, optionally preceded by a title line.

    Args:
        file: Path of the output file; it is created if it does not exist.
        conttext: Text appended as-is (no newline is added after it).
        title: When truthy, written on its own line before ``conttext``.
    """
    # Append-only access is enough: nothing is ever read back from the file.
    with open(file, 'a', encoding='utf-8') as f:
        if title:
            f.write(f'{title}\n')
        # Always written, including calls that pass no title.
        f.write(conttext)
if __name__ == '__main__':
    # Numeric page ids to download; the stop value is exclusive, so the
    # last page requested is 1128910.
    first_id, stop_id = 1128871, 1128911
    for page_id in range(first_id, stop_id):
        start_url(f"https://www.daocaorenshuwu.com/book/yinhezhixin2/{page_id}.html")
|