# Forum note: this post was last edited by 苏紫方璇 on 2024-1-21 16:36.
# [Python] listing copied from a forum code box (toolbar text: "view as plain text" / "copy code").
import re
import time
import lxml
from bs4 import BeautifulSoup
import requests
def get_description(book_url, headers=None):
    """Fetch one chapter page, print its text, and return the next chapter URL.

    Args:
        book_url: URL of the chapter page on www.qidian.com. Its last path
            segment is used as the chapter id, e.g.
            ``https://www.qidian.com/chapter/<book>/<chapter>/``.
        headers: Despite the name, this is the URL string sent as the
            ``Referer`` request header. ``None`` sends no Referer.

    Returns:
        The ``https://`` URL taken from the page's "下一章" (next chapter)
        navigation link, or ``None`` when the page has no usable such link.

    Raises:
        ValueError: If the chapter body markup is not found in the response.
        requests.RequestException: If the request fails, times out, or the
            server answers with an HTTP error status.
    """
    # Drop any trailing slash first so the chapter id is found whether or not
    # the URL ends with "/".
    text_id = book_url.rstrip('/').split('/')[-1]
    print(text_id)

    # Built step by step so the Referer keeps its position between Host and
    # User-Agent and is simply left out when the caller gave none.
    request_headers = {'Host': "www.qidian.com"}
    if headers is not None:
        request_headers["Referer"] = headers
    request_headers["User-Agent"] = "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/119.0.0.0 Safari/537.36"

    # Request the page named by the argument (not a module-level variable),
    # and never wait forever on a stalled connection.
    resp = requests.get(book_url, headers=request_headers, timeout=10)
    resp.raise_for_status()

    # The chapter text sits inside <main id="c-<chapter id>" data-type="cjk">,
    # starting at its first <p>.
    pattern = r'<main id="c-{0}" data-type="cjk".*?<p>(.*?)</main>'.format(
        re.escape(text_id)
    )
    match = re.search(pattern, resp.text)
    if match is None:
        raise ValueError(
            'chapter body not found in response for {0}'.format(book_url)
        )
    description = match.group(1)
    print(description)

    soup = BeautifulSoup(resp.text, 'lxml')
    for link in soup.find_all('a', class_='nav-btn'):
        print(link.text)
        if link.text != '下一章':
            continue
        # Keep everything after the first "//" and force the https scheme, so
        # both "//host/path" and "scheme://host/path" hrefs are accepted.
        _, separator, remainder = link.get('href', '').partition('//')
        if separator:
            return 'https://' + remainder

    # No next-chapter link on this page.
    return None
if __name__ == '__main__':
    # Walk forward through at most 10 chapters, starting from this one.
    url = 'https://www.qidian.com/chapter/1038476773/774184394/'
    for i in range(10):
        if i == 0:
            # The first request presents the book-level URL as its Referer.
            headers = 'https://www.qidian.com/chapter/1038476773'
        else:
            # NOTE(review): this sends the URL about to be requested as its
            # own Referer; the previous chapter's URL may have been intended.
            headers = url
        url = get_description(url, headers=headers)
        if not url:
            # No next-chapter URL came back, so there is nothing left to fetch.
            break
        # Pause between requests (presumably to avoid hammering the server).
        time.sleep(3)