I'm new to Python web scraping. My first project is a novel downloader for biquge (笔趣阁); I feel there is still a lot that could be improved, so criticism from the more experienced folks here is very welcome.
# -*- coding: utf-8 -*-
import requests
from bs4 import BeautifulSoup
import time
# Prefix of the download URL. '944' is the book ID of 《剑来》; to download another book,
# look up its ID on the site and swap it in here (a parameterized sketch follows the script)
first_url = 'https://www.bqg99.com/book/944/'
basic_url = {
    'whole': None,
    'index': 1
}
custom_header = {
    'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/107.0.0.0 Safari/537.36 Edg/107.0.1418.35'
}
# Output path for the downloaded novel
output = 'D:/Data/novel.txt'
# Build the link of the next chapter: pages are numbered 1.html, 2.html, ... under the book URL
def update_link(url_dict):
    url_dict['whole'] = first_url + str(url_dict['index']) + '.html'
    url_dict['index'] = url_dict['index'] + 1
    print(url_dict['whole'])
    return url_dict
# Fetch the page via a GET request and extract the chapter text
def fetch_text(url_dict, request_header):
    data = requests.get(url=url_dict['whole'], headers=request_header, allow_redirects=False)
    print('status = %d' % data.status_code)
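    # With allow_redirects=False, a 302 response means the requested chapter page
    # does not exist, so it is treated as the end of the book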
    if data.status_code == 302:
        return None
    else:
        data = BeautifulSoup(data.text, 'lxml')
        article = data.find(name='div', class_='content')
        chapter_topic = article.h1.text
        content_soup = article.find(name='div', id='chaptercontent', class_='Readarea ReadAjax_content')
        content_soup.p.decompose()  # remove the redundant "previous chapter" / "next chapter" navigation links
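        # stripped_strings yields every text fragment inside the div with the
        # surrounding whitespace removed, skipping blank strings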
        chapter_words = content_soup.stripped_strings
        chapter = {
            'topic': chapter_topic,
            'content': chapter_words
        }
        return chapter
def main():
    novel = open(file=output, mode='a+', encoding='utf8')
    link = update_link(basic_url)
    text = fetch_text(url_dict=link, request_header=custom_header)
    while text is not None:
        novel.write(text['topic'])
        novel.write('\n')
        for line in text['content']:
            novel.write(line)
            novel.write('\n')
        novel.write('\n\n')
        time.sleep(1)  # pause for 1 second so the server does not reject us, though this site does not seem to have any anti-scraping measures
        link = update_link(link)
        text = fetch_text(url_dict=link, request_header=custom_header)
    novel.close()
if __name__ == '__main__':
    main()
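For reference, here is a minimal parameterized sketch of the same flow, so other books can be downloaded without editing the constants at the top. The bqg99 URL pattern and the 302 end-of-book check are taken from the script above; the download_book name, the requests.Session, the timeout and the with-block are only suggestions/assumptions, not part of the original code.

def download_book(book_id, out_path, delay=1.0):
    # Sketch only: assumes the same page layout as the script above
    session = requests.Session()
    session.headers['user-agent'] = custom_header['user-agent']  # reuse the header defined above
    index = 1
    with open(out_path, mode='a+', encoding='utf8') as novel:
        while True:
            url = 'https://www.bqg99.com/book/%s/%d.html' % (book_id, index)
            resp = session.get(url, allow_redirects=False, timeout=10)
            if resp.status_code == 302:  # redirect: past the last chapter, stop
                break
            article = BeautifulSoup(resp.text, 'lxml').find(name='div', class_='content')
            content = article.find(name='div', id='chaptercontent')
            content.p.decompose()  # drop the prev/next navigation links
            novel.write(article.h1.text + '\n')
            for line in content.stripped_strings:
                novel.write(line + '\n')
            novel.write('\n\n')
            time.sleep(delay)  # stay polite to the server
            index += 1

# e.g. download_book('944', 'D:/Data/novel.txt')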