import requests
from bs4 import BeautifulSoup
#爬取三国演义的所有章节和内容
if __name__ == '__main__':
    # Scrape every chapter listed on the table-of-contents page and append
    # each one to ./sanguo.txt as "<title>:<content>".

    # Imported locally: this script block is the only user of urljoin.
    from urllib.parse import urljoin

    # Send a browser-like User-Agent with every request.
    headers = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/87.0.4280.88 Safari/537.36"
    }

    # Table-of-contents page that links to every chapter.
    url = "https://www.shicimingju.com/book/sanguoyanyi.html"
    # A timeout keeps a stalled connection from hanging the script forever.
    index_response = requests.get(url=url, headers=headers, timeout=30)
    # Fail fast: without the index page there is nothing to scrape.
    index_response.raise_for_status()

    # Feed BeautifulSoup the raw bytes so it detects the charset from the
    # document itself; `.text` depends on requests' header-based guess, which
    # may fall back to ISO-8859-1 and garble non-Latin text.
    soup = BeautifulSoup(index_response.content, "lxml")

    # Each <li> under .book-mulu holds one chapter link.
    li_list = soup.select(".book-mulu > ul > li")
    print(li_list)

    # The context manager guarantees the file is flushed and closed even if
    # a request or a parse step raises midway through the loop.
    with open("./sanguo.txt", "w", encoding="UTF-8") as fp:
        for li in li_list:
            link = li.a
            # Skip list items that carry no usable chapter link.
            if link is None or not link.get("href"):
                continue

            # get_text() always returns a str, whereas .string is None when
            # the <a> contains nested tags.
            title = link.get_text()
            # urljoin resolves root-relative, relative and absolute hrefs.
            detail_url = urljoin(url, link["href"])

            # Fetch the chapter page and locate its content container.
            detail_response = requests.get(url=detail_url, headers=headers, timeout=30)
            detail_soup = BeautifulSoup(detail_response.content, "lxml")
            content_tag = detail_soup.find("div", class_="chapter_content")
            if content_tag is None:
                # Error page or changed layout: report it and keep going
                # instead of crashing on `None.text`.
                print(title, "未找到章节内容,已跳过")
                continue

            fp.write(title + ":" + content_tag.text + "\n")
            print(title, "爬取成功")