初学 Python,小破网站勿压,还请手下留情。
代码如下:
import requests
from bs4 import BeautifulSoup

# Download a novel from m.book.wiiv.cn chapter by chapter and save it to a
# local "<title>.txt" file. The user supplies the numeric novel ID that
# appears in the site's URLs (e.g. /novel/51702.html -> 51702).

print('打开book.wiiv.cn网站随便点击一本小说.../novel/51702.html,这个数字就是ID')
url_id = input('输入小说ID:')
url = 'http://m.book.wiiv.cn/other/chapters/id/' + url_id + '.html'
headers = {'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/100.0.4896.88 Safari/537.36',}

# Fetch the chapter-list page. A timeout keeps a stalled server from
# hanging the script forever (the original request had none).
page_text = requests.get(url=url, headers=headers, timeout=30).text
soup = BeautifulSoup(page_text, 'lxml')

# Each <li> under ".xb-all-wrap > ul" holds one chapter link (<a>).
li_list = soup.select('.xb-all-wrap > ul > li')

# Guard against a bad ID: without this the original crashed with an
# opaque TypeError when concatenating None + '.txt'.
if soup.h2 is None or soup.h2.string is None:
    raise SystemExit('未找到小说名,请检查输入的ID是否正确')
book_title = soup.h2.string  # novel title taken from the page's <h2>
fileName = book_title + '.txt'

# "with" guarantees the file is closed even if a request fails mid-loop;
# the original opened fp and never closed it.
with open(fileName, 'w', encoding='utf-8') as fp:
    for li in li_list:
        title = li.a.string  # chapter title (text of the <a> tag)
        detail_url = 'http://m.book.wiiv.cn' + li.a['href']  # chapter page URL
        # Fetch and parse the chapter detail page.
        detail_page = requests.get(url=detail_url, headers=headers, timeout=30).text
        detail_soup = BeautifulSoup(detail_page, 'lxml')
        div_tag = detail_soup.find('section', class_='read-section jsChapterWrapper')
        if div_tag is None:
            # Missing body section: skip this chapter instead of crashing
            # with AttributeError on div_tag.text.
            print(title, '章节内容缺失,已跳过')
            continue
        # .text already returns str, so no str() conversion is needed.
        content = div_tag.text
        fp.write(title + ':' + content + '\n')
        print(title, '爬取成功!!!')
|