"""Scrape joke posts ("段子") from qiushibaike.com and print their text.

Prompts the user for a page count, walks the recommendation listing pages,
follows each post link, and prints the post body. Best-effort: a post whose
content cannot be extracted prints an error line instead of aborting the run.
"""
import requests
from lxml import etree

# Browser-like User-Agent so the site serves normal pages instead of blocking the bot.
HEADERS = {"User-Agent": "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/86.0.4240.198 Safari/537.36"}

BASE_URL = "https://www.qiushibaike.com"

# Seconds to wait for a response before giving up; without this a stalled
# connection would hang the script forever.
REQUEST_TIMEOUT = 10


def build_page_urls(page_count):
    """Return the listing-page URLs for pages 1..page_count (inclusive)."""
    return [BASE_URL + "/8hr/page/" + str(n) for n in range(1, page_count + 1)]


def scrape_listing_page(url):
    """Fetch one listing page and print the body text of every linked post.

    Failures to extract a single post's content are reported and skipped so
    the remaining posts are still printed.
    """
    page_html = requests.get(url, headers=HEADERS, timeout=REQUEST_TIMEOUT).text
    tree = etree.HTML(page_html)
    # Each recommended post card exposes a relative href to its detail page.
    for href in tree.xpath('//div//a[@class="recmd-content"]/@href'):
        post_url = BASE_URL + href
        # Fix: the original omitted headers on this request, so the detail
        # fetch went out without the browser UA and could be rejected.
        detail_html = requests.get(post_url, headers=HEADERS, timeout=REQUEST_TIMEOUT).text
        detail_tree = etree.HTML(detail_html)
        content_nodes = detail_tree.xpath("//div[@class='content']")
        try:
            # .text yields only the first text node of the div — sufficient
            # for plain-text posts; richer markup would need itertext().
            print(content_nodes[0].text)
        except IndexError:
            # Narrowed from `except Exception`: the only expected failure
            # here is an empty xpath result. Keep the original best-effort
            # behavior — report and continue with the next post.
            print("错误:糗百君的飞船出了一点小毛病……")


def main():
    """Ask the user how many pages to scrape, then scrape them in order."""
    page_count = int(input("请输入您需要的页数:"))
    for url in build_page_urls(page_count):
        scrape_listing_page(url)


if __name__ == "__main__":
    main()