最近同学搞对象了,总是在群里抱怨没有话说,不会讲土味情话。虽然我知道他是在装X,但是身为大哥还是要帮他一把,于是我从语录网站上收集了各种土味情话和励志正能量送给他。
废话不多说,上代码!
展示!
import requests
import re
from bs4 import BeautifulSoup
# Pretend to be a desktop Edge browser so the site serves normal pages.
headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/92.0.4515.131 Safari/537.36 Edg/92.0.902.73'}
# Pagination URLs generated for the chosen category.
page_list = []
# Raw article links scraped from list pages, awaiting cleanup.
clear_url_list = []
# Final '<id>.html' fragments that are actually downloaded.
end_url_list = []
# Running counter of downloaded articles (shared by get_speak).
page_brain = 1
# Category index URLs: inspirational, love, funny, life, emotion,
# classic, sad, famous quotes, mood.
lzyl = 'http://www.yuluju.com/lizhimingyan/'
aqyl = 'http://www.yuluju.com/aiqingyulu/'
gxyl = 'http://www.yuluju.com/gaoxiaoyulu/'
rsyl = 'http://www.yuluju.com/renshenggeyan/'
qgyl = 'http://www.yuluju.com/qingganyulu/'
jdyl = 'http://www.yuluju.com/jingdianyulu/'
sgyl = 'http://www.yuluju.com/shangganyulu/'
mryl = 'http://www.yuluju.com/mingrenmingyan/'
xqyl = 'http://www.yuluju.com/xinqingyulu/'
def main(url):
    """Fetch one article page and append its quotes to '保存的语录.txt'.

    Fixes the encoding failure the original reported: the site's pages are
    often mis-labelled, so requests falls back to ISO-8859-1 and the Chinese
    text is garbled; worse, the output file was opened with
    encoding='ISO-8859-1', which raises UnicodeEncodeError on any Chinese
    character.  Decode with the sniffed charset and write UTF-8 instead.
    """
    response = requests.get(url, headers=headers)
    # Let requests sniff the real charset instead of trusting the headers.
    response.encoding = response.apparent_encoding
    bsObj = BeautifulSoup(response.text, 'html.parser')
    content = bsObj.find_all('div', {'class': 'content'})
    # Each quote sits in a numbered <span>; capture the text after the number.
    save_content = re.findall(r'<span style="font-size:14px;">\d+(.*)</span></div>', str(content))
    # UTF-8 can represent the Chinese quotes; ISO-8859-1 cannot.
    with open('保存的语录.txt', 'a', encoding='utf-8') as f:
        for quote in save_content:
            f.write(quote + '\n')
def get_speak(class_url, page1):
    """Crawl every page of one quote category and download all its articles.

    class_url -- category index URL (ends with '/'), e.g. .../lizhimingyan/
    page1     -- the site's numeric list id used in 'list_<id>_<n>.html'

    Uses the module-level page_list / clear_url_list / end_url_list as
    scratch space and page_brain as a running download counter.
    """
    global page_brain
    res = requests.get(class_url, headers=headers).text
    # The pager footer holds "<total pages> ... <total items>".
    num = re.findall(
        r'<li><span class="pageinfo">.*<strong>(.*)</strong>.*<strong>(.*)</strong>.*</span></li>',
        res)
    pages = num[0][0]
    print('共有' + pages + '页' + num[0][1] + '条')
    print('正在提取链接\n')
    # rstrip('/') avoids the double slash the original URLs contained.
    base = class_url.rstrip('/')
    for page in range(1, int(pages) + 1):
        page_list.append(base + '/list_' + str(page1) + '_' + str(page) + '.html')
    # Collect the article links from every list page.
    for list_page in page_list:
        get_page_content(list_page)
    # Reduce each absolute article link to its trailing '<id>.html' part.
    for link_group in clear_url_list:
        for link in link_group:
            end_content = re.findall(r'/.*/(\d*\.html)', link)
            if end_content:  # guard: skip links that don't match the pattern
                end_url_list.append(end_content)
    # Download every article via the scraper above.
    for not_url in end_url_list:
        print('正在下载第%s条' % page_brain)
        url = class_url + not_url[0]
        try:
            main(url)
            print('第%s页下载完成\n' % page_brain)
        except Exception as error:
            # Bug fix: the original omitted the '% page_brain' argument here.
            print('第%s页下载失败,原因是:\n' % page_brain)
            print(error)
        page_brain += 1
    print('结束!请在此文件目录下查看保存的文件!')
def get_page_content(page_url):
    """Scrape one list page and stash the article links it contains.

    Appends the list of matched hrefs (possibly empty) to the module-level
    clear_url_list for later cleanup in get_speak.
    """
    html = requests.get(page_url, headers=headers).text
    links = re.findall(r'<h2><a href="(.*)" class="title" target="_blank">.*</a></h2>', html)
    clear_url_list.append(links)
def st():
    """Show the category menu and dispatch to the crawler.

    Replaces the original nine-arm if/elif chain with a lookup table and the
    unbounded recursion on bad input with a loop (repeated wrong input no
    longer grows the call stack).  Each entry maps the menu number to the
    category URL and the site's numeric list id.
    """
    menu = {
        '1': (lzyl, 1),
        '2': (aqyl, 18),
        '3': (gxyl, 19),
        '4': (rsyl, 14),
        '5': (qgyl, 23),
        '6': (jdyl, 12),
        '7': (sgyl, 21),
        '8': (mryl, 2),
        '9': (xqyl, 22),
    }
    while True:
        print('需要什么语录?')
        print('1、励志语录\n2、爱情语录\n3、搞笑语录\n4、人生语录\n5、情感语录\n6、经典语录\n7、伤感语录\n8、名人语录\n9、心情语录\n')
        ss = input('请输入序号:')
        if ss in menu:
            class_url, page1 = menu[ss]
            print(class_url)
            get_speak(class_url, page1)
            return
        print('输入有误,请重新输入!')
# Script entry point: show the menu and start scraping.
if __name__ == '__main__':
    st()
在下学艺不精,代码冗长沉重,希望朋友们能给点建议。
未解决的问题:有的页面会因为编码问题出错导致下载失败,可能是网页声明的编码与实际编码不一致(或响应被压缩)所致,希望大家可以给出解决方案。