# [Python] View as plain text / Copy code  (forum listing header, not part of the script)
import requests
import time
from bs4 import BeautifulSoup
def get_content(url, timeout=10):
    """Download one Tieba thread-list page and summarise every thread on it.

    Args:
        url: Address of the thread-list page to fetch.
        timeout: Seconds passed to ``requests.get`` as its timeout, so a
            stalled connection cannot block the crawl indefinitely.

    Returns:
        A list of dicts, one per thread that could be parsed completely,
        each with the keys ``title``, ``link``, ``name``, ``time`` and
        ``replyNum``.  Threads missing any of the expected elements are
        reported on stdout and skipped.

    Exceptions raised by ``requests.get`` are not caught here and propagate
    to the caller.
    """
    # Imported locally so the function does not depend on a file-level import.
    from urllib.parse import urljoin

    response = requests.get(url, timeout=timeout)
    soup = BeautifulSoup(response.text, 'lxml')

    # Candidate thread entries: <li> tags selected by the
    # 'j_thread_list' / 'clearfix' class filter.
    thread_items = soup.find_all(
        'li', attrs={"class": ['j_thread_list', 'clearfix']})

    comments = []
    for li in thread_items:
        try:
            # The title anchor carries both the thread title and its link,
            # so look it up once and reuse it.
            title_anchor = li.find('a', attrs={"class": ['j_th_tit']})
            comment = {
                'title': title_anchor.text.strip(),
                # urljoin yields a well-formed absolute URL whether the href
                # is site-relative (leading '/') or already absolute.
                'link': urljoin('https://tieba.baidu.com/',
                                title_anchor['href']),
                'name': li.find(
                    'span', attrs={"class": ['tb_icon_author']}
                ).text.strip(),
                'time': li.find(
                    'span',
                    attrs={"class": ['pull-right is_show_create_time']}
                ).text.strip(),
                'replyNum': li.find(
                    'span',
                    attrs={"class": ['threadlist_rep_num center_text']}
                ).text.strip(),
            }
        except (AttributeError, KeyError):
            # AttributeError: find() returned None for a missing element.
            # KeyError: the title anchor has no href attribute.
            # Best effort: report the problem and move on to the next entry.
            print('出了点小问题')
        else:
            comments.append(comment)
    return comments
def Out2File(comments):
    """Append the scraped thread records to ``TTBT.txt``.

    The file is opened in the current working directory in append mode with
    UTF-8 encoding.  Each record in ``comments`` must provide the keys
    ``title``, ``link``, ``name``, ``time`` and ``replyNum``; one
    tab-separated line is written per record.  A completion message is
    printed once all records have been written.
    """
    # Keys in the order they appear in the output line.
    fields = ('title', 'link', 'name', 'time', 'replyNum')
    row_template = '标题:{} \t 链接:{} \t 发帖人:{} \t 发帖时间:{} \t 回复数量:{} \n'

    with open('TTBT.txt', 'a+', encoding='utf-8') as out:
        for record in comments:
            values = [record[key] for key in fields]
            out.write(row_template.format(*values))
        print('当前页面爬取完成')
def main(base_url, deep):
    """Crawl ``deep`` listing pages starting from ``base_url`` and save them.

    Page ``i`` (counting from zero) is addressed by appending
    ``&pn=<50 * i>`` to ``base_url``.  Every page is fetched with
    ``get_content``, echoed to stdout, appended to disk with ``Out2File``
    and followed by a five-second pause before the next request.
    """
    # Build the complete list of page URLs before any request is made.
    page_urls = [base_url + '&pn=' + str(50 * page) for page in range(deep)]

    for page_url in page_urls:
        print(f"开始爬取:{page_url}")
        threads = get_content(page_url)
        print(threads)
        Out2File(threads)
        # Pause between pages.
        time.sleep(5)

    print('所有的信息都已经保存完毕!')
# Thread-list URL that the page offset ("&pn=...") is appended to.
base_url = 'https://tieba.baidu.com/f?ie=utf-8&kw=亚运会'
# Number of listing pages to crawl.
deep = 3

if __name__ == '__main__':
    main(base_url=base_url, deep=deep)
# | (stray character left over from the forum paste)