爬取吾爱论坛上自己的帖子

mxwawaawxm 发表于 2019-3-10 20:42

本帖最后由 mxwawaawxm 于 2019-3-11 12:49 编辑

运行前，需要在代码中 HEADERS 字典的 'Cookie' 字段（即原帖截图中光标所在的位置）填入自己在吾爱破解论坛上的 Cookie，才能运行。

代码目前写得还很差，思路缺乏条理性，以后再改进。
本人是小白，请各位大佬指点。
#!/usr/bin/env python3
#coding:utf-8

import requests
from lxml import etree
from functools import reduce

# HTTP headers sent with every request made by this script.
HEADERS = {
    # Must be filled in with the user's own forum cookie before running.
    'Cookie': '',
    'User-Agent': (
        'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:64.0) '
        'Gecko/20100101 Firefox/64.0'
    ),
}
# Page number at which the crawl of the thread list starts.
PAGE_INDEX = 1

def get_text(url, timeout=10):
    """Download ``url`` using the module-level ``HEADERS``.

    Args:
        url: Address of the page to fetch.
        timeout: Seconds to wait for the server before giving up, so a
            stalled connection cannot hang the crawl indefinitely.

    Returns:
        The response body as text when the server answers with HTTP 200;
        ``None`` for any other status code or when the request fails.
    """
    try:
        response = requests.get(url, headers=HEADERS, timeout=timeout)
    except requests.RequestException:
        # RequestException is the base class of ConnectionError, Timeout,
        # etc., so every transport-level failure is reported the same way.
        print('获取网页{}出错'.format(url))
        return None
    if response.status_code == 200:
        return response.text
    return None

def get_my_topic_dict():
    """Collect the current user's threads, grouped by board.

    Pages of the "my threads" guide view are requested one after another,
    starting at ``PAGE_INDEX``. Crawling stops when a page reports that
    there are no threads, when a page cannot be fetched, or when a page
    contains no thread rows at all.

    The module-level ``PAGE_INDEX`` is only read, never modified, so the
    function can be called repeatedly with the same result.

    Returns:
        A dict keyed by board name. Each value is a flat list in which
        every thread contributes four consecutive strings: title, creation
        time, URL and reply/view counts.
    """
    my_topic_dict = {}
    page_index = PAGE_INDEX
    while True:
        url = r'https://www.52pojie.cn/forum.php?mod=guide&view=my&type=thread&page={}'.format(page_index)
        text = get_text(url)
        if not text:
            # The fetch failed; retrying the same page in this loop would
            # never terminate, so return what has been collected so far.
            return my_topic_dict

        html = etree.HTML(text)
        if html.xpath('string(//div[@class="bm_c"]/table//tr)') == '暂时还没有帖子':
            return my_topic_dict

        my_topic = html.xpath('//tbody//th[@class="common"]')
        if not my_topic:
            # Neither thread rows nor the "no threads" marker were found.
            # Stop instead of paging forever.
            # NOTE(review): presumably a login page caused by a missing or
            # expired cookie; confirm against the live site.
            return my_topic_dict

        for each in my_topic:
            # Board the thread belongs to; used as the dict key.
            my_topic_board = each.xpath('string(./following-sibling::td[@class="by"]/a)')
            # Thread title.
            my_topic_title = each.xpath('string(./a[@class="xst"])')
            # Thread address. string() yields the first matching href as
            # text (or '' when absent) rather than a list of matches.
            my_topic_url = '帖子地址：https://www.52pojie.cn/{}'.format(each.xpath('string(./a[@class="xst"]/@href)'))
            # Thread creation time.
            my_topic_create_time = r'发帖时间：{}'.format(each.xpath('string(./following-sibling::td[@class="by"]//span)'))
            # Reply / view counts.
            my_topic_reply_view = r'回复/查看：{}/{}'.format(each.xpath('string(./following-sibling::td[@class="num"]//a)'), each.xpath('string(./following-sibling::td[@class="num"]//em)'))

            # Four fields per thread, in the order print_my_topic expects.
            my_topic_list = [
                my_topic_title,
                my_topic_create_time,
                my_topic_url,
                my_topic_reply_view,
            ]
            # Append the fields to this board's list, creating it on first use.
            my_topic_dict.setdefault(my_topic_board, []).extend(my_topic_list)
        page_index += 1


def print_my_topic(my_topic_dict):
    """Print a summary of the collected threads, board by board.

    Args:
        my_topic_dict: Mapping of board name to a flat list holding four
            consecutive strings per thread (title, creation time, URL,
            reply/view counts).

    The output starts with the total number of threads and boards. For each
    board it then prints the board name with its thread count, followed by
    the four fields of every thread, one per line, and a separator line.
    """
    fields_per_topic = 4

    # sum() copes with an empty dict, unlike reduce() without an initial value.
    total_topics = sum(
        len(my_topics) // fields_per_topic
        for my_topics in my_topic_dict.values()
    )
    print('发帖总数--{}，发帖的版块数--{}\n\n'.format(total_topics, len(my_topic_dict)))

    for my_topic_board, my_topics in my_topic_dict.items():
        my_topic_num = len(my_topics) // fields_per_topic
        print('{}，发帖数--{}'.format(my_topic_board, my_topic_num))
        for start in range(0, len(my_topics), fields_per_topic):
            # Slicing never raises, even if the last group is incomplete.
            print(*my_topics[start:start + fields_per_topic], sep='\n')
            print('¨¨¨¨¨¨¨¨¨¨¨¨¨¨'*4)
        print('\n'*3)

def main():
    """Collect the user's threads and print the per-board summary."""
    my_topic_dict = get_my_topic_dict()
    print_my_topic(my_topic_dict)


# Run the crawler only when the file is executed as a script.
if __name__ == '__main__':
    main()

打印结果在cmd显示如下

导演发表于 2019-3-10 20:57

我都是用面向对象写的感觉我的复杂

先有我后有天 发表于 2019-3-10 21:08

我突发奇想想偷自己的钱

mxwawaawxm 发表于 2019-3-11 12:46

导演发表于 2019-3-10 20:57
我都是用面向对象写的感觉我的复杂

可以发出来让我学习一下。用类来写，我总觉得思维转换不过来，还是习惯面向过程的写法。

雷晨发表于 2020-5-15 15:41

是不是可以爬该板块下的所有帖子？下载下来之后是什么文件呢？有成品软件吗？吾爱很多帖子不错想爬下来保存到本地

页: [1]

吾爱破解 - 52pojie.cn's Archiver

爬取吾爱论坛上自己的帖子