本帖最后由 Lobolang 于 2019-8-14 22:24 编辑
由于刚学没多久，技术有限，代码中难免有不少错误，望各位师兄师姐多多指点。
import threading
from queue import Queue
from urllib.parse import quote

import requests
from lxml import etree
# Request headers sent with every page fetch. The User-Agent identifies the
# client as Chrome on an Android phone.
# NOTE(review): presumably chosen so the site serves the markup the XPath
# queries in the spider expect — confirm.
headers = {
    'User-Agent': (
        'Mozilla/5.0 (Linux; Android 6.0; Nexus 5 Build/MRA58N) '
        'AppleWebKit/537.36 (KHTML, like Gecko) '
        'Chrome/76.0.3809.100 Mobile Safari/537.36'
    ),
}
class TiebaSpider:
    """Multi-threaded spider that prints thread titles and links from Baidu Tieba.

    Listing-page URLs flow through ``url_queen`` to the fetch workers; the
    decoded page bodies they download flow through ``response_queen`` to the
    parse workers, which print one line per thread found.

    Args:
        kw: Name of the forum to scrape. Defaults to ``'lol'``, the example
            forum; any other forum name may be passed. The name is
            percent-encoded before it is placed in the URL.
        pages: Number of listing pages to enqueue, starting from offset 0.
    """

    PAGE_STEP = 30        # increment of the ``pn`` query offset between pages
    REQUEST_TIMEOUT = 10  # seconds before a stalled HTTP request is abandoned
    BASE_URL = 'http://tieba.baidu.com'
    URL_THREADS = 1       # threads that enqueue the page URLs
    FETCH_THREADS = 10    # threads that download pages
    PARSE_THREADS = 3     # threads that extract and print the data

    def __init__(self, kw='lol', pages=5):
        # ``{}`` is left in the template for the ``pn`` offset filled in by
        # get_urllist(); quote() escapes braces, so the forum name cannot
        # interfere with the later str.format() call.
        self.url = 'http://tieba.baidu.com/f?&kw=%s&pn={}' % quote(kw, safe='')
        self.pages = pages
        self.url_queen = Queue()
        self.response_queen = Queue()

    def get_urllist(self):
        """Enqueue the URL of every listing page onto ``url_queen``."""
        for page in range(self.pages):
            self.url_queen.put(self.url.format(page * self.PAGE_STEP))

    def get_response(self):
        """Worker loop: download each queued URL and enqueue its decoded body.

        Runs forever; intended to be the target of a daemon thread.
        """
        while True:
            url = self.url_queen.get()
            try:
                response = requests.get(
                    url, headers=headers, timeout=self.REQUEST_TIMEOUT)
                self.response_queen.put(response.content.decode())
            except (requests.RequestException, UnicodeDecodeError) as exc:
                print('请求失败:%s (%s)' % (url, exc))
            finally:
                # Always acknowledge the item, otherwise url_queen.join()
                # would block forever after a failed request. This happens
                # after the put() above, so once url_queen is fully joined
                # every successful response is already in response_queen.
                self.url_queen.task_done()

    def get_info(self):
        """Worker loop: parse each queued page and print title/link pairs.

        Titles are the text of ``<span>`` elements inside
        ``div.ti_title``, skipping spans whose text contains "置顶" or whose
        class contains "icon". Links are the ``href`` of anchors under
        ``ul.threads_list``, skipping anchors whose class contains
        "ti_item j_click_stats" or whose text contains "立即查看".

        Runs forever; intended to be the target of a daemon thread.
        """
        while True:
            page = self.response_queen.get()
            try:
                html = etree.HTML(page)
                if html is None:
                    # Nothing parseable in this page; skip it.
                    continue
                titles = html.xpath(
                    '//div[@class="ti_title"]/span[not(contains(text(),"置顶"))][not(contains(@class,"icon"))]/text()')
                hrefs = html.xpath(
                    '//ul[@class="threads_list"]//a[not(contains(@class,"ti_item j_click_stats"))][not(contains(text(),"立即查看"))]/@href')
                # Titles and links are paired by position. zip() stops at the
                # shorter list, so a page where fewer links than titles were
                # matched no longer raises IndexError.
                for title, href in zip(titles, hrefs):
                    print('标题:%s 地址:%s' % (title, self.BASE_URL + href))
            except etree.LxmlError as exc:
                print('解析失败:%s' % exc)
            finally:
                # Always acknowledge the item so response_queen.join() can
                # return even when a page could not be processed.
                self.response_queen.task_done()

    def run(self):
        """Start all worker threads and block until every page is processed."""
        # Each producer enqueues the full page list, so raising URL_THREADS
        # above 1 would enqueue every URL more than once.
        producers = [
            threading.Thread(target=self.get_urllist, daemon=True)
            for _ in range(self.URL_THREADS)
        ]
        consumers = [
            threading.Thread(target=self.get_response, daemon=True)
            for _ in range(self.FETCH_THREADS)
        ]
        consumers += [
            threading.Thread(target=self.get_info, daemon=True)
            for _ in range(self.PARSE_THREADS)
        ]
        # Workers are daemon threads because their loops never return; they
        # are torn down when the main thread exits.
        for worker in producers + consumers:
            worker.start()
        # Wait until every URL has actually been enqueued. Joining the queues
        # first could return immediately on still-empty queues and end the
        # run before any page was fetched.
        for producer in producers:
            producer.join()
        # Order matters: url_queen first, then response_queen (see the note
        # in get_response()).
        for pending in (self.url_queen, self.response_queen):
            pending.join()
        print("\33[36m获取结束")
if __name__ == '__main__':
    # Run the spider only when this file is executed as a script.
    spider = TiebaSpider()
    spider.run()