好友
阅读权限20
听众
最后登录1970-1-1
|
本帖最后由 lgsp_Jim 于 2021-5-30 22:05 编辑
我反手也爬了一波
import requests
from multiprocessing import Queue
from lxml import etree
import threading
from crawl.handle_mongo import MongoClient
class CrawlPage(threading.Thread):
    """Worker thread that pulls page offsets from ``page_queue``, downloads
    the corresponding Douban Top-250 listing page, and pushes the raw HTML
    onto ``data_queue`` for the parser threads to consume."""

    def __init__(self, thread_name, page_queue, data_queue, *args, **kwargs):
        super(CrawlPage, self).__init__(*args, **kwargs)
        self.thread_name = thread_name
        self.page_queue = page_queue  # offsets (0, 25, 50, ...) still to fetch
        self.data_queue = data_queue  # raw HTML handed to the parser threads
        # Browser-like headers so Douban serves the normal listing page.
        self.headers = {
            'Host': 'movie.douban.com',
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:63.0) Gecko/20100101 Firefox/63.0',
            'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
            'Accept-Language': 'zh-CN,zh;q=0.8,zh-TW;q=0.7,zh-HK;q=0.5,en-US;q=0.3,en;q=0.2',
            'Accept-Encoding': 'gzip, deflate, br',
            'DNT': '1',
            'Connection': 'keep-alive',
            'Upgrade-Insecure-Requests': '1',
            'Pragma': 'no-cache',
            'Cache-Control': 'no-cache',
        }

    def run(self) -> None:
        print('启动%s处理页码线程' % self.thread_name)
        # page_flag is a module-level flag that main() flips to True once
        # the page queue has been drained; that is this thread's exit signal.
        while not page_flag:
            try:
                page = self.page_queue.get(block=False)
                page_url = 'https://movie.douban.com/top250?start=' + str(page) + '&filter='
                print('当前构造的url为%s' % page_url)
                res = requests.get(url=page_url, headers=self.headers)
                # BUG FIX: was 'urf-8' (typo). The bogus codec name made
                # requests silently fall back to its guessed encoding,
                # which can garble the downloaded HTML.
                res.encoding = 'utf-8'
                self.data_queue.put(res.text)
            except Exception:
                # Best-effort loop: an empty queue (queue.Empty) or a
                # transient network error simply retries next iteration.
                pass
class CrawlHtml(threading.Thread):
    """Worker thread that parses downloaded Top-250 listing pages and
    stores the extracted movie records in MongoDB."""

    def __init__(self, thread_name, data_queue, lock, db, collections, *args, **kwargs):
        super(CrawlHtml, self).__init__(*args, **kwargs)
        self.thread_name = thread_name
        self.data_queue = data_queue  # raw HTML produced by the page threads
        self.lock = lock              # serializes the MongoDB inserts
        self.db = db                  # database name
        self.collections = collections  # collection name

    def parse(self, text):
        """Extract one movie record (dict) per list entry from a page."""
        tree = etree.HTML(text)
        records = []
        for node in tree.xpath('//ol[@class="grid_view"]/li//div[@class="info"]'):
            record = {'title': node.xpath('.//div[@class="hd"]/a/span/text()')[0]}
            # Directors/actors line and the year/country line both pad
            # fields with non-breaking spaces; collapse them to plain spaces.
            raw_info = node.xpath('.//div[@class="bd"]/p/text()')[0]
            record['info'] = ' '.join(raw_info.split('\xa0\xa0\xa0')).strip()
            raw_year = node.xpath('.//div[@class="bd"]/p/text()')[1]
            record['year'] = ' '.join(raw_year.split('\xa0')).strip()
            record['star'] = node.xpath('.//div[@class="bd"]/div[@class="star"]/span[@class="rating_num"]/text()')[0]
            record['comment'] = node.xpath('.//div[@class="bd"]/div[@class="star"]/span/text()')[-1]
            # Some entries have no one-line quote; fall back to a placeholder.
            quote = node.xpath('.//div[@class="bd"]/p[@class="quote"]/span[@class="inq"]/text()')
            record['introduce'] = quote[0] if quote else '暂无简介'
            records.append(record)
        return records

    def run(self) -> None:
        print('当前%s线程启动处理数据任务' % self.thread_name)
        # data_flag is flipped by main() once the data queue has drained.
        while not data_flag:
            try:
                page_html = self.data_queue.get(block=False)
                parsed = self.parse(page_html)
                with self.lock:
                    MongoClient(self.db, self.collections).insert_db(parsed)
            except Exception:
                pass
# Shutdown flags polled by the worker threads' run() loops; main() flips
# each one to True once the corresponding queue has been drained.
page_flag = False
data_flag = False


def main():
    """Wire up the queues, start the page-download and data-parse thread
    pools, then shut both groups down once their input queues are empty."""
    import time  # local import: only needed for the polling sleeps below

    page_queue = Queue()
    data_queue = Queue()
    lock = threading.Lock()

    # Douban's Top 250 is paged by offsets 0, 25, ..., 225.
    page = 0
    while page < 250:
        page_queue.put(page)
        page += 25
    print('当前页码队列的总量为:%s' % page_queue.qsize())

    crawl_page_list = ['页码线程1', '页码线程2', '页码线程3']
    page_thread_list = []
    for thread_name in crawl_page_list:
        thread_page = CrawlPage(thread_name, page_queue, data_queue)
        thread_page.start()
        page_thread_list.append(thread_page)

    data_list = ['数据线程1', '数据线程2', '数据线程3']
    data_thread_list = []
    db = 'db_douban'
    collections = 'collections_douban'
    for thread_name in data_list:
        thread_data = CrawlHtml(thread_name, data_queue, lock, db, collections)
        thread_data.start()
        data_thread_list.append(thread_data)

    # Wait for the page queue to drain, then signal the page threads to exit.
    global page_flag
    while not page_queue.empty():
        # FIX: was `pass`, a busy-wait spinning one core at 100%.
        time.sleep(0.1)
    page_flag = True
    for thread_join in page_thread_list:
        thread_join.join()
        print(thread_join.thread_name, '页码处理结束')

    # Same shutdown handshake for the data threads.
    global data_flag
    while not data_queue.empty():
        time.sleep(0.1)
    data_flag = True
    for thread_join in data_thread_list:
        thread_join.join()
        print(thread_join.thread_name, '数据处理结束')


if __name__ == '__main__':
    main()
import pymongo
class MongoClient:
    """Thin wrapper binding one pymongo connection to a db/collection pair."""

    def __init__(self, db, collections, *args, **kwargs):
        super(MongoClient, self).__init__(*args, **kwargs)
        # NOTE(review): credentials are hard-coded; move the URI to
        # configuration or an environment variable before real use.
        client = pymongo.MongoClient('mongodb://admin:admin@127.0.0.1:27017')
        self.db = client[db]
        self.collections = self.db[collections]

    def insert_db(self, item):
        """Insert a batch of documents.

        Guards against an empty batch: pymongo's insert_many raises
        InvalidOperation when given an empty list, which would otherwise
        be silently swallowed by the caller's broad except.
        """
        if item:
            self.collections.insert_many(item)
免费评分
-
查看全部评分
|