Fetching the Douban Top 250 Movie Rankings with Python
Update: I've since put together a working crawler here: https://www.52pojie.cn/thread-1449882-1-1.html
I'd say I've got the basics of Java down, so I gave Python a try. One afternoon of study, nothing serious; I just used the Douban movie rankings as practice. The fields scraped are: movie title, release year, rating, and number of ratings.
The code is short, just over 20 lines:
import re
import requests

headers = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:83.0) Gecko/20100101 Firefox/83.0"
}
# Compile the pattern once; re.S lets .*? span newlines.
# Named groups capture title, year, rating, and rating count.
obj = re.compile(r'<li>.*?<div class="item">.*?<span class="title">(?P<name>.*?)</span>.*?'
                 r'<p class="">.*?<br>(?P<year>.*?)&nbsp;.*?'
                 r'<span class="rating_num" property="v:average">(?P<score>.*?)</span>.*?'
                 r'<span>(?P<cnt>.*?)人评价</span>', re.S)

p = 0
while p < 250:
    # Build the URL inside the loop; if it is built once outside,
    # every iteration fetches the same first page.
    url = "https://movie.douban.com/top250?start=" + str(p) + "&filter="
    resp = requests.get(url, headers=headers)
    page = resp.text
    for it in obj.finditer(page):
        dic = it.groupdict()
        # Strip the whitespace captured around the year
        dic['year'] = dic['year'].strip()
        print(dic['name'] + " : " + dic['year'] + " : " + dic['score'] + " : " + dic['cnt'])
    # Advance to the next page (25 entries per page)
    p += 25
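If you would rather keep the results than just print them, here is a minimal sketch that writes every match to a CSV file. It reuses the pattern and paging from the post above; the douban_top250.csv filename is only an example, not something from the original post.

import csv
import re
import requests

headers = {"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:83.0) Gecko/20100101 Firefox/83.0"}
# Same named-group pattern as in the post above
obj = re.compile(r'<li>.*?<div class="item">.*?<span class="title">(?P<name>.*?)</span>.*?'
                 r'<p class="">.*?<br>(?P<year>.*?)&nbsp;.*?'
                 r'<span class="rating_num" property="v:average">(?P<score>.*?)</span>.*?'
                 r'<span>(?P<cnt>.*?)人评价</span>', re.S)

with open('douban_top250.csv', 'w', newline='', encoding='utf-8') as f:
    writer = csv.DictWriter(f, fieldnames=['name', 'year', 'score', 'cnt'])
    writer.writeheader()
    for p in range(0, 250, 25):
        page = requests.get('https://movie.douban.com/top250?start=%d&filter=' % p,
                            headers=headers).text
        for it in obj.finditer(page):
            row = it.groupdict()
            row['year'] = row['year'].strip()
            writer.writerow(row)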
lgsp_Jim: I went ahead and crawled it too. The main script is below; the crawl.handle_mongo helper module it imports follows at the end.
import requests
from multiprocessing import Queue
from lxml import etree
import threading
from crawl.handle_mongo import MongoClient

class CrawlPage(threading.Thread):
    # Worker thread: pops a start offset from page_queue, fetches that
    # ranking page, and pushes the raw HTML onto data_queue.
    def __init__(self, thread_name, page_queue, data_queue, *args, **kwargs):
        super(CrawlPage, self).__init__(*args, **kwargs)
        self.thread_name = thread_name
        self.page_queue = page_queue
        self.data_queue = data_queue
        self.headers = {
            'Host': 'movie.douban.com',
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:63.0) Gecko/20100101 Firefox/63.0',
            'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
            'Accept-Language': 'zh-CN,zh;q=0.8,zh-TW;q=0.7,zh-HK;q=0.5,en-US;q=0.3,en;q=0.2',
            'Accept-Encoding': 'gzip, deflate, br',
            'DNT': '1',
            'Connection': 'keep-alive',
            'Upgrade-Insecure-Requests': '1',
            'Pragma': 'no-cache',
            'Cache-Control': 'no-cache',
        }

    def run(self) -> None:
        print('Starting page thread %s' % self.thread_name)
        # page_flag is a module-level flag flipped by main() once the queue drains
        while not page_flag:
            try:
                # block=False raises queue.Empty when nothing is left; caught below
                page = self.page_queue.get(block=False)
                page_url = 'https://movie.douban.com/top250?start=' + str(page) + '&filter='
                print('Fetching %s' % page_url)
                res = requests.get(url=page_url, headers=self.headers)
                res.encoding = 'utf-8'
                self.data_queue.put(res.text)
            except Exception:
                pass

class CrawlHtml(threading.Thread):
    # Worker thread: pops raw HTML from data_queue, parses it with lxml,
    # and writes the extracted records to MongoDB.
    def __init__(self, thread_name, data_queue, lock, db, collections, *args, **kwargs):
        super(CrawlHtml, self).__init__(*args, **kwargs)
        self.thread_name = thread_name
        self.data_queue = data_queue
        self.lock = lock
        self.db = db
        self.collections = collections

    def parse(self, text):
        html = etree.HTML(text)
        items = html.xpath('//ol[@class="grid_view"]/li//div[@class="info"]')
        data_list = []
        for item in items:
            data = {}
            # First span under the .hd block is the main (Chinese) title
            data['title'] = item.xpath('.//div[@class="hd"]/a/span/text()')[0]
            # p/text() yields two text nodes: directors/actors, then year/country/genre;
            # Douban separates the fields with \xa0 (non-breaking spaces)
            info = item.xpath('.//div[@class="bd"]/p/text()')[0]
            info = ' '.join(info.split('\xa0\xa0\xa0'))
            data['info'] = info.strip()
            year = item.xpath('.//div[@class="bd"]/p/text()')[1]
            year = ' '.join(year.split('\xa0'))
            data['year'] = year.strip()
            data['star'] = item.xpath('.//div[@class="bd"]/div[@class="star"]/span[@class="rating_num"]/text()')[0]
            # The last span in the .star block holds the "N人评价" rating count
            data['comment'] = item.xpath('.//div[@class="bd"]/div[@class="star"]/span/text()')[-1]
            try:
                # Some entries have no one-line quote, so indexing raises IndexError
                data['introduce'] = item.xpath('.//div[@class="bd"]/p[@class="quote"]/span[@class="inq"]/text()')[0]
            except IndexError:
                data['introduce'] = '暂无简介'  # "no blurb available"
            data_list.append(data)
        return data_list

    def run(self) -> None:
        print('Starting data thread %s' % self.thread_name)
        while not data_flag:
            try:
                text = self.data_queue.get(block=False)
                result = self.parse(text)
                # Serialize the writes so only one thread talks to Mongo at a time
                with self.lock:
                    insert_data = MongoClient(self.db, self.collections)
                    insert_data.insert_db(result)
            except Exception:
                pass


# Module-level shutdown flags, flipped by main() once the queues are drained
page_flag = False
data_flag = False

def main():
    page_queue = Queue()
    data_queue = Queue()
    lock = threading.Lock()

    # Queue the start offsets 0, 25, ..., 225 (ten pages of 25 entries)
    page = 0
    while page < 250:
        page_queue.put(page)
        page += 25
    print('Total pages queued: %s' % page_queue.qsize())

    crawl_page_list = ['page-thread-1', 'page-thread-2', 'page-thread-3']
    page_thread_list = []
    for thread_name in crawl_page_list:
        thread_page = CrawlPage(thread_name, page_queue, data_queue)
        thread_page.start()
        page_thread_list.append(thread_page)

    data_list = ['data-thread-1', 'data-thread-2', 'data-thread-3']
    data_thread_list = []
    db = 'db_douban'
    collections = 'collections_douban'
    for thread_name in data_list:
        thread_data = CrawlHtml(thread_name, data_queue, lock, db, collections)
        thread_data.start()
        data_thread_list.append(thread_data)

    # Busy-wait until the page queue drains, then signal the page threads to exit
    global page_flag
    while not page_queue.empty():
        pass
    page_flag = True
    for thread_join in page_thread_list:
        thread_join.join()
        print(thread_join.thread_name, 'finished crawling pages')

    # Likewise for the data queue and the parser threads
    global data_flag
    while not data_queue.empty():
        pass
    data_flag = True
    for thread_join in data_thread_list:
        thread_join.join()
        print(thread_join.thread_name, 'finished processing data')


if __name__ == '__main__':
    main()
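A note on the shutdown design: the busy-wait loops (while not page_queue.empty(): pass) spin a CPU core until the queues drain, and multiprocessing.Queue is used even though the workers are threads. With threads, the standard library queue.Queue is the usual choice, and its task_done()/join() protocol plus a sentinel value can replace the global flags. A minimal sketch of that pattern follows; worker and SENTINEL are illustrative names, not part of the script above.

import threading
from queue import Queue

SENTINEL = None  # marker telling a worker to exit

def worker(q):
    # Replace the body with the fetch/parse work from the classes above
    while True:
        item = q.get()
        if item is SENTINEL:
            q.task_done()
            break
        # ... process `item` here ...
        q.task_done()

q = Queue()
for page in range(0, 250, 25):
    q.put(page)

threads = [threading.Thread(target=worker, args=(q,)) for _ in range(3)]
for t in threads:
    t.start()
for _ in threads:
    q.put(SENTINEL)  # one sentinel per worker so each exits cleanly
q.join()             # returns once every queued item has been marked done
for t in threads:
    t.join()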
handle_mongo.py, the crawl.handle_mongo module imported at the top:

import pymongo

class MongoClient:
    def __init__(self, db, collections, *args, **kwargs):
        super(MongoClient, self).__init__(*args, **kwargs)
        client = pymongo.MongoClient('mongodb://admin:admin@127.0.0.1:27017')
        # Select the named database, then the named collection inside it
        self.db = client[db]
        self.collections = self.db[collections]

    def insert_db(self, item):
        self.collections.insert_many(item)
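For anyone trying this out, a quick hypothetical smoke test for the helper, assuming MongoDB is running locally and accepts the admin:admin credentials from the connection string above:

from crawl.handle_mongo import MongoClient

client = MongoClient('db_douban', 'collections_douban')
client.insert_db([{'title': 'test', 'star': '9.7'}])  # insert_many expects a list of dicts
print(client.collections.count_documents({}))          # verify the insert landed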
Quoting fanvalen (2021-5-29 21:01): "I see the exact same formatting. Were you two taught by the same teacher?"
That just goes to show this teacher has pupils all over the place.

Quoting smileat2000 (2021-5-29 21:08): "I see the exact same formatting. Were you two taught by the same teacher?"
Where? Send them over and let me have a look.

Thanks for sharing.

Nice, very nice.

Quoting 赵森heart (2021-5-29 21:40): "Can it download the movies?"
This only extracts the text. For actual downloads you would need to find and crawl a site that provides download sources.

Heads-up: building the url before the while loop means p += 25 never changes the page being fetched, so you end up crawling the top 25 ten times over.