在家呆太久了无聊了？让我们一起爬爬豆瓣电影Top250看看还有哪些没看过的经典电影吧！

六道佩奇 · 发表于 2020-6-23 01:35

本帖最后由六道佩奇于 2020-6-23 01:41 编辑

前言

因为有些无聊，所以想看看有没有好看的电影，搜着搜着就看到了豆瓣电影TOP250的榜单，手痒就想把它给爬下来存到Excel（毕竟熟能生巧，要勤加练习嘛，胡说，明明是看着人家好爬！），然后做了一定的数据清洗和分析，绘制了几张图表，不得不说，图一出来，一些数据就更加一目了然了。代码比较简单，写得比较糙，可以给新手学习学习，大神请飘过（欢迎指正），话不多说，先上代码！

代码

import re
import requests
import wordcloud
import matplotlib.pyplot as plt
from bs4 import BeautifulSoup
from openpyxl import Workbook
from openpyxl.styles import Alignment

def _parse_movie(item):
    """Extract one movie's fields from a single ``<li>`` entry of the list.

    Returns a 9-tuple of strings in spreadsheet column order:
    ``(name, director, cast, year, region, genre, rating, votes, quote)``.
    ``cast`` and ``quote`` are empty strings when the entry has none.
    """
    name = item.find('span').text

    # The first <p> is rendered back to markup and cut at its <br/>:
    # the part before it carries the credits, the part after it the
    # "year / region / genre" line.
    halves = str(item.find('p')).split('<br/>', 1)
    credits = halves[0].split('>', 1)[1]
    meta = halves[1].split('<', 1)[0]

    # Cutting at the bare character (not the full cast label) also covers
    # credits where the cast label itself is incomplete.
    director = credits.split('主', 1)[0]
    director = director.replace('导演:', '')
    director = director.replace('\xa0', '')
    # NOTE(review): the newline-plus-spaces runs removed here and below
    # presumably mirror the page's own source indentation -- confirm if the
    # page layout changes.
    director = director.replace('\n                             ', '')

    # partition() yields '' for the tail when the cast label is absent.
    cast = credits.partition('主演:')[2]

    fields = meta.split(' / ', 2)
    year = fields[0].replace('\xa0', '')
    year = year.replace('\n                            ', '')
    # When several 4-digit numbers are present the last one is kept.
    year = re.findall(r'\d{4}', year)[-1]
    region = fields[1].replace('\xa0', '')
    genre = fields[2].replace('\xa0', '')
    genre = genre.replace('\n                        ', '')

    spans = item.find('div', attrs={'class': "star"}).find_all('span')
    rating = spans[1].text
    votes = spans[3].text.split('评价', 1)[0]

    quote_tag = item.find('p', attrs={'class': "quote"})
    quote = '' if quote_tag is None else quote_tag.text.replace('\n', '')

    return (name, director, cast, year, region, genre, rating, votes, quote)


def _plot_statistics(rows, mask_path, font_path):
    """Show per-year and per-region charts plus a region word cloud.

    ``rows`` are tuples as returned by ``_parse_movie``. Blocks in
    ``plt.show()`` until the figure windows are closed.
    """
    from collections import Counter

    # Years are sorted so the line chart is drawn left to right instead of
    # in whatever order a set happens to iterate.
    year_counts = dict(sorted(Counter(int(row[3]) for row in rows).items()))
    # A movie may list several regions separated by single spaces.
    region_counts = Counter(
        region for row in rows for region in row[4].split(' ')
    )
    years = list(year_counts)
    year_values = list(year_counts.values())
    regions = list(region_counts)
    region_values = list(region_counts.values())

    plt.rcParams['font.sans-serif'] = ['SimHei']
    fig1 = plt.figure()
    cloud_ax = plt.figure().add_subplot(111)
    ax1 = fig1.add_subplot(3, 1, 1)
    ax2 = fig1.add_subplot(3, 1, 2)
    ax3 = fig1.add_subplot(3, 1, 3)

    ax1.bar(years, year_values, color='indianred', alpha=0.8)
    ax2.plot(years, year_values, color='indianred', alpha=0.8)
    ax3.bar(regions, region_values, color='indianred', alpha=0.8)

    # Print each count just above its bar / data point.
    for year, count in year_counts.items():
        ax1.text(year, count, '%s' % count, ha='center', va='bottom')
        ax2.text(year, count, '%s' % count, ha='center', va='bottom')
    for region, count in region_counts.items():
        ax3.text(region, count, '%s' % count, ha='center', va='bottom')

    ax1.set_title("豆瓣电影TOP250各年份上榜电影次数柱形图分析")
    ax1.set_xlabel("年份")
    ax1.set_ylabel("次数")
    ax2.set_title("豆瓣电影TOP250各年份上榜电影次数折线图分析")
    ax2.set_xlabel("年份")
    ax2.set_ylabel("次数")
    ax3.set_title("豆瓣电影TOP250各地区上榜电影次数柱形图分析")
    ax3.set_xlabel("地区")
    ax3.set_ylabel("次数")
    ax3.set_xticklabels(regions, fontsize=10, rotation=30)
    fig1.set_figwidth(15)

    # Word cloud of the region frequencies, shaped by the mask image.
    background_img = plt.imread(mask_path)
    wc = wordcloud.WordCloud(background_color='white', mask=background_img,
                             scale=2, font_path=font_path)
    cloud_ax.imshow(wc.generate_from_frequencies(dict(region_counts)))
    cloud_ax.set_axis_off()
    plt.tight_layout()
    plt.show()


def _write_excel(rows, output_path):
    """Write the ranked movie rows to an .xlsx workbook at ``output_path``."""
    wb = Workbook()
    ws = wb.active  # the default sheet a new Workbook starts with
    ws.append(['排名', '电影名称', '导演', '主演', '年份', '地区', '类型',
               '评分', '评价人数', '一句简介'])
    # Rank follows scraping order and is sized to what was actually parsed.
    for rank, row in enumerate(rows, start=1):
        ws.append([rank, *row])
    print('写入excel完成！')

    # Center the header row and the short columns.
    centered = Alignment(horizontal='center', vertical='center')
    for ref in ('1', 'A', 'B', 'E', 'H', 'I'):
        for cell in ws[ref]:
            cell.alignment = centered
    ws.column_dimensions['B'].width = 25
    ws.column_dimensions['I'].width = 13
    wb.save(output_path)


def top250(mask_path='heart.jpg', font_path='C:/Windows/Fonts/simhei.ttf',
           output_path='豆瓣电影top250.xlsx'):
    """Scrape the Douban Top 250 list, chart it, and export it to Excel.

    Requests the ten list pages (25 entries each) in order. After the last
    page has been parsed the charts and the word cloud are shown, then all
    rows are written to ``output_path``.

    If any page answers with a status other than 200 the crawl stops and
    nothing is plotted or written; the page is not requested again.

    Args:
        mask_path: Image used as the word-cloud mask.
        font_path: Font file used to render the word cloud.
        output_path: Destination of the generated workbook.
    """
    headers = {
        "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9",
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/81.0.4044.129 Safari/537.36 Edg/81.0.416.68",
    }
    page_size = 25
    last_start = 225
    rows = []

    for page, start in enumerate(range(0, last_start + 1, page_size), start=1):
        url = 'https://movie.douban.com/top250?start=' + str(start) + '&filter='
        # A timeout keeps a stalled connection from hanging the script.
        with requests.get(url=url, headers=headers, timeout=10) as r:
            if r.status_code != 200:
                print('失败！')
                return
            r.encoding = r.apparent_encoding
            soup = BeautifulSoup(r.text, 'html.parser')

        rows.extend(_parse_movie(li) for li in soup.find('ol').find_all('li'))

        if start == last_start:
            _plot_statistics(rows, mask_path, font_path)
        print('第{}页爬取完毕！'.format(page))

    print('爬取结束，开始写入excel。。。')
    _write_excel(rows, output_path)

if __name__ == '__main__':
    # Script entry point: announce the start, then run the whole
    # scrape / plot / export pipeline.
    print('开始爬取！')
    top250()

结果

豆瓣TOP250Excel

matplotlib

注意

词云生成使用了背景图，如果要运行记得先更改背景图哈！附件是生成好的Excel文件，如果可以的话希望可以给点免费热心，回复回复也行呀~

豆瓣电影top250.zip (43.74 KB, 下载次数: 178)

涛之雨 · 发表于 2020-6-23 06:08

希望加一个功能判断一下我看过哪几个然后筛选出来我没看过的
（开个玩笑。。。我说的不是加一个判断。。。。主要是我不记得我看过的电影了

）

wanwfy · 发表于 2020-6-23 19:57

本帖最后由 wanwfy 于 2020-6-23 20:41 编辑

发一个我练习的代码，用xpath+正则提取信息，不过有的信息是空的，挺费劲的
导演、主演、年代、地域很多地方采集不到，垮掉。哎。。。

[Python] 纯文本查看 复制代码

import json
import re
import requests
from lxml import etree


class DoubanTop(object):
    """Crawl the Douban Top 250 list page by page and export it as JSON.

    ``start()`` fetches the first page, ``parse()`` collects one dict per
    movie into ``result_list`` while following the next-page link, and
    ``write_json()`` saves the result to ``movies.json``.
    """

    # Keys produced by clean_data(), in capture-group order.
    _FIELDS = ("director", "to_star", "release_time", "region", "type")

    # Strict pattern: credits line with both labels, then a
    # "year / region / genre" line. Tried first so that text it matches
    # yields exactly these five captures.
    _FULL_RE = re.compile(
        r".*?导演:\s+(.*?)\s+主演:\s+(.*?)\n.*?(\d+)\s+/\s+(.*?)\s+/\s+(.*?)\n",
        re.S,
    )

    # Fallback patterns, used only when the strict one fails. They work
    # line by line ([^\S\n] is "whitespace except newline"), so a missing
    # piece on one line no longer discards the other line.
    # The cast part is optional; a cut-off cast label is skipped.
    _CREDITS_RE = re.compile(
        r"导演:[^\S\n]*(.*?)"
        r"(?:[^\S\n]+主演:[^\S\n]*(.*?)|[^\S\n]+主\S*)?"
        r"[^\S\n]*(?:\n|$)"
    )
    # First number on the line, then the last two '/'-separated fields.
    _META_RE = re.compile(
        r"(\d+)[^\n]*?[^\S\n]+/[^\S\n]+([^/\n]+?)"
        r"[^\S\n]+/[^\S\n]+([^/\n]+?)[^\S\n]*(?:\n|$)"
    )

    def __init__(self):
        self.baseurl = "https://movie.douban.com/top250"
        self.result_list = []

    def start_requests(self, url):
        """Fetch ``url`` and return the raw response body as bytes."""
        headers = {
            "user-agent": "Mozilla/5.0 (iPhone; CPU iPhone OS 13_2_3 like Mac OS X) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/13.0.3 Mobile/15E148 Safari/604.1"}
        # A timeout keeps a stalled connection from hanging the crawl.
        r = requests.get(url, headers=headers, timeout=10)
        return r.content

    def parse(self, text):
        """Parse a list page and every page reachable via its next-page link.

        One dict per movie is appended to ``self.result_list``. Fields that
        cannot be found are stored as ``None``.
        """
        while text is not None:
            tree = etree.HTML(text)
            for item in tree.xpath('//li/div'):
                data = self.clean_data(
                    item.xpath('.//p[contains(string(),"导演")]/text()'))
                quote = item.xpath('.//span[@class="inq"]/text()')
                votes = item.xpath('.//span[contains(text(),"评价")]/text()')[0]
                self.result_list.append({
                    "title": item.xpath(".//span[@class='title']/text()")[0],
                    "director": data.get("director"),
                    "to_star": data.get("to_star"),
                    "release_time": data.get("release_time"),
                    "region": data.get("region"),
                    "type": data.get("type"),
                    "score": item.xpath(".//span[@class='rating_num']/text()")[0],
                    "quote": quote[0] if quote else None,
                    # Drop the trailing three-character suffix after the count.
                    "comment_num": votes[:-3],
                })

            # The next-page href is appended to the base URL as-is.
            nextpage = tree.xpath('//a[contains(text(),"后页")]/@href')
            text = (self.start_requests(self.baseurl + nextpage[0])
                    if nextpage else None)

    def clean_data(self, item):
        """Split the info paragraph's text nodes into named fields.

        Args:
            item: Iterable of text fragments; they are concatenated first.

        Returns:
            A dict with the keys ``director``, ``to_star``,
            ``release_time``, ``region`` and ``type``. A value is ``None``
            when that piece could not be located.
        """
        string = "".join(item)

        full = self._FULL_RE.search(string)
        if full:
            return dict(zip(self._FIELDS, full.groups()))

        # Partial recovery: credits and metadata are looked up separately.
        result = dict.fromkeys(self._FIELDS)
        credits = self._CREDITS_RE.search(string)
        if credits:
            result["director"], result["to_star"] = credits.groups()
        # Only look for the metadata after the credits line, so digits or
        # slashes inside the credits cannot be mistaken for it.
        meta = self._META_RE.search(string[credits.end():] if credits else string)
        if meta:
            (result["release_time"], result["region"],
             result["type"]) = meta.groups()
        return result

    def write_json(self, result):
        """Serialize ``result`` to ``movies.json`` as UTF-8, keeping non-ASCII text."""
        s = json.dumps(result, indent=5, ensure_ascii=False)
        with open('movies.json', 'w', encoding='utf-8') as f:
            f.write(s)

    def start(self):
        """Run the whole crawl and write the collected movies to disk."""
        text = self.start_requests(self.baseurl)
        self.parse(text)
        self.write_json(self.result_list)


# Runs at module level (no __main__ guard): build the scraper and start
# the crawl, which ends by writing movies.json.
douban = DoubanTop()
douban.start()

netspirit · 发表于 2020-6-23 01:47

我会告诉你top250电影可以直接下载??????

六道佩奇 · 发表于 2020-6-23 01:52

netspirit 发表于 2020-6-23 01:47
我会告诉你top250电影可以直接下载??????

哈哈，那么晚还不睡啊，我的目的就是发个代码交流交流，用途的话...

netspirit · 发表于 2020-6-23 02:05

六道佩奇发表于 2020-6-23 01:52
哈哈，那么晚还不睡啊，我的目的就是发个代码交流交流，用途的话...

你也一样啊....之前下了top100的电影然后发现没啥想看的
因为排行靠前的基本都是那种比较感人的但是我懒得费力气去感动了.....

VitoScaletta · 发表于 2020-6-23 02:12

感谢分享!

wanwfy · 发表于 2020-6-23 02:27

这个代码太乱了，应该还可以优化优化

mhku333 · 发表于 2020-6-23 03:28

加油，感谢

左耳近情 · 发表于 2020-6-23 03:55

感谢分享

nakasou · 发表于 2020-6-23 06:17

学习了，感谢分享！

帐号		自动登录	找回密码
密码			注册[Register]

[Python 转载] 在家呆太久了无聊了？让我们一起爬爬豆瓣电影Top250看看还有哪些没看过的经典电影吧！

前言

代码

结果

注意

免费评分