好友
阅读权限30
听众
最后登录1970-1-1
|
本帖最后由 mxwawaawxm 于 2019-1-18 11:53 编辑
爬取豆瓣电影https://movie.douban.com/top250 Top 250名单(仅是爬取名单,没有下载电影),并保存电影名单信息至本地。
请各位大佬指点
第一版 [Python] 纯文本查看 复制代码
import requests, re, json
from lxml import etree
def get_page_source(url, headers):
    """Download a page and return its HTML text.

    Args:
        url: Address of the page to fetch.
        headers: HTTP request headers to send (e.g. a browser ``User-Agent``).

    Returns:
        The response body as text when the server answers with status 200,
        otherwise ``None`` (non-200 status or any request-level failure).
    """
    try:
        # ``headers`` must be passed by keyword: the second positional
        # parameter of ``requests.get`` is ``params``, so a positional dict
        # ends up in the query string and is never sent as HTTP headers.
        # The timeout keeps one stalled connection from hanging the crawl.
        response = requests.get(url, headers=headers, timeout=10)
    except requests.RequestException:
        # Base class of the requests errors: covers connection failures as
        # well as the timeouts that the ``timeout`` argument can now raise.
        return None
    if response.status_code == 200:
        return response.text
    return None
def get_total_page_num(page_source):
    """Return the number of list pages advertised by the pagination bar.

    Args:
        page_source: HTML text of a Top 250 list page.

    Returns:
        The integer shown on the link that immediately precedes the "next"
        control in the pagination bar.
    """
    tree = etree.HTML(page_source)
    # The anchor right before <span class="next"> carries the last page number.
    last_page_label = tree.xpath(
        'string(//span[@class="next"]/preceding-sibling::a[1])'
    )
    return int(last_page_label)
def _split_year_country_type(summary):
    """Split a "year / country / genre" line into exactly three fields."""
    # Cut from the right, at most twice: an entry may list more than one
    # release year, and indexing from the left would then put a year into the
    # country slot and the country into the genre slot.
    fields = [field.strip() for field in summary.rsplit('\xa0/\xa0', 2)]
    # Pad so a shortened line yields empty strings instead of an IndexError.
    return fields + [''] * (3 - len(fields))


def parse_page(page_source):
    """Yield one dict of film details for every entry on a list page.

    Args:
        page_source: HTML text of a Top 250 list page, or a falsy value when
            the download failed.

    Yields:
        A dict holding the film's title, Douban link, director/cast line,
        release date, country/region, genre, rating, number of raters and
        the one-line quote. Nothing is yielded for a falsy ``page_source``.
    """
    if not page_source:
        return
    html = etree.HTML(page_source)
    for each_movie in html.xpath('//li//div[@class="info"]'):
        # The title link holds several names separated by "/"; normalise the
        # spacing around each separator.
        raw_title = each_movie.xpath('string(./div[@class="hd"]/a)')
        movie_name = ' / '.join(part.strip() for part in raw_title.split('/'))

        # Selecting the attribute directly avoids an IndexError when an entry
        # has no title link; the link is then reported as None.
        links = each_movie.xpath('./div[@class="hd"]/a/@href')
        movie_douban_link = links[0] if links else None

        # The first paragraph holds two lines: director/cast, then
        # year / country / genre. The list page may truncate the cast.
        # Blank lines are dropped so stray whitespace cannot break the split.
        summary_text = each_movie.xpath('string(./div[@class="bd"]/p)')
        summary_lines = [
            line.strip() for line in summary_text.split('\n') if line.strip()
        ]
        movie_director_actor = summary_lines[0] if summary_lines else ''
        movie_year_country_type = (
            summary_lines[1] if len(summary_lines) > 1 else ''
        )
        release_date, country, genre = _split_year_country_type(
            movie_year_country_type
        )

        movie_rating_num = each_movie.xpath(
            'string(./div[@class="bd"]//span[@class="rating_num"])'
        )
        # The rater count is read from the fourth <span> of its parent.
        movie_rating_people = each_movie.xpath(
            'string(./div[@class="bd"]//span[4])'
        ).strip()
        movie_quote = each_movie.xpath(
            'string(./div[@class="bd"]//p[@class="quote"])'
        ).strip()

        yield {
            '电影名称:': movie_name,
            '电影豆瓣链接:': movie_douban_link,
            '导演&主演:': movie_director_actor.replace('\xa0', ' '),
            '上映日期: ': release_date,
            '制片国家/地区:': country,
            '类型: ': genre,
            '评分:': movie_rating_num,
            '评分总人数:': movie_rating_people,
            '影评:': movie_quote,
        }
def save_to_json(content):
    """Append ``content`` to ``douban.json`` as pretty-printed JSON.

    Each record is followed by four newlines; the file is opened in append
    mode with UTF-8 encoding, and non-ASCII characters are written as-is.

    Args:
        content: Any JSON-serialisable object.
    """
    with open('douban.json', mode='a', encoding='utf-8') as out_file:
        record = json.dumps(content, ensure_ascii=False, indent=4)
        out_file.write(record + '\n\n\n\n')
def main():
    """Crawl every Top 250 list page and append each film's details to disk.

    The first page is fetched once to read the page count from its
    pagination bar; every list page is then downloaded, parsed and each
    film record is handed to ``save_to_json``.

    Raises:
        SystemExit: If the first list page cannot be downloaded.
    """
    url = r'https://movie.douban.com/top250'
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:64.0) Gecko/20100101 Firefox/64.0',
    }
    page_source = get_page_source(url, headers)
    if not page_source:
        # get_page_source returns None on failure; stop with a clear message
        # instead of letting the HTML parser fail on None.
        raise SystemExit('Could not download {}; nothing was saved.'.format(url))
    total_page_num = get_total_page_num(page_source)
    for each_page in range(total_page_num):
        # Each list page shows 25 films; "start" is the zero-based offset.
        url = 'https://movie.douban.com/top250?start={}&filter='.format(each_page*25)
        page_source = get_page_source(url, headers)
        for each_movie_info in parse_page(page_source):
            save_to_json(each_movie_info)


if __name__ == "__main__":
    main()
成果图
第2版
[Python] 纯文本查看 复制代码
import requests, re, json, time, os
from lxml import etree
def get_page_source(url, headers):
    """Download a page and return its body decoded as UTF-8.

    Args:
        url: Address of the page to fetch.
        headers: HTTP request headers to send (e.g. a browser ``User-Agent``).

    Returns:
        The UTF-8 decoded response body when the server answers with status
        200, otherwise ``None`` (non-200 status or any request-level failure).
    """
    try:
        # ``headers`` must be passed by keyword: the second positional
        # parameter of ``requests.get`` is ``params``, so a positional dict
        # ends up in the query string and is never sent as HTTP headers.
        # The timeout keeps one stalled connection from hanging the crawl.
        response = requests.get(url, headers=headers, timeout=10)
    except requests.RequestException:
        # Base class of the requests errors: covers connection failures as
        # well as the timeouts that the ``timeout`` argument can now raise.
        return None
    if response.status_code == 200:
        # Decode the raw bytes explicitly rather than relying on the
        # encoding requests guesses for ``response.text``.
        return response.content.decode('utf-8')
    return None
def get_total_page_num(page_source):
    """Read the total number of list pages from the pagination bar.

    Args:
        page_source: HTML text of a Top 250 list page.

    Returns:
        The integer shown on the link that immediately precedes the "next"
        control in the pagination bar.
    """
    # XPath: text of the nearest <a> sibling before <span class="next">.
    last_page_xpath = 'string(//span[@class="next"]/preceding-sibling::a[1])'
    document = etree.HTML(page_source)
    return int(document.xpath(last_page_xpath))
def _collect_movie_details(detail_tree, movie_link, movie_quote):
    """Build the flat list of "label:value" strings for one film page.

    Returns a ``(rank, name, details)`` tuple so the caller can reuse the
    rank and name for its progress message without re-querying the tree.
    """
    rank = detail_tree.xpath(
        'string(//div[@class="top250"]/span[@class="top250-no"])'
    )
    name = detail_tree.xpath('string(//span[@property="v:itemreviewed"])')
    details = [
        r'电影排名:{}'.format(rank),
        r'电影名称:{}'.format(name),
        r'电影链接:{}'.format(movie_link),
        r'电影评分:{}'.format(
            detail_tree.xpath('string(//strong[@property="v:average"])')
        ),
        r'电影评价人数:{}'.format(
            detail_tree.xpath('string(//a[@class="rating_people"])')
        ),
    ]

    # Pair each star-level label with the share of raters shown next to it.
    star_labels = [
        label.strip()
        for label in detail_tree.xpath(
            '//div[@class="item"]//span[starts-with(@class, "stars")]/text()'
        )
    ]
    star_shares = detail_tree.xpath(
        '//div[@class="item"]//span[@class="rating_per"]/text()'
    )
    details.extend(
        '{}评价人数:{}'.format(label, share)
        for label, share in zip(star_labels, star_shares)
    )

    # The "info" box lists director, writers, cast, genre, country, language,
    # release date, runtime, alternative titles and the IMDb reference, one
    # per line. Stripping every line keeps the result independent of how the
    # page happens to indent them.
    info_text = detail_tree.xpath('string(//div[@id="info"])')
    details.extend(
        line.strip() for line in info_text.split('\n') if line.strip()
    )

    # The quote taken from the list page always goes last.
    details.append('影评:' + movie_quote)
    return rank, name, details


def get_movie_info(page_source, headers):
    """Yield the detailed record of every film linked from one list page.

    For each entry on the list page the film's own Douban page is downloaded
    (after a 0.3 s pause) and turned into a list of "label:value" strings.
    Entries whose page cannot be downloaded are skipped.

    Args:
        page_source: HTML text of a Top 250 list page, or a falsy value when
            the download failed.
        headers: HTTP request headers forwarded to ``get_page_source``.

    Yields:
        A list of strings: rank, name, link, rating, rater count, the
        per-star rating shares, the lines of the film's info box and finally
        the quote from the list page. Nothing is yielded for a falsy
        ``page_source``.
    """
    if not page_source:
        return
    listing = etree.HTML(page_source)
    # Read link and quote from the same entry node. Collecting two separate
    # page-wide lists and zipping them misaligns quotes with films (and drops
    # trailing films) as soon as one entry has no quote.
    for entry in listing.xpath('//li//div[@class="info"]'):
        links = entry.xpath('./div[@class="hd"]/a/@href')
        if not links:
            continue
        each_movie_link = links[0]
        # string() yields '' when the entry has no quote.
        each_movie_quote = entry.xpath(
            'string(.//p[@class="quote"]/span[@class="inq"])'
        )

        # Pause between requests to avoid hammering the server.
        time.sleep(0.3)
        movie_page_source = get_page_source(each_movie_link, headers)
        if not movie_page_source:
            continue

        rank, name, details = _collect_movie_details(
            etree.HTML(movie_page_source), each_movie_link, each_movie_quote
        )
        yield details
        # Runs when the consumer asks for the next record, i.e. after it has
        # finished handling (saving) this one.
        print('已经写入第{}部电影\t{}\t的信息'.format(rank, name))
def save_to_json(content, file_name):
    """Append ``content`` to ``file_name`` as pretty-printed JSON.

    Each record is followed by four newlines; the file is opened in append
    mode with UTF-8 encoding, and non-ASCII characters are written as-is.

    Args:
        content: Any JSON-serialisable object.
        file_name: Path of the output file.
    """
    record_separator = '\n' * 4
    with open(file_name, mode='a', encoding='utf-8') as out_file:
        record = json.dumps(content, ensure_ascii=False, indent=4)
        out_file.write(record + record_separator)
def main():
    """Crawl the Top 250 list and save every film's details to a JSON file.

    Output goes to ``douban_top250.json``; if that file already exists a
    timestamped file name is used instead so earlier results are kept.

    Raises:
        SystemExit: If the first list page cannot be downloaded.
    """
    file_name = r'douban_top250.json'
    if os.path.exists(file_name):
        file_name = 'douban_top250{}.json'.format(time.strftime('%Y%m%d%H%M%S', time.localtime()))

    url = r'https://movie.douban.com/top250'
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:64.0) Gecko/20100101 Firefox/64.0',
    }
    page_source = get_page_source(url, headers)
    if not page_source:
        # get_page_source returns None on failure; stop with a clear message
        # instead of letting the HTML parser fail on None.
        raise SystemExit('Could not download {}; nothing was saved.'.format(url))

    # Create the (empty) output file only once the crawl can actually start,
    # so a failed run does not leave an empty file behind.
    with open(file_name, 'w'):
        pass

    total_page_num = get_total_page_num(page_source)
    for each_page in range(total_page_num):
        # Each list page shows 25 films; "start" is the zero-based offset.
        url = 'https://movie.douban.com/top250?start={}'.format(each_page*25)
        page_source = get_page_source(url, headers)
        for each_movie_info in get_movie_info(page_source, headers):
            save_to_json(each_movie_info, file_name)
    # The loop has no ``break``, so a ``for ... else`` would always run;
    # a plain statement after the loop says the same thing more clearly.
    print('豆瓣top250的电影信息写入完毕')


if __name__ == "__main__":
    main()
成果图
|
|