import requests
import pymongo
from multiprocessing.dummy import Pool
from db_phb import *
from lxml import etree
"""
url = https://movie.douban.com/typerank?type_name=%E5%89%A7%E6%83%85&type=11&interval_id=100:90&action=
1.请求首页url 从response中找到详情页url
2.对详情页发送请求,从response中解析出电影标题 评分 简介
3.首页翻页请求 分析ajax动态加载 获得翻页的url
4.储存
存到mangodb
"""
def get_moive_index(url, data):
    """Fetch one page of the ranking index and return its decoded JSON.

    The request is sent with the module-level ``headers`` dict.

    Args:
        url: Endpoint to query.
        data: Query-string parameters sent with the GET request.

    Returns:
        The decoded JSON body when the server answers with HTTP 200.
        ``None`` when the request fails, the status is not 200, or the
        body is not valid JSON.
    """
    try:
        # A timeout keeps a stalled connection from blocking a pool worker
        # forever (requests waits indefinitely by default).
        response = requests.get(url=url, headers=headers, params=data, timeout=10)
    except requests.RequestException as exc:
        print("index request failed:", url, exc)
        return None

    if response.status_code != 200:
        print("index request returned HTTP", response.status_code, url)
        return None

    try:
        return response.json()
    except ValueError as exc:
        # requests raises a ValueError subclass when the body is not JSON.
        print("index response is not valid JSON:", url, exc)
        return None
def detail_mov(url):
    """Download a movie detail page and return its HTML text.

    The request is sent with the module-level ``headers`` dict.

    Args:
        url: Address of the detail page.

    Returns:
        The response body as text when the server answers with HTTP 200.
        ``None`` when the request fails or the status is not 200.
    """
    try:
        # A timeout keeps a stalled connection from blocking a pool worker
        # forever (requests waits indefinitely by default).
        response = requests.get(url=url, headers=headers, timeout=10)
    except requests.RequestException as exc:
        print("detail request failed:", url, exc)
        return None

    if response.status_code != 200:
        print("detail request returned HTTP", response.status_code, url)
        return None
    return response.text
# Parse the movie synopsis.
def info(url):
    """Return the synopsis text scraped from a movie detail page.

    Args:
        url: Address of the detail page.

    Returns:
        The stripped text of every ``<span>`` under ``<div class="indent">``,
        concatenated into one string. ``None`` when the page could not be
        downloaded or parsed.
    """
    rep = detail_mov(url)
    if not rep:
        # Download failed (or the body was empty): nothing to parse.
        return None

    try:
        tree = etree.HTML(rep)
        if tree is None:
            return None
        # The attribute test must be plain XPath; forum markup that had been
        # pasted into this expression made it invalid.
        fragments = tree.xpath('//div[@class="indent"]//span/text()')
    except (etree.LxmlError, ValueError) as exc:
        print("failed to parse detail page:", url, exc)
        return None

    return "".join(x.strip() for x in fragments)
def save_to_mongo(item):
    """Insert one document into the ``MONGO_TABLE`` collection.

    Uses the module-level ``db`` handle. ``insert_one`` replaces the
    deprecated ``Collection.insert`` (removed in PyMongo 4).

    Args:
        item: Document (dict) to store. The driver adds an ``_id`` key to it.

    Returns:
        ``True`` when the document was stored, ``False`` otherwise.
    """
    try:
        result = db[MONGO_TABLE].insert_one(item)
    except pymongo.errors.PyMongoError as exc:
        print("failed to store in mongodb:", exc)
        return False

    if result.inserted_id is not None:
        print("储存到mongodb成功", item)
        return True
    return False
def main(data):
    """Crawl one index page and store every movie listed on it.

    Reads the module-level ``url`` as the index endpoint.

    Args:
        data: Query-string parameters selecting the index page to fetch.
    """
    print(("多进程启动成功"))
    respons = get_moive_index(url, data)
    if not respons:
        # The fetch helper yields None on failure; iterating that would raise
        # TypeError and abort the whole pool.map call.
        print("no index data for", data)
        return

    copied_keys = ('title', 'regions', 'score', 'vote_count')
    for dic in respons:
        item = {key: dic[key] for key in copied_keys}
        item['mov_info'] = info(dic['url'])
        print(item)
        save_to_mongo(item)
if __name__ == '__main__':
    # Script entry point: build one query dict per index page and crawl the
    # pages on a pool of four workers. multiprocessing.dummy.Pool is backed
    # by threads, so the workers share the globals (db, url, headers)
    # assigned here.
    clint = pymongo.MongoClient(MONGODB_URL)
    try:
        db = clint[MONGO_DB]
        url = "https://movie.douban.com/j/chart/top_list?"
        headers = {
            "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/87.0.4280.88 Safari/537.36"
        }
        # Each page holds 20 entries, so page x starts at offset x * 20.
        datas = [
            {
                "type": "11",
                "interval_id": "100:90",
                "action": "",
                "start": x * 20,
                "limit": "20",
            }
            for x in range(START, END + 1)
        ]
        pool = Pool(4)
        try:
            pool.map(main, datas)
        finally:
            # Release the workers even if a page raised.
            pool.close()
            pool.join()
    finally:
        clint.close()