The working version (edited in after the thread was solved):

import requests
# import pymongo   # only needed if the MongoDB lines below are uncommented
import time
import pandas as pd
from multiprocessing.dummy import Pool   # a thread pool, despite the module name
from lxml import etree


# Request one listing page and collect the details of every movie on it
def resp(url):
    headers = {
        "user-agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.77 Safari/537.36"
    }
    rep1 = requests.get(url=url, headers=headers)
    rep1.encoding = rep1.apparent_encoding
    tree = etree.HTML(rep1.text)
    """
    Parse the title, the score, the number of ratings (assess) and the
    summary (info):
    1. grab the list of tags, one per movie on the page
    2. loop over that list and parse the fields out of each tag
    """
    li_list = tree.xpath('//*[@class="grid_view"]/li')
    data = pd.DataFrame()
    for li in li_list:
        item = {}
        title = li.xpath('.//div[@class="pic"]/a/img/@alt')
        rep2 = li.xpath('.//div[@class="bd"]/p//text()')
        info = "".join([x.strip() for x in rep2])
        score = li.xpath('.//div[@class="star"]/span[2]/text()')
        assess = li.xpath('.//div[@class="star"]/span[4]/text()')
        item['title'] = title[0] if title else None     # xpath returns a list, unwrap it
        item['info'] = info
        item['score'] = score[0] if score else None
        item['assess'] = assess[0] if assess else None
        # print(item)
        # top_col.insert_one(item)
        data = data.append(item, ignore_index=True)     # append returns a new frame, so reassign
    print(data)
    return data


if __name__ == '__main__':
    # first page: https://movie.douban.com/top250?start=0&filter=
    """
    250 movies in total, 25 per page, so build the full list of 10 page URLs
    """
    # clint = pymongo.MongoClient("mongodb://localhost")
    # db = clint.top250
    # top_col = db.collect
    start = time.time()
    urls = []
    for i in range(0, 250, 25):     # start=0, 25, ..., 225 -> exactly 10 pages
        url = f'https://movie.douban.com/top250?start={i}&filter='
        urls.append(url)
    pool = Pool(4)
    db_pf = pd.DataFrame()
    datas = pool.map(resp, urls)    # each worker returns its page as a DataFrame
    for data in datas:
        db_pf = db_pf.append(data, ignore_index=True)
    pool.close()
    pool.join()
    db_pf.to_excel("db.xlsx")
    end = time.time()
    print("total time:", end - start)
And here is the original code with the problem:

import requests
import time
import pandas as pd
from multiprocessing.dummy import Pool
from lxml import etree


# Request one listing page and collect the details of every movie on it
def resp(url):
    headers = {
        "user-agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.77 Safari/537.36"
    }
    rep1 = requests.get(url=url, headers=headers)
    rep1.encoding = rep1.apparent_encoding
    tree = etree.HTML(rep1.text)
    """
    Parse the title, the score, the number of ratings (assess) and the
    summary (info):
    1. grab the list of tags, one per movie on the page
    2. loop over that list and parse the fields out of each tag
    """
    item = {}
    li_list = tree.xpath('//*[@class="grid_view"]/li')
    for li in li_list:
        title = li.xpath('.//div[@class="pic"]/a/img/@alt')
        rep2 = li.xpath('.//div[@class="bd"]/p//text()')
        info = "".join([x.strip() for x in rep2])
        score = li.xpath('.//div[@class="star"]/span[2]/text()')
        assess = li.xpath('.//div[@class="star"]/span[4]/text()')
        item['title'] = title
        item['info'] = info
        item['score'] = score
        item['assess'] = assess
        # print(item)
        db_df.append(item, ignore_index=True)   # <-- the return value is never kept
    """
    Why does nothing end up in the Excel file? The idea was: create an empty
    DataFrame first, append the rows into it with db_df.append, and save it
    to Excel at the end. Why does append not store anything? I just finished
    a pandas video course, so clearly I'm still a rookie.
    """


if __name__ == '__main__':
    # first page: https://movie.douban.com/top250?start=0&filter=
    """
    250 movies in total, 25 per page, so build the full list of 10 page URLs
    """
    start = time.time()
    urls = []
    for i in range(0, 251, 25):
        url = f'https://movie.douban.com/top250?start={i}&filter='
        urls.append(url)
    db_df = pd.DataFrame()
    pool = Pool(4)
    pool.map(resp, urls)
    pool.close()
    pool.join()
    db_df.to_excel("db.xlsx")
    end = time.time()
    print("total time:", end - start)
Follow-up from the OP: "Solved, thanks! The DataFrame has to be written twice for it to work, i.e. db_df = db_df.append(...)."
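For the record, the reason behind that fix: pandas' DataFrame.append is not in-place; it returns a new DataFrame and leaves the original untouched. Inside resp the result of db_df.append(item, ignore_index=True) was built and then thrown away, so the global db_df stayed empty and the Excel file came out blank. A minimal sketch of the difference, using a stand-in row (this runs on the pre-2.0 pandas used in the thread, where DataFrame.append still exists):

import pandas as pd

db_df = pd.DataFrame()
row = {'title': 'movie1', 'score': '9.0'}     # stand-in for one parsed item

db_df.append(row, ignore_index=True)          # new frame built, then discarded
print(len(db_df))                             # 0 - db_df is unchanged

db_df = db_df.append(row, ignore_index=True)  # "write it twice": keep the result
print(len(db_df))                             # 1

Rebinding a shared global from four worker threads would also race, which is why the fixed version at the top instead has resp return its page as a DataFrame and merges the pool.map results in the main thread.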