本帖最后由 liiv700718 于 2020-3-6 15:01 编辑
可能是 cookie 的问题,换成自己的 cookie 就没问题了。但要注意封号的风险:我多爬了几次,账号差一点就被封了。
稍微修改了一下,把图片也爬了
没加 cookie,想试试的话请加上自己的 cookie。
[Python] 纯文本查看 复制代码
#!/usr/bin/python3
# -*- coding: utf-8 -*-
import csv
import json
import os
import time

import requests
def get_urls(pages: int = 18) -> list:
    """Build the paginated Douban "hot movies" search API URLs.

    Args:
        pages: Number of result pages to generate. Each page requests 20
            entries (``page_limit=20``). Defaults to 18.

    Returns:
        A list of URL strings whose ``page_start`` offsets are 0, 20, 40, ...
        An empty list is returned when ``pages`` is 0.
    """
    # ``page_start`` must stay the LAST query parameter: get_json() derives
    # the page number from the text after the final '=' in each URL.
    base = (
        "https://movie.douban.com/j/search_subjects?type=movie&tag"
        "=%E7%83%AD%E9%97%A8&sort=recommend&page_limit=20"
        "&page_start="
    )
    return [f"{base}{page * 20}" for page in range(pages)]
def get_json(urls, headers):
    """Download each search page and collect the movie records it lists.

    Progress and every collected record are printed to stdout, and the
    function pauses one second after each page.

    Args:
        urls: Iterable of search API URLs. When a URL ends in
            ``...=<offset>`` the offset is used to compute the page number
            shown in the progress messages.
        headers: HTTP headers sent with every request.

    Returns:
        A list of dicts, one per movie, with the keys '电影名' (title),
        '评分' (rating), '链接' (detail page URL) and '图片地址' (cover URL).

    Raises:
        requests.RequestException: On network failure, timeout or a non-2xx
            HTTP status.
        ValueError: If a response body is not valid JSON.
        KeyError: If a response lacks the 'subjects' list or a subject lacks
            one of the expected fields.
    """
    contents = []
    for index, url in enumerate(urls, start=1):
        # Derive the page number once from the trailing offset; fall back to
        # the position in the sequence when the URL does not end in a number.
        try:
            page = int(url.rsplit('=', 1)[-1]) // 20 + 1
        except ValueError:
            page = index
        print(f"开始采集第 {page} 页...")

        # A timeout keeps a stalled connection from hanging the whole run.
        response = requests.get(url, headers=headers, timeout=10)
        # Fail with a clear HTTP error instead of a confusing JSON parse error.
        response.raise_for_status()
        # Response.json() decodes the payload itself, avoiding the charset
        # guess that apparent_encoding makes on the body.
        results = response.json()

        for subject in results['subjects']:
            content = {
                '电影名': subject['title'],
                '评分': subject['rate'],
                '链接': subject['url'],
                '图片地址': subject['cover'],
            }
            for item in content.items():
                print(item)
            contents.append(content)

        print(f"\n第 {page} 页采集完成\n")
        # Be polite to the server between pages.
        time.sleep(1)

    print("采集所有电影完成!")
    return contents
def save2file(contents, headers=None):
    """Write the collected movies to a CSV file and download their covers.

    Everything is stored under ``./DouBan`` (created if missing):
    ``MV.csv`` holds a header row plus one row per movie, and each cover is
    saved as ``NNN <title> <rating>.jpg``. The CSV is rewritten from scratch
    on every call.

    Args:
        contents: List of movie dicts with the keys '电影名', '评分', '链接'
            and '图片地址'.
        headers: Optional HTTP headers for the image requests. When omitted,
            a module-level ``headers`` variable is used if one exists (the
            script entry point defines it); otherwise no extra headers are
            sent.
    """
    out_dir = './DouBan'
    if headers is None:
        # Backward compatibility: the one-argument call used by the script
        # relies on the module-level ``headers`` variable.
        headers = globals().get('headers')
    # Path separators and characters that are illegal in Windows file names
    # are replaced so that any movie title yields a valid file name.
    unsafe_chars = str.maketrans({ch: '_' for ch in '\\/:*?"<>|'})

    print("正在开始准备写入文件····")
    os.makedirs(out_dir, exist_ok=True)
    # Mode 'w' replaces any previous file, so no separate delete is needed.
    # 'utf-8-sig' adds a BOM so spreadsheet tools detect the encoding, and
    # newline='' is what the csv module requires of the file object.
    csv_path = os.path.join(out_dir, 'MV.csv')
    with open(csv_path, 'w', encoding='utf-8-sig', newline='') as f:
        # csv.writer quotes fields, so a comma inside a title cannot shift
        # the columns.
        writer = csv.writer(f)
        writer.writerow(['电影名', '评分', '链接', '图片地址'])
        for content in contents:
            writer.writerow([
                content['电影名'],
                content['评分'],
                content['链接'],
                content['图片地址'],
            ])
    print('文件已写入完成!')

    print("\n开始采集图片\n")
    for i, content in enumerate(contents, start=1):
        title = content['电影名']
        try:
            image = requests.get(content['图片地址'], headers=headers,
                                 timeout=10)
            image.raise_for_status()
        except requests.RequestException as err:
            # One failed download should not abort the remaining ones, and
            # an error page must not be saved as a .jpg.
            print(f"第 {i:03} 张图片 --{title}-- 采集失败: {err}")
            continue
        file_name = f"{i:03} {title} {content['评分']}.jpg"
        file_name = file_name.translate(unsafe_chars)
        with open(os.path.join(out_dir, file_name), 'wb') as fp:
            fp.write(image.content)
        print(f"第 {i:03} 张图片 --{title}-- 采集完成")
    print('全部图片采集完成')
# Script entry point: collect every search page, then save the CSV and covers.
if __name__ == '__main__':
    # ``headers`` is intentionally a module-level name: besides being passed
    # to get_json(), it is the variable save2file() picks up for its image
    # requests.
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) '
                      'AppleWebKit/537.36 (KHTML, like Gecko) '
                      'Chrome/80.0.3987.132 Safari/537.36',
        # Placeholder: replace with the Cookie of your own logged-in session.
        'Cookie': '***'
    }
    save2file(get_json(get_urls(), headers))