本帖最后由 mlyde 于 2021-4-3 15:52 编辑
下载《材料力学1 第六版刘鸿文-指导与例题习题解答》一书的全部图片。
下载的图片保存在“./cailiaolixue”目录里,共300张;我没有重命名图片,按修改时间排序大致就是正确顺序(图片中也印有页码)。若想按页码命名,可以添加一个全局变量 page,每次调用 img_download 时让 page 加一,并把它拼接到文件名中。
通过post方式获取图片链接时,data中要有正确的时间戳才能获取到含有图片链接的json。
若遇到不能下载图片的情况(会输出 'Image download Error!' ),可能是Cookie过期(我也不清楚Cookie的有效期是多少),打开那个网页,找到 Cookie 替换一下就好。
完整代码:
[Python] 纯文本查看 复制代码
# 爬取的网页: https://max.book118.com/html/2019/0628/8026130071002032.shtm
import requests
import time
import re
import os
# Replace this value with the Cookie header copied from the browser.
cookie = '__yjs_duid=1_773b9a1b3c7d6735375c5941b9f5f1331617165401049; CLIENT_SYS_UN_ID=3rvgCmBj/FkLjk48Bc44Ag==; s_v=cdh%3D%3E27a30245%7C%7C%7Cvid%3D%3E1617165403565050051%7C%7C%7Cfsts%3D%3E1617165403%7C%7C%7Cdsfs%3D%3E0%7C%7C%7Cnps%3D%3E1; s_rfd=cdh%3D%3E27a30245%7C%7C%7Ctrd%3D%3Emax.book118.com%7C%7C%7Cftrd%3D%3Ebook118.com; s_s=cdh%3D%3E27a30245%7C%7C%7Clast_req%3D%3E1617165403%7C%7C%7Csid%3D%3E1617165403961329323%7C%7C%7Cdsps%3D%3E0'
# The timestamp used in the JSONP callback name is the last run of 13
# consecutive digits found in the cookie (non-overlapping, left to right).
_thirteen_digit_runs = re.compile('[0-9]{13}').findall(cookie)
time_callback = _thirteen_digit_runs[-1]
def img_download(img_url):
    """Download one image and save it inside ``./cailiaolixue/``.

    The file name is the last ``/``-separated component of ``img_url``.
    When the request fails or the server does not answer with HTTP 200,
    ``'Image download Error!'`` is printed and no file is written, so an
    error page is never stored as if it were an image.

    Args:
        img_url: Absolute URL of the image to fetch.
    """
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/89.0.4389.90 Safari/537.36'
    }
    try:
        # A timeout keeps one stalled connection from hanging the whole run.
        response = requests.get(img_url, headers=headers, timeout=30)
    except requests.RequestException as exc:
        # One unreachable image should not abort the remaining downloads.
        print('Image download Error!', exc)
        return
    if response.status_code != 200:
        print('Image download Error!')
        return
    # Name the saved file after the last path component of the URL.
    iname = img_url.split('/')[-1]
    with open('./cailiaolixue/' + iname, 'wb') as fp:
        fp.write(response.content)
def get_urls(page):
    """Request the preview links for ``page`` and download every image found.

    Posts to the book118 preview endpoint using the module-level ``cookie``
    and ``time_callback``, pulls every ``.png`` link out of the response
    text, and passes each one to ``img_download`` with a short pause
    between downloads.

    Args:
        page: Page number sent as the ``page`` form field.
    """
    url = 'https://openapi.book118.com/getPreview.html'
    headers = {
        'Accept': '*/*',
        # NOTE(review): 'br' is advertised here; requests only decodes
        # Brotli bodies when a brotli package is installed - confirm.
        'Accept-Encoding': 'gzip, deflate, br',
        'Accept-Language': 'zh-CN,zh;q=0.9',
        'Cache-Control': 'no-cache',
        'Connection': 'keep-alive',
        'Cookie': cookie,
        'Host': 'openapi.book118.com',
        'Pragma': 'no-cache',
        'Referer': 'https://max.book118.com/',
        'sec-ch-ua': '"Google Chrome";v="89", "Chromium";v="89", ";Not A Brand";v="99"',
        'sec-ch-ua-mobile': '?0',
        'Sec-Fetch-Dest': 'script',
        'Sec-Fetch-Mode': 'no-cors',
        'Sec-Fetch-Site': 'same-site',
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/89.0.4389.90 Safari/537.36',
    }
    # Current time in milliseconds, sent as the "_" form field.
    times = str(int(time.time() * 1000))
    data = {
        'project_id': '1',
        'aid': '226572288',
        't': '21680590b4a45e8ce7af4406a71ce42b',
        'view_token': '_jS9X2H2hCTZ5H7fyadG@f7ZGPf5ZnZK',
        'page': page,
        'filetype': 'docx',
        'callback': 'jQuery183010559838597161075_' + time_callback,
        '_': times
    }
    # A timeout keeps one stalled connection from hanging the whole run.
    response = requests.post(url, headers=headers, data=data, timeout=30)
    response.encoding = 'utf-8'
    # Links appear in the response with escaped slashes (\/\/host\/x.png);
    # the pattern captures everything after the leading \/\/ up to ".png".
    escaped_links = re.findall(r'\\\/\\\/(.*?\.png)', response.text)
    if not escaped_links:
        # Without this hint a rejected request just downloads nothing.
        print('No image url found for page ' + str(page)
              + ' (HTTP ' + str(response.status_code) + ')')
        return
    for escaped_link in escaped_links:
        time.sleep(0.5)
        # Turn the remaining "\/" separators back into "/" so the request
        # goes to a well-formed URL.
        img_url = 'https://' + escaped_link.replace('\\/', '/')
        print(img_url)
        img_download(img_url)
def main():
    """Create the output folder when missing, then walk the page numbers.

    Calls ``get_urls`` for pages 1, 6, 11, ... 296, printing progress and
    pausing 0.6 seconds before each call.
    """
    output_dir = './cailiaolixue'
    # Images are written into this folder; create it on the first run.
    folder_missing = not os.path.exists(output_dir)
    if folder_missing:
        os.mkdir(output_dir)
    # 300 pages in total, requested in steps of five.
    first_page = 1
    while first_page < 300:
        print(first_page, '/ 300')
        time.sleep(0.6)
        get_urls(first_page)
        first_page += 5
if __name__ == "__main__":
    main()
    # "pause" is a Windows cmd built-in that keeps the console window open
    # after the run; other shells have no such command, so skip it there.
    if os.name == 'nt':
        os.system('pause')
|