python之requests爬取百度图片--re正则获取图片
本帖最后由 gopy 于 2022-3-28 22:04 编辑

1、爬取百度图片
(1)、先下载一张图片
```
def download_img():
    """Download a single sample image from Baidu and save it as 1.jpg.

    Demonstrates the minimal requests workflow: GET the image URL and
    write the raw response bytes to disk in binary mode.
    """
    url = 'https://img2.baidu.com/it/u=2957654884,1708144004&fm=253&fmt=auto&app=138&f=JPEG?w=350&h=350'
    # A timeout keeps the script from hanging forever on a dead link.
    resp = requests.get(url, timeout=10)
    with open('1.jpg', 'wb') as f:
        f.write(resp.content)
```
(2)、下载一页的图片
-----先要获取第一页网页的地址
------再获取一页所有图片的地址
先分析网页,百度运用的是js发送请求
发现百度图片每页以30张图片为一组,分析网页地址发现其中的参数(pn)以30为步长进行累加
```
https://image.baidu.com/search/acjson?tn=resultjson_com&logid=8641423673525825447&ipn=rj&ct=201326592&is=&fp=result&fr=&word=%E5%A8%83%E5%A8%83&queryWord=%E5%A8%83%E5%A8%83&cl=2&lm=-1&ie=utf-8&oe=utf-8&adpicid=&st=-1&z=&ic=0&hd=&latest=©right=&s=&se=&tab=&width=&height=&face=0&istype=2&qc=&nc=1&expermode=&nojc=&isAsync=&pn=30&rn=30&gsm=1e&1648349651809=
https://image.baidu.com/search/acjson?tn=resultjson_com&logid=8641423673525825447&ipn=rj&ct=201326592&is=&fp=result&fr=&word=%E5%A8%83%E5%A8%83&queryWord=%E5%A8%83%E5%A8%83&cl=2&lm=-1&ie=utf-8&oe=utf-8&adpicid=&st=-1&z=&ic=0&hd=&latest=©right=&s=&se=&tab=&width=&height=&face=0&istype=2&qc=&nc=1&expermode=&nojc=&isAsync=&pn=60&rn=30&gsm=3c&1648349652108=
```
```
url='https://image.baidu.com/search/acjson?tn=resultjson_com&logid=8641423673525825447&ipn=rj&ct=201326592&is=&fp=result&fr=&word=%E5%A8%83%E5%A8%83&queryWord=%E5%A8%83%E5%A8%83&cl=2&lm=-1&ie=utf-8&oe=utf-8&adpicid=&st=-1&z=&ic=0&hd=&latest=©right=&s=&se=&tab=&width=&height=&face=0&istype=2&qc=&nc=1&expermode=&nojc=&isAsync=&pn={}&rn=30&gsm=3c&1648349652108='
```
```
def gain_img_url():
    """Fetch one Baidu image-search result page and extract thumbnail URLs.

    Returns:
        list[str]: every ``thumbURL`` value found in the JSON-like response.
    """
    url = 'https://image.baidu.com/search/acjson?tn=resultjson_com&logid=8641423673525825447&ipn=rj&ct=201326592&is=&fp=result&fr=&word=%E5%A8%83%E5%A8%83&queryWord=%E5%A8%83%E5%A8%83&cl=2&lm=-1&ie=utf-8&oe=utf-8&adpicid=&st=-1&z=&ic=0&hd=&latest=©right=&s=&se=&tab=&width=&height=&face=0&istype=2&qc=&nc=1&expermode=&nojc=&isAsync=&pn={}&rn=30&gsm=3c&1648349652108='
    # BUG FIX: str.format returns a NEW string; the original discarded the
    # result, so the request went out with a literal '{}' in the URL.
    url = url.format(30)
    header = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/99.0.4844.84 Safari/537.36"
    }
    content = requests.get(url, headers=header)
    # Thumbnail links appear in the payload as "thumbURL":"..."
    rule = r'"thumbURL":"(.*?)"'
    url_list = re.findall(rule, content.text)
    return url_list
```
(3)、构造多页的网页地址,实现下载多页图片
```
def create_url(page):
    """Build the request URL for each result page.

    Baidu paginates in steps of 30 (``pn=30, 60, ...``), so page *i*
    maps to ``pn=i*30``.

    Returns:
        list[str]: one formatted URL per requested page, in order.
    """
    # The template is loop-invariant, so build it once up front.
    template = 'https://image.baidu.com/search/acjson?tn=resultjson_com&logid=8641423673525825447&ipn=rj&ct=201326592&is=&fp=result&fr=&word=%E5%A8%83%E5%A8%83&queryWord=%E5%A8%83%E5%A8%83&cl=2&lm=-1&ie=utf-8&oe=utf-8&adpicid=&st=-1&z=&ic=0&hd=&latest=©right=&s=&se=&tab=&width=&height=&face=0&istype=2&qc=&nc=1&expermode=&nojc=&isAsync=&pn={}&rn=30&gsm=3c&1648349652108='
    return [template.format(step * 30) for step in range(1, page + 1)]
```
```
import re
from urllib import parse
import os
import requests
def download_img(index, url, foldername):
    """Download one image and save it as ``./<foldername>/<index>.jpg``.

    Args:
        index: sequence number used as the output file name.
        url: direct link to the image.
        foldername: target directory, created on first use.
    """
    if not os.path.exists(f'./{foldername}'):
        print(f'{foldername}文件夹不存在,即将创建')
        # exist_ok guards against the exists/mkdir race (another run may
        # have created the folder between the check and this call).
        os.makedirs(foldername, exist_ok=True)
        print('创建成功')
    name = f'./{foldername}/{index}.jpg'
    # A timeout keeps a single stalled image from hanging the whole crawl.
    content = requests.get(url, timeout=10)
    with open(name, 'wb') as f:
        f.write(content.content)
def gain_img_url(url):
    """Request one Baidu result page and pull out every thumbnail URL.

    Args:
        url: fully-formatted acjson search URL for a single page.

    Returns:
        list[str]: all ``thumbURL`` values found in the response body.
    """
    # A browser User-Agent is required or Baidu rejects the request.
    headers = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/99.0.4844.84 Safari/537.36"
    }
    response = requests.get(url, headers=headers)
    # Thumbnail links appear in the JSON payload as "thumbURL":"..."
    return re.findall(r'"thumbURL":"(.*?)"', response.text)
def create_url(page, name):
    """Build one search URL per result page for the given keyword.

    Args:
        page: number of pages to generate.
        name: percent-encoded search keyword.

    Returns:
        list[str]: formatted URLs with ``pn`` stepping by 30 per page.
    """
    urls_list = []
    for i in range(1, page + 1):
        # BUG FIX: the original template read 'queryWord={}cl=2' -- the
        # missing '&' fused two query parameters and corrupted the query
        # string (the earlier hard-coded URL correctly has '...&cl=2').
        url = 'https://image.baidu.com/search/acjson?tn=resultjson_com&logid=8641423673525825447&ipn=rj&ct=201326592&is=&fp=result&fr=&word={}&queryWord={}&cl=2&lm=-1&ie=utf-8&oe=utf-8&adpicid=&st=-1&z=&ic=0&hd=&latest=©right=&s=&se=&tab=&width=&height=&face=0&istype=2&qc=&nc=1&expermode=&nojc=&isAsync=&pn={}&rn=30&gsm=3c&1648349652108='
        url = url.format(name, name, i * 30)
        urls_list.append(url)
    return urls_list
def main():
    """Interactive entry point: ask for page count and keyword, then crawl.

    Collects thumbnail URLs across all requested pages and downloads each
    image into a folder named after the (unencoded) keyword.
    """
    page = int(input('请输入下载的页数'))
    name = input('请输入你要下载的内容')
    # The keyword must be percent-encoded before going into the query string.
    name_ascii = parse.quote(name)
    all_img_urls_list = []
    for page_url in create_url(page, name_ascii):
        all_img_urls_list.extend(gain_img_url(page_url))
    for index, img_url in enumerate(all_img_urls_list):
        download_img(index, img_url, name)


if __name__ == '__main__':
    main()
```

谢谢分享。

不错,学习了。{:1_921:}

我的pycharm报错说我没有requests这个模块啊

繁星六月 发表于 2022-3-29 21:52
我的pycharm报错说我没有requests这个模块啊
pip install requests
安装一下
页:
[1]