Python crawler - beauty pictures
I also packaged the program with pyinstaller:
Link: https://pan.baidu.com/s/1uQTtpIwOpBMoMH5nO3PJrA  Extraction code: unuq
Python 3.6.2
Required modules: requests, lxml
pip install requests lxml
Create a 美图 folder in the same directory as the script, then run it. Go easy on me, gurus!
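If you'd rather not create the folder by hand, a short sketch using only the standard library (美图 is the folder name the code below expects; the updated version further down automates this inside the script):

import os
os.makedirs('美图', exist_ok=True)  # no error if the folder already exists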
Code:
#-*- coding:utf-8 -*-
import os
import time
import requests
from lxml import etree

headers = {
    'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_13_5) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.110 Safari/537.36',
    'Referer': 'http://www.mmjpg.com/tag/xinggan'
}

## tag/xinggan is the front page of the "xinggan" (sexy) category
url = "http://www.mmjpg.com/tag/xinggan"
html = requests.get(url).text
soup1 = etree.HTML(html)

## Number of pages in the category: xpath() returns a list of hrefs;
## this assumes the last pagination link ends with the page count
all_page = int(soup1.xpath('/html/body/div/div/div/a/@href')[-1].split('/')[-1])

for page in range(all_page):
    url = "http://www.mmjpg.com/tag/xinggan/%d" % (page + 1)
    html = requests.get(url).text
    soup1 = etree.HTML(html)
    # Each category page lists 15 models
    for i in range(15):
        path = "/html/body/div/div/ul/li[%d]/a/@href" % (i + 1)
        # URL of this model's gallery page (xpath() returns a list; take the first hit)
        tep_url = soup1.xpath(path)[0]
        # Model id: the Referer for each image request needs it
        model_id = int(tep_url.split('/')[-1])
        dir_name = '美图/' + str(model_id)
        os.mkdir(dir_name)
        # Gallery title
        title = soup1.xpath('/html/body/div/div/ul/li[%d]/span/a/text()' % (i + 1))[0]
        # Fetch and parse the gallery page
        pic_page = requests.get(tep_url).text
        soup2 = etree.HTML(pic_page)
        # Number of images in this gallery: take the largest numeric pagination label
        page_num = max(int(t) for t in soup2.xpath('//*[@id="page"]/a/text()') if t.strip().isdigit())
        # URL of the gallery's first image, e.g. http://fm.shiyunjj.com/2018/1502/1ie6.jpg
        pic_url = soup2.xpath('//*[@id="content"]/a/img/@src')[0]
        # Download every image in the gallery
        for j in range(page_num):
            # Each image page doubles as the Referer for its image request
            detail_url = "http://www.mmjpg.com/mm/%d/%d" % (model_id, j + 1)
            headers['Referer'] = detail_url
            # Extract the actual image link from the page
            html_detail = requests.get(detail_url).text
            soup3 = etree.HTML(html_detail)
            pic = soup3.xpath('//*[@id="content"]/a/img/@src')[0]
            # Save the image
            with open(dir_name + '/' + str(j + 1) + '.jpg', 'wb') as f:
                print('Downloading:', dir_name + '/' + str(j + 1) + '.jpg')
                f.write(requests.get(pic, headers=headers).content)
            # time.sleep(0.5)
[Error below] It runs fine for a while, then dies with a timeout; I'm not sure whether it's my network.
...
Downloading: 美图/831/31.jpg
Downloading: 美图/831/32.jpg
Traceback (most recent call last):
File "D:\Program Files\Python\3.7.0\lib\site-packages\urllib3\connection.py", line 171, in _new_conn
(self._dns_host, self.port), self.timeout, **extra_kw)
File "D:\Program Files\Python\3.7.0\lib\site-packages\urllib3\util\connection.py", line 79, in create_connection
raise err
File "D:\Program Files\Python\3.7.0\lib\site-packages\urllib3\util\connection.py", line 69, in create_connection
sock.connect(sa)
TimeoutError: A connection attempt failed because the connected party did not properly respond after a period of time, or established connection failed because connected host has failed to respond.
During handling of the above exception, another exception occurred:
Traceback (most recent call last):
File "D:\Program Files\Python\3.7.0\lib\site-packages\urllib3\connectionpool.py", line 600, in urlopen
chunked=chunked)
File "D:\Program Files\Python\3.7.0\lib\site-packages\urllib3\connectionpool.py", line 354, in _make_request
conn.request(method, url, **httplib_request_kw)
File "D:\Program Files\Python\3.7.0\lib\http\client.py", line 1229, in request
self._send_request(method, url, body, headers, encode_chunked)
File "D:\Program Files\Python\3.7.0\lib\http\client.py", line 1275, in _send_request
self.endheaders(body, encode_chunked=encode_chunked)
File "D:\Program Files\Python\3.7.0\lib\http\client.py", line 1224, in endheaders
self._send_output(message_body, encode_chunked=encode_chunked)
File "D:\Program Files\Python\3.7.0\lib\http\client.py", line 1016, in _send_output
self.send(msg)
File "D:\Program Files\Python\3.7.0\lib\http\client.py", line 956, in send
self.connect()
File "D:\Program Files\Python\3.7.0\lib\site-packages\urllib3\connection.py", line 196, in connect
conn = self._new_conn()
File "D:\Program Files\Python\3.7.0\lib\site-packages\urllib3\connection.py", line 180, in _new_conn
self, "Failed to establish a new connection: %s" % e)
urllib3.exceptions.NewConnectionError: <urllib3.connection.HTTPConnection object at 0x000001CC08DD2908>: Failed to establish a new connection: A connection attempt failed because the connected party did not properly respond after a period of time, or established connection failed because connected host has failed to respond.
During handling of the above exception, another exception occurred:
Traceback (most recent call last):
File "D:\Program Files\Python\3.7.0\lib\site-packages\requests\adapters.py", line 445, in send
timeout=timeout
File "D:\Program Files\Python\3.7.0\lib\site-packages\urllib3\connectionpool.py", line 638, in urlopen
_stacktrace=sys.exc_info())
File "D:\Program Files\Python\3.7.0\lib\site-packages\urllib3\util\retry.py", line 398, in increment
raise MaxRetryError(_pool, url, error or ResponseError(cause))
urllib3.exceptions.MaxRetryError: HTTPConnectionPool(host='www.mmjpg.com', port=80): Max retries exceeded with url: /mm/830 (Caused by NewConnectionError('<urllib3.connection.HTTPConnection object at 0x000001CC08DD2908>: Failed to establish a new connection: A connection attempt failed because the connected party did not properly respond after a period of time, or established connection failed because connected host has failed to respond.'))
During handling of the above exception, another exception occurred:
Traceback (most recent call last):
File "G:/PyCharm_Projects/图像/meitu.py", line 38, in <module>
pic_page = requests.get(tep_url).text
File "D:\Program Files\Python\3.7.0\lib\site-packages\requests\api.py", line 72, in get
return request('get', url, params=params, **kwargs)
File "D:\Program Files\Python\3.7.0\lib\site-packages\requests\api.py", line 58, in request
return session.request(method=method, url=url, **kwargs)
File "D:\Program Files\Python\3.7.0\lib\site-packages\requests\sessions.py", line 512, in request
resp = self.send(prep, **send_kwargs)
File "D:\Program Files\Python\3.7.0\lib\site-packages\requests\sessions.py", line 622, in send
r = adapter.send(request, **kwargs)
File "D:\Program Files\Python\3.7.0\lib\site-packages\requests\adapters.py", line 513, in send
raise ConnectionError(e, request=request)
requests.exceptions.ConnectionError: HTTPConnectionPool(host='www.mmjpg.com', port=80): Max retries exceeded with url: /mm/830 (Caused by NewConnectionError('<urllib3.connection.HTTPConnection object at 0x000001CC08DD2908>: Failed to establish a new connection: A connection attempt failed because the connected party did not properly respond after a period of time, or established connection failed because connected host has failed to respond.'))
Process finished with exit code 1
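The traceback is just a connection timeout, so a flaky network or the site throttling rapid requests are both plausible. A minimal sketch of one way to harden the loop, assuming nothing beyond the libraries the script already uses: give every request a timeout and mount a retrying adapter on a requests Session:

import requests
from requests.adapters import HTTPAdapter
from urllib3.util.retry import Retry

session = requests.Session()
# Retry up to 3 times on connection failures and common 5xx responses,
# sleeping 1s, 2s, 4s between attempts
retry = Retry(total=3, backoff_factor=1, status_forcelist=[500, 502, 503, 504])
session.mount('http://', HTTPAdapter(max_retries=retry))
session.mount('https://', HTTPAdapter(max_retries=retry))

# Use session.get() wherever the script calls requests.get(), and always
# pass a timeout so a dead connection fails fast instead of hanging
html = session.get('http://www.mmjpg.com/tag/xinggan', timeout=10).text

Un-commenting the time.sleep(0.5) between downloads would also make the server less likely to drop connections.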
Update: it now creates the folders automatically.
#-*- coding:utf-8 -*-
import os
import time
import requests
from lxml import etree

headers = {
    'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_13_5) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.110 Safari/537.36',
    'Referer': 'http://www.mmjpg.com/tag/xinggan'
}

## tag/xinggan is the front page of the "xinggan" (sexy) category
url = "http://www.mmjpg.com/tag/xinggan"
html = requests.get(url).text
soup1 = etree.HTML(html)

## Number of pages in the category (assumes the last pagination link ends with the page count)
all_page = int(soup1.xpath('/html/body/div/div/div/a/@href')[-1].split('/')[-1])

for page in range(all_page):
    url = "http://www.mmjpg.com/tag/xinggan/%d" % (page + 1)
    html = requests.get(url).text
    soup1 = etree.HTML(html)
    # Each category page lists 15 models
    for i in range(15):
        path = "/html/body/div/div/ul/li[%d]/a/@href" % (i + 1)
        # URL of this model's gallery page
        tep_url = soup1.xpath(path)[0]
        # Model id: the Referer for each image request needs it
        model_id = int(tep_url.split('/')[-1])
        # Create the output folder automatically; makedirs builds the whole
        # path, and exist_ok avoids an error when it is already there
        dir_name = '美女/' + str(model_id)
        os.makedirs(dir_name, exist_ok=True)
        # Gallery title
        title = soup1.xpath('/html/body/div/div/ul/li[%d]/span/a/text()' % (i + 1))[0]
        # Fetch and parse the gallery page
        pic_page = requests.get(tep_url).text
        soup2 = etree.HTML(pic_page)
        # Number of images in this gallery: take the largest numeric pagination label
        page_num = max(int(t) for t in soup2.xpath('//*[@id="page"]/a/text()') if t.strip().isdigit())
        # URL of the gallery's first image, e.g. http://fm.shiyunjj.com/2018/1502/1ie6.jpg
        pic_url = soup2.xpath('//*[@id="content"]/a/img/@src')[0]
        # Download every image in the gallery
        for j in range(page_num):
            detail_url = "http://www.mmjpg.com/mm/%d/%d" % (model_id, j + 1)
            headers['Referer'] = detail_url
            # Extract the actual image link from the page
            html_detail = requests.get(detail_url).text
            soup3 = etree.HTML(html_detail)
            pic = soup3.xpath('//*[@id="content"]/a/img/@src')[0]
            # Save the image
            with open(dir_name + '/' + str(j + 1) + '.jpg', 'wb') as f:
                print('Downloading:', dir_name + '/' + str(j + 1) + '.jpg')
                f.write(requests.get(pic, headers=headers).content)
            # time.sleep(0.5)

I want the site URL~ haha~

Feels like I'm looking at NSFW pics.

long860226 posted on 2018-12-10 23:35:
I want the site URL~ haha~

The URL is right there in the code~

Thanks for sharing.

You could use the os module to check whether the folder exists, create it if it doesn't, and then save the files into it.

Getting ready to start learning Python.

It doesn't seem to work; OP, please debug:
Traceback (most recent call last):
File "C:/Users/Python/Desktop/Python/美女图/mmjp.py", line 21, in <module>
for page in all_page:
TypeError: 'int' object is not iterable
Process finished with exit code 1

Does it really work?

Change it to: for page in range(all_page)
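That fix is right: the traceback shows all_page is a single int (the parsed page count), and an int cannot be iterated directly; range() turns it into the sequence 0 .. all_page-1. A minimal illustration (75 is just a placeholder value):

all_page = 75                     # an int, e.g. the parsed page count
# for page in all_page:           # TypeError: 'int' object is not iterable
for page in range(all_page):      # yields 0, 1, ..., 74
    url = "http://www.mmjpg.com/tag/xinggan/%d" % (page + 1)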