Scraping beautiful-girl photo galleries (with multi-category selection)
Supports an index of the site's 6 main categories with a friendlier interactive flow; files are stored under the pics folder on the D drive.
The exe is about 6 MB and too big to upload here, so it lives on GitHub: https://github.com/XingJinming-real/Web_Spider_beauties (a star would be appreciated).
No screenshots here; you get the idea.
import time
import re
import requests
import os
from bs4 import BeautifulSoup
def getmainname(head):  # collect the 6 main categories from the front page
    url = 'https://www.mm131.net/'
    r = requests.get(url, headers=head)
    namels = []; urlls = []; namelist = []
    try:
        r.raise_for_status()
    except requests.RequestException:
        print('r.code!=200')
    r.encoding = r.apparent_encoding
    soup = BeautifulSoup(r.text, 'html.parser')
    mainname = soup.find_all('li')  # <li> tags holding the category names
    mainurl = soup.find_all('a')    # <a> tags holding the category links
    for li_tag, a_tag in zip(mainname, mainurl):  # pair each name with its link
        namels.append(li_tag.string)
        urlls.append(a_tag.get('href'))
        time.sleep(0.1)
    for i in range(len(namels)):
        namelist.append([namels[i], urlls[i]])
    return namelist  # 2-D list of [category name, category URL] pairs
def entername(namelist, head):  # ask the user which category to crawl
    subnamelist = []; subnamels = []; suburlls = []
    n = int(input('Enter a category number between 0 and 5: '))
    print('Selected {}'.format(namelist[n][0]))
    time.sleep(0.3)
    r = requests.get(namelist[n][1], headers=head)
    r.raise_for_status()
    r.encoding = r.apparent_encoding
    soup = BeautifulSoup(r.text, 'html.parser')
    subnamelt = soup.find_all('img')[:20]  # <img> tags; the alt text is the suite name
    suburllt = soup.find_all('a')          # <a> tags linking to the suites
    for i in range(len(subnamelt)):
        subnamels.append(subnamelt[i].get('alt'))
        suburlls.append(suburllt[i].get('href'))
    for i in range(len(subnamels)):
        subnamelist.append([subnamels[i], suburlls[i]])
    getpics(subnamelist)
    return subnamelist  # 2-D list of [suite name, suite URL] for this category
def getpics(subnamelist):  # crawl each suite in the selected category
    global namelist, head
    if not os.path.exists('D:\\pics\\'):
        os.mkdir('D:\\pics\\')
    print('This category has {} suites'.format(len(subnamelist)))
    for i in range(len(subnamelist)):  # iterate over the suites
        print('\tCrawling suite {}: {}'.format(i + 1, subnamelist[i][0]))
        ifall = int(input('Crawl the whole suite? Enter 1 for yes: '))
        if not os.path.exists('D:\\pics\\' + subnamelist[i][0]):
            os.mkdir('D:\\pics\\' + subnamelist[i][0])  # one folder per suite
        urllist, url = getdetails(subnamelist[i][1], ifall)  # URL of every image in the suite
        name = 'D:\\pics\\' + subnamelist[i][0] + '\\'
        for j in range(len(urllist)):
            print('\tFetching image {}'.format(j + 1))
            time.sleep(0.2)
            write(urllist[j], name, j, url)  # save each image of the suite
            if j == len(urllist) - 1:
                print('Suite finished')
    print('All suites fetched')
def write(urlls, name, j, url):  # save a single image of a suite
    r = requests.get(urlls, headers={
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/72.0.3626.121 Safari/537.36',
        'Referer': url  # the image host checks the Referer header
    })
    r.raise_for_status()
    with open(name + str(j) + '.jpg', 'wb') as f:
        f.write(r.content)
def getdetails(url, ifall):  # build the URL of every image in a suite
    urllist = []
    code = re.search(r'\d{4}', url).group(0)  # suite id, e.g. 5520
    url2 = 'https://img1.mmmw.net/pic/' + code + '.html'
    num = getnum(url)
    print('\tThis suite has {} images'.format(num))
    if ifall != 1:
        n = int(input('How many images do you want? '))
    else:
        n = int(num)
    for j in range(1, n + 1):
        # images follow the pattern https://img1.mmmw.net/pic/5520/1.jpg
        urlj = url2.replace('.html', '') + '/' + str(j) + '.jpg'
        urllist.append(urlj)
    return urllist, url
def main(head, namelist):
    entername(namelist, head)
def getnum(url):  # read the image count of a suite from its page
    num = '0'
    r = requests.get(url, headers=head)
    try:
        r.raise_for_status()
        r.encoding = r.apparent_encoding
        soup = BeautifulSoup(r.text, 'html.parser')
        num = soup.find('span').string  # looks like '共54页'
        num = num.replace('页', '')  # the Chinese characters match the site's own text
        num = num.replace('共', '')
    except Exception:
        print('getnum error')
    return num
head = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/72.0.3626.121 Safari/537.36'
}
namelist = getmainname(head)
main(head, namelist)
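A side note: getnum pulls the count out of a label like '共54页' ("54 pages in total") with two chained replace calls; a single regex does the same more robustly. A minimal sketch, with the sample string assumed:

import re

def parse_page_count(text):
    # extract the number from a label like '共54页' ("54 pages in total")
    match = re.search(r'共(\d+)页', text)
    return int(match.group(1)) if match else 0

print(parse_page_count('共54页'))  # -> 54

The listing below is the updated version: it walks every page of a category (see get_page_num) and downloads with threads.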
import threading
import re
import requests
import os
from bs4 import BeautifulSoup
def getmainname(head):  # collect the 6 main categories from the front page
    url = 'https://www.mm131.net/'
    r = requests.get(url, headers=head)
    namels = []; urlls = []; namelist = []
    try:
        r.raise_for_status()
    except requests.RequestException:
        print('r.code!=200')
    r.encoding = r.apparent_encoding
    soup = BeautifulSoup(r.text, 'html.parser')
    mainname = soup.find_all('li')  # <li> tags holding the category names
    mainurl = soup.find_all('a')    # <a> tags holding the category links
    for li_tag, a_tag in zip(mainname, mainurl):  # pair each name with its link
        namels.append(li_tag.string)
        urlls.append(a_tag.get('href'))
    for i in range(len(namels)):
        namelist.append([namels[i], urlls[i]])
    return namelist  # 2-D list of [category name, category URL] pairs
def get_page_num(page_url):  # number of pages in a category, read from the last pager link
    r = requests.get(page_url, headers={'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/72.0.3626.121 Safari/537.36'})
    r.raise_for_status()
    r.encoding = r.apparent_encoding
    soup = BeautifulSoup(r.text, 'html.parser')
    tag = soup.find_all('a', 'page-en')[-1]  # last pager link, e.g. list_6_54.html
    num_temp = re.search(r'\d+\.html', tag.get('href')).group(0)
    num = int(num_temp.replace('.html', ''))
    return num
def entername(namelist, head):  # ask the user which category to crawl
    for i in range(6):
        print('{}--{}'.format(i, namelist[i][0]))
    n = int(input('Enter a category number between 0 and 5 (CTRL+C force-quits): '))
    print('Selected {}'.format(namelist[n][0]))
    # page codes used in the pagination URLs; the keys stay in Chinese because
    # they are looked up with the category names scraped from the site
    dict_page = {'性感美女': '6', '清纯美眉': '1', '美女校花': '2', '性感车模': '3', '旗袍美女': '4', '明星写真': '5'}
    base_url = namelist[n][1]
    page_num = get_page_num(base_url)
    print('This category has about {} suites'.format(page_num * 20 - 20))
    for page in range(1, page_num + 1):  # walk every page of the category
        if page == 1:
            page_url = base_url  # page 1 is the bare category URL
        else:
            page_url = base_url + 'list_' + dict_page[namelist[n][0]] + '_' + str(page) + '.html'
        print(page_url)
        r = requests.get(page_url, headers=head)
        r.raise_for_status()
        r.encoding = r.apparent_encoding
        soup = BeautifulSoup(r.text, 'html.parser')
        subnamelt = soup.find_all('img')[:20]  # <img> tags; the alt text is the suite name
        suburllt = soup.find_all('a')          # <a> tags linking to the suites
        subnamelist = []; subnamels = []; suburlls = []
        for i in range(len(subnamelt)):
            subnamels.append(subnamelt[i].get('alt'))
            suburlls.append(suburllt[i].get('href'))
        for i in range(len(subnamels)):
            subnamelist.append([subnamels[i], suburlls[i]])
        getpics(subnamelist)  # crawl the 20 suites on this page
    print('Whole category fetched')
def getpics(subnamelist):  # crawl each suite in the list
    global namelist, head
    if not os.path.exists('D:\\pics\\'):
        os.mkdir('D:\\pics\\')
    ifall = int(input('Crawl everything? Enter 1 for yes: '))
    for i in range(len(subnamelist)):  # iterate over the suites
        print('\tCrawling suite {}: {}'.format(i + 1, subnamelist[i][0]))
        if not os.path.exists('D:\\pics\\' + subnamelist[i][0]):
            os.mkdir('D:\\pics\\' + subnamelist[i][0])  # one folder per suite
        try:
            urllist, url = getdetails(subnamelist[i][1], ifall)  # URL of every image in the suite
            name = 'D:\\pics\\' + subnamelist[i][0] + '\\'
            for j in range(len(urllist)):
                print('\tFetching image {}'.format(j + 1))
                # one thread per image to speed up the downloads
                threading.Thread(target=write, args=(urllist[j], name, j, url)).start()
                if j == len(urllist) - 1:
                    print('Suite finished')
        except Exception:
            pass  # skip suites that fail to parse or download
def write(urlls, name, j, url):  # save a single image of a suite
    r = requests.get(urlls, headers={
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/72.0.3626.121 Safari/537.36',
        'Referer': url  # the image host checks the Referer header
    })
    r.raise_for_status()
    with open(name + str(j) + '.jpg', 'wb') as f:
        f.write(r.content)
def getdetails(url, ifall):  # build the URL of every image in a suite
    urllist = []
    code = re.search(r'\d{3,4}', url).group(0)  # 3- or 4-digit suite id, e.g. 5513
    url2 = 'https://img1.mmmw.net/pic/' + code + '.html'
    num = getnum(url)
    print('\tThis suite has {} images'.format(num))
    if ifall != 1:
        n = int(input('How many images do you want? '))
    else:
        n = int(num)
    for j in range(1, n + 1):
        # images follow the pattern https://img1.mmmw.net/pic/5520/1.jpg
        urlj = url2.replace('.html', '') + '/' + str(j) + '.jpg'
        urllist.append(urlj)
    return urllist, url
def main(head, namelist):
    entername(namelist, head)
def getnum(url):  # read the image count of a suite from its page
    r = requests.get(url, headers=head)
    r.raise_for_status()
    r.encoding = r.apparent_encoding
    soup = BeautifulSoup(r.text, 'html.parser')
    num = soup.find('span').string  # looks like '共54页'
    num = num.replace('页', '')  # the Chinese characters match the site's own text
    num = num.replace('共', '')
    return num
head = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/72.0.3626.121 Safari/537.36'
}
namelist = getmainname(head)
main(head, namelist)
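A note on the pagination used above: page 1 of a category is the bare category URL, while later pages append list_<code>_<page>.html, where <code> comes from dict_page. A tiny helper showing the pattern (the example URL follows the code's own comments):

def page_url(base, code, page):
    # page 1 is the bare category URL; later pages append list_<code>_<page>.html
    return base if page == 1 else '{}list_{}_{}.html'.format(base, code, page)

print(page_url('https://www.mm131.net/xinggan/', '6', 2))
# -> https://www.mm131.net/xinggan/list_6_2.html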
Update: added multithreaded crawling for speed, plus one-click crawling of every suite under the current category (thanks to @aishangoumeige for the suggestion). A bounded-concurrency variant is sketched after the final listing.
import threading
import re
import requests
import os
from bs4 import BeautifulSoup
def getmainname(head):  # collect the 6 main categories from the front page
    url = 'https://www.mm131.net/'
    r = requests.get(url, headers=head)
    namels = []; urlls = []; namelist = []
    try:
        r.raise_for_status()
    except requests.RequestException:
        print('r.code!=200')
    r.encoding = r.apparent_encoding
    soup = BeautifulSoup(r.text, 'html.parser')
    mainname = soup.find_all('li')  # <li> tags holding the category names
    mainurl = soup.find_all('a')    # <a> tags holding the category links
    for li_tag, a_tag in zip(mainname, mainurl):  # pair each name with its link
        namels.append(li_tag.string)
        urlls.append(a_tag.get('href'))
    for i in range(len(namels)):
        namelist.append([namels[i], urlls[i]])
    return namelist  # 2-D list of [category name, category URL] pairs
def entername(namelist, head):  # ask the user which category to crawl
    subnamelist = []; subnamels = []; suburlls = []
    for i in range(6):
        print('{}--{}'.format(i, namelist[i][0]))
    n = int(input('Enter a category number between 0 and 5 (CTRL+C force-quits): '))
    print('Selected {}'.format(namelist[n][0]))
    r = requests.get(namelist[n][1], headers=head)
    r.raise_for_status()
    r.encoding = r.apparent_encoding
    soup = BeautifulSoup(r.text, 'html.parser')
    subnamelt = soup.find_all('img')[:20]  # <img> tags; the alt text is the suite name
    suburllt = soup.find_all('a')          # <a> tags linking to the suites
    for i in range(len(subnamelt)):
        subnamels.append(subnamelt[i].get('alt'))
        suburlls.append(suburllt[i].get('href'))
    for i in range(len(subnamels)):
        subnamelist.append([subnamels[i], suburlls[i]])
    getpics(subnamelist)
    return subnamelist  # 2-D list of [suite name, suite URL] for this category
def getpics(subnamelist):  # crawl each suite in the selected category
    global namelist, head
    if not os.path.exists('D:\\pics\\'):
        os.mkdir('D:\\pics\\')
    print('This category has {} suites'.format(len(subnamelist)))
    ifall = int(input('Crawl everything? Enter 1 for yes: '))
    for i in range(len(subnamelist)):  # iterate over the suites
        print('\tCrawling suite {}: {}'.format(i + 1, subnamelist[i][0]))
        if not os.path.exists('D:\\pics\\' + subnamelist[i][0]):
            os.mkdir('D:\\pics\\' + subnamelist[i][0])  # one folder per suite
        urllist, url = getdetails(subnamelist[i][1], ifall)  # URL of every image in the suite
        name = 'D:\\pics\\' + subnamelist[i][0] + '\\'
        for j in range(len(urllist)):
            print('\tFetching image {}'.format(j + 1))
            # one thread per image to speed up the downloads
            threading.Thread(target=write, args=(urllist[j], name, j, url)).start()
            if j == len(urllist) - 1:
                print('Suite finished')
    print('All suites fetched')
def write(urlls, name, j, url):  # save a single image of a suite
    r = requests.get(urlls, headers={
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/72.0.3626.121 Safari/537.36',
        'Referer': url  # the image host checks the Referer header
    })
    r.raise_for_status()
    with open(name + str(j) + '.jpg', 'wb') as f:
        f.write(r.content)
def getdetails(url, ifall):  # build the URL of every image in a suite
    urllist = []
    code = re.search(r'\d{4}', url).group(0)  # suite id, e.g. 5520
    url2 = 'https://img1.mmmw.net/pic/' + code + '.html'
    num = getnum(url)
    print('\tThis suite has {} images'.format(num))
    if ifall != 1:
        n = int(input('How many images do you want? '))
    else:
        n = int(num)
    for j in range(1, n + 1):
        # images follow the pattern https://img1.mmmw.net/pic/5520/1.jpg
        urlj = url2.replace('.html', '') + '/' + str(j) + '.jpg'
        urllist.append(urlj)
    return urllist, url
def main(head, namelist):
    entername(namelist, head)
def getnum(url):  # read the image count of a suite from its page
    r = requests.get(url, headers=head)
    r.raise_for_status()
    r.encoding = r.apparent_encoding
    soup = BeautifulSoup(r.text, 'html.parser')
    num = soup.find('span').string  # looks like '共54页'
    num = num.replace('页', '')  # the Chinese characters match the site's own text
    num = num.replace('共', '')
    return num
head = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/72.0.3626.121 Safari/537.36'
}
namelist = getmainname(head)
while True:
    main(head, namelist)
    key_loop = int(input('Continue? Enter 1 for yes, 0 to quit: '))
    if not key_loop:
        print('Done')
        break
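Following up on the multithreading note above: starting one thread per image can spawn dozens of threads at once, which is hard on both your machine and the site. A bounded pool is gentler. This is a minimal sketch with concurrent.futures, not the version used above; the demo suite id, image range, and output folder are assumptions for illustration:

import os
from concurrent.futures import ThreadPoolExecutor
import requests

HEAD = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/72.0.3626.121 Safari/537.36',
    'Referer': 'https://www.mm131.net/'
}

def download(url, path):
    # fetch one image and write it to disk
    r = requests.get(url, headers=HEAD)
    r.raise_for_status()
    with open(path, 'wb') as f:
        f.write(r.content)

os.makedirs('D:\\pics\\demo', exist_ok=True)
# assumed demo suite: images numbered 1..10 under one suite id
urls = ['https://img1.mmmw.net/pic/5520/{}.jpg'.format(j) for j in range(1, 11)]
with ThreadPoolExecutor(max_workers=5) as pool:  # at most 5 downloads in flight
    for j, u in enumerate(urls):
        pool.submit(download, u, 'D:\\pics\\demo\\{}.jpg'.format(j))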