# mm131.net gallery scraper (this line replaced forum "plain-text view / copy code" UI text from the original paste)
import threading
import random
import time
import re
import requests
import os
from bs4 import BeautifulSoup
def getmainname(head):
    """Fetch the six top-level gallery categories from the site index.

    Parameters
    ----------
    head : dict
        HTTP headers (User-Agent) passed to requests.get.

    Returns
    -------
    list
        Six [category name, category URL] pairs.
    """
    url = 'https://www.mm131.net/'
    r = requests.get(url, headers=head)
    try:
        r.raise_for_status()
    except requests.HTTPError:
        # Keep the original best-effort behavior: report and continue.
        print('r.code!=200')
    r.encoding = r.apparent_encoding
    soup = BeautifulSoup(r.text, 'html.parser')
    # The six categories sit at fixed positions in the page markup:
    # <li> tags 1..6 hold the names, <a> tags 9..14 hold the links.
    name_tags = soup.find_all('li')[1:7]
    link_tags = soup.find_all('a')[9:15]
    return [[name_tag.string, link_tag.get('href')]
            for name_tag, link_tag in zip(name_tags, link_tags)]
def get_page_num(page_url):
    """Return the number of listing pages for a category.

    Parses the last pagination link (class "page-en") on *page_url* and
    extracts the page number from its href (e.g. 'list_6_42.html' -> 42).

    Raises requests.HTTPError on a bad response; AttributeError if the
    last pagination href carries no page number.
    """
    headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) '
                             'AppleWebKit/537.36 (KHTML, like Gecko) '
                             'Chrome/72.0.3626.121 Safari/537.36'}
    r = requests.get(page_url, headers=headers)
    r.raise_for_status()
    r.encoding = r.apparent_encoding
    soup = BeautifulSoup(r.text, 'html.parser')
    last_link = soup.find_all('a', 'page-en')[-1]
    # Escape the dot and capture only the digits; the original used an
    # unescaped '.' and eval() on scraped text, which is unsafe.
    match = re.search(r'(\d+)\.html', last_link.get('href'))
    return int(match.group(1))
def entername(namelist, head):
    """Prompt the user for a category, then walk all of its listing pages.

    For each listing page, collect the 20 [set name, set URL] pairs shown
    on it and hand them to getpics() for downloading.

    Parameters
    ----------
    namelist : list
        Output of getmainname(): [category name, category URL] pairs.
    head : dict
        HTTP headers for requests.get.
    """
    for idx in range(6):
        print('{}--{}'.format(idx, namelist[idx][0]))
    # int(input()) instead of eval(input()): never eval user input.
    n = int(input('请输入要查找的对象数字0——5之间(按CTRL+C可强制退出程序):'))
    print('已选择{}'.format(namelist[n][0]))
    # Site-internal numeric id embedded in listing-page file names.
    dict_page = {'性感美女': '6', '清纯美眉': '1', '美女校花': '2',
                 '性感车模': '3', '旗袍美女': '4', '明星写真': '5'}
    base_url = namelist[n][1]
    page_num = get_page_num(base_url)
    print('该分类下共有{}个套图'.format(page_num * 20 - 20))
    # Iterate listing PAGES (1..page_num). The original looped over the
    # estimated set count (page_num*20-20), requesting pages that do not
    # exist and crashing on the first 404 via raise_for_status().
    for page in range(1, page_num + 1):
        if page == 1:
            # The first page is the category URL itself, no list_ suffix.
            page_url = base_url
        else:
            page_url = (base_url + 'list_' + dict_page[namelist[n][0]] +
                        '_' + str(page) + '.html')
        print(page_url)
        r = requests.get(page_url, headers=head)
        r.raise_for_status()
        r.encoding = r.apparent_encoding
        soup = BeautifulSoup(r.text, 'html.parser')
        # Fixed positions in the markup: 20 thumbnails and their links.
        img_tags = soup.find_all('img')[:20]
        link_tags = soup.find_all('a')[17:38]
        page_sets = [[img.get('alt'), link.get('href')]
                     for img, link in zip(img_tags, link_tags)]
        getpics(page_sets)
    print('此分类已全部获取')
def getpics(subnamelist):
    """Download every picture set in *subnamelist*.

    Each entry is [set name, set URL]. A folder D:\\pics\\<set name> is
    created per set and every picture is written by a worker thread.

    Parameters
    ----------
    subnamelist : list
        [set name, set page URL] pairs for one listing page.
    """
    if not os.path.exists('D:\\pics\\'):
        os.mkdir('D:\\pics\\')
    # int(input()) instead of eval(input()): never eval user input.
    ifall = int(input('是否全部抓取:是请输入1:'))
    for i, (set_name, set_url) in enumerate(subnamelist):
        print('\t正在抓取第{}个套图:{}'.format(i + 1, set_name))
        folder = 'D:\\pics\\' + set_name
        if not os.path.exists(folder):
            # One folder per picture set.
            os.mkdir(folder)
        try:
            # urllist holds the direct jpg URLs of this set.
            urllist, url = getdetails(set_url, ifall)
            name = folder + '\\'
            for j in range(len(urllist)):
                print('\t正在抓取第{}张'.format(j + 1))
                # One thread per picture so downloads overlap.
                threading.Thread(target=write,
                                 args=(urllist[j], name, j, url)).start()
            print('此套图抓取完毕')
        except (requests.RequestException, AttributeError, ValueError) as e:
            # Best-effort: skip a broken set instead of aborting the run.
            # The original bare `except: pass` hid every failure silently.
            print('\t跳过该套图: {}'.format(e))
def write(urlls, name, j, url):
    """Download one picture and save it as <name><j>.jpg.

    Parameters
    ----------
    urlls : str
        Direct URL of the jpg.
    name : str
        Destination folder path ending with a backslash.
    j : int
        Zero-based index used as the file name.
    url : str
        Set page URL sent as the Referer — presumably required by the
        image host to serve the picture (TODO confirm).
    """
    r = requests.get(urlls, headers={
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/72.0.3626.121 Safari/537.36',
        'Referer': url
    })
    r.raise_for_status()
    # Fetch first, open the file only on success: the original opened the
    # file before the request, leaving an empty .jpg (and an unclosed
    # handle) behind whenever the download failed.
    with open(name + str(j) + '.jpg', 'wb') as f:
        f.write(r.content)
def getdetails(url, ifall):
    """Build the direct image URLs for one picture set.

    Parameters
    ----------
    url : str
        Set page URL on www.mm131.net; its 3-4 digit run is the set id.
    ifall : int
        1 = take every picture in the set; otherwise ask the user how many.

    Returns
    -------
    tuple
        (list of jpg URLs on img1.mmmw.net, the set page URL — the caller
        passes the latter on as the Referer header).
    """
    code = re.search(r'\d{3,4}', url).group(0)
    # Image URL pattern: https://img1.mmmw.net/pic/<set id>/<n>.jpg
    base = 'https://img1.mmmw.net/pic/' + code
    num = getnum(url)
    print('\t此套图共有{}张'.format(num))
    if ifall == 1:
        n = int(num)
    else:
        # int(input()) instead of eval(input()): never eval user input.
        n = int(input('要获得多少张'))
    # The original built each URL with str.strip('.html') — which strips a
    # CHARACTER SET, mangling 'https' into 'ps' — and then patched it back
    # with replace('ps', 'https'). Build the URLs directly instead.
    return [base + '/' + str(j) + '.jpg' for j in range(1, n + 1)], url
def main(head, namelist):
    """Entry point: run the interactive category selection / download flow."""
    entername(namelist, head)
def getnum(url):
    """Return, as a string, the picture count parsed from a set page.

    Reads the first <span> on the page — its text looks like '共NN页' —
    and removes the '共'/'页' characters, leaving the digits. Uses the
    module-level *head* headers for the request.
    """
    resp = requests.get(url, headers=head)
    resp.raise_for_status()
    resp.encoding = resp.apparent_encoding
    page = BeautifulSoup(resp.text, 'html.parser')
    count_text = page.find('span').string
    for marker in ('页', '共'):
        count_text = count_text.replace(marker, '')
    return count_text
# Browser-like User-Agent so the site serves normal pages; also read by
# getnum() at module level.
head = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/72.0.3626.121 Safari/537.36'
}

if __name__ == '__main__':
    # Guarded so importing this module no longer fires network requests.
    namelist = getmainname(head)
    main(head, namelist)