【python爬虫】福利,手把手教你python爬取美足船袜网源码!
#http://mzsock.com 美足船袜网
# -*- coding: UTF-8 -*-
import requests
import re,os
import time
from urllib import request
from fake_useragent import UserAgent
class Mzsock():
    """Scraper for mzsock.com.

    Pipeline: read the site menu for category URLs, expand each category
    into its paginated listing URLs, collect every post URL from the
    listings, then download all images of every post into
    ``mzsock/<post title>/``.
    """

    def __init__(self):
        # Pick one random User-Agent for the whole session so requests
        # look less like a bot.
        self.ua = UserAgent()
        self.headers = {"User-Agent": self.ua.random}

    def get_categroy_url(self):
        """Return the list of category URLs scraped from the site's top menu."""
        url = "http://mzsock.com"
        response = requests.get(url, headers=self.headers).text
        # re.findall returns a list; take the single menu <ul> block
        # (the original code passed the list itself to the next findall,
        # which raises TypeError).
        ul = re.findall(r'<ul id="chenxing_menu" class="cx_menu l">(.+?)</ul>', response, re.S)[0]
        categroy_urls = re.findall(r'<li id=".+?"><a href="(.+?)">.+?</a></li>', ul, re.S)
        return categroy_urls

    def get_urllist(self, categroy_urls):
        """Expand each category URL into its paginated listing URLs.

        :param categroy_urls: iterable of category index URLs
        :return: list of ``<category>page/<i>/`` URLs covering every page
        """
        urllist = []
        for url in categroy_urls:
            response = requests.get(url, verify=False, headers=self.headers).text
            # Total post count shown on the page; the site lists 20 posts
            # per page. findall returns a list -> take the first match.
            num = re.findall(r'</i>共找到.+?>(.+?)</em>篇帖子</span>', response, re.S)[0]
            # Ceiling division: round() would drop the final partial page
            # (e.g. 25 posts -> 1 page instead of 2).
            pagenum = (int(num) + 19) // 20
            print(pagenum)
            for i in range(1, pagenum + 1):
                pageurl = f'{url}page/{i}/'
                urllist.append(pageurl)
        return urllist

    def get_contentlist(self, urllist):
        """Collect every post URL from the given listing pages."""
        contentlist = []
        for url in urllist:
            response = requests.get(url, headers=self.headers).text
            # Take the single post-list <ul> block (findall returns a list).
            div = re.findall(r'<ul class="post-list cl" id="post-list">(.+?)</ul>', response, re.S)[0]
            hrefs = re.findall(r'<a class="img" href="(.+?)" title=".+?" target="_blank">', div, re.S)
            contentlist.extend(hrefs)
            print(hrefs)
        return contentlist

    def get_content(self, contentlist):
        """Download every image of every post into ``mzsock/<title>/``.

        :param contentlist: iterable of post URLs (``....html``)
        """
        for url in contentlist:
            response = requests.get(url, headers=self.headers).text
            # <h1>Title(cur/total)</h1> -> (title, "cur/total") tuple;
            # findall returns a list of tuples, take the first.
            h1 = re.findall(r'<h1>(.+?)[(](.+?)[)]</h1>', response, re.S)[0]
            # Replace characters that are illegal in directory names.
            title = re.sub(r'[\|\/\<\>\:\*\?\\\"]', "_", h1[0])
            print(title)
            os.makedirs(f'mzsock/{title}/', exist_ok=True)  # create target directory
            # Total page count is the part after '/' in "cur/total"
            # (assumed from the split in the original code — verify on site).
            page_num = int(h1[1].split('/')[1])
            print(page_num)
            for i in range(1, page_num + 1):
                # Post pages are <slug>.html, <slug>_2.html, ... strip ".html".
                content_url = f'{url[:-5]}_{i}.html'
                content_response = requests.get(content_url, headers=self.headers).text
                div = re.findall(r'<div class="picsbox picsboxcenter chenxing_pic_images">(.+?)</div>', content_response, re.S)[0]
                # NOTE(review): no space between "..." and alt= — looks like a
                # paste artifact; confirm against the live page markup.
                img_urls = re.findall(r'<img src="(.+?)"alt=".+?" width', div, re.S)
                x = 1
                for img_url in img_urls:
                    img_name = f'{i}_{x}{img_url[-4:]}'  # "<page>_<n><ext>"
                    self.bctp(f'mzsock/{title}/', img_url, img_name)
                    x = x + 1

    def _save_image(self, lj, img_url, img_name):
        """Fetch one image and write it under directory *lj*. Raises on failure."""
        r = requests.get(img_url, timeout=5, headers=self.headers)
        with open(f'{lj}/{img_name}', 'wb') as f:
            f.write(r.content)
        print(f'下载{img_name}图片成功!')

    def _log_failure(self, lj, img_url, img_name, e):
        """Report a failed download and append it to the per-directory log file."""
        print(f'下载{img_name}图片失败!')
        print(f'错误代码:{e}')
        with open(f'{lj}/spider.txt', 'a+', encoding='utf-8') as f:
            f.write(f'错误代码:{e}---下载 {img_url} 图片失败\n')

    def bctp(self, lj, img_url, img_name):
        """Download one image, retrying once after a read timeout.

        :param lj: target directory path
        :param img_url: image URL to fetch
        :param img_name: file name to save as
        """
        print("开始下载图片!")
        try:
            self._save_image(lj, img_url, img_name)
            time.sleep(1)  # throttle between successful downloads
        except Exception as e:
            if "port=443): Read timed out" in str(e):
                time.sleep(2)  # back off, then retry once
                try:
                    self._save_image(lj, img_url, img_name)
                except Exception as e2:
                    self._log_failure(lj, img_url, img_name, e2)
            else:
                self._log_failure(lj, img_url, img_name, e)
if __name__ == '__main__':
    # Run the full pipeline:
    # categories -> paginated listings -> post URLs -> image downloads.
    scraper = Mzsock()
    categories = scraper.get_categroy_url()
    listing_pages = scraper.get_urllist(categories)
    post_urls = scraper.get_contentlist(listing_pages)
    scraper.get_content(post_urls)
相关教学贴:
【福利】从零开始,手把手教你python爬取美足船袜网!
这个玩意,之前在七月份我刚刚接触爬虫的时候,一个老哥发这个网站问问能爬吗?我直接拒了他,我怎么可能是这种好se之徒呢???
第二天,这个网站被我爬了3.4g的图片给崩了{:1_926:}
晚上我独自享受着这些,最后全删了,md什么玩意,啥都不漏
python3.6实测错误:fake_useragent 已经没有了,只有 my_fake_useragent,就算改了还是错误。。
老哥稳,看来天天得喝营养快线了。:lol
爬下来的有味吗{:301_995:}
手机可不可以运行py{:301_997:}
太刺激了吧,老哥。能不能爬小草的,亲
看起来很厉害的样子{:301_1009:}
赞一个,纯正则表达式,厉害啊。我只会beautifulsoup的
品如的衣服,你好sao啊