爬取漂亮小姐姐(有点反爬机制)
第一次发自己的爬虫。。。有违请删

import os
import requests
from lxml import etree
import re
import json
import time
# Default request headers: a desktop-Edge user-agent so the site does not
# reject the crawler outright.
headers = {
    'user-agent': (
        'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 '
        '(KHTML, like Gecko) Chrome/91.0.4472.114 Safari/537.36 Edg/91.0.864.59'
    ),
}
# 保存文件 (save a file)
def xzwj(xz, head, path):
    """Download the resource at URL *xz* and save it into directory *path*.

    The file name is the last path segment of the URL.

    xz   -- direct download URL
    head -- request-headers dict (user-agent / referer)
    path -- existing local directory to write into
    """
    # 拆分地址,以最后一段为存盘的文件名 (last URL segment -> file name)
    w_na = xz.split("/")[-1]
    print("正在下载,请耐心等待。。。")  # announce BEFORE the slow fetch, not after
    resp = requests.get(xz, headers=head)
    # Fail loudly on HTTP errors instead of silently saving an error page.
    resp.raise_for_status()
    # os.path.join instead of hand-rolled "//" concatenation.
    with open(os.path.join(path, w_na), 'wb') as f:
        f.write(resp.content)
    print(w_na, "下载完成")
# 解析压缩文件网址 (resolve the archive download URL)
def rar_xz(rar_url, head):
    """Fetch *rar_url* and return the list of redirect targets found in
    ``window.location='...'`` JavaScript snippets (possibly empty)."""
    xz_res = requests.get(rar_url, headers=head).text
    # Escape the dot: r'window.location' would also match any character
    # between "window" and "location".
    xz_rar = re.compile(r"window\.location='(.*?)'")
    return xz_rar.findall(xz_res)
def xzxz(xx):  # 文件下载 (download one gallery)
    """Download gallery number *xx* (1-based index into the module-level
    ``pna``/``pli`` lists built by main_xz): every preview image plus the
    packaged archive linked on the page.

    BUG FIX: the original ignored *xx* and passed the whole pna/pli lists
    to requests.get(), which raised immediately.
    """
    name = pna[xx - 1]
    link = pli[xx - 1]
    print("要下载的内容和网址是:", name, link)
    print("---------开始下载展示图片------------")
    r_resp = requests.get(link, headers=headers)
    r_tree = etree.HTML(r_resp.text)
    r_imgs = r_tree.xpath('//div/div/p/img/@src')    # 获取每张图片的网址
    r_nas = r_tree.xpath('//div/div/p/img/@title')   # 获取下载的图片的文件名
    rar_url = r_tree.xpath('//div/div[@class="pay-box"]/a/@href')  # 获取下载文件的网址
    # Name the folder after the gallery title, not the repr of a list.
    title = r_nas[0] if r_nas else name
    # 修改 e://debug// 处,改变文件存放位置
    path = f"e://debug//{title}"
    if not os.path.exists(path):
        os.makedirs(path)  # makedirs creates the intermediate dirs too
    head = {
        'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.114 Safari/537.36 Edg/91.0.864.59',
        'referer': link,  # the site checks the referer (anti-hotlink)
    }
    for n, img in enumerate(r_imgs, start=1):
        xzwj(img, head, path)
        print(f"{title},第{n}张下载完成---")
    print("共", len(r_imgs), "张下载完成")
    print("==============开始下载压缩文件============")
    if rar_url:
        xz = rar_xz(rar_url[0], head)  # xpath returns a list; take the first link
        if xz:
            xzwj(xz[0], head, path)    # rar_xz also returns a list of matches
def main_xz():
    """Ask the user for a category, crawl every listing page of that
    category, print the numbered gallery list and return (names, links).

    BUG FIX: the original if/elif chain had no else-branch, so any input
    outside 1-4 left ``url`` unbound and crashed later; re-prompt instead.
    """
    print("1.性感美女 2.清纯可爱 3.性感御姐 4.制服诱惑")
    urls = [
        "https://dimgw.us/xinggan",
        "https://dimgw.us/qc",
        "https://dimgw.us/yj",
        "https://dimgw.us/zf",
    ]
    while True:
        ms = input("请选择分类:")
        if ms in ("1", "2", "3", "4"):
            url = urls[int(ms) - 1]
            break
        print("无效选择,请输入 1-4")
    resp = requests.get(url, headers=headers)
    # The last pagination anchor carries the total page count.
    pages = re.findall(r'<a class="page-numbers" href="(.*?)</a>', resp.text)
    # No pager on the page -> assume a single page instead of IndexError.
    page = pages[-1].split(">")[-1] if pages else "1"
    print("此类共", page, "页!")
    print("*" * 50)
    pna = []
    pli = []
    for p in range(1, int(page) + 1):
        purl = url + f"/page/{p}"
        presp = requests.get(purl, headers=headers)
        tree = etree.HTML(presp.text)
        rw_li = tree.xpath('//div[@class="row posts-wrapper"]//div/a[@target="_blank"]/@href')
        rw_na = tree.xpath('//div[@class="row posts-wrapper"]//div/a[@target="_blank"]/img/@alt')
        pna.extend(rw_na)
        pli.extend(rw_li)
    print("-" * 50, "\n", "共有以下美女可选:")
    # enumerate instead of O(n^2) pna.index() (also wrong on duplicate names)
    for i, na in enumerate(pna, start=1):
        print(i, na)
    return pna, pli
# Entry point: list the galleries once, then download on demand until the
# user enters 0.  The original bound `while var != len(pna)` exited one
# iteration early and never ran at all when exactly one gallery was found.
pna, pli = main_xz()
while True:
    try:
        xx = int(input("请输入要下载的序号(0退出):"))
    except ValueError:
        print("请输入数字序号")  # non-numeric input used to crash int()
        continue
    if xx == 0:
        print("退出")
        break
    if 1 <= xx <= len(pna):
        xzxz(xx)
    else:
        print("序号超出范围")
优化了一下,现在可以了
import re
import json
import time
# Shared request headers: desktop-Edge UA string to pass the site's
# basic bot filtering.
headers = {
    'user-agent': (
        'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 '
        '(KHTML, like Gecko) Chrome/91.0.4472.114 Safari/537.36 Edg/91.0.864.59'
    ),
}
# 保存文件 (save a file)
def xzwj(xz, head, path):
    """Save the file at URL *xz* into directory *path*, using the URL's
    last segment as the file name."""
    *_, fname = xz.split("/")  # 拆分地址,以最后一段为存盘的文件名
    payload = requests.get(xz, headers=head).content
    target = path + f'//{fname}'
    with open(target, 'wb') as out:
        out.write(payload)
    print("正在下载,请耐心等待。。。")
    print(fname, "下载完成")
# 解析压缩文件网址 (resolve the archive download URL)
def rar_xz(rar_url, head):
    """Fetch *rar_url* and return every redirect target found in
    ``window.location='...'`` JavaScript snippets (a list, possibly empty)."""
    xz_res = requests.get(rar_url, headers=head).text
    # \. — an unescaped dot would match any character in that position.
    xz_rar = re.compile(r"window\.location='(.*?)'")
    return xz_rar.findall(xz_res)
def xzxz(xx):  # 文件下载 (download one gallery)
    """Download gallery number *xx* (1-based index into the module-level
    ``pna``/``pli`` lists from main_xz): all preview images plus the
    packaged archive linked on the page.

    BUG FIX: the original ignored *xx* and passed the whole pna/pli lists
    to requests.get(), which fails immediately.
    """
    name = pna[xx - 1]
    link = pli[xx - 1]
    print("要下载的内容和网址是:", name, link)
    print("---------开始下载展示图片------------")
    r_resp = requests.get(link, headers=headers)
    r_tree = etree.HTML(r_resp.text)
    r_imgs = r_tree.xpath('//div/div/p/img/@src')    # 获取每张图片的网址
    r_nas = r_tree.xpath('//div/div/p/img/@title')   # 获取下载的图片的文件名
    rar_url = r_tree.xpath('//div/div[@class="pay-box"]/a/@href')  # 获取下载文件的网址
    # 修改 e://debug// 处,改变文件存放位置。以美女名为目录名
    title = r_nas[0] if r_nas else name  # gallery title, not the repr of a list
    path = f"e://debug//{title}"
    if not os.path.exists(path):
        os.makedirs(path)  # creates intermediate dirs, unlike os.mkdir
    print('文件存盘位置', path, '\n\n')
    head = {
        'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.114 Safari/537.36 Edg/91.0.864.59',
        'referer': link,  # the site checks the referer (anti-hotlink)
    }
    for n, img in enumerate(r_imgs, start=1):
        xzwj(img, head, path)
        print(f"{title},第{n}张下载完成---")
    print("共", len(r_imgs), "张下载完成")
    print("==============开始下载压缩文件============")
    if rar_url:
        xz = rar_xz(rar_url[0], head)  # xpath returns a list; take the first link
        if xz:
            xzwj(xz[0], head, path)    # rar_xz also returns a list of matches
def main_xz():
    """Ask the user for a category, walk its /page/N listings until a
    non-200 response or an empty page, and return (names, links).

    BUG FIX: the original left ``url`` as the whole list and ignored the
    user's choice ``ms``, so ``url + f"/page/{p}"`` raised
    ``TypeError: can only concatenate list (not "str")``.
    """
    print("1.性感美女 2.清纯可爱 3.性感御姐 4.制服诱惑")
    ms = input("请选择分类:")
    urls = [
        "https://dimgw.us/xinggan",
        "https://dimgw.us/qc",
        "https://dimgw.us/yj",
        "https://dimgw.us/zf",
    ]
    try:
        url = urls[int(ms) - 1]
    except (ValueError, IndexError):
        url = urls[0]  # fall back to the first category on bad input
    pna = []
    pli = []
    for p in range(1, 1000):
        purl = url + f"/page/{p}"
        presp = requests.get(purl, headers=headers)
        if presp.status_code != 200:
            break  # past the last page
        tree = etree.HTML(presp.text)
        rw_li = tree.xpath('//div[@class="row posts-wrapper"]//div/a[@target="_blank"]/@href')
        rw_na = tree.xpath('//div[@class="row posts-wrapper"]//div/a[@target="_blank"]/img/@alt')
        if not rw_li:
            break  # 200 but no entries -> stop instead of looping to 999
        pna.extend(rw_na)
        pli.extend(rw_li)
    print("-" * 50, "\n", "共有以下美女可选:", "\n", "-" * 50, "\n")
    for i, na in enumerate(pna, start=1):
        print(i, na)
    return pna, pli
# Entry point: list the galleries once, then download on demand until the
# user enters 0.  The original bound `while var != len(pna)` exited one
# iteration early and never ran when exactly one gallery was found.
pna, pli = main_xz()
while True:
    try:
        xx = int(input("请输入要下载的序号(0退出):"))
    except ValueError:
        print("请输入数字序号")  # non-numeric input used to crash int()
        continue
    print('正在提取信息。。。')
    if xx == 0:
        print("退出")
        break
    if 1 <= xx <= len(pna):
        xzxz(xx)
    else:
        print("序号超出范围")
本帖最后由 imhuihui 于 2022-5-30 17:27 编辑
https://wwd.lanzoum.com/b01vehgdi
密码:czxo
我打包好的exe可执行程序,需要的可自取【已更新】
在楼主贴的代码上 做了一点小修改,用的pyinstaller打包的
图片下载都存在电脑D盘,下载的时候会自动创建名为 美女写真 的文件夹,图片都下载到里面了
楼主原来的代码下载下来是webp格式的图片,电脑不能直接打开,所以我改了一下代码,把图片转成 jpg格式的了
求支持 求评分!{:1_919:} lanlano 发表于 2022-4-20 00:03
光是代码不会用啊。。老大。。弄个成品软件啊老大。。
第一种方法(自己运行):安装Anaconda(集成Python运行环境及相关库)和PyCharm(Python编译软件)这两个软件,百度有相关教程;之后创建Python文件复制楼主的代码进去,即可运行。
第二种方法:你可以叫楼主把上述代码打包为exe文件;直接点击exe运行。
ps:https://dimgw.us ←这是楼主给的福利网址
本帖最后由 imhuihui 于 2022-5-27 20:37 编辑
可以的话给个评分呗{:1_919:} 更新后的代码如下:
import os
import requests
from lxml import etree
import re
from io import BytesIO
from PIL import Image
# Default request headers (desktop-Edge user-agent) for every page fetch.
headers = {
    'user-agent': (
        'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 '
        '(KHTML, like Gecko) Chrome/91.0.4472.114 Safari/537.36 Edg/91.0.864.59'
    )
}
# 保存文件 (download an image and re-save it as JPEG)
def xzwj(xz, head, path):
    """Download the image at URL *xz* and save it into *path* as JPEG.

    Fixes two defects in the original:
    * ``split("/")[-1].rsplit(".", 1) + '.jpg'`` added a str to a list
      (TypeError) — take element [0] of the rsplit first;
    * the RGBA branch built a white ``background`` but then saved the
      original image, and passed the whole ``im.split()`` tuple as the
      paste mask; JPEG cannot store alpha, so flatten onto the background
      using the alpha band as the mask.
    """
    # 拆分地址,以最后一段为存盘的文件名,扩展名改为 .jpg
    w_na = xz.split("/")[-1].rsplit(".", 1)[0] + '.jpg'
    w_rar = requests.get(xz, headers=head)
    byte_stream = BytesIO(w_rar.content)
    im = Image.open(byte_stream)
    if im.mode == "RGBA":
        im.load()  # required before split() on lazily-loaded files
        background = Image.new("RGB", im.size, (255, 255, 255))
        background.paste(im, mask=im.split()[3])  # band 3 = alpha channel
        im = background
    elif im.mode != "RGB":
        im = im.convert("RGB")  # e.g. palette-mode images
    im.save(path + f'/{w_na}', 'JPEG')
    print("正在下载,请耐心等待。。。")
# 原样存盘(不做图片转换),用于压缩包下载
def xzwj2(xz, head, path):
    """Save the file at URL *xz* verbatim into directory *path*; the file
    name is the last segment of the URL."""
    fname = xz.rsplit("/", 1)[-1]  # 拆分地址,以最后一段为存盘的文件名
    data = requests.get(xz, headers=head).content
    # 修改目录处可改变文件存放位置。文件目录必须提前建好。
    with open(path + f'//{fname}', 'wb') as fh:
        fh.write(data)
    print("正在下载,请耐心等待。。。")
    print(fname, "下载完成")
# 解析压缩文件网址 (resolve the archive download URL)
def rar_xz(rar_url, head):
    """Fetch *rar_url* and return the list of redirect targets found in
    ``window.location='...'`` JavaScript snippets (possibly empty)."""
    xz_res = requests.get(rar_url, headers=head).text
    # Escape the dot so the pattern matches the literal "window.location".
    xz_rar = re.compile(r"window\.location='(.*?)'")
    return xz_rar.findall(xz_res)
def xzxz(xx):  # 文件下载 (download one gallery)
    """Download gallery number *xx* (1-based index into the module-level
    ``pna``/``pli`` lists from main_xz): every preview image (converted to
    JPEG by xzwj) plus the packaged archive (saved verbatim by xzwj2).

    BUG FIX: the original ignored *xx* and passed the whole pna/pli lists
    to requests.get(), which fails immediately.
    """
    name = pna[xx - 1]
    link = pli[xx - 1]
    print("要下载的内容和网址是:", name, link)
    print("开始下载展示图片".center(30, '-'))
    r_resp = requests.get(link, headers=headers)
    r_tree = etree.HTML(r_resp.text)
    # 使用xpath选择元素
    r_imgs = r_tree.xpath('//div/div/p/img/@src')    # 获取每张图片的网址
    r_nas = r_tree.xpath('//div/div/p/img/@title')   # 获取下载的图片的文件名
    rar_url = r_tree.xpath('//div/div[@class="pay-box"]/a/@href')  # 获取下载文件的网址
    title = r_nas[0] if r_nas else name  # folder named after the gallery, not a list repr
    path = f"d://美女写真//{title}"
    if not os.path.exists(path):
        os.makedirs(path)
    head = {
        'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.114 Safari/537.36 Edg/91.0.864.59',
        'referer': link,  # the site checks the referer (anti-hotlink)
    }
    for n, img in enumerate(r_imgs, start=1):
        xzwj(img, head, path)
        print(f"{title},第{n}张下载完成---")
    print("共", len(r_imgs), "张下载完成")
    print("============开始下载压缩文件(文件很大)==========")
    if rar_url:
        xz = rar_xz(rar_url[0], head)  # xpath returns a list; take the first link
        if xz:
            xzwj2(xz[0], head, path)   # rar_xz also returns a list of matches
def main_xz():
    """Ask the user for a category, scrape the entry titles/links from its
    first listing page, print them numbered and return (names, links).

    BUG FIX: the original if/elif chain had no else-branch, so any input
    outside 1-4 left ``url`` unbound and crashed later; re-prompt instead.
    """
    print("1.性感美女 2.清纯可爱 3.性感御姐 4.制服诱惑")
    urls = [
        "https://dimgw.us/xinggan",
        "https://dimgw.us/qc",
        "https://dimgw.us/yj",
        "https://dimgw.us/zf",
    ]
    while True:
        ms = input("请选择分类:")
        if ms in ("1", "2", "3", "4"):
            url = urls[int(ms) - 1]
            break
        print("无效选择,请输入 1-4")
    resp = requests.get(url, headers=headers)
    # Each <h2 class="entry-title">...</a></h2> wraps one gallery entry.
    t = re.findall('<h2 class="entry-title">(.*?)</a></h2>', resp.text, re.S)
    pna = []
    pli = []
    print("此类共", len(t), "页!")
    print("*" * 50 + '\n')
    # enumerate replaces the hand-rolled `a` counter.
    for a, i in enumerate(t, start=1):
        t1 = re.findall('>(.*)', i, re.S)          # visible title text
        t2 = re.findall('href=\"(.*?)\"', i, re.S)  # gallery link
        # 把t1,t2列表中的每个元素分别添加到列表pna,pli
        pna.extend(t1)
        pli.extend(t2)
        print(str(a) + ')', t1)
    return pna, pli
# Entry point: list the galleries once, then download on demand until the
# user enters 0.  The original bound `while var != len(pna)` exited one
# iteration early and never ran when exactly one gallery was found; it
# also crashed on non-numeric input.
pna, pli = main_xz()
while True:
    xx = input("请输入要下载的序号(0退出):")
    if xx == '':
        continue  # ignore accidental empty input (original behavior)
    try:
        num = int(xx)
    except ValueError:
        print("请输入数字序号")
        continue
    if num == 0:
        print("已退出!")
        break
    if 1 <= num <= len(pna):
        xzxz(num)
    else:
        print("序号超出范围")
光是代码不会用啊。。老大。。弄个成品软件啊老大。。{:301_1003:} yysyWang 发表于 2022-6-4 16:47
一个美女一个文件夹,还是统一一个文件夹?
一人一个屋。。。。。 本帖最后由 小姐姐的男朋友 于 2022-4-21 13:10 编辑
songlangwei 发表于 2022-4-21 10:47
安装Anaconda(集成Python运行环境及相关库)和PyCharm(Python编译软件)这两个软件,百度有相关教程; ...
①打开pycharm,然后打开Terminal(Alt+F12),输入:pip install pyinstaller
②安装好pyinstaller库之后,然后在其中输入:pyinstaller -F *.py
*.py为你要打包的.py后缀文件,如:爬取小姐姐.py ;最后生成的exe文件即在dist文件内
PS:这是我打包好的压缩文件,需要自取→:「小姐姐.exe(要解压两次)」https://www.aliyundrive.com/s/DvwhdqPZrjq 提取码: 6f6w
点击链接保存,或者复制本段内容,打开「阿里云盘」APP ,无需下载极速在线查看,视频原画倍速播放。 谢谢楼主,肾已亏,不说了,先去买点六味地黄丸了 Asy_少洋 发表于 2022-4-20 20:06
www.mly6.com这个站,很多免费的 ,可以爬吗
找到要下的小姐姐的页面,复制网址,,填入就行。
不要下太多噢,注意身体{:1_899:}

# -*- coding:utf-8 -*-
import os
import sys
import requests
from lxml import etree
import re
import json
import time

# Default page-fetch headers (desktop-Edge user-agent).
headers = {
    'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.114 Safari/537.36 Edg/91.0.864.59',
}

# Download every image linked from one www.mly6.com gallery page into a
# folder named after the page title.
# url = "https://www.mly6.com/19157.html"
url = input("请输入要下载的网址(0退出):")
# The original `while url == str(0): break / else:` construct is just a guard.
if url != "0":
    header = {
        "Referer": url,  # image host checks the referer (anti-hotlink)
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/86.0.4240.198 Safari/537.36"
    }
    resp = requests.get(url, headers=headers)
    tree = etree.HTML(resp.text)
    u_img = tree.xpath('//div/article/p/a/@href')
    # BUG FIX: xpath() returns a list; the original called .split() on it
    # (AttributeError).  Take the first title, drop any "[...]" suffix and
    # strip whitespace to get a clean directory name.
    titles = tree.xpath('//div//h1/@title')
    u_na = titles[0].split('[')[0] if titles else "unknown"
    u_na = "".join(u_na.split())
    # NOTE(review): "deubug" looks like a typo for "debug" — confirm the
    # intended folder name before changing it.
    path = f'e://deubug//{u_na}'
    if not os.path.exists(path):
        os.makedirs(path)
        print(u_na, "目录新建成功")
    else:
        print(f"老司机,<<{u_na}>>目录已存在。。。")
    n = 0
    for img in u_img:
        img_na = img.split('/')[-1]  # last URL segment -> file name
        print(img_na)
        if os.path.exists(path + "//" + img_na):
            print("文件已存在---", "退出。")
            break  # stop at the first already-downloaded image
        imgresp = requests.get(img, headers=header)
        with open(path + fr"//{img_na}", 'wb') as f:
            f.write(imgresp.content)
        print("正在下载,请耐心等待。。。")
        print("下载完成")
        n += 1
    print(f"共{n}张全部下完。")
# -*- coding:utf-8 -*-
这个要加上 谢谢兄弟,图太顶了,已无心学习{:1_911:} 顶贴,这是怎么判断的
感谢楼主,我来试试, 楼主有心了 很不错,代码修改下也适用于其他网站。 谢谢兄弟分享 不错不错 有点意思!不过用中文的话 编码那边要搞好!你这个代码在python官方的编辑器里面是跑不了的!