吾爱破解 - 52pojie.cn

 找回密码
 注册[Register]

QQ登录

只需一步,快速开始

查看: 6617|回复: 16
收起左侧

[Python 转载] 图集谷图片爬虫脚本

[复制链接]
zyjsuper 发表于 2022-1-30 20:50
[Python] 纯文本查看 复制代码
001
002
003
004
005
006
007
008
009
010
011
012
013
014
015
016
017
018
019
020
021
022
023
024
025
026
027
028
029
030
031
032
033
034
035
036
037
038
039
040
041
042
043
044
045
046
047
048
049
050
051
052
053
054
055
056
057
058
059
060
061
062
063
064
065
066
067
068
069
070
071
072
073
074
075
076
077
078
079
080
081
082
083
084
085
086
087
088
089
090
091
092
093
094
095
096
097
098
099
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
#!/usr/bin/env python3
# encoding: utf-8
"""
@version: 1.0
@author: zyjsuper
@license: Apache Licence
@software: PyCharm
@file: tujigu_spider1.py
@time: 2020/8/29 21:10
"""
  
from lxml import etree
import requests
import math
import os
import re
import random
  
# Pool of browser User-Agent strings (desktop and mobile browsers).
# The script picks one entry with random.choice() when it builds its
# request headers.
# NOTE(review): the plain "MSIE 7.0; Windows NT 5.1" string is listed twice,
# so it is twice as likely to be chosen as any other entry.
USER_AGENTS = [
    "Mozilla/5.0 (Macintosh; U; Intel Mac OS X 10_6_8; en-us) AppleWebKit/534.50 (KHTML, like Gecko) Version/5.1 Safari/534.50",
    "Mozilla/5.0 (Windows; U; Windows NT 6.1; en-us) AppleWebKit/534.50 (KHTML, like Gecko) Version/5.1 Safari/534.50",
    "Mozilla/5.0 (Windows NT 10.0; WOW64; rv:38.0) Gecko/20100101 Firefox/38.0",
    "Mozilla/5.0 (Windows NT 10.0; WOW64; Trident/7.0; .NET4.0C; .NET4.0E; .NET CLR 2.0.50727; .NET CLR 3.0.30729; .NET CLR 3.5.30729; InfoPath.3; rv:11.0) like Gecko",
    "Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; Trident/5.0)",
    "Mozilla/4.0 (compatible; MSIE 8.0; Windows NT 6.0; Trident/4.0)",
    "Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 6.0)",
    "Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1)",
    "Mozilla/5.0 (Macintosh; Intel Mac OS X 10.6; rv:2.0.1) Gecko/20100101 Firefox/4.0.1",
    "Mozilla/5.0 (Windows NT 6.1; rv:2.0.1) Gecko/20100101 Firefox/4.0.1",
    "Opera/9.80 (Macintosh; Intel Mac OS X 10.6.8; U; en) Presto/2.8.131 Version/11.11",
    "Opera/9.80 (Windows NT 6.1; U; en) Presto/2.8.131 Version/11.11",
    "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_7_0) AppleWebKit/535.11 (KHTML, like Gecko) Chrome/17.0.963.56 Safari/535.11",
    "Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1; Maxthon 2.0)",
    "Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1; TencentTraveler 4.0)",
    "Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1)",
    "Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1; The World)",
    "Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1; Trident/4.0; SE 2.X MetaSr 1.0; SE 2.X MetaSr 1.0; .NET CLR 2.0.50727; SE 2.X MetaSr 1.0)",
    "Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1; 360SE)",
    "Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1; Avant Browser)",
    "Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1)",
    "Mozilla/5.0 (iPhone; U; CPU iPhone OS 4_3_3 like Mac OS X; en-us) AppleWebKit/533.17.9 (KHTML, like Gecko) Version/5.0.2 Mobile/8J2 Safari/6533.18.5",
    "Mozilla/5.0 (iPod; U; CPU iPhone OS 4_3_3 like Mac OS X; en-us) AppleWebKit/533.17.9 (KHTML, like Gecko) Version/5.0.2 Mobile/8J2 Safari/6533.18.5",
    "Mozilla/5.0 (iPad; U; CPU OS 4_3_3 like Mac OS X; en-us) AppleWebKit/533.17.9 (KHTML, like Gecko) Version/5.0.2 Mobile/8J2 Safari/6533.18.5",
    "Mozilla/5.0 (Linux; U; Android 2.3.7; en-us; Nexus One Build/FRF91) AppleWebKit/533.1 (KHTML, like Gecko) Version/4.0 Mobile Safari/533.1",
    "MQQBrowser/26 Mozilla/5.0 (Linux; U; Android 2.3.7; zh-cn; MB200 Build/GRJ22; CyanogenMod-7) AppleWebKit/533.1 (KHTML, like Gecko) Version/4.0 Mobile Safari/533.1",
    "Opera/9.80 (Android 2.3.4; Linux; Opera Mobi/build-1107180945; U; en-GB) Presto/2.8.149 Version/11.10",
    "Mozilla/5.0 (Linux; U; Android 3.0; en-us; Xoom Build/HRI39) AppleWebKit/534.13 (KHTML, like Gecko) Version/4.0 Safari/534.13",
    "Mozilla/5.0 (BlackBerry; U; BlackBerry 9800; en) AppleWebKit/534.1+ (KHTML, like Gecko) Version/6.0.0.337 Mobile Safari/534.1+",
    "Mozilla/5.0 (hp-tablet; Linux; hpwOS/3.0.0; U; en-US) AppleWebKit/534.6 (KHTML, like Gecko) wOSBrowser/233.70 Safari/534.6 TouchPad/1.0",
    "Mozilla/5.0 (SymbianOS/9.4; Series60/5.0 NokiaN97-1/20.0.019; Profile/MIDP-2.1 Configuration/CLDC-1.1) AppleWebKit/525 (KHTML, like Gecko) BrowserNG/7.1.18124",
    "Mozilla/5.0 (compatible; MSIE 9.0; Windows Phone OS 7.5; Trident/5.0; IEMobile/9.0; HTC; Titan)",
    "UCWEB7.0.2.37/28/999",
    "NOKIA5700/ UCWEB7.0.2.37/28/999",
    "Openwave/ UCWEB7.0.2.37/28/999",
    "Mozilla/4.0 (compatible; MSIE 6.0; ) Opera/UCWEB7.0.2.37/28/999",
    # iPhone 6:
    "Mozilla/6.0 (iPhone; CPU iPhone OS 8_0 like Mac OS X) AppleWebKit/536.26 (KHTML, like Gecko) Version/8.0 Mobile/10A5376e Safari/8536.25",
]
  
# Pick one User-Agent at random for this run.
headers = {
    'User-Agent': random.choice(USER_AGENTS)
}
url = "https://www.tujigu.com/"
response = requests.get(url, headers=headers, timeout=5)
# Fail fast on an HTTP error instead of parsing an error page, which would
# only produce an empty category list.
response.raise_for_status()
html = etree.HTML(response.content.decode())
# Category names (lis) and category links (lis1), read from the same <a>
# elements of the tag menu.
lis = html.xpath("//li[@id='tag']//li/a/text()")
lis1 = html.xpath("//li[@id='tag']//li/a/@href")
if not lis or not lis1:
    # With no categories, no number typed at the selection prompt could ever
    # be valid, so stop here with an explicit message.
    raise SystemExit("未能获取分类列表,请检查网站是否可以访问。")
  
# Number the category names so the user can pick one by its index.
links_dict = dict(enumerate(lis))

# Show every (index, name) pair on its own line.
for entry in links_dict.items():
    print(entry)
  
# Ask for a category index until a valid one is entered.
# Only ValueError (non-numeric input) is handled, so Ctrl-C / EOF still end
# the script instead of being swallowed and re-prompting forever.
while True:
    try:
        choose = int(input("请选择分类对应的数字>>>:"))
    except ValueError:
        print("请输入0-%d之间的数字。" % (len(lis1) - 1))
        continue
    # Reject negative numbers explicitly: Python would otherwise accept them
    # as "count from the end" and silently open an unintended category.
    if 0 <= choose < len(lis1):
        choose_url = lis1[choose]
        print("即将打开链接: %s" % choose_url)
        break
    print("请输入0-%d之间的数字。" % (len(lis1) - 1))
  
# Load the chosen category page and read how many albums it reports.
resp = requests.get(choose_url, headers=headers, timeout=5)
# Stop on an HTTP error rather than parsing an error page.
resp.raise_for_status()
f = etree.HTML(resp.content.decode())
count_nodes = f.xpath('//div[@class="shoulushuliang"]//span/text()')
if not count_nodes:
    # Explicit message instead of an unexplained IndexError.
    raise SystemExit("未能在分类页面中找到写真集数量。")
count = count_nodes[0]
# The script assumes 40 albums per listing page.
pages = math.ceil(int(count) / 40)
print("此分类共包含%s套写真集,共计%s页。" % (count, pages))
  
# Number of albums the script assumes per listing page.
per_page = 40
total = int(count)
if pages < 1:
    # No page number could be valid; avoid prompting forever.
    raise SystemExit("此分类下没有可下载的页面。")

# Ask for a page number (1..pages) until a valid one is entered.
# Only ValueError is handled so Ctrl-C / EOF still terminate the script.
while True:
    try:
        page = int(input("请选择第几页下载>>>:"))
    except ValueError:
        print("请输入1-%d之间的数字。" % pages)
        continue
    if not 1 <= page <= pages:
        print("请输入1-%d之间的数字。" % pages)
        continue

    # Page 1 is the category URL itself; page N (N > 1) is index_<N-1>.html.
    if page == 1:
        target_url = choose_url
    else:
        target_url = choose_url + "index_" + str(page - 1) + ".html"
    print(target_url)

    # The last page holds whatever is left after the full pages. Computing
    # it by subtraction (not modulo) gives 40, not 0, when the total is an
    # exact multiple of the page size.
    if page == pages:
        count1 = total - per_page * (pages - 1)
    else:
        count1 = per_page
    print(count1)
    print("即将爬取第%s页面,共计%s个模特。" % (page, count1))
    break
  
  
# Load the selected listing page.
response2 = requests.get(target_url, headers=headers, timeout=5)
# Stop on an HTTP error rather than parsing an error page.
response2.raise_for_status()

html = etree.HTML(response2.content.decode())

# Album titles and album page links from the listing grid.
biaoti_list = html.xpath('//div[@class="hezi"]//li/p[@class="biaoti"]/a/text()')
mode_links = html.xpath('//div[@class="hezi"]//li/a/@href')

print(biaoti_list)
print(mode_links)

# Save under <home>/Desktop/tujigu/. The home directory comes from
# os.path.expanduser so this does not depend on the Windows-only USERNAME
# variable (which is None elsewhere and made the string concatenation fail).
# The trailing separator is kept because the path is later used as a prefix
# in string concatenation.
savepath = os.path.join(os.path.expanduser("~"), "Desktop", "tujigu") + os.sep
# exist_ok=True tolerates an existing directory; genuine failures (e.g.
# permission denied) are no longer silently ignored.
os.makedirs(savepath, exist_ok=True)
  
# Characters that Windows does not allow in file or directory names.
invalid_name_chars = re.compile(r'[\\/:*?"<>|]')
# Raw-string pattern: first run of digits, with no upper limit on length so
# long ids or image counts of 1000+ are not truncated.
digits = re.compile(r'\d+')

# Download every album on the listing page into its own folder.
# zip() pairs titles with links and stops at the shorter list, so unequal
# list lengths cannot raise IndexError.
for title, link in zip(biaoti_list, mode_links):
    id_match = digits.search(link)
    if id_match is None:
        print("无法从链接中解析编号,已跳过:%s" % link)
        continue
    mode_num = id_match.group()

    # Make the title usable as a directory name; fall back to the album id
    # if nothing is left after cleaning.
    folder_name = invalid_name_chars.sub("_", title).strip().rstrip(". ") or mode_num
    folder = os.path.join(savepath, folder_name)
    os.makedirs(folder, exist_ok=True)

    try:
        response3 = requests.get(link, timeout=5)
        response3.raise_for_status()
    except requests.RequestException as err:
        print("打开页面失败,已跳过:%s (%s)" % (link, err))
        continue
    html = etree.HTML(response3.content.decode())
    count_nodes = html.xpath('//p[contains(text(),"图片数量")]/text()')
    count_match = digits.search(count_nodes[0]) if count_nodes else None
    if count_match is None:
        print("未找到图片数量,已跳过:%s" % link)
        continue
    pics_count = count_nodes[0]
    print("模特的号码是:%s,%s" % (mode_num, pics_count))
    pics = count_match.group()

    for j in range(1, int(pics) + 1):
        pic_link = "https://lns.hywly.com/a/1/" + mode_num + "/" + str(j) + ".jpg"
        pic_path = os.path.join(folder, str(j) + ".jpg")
        print("开始爬取%s" % pic_link)
        # Download first and open the file only after success, so a failed
        # request does not leave an empty or bogus .jpg behind.
        try:
            pic = requests.get(pic_link, timeout=10)
            pic.raise_for_status()
        except requests.RequestException as err:
            print("下载失败,已跳过:%s (%s)" % (pic_link, err))
            continue
        with open(pic_path, "wb") as file:
            file.write(pic.content)
        print("保存为图片%s" % pic_path)

免费评分

参与人数 6吾爱币 +5 热心值 +4 收起 理由
yiwangguli + 1 + 1 谢谢@Thanks!
夫子点灯 + 1 热心回复!
lgc81034 + 1 谢谢@Thanks!
为之奈何? + 1 + 1 我很赞同!
ylixx + 1 我很赞同!
xxxlsy + 1 + 1 热心回复!

查看全部评分

发帖前要善用论坛搜索功能,那里可能会有你要找的答案或者已经有人发布过相同内容了,请勿重复发帖。

光之继承者 发表于 2022-1-31 15:36
失效脚本,请不要尝试了
无缺i 发表于 2022-1-30 22:35
无缺i 发表于 2022-1-31 13:35

爬的那个网页都打不开,而且不出意外的话这个代码是20年写的
weliong 发表于 2022-1-30 21:05
暂时用不到 收藏一下。
头像被屏蔽
dongse 发表于 2022-1-30 21:13
提示: 作者被禁止或删除 内容自动屏蔽
AngryMuGuL 发表于 2022-1-31 01:57
无缺i 发表于 2022-1-30 22:35
失效的代码也发出来吗

失效了嘛?
fzwasser 发表于 2022-1-31 06:52
我来测试下
Wapj_Wolf 发表于 2022-1-31 08:09
谢谢楼主分享,PY小白好学习学习。
zm55555 发表于 2022-1-31 09:52
谢谢分享,好东东!
photocs 发表于 2022-1-31 10:09
试试看吧!
MyModHeaven 发表于 2022-1-31 10:54
Snipaste_2022-01-31_10-54-13.jpg
您需要登录后才可以回帖 登录 | 注册[Register]

本版积分规则

返回列表

RSS订阅|小黑屋|处罚记录|联系我们|吾爱破解 - LCG - LSG ( 京ICP备16042023号 | 京公网安备 11010502030087号 )

GMT+8, 2025-4-10 22:15

Powered by Discuz!

Copyright © 2001-2020, Tencent Cloud.

快速回复 返回顶部 返回列表