图集谷图片爬虫脚本

zyjsuper · 发表于 2022-1-30 20:50

[Python] 代码如下：

#!/usr/bin/env python3
# encoding: utf-8
"""
@version: 1.0
@author: zyjsuper
@license: Apache Licence
@software: PyCharm
@file: tujigu_spider1.py
@time: 2020/8/29 21:10
"""
  
from lxml import etree
import requests
import math
import os
import re
import random
  
# Pool of browser User-Agent strings for outgoing HTTP requests.
# Every entry is unique so that random.choice() weights each agent equally
# (the plain "MSIE 7.0; Windows NT 5.1" entry used to be listed twice).
USER_AGENTS = [
    "Mozilla/5.0 (Macintosh; U; Intel Mac OS X 10_6_8; en-us) AppleWebKit/534.50 (KHTML, like Gecko) Version/5.1 Safari/534.50",
    "Mozilla/5.0 (Windows; U; Windows NT 6.1; en-us) AppleWebKit/534.50 (KHTML, like Gecko) Version/5.1 Safari/534.50",
    "Mozilla/5.0 (Windows NT 10.0; WOW64; rv:38.0) Gecko/20100101 Firefox/38.0",
    "Mozilla/5.0 (Windows NT 10.0; WOW64; Trident/7.0; .NET4.0C; .NET4.0E; .NET CLR 2.0.50727; .NET CLR 3.0.30729; .NET CLR 3.5.30729; InfoPath.3; rv:11.0) like Gecko",
    "Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; Trident/5.0)",
    "Mozilla/4.0 (compatible; MSIE 8.0; Windows NT 6.0; Trident/4.0)",
    "Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 6.0)",
    "Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1)",
    "Mozilla/5.0 (Macintosh; Intel Mac OS X 10.6; rv:2.0.1) Gecko/20100101 Firefox/4.0.1",
    "Mozilla/5.0 (Windows NT 6.1; rv:2.0.1) Gecko/20100101 Firefox/4.0.1",
    "Opera/9.80 (Macintosh; Intel Mac OS X 10.6.8; U; en) Presto/2.8.131 Version/11.11",
    "Opera/9.80 (Windows NT 6.1; U; en) Presto/2.8.131 Version/11.11",
    "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_7_0) AppleWebKit/535.11 (KHTML, like Gecko) Chrome/17.0.963.56 Safari/535.11",
    "Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1; Maxthon 2.0)",
    "Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1; TencentTraveler 4.0)",
    "Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1)",
    "Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1; The World)",
    "Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1; Trident/4.0; SE 2.X MetaSr 1.0; SE 2.X MetaSr 1.0; .NET CLR 2.0.50727; SE 2.X MetaSr 1.0)",
    "Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1; 360SE)",
    "Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1; Avant Browser)",
    "Mozilla/5.0 (iPhone; U; CPU iPhone OS 4_3_3 like Mac OS X; en-us) AppleWebKit/533.17.9 (KHTML, like Gecko) Version/5.0.2 Mobile/8J2 Safari/6533.18.5",
    "Mozilla/5.0 (iPod; U; CPU iPhone OS 4_3_3 like Mac OS X; en-us) AppleWebKit/533.17.9 (KHTML, like Gecko) Version/5.0.2 Mobile/8J2 Safari/6533.18.5",
    "Mozilla/5.0 (iPad; U; CPU OS 4_3_3 like Mac OS X; en-us) AppleWebKit/533.17.9 (KHTML, like Gecko) Version/5.0.2 Mobile/8J2 Safari/6533.18.5",
    "Mozilla/5.0 (Linux; U; Android 2.3.7; en-us; Nexus One Build/FRF91) AppleWebKit/533.1 (KHTML, like Gecko) Version/4.0 Mobile Safari/533.1",
    "MQQBrowser/26 Mozilla/5.0 (Linux; U; Android 2.3.7; zh-cn; MB200 Build/GRJ22; CyanogenMod-7) AppleWebKit/533.1 (KHTML, like Gecko) Version/4.0 Mobile Safari/533.1",
    "Opera/9.80 (Android 2.3.4; Linux; Opera Mobi/build-1107180945; U; en-GB) Presto/2.8.149 Version/11.10",
    "Mozilla/5.0 (Linux; U; Android 3.0; en-us; Xoom Build/HRI39) AppleWebKit/534.13 (KHTML, like Gecko) Version/4.0 Safari/534.13",
    "Mozilla/5.0 (BlackBerry; U; BlackBerry 9800; en) AppleWebKit/534.1+ (KHTML, like Gecko) Version/6.0.0.337 Mobile Safari/534.1+",
    "Mozilla/5.0 (hp-tablet; Linux; hpwOS/3.0.0; U; en-US) AppleWebKit/534.6 (KHTML, like Gecko) wOSBrowser/233.70 Safari/534.6 TouchPad/1.0",
    "Mozilla/5.0 (SymbianOS/9.4; Series60/5.0 NokiaN97-1/20.0.019; Profile/MIDP-2.1 Configuration/CLDC-1.1) AppleWebKit/525 (KHTML, like Gecko) BrowserNG/7.1.18124",
    "Mozilla/5.0 (compatible; MSIE 9.0; Windows Phone OS 7.5; Trident/5.0; IEMobile/9.0; HTC; Titan)",
    "UCWEB7.0.2.37/28/999",
    "NOKIA5700/ UCWEB7.0.2.37/28/999",
    "Openwave/ UCWEB7.0.2.37/28/999",
    "Mozilla/4.0 (compatible; MSIE 6.0; ) Opera/UCWEB7.0.2.37/28/999",
    # iPhone 6:
    "Mozilla/6.0 (iPhone; CPU iPhone OS 8_0 like Mac OS X) AppleWebKit/536.26 (KHTML, like Gecko) Version/8.0 Mobile/10A5376e Safari/8536.25",
]
  
# Pick one User-Agent at random; the same header dict is reused below.
headers = {"User-Agent": random.choice(USER_AGENTS)}

# Download the site's home page and parse it into an lxml element tree.
url = "https://www.tujigu.com/"
response = requests.get(url, headers=headers, timeout=5)
html = etree.HTML(response.content.decode())

# Category names and category links, both read from the anchors nested
# under the <li id="tag"> element.
tag_anchor = "//li[@id='tag']//li/a"
lis = html.xpath(tag_anchor + "/text()")
lis1 = html.xpath(tag_anchor + "/@href")

# Menu mapping: position in the list -> category name.
links_dict = dict(enumerate(lis))

# Print every (index, name) pair on its own line so the user can choose one.
for entry in links_dict.items():
    print(entry)
  
# Without any category link no input could ever be valid, so stop instead of
# prompting forever.
if not lis1:
    raise SystemExit("未找到任何分类，无法继续。")

# Keep asking until the user enters the index of an existing category link.
while True:
    try:
        choose = int(input("请选择分类对应的数字>>>:"))
        # Reject negative numbers explicitly: Python would otherwise treat
        # them as indexes counted from the end of the list.
        if choose < 0:
            raise IndexError(choose)
        choose_url = lis1[choose]
    except (ValueError, IndexError):
        # Only bad input is retried. Ctrl-C and end-of-input are no longer
        # swallowed, so the prompt can be aborted.
        print("请输入0-%d之间的数字。" % (len(lis1) - 1))
    else:
        print("即将打开链接: %s" % choose_url)
        break
  
# Load the first page of the chosen category and parse it.
resp = requests.get(choose_url, headers=headers, timeout=5)
f = etree.HTML(resp.content.decode())

# The first <span> text inside the "shoulushuliang" div is used as the total
# number of albums in this category (kept as a string, converted on use).
count_texts = f.xpath('//div[@class="shoulushuliang"]//span/text()')
count = count_texts[0]

# Pages are computed at 40 albums each, rounded up.
# NOTE(review): 40 is presumably the site's listing page size - confirm.
pages = math.ceil(int(count) / 40)
print("此分类共包含%s套写真集,共计%s页。" % (count, pages))
  
# With zero pages no page number could ever be accepted, so stop here.
if pages < 1:
    raise SystemExit("此分类没有可下载的页面。")

# Keep asking until the user enters a page number between 1 and `pages`.
while True:
    try:
        page = int(input("请选择第几页下载>>>:"))
    except ValueError:
        # Only non-numeric input is retried; Ctrl-C / end-of-input propagate.
        print("请输入1-%d之间的数字。" % pages)
        continue
    if 1 <= page <= pages:
        break
    # Page 0, negative pages and pages past the end would build URLs that
    # do not correspond to any listing page.
    print("请输入1-%d之间的数字。" % pages)

# Page 1 is the category URL itself; page N (N > 1) is "index_<N-1>.html".
if page == 1:
    target_url = choose_url
else:
    target_url = choose_url + "index_" + str(page - 1) + ".html"
print(target_url)

# Every page holds 40 entries except the last one, which holds whatever is
# left. Subtracting the full pages (instead of using count % 40) keeps the
# result at 40 rather than 0 when the total is an exact multiple of 40.
if page == pages:
    count1 = int(count) - 40 * (pages - 1)
else:
    count1 = 40
print(count1)
print("即将爬取第%s页面,共计%s个模特。" % (page, count1))
  
  
# Fetch the selected listing page and parse it.
response2 = requests.get(target_url, headers=headers, timeout=5)
html = etree.HTML(response2.content.decode())

# Both lists are read from the <li> items inside the "hezi" div:
# album titles from the "biaoti" paragraph link text, album URLs from the
# href of each item's direct <a> child.
item_path = '//div[@class="hezi"]//li'
biaoti_list = html.xpath(item_path + '/p[@class="biaoti"]/a/text()')
mode_links = html.xpath(item_path + '/a/@href')

# Echo what was found before downloading starts.
print(biaoti_list)
print(mode_links)
# Root folder for all downloads. It always ends with a path separator because
# the code further down appends album folder names to it directly.
username = os.getenv("USERNAME")
if username:
    savepath = "C:\\Users\\" + username + "\\Desktop\\tujigu\\"
else:
    # USERNAME is unset (os.getenv returned None), e.g. outside Windows:
    # fall back to the current user's home directory instead of failing on
    # str + None.
    savepath = os.path.join(os.path.expanduser("~"), "Desktop", "tujigu") + os.sep

# Create the folder together with any missing parents. An already existing
# folder is fine, but real failures (permissions, invalid path) are no longer
# silently ignored.
os.makedirs(savepath, exist_ok=True)
  
# Characters that Windows does not allow in file or directory names; album
# titles containing them are made safe before being used as folder names.
_INVALID_NAME_CHARS = re.compile(r'[\\/:*?"<>|]')

# Download every album found on the listing page into its own sub-folder.
# zip() pairs each title with its link and stops at the shorter list, so a
# length mismatch can no longer raise IndexError.
for title, album_url in zip(biaoti_list, mode_links):
    album_dir = savepath + _INVALID_NAME_CHARS.sub("_", title)
    os.makedirs(album_dir, exist_ok=True)

    # The first run of up to six digits in the album URL is used as album id.
    id_matches = re.findall(r'\d{1,6}', album_url)
    if not id_matches:
        print("无法解析图集编号，已跳过：%s" % album_url)
        continue
    mode_num = id_matches[0]

    try:
        response3 = requests.get(album_url, headers=headers, timeout=5)
        response3.raise_for_status()
    except requests.RequestException as err:
        print("打开图集失败，已跳过：%s (%s)" % (album_url, err))
        continue
    html = etree.HTML(response3.content.decode())

    # The paragraph containing "图片数量" carries the number of pictures.
    count_texts = html.xpath('//p[contains(text(),"图片数量")]/text()')
    count_digits = re.findall(r'\d+', count_texts[0]) if count_texts else []
    if not count_digits:
        print("无法获取图片数量，已跳过：%s" % album_url)
        continue
    pics_count = count_texts[0]
    print("模特的号码是：%s，%s" % (mode_num, pics_count))
    # r'\d+' reads the whole number; a three-digit cap would cut off counts
    # of 1000 or more.
    pics = count_digits[0]

    # Pictures are numbered 1..pics on the image host.
    for j in range(1, int(pics) + 1):
        pic_link = "https://lns.hywly.com/a/1/" + mode_num + "/" + str(j) + ".jpg"
        print("开始爬取%s" % pic_link)
        pic_path = os.path.join(album_dir, str(j) + ".jpg")
        try:
            pic_resp = requests.get(pic_link, headers=headers, timeout=10)
            # Do not store an HTTP error page under a .jpg name.
            pic_resp.raise_for_status()
        except requests.RequestException as err:
            print("下载失败，已跳过：%s (%s)" % (pic_link, err))
            continue
        with open(pic_path, "wb") as file:
            file.write(pic_resp.content)
        print("保存为图片%s" % pic_path)

光之继承者 · 发表于 2022-1-31 15:36

失效脚本，请不要尝试了

无缺i · 发表于 2022-1-30 22:35

失效的代码也发出来吗

无缺i · 发表于 2022-1-31 13:35

AngryMuGuL 发表于 2022-1-31 01:57
失效了嘛？

爬的那个网页都打不开，而且不出意外的话，这个代码是2020年写的

weliong · 发表于 2022-1-30 21:05

暂时用不到收藏一下。

dongse · 发表于 2022-1-30 21:13

提示: 作者被禁止或删除内容自动屏蔽

AngryMuGuL · 发表于 2022-1-31 01:57

无缺i 发表于 2022-1-30 22:35
失效的代码也发出来吗

失效了嘛？

fzwasser · 发表于 2022-1-31 06:52

我来测试下

Wapj_Wolf · 发表于 2022-1-31 08:09

谢谢楼主分享，PY小白好学习学习。

zm55555 · 发表于 2022-1-31 09:52

谢谢分享，好东东！

photocs · 发表于 2022-1-31 10:09

试试看吧！

MyModHeaven · 发表于 2022-1-31 10:54

帐号		自动登录	找回密码
密码			注册[Register]

[Python 转载] 图集谷图片爬虫脚本

免费评分

浏览过的版块

dongse dongse 当前离线好友阅读权限 0 听众最后登录 1970-1-1 头像被屏蔽	dongse 发表于 2022-1-30 21:13 提示: 作者被禁止或删除内容自动屏蔽

	回复支持举报