import
requests
from
lxml
import
etree
import
time
import
os
import
re
import
multiprocessing as mp
def geturl(url='https://www.tujigu.com/'):
    """Download ``url`` and return it parsed as an lxml HTML tree.

    The request is sent with a fixed, browser-like header set and a
    16-second timeout.  After the request returns, the function sleeps for
    two seconds before parsing the body.

    Args:
        url: Address of the page to fetch; defaults to the site's home page.

    Returns:
        The tree produced by ``etree.HTML`` on success, or the string
        ``'error'`` when the request or the parsing fails (a message is
        printed in that case).
    """
    headers = {
        'Accept': '*/*',
        'Accept-Language': 'en-US,en;q=0.8',
        'Cache-Control': 'max-age=0',
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/86.0.4240.75 Safari/537.36',
        'Connection': 'keep-alive',
        'Referer': 'http://www.baidu.com/',
    }
    try:
        res = requests.get(url, headers=headers, timeout=16)
        time.sleep(2)
        # Decode with the charset detected from the body rather than the one
        # announced in the HTTP headers.
        res.encoding = res.apparent_encoding
        xml = etree.HTML(res.text)
    except Exception:
        # Deliberately broad: any network or parser failure means "give up on
        # this page".  Unlike a bare ``except:``, this still lets
        # KeyboardInterrupt and SystemExit propagate so the crawl can be
        # stopped.
        xml = None

    if xml is None:
        # Also covers a parser that yields no tree at all, so callers never
        # receive a None they would try to call ``.xpath`` on.
        print(f'获取{url}页面失败,已经放弃!')
        return 'error'
    return xml
def getmaxpage(count):
    """Clamp the requested album number to the newest album on the home page.

    The home page is fetched with ``geturl()`` and the first link matched by
    a fixed XPath is read.  The second-to-last ``/``-separated segment of
    that link is taken as the newest album number.

    Args:
        count: Requested highest album number (anything ``int()`` accepts).

    Returns:
        * ``'max page error'`` if the home page could not be fetched or the
          newest album number could not be extracted from it;
        * the newest album number (as the string taken from the link) when
          ``int(count) + 6`` exceeds it;
        * ``count`` unchanged otherwise.
    """
    maxpage = geturl()
    if maxpage == 'error':
        print('取得最大页失败!')
        return 'max page error'

    requested = int(count)
    atlnum = maxpage.xpath('//div[5]/ul/li[1]/a/@href')
    try:
        newest = atlnum[0].split('/')[-2]
        newest_number = int(newest)
    except (IndexError, ValueError):
        # The XPath matched nothing, or the link does not have the expected
        # ".../<number>/" shape.  Report it through the existing sentinel
        # instead of crashing the caller.
        print('取得最大页失败!')
        return 'max page error'

    # NOTE(review): the +6 offset presumably relates to the crawl starting at
    # album 6 -- confirm the intended comparison.
    if requested + 6 > newest_number:
        print(f'输入的数值超过现有图集数量,已经把数量更改为现有图集数量{newest}')
        print('准备开始>>>>>>>')
        return newest

    print('准备开始>>>>>>>')
    return count
def getatlname(url):
    """Scrape an album page for its picture count, title and pagination data.

    Args:
        url: Address of the album page, passed straight to ``geturl``.

    Returns:
        On success a 4-tuple ``(picture_count, title_nodes, page_links,
        last_page)`` where

        * ``picture_count`` is the token following the first space in the
          "图片数量" paragraph, cut off at the first ``'P'``;
        * ``title_nodes`` is the non-empty list of ``<h1>`` text nodes;
        * ``page_links`` is the list of ``href`` values under ``#pages``;
        * ``last_page`` is the text of the second-to-last pagination link.

        The string ``'code error'`` is returned (after printing a message)
        when the page cannot be fetched or any of those pieces is missing.
    """
    atlcode = geturl(url)
    if atlcode == 'error':
        print('获取源码失败!')
        return 'code error'

    atlname = atlcode.xpath('//div[@class="tuji"]/div[@class="weizhi"]/h1/text()')
    atlnumber = atlcode.xpath('//div[@class="tuji"]/p[contains(text(),"图片数量")]/text()')
    if not atlnumber:
        print('未获取到图片数量!')
        return 'code error'

    # Keep the text in front of the first 'P', then split it on spaces; the
    # count is expected to be the second token.
    atlnum = atlnumber[0].split('P')[0].split(' ')
    if len(atlnum) < 2:
        print('未获取到图片数量!')
        return 'code error'

    if not atlname:
        # Callers index the first title node, so an empty list is unusable.
        print('未获取到图集名称!')
        return 'code error'

    atllink = atlcode.xpath('//*[@id="pages"]/a/@href')
    pagenum = atlcode.xpath('//*[@id="pages"]/text()/following-sibling::a/text()')
    if len(pagenum) < 2:
        # Without at least two pagination entries there is no
        # second-to-last one to read the page total from.
        print('未获取到分页信息!')
        return 'code error'

    return atlnum[1], atlname, atllink, pagenum[-2]
# Module-level crawl configuration.
# NOTE(review): these statements run whenever the module is imported, not
# only when it is executed as a script.

# Counter incremented by getpic for every picture written.
total = 0

# Requested highest album number.
count = 666103

# Ask the site for the newest album; fall back to ``count`` on failure.
maxnum2 = getmaxpage(count)
if maxnum2 != 'max page error':
    maxnum = int(maxnum2)
else:
    print(f'没有获取到最大页,已经默认输入的{count}为最大页')
    maxnum = int(count)
def getpic(i):
    """Download every picture of album number ``i`` to ``D:/tujigu/<title>``.

    The album page is inspected with ``getatlname``; each of its pages is
    then fetched with ``geturl`` and every image found under
    ``div.content`` is saved as ``1.jpg``, ``2.jpg``, ... in the album
    folder.  Pages or pictures that fail to download are reported and
    skipped so that one failure does not abort the remaining work.

    Args:
        i: Album number used to build the album URL.

    Returns:
        None.  The module-level ``total`` counter is incremented once per
        saved picture.
    """
    # NOTE(review): when this runs in a worker process the increment
    # presumably only affects that process's copy of ``total`` -- confirm
    # whether the parent is expected to see it.
    global total

    atlas = 'https://www.tujigu.com/a/' + str(i) + '/'
    atlasinfo = getatlname(atlas)
    if atlasinfo == 'code error':
        print('获得源码失败!放弃这个图集')
        print(f'url地址:{atlas}')
        return

    try:
        title = atlasinfo[1][0]
        pagecount = int(atlasinfo[3])
    except (IndexError, ValueError):
        # Missing title or a non-numeric page total: treat like any other
        # unusable album instead of raising inside the worker.
        print('获得源码失败!放弃这个图集')
        print(f'url地址:{atlas}')
        return

    # Folder name: the title with all whitespace and '/' removed.
    atlname = re.sub(r'\s', '', title).replace('/', '')
    path = 'D:/tujigu/' + atlname
    # exist_ok avoids the check-then-create race between parallel workers.
    os.makedirs(path, exist_ok=True)
    print(f'正在采集{title},图集编号为【{i}】,本图集一共{atlasinfo[0]}张')

    picname = 1
    for j in range(1, pagecount + 1):
        # The first page lives at the album root; later pages at "<n>.html".
        atllink = atlas if j == 1 else atlas + str(j) + '.html'
        picxml = geturl(atllink)
        if picxml == 'error':
            print('获取图片列表失败')
            continue

        for pic in picxml.xpath('//div[@class="content"]/img/@src'):
            try:
                # Same timeout as page requests so a stalled image cannot
                # hang the worker indefinitely.
                piccon = requests.get(pic, timeout=16)
                # Do not store an HTTP error body as if it were an image.
                piccon.raise_for_status()
            except requests.RequestException:
                print(f'下载图片失败:{pic}')
                continue

            with open(path + '/' + str(picname) + '.jpg', 'wb') as f:
                f.write(piccon.content)
            total += 1
            picname += 1
if __name__ == '__main__':
    # Crawl albums 6..maxnum (inclusive) with eight worker processes.
    # Using the pool as a context manager makes sure the workers are shut
    # down once the (blocking) map call has finished or raised.
    with mp.Pool(processes=8) as pool:
        res = pool.map(getpic, range(6, maxnum + 1))
    print(res)