import os
import re
import urllib.request
from urllib.request import urlopen

import requests
from lxml import etree
def geturl(url):
    """Fetch a listing page and return the album URLs found on it."""
    headers = {
        'User-Agent': 'Mozilla/5.0 (Linux; Android 8.0; Pixel 2 Build/OPD3.170816.012) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/84.0.4147.105 Mobile Safari/537.36 Edg/84.0.522.52'
    }
    req = urllib.request.Request(url, headers=headers)
    print('1. Opening URL... ' + url)
    website = urlopen(req, timeout=120)
    html = website.read().decode('utf8')
    website.close()
    print('2. Searching for matching album URLs...')
    # Assumes each listing entry looks like
    # <p class="biaoti"><a href="https://www.tujigu.com/a/<id>/" target="_blank">;
    # the capture group pulls out the numeric album id.
    links = re.findall(r'<p class="biaoti"><a href="https://www\.tujigu\.com/a/(\d+)/" target="_blank">', html)
    url_list = []
    print('3. Building the album URL list...')
    for link in links:
        aurl = 'https://www.tujigu.com/a/' + link + '/'
        url_list.append(aurl)
    print('List ready, now starting to download images...')
    return url_list
def downimg(url_list):
    """Walk the album URLs, collect every image URL, and save the images to disk."""
    headers = {
        'User-Agent': 'Mozilla/5.0 (Linux; Android 8.0; Pixel 2 Build/OPD3.170816.012) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/84.0.4147.105 Mobile Safari/537.36 Edg/84.0.522.52'
    }
    newcount = len(url_list)
    h = 1  # start at the second entry of the list
    while h < newcount:
        url = url_list[h]
        print(url)
        list1 = []
        res = requests.get(url, headers=headers).text
        res = etree.HTML(res)
        # Re-encode to recover the UTF-8 title (the response is decoded as ISO-8859-1 by default).
        title = res.xpath('/html/body/div[2]/div[1]/h1/text()')[0].encode('ISO-8859-1').decode('UTF-8')
        page = res.xpath('//*[@id="pages"]/a/text()')
        data = res.xpath('//div[@class="content"]/img/@src')
        print(title)
        # Images on the first page of the album.
        for j in range(len(data)):
            print(data[j])
            list1.append(data[j])
        # Follow-up pages: the pager's second-to-last entry holds the last page number.
        i = 2
        while i < int(page[-2]) + 1:
            urls = url + '%s.html' % i
            res = requests.get(url=urls, headers=headers).text
            res = etree.HTML(res)
            data = res.xpath('//div[@class="content"]/img/@src')
            for j in range(len(data)):
                print(data[j])
                list1.append(data[j])
            i += 1
        path = './%s/' % title
        if not os.path.exists(path):
            os.makedirs(path)
            print('Directory created successfully')
        else:
            print('Directory already exists')
        print('Starting download!!!')
        for i in range(len(list1)):
            jpg_url = list1[i]
            res = requests.get(jpg_url).content
            with open('%s/%s.jpg' % (title, i), 'wb') as fp:
                fp.write(res)
            print('Image ' + str(i) + ' downloaded!')
        print('Album URL ' + str(h) + ' finished downloading!!!')
        h += 1
if __name__ == '__main__':
    print('Getting ready to start...')
    page = 1
    while page < 50:
        url = 'https://www.tujigu.com/zhongguo/' + str(page) + '.html'
        url_list = geturl(url)
        downimg(url_list)
        page += 1