import
requests,time,os
from
lxml
import
etree
from
urllib
import
request
# Module-level registry mapping each scraped link text to its href.
# sort() fills it in; get() reads it to resolve the chosen entry to a URL.
name_url = dict()
def sort(timeout=10):
    """Scrape the site's front page for its menu and tag links.

    Every ``<a>`` found under an element with class ``menu`` or id
    ``tag_ul`` is recorded in the module-level ``name_url`` mapping as
    ``link text -> href``.

    Args:
        timeout: Seconds to wait for the HTTP response. Without a timeout
            ``requests`` waits indefinitely on a stalled connection.

    Returns:
        The list of link texts, in the order the XPath query yields them.

    Raises:
        requests.exceptions.RequestException: If the page cannot be fetched
            (including when the timeout expires).
    """
    req = requests.get('https://www.tujigu.com/', timeout=timeout)
    req.encoding = 'utf-8'
    page = etree.HTML(req.text)
    text_list = page.xpath(
        '//*[@class="menu"]/li/a/text()|//*[@id="tag_ul"]/li/a/text()'
    )
    href_list = page.xpath(
        '//*[@class="menu"]/li/a/@href|//*[@id="tag_ul"]/li/a/@href'
    )
    # Both queries walk the same <a> elements, so texts and hrefs are paired
    # positionally; zip() stops at the shorter list if they ever differ.
    name_url.update(zip(text_list, href_list))
    return text_list
# Characters deleted from album titles before they are used as directory
# names: the newline plus / \ : * " < > | ? (presumably because these are
# not allowed in Windows file names).
_TITLE_STRIP_TABLE = str.maketrans('', '', '\n/\\:*"<>|?')


def dow(url, name, timeout=10):
    """Download every album linked from a listing page.

    Albums are stored as ``图集谷/<name>/<album title>/<n>.jpg`` where ``n``
    counts the images of one album starting at 1. An album whose directory
    already exists is reported as downloaded and skipped without fetching
    its page again.

    Args:
        url: Address of the listing page whose ``biaoti`` links are followed.
        name: Directory name used for this listing under ``图集谷``.
        timeout: Seconds to wait for each HTML page request. It does not
            apply to the image downloads done through ``urlretrieve``.

    Raises:
        requests.exceptions.RequestException: If a page cannot be fetched.
        OSError: If a directory cannot be created or an image download
            fails (``urllib`` errors derive from ``OSError``).
    """
    category_dir = '图集谷/{}'.format(name)
    # Creates the root directory as well and tolerates both already existing.
    os.makedirs(category_dir, exist_ok=True)

    atlas = requests.get(url, timeout=timeout)
    atlas.encoding = 'utf-8'
    atlas_xp = etree.HTML(atlas.text)
    text_list = atlas_xp.xpath('//*[@class="biaoti"]/a/text()')
    href_list = atlas_xp.xpath('//*[@class="biaoti"]/a/@href')

    for text, href in zip(text_list, href_list):
        text = text.translate(_TITLE_STRIP_TABLE)
        album_dir = '{}/{}'.format(category_dir, text)

        # Decide before touching the network: an existing directory means
        # the album was handled by an earlier run.
        # NOTE(review): the directory is created before the images are
        # fetched, so an album interrupted midway is also treated as
        # downloaded on later runs.
        if os.path.exists(album_dir):
            print('{}--------------内容已下载'.format(text))
            continue

        req = requests.get(href, timeout=timeout)
        req.encoding = 'utf-8'
        src_list = etree.HTML(req.text).xpath('//*[@class="content"]/img/@src')

        os.mkdir(album_dir)
        for num, src in enumerate(src_list, start=1):
            request.urlretrieve(src, '{}/{}.jpg'.format(album_dir, num))
        print('{}-------------成功下载'.format(text))
def get():
    """Run the interactive crawl loop.

    Each round re-reads the category list with ``sort()``, prints a numbered
    menu, asks the user for a number and downloads the chosen category with
    ``dow()``. The loop never terminates on its own.
    """
    while True:
        text_list = sort()
        # The first two entries and the last entry returned by sort() are
        # not offered in the menu.
        choices = text_list[2:-1]
        for i, text in enumerate(choices, start=1):
            # A single format call keeps titles containing '%' or braces
            # from being interpreted as format directives.
            print('{:02d}.{}'.format(i, text))

        opt = input('输入您要爬取的内容(首页为默认)>>>>> ')
        # isdecimal() only accepts characters int() can parse; isdigit()
        # also accepts e.g. superscript digits, which make int() raise.
        if not opt.isdecimal():
            # Previous message wrongly asked for Chinese input; a number is
            # what is required here.
            print('请输入数字序号')
            time.sleep(3)
            continue

        opt = int(opt)
        # Valid numbers are exactly the ones printed above, including the
        # last menu entry.
        if not 0 < opt <= len(choices):
            print('输入范围错误')
            time.sleep(3)
            continue

        name = choices[opt - 1]
        url = name_url[name]
        print('{}====开始爬取'.format(name))
        dow(url, name)
        input('爬取完成,按下回车重新开始')
# Launch the interactive crawler only when the file is executed directly,
# not when it is imported as a module.
if __name__ == '__main__':
    get()