import os
import re
import requests
from time import sleep
from random import uniform
from bs4 import BeautifulSoup

def url_open(url):
    """Fetch a page and return its HTML text, decoded as UTF-8."""
    headers = {
        "User-Agent": "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 "
                      "(KHTML, like Gecko) Chrome/76.0.3809.100 Safari/537.36"
    }
    response = requests.get(url, headers=headers)
    response.encoding = "utf-8"
    return response.text
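
# A minimal hardening sketch (not part of the original script): the same fetch
# with a request timeout and simple exponential-backoff retries, in case the
# site drops connections. The name, retry count, and timeout are illustrative
# assumptions, not the original author's choices.
def url_open_with_retry(url, retries=3, timeout=10):
    for attempt in range(retries):
        try:
            response = requests.get(url, timeout=timeout)
            response.encoding = "utf-8"
            return response.text
        except requests.RequestException:
            sleep(2 ** attempt)  # back off 1s, 2s, 4s before retrying
    raise RuntimeError("failed to fetch %s after %d tries" % (url, retries))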

def collect_url(html, root_url):
    """Collect the name and URL of every finished novel on the site."""
    print("Collecting links to all finished novels on the site...")
    novel_name_all = []
    novel_url_all = []
    soup = BeautifulSoup(html, "html.parser")
    # The pagination widget's <em> reads "current/total"; take the total.
    total_pages = int(soup.find("div", class_="pagelink").em.text.split("/")[1])
    print("Total pages: " + str(total_pages))
    for page in range(1, total_pages + 1):
        url = root_url + 'modules/article/articlelist.php?fullflag=1&page={}'.format(page)
        print("Opening " + url)
        html = url_open(url)
        # Links to individual novel pages all point under /xiaoshuo/.
        p_novel_url = r'<a href="(.+?/xiaoshuo/.+\.html)">'
        novel_url_temp = re.findall(p_novel_url, html)
        print("Found %d novels on this page" % len(novel_url_temp))
        for novel_url in novel_url_temp:
            novel_url_all.append(novel_url)
            # The novel's name is the text of the anchor carrying its URL;
            # escape the URL so any regex metacharacters in it match literally.
            p_novel_name = fr'{re.escape(novel_url)}">(.+)</a>'
            novel_name_temp = re.findall(p_novel_name, html)[0]
            novel_name_all.append(novel_name_temp)
        break  # NOTE: stops after page 1 (handy for testing); remove to crawl every page
    data = [novel_name_all, novel_url_all]
    print("Collection finished; moving on to downloading the novels...")
    sleep(1)
    return data

def get_and_save_data(data):
    """Download every chapter of each collected novel into '<novel name>.txt'."""
    novel_name_all, novel_url_all = data
    for novel_name, novel_url in zip(novel_name_all, novel_url_all):
        print()
        print("Downloading novel: 《%s》" % novel_name)
        print()
        # The novel's landing page has a button linking to its full chapter list.
        html_1 = url_open(novel_url)
        soup_1 = BeautifulSoup(html_1, "html.parser")
        chapters_url = soup_1.find("p", class_="btnlinks").a["href"]
        html_2 = url_open(chapters_url)
        soup_2 = BeautifulSoup(html_2, "html.parser")
        chapters_url_all = soup_2.find_all("td", class_="L")
        for each in chapters_url_all:
            chapter_url = each.a["href"]
            html = url_open(chapter_url)
            soup = BeautifulSoup(html, "html.parser")
            # The chapter title sits in the first <dd>'s <h1>; collapse extra spaces.
            chapter_name = soup.find("dd").h1.text
            chapter_name = re.sub(' +', ' ', chapter_name).strip()
            print("Downloading 《%s》: %s" % (novel_name, chapter_name))
            contents = soup.find("dd", id="contents").text
            with open("%s.txt" % novel_name, "a", encoding="utf-8") as g:
                g.write("\n" * 3 + chapter_name + "\n" * 3)
                g.write(" " + contents)
            # Pause a random interval between requests to go easy on the server.
            sleep_time = uniform(0.35, 0.75)
            sleep(sleep_time)
        print("Novel 《%s》 finished downloading" % novel_name)
        print("Moving on to the next novel")
        sleep(2)
        break  # NOTE: stops after the first novel (testing); remove to download them all

def main():
    # Work inside a fixed download directory, creating it if it doesn't exist.
    path = r'C:\Users\Administrator\Desktop\test'
    if not os.path.exists(path):
        os.mkdir(path)
    os.chdir(path)
    root_url = "https://www.ddxsku.com/"
    target_url = root_url + "full.html"
    data = collect_url(url_open(target_url), root_url)
    get_and_save_data(data)


if __name__ == "__main__":
    main()