"""Multi-threaded novel downloader for www.bigee.cc.

Fetches a book's chapter index, downloads every chapter concurrently into
per-chapter temp files, then stitches them into one text file.
"""
import os
import random
import re
import threading
import time
from queue import Queue

import requests
from lxml import etree
from requests.adapters import HTTPAdapter

# A User-Agent is picked at random per request so the traffic looks less
# like a single automated client.
user_agents = [
    'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.102 Safari/537.36 Edge/18.18363',
    'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/74.0.3729.169 Safari/537.36',
    'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:66.0) Gecko/20100101 Firefox/66.0',
    'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/87.0.4280.88 Safari/537.36',
    'Mozilla/5.0 (iPhone; CPU iPhone OS 13_2_3 like Mac OS X) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/13.0.3 Mobile/15E148 Safari/604.1',
]

# One shared Session with an enlarged connection pool so the many worker
# threads reuse keep-alive connections instead of reconnecting per request.
session = requests.Session()
adapter = HTTPAdapter(pool_connections=100, pool_maxsize=100)
session.mount('http://', adapter)
session.mount('https://', adapter)
def get_chaptercontent(chapter_url, temp_file, queue, semaphore, session, max_retries=5):
    """Download one chapter page and write title + body to *temp_file*.

    Puts *temp_file* on *queue* on success, or ``None`` on failure, so the
    collector can tell which chapters actually arrived.  *semaphore*
    bounds how many chapter downloads run concurrently.

    Args:
        chapter_url: Full URL of the chapter page.
        temp_file: Path the chapter text is written to (UTF-8).
        queue: Result queue shared with download_chapters().
        semaphore: threading.BoundedSemaphore limiting concurrency.
        session: Shared requests.Session (pooled connections).
        max_retries: Attempts before giving up on this chapter.
    """
    semaphore.acquire()
    try:
        retry_count = 0
        while retry_count < max_retries:
            try:
                # Small pause so the worker threads don't hammer the server.
                time.sleep(0.2)
                headers = {
                    'User-Agent': random.choice(user_agents),
                    'Accept-Language': 'en-US,en;q=0.9',
                }
                response = session.get(chapter_url, headers=headers, timeout=60)
                response.close()  # body is already buffered; release the connection

                if response.status_code == 429:
                    # Rate limited: honour Retry-After (default 2s).
                    # BUG FIX: count this as an attempt — the original
                    # continued without incrementing, so a permanently
                    # throttling server could spin this loop forever.
                    time.sleep(int(response.headers.get('Retry-After', 2)))
                    retry_count += 1
                    continue

                if response.status_code != 200:
                    # BUG FIX: the original fell through and parsed the
                    # error page on non-final attempts; retry instead.
                    retry_count += 1
                    if retry_count == max_retries:
                        print(f"未能获取章节: {chapter_url} - 状态码: {response.status_code}")
                        queue.put(None)
                        return
                    time.sleep(5)
                    continue

                html = response.content.decode(response.apparent_encoding)
                selector = etree.HTML(html)
                title = selector.xpath('//div[@class="content"]/h1/text()')
                contents = selector.xpath('//div[@id="chaptercontent"]/text()')

                if not title or not contents:
                    # BUG FIX: the original fell through to title[0] on
                    # non-final attempts and died with an uncaught
                    # IndexError inside the worker thread.
                    retry_count += 1
                    if retry_count == max_retries:
                        print(f"未能找到章节内容: {chapter_url}")
                        queue.put(None)
                        return
                    time.sleep(5)
                    continue

                # BUG FIX: rebuild from scratch on every attempt — the
                # original accumulated text across retries, duplicating
                # the chapter body.
                chaptercontent = ''
                for content in contents:
                    chaptercontent += '\n ' + str(content).strip()
                # Strip the site's self-promotion line and any in-body
                # chapter headings.
                chaptercontent = re.sub(r'请收藏.*?m.bigee.cc', '', chaptercontent, flags=re.S)
                chaptercontent = re.sub(r'[\s ]{0,6}第.{1,10}[部分章节卷页]{1,2}.{0,30}[\s \n]{0,6}', '', chaptercontent)

                print(f"\t正在下载:{title[0]}")
                with open(temp_file, 'w', encoding='utf-8') as f:
                    f.write(title[0] + '\n\n')
                    f.write(chaptercontent)
                queue.put(temp_file)
                return
            except requests.exceptions.RequestException as e:
                retry_count += 1
                if retry_count == max_retries:
                    print(f"达到最大重试次数,未能下载章节: {chapter_url} - 错误: {e}")
                    queue.put(None)
                    return
                time.sleep(5)
        # Retries exhausted without success or an explicit report above
        # (e.g. repeated 429s) — still signal the failure to the collector.
        queue.put(None)
    finally:
        semaphore.release()
def download_chapters(base_url):
    """Fetch the chapter index at *base_url* and download the whole book.

    Spawns one thread per chapter link (throttled by a BoundedSemaphore of
    the module-level ``max_threads``), collects the per-chapter temp files
    from the result queue, restores chapter order and merges them into
    '<book name>.txt' under ./我的小说.
    """
    retry_count = 0
    max_retries = 5
    while retry_count < max_retries:
        try:
            response = session.get(
                base_url,
                headers={'User-Agent': random.choice(user_agents)},
                timeout=60,
            )
            response.close()  # content is buffered; free the connection

            if response.status_code != 200:
                # BUG FIX: the original fell through and parsed the error
                # page on non-final attempts; retry instead.
                retry_count += 1
                if retry_count == max_retries:
                    print(f"未能获取URL: {response.status_code}")
                    return
                time.sleep(5)
                continue

            html = response.content.decode(response.apparent_encoding)
            selector = etree.HTML(html)
            chapter_links = selector.xpath('//dd/a/@href')
            if not chapter_links:
                # BUG FIX: the original fell through to the //h1 lookup and
                # could crash with IndexError; retry instead.
                retry_count += 1
                if retry_count == max_retries:
                    print("未找到章节链接。")
                    return
                time.sleep(5)
                continue

            book_name = selector.xpath('//h1/text()')[0]
            print(f'\n正在下载小说:{book_name}\n')

            save_directory = os.path.join(os.getcwd(), '我的小说')
            os.makedirs(save_directory, exist_ok=True)

            result_queue = Queue()
            semaphore = threading.BoundedSemaphore(max_threads)
            threads = []
            # The trailing five links are site navigation, not chapters;
            # "javascript:dd_show()" is the expander link, also skipped.
            for index, href in enumerate(chapter_links[:-5], start=1):
                if href != "javascript:dd_show()":
                    chapter_url = f'https://www.bigee.cc{href}'
                    temp_file = os.path.join(save_directory, f'temp_{index:04d}.txt')
                    thread = threading.Thread(
                        target=get_chaptercontent,
                        args=(chapter_url, temp_file, result_queue, semaphore, session, max_retries),
                    )
                    threads.append(thread)
                    thread.start()
            for thread in threads:
                thread.join()

            # Keep only successful downloads (workers put None on failure)
            # and restore chapter order from the zero-padded temp index.
            temp_files = []
            while not result_queue.empty():
                temp_file = result_queue.get()
                if temp_file:
                    temp_files.append(temp_file)
            temp_files.sort(key=lambda x: int(x.split('_')[-1].split('.')[0]))
            append_temp_files_to_main(temp_files, save_directory, book_name)
            return
        except requests.exceptions.RequestException as e:
            retry_count += 1
            if retry_count == max_retries:
                print(f"达到最大重试次数,未能下载章节列表。 - 错误: {e}")
                return
            time.sleep(5)
    # Defensive: reached only if the loop ever exits without reporting.
    print("达到最大重试次数,未能下载章节列表。")
def append_temp_files_to_main(temp_files, save_directory, book_name):
    """Merge the per-chapter temp files into '<book_name>.txt'.

    Chapters are appended in list order, each followed by a blank line;
    chapters that strip down to nothing are skipped.  Every temp file is
    deleted after it has been consumed, empty or not.
    """
    book_path = os.path.join(save_directory, f'{book_name}.txt')
    with open(book_path, 'w', encoding='utf-8') as out:
        for part in temp_files:
            handle = open(part, 'r', encoding='utf-8')
            try:
                body = handle.read().strip()
            finally:
                handle.close()
            if body:
                out.write(body)
                out.write('\n\n')
            os.remove(part)
if __name__ == "__main__":
    # Interactive entry point: ask for a book index URL, download it and
    # report the elapsed time.
    base_url = 'https://www.bigee.cc/'
    url = input(f"请输入网站({base_url})内选定小说目录页所在页网址:")
    # Upper bound on concurrent chapter-download threads; read as a
    # module-level global by download_chapters().
    max_threads = 100
    if url == '':
        # Default to a sample book when the user just presses Enter.
        url = 'https://www.bigee.cc/book/59507/'
    start_time = time.time()
    download_chapters(url)
    end_time = time.time()
    print(f'\n总耗时:{end_time - start_time:.2f}秒。\n')
    input("下载完成,小说保存在“我的小说”文件夹内,回车退出!")