import parsel
from concurrent.futures import ThreadPoolExecutor, as_completed

# Parse the index page into a chapter list of {'name', 'url'} dicts
def parse_get_chapter(context):
    l_ret = []
    selector = parsel.Selector(context)
    lis = selector.css('#play_0 > ul > li')
    for li in lis:
        info = {}
        info['name'] = li.xpath('a/text()').get()
        info['url'] = li.xpath('a/@href').get()
        l_ret.append(info)
    return l_ret
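
# Each entry in the returned list looks like (illustrative values only, not
# real site data): {'name': '第一章 ...', 'url': 'https://.../book/.../1.html'}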

# Parse one chapter's body text; a chapter may continue onto a second page
def parse_get_one_chapter(context):
    r = ''
    selector = parsel.Selector(context)
    # The first <p> is skipped (presumably non-story boilerplate on this site)
    lc = selector.css('#content > p::text').getall()[1:]
    for l in lc:
        r = r + ' ' + l + '\r\n'
    # Second page, reached via the site's "next page" (下一页) link
    next_page_name = selector.css("#content > div.m-tpage > ul > li.col-md-4.col-xs-12.col-sm-12 > a::text").get()
    if next_page_name == '下一页':
        next_page_href = selector.css("#content > div.m-tpage > ul > li.col-md-4.col-xs-12.col-sm-12 > a::attr(href)").get()
        # Join the site root (taken from the module-level url) with the relative href
        next_page_url = url[0:url.index('/book')] + next_page_href
        next_page_content = get_url_html(next_page_url)
        next_selector = parsel.Selector(next_page_content)
        next_lc = next_selector.css('#content > p::text').getall()[1:]
        for l in next_lc:
            r = r + ' ' + l + '\r\n'
    # Strip leftover '&nb' fragments (truncated '&nbsp;' entities in the source)
    r = r.replace('&nb', '')
    return r
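
# get_url_html and the module-level url (the novel's index page) are defined
# earlier in the post. For readers starting here, a minimal sketch of the
# fetch helper, assuming the requests library and a UTF-8 site; the real
# implementation may differ:
import requests

def get_url_html(page_url):
    # Fetch a page and return its decoded HTML text
    resp = requests.get(page_url, timeout=10)
    resp.encoding = 'utf-8'
    return resp.text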

# Thread worker: fetch and parse one chapter
def thread_spider(chapter_info: dict):
    name = chapter_info['name']
    url_c = chapter_info['url']
    # First page of the chapter
    context = get_url_html(url_c)
    rc = parse_get_one_chapter(context)
    # Store the text on the shared dict so main() can read it after the pool finishes
    chapter_info['context'] = rc
    return chapter_info
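
# parse_get_title is likewise defined earlier in the post; a minimal sketch,
# assuming the title sits in the index page's <h1> (the selector here is an
# assumption):
def parse_get_title(context):
    return parsel.Selector(context).css('h1::text').get()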

def main():
    # Fetch the main index page
    html_content = get_url_html(url)
    # Novel title
    story_title = parse_get_title(html_content)
    # Chapter list
    list_chapter_info = parse_get_chapter(html_content)
    # Thread pool: fetch every chapter, at most 16 threads at a time
    with ThreadPoolExecutor(max_workers=16) as t:
        obj_list = []
        # Queue one task per chapter
        for chapter_info in list_chapter_info:
            obj = t.submit(thread_spider, chapter_info)
            obj_list.append(obj)
        # Wait for all tasks to finish
        for future in as_completed(obj_list):
            data = future.result()
            print(data)
            print('*' * 50)
    # Write the collected chapters to a file, in original chapter order
    with open(story_title + '.txt', 'w', encoding='utf-8') as f:
        for chapter_info in list_chapter_info:
            f.write('\r\n')
            f.write(chapter_info['name'])
            f.write('\r\n')
            f.write(chapter_info['context'])
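
# Standard entry point: run the scraper when executed as a script
if __name__ == '__main__':
    main()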