本帖最后由 小天0027 于 2020-5-8 22:29 编辑
因某人的要求,写了个小说爬虫,应该大家都能看的懂,我下面就直接上代码了(更新个翻页+随机头,断点续传以后再说,哈哈哈)。
最后声明:本代码仅供技术交流,请勿商用,如有侵权,请告知本人,本人立即删除。
import re, os, time
from lxml import etree
from requests import get
from fake_useragent import UserAgent
class NovelDownload(object):
    """Crawler for the free-novel listing on book.zongheng.com.

    Walks the listing page by page, downloads every chapter of every
    listed novel, and saves each chapter as a UTF-8 ``.txt`` file under
    ``f:/novel/<novel name>/``.
    """

    def __init__(self):
        self.next = True  # pagination flag: set False once the last page is reached
        self.page = 1     # 1-based index of the next listing page to fetch
        # Free-novel listing; {} is filled with the page number.
        self.url = 'http://book.zongheng.com/store/c0/c0/b0/u0/p{}/v0/s1/t0/u0/i1/ALL.html'
        self.catalog_url = 'http://book.zongheng.com/showchapter/'  # prefix of a book's chapter-list page
        self.headers = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/79.0.3945.36 Safari/537.36',
        }

    def get_ids(self):
        """Fetch the current listing page.

        Returns:
            tuple[list[str], list[str]]: ``(ids, names)`` — numeric book ids
            and the matching novel titles, in page order.

        Side effects: advances ``self.page`` when another page exists,
        otherwise clears ``self.next``.
        """
        res = get(url=self.url.format(self.page), headers=self.headers)
        html = etree.HTML(res.text)
        novel_list = html.xpath('//div[@class="bookname"]/a/text()')
        novel_url_list = html.xpath('//div[@class="bookname"]/a/@href')
        try:
            next_page = html.xpath('//div[@class="pagenumber pagebar"]/a[@class="block scrollpage"]/@title')
            # Two "scrollpage" links (prev + next) mean a following page exists;
            # page 1 only ever has the "next" link, hence the special case.
            if len(next_page) > 1 or self.page == 1:
                self.page += 1
            else:
                self.next = False
        except Exception:  # was a bare except:; narrowed so Ctrl-C still works
            self.next = False
        ids = []
        for link in novel_url_list:
            # Extract the numeric book id from e.g. .../book/123456.html
            book_id = re.findall(r'http://book.zongheng.com/book/(\d+)\.html', link)[0]
            ids.append(book_id)
        return ids, novel_list

    def get_detail(self, book_id):
        """Return the list of chapter-page URLs for one book id."""
        catalog_url = self.catalog_url + book_id + '.html'
        # BUG FIX: headers must be a keyword arg — the second positional
        # parameter of requests.get() is `params`, not `headers`.
        res = get(catalog_url, headers=self.headers)
        html = etree.HTML(res.text)
        detail_url = html.xpath('//li[@class=" col-4"]/a/@href')
        return detail_url

    def get_content(self, detail_url):
        """Download one chapter page; return ``(title, body_text)``."""
        detail = get(detail_url, headers=self.headers)  # keyword arg, same fix as above
        res = etree.HTML(detail.text)
        content = res.xpath('//div[@class="content"]/p/text()')
        title = res.xpath('//div[@class="title_txtbox"]/text()')[0]
        # Join paragraphs and drop full-width ideographic spaces.
        detail = '\r\n'.join(content).strip().replace('\u3000', '')
        return title, detail

    def save_file(self, title, detail, name):
        """Write one chapter to ``f:/novel/<name>/<title>.txt`` (UTF-8).

        Characters Windows forbids in file names are stripped from the
        chapter title so ``open()`` cannot fail on e.g. '?' or ':'.
        """
        safe_title = re.sub(r'[\\/:*?"<>|]', '', title)
        with open('f:/novel/{}/{}.txt'.format(name, safe_title), 'w', encoding='utf-8') as f:
            f.write(detail)

    def main(self):
        """Drive the crawl: process listing pages until none remain.

        Iterative (the original recursed once per page, risking
        RecursionError on long listings); behavior is otherwise the same.
        """
        while True:
            ids, novel_list = self.get_ids()
            print('这是第{}页'.format(self.page - 1))
            for name, book_id in zip(novel_list, ids):
                # exist_ok avoids the race-prone exists()/mkdir() pair
                os.makedirs('f:/novel/{}'.format(name), exist_ok=True)
                for detail_url in self.get_detail(book_id):
                    title, detail = self.get_content(detail_url)
                    self.save_file(title, detail, name)
                print('《{}》,{}下载完毕'.format(name, title))
                time.sleep(5)  # be polite to the server between books
                # Rotate the User-Agent so repeated requests look varied.
                ua = UserAgent()
                self.headers['User-Agent'] = ua.random
            if not self.next:
                print('已经完了')
                break
if __name__ == '__main__':
    # Entry point: build the crawler and start the download run.
    downloader = NovelDownload()
    downloader.main()
emmm,没有图片,也没有说明,看函数定义就知道每个函数要做什么,然后希望能给有需要的同学提供下思路(~~,其实我也是渣渣,大声BB)
最后,谢谢各位的浏览
|