Scraping novels from a certain book site and packaging them into a txt file
There's a bug: some chapters cannot be extracted. Opening those pages shows an empty page source, and testing with BeautifulSoup gives the same result, yet sometimes they come back normal, which is baffling; it's probably my network. To download a different novel, swap in the index URL of another book on the same site.
Switching to the mobile-site URL avoids the empty pages, but it is a little slower.
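Since the desktop site occasionally returns an empty body, one workaround is to retry the request a few times before skipping the chapter. A minimal sketch under that assumption (fetch_with_retry and its retry parameters are hypothetical, not part of the original script):

import time
import requests

def fetch_with_retry(session, url, headers=None, retries=3, delay=1):
    # Hypothetical helper: re-request a page whose body comes back empty,
    # which the desktop site occasionally does.
    text = ''
    for _ in range(retries):
        text = session.get(url, headers=headers).content.decode('gbk', errors='ignore')
        if text.strip():  # non-empty source: assume success
            return text
        time.sleep(delay)  # back off briefly before retrying
    return text  # give up and return whatever came back last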
import requests
from lxml import etree


class QuanshuSpider(object):
    def __init__(self):
        self.session = requests.Session()
        # The URL can be replaced with any book index page on 全书网
        self.index_url = 'http://www.quanshuwang.com/book/9/9055'
        self.headers = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/75.0.3770.90 Safari/537.36'
        }

    def get_index_page(self):
        # The site serves GBK-encoded pages, so decode the raw bytes manually
        index_page = self.session.get(self.index_url, headers=self.headers).content.decode('gbk')
        html = etree.HTML(index_page)
        # xpath() returns lists; take the first match for the title and author
        self.title = html.xpath('//*[@id="chapter"]/div/div/strong/text()')[0]
        self.author = html.xpath('//*[@id="chapter"]/div/div/span/text()')[0]
        print(self.title, self.author)
        chapter_urls = html.xpath('//*[@id="chapter"]/div/div/ul/div/li/a/@href')
        return chapter_urls

    def parse_chapter_page(self):
        for chapter_url in self.get_index_page():
            try:
                chapter_page = self.session.get(chapter_url, headers=self.headers).content.decode('gbk')
                html = etree.HTML(chapter_page)
                chapter_content = html.xpath('//*[@id="content"]/text()')
                chapter_title = html.xpath('//*[@id="directs"]/div/h1/strong/text()')[0] + '\n\n'
                self.save_data(chapter_title)
                print('Saving ' + chapter_url)
                print(chapter_title)
                for content in chapter_content:
                    # Indent each paragraph and strip stray whitespace
                    self.save_data(' ' + content.strip() + '\n')
            except Exception as e:
                # Skip chapters whose pages fail to download or parse
                print(e)
                continue

    def save_data(self, content):
        # Append to '<title> <author>.txt'; the with-block closes the file itself
        with open(self.title + ' ' + self.author + '.txt', 'a+', encoding='utf-8') as f:
            f.write(content)


if __name__ == '__main__':
    spider = QuanshuSpider()
    spider.parse_chapter_page()
Mobile version
import requests
from lxml import etree


class MquanshuSpider(object):
    def __init__(self):
        self.session = requests.Session()
        # Mobile-site chapter list pages, paginated as 9055_001.html, 9055_101.html, ...
        self.url = 'http://m.quanshuwang.com/list/9055_{}01.html'
        self.headers = {
            'User-Agent': 'Mozilla/5.0 (Linux; Android 5.0; SM-G900P Build/LRX21T) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/75.0.3770.90 Mobile Safari/537.36'
        }

    def get_paging(self):
        # Generate candidate list-page URLs; parse_paging stops at the first empty page
        for i in range(0, 100):
            yield self.url.format(str(i))

    def parse_paging(self):
        try:
            for paging_url in self.get_paging():
                paging_page = self.session.get(paging_url, headers=self.headers).content.decode('gbk')
                html = etree.HTML(paging_page)
                chapter_titles = html.xpath('//*[@id="alllist"]/li/a/@title')
                chapter_urls = html.xpath('//*[@id="alllist"]/li/a/@href')
                if len(chapter_titles) == 0:
                    # No chapters on this page: we are past the last list page
                    break
                # xpath() returns lists; keep the first match for book name and author
                self.name = html.xpath('//*[@id="htmlbookinfo"]/h1/text()')[0]
                self.author = html.xpath('//*[@id="htmlbookinfo"]/ul/li/a/text()')[0]
                for chapter_url, chapter_title in zip(chapter_urls, chapter_titles):
                    yield chapter_url, chapter_title
        except Exception as e:
            print(e)

    def parse_chapter_page(self):
        try:
            for chapter_url, chapter_title in self.parse_paging():
                # The list pages hold relative links; prepend the mobile domain
                chapter_url = 'http://m.quanshuwang.com' + str(chapter_url)
                chapter_page = self.session.get(chapter_url, headers=self.headers).content.decode('gbk')
                html = etree.HTML(chapter_page)
                chapter_conts = html.xpath('//*[@id="htmlContent"]/p/text()')
                self.save_book(chapter_title + '\n')
                print('Saving\n', chapter_url, chapter_title)
                for content in chapter_conts:
                    # Indent each paragraph and strip stray whitespace
                    self.save_book(' ' + content.strip() + '\n')
        except Exception as e:
            print(e)

    def save_book(self, content):
        with open(self.name + ' ' + self.author + '.txt', 'a+', encoding='utf-8') as f:
            f.write(content)


if __name__ == '__main__':
    spider = MquanshuSpider()
    spider.parse_chapter_page()
ghoob321 posted on 2019-8-12 07:54:
Bumping the thread is what keeps the OP motivated to share!!!
When I crawl 笔趣阁 with this, what comes down is garbled text.
I haven't read the OP's code, but my guess is it's an encoding problem. 笔趣阁 (including some of its mirror sites) uses GBK encoding; setting that in the code should fix it.
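For reference, a minimal sketch of overriding the encoding when a GBK site comes back as mojibake (the URL is a placeholder, not a real 笔趣阁 address):

import requests

resp = requests.get('http://example.com/chapter.html')  # placeholder URL
# Without a charset in the response headers, requests falls back to ISO-8859-1,
# which turns GBK pages into mojibake; override it with the site's real encoding.
resp.encoding = 'gbk'
print(resp.text[:200])  # now decodes correctly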
魔爪 seems to handle this too; I'll try this code. Thanks for sharing.