刚刚开始学习爬虫,写了一个爬取小说的脚本,目前只能爬取一个网站。第一次写,有什么不对的地方请大家指出来,代码就直接贴出来了。
[Python] 纯文本查看 复制代码
from lxml import etree
from urllib import parse
import requests
import re
# Search
def search():
    """Look up a book on bookbao8.com by title and offer to download it.

    Prompts for the title, requests the site's search page, follows the
    first search result to the book page, prints the book's name, author
    and category, then asks for a confirmation number and passes the book
    page response on to ``down``.

    Returns:
        None. When the search page yields no result link, a message is
        printed and the function returns without calling ``down``.

    Raises:
        requests.RequestException: if either HTTP request fails or times
            out.
    """
    base_url = 'https://www.bookbao8.com'
    # The header name must be spelled "User-Agent"; the previous key
    # 'user - agent' (with spaces) is a different header, so the browser
    # identification was never actually sent.
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) '
                      'AppleWebKit/537.36 (KHTML, like Gecko) '
                      'Chrome/80.0.3987.122 Safari/537.36'
    }

    keyword = input("请输入书的全名:")
    # The title is percent-encoded twice, exactly as before.
    # NOTE(review): presumably the site's search route expects a
    # double-encoded title -- confirm against the live site.
    keyword = parse.quote(parse.quote(keyword))
    result_page = requests.request(
        'GET', base_url + '/Search/q_' + keyword, headers=headers, timeout=30
    )

    links = etree.HTML(result_page.text).xpath(
        "//div[@class='txt']/span[@class='t']/a/@href"
    )
    if not links:
        # Nothing matched: report it instead of failing with IndexError.
        print("没有找到相关书籍")
        return

    # urljoin copes with hrefs with or without a leading slash.
    u = requests.request(
        'GET', parse.urljoin(base_url, links[0]), headers=headers, timeout=30
    )
    page = etree.HTML(u.text)

    # Name
    book_name = ''.join(page.xpath("//div[@id='info']/h1/text()"))
    # Author and category are the first and second link texts of the info
    # paragraph; fall back to an empty string when one is missing.
    info_links = page.xpath("//div[@id='info']/p/a/text()")
    author = info_links[0] if len(info_links) > 0 else ''
    sort = info_links[1] if len(info_links) > 1 else ''

    # Summary of the book information
    search_content = (
        '名称:' + book_name + '\n'
        + '作者:' + author + '\n'
        + '类别:' + sort + '\n'
    )
    print(search_content)

    try:
        num = int(input("输入1下载:"))
    except ValueError:
        # Anything that is not a number counts as "do not download".
        num = 0
    down(u, num, book_name)
def down(u, num, name):
    """Download every chapter linked from a book page into ``<name>.txt``.

    Args:
        u: Response object of the book page; only its ``text`` attribute
            (the page HTML) is read, to collect the chapter links.
        num: The user's choice. Chapters are downloaded only when it is 1;
            any other value terminates the program.
        name: Book title, used as the output file name. Characters that
            are not allowed in file names are replaced by ``_``.

    Raises:
        SystemExit: if ``num`` is not 1.
    """
    if num != 1:
        # Leave before the output file is opened, so cancelling neither
        # creates an empty file nor leaks an open handle.
        raise SystemExit

    base_url = 'https://www.bookbao8.com/'
    # Proper "User-Agent" header name and value; the earlier key
    # 'user - agent' was not the User-Agent header at all.
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) '
                      'AppleWebKit/537.36 (KHTML, like Gecko) '
                      'Chrome/80.0.3987.122 Safari/537.36'
    }
    content_re = re.compile(r'<dd id="contents">(.*?)</dd>', re.S)
    line_break_re = re.compile(r'<br\s*/?>')

    safe_name = re.sub(r'[\\/:*?"<>|]', '_', name)
    hrefs = etree.HTML(u.text).xpath(
        "//div[@class='wp b2 info_chapterlist']/ul/li/a/@href"
    )

    # Append mode is kept from the original, so running the download twice
    # for the same book adds the chapters a second time.
    with open('%s.txt' % safe_name, 'a', encoding='utf-8') as file:
        for h in hrefs:
            try:
                chapter = requests.request(
                    'GET', parse.urljoin(base_url, h),
                    headers=headers, timeout=30,
                )
            except requests.RequestException as err:
                # One unreachable chapter should not abort the whole book.
                print("%s 下载失败: %s" % (h, err))
                continue

            # Chapter body
            match = content_re.search(chapter.text)
            if match is None:
                print("%s 未找到正文,已跳过" % h)
                continue
            # Drop the line-break tags before stripping spaces: removing
            # spaces first turns "<br />" into "<br/>", which the old
            # "<br />" substitution could never match.
            text = line_break_re.sub('', match.group(1))
            # NOTE(review): the "&nbsp;" removal is an addition; the
            # original pattern was a single space, possibly an entity that
            # was decoded when the code was pasted -- confirm.
            text = text.replace('&nbsp;', '').replace(' ', '')

            # Chapter title
            title = ''.join(
                etree.HTML(chapter.text).xpath(
                    "//div[@class='bdsub']/dl/dd/h1/text()"
                )
            )

            file.write(title)
            file.write('\n')
            file.write(text)
            file.write('\n')
            print("%s 下载完成" % title)
    print("下载完成!!!!!")
# Start the interactive search only when this file is run as a script, so
# importing the module does not prompt for input or touch the network.
if __name__ == "__main__":
    search()