小说网站爬取,使用内置模块tkinter进行简单封装
本帖最后由 klmatao 于 2020-6-11 02:36 编辑使用tkinter对小说网站爬取到的小说内容进行简单的封装展示,做成简单的阅读器
https://static.52pojie.cn/static/image/hrline/1.gif
```
from tkinter import *
from tkinter import messagebox
from fake_useragent import UserAgent
import requests, time,os.path
from lxml import etree
'''当前版本只能获取到xx小说网中的免费章节'''
# Main-window geometry and the version string shown in the title bar.
window_width = 1225
window_height = 750
version = "1.0"
# Root folder where downloaded books are stored (one sub-folder per book).
# NOTE(review): hard-coded absolute path — adjust for your machine.
base_path = r"E:\myconfig\爬虫\练习\纵横中文网\xiaoshuo"
# Seconds to sleep between chapter downloads (politeness delay).
second = 1
class Application(Frame):
    """Main window: search/download a novel and page through its chapters.

    Downloaded chapters are stored as ``<base_path>/<book>/<title>.txt``,
    with an ``index.txt`` in the same folder listing the chapter titles,
    one per line, in reading order.
    """

    def __init__(self, master=None):
        super().__init__(master)
        self.master = master
        self.pack()
        self.createWidgets()
        self.default_load_txt()
        # 0-based index of the chapter currently displayed.
        self.next_content_count = 0

    def createWidgets(self):
        """Build all widgets: header, search row, text area, nav buttons."""
        # Header banner.
        self.headr = Canvas(self, width=900, height=50, bg="#D7E3BC")
        self.headr.create_text(450, 25, text="纵横中文网小说下载工具", font=("kaiti", 30))
        self.headr.grid(row=0, column=0, columnspan=3, ipadx=5)
        # Book-name label.
        self.search_book = Label(self, text="搜 索 书 名", font=("kaiti", 15), width=10)
        self.search_book.grid(row=1, column=0, sticky="e")
        # Search entry, pre-filled with a default book name.
        v1 = StringVar()
        self.entry = Entry(self, text=v1, width=50, font=("kaiti", 15))
        self.entry.grid(row=1, column=1)
        v1.set("圣墟魔神")
        # Download button.
        self.search_button = Button(self, text="下载", width=15, font=("kaiti", 15),
                                    command=self.download_txt)
        self.search_button.grid(row=1, column=2)
        # Scrollable, read-only text area for the chapter body.  The
        # scrollbar must be created with self as parent to grid correctly.
        scroll = Scrollbar(self)
        self.text = Text(self, width=120, height=28, bg="#90EE90", font=("kaiti", 15))
        scroll.grid(row=2, column=3, sticky="ns")
        self.text.grid(row=2, column=0, columnspan=3, pady=4)
        scroll.config(command=self.text.yview)
        self.text.config(yscrollcommand=scroll.set)
        # Previous-chapter button.
        self.pageup_but = Button(self, text="上 一 章", width=15, font=("kaiti", 15),
                                 command=self.last_content)
        self.pageup_but.grid(row=3, column=0)
        # Label showing the title of the chapter being read.
        self.current_files_lab = Label(self, width=30, font=("kaiti", 15))
        self.current_files_lab.grid(row=3, column=1, pady=3)
        # Next-chapter button.  BUG FIX: the original overwrote
        # self.pageup_but with this widget; use a distinct attribute.
        self.pagedown_but = Button(self, text="下 一 章", width=15, font=("kaiti", 15),
                                   command=self.next_content)
        self.pagedown_but.grid(row=3, column=2)

    def _read_index(self):
        """Return the list of chapter titles (stripped), or None if no index exists."""
        index_file = os.path.join(base_path, self.entry.get(), "index.txt")
        if not os.path.isfile(index_file):
            return None
        with open(index_file, "r", encoding="utf-8") as f:
            return [line.strip() for line in f if line.strip()]

    def _show_chapter(self, title):
        """Load the chapter file named *title* into the text widget."""
        chapter_file = os.path.join(base_path, self.entry.get(), "{0}.txt".format(title))
        self.text["state"] = NORMAL
        self.text.delete(1.0, END)
        with open(chapter_file, "r", encoding="utf-8") as f:
            for line in f:
                self.text.insert("insert", line)
        self.current_files_lab["text"] = title
        # Data loaded: make the long-text widget read-only again.
        self.text["state"] = DISABLED

    def default_load_txt(self):
        """Show the first chapter of the current book, if downloaded before."""
        titles = self._read_index()
        if titles:
            self.next_content_count = 0
            # BUG FIX: the original called .replace() on the readlines()
            # list; index the first title instead.
            self._show_chapter(titles[0])

    def last_content(self):
        """'上一章' button: step back one chapter, warning at the first one."""
        titles = self._read_index()
        if not titles:
            return
        self.next_content_count -= 1
        # BUG FIX: the original let the counter go negative and never
        # indexed titles by it at all.
        if self.next_content_count < 0:
            messagebox.showinfo(title="注意!", message="已经到第一章了")
            self.next_content_count = 0
        self._show_chapter(titles[self.next_content_count])

    def next_content(self):
        """'下一章' button: advance one chapter, wrapping after the last one."""
        titles = self._read_index()
        if not titles:
            return
        self.next_content_count += 1
        if self.next_content_count >= len(titles):
            messagebox.showinfo(title="注意!", message="已经到最后一章了")
            self.next_content_count = 0
        self._show_chapter(titles[self.next_content_count])

    def download_txt(self):
        """Download the book named in the entry box, then show its first chapter."""
        want_book = self.entry.get()
        print(want_book)
        main(want_book)
        self.default_load_txt()

    def read_write_to_text(self):
        # Placeholder kept for interface compatibility.
        pass
# Randomised User-Agent shared by every request (basic anti-bot measure).
headers = {"User-Agent": UserAgent().random}
def get_html(url):
    """Fetch *url* and return the decoded page text, or None on failure.

    A non-200 reply pops an informational message box.  BUG FIX: the
    original returned messagebox.showinfo()'s return value ("ok"), which
    downstream code then parsed as if it were HTML.
    """
    search_result = requests.get(url, headers=headers)
    search_result.encoding = "utf-8"
    if search_result.status_code == 200:
        return search_result.text
    messagebox.showinfo(title="错误!", message="未能搜索到!!!")
    return None
def parse_html(html, book):
    """Search the result page *html* for *book*; return its catalogue URL(s).

    Returns the list produced by the catalogue xpath (same shape as the
    original), or None when *html* is None or the book is not found.
    """
    if html is None:
        return None
    e = etree.HTML(html)
    # <a> elements holding the book titles; the text still needs extracting.
    bookname_nodes = e.xpath('//h2[@class="tit"]/a')
    # Parallel list of links to each book's introduction page.
    book_introduction_hrefs = e.xpath('//h2[@class="tit"]/a/@href')
    # BUG FIX: the original called .xpath() on the whole node list and
    # compared the entire list against the book name (never true, leaving
    # book_introduction_href unbound).  Extract and compare per element.
    booknames = [node.xpath('string(.)') for node in bookname_nodes]
    for i, name in enumerate(booknames):
        if name == book:
            # Follow the introduction page to find the "all catalogue" link.
            cato_resp = requests.get(book_introduction_hrefs[i], headers=headers)
            cato_e = etree.HTML(cato_resp.text)
            # List of catalogue URLs (normally a single element).
            return cato_e.xpath('//a[@class="all-catalog"]/@href')
    print("搜索不到《{0}》!".format(book))
    return None
def parse_all_cotapages(new_url):
    """Fetch the catalogue page at *new_url* and return all chapter links."""
    response = requests.get(new_url, headers=headers)
    response.encoding = "utf-8"
    tree = etree.HTML(response.text)
    # Each chapter link sits inside an <li class=" col-4"> element
    # (note the leading space in the class attribute on this site).
    return tree.xpath('//li[@class=" col-4"]/a/@href')
def get_content(url_list, second, your_want_book):
    """Download every chapter in *url_list* into <base_path>/<book>/.

    Each chapter body is written to "<title>.txt" and its title appended to
    "index.txt" so the reader can page through chapters in order.  Sleeps
    *second* seconds between requests to avoid getting the IP banned.
    """
    new_path = os.path.join(base_path, your_want_book)
    # Create the book folder once, outside the loop.
    if not os.path.exists(new_path):
        os.makedirs(new_path)
    for title_href in url_list:
        content_html = requests.get(title_href, headers=headers)
        content_html.encoding = "utf-8"
        e = etree.HTML(content_html.text)
        # BUG FIX: xpath() returns a list; the original concatenated the
        # list itself with "\n" (TypeError).  Take the first text node.
        title_nodes = e.xpath('//div[@class="title_txtbox"]/text()')
        title = title_nodes[0].strip() if title_nodes else "未知章节"
        # Paragraphs of the chapter body.
        content_list = e.xpath('//div[@class="content"]/p/text()')
        # Append the title to the chapter index.
        with open(os.path.join(new_path, "index.txt"), "a", encoding="utf-8") as f:
            f.write(title + "\n")
        with open(os.path.join(new_path, "{0}.txt".format(title)), "a", encoding="utf-8") as f:
            try:
                print("开始下载{0}".format(title))
                for content in content_list:
                    f.write(str(content) + "\n")
            except Exception:
                print("下载错误!")
        # Politeness delay so the site does not block our IP.
        time.sleep(second)
    return messagebox.showinfo(title="成功!", message="下载成功!")
def main(your_want_book):
    """Search for *your_want_book*, resolve its catalogue, download chapters."""
    search_url = "http://search.zongheng.com/s?keyword={0}".format(your_want_book)
    search_html = get_html(search_url)
    introduce_url = parse_html(search_html, your_want_book)
    # BUG FIX: parse_html returns None when the book is not found, and the
    # catalogue xpath yields a list — the original passed the raw list (or
    # None) straight into requests.get.  Guard and unwrap here.
    if not introduce_url:
        return
    if isinstance(introduce_url, list):
        introduce_url = introduce_url[0]
    content_url_lists = parse_all_cotapages(introduce_url)
    get_content(content_url_lists, second, your_want_book)
if __name__ == '__main__':
    # Build the root window, then hand control to the Tk event loop.
    root = Tk()
    root.title("纵横小说网下载工具v{0}".format(version))
    root.geometry("{0}x{1}".format(window_width, window_height))
    app = Application(root)
    app.mainloop()
```
https://static.52pojie.cn/static/image/hrline/1.gif
爬不了收费就没有亮点了 问一下,楼主看什么教程学得爬虫? 学习了,尝试着自己爬取点资源https://cdn.jsdelivr.net/gh/hishis/forum-master/public/images/patch.gif 学习学习 免费的该用爬吗? 还有这样的操作 学习一下 cnss 发表于 2020-6-9 01:05
爬不了收费就没有亮点了
{:1_926:}学习过程中继续完善 大兵马元帅 发表于 2020-6-9 06:05
免费的该用爬吗?
刚开始学习,之后完善 Tony丶W 发表于 2020-6-9 01:45
问一下,楼主看什么教程学的爬虫?
网上找的教程
页:
[1]
2