小说网站爬取，使用内置模块tkinter进行简单封装

klmatao 发表于 2020-6-8 23:48

本帖最后由 klmatao 于 2020-6-11 02:36 编辑

使用tkinter对小说网站爬取到的小说内容进行简单的封装展示，做成简单的阅读器
https://static.52pojie.cn/static/image/hrline/1.gif
```
from tkinter import *
from tkinter import messagebox
from fake_useragent import UserAgent
import requests, time,os.path
from lxml import etree

# Note (translation of the string statement below): the current version can
# only fetch the free chapters of the novel site.
'''当前版本只能获取到xx小说网中的免费章节'''
# Window size in pixels; combined into the Tk geometry string at start-up.
window_width = 1225
window_height = 750
# Version label shown in the window title.
version = "1.0"
# Root folder for downloaded books; each book gets its own sub-folder holding
# an index.txt plus one .txt file per chapter.
base_path = r"E:\myconfig\爬虫\练习\纵横中文网\xiaoshuo"
# Seconds to sleep after each chapter download.
second = 1

class Application(Frame):
    """Tkinter front-end: download a book and page through its chapters.

    Chapters are read from ``<base_path>\\<book>\\<title>.txt``; the ordered
    list of titles comes from ``<base_path>\\<book>\\index.txt``. The book
    name is whatever is currently typed in the search entry.
    """

    def __init__(self, master=None):
        super().__init__(master)
        self.master = master
        # Index (into index.txt) of the chapter currently displayed. It is
        # set before anything is loaded so every code path can rely on it.
        self.next_content_count = 0
        self.pack()
        self.createWidgets()
        self.default_load_txt()

    def createWidgets(self):
        """Build the header, search row, text area and navigation row."""
        # Header banner.
        self.headr = Canvas(self, width=900, height=50, bg="#D7E3BC")
        self.headr.create_text(450, 25, text="纵横中文网小说下载工具", font=("kaiti", 30))
        self.headr.grid(row=0, column=0, columnspan=3, ipadx=5)

        # "Book name" label.
        self.search_book = Label(self, text="搜索书名", font=("kaiti", 15), width=10)
        self.search_book.grid(row=1, column=0, sticky="e")

        # Search box. The variable is kept on the instance so it lives as
        # long as the entry that is bound to it.
        self.search_var = StringVar()
        self.entry = Entry(self, textvariable=self.search_var, width=50, font=("kaiti", 15))
        self.entry.grid(row=1, column=1)
        self.search_var.set("圣墟魔神")

        # Download button.
        self.search_button = Button(self, text="下载", width=15, font=("kaiti", 15),
                                    command=self.download_txt)
        self.search_button.grid(row=1, column=2)

        # Text area with a vertical scrollbar; the scrollbar must be a child
        # of this frame so it can share the grid.
        scroll = Scrollbar(self)
        self.text = Text(self, width=120, height=28, bg="#90EE90", font=("kaiti", 15))
        scroll.grid(row=2, column=3, sticky="ns")
        self.text.grid(row=2, column=0, columnspan=3, pady=4)
        scroll.config(command=self.text.yview)
        self.text.config(yscrollcommand=scroll.set)

        # "Previous chapter" button.
        self.pageup_but = Button(self, text="上一章", width=15, font=("kaiti", 15),
                                 command=self.last_content)
        self.pageup_but.grid(row=3, column=0)

        # Label showing the title of the chapter being read.
        self.current_files_lab = Label(self, width=30, font=("kaiti", 15))
        self.current_files_lab.grid(row=3, column=1, pady=3)

        # "Next chapter" button. It gets its own attribute so it no longer
        # overwrites the reference to the "previous" button.
        self.pagedown_but = Button(self, text="下一章", width=15, font=("kaiti", 15),
                                   command=self.next_content)
        self.pagedown_but.grid(row=3, column=2)

    def _book_file(self, filename):
        """Return the path of *filename* inside the current book's folder."""
        # Same backslash-joined layout that get_content() uses when writing.
        return base_path + r"\{0}\{1}".format(self.entry.get(), filename)

    def _read_titles(self):
        """Return the chapter titles listed in index.txt ([] if it is missing)."""
        index_file = self._book_file("index.txt")
        if not os.path.isfile(index_file):
            return []
        with open(index_file, "r", encoding="utf-8") as f:
            stripped = (line.rstrip("\n") for line in f)
            return [title for title in stripped if title]

    def _show_chapter(self, titles, index):
        """Display chapter ``titles[index]`` and remember it as the current one."""
        title = titles[index]
        # Read first so a missing chapter file leaves the widget untouched.
        with open(self._book_file(title + ".txt"), "r", encoding="utf-8") as f:
            body = f.read()
        # The widget is kept read-only between loads; it has to be writable
        # while its content is replaced.
        self.text["state"] = NORMAL
        self.text.delete("1.0", END)
        self.text.insert("insert", body)
        self.text["state"] = DISABLED
        self.current_files_lab["text"] = title
        self.next_content_count = index

    def default_load_txt(self):
        """Show the first chapter of the current book, if it has been downloaded."""
        titles = self._read_titles()
        if titles:
            self._show_chapter(titles, 0)

    def last_content(self):
        """Handler for the "previous chapter" button."""
        titles = self._read_titles()
        if not titles:
            # Nothing downloaded for this book name yet.
            return
        target = min(self.next_content_count, len(titles)) - 1
        if target < 0:
            messagebox.showinfo(title="注意！", message="已经到第一章了")
            target = 0
        self._show_chapter(titles, target)

    def next_content(self):
        """Handler for the "next chapter" button."""
        titles = self._read_titles()
        if not titles:
            # Nothing downloaded for this book name yet.
            return
        target = self.next_content_count + 1
        if target >= len(titles):
            messagebox.showinfo(title="注意！", message="已经到最后一章了")
            # Past the last chapter the reader wraps back to the first one.
            target = 0
        self._show_chapter(titles, target)

    def download_txt(self):
        """Handler for the download button: fetch the book, then show chapter 1."""
        want_book = self.entry.get()
        if not want_book.strip():
            # An empty search box cannot match any book.
            return
        print(want_book)
        main(want_book)
        self.default_load_txt()

    def read_write_to_text(self):
        """Placeholder; currently does nothing."""
        pass

# Request headers shared by every HTTP call in this module. The User-Agent is
# evaluated once, when the module is loaded (presumably a randomly chosen
# browser string from fake_useragent - confirm against that library's docs).
headers = {"User-Agent": UserAgent().random}

def get_html(url):
    """Fetch *url* and return the response body decoded as UTF-8.

    On any status other than 200 an error dialog is shown instead and the
    value returned by ``messagebox.showinfo`` is handed back to the caller.
    """
    response = requests.get(url, headers=headers)
    response.encoding = "utf-8"
    if response.status_code != 200:
        return messagebox.showinfo(title="错误！", message="未能搜索到！！！")
    return response.text

def parse_html(html, book):
    """Find *book* in a search-result page and return its catalog URL.

    Every ``<h2 class="tit"><a>`` link in *html* is compared with *book*.
    For the first exact match, the linked introduction page is fetched and
    the href of its ``<a class="all-catalog">`` link is returned.

    Args:
        html: Markup of the search-result page.
        book: Book title to look for.

    Returns:
        The catalog URL as a string, or ``None`` when the book (or its
        catalog link) cannot be found.
    """
    if html:
        tree = etree.HTML(html)
        links = tree.xpath('//h2[@class="tit"]/a') if tree is not None else []
        wanted = book.strip()
        for link in links:
            # string(.) flattens nested markup inside the link into plain
            # text, so the title can be compared as a single string.
            if link.xpath('string(.)').strip() != wanted:
                continue
            hrefs = link.xpath('@href')
            if not hrefs:
                continue
            intro_resp = requests.get(hrefs[0], headers=headers)
            intro_tree = etree.HTML(intro_resp.text)
            catalog_hrefs = intro_tree.xpath('//a[@class="all-catalog"]/@href')
            if catalog_hrefs:
                # xpath() yields a list; the caller needs one URL.
                return catalog_hrefs[0]
            break
    print("搜索不到《{0}》!".format(book))
    return None

def parse_all_cotapages(new_url):
    """Download the catalog page at *new_url* and return its chapter links.

    The result is the list of ``href`` values of the ``<a>`` elements
    directly inside ``<li class=" col-4">`` items.
    """
    response = requests.get(new_url, headers=headers)
    response.encoding = "utf-8"
    catalog_tree = etree.HTML(response.text)
    return catalog_tree.xpath('//li[@class=" col-4"]/a/@href')

def get_content(url_list, second, your_want_book):
    """Download every chapter in *url_list* into the book's folder.

    Each chapter is saved as ``<base_path>\\<book>\\<title>.txt`` and its
    title is appended to ``index.txt`` in the same folder. Titles already in
    the index are not added again, and a chapter file is overwritten rather
    than appended to, so downloading a book twice does not duplicate text.

    Args:
        url_list: Chapter page URLs, in reading order.
        second: Seconds to sleep after each chapter.
        your_want_book: Book name; used as the folder name.

    Returns:
        The value returned by the final "success" message box.
    """
    book_dir = base_path + r"\{0}".format(your_want_book)
    os.makedirs(book_dir, exist_ok=True)
    index_file = book_dir + r"\index.txt"

    # Titles recorded by an earlier run, so the index never lists one twice.
    known_titles = set()
    if os.path.isfile(index_file):
        with open(index_file, "r", encoding="utf-8") as f:
            known_titles = {line.rstrip("\n") for line in f}

    # Characters that cannot appear in a Windows file name.
    forbidden = str.maketrans({ch: "_" for ch in '\\/:*?"<>|'})

    for title_href in url_list:
        page = requests.get(title_href, headers=headers)
        page.encoding = "utf-8"
        tree = etree.HTML(page.text)
        # xpath() returns a list of text nodes: merge them, collapse runs of
        # whitespace, and make the result usable as a file name.
        title_parts = tree.xpath('//div[@class="title_txtbox"]/text()')
        title = " ".join("".join(title_parts).split()).translate(forbidden)
        content_list = tree.xpath('//div[@class="content"]/p/text()')

        if not title:
            # Without a title there is no file name to save under.
            print("下载错误！")
        else:
            print("开始下载{0}".format(title))
            try:
                with open(book_dir + r"\{0}.txt".format(title), "w", encoding="utf-8") as f:
                    f.writelines(str(paragraph) + "\n" for paragraph in content_list)
            except OSError:
                print("下载错误！")
            else:
                # Index the chapter only once its file exists, so the reader
                # never follows an index entry to a missing file.
                if title not in known_titles:
                    with open(index_file, "a", encoding="utf-8") as f:
                        f.write(title + "\n")
                    known_titles.add(title)
        # Pause between requests so the IP does not get banned.
        time.sleep(second)
    return messagebox.showinfo(title="成功！", message="下载成功！")

def main(your_want_book):
    """Search for *your_want_book* and download all chapters that are found.

    Runs the whole pipeline: search page -> catalog URL -> chapter links ->
    chapter files. When the book or its chapter list cannot be found, an
    error dialog is shown and nothing is downloaded.
    """
    from urllib.parse import quote

    # Quote the keyword so characters such as '&', '#' or spaces in a book
    # name cannot break the query string.
    search_url = "http://search.zongheng.com/s?keyword={0}".format(quote(your_want_book))
    search_html = get_html(search_url)
    introduce_url = parse_html(search_html, your_want_book)
    # An XPath query yields a list; accept either a list of hrefs or one URL.
    if isinstance(introduce_url, (list, tuple)):
        introduce_url = introduce_url[0] if introduce_url else None
    if not introduce_url:
        messagebox.showinfo(title="错误！", message="未能搜索到！！！")
        return None
    content_url_lists = parse_all_cotapages(introduce_url)
    if not content_url_lists:
        # No chapter links: reporting a successful download would be wrong.
        messagebox.showinfo(title="错误！", message="未能搜索到！！！")
        return None
    get_content(content_url_lists, second, your_want_book)
    return None

if __name__ == '__main__':
    # Script entry point: create the main window, apply the title and size
    # from the module settings, then hand control to the Tk event loop.
    window_title = "纵横小说网下载工具v{0}".format(version)
    window_size = "{0}x{1}".format(window_width, window_height)
    root = Tk()
    root.title(window_title)
    root.geometry(window_size)
    app = Application(root)
    app.mainloop()
```

https://static.52pojie.cn/static/image/hrline/1.gif

cnss 发表于 2020-6-9 01:05

爬不了收费就没有亮点了

Tony丶W 发表于 2020-6-9 01:45

问一下，楼主看什么教程学得爬虫？

netpeng 发表于 2020-6-9 03:09

学习了，尝试着自己爬取点资源 https://cdn.jsdelivr.net/gh/hishis/forum-master/public/images/patch.gif

wljily 发表于 2020-6-9 05:02

学习学习

大兵马元帅 发表于 2020-6-9 06:05

免费的该用爬吗？

西瓜大爷 发表于 2020-6-9 07:39

还有这样的操作学习一下

klmatao 发表于 2020-6-9 07:54

cnss 发表于 2020-6-9 01:05
爬不了收费就没有亮点了

{:1_926:}学习过程中继续完善

klmatao 发表于 2020-6-9 07:56

大兵马元帅 发表于 2020-6-9 06:05
免费的该用爬吗？

刚开始学习，之后完善

klmatao 发表于 2020-6-9 08:07

Tony丶W 发表于 2020-6-9 01:45
问一下，楼主看什么教程学得爬虫？

网上找的教程

页: [1] 2

吾爱破解 - 52pojie.cn's Archiver

小说网站爬取，使用内置模块tkinter进行简单封装