python爬虫进阶——添加搜索功能

luoluoovo 发表于 2018-7-1 20:24

本帖最后由 luoluoovo 于 2018-7-5 17:08 编辑

又经过了一星期，在本星期中，学会了python各种库和数据类型的调用
可能有些部分还不完善:
1.总章节抓取不出来
2.作者过滤的规则写不出来

本来期待的是写一个python的小说GUI(下载+阅读)
尝试了一下，发现还得继续研究才行(python的GUI教程参考:关于阅读器的没找到)
所以这次就发了一个基于上次作品的进阶功能——不再是要复制链接来实现抓取了

python3.6写的
代码太简陋希望各位帮忙改进！
希望各位多回复，评分鼓励一下
代码如下：（论坛对部分字符的显示有问题，直接复制运行会出错，所以在后面补上一份下载链接。）（另外发现下载超过100章时网页会返回400；400和500错误现已解决，链接已重发。解决方法是添加请求头(headers)并在请求之间加入延迟，请求太快会被服务器ban掉。）
2018年7月5日17:08:39 链接已重补
import re
import urllib.request
import os
import time
from urllib.parse import quote

# Ask for a title or author, then download the qidian.com search-result page.
search_input = input("请输入搜索的书名或作者:")
search_Book = "https://www.qidian.com/search?kw=" + quote(search_input)
# The original author reports HTTP 400/500 responses unless request headers are
# sent, so a browser-like User-Agent accompanies the request.
search_request = urllib.request.Request(search_Book, headers={"User-Agent": "Mozilla/5.0"})
# The context manager closes the connection as soon as the body has been read.
with urllib.request.urlopen(search_request) as search_response:
    page = search_response.read()
# Decoded markup of the search page; the parsing functions below read this global.
txt = page.decode('utf-8')

def bookname(html=None):
    """Return the book titles found in a search-result page.

    Args:
        html: Markup to scan. When omitted, the module-level ``txt`` is used.

    Returns:
        A list of titles in page order with the ``<cite class="red-kw">``,
        ``</cite>`` and ``</a>`` tags removed; an empty list when nothing
        matches.
    """
    if html is None:
        html = txt
    filter_bookname = r'<h4><a href=".+?" target="_blank" data-eid=".+?" data-bid=".+?" data-algrid="0.0.0">(.+?)</h4>'
    tag_noise = r'<cite class="red-kw">|</a>|</cite>'
    # Each match is cleaned on its own: joining the matches with spaces and
    # splitting again would cut a title that itself contains a space.
    return [re.sub(tag_noise, '', raw) for raw in re.findall(filter_bookname, html, re.S)]

def bookChapter(html=None):
    """Return the link text of every ``<p class="update">`` entry on the page.

    Args:
        html: Markup to scan. When omitted, the module-level ``txt`` is used.

    Returns:
        A list with the text of the first link inside each
        ``<p class="update">`` element, in page order (presumably the latest
        chapter of each search result).
    """
    if html is None:
        html = txt
    filter_Chapter = r'<p class="update"><a href=".+?>(.+?)</a>'
    return re.findall(filter_Chapter, html, re.S)
def state(html=None):
    """Return the text of every ``<span>`` that directly follows ``<em>·</em>``.

    Args:
        html: Markup to scan. When omitted, the module-level ``txt`` is used.

    Returns:
        A list of the matched span texts in page order (presumably the update
        time shown next to each search result).
    """
    if html is None:
        html = txt
    filter_time = r'<em>·</em><span>(.+?)</span>'
    return re.findall(filter_time, html, re.S)

def booklist(names=None, chapters=None, states=None):
    """Build the numbered listing of search results, one result per line.

    Args:
        names: Book titles; defaults to ``bookname()``.
        chapters: Chapter texts; defaults to ``bookChapter()``.
        states: Status texts; defaults to ``state()``.

    Returns:
        A newline-joined string. Each line is the circled index followed by
        the title, a space, the chapter text, a space and the status text.
        Only as many lines as the shortest input (and never more than ten)
        are produced.
    """
    names = bookname() if names is None else names
    chapters = bookChapter() if chapters is None else chapters
    states = state() if states is None else states
    number = ["⓪", "⓵", "⓶", "⓷", "⓸", "⓹", "⓺", "⓻", "⓼", "⓽"]
    # zip stops at the shortest sequence, so a short result page simply
    # yields fewer rows instead of failing.
    rows = [
        mark + title + " " + chapter + " " + status
        for mark, title, chapter, status in zip(number, names, chapters, states)
    ]
    return "\n".join(rows)
# Show the numbered search results so the user can choose one of them.
search_listing = booklist()
print(search_listing)

# The chosen number is kept in the global ``i``, which book_First() reads.
selection = input("输入你要下载书的号数:")
i = int(selection)

def book_link(i, html=None):
    """Return the book-page URL of the ``i``-th search result.

    Args:
        i: Zero-based position of the wanted result.
        html: Markup to scan. When omitted, the module-level ``txt`` is used.

    Returns:
        The ``href`` of the ``i``-th ``<a class="red-btn">`` link, prefixed
        with ``http:`` (the page stores the links without a scheme).

    Raises:
        IndexError: If ``i`` does not refer to a result on the page.
    """
    if html is None:
        html = txt
    filter_book_id = r'<a class="red-btn" href="(.+?)"'
    book_id = re.findall(filter_book_id, html, re.S)
    # Reject negative numbers too; they would otherwise silently count from
    # the end of the list.
    if not 0 <= i < len(book_id):
        raise IndexError("no search result number %d (found %d results)" % (i, len(book_id)))
    return "http:" + book_id[i]

def book_First():
    """Return the URL of the first chapter of the selected book.

    Downloads the book page for the module-level selection ``i`` and reads
    its ``data-firstchapterjumpurl`` attribute.

    Returns:
        The attribute value prefixed with ``http:``.

    Raises:
        ValueError: If the book page has no ``data-firstchapterjumpurl``
            attribute.
    """
    book_url = book_link(i=i)
    if isinstance(book_url, list):
        # book_link may hand back every result URL; keep only the selected one.
        book_url = book_url[i]
    # The original author reports HTTP 400/500 responses unless request
    # headers are sent, so a browser-like User-Agent accompanies the request.
    request = urllib.request.Request(book_url, headers={"User-Agent": "Mozilla/5.0"})
    with urllib.request.urlopen(request) as response:
        book_page = response.read().decode('utf-8')
    filter_book_id = r'data-firstchapterjumpurl="(.+?)">'
    found = re.search(filter_book_id, book_page, re.S)
    if found is None:
        raise ValueError("first-chapter link not found on " + book_url)
    # The attribute value has no scheme, so one is added here.
    return "http:" + found.group(1)

######################################################################################################
def mkdir(path):          # create the folder
    """Create ``path`` (including missing parents) unless it already exists.

    Prints a short status message in either case.

    Args:
        path: Directory path to create.
    """
    if os.path.exists(path):
        print("文件已存在")
        return
    # exist_ok covers the case where the directory appears between the check
    # above and this call.
    os.makedirs(path, exist_ok=True)
    print("创建成功")
# Chapters are first written to a temporary folder, which is renamed after the
# book once the download has finished.
img_path = "E:/txt/txt/"
mkdir(img_path)


def _safe_filename(name):
    """Replace characters that Windows does not allow in file names."""
    return re.sub(r'[\\/:*?"<>|]', '_', name)


link = book_First()  # URL of the chapter to fetch next; starts at the first chapter
print("VIP章节暂不支持，不可以超过本书免费章节数")
x = int(input("请输入要下载的章节:"))

book_title = None  # learned from the chapter pages; stays None if nothing was downloaded
for _ in range(x):
    # The original author reports HTTP 400/500 responses unless request
    # headers are sent, so a browser-like User-Agent accompanies the request.
    chapter_request = urllib.request.Request(link, headers={"User-Agent": "Mozilla/5.0"})
    with urllib.request.urlopen(chapter_request) as chapter_response:
        page = chapter_response.read().decode('UTF-8')

    # Body text: every run that follows "p>" plus two ideographic spaces (\u3000).
    paragraphs = re.findall(r'p>\u3000\u3000(.+?)<', page, re.S)
    book_titles = re.findall(r'60c;</em>(.+?)</a>', page, re.S)
    # e.g. <h3 class="j_chapterName">第4章继任者</h3>
    chapter_titles = re.findall(r'<h3 class="j_chapterName">(.+?)</h3>', page, re.S)
    if not book_titles or not chapter_titles:
        # Not a readable chapter page, so stop instead of failing on an
        # empty match list.
        print("未能解析章节页面，停止下载")
        break
    book_title = book_titles[0]
    chaptername = chapter_titles[0]

    # One open per chapter, in append mode so an existing file is not
    # overwritten; UTF-8 avoids encode errors from the platform default codec.
    chapter_file_path = img_path + _safe_filename(chaptername) + ".txt"
    with open(chapter_file_path, "a", encoding="utf-8") as chapter_file:
        chapter_file.writelines(paragraph + "\n" for paragraph in paragraphs)
    print(chaptername + "下载完成")

    # e.g. <a id="j_chapterNext" href="//read.qidian.com/chapter/..." data-eid="qd_R109" >下一章</a>
    nextread = re.findall(r'<a id="j_chapterNext".+?href="//(.+?)"', page, re.S)
    if not nextread:
        break  # no next-chapter link on this page
    link = "https://" + nextread[0]
    # Pause between requests; the original author notes that fetching too
    # quickly gets the client banned by the server.
    time.sleep(1)

if book_title is not None:
    img_path2 = "E:/txt/" + _safe_filename(book_title)
    if not os.path.exists(img_path2):
        # No folder with this book's name yet: rename the temporary folder to it.
        os.rename(img_path, img_path2)
        print("下载完成")
    else:
        # %H (24-hour clock) keeps the timestamp suffix unambiguous.
        downloadtime = time.strftime("%Y%m%d%H%M%S", time.localtime())
        os.rename(img_path, img_path2 + downloadtime)
        print("文件名:" + book_title + " 已存在,重命名为:" + book_title + downloadtime + "\n" + "请勿重复操作")

链接：https://pan.baidu.com/s/1BnsnFv9tm36JurvgYsSu4Q 密码：9x7b

小爱同学. 发表于 2018-7-1 21:03

本帖最后由小爱同学. 于 2018-7-1 21:04 编辑

# -*- coding: utf-8 -*-
import json

class Who:
    """Decode a string of one- and two-character tokens into a digit string."""

    def __init__(self):
        # Token -> digit table. Several tokens map to the same digit, and
        # tokens are either one or two characters long.
        self.key = {
            "oe": "0", "n": "0", "z": "0", "on": "0",
            "oK": "1", "6": "1", "5": "1",
            "ow": "2", "-": "2", "A": "2", "oc": "2",
            "oi": "3", "i": "3", "o": "3", "oz": "3",
            "7e": "4", "v": "4", "P": "4", "7n": "4",
            "7K": "5", "4": "5", "k": "5", "7": "5", "7v": "5",
            "7w": "6", "C": "6", "s": "6", "7c": "6",
            "7i": "7", "S": "7", "l": "7", "7z": "7",
            "Ne": "8", "c": "8", "F": "8", "Nn": "8", "ov": "8",
            "NK": "9", "E": "9", "q": "9", "Nv": "9",
        }

    def calc(self, string, debug=False):
        """Translate ``string`` into digits using ``self.key``.

        Every ``*S1*`` marker is removed first. The rest is read left to
        right; a two-character token is preferred when it is in the table,
        otherwise a single character is consumed.

        NOTE(review): the subscripts of the posted code were lost, so the
        two-then-one character slicing is reconstructed from the surviving
        branch structure — confirm against the original script.

        Args:
            string: Encoded text.
            debug: When true, print each token with the digit it maps to.

        Returns:
            The decoded digits as a string ("" for empty input).

        Raises:
            KeyError: If a character is not part of any known token.
        """
        string = string.replace("*S1*", "")
        digits = []
        pos = 0
        while pos < len(string):
            pair = string[pos:pos + 2]
            # Near the end of the input ``pair`` may be a single character.
            token = pair if len(pair) == 2 and pair in self.key else string[pos]
            if debug:
                print(token, self.key[token])
            digits.append(self.key[token])
            pos += len(token)
        return "".join(digits)

luoluoovo 发表于 2018-7-1 21:12

小爱同学. 发表于 2018-7-1 21:03
# -*- coding: utf-8 -*-
import json

我也是新手。看起来应该是一个python文件。你复制这个代码保存为x.py。在有python环境下运行就可以了

我是裤子啊 发表于 2018-7-1 20:42

正需要谢谢啊

jimo 发表于 2018-7-1 20:58

谢谢，一直困扰这个问题

小爱同学. 发表于 2018-7-1 21:02

大佬，问一下，这个应该怎么用

向往的歌 发表于 2018-7-1 21:32

表示支持……

lzhhshuai 发表于 2018-7-1 21:33

喜欢啊，还有谁，我想问

zangdi 发表于 2018-7-1 21:55

厉害，楼主，厉害。支持楼主{:1_919:}

夏橙M兮 发表于 2018-7-1 22:10

楼主加油。谢谢分享。

页: [1] 2

吾爱破解 - 52pojie.cn's Archiver

python爬虫进阶——添加搜索功能