python爬虫进阶——添加搜索功能
本帖最后由 luoluoovo 于 2018-7-5 17:08 编辑。
又经过了一星期,在这一星期中,学会了python各种库和数据类型的调用。
可能有些部分还不完善:
1.总章节抓取不出来
2.作者过滤的规则写不出来
本来期待的是写一个python的小说GUI(下载+阅读)
尝试了一下,发现还得继续研究才行(python的GUI教程参考:关于阅读器的没找到)
所以这次就发了一个基于上次作品的进阶功能——不再是要复制链接来实现抓取了
python3.6写的
代码太简陋希望各位帮忙改进!
希望各位多回复,评分鼓励一下
代码如下:(论坛有的字符显示有问题,直接复制运行会出错,所以在后面补上一份链接。另外发现超过100章节时网页会返回400;现已解决400和500错误问题,链接已重发。解决方法是添加headers头和添加延迟,请求太快会被服务器ban掉)
2018年7月5日17:08:39 链接已重补
import re
import urllib.request
import os
import time
from urllib.parse import quote
# Ask for a keyword and download the matching search-result page.
search_input = input("请输入搜索的书名或作者:")
# The keyword is percent-encoded so non-ASCII text is valid inside the URL.
search_Book = "https://www.qidian.com/search?kw=" + quote(search_input)
# Use a context manager so the HTTP response is closed once it has been read.
with urllib.request.urlopen(search_Book) as response:
    page = response.read()
# Decoded search page; the parsing functions in this file read it as a global.
txt = page.decode('utf-8')
def bookname(html=None):
    """Extract the book titles from a search-result page.

    Args:
        html: Page source to parse. When omitted, the module-level ``txt``
            is used.

    Returns:
        A list with one title per matched ``<h4>`` entry, in page order,
        with the keyword-highlight ``<cite>`` markup and the closing
        ``</a>`` removed. The list is empty when nothing matches.
    """
    if html is None:
        html = txt
    title_pattern = r'<h4><a href=".+?" target="_blank" data-eid=".+?" data-bid=".+?" data-algrid="0.0.0">(.+?)</h4>'
    markup_pattern = r'<cite class="red-kw">|</a>|</cite>'
    raw_titles = re.findall(title_pattern, html, re.S)
    # Clean each title on its own. Joining everything with spaces and then
    # splitting on spaces would break any title that itself contains a space.
    return [re.sub(markup_pattern, '', raw) for raw in raw_titles]
def bookChapter(html=None):
    """Extract the link text of every ``<p class="update">`` entry.

    Args:
        html: Page source to parse. When omitted, the module-level ``txt``
            is used.

    Returns:
        A list of the captured link texts in page order; empty when
        nothing matches.
    """
    if html is None:
        html = txt
    update_pattern = r'<p class="update"><a href=".+?>(.+?)</a>'
    return re.findall(update_pattern, html, re.S)
def state(html=None):
    """Extract the ``<span>`` text that follows each ``<em>·</em>`` marker.

    Args:
        html: Page source to parse. When omitted, the module-level ``txt``
            is used.

    Returns:
        A list of the captured span texts in page order; empty when
        nothing matches.
    """
    if html is None:
        html = txt
    time_pattern = r'<em>·</em><span>(.+?)</span>'
    return re.findall(time_pattern, html, re.S)
def booklist():
    """Build the numbered search-result listing as a single string.

    Each row is a circled-digit marker followed by the title, the update
    link text and the update time, separated by single spaces. Rows are
    joined with newlines. At most ten rows are produced because there are
    only ten markers; zipping also stops at the shortest input.
    """
    number = ["⓪","⓵","⓶","⓷","⓸","⓹","⓺","⓻","⓼","⓽"]
    rows = [
        marker + title + " " + chapter + " " + updated
        for marker, title, chapter, updated in zip(
            number, bookname(), bookChapter(), state()
        )
    ]
    return "\n".join(rows)
# Show the numbered results, then read which entry the user wants.
listing = booklist()
print(listing)
# Kept as the module-level name ``i`` because book_First() reads it.
i = int(input("输入你要下载书的号数:"))
def book_link(i, html=None):
    """Return the detail-page URL of the i-th search result.

    Args:
        i: Zero-based position of the wanted result on the page.
        html: Page source to parse. When omitted, the module-level ``txt``
            is used.

    Returns:
        The ``href`` of the i-th ``red-btn`` link, prefixed with ``http:``.

    Raises:
        IndexError: If ``i`` is negative or beyond the last result.
    """
    if html is None:
        html = txt
    link_pattern = r'<a class="red-btn" href="(.+?)"'
    links = ["http:" + href for href in re.findall(link_pattern, html, re.S)]
    # Reject negative numbers explicitly; plain indexing would silently
    # count from the end of the list instead.
    if not 0 <= i < len(links):
        raise IndexError(
            "book number %d is out of range (%d results)" % (i, len(links))
        )
    return links[i]
def book_First():
    """Return the URL of the first chapter of the selected book.

    Downloads the page returned by ``book_link`` for the module-level
    selection ``i`` and reads its ``data-firstchapterjumpurl`` attribute.

    Returns:
        The attribute value prefixed with ``http:``.

    Raises:
        ValueError: If the page has no ``data-firstchapterjumpurl``
            attribute.
    """
    # The context manager closes the HTTP response after reading.
    with urllib.request.urlopen(book_link(i=i)) as response:
        detail_html = response.read().decode('utf-8')
    first_chapter_pattern = r'data-firstchapterjumpurl="(.+?)">'
    matches = re.findall(first_chapter_pattern, detail_html, re.S)
    if not matches:
        raise ValueError("first chapter link not found on the book page")
    # findall returns a list; only the first hit is the wanted URL.
    return "http:" + matches[0]
######################################################################################################
def mkdir(path):
    """Create the directory ``path``, including missing parents.

    Prints a success message when the directory was created and an
    "already exists" message when something is already present at ``path``.

    Args:
        path: Directory path to create.
    """
    # Attempt the creation directly instead of checking first: a separate
    # exists() check can race with another process creating the path.
    try:
        os.makedirs(path)
    except FileExistsError:
        print("文件已存在")
    else:
        print("创建成功")
# Download the requested number of chapters into a staging folder, one text
# file per chapter, then rename the folder after the book.
img_path = "E:/txt/txt/"  # staging folder, renamed once the download ends
mkdir(img_path)
link = book_First()  # URL of the next chapter to fetch
print("VIP章节暂不支持,不可以超过本书免费章节数")
x = int(input("请输入要下载的章节:"))

# Characters that are not allowed in Windows file names.
invalid_name_chars = r'[\\/:*?"<>|]'
book_title = None  # filled in from the first chapter page that provides it

for _ in range(x):
    with urllib.request.urlopen(link) as response:
        page = response.read().decode('UTF-8')

    # Body paragraphs start with two ideographic spaces (U+3000).
    paragraphs = re.findall(r'p>\u3000\u3000(.+?)<', page, re.S)
    book_titles = re.findall(r'60c;</em>(.+?)</a>', page, re.S)
    chapter_titles = re.findall(
        r'<h3 class="j_chapterName">(.+?)</h3>', page, re.S
    )
    if not chapter_titles:
        # Without a chapter heading there is nothing usable on this page.
        print("未找到章节标题,停止下载")
        break
    if book_titles and book_title is None:
        book_title = re.sub(invalid_name_chars, '_', book_titles[0])

    chapter_title = chapter_titles[0]
    chapter_file = (
        img_path + re.sub(invalid_name_chars, '_', chapter_title) + ".txt"
    )
    # Append mode keeps earlier content; UTF-8 is explicit so the text does
    # not depend on the platform's default encoding.
    with open(chapter_file, "a", encoding="utf-8") as chapter_out:
        for paragraph in paragraphs:
            chapter_out.write(paragraph + "\n")
    print(chapter_title + "下载完成")

    next_links = re.findall(
        r'<a id="j_chapterNext".+?href="//(.+?)"', page, re.S
    )
    if not next_links:
        # No next-chapter link on the page: nothing further to fetch.
        print("没有下一章,停止下载")
        break
    link = "https://" + next_links[0]

if book_title is None:
    # No chapter was fetched, so there is no title to rename the folder to.
    print("未下载任何章节")
else:
    img_path2 = "E:/txt/" + book_title
    if not os.path.exists(img_path2):
        os.rename(img_path, img_path2)
        print("下载完成")
    else:
        # 24-hour clock (%H) keeps morning and evening timestamps distinct.
        downloadtime = time.strftime("%Y%m%d%H%M%S", time.localtime())
        os.rename(img_path, img_path2 + downloadtime)
        print("文件名:" + book_title + " 已存在,重命名为:" + book_title + downloadtime + "\n" + "请勿重复操作")
链接:https://pan.baidu.com/s/1BnsnFv9tm36JurvgYsSu4Q 密码:9x7b
本帖最后由 小爱同学. 于 2018-7-1 21:04 编辑
# -*- coding: utf-8 -*-
import json
class Who:
    """Decode an obfuscated token string into a string of decimal digits."""

    def __init__(self):
        # Token -> digit table. Tokens are one or two characters long and
        # several different tokens map to the same digit.
        self.key = {
            "oe": "0", "n": "0", "z": "0", "on": "0",
            "oK": "1", "6": "1", "5": "1",
            "ow": "2", "-": "2", "A": "2", "oc": "2",
            "oi": "3", "i": "3", "o": "3", "oz": "3",
            "7e": "4", "v": "4", "P": "4", "7n": "4",
            "7K": "5", "4": "5", "k": "5", "7": "5", "7v": "5",
            "7w": "6", "C": "6", "s": "6", "7c": "6",
            "7i": "7", "S": "7", "l": "7", "7z": "7",
            "Ne": "8", "c": "8", "F": "8", "Nn": "8", "ov": "8",
            "NK": "9", "E": "9", "q": "9", "Nv": "9"
        }

    def calc(self, string, debug=False):
        """Translate ``string`` into digits using the token table.

        Every ``*S1*`` marker is removed first. The remainder is consumed
        from the left: a two-character token is used when the next two
        characters form one, otherwise the next single character is used.

        Args:
            string: Encoded text to translate.
            debug: When true, print each token with the digit it maps to.

        Returns:
            The decoded digits as a string; empty for empty input.

        Raises:
            KeyError: If the next character is not a known token.
        """
        result = ""
        string = string.replace("*S1*", "")
        while string:
            pair = string[:2]
            # Two-character tokens take priority over their first character.
            token = pair if pair in self.key else string[0]
            if debug:
                print(token, self.key[token])
            result += self.key[token]
            # Drop what was just decoded so the loop always advances.
            string = string[len(token):]
        return result
# Forum byline: posted by 小爱同学. on 2018-7-1 21:03
# -*- coding: utf-8 -*-
import json
我也是新手。看起来应该是一个python文件。你复制这个代码保存为x.py。在有python环境下运行就可以了 正需要 谢谢啊 谢谢,一直困扰这个问题 大佬,问一下,这个应该怎么用 表示支持…… 喜欢啊,还有谁,我想问
厉害,楼主,厉害。支持楼主{:1_919:} 楼主加油。谢谢分享。
页:
[1]
2