Sharing a Novel-Site Crawler Project
The new version of the code has been posted. Quite some time has passed, so the new code's approach and style differ slightly from the old code, and because it was written in a hurry it is not commented. Leave a comment if anything is unclear.
For some reason the novel_name line (line 33 of the new code) does not display correctly; the forum appears to have stripped the href pattern out of the regex. The intended line should read roughly as reconstructed below.
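A plausible reconstruction of that line, assuming the anchor whose text is the novel title points back at the index page itself (the exact href pattern could not be recovered from the post):

novel_name = re.search(fr'<a href="{novel_index_url}">(.+?)</a>', novel_index_response.text).group(1)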
Exams last week cost me a week, so there was no release then. This time I'm sharing a novel scraper that, in principle, can crawl every completed novel on the site. If you take the code, please use it for learning only and don't put load on the server (see the small throttling sketch after this introduction). Thanks!
As always, if you have any comments or suggestions about the code, feel free to raise them; if anything is unclear, leave a comment and I'll reply as soon as I see it.
One more heads-up: I'm currently working on 12306 and hope to build a ticket-grabbing tool through my own effort. The two core parts, remaining-ticket queries and simulated login, are basically solved; once it's finished I'll share it here as well. Stay tuned!
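Since the post asks readers not to put load on the server, here is a minimal sketch of a polite request helper in the same spirit as the random delays already used in the old code. The function name, delay range, and retry count are my own choices and not part of the original scripts:

import requests
from time import sleep
from random import uniform

HEADERS = {"User-Agent": "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/78.0.3904.87 Safari/537.36"}

def polite_get(url, min_delay=1.0, max_delay=2.5, retries=3):
    """Fetch a page with a random pause between requests and a few retries."""
    for attempt in range(retries):
        try:
            response = requests.get(url, headers=HEADERS, timeout=10)
            response.encoding = "utf-8"           # the site serves utf-8
            sleep(uniform(min_delay, max_delay))  # be gentle with the server
            return response.text
        except requests.RequestException:
            sleep(2 * (attempt + 1))              # back off a little before retrying
    return None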
New code:
# -*- coding: UTF-8 -*-
# !/usr/bin/env python3
# Author:Murphy
#Blog :www.moyo1.cn
import os
import re
import requests
from time import sleep
from bs4 import BeautifulSoup
class CollectNovel(object):
def __init__(self):
self.novel_data = {}
self.start_url = "https://www.ddxsku.com/full.html"
self.headers = {"User-Agent": "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/78.0.3904.87 Safari/537.36"}
def collect_url(self):
print("Collecting novel basic data.....")
start_response = requests.get(self.start_url, headers=self.headers)
total_page = re.search(r'<em id="pagestats">1/(.+)</em>', start_response.text).group(1)
        novel_navigation_urls = ["https://www.ddxsku.com/modules/article/articlelist.php?fullflag=1&page=%d" % page for page in range(1, int(total_page) + 1)]  # comprehension stripped by the forum; rebuilt from the listing URL used in the old code below
for novel_navigation_url in novel_navigation_urls:
novel_navigation_response = requests.get(novel_navigation_url, headers=self.headers)
novel_index_urls = re.findall('<td class="L"><a href="(.+)" title=".+" target="_blank">.+</a></td>', novel_navigation_response.text)
for novel_index_url in novel_index_urls:
novel_index_response = requests.get(novel_index_url, headers=self.headers)
novel_index_response.encoding = "utf-8"
                novel_name = re.search(fr'<a href="{novel_index_url}">(.+?)</a>', novel_index_response.text).group(1)  # href pattern stripped by the forum; best-guess reconstruction (see the note above the code)
novel_author = re.search(r'<dd><h3>作者:(.+)</h3><br>.+</h3></dd>', novel_index_response.text).group(1)
                self.novel_data[novel_name] = [("novel_author", novel_author)]  # keyed by title so multiple novels accumulate (indexing stripped by the forum, reconstructed)
print("Collecting novel:《%s》--%s" % (novel_name, novel_author))
index_soup = BeautifulSoup(novel_index_response.text, "html.parser")
novel_text_urls = index_soup.find_all("td", class_="L")
for each in novel_text_urls:
chapters_title = each.text
chapters_url = each.a["href"]
                    self.novel_data[novel_name].append((chapters_title, chapters_url))
sleep(1)
                # break  # debug aid to cut runtime; delete to crawl the whole site.
            break  # debug aid to cut runtime; delete to crawl the whole site.
def novel_copy(self):
self.collect_url()
if self.novel_data:
for name in self.novel_data:
count = 0
print("Downloading:《%s》" % name, end="\n"*2)
                work_path = r"C:/Users/Administrator/Desktop/NovelCopy/%s-%s" % (name, self.novel_data[name][0][1])  # folder named "Title-Author" (indexing stripped by the forum, reconstructed)
if not os.path.exists(work_path):
os.makedirs(work_path)
os.chdir(work_path)
else:
os.chdir(work_path)
                for chapter_data in self.novel_data[name][1:]:  # skip the ("novel_author", ...) tuple at index 0 (indexing reconstructed)
count += 1
print("Downloading:《%s》--%s" % (name, chapter_data))
chapter_response = requests.get(chapter_data, headers=self.headers)
chapter_response.encoding = "utf-8"
chapter_soup = BeautifulSoup(chapter_response.text, "html.parser")
chapter_text = chapter_soup.find("dd", id="contents")
                    with open("%d-%s.txt" % (count, chapter_data[0]), "w", encoding="utf-8") as f:
f.write(chapter_text.text)
sleep(2)
print()
break
else:
print("Collect data failed")
if __name__ == "__main__":
novel = CollectNovel()
novel.novel_copy()
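One thing to watch in the new code: chapter titles are written straight into file names, and Windows rejects characters such as \ / : * ? " < > | there, which raises an "illegal character" error. A small helper like the following (my addition, not part of the original script) avoids that:

import re

def safe_filename(name):
    # Replace characters that Windows does not allow in file names.
    return re.sub(r'[\\/:*?"<>|]', "_", name).strip()

# e.g. open("%d-%s.txt" % (count, safe_filename(chapter_data[0])), "w", encoding="utf-8")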
Old code:
#1. Third-party libraries required: requests, bs4
import os, requests, re
from time import sleep
from bs4 import BeautifulSoup
from random import uniform

# Fetch and decode a page
def url_open(url):
    headers = {}
    headers["User-Agent"] = "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/76.0.3809.100 Safari/537.36"
    response = requests.get(url, headers=headers)
    response.encoding = "utf-8"  # the site is encoded as utf-8
    html = response.text
    return html

# Collect the target links
def collect_url(html, root_url):
    print("Collecting links to all completed novels on the site.....")
    novel_name_all = []
    novel_url_all = []
    soup = BeautifulSoup(html, "html.parser")
    totle_pages = int(soup.find("div", class_="pagelink").em.text.split("/")[1])  # total page count; the em text looks like "1/53" (index stripped by the forum, reconstructed)
    #print(totle_pages)
    # Open the listing pages one by one and collect the novel links
    for page in range(1, totle_pages + 1):
        url = root_url + 'modules/article/articlelist.php?fullflag=1&page={}'.format(page)
        #print(url)
        html = url_open(url)
        # Collect the novel links on the current page
        # Note: the hrefs on the site start with http:// while root_url is https://, so this pattern can match nothing; see the modified version further down.
        p_novel_url = fr'<a href="({root_url}xiaoshuo/.+.html)">'
        novel_url_temp = re.findall(p_novel_url, html)
        # Append each link to the overall list and grab the novel's title
        for novel_url in novel_url_temp:
            novel_url_all.append(novel_url)
            # Get the novel's title
            p_novel_name = fr'{novel_url}">(.+)</a>'
            novel_name_temp = re.findall(p_novel_name, html)
            novel_name_all.append(novel_name_temp[0])  # findall returns a list; keep the single match (index reconstructed)
        break  # debug aid to cut runtime (delete this line to crawl the whole site)
    data = [novel_name_all, novel_url_all]  # pack the lists so both can be returned (bracket expression stripped by the forum, reconstructed)
    print("Collection finished, moving on to downloading the novel contents.....")
    sleep(1)
    return data

# Fetch and save the novel contents
def get_and_save_data(data):
    novel_name_all = data[0]
    novel_url_all = data[1]
    i = -1  # index used to look up the novel's title
    for novel_url in novel_url_all:
        i += 1
        novel_name = novel_name_all[i]  # the novel's title (index reconstructed)
        print()
        print("Downloading novel:《%s》" % novel_name)
        print()
        html_1 = url_open(novel_url)
        soup_1 = BeautifulSoup(html_1, "html.parser")
        chapters_url = soup_1.find("p", class_="btnlinks").a["href"]
        # Get the URLs of all chapters
        html_2 = url_open(chapters_url)
        soup_2 = BeautifulSoup(html_2, "html.parser")
        chapters_url_all = soup_2.find_all("td", class_="L")
        # Open each chapter page and save its content
        for each in chapters_url_all:
            chapters_url = each.a["href"]
            html = url_open(chapters_url)
            soup = BeautifulSoup(html, "html.parser")
            chapters_name = soup.find("dd").h1.text  # chapter title
            print("Downloading《%s》:%s" % (novel_name, chapters_name))
            # Chapter content
            contents = soup.find("dd", id="contents").text
            with open("%s.txt" % novel_name, "a", encoding="utf-8") as g:
                g.write("\n"*3 + " " + chapters_name + str("\n")*3)
                g.write(" " + contents)
            slee_time = uniform(0.35, 0.75)
            sleep(slee_time)
        print("Novel %s has finished downloading" % novel_name)
        print("Moving on to the next novel")
        sleep(2)
        break  # debug aid to cut runtime (delete this line to crawl the whole site)

# Main program
def main():
    # Set the working directory
    path = r'C:\Users\Administrator\Desktop\test'
    if os.getcwd() != path:
        if not os.path.exists(path):
            os.mkdir(path)
            os.chdir(path)
        else:
            os.chdir(path)
    root_url = "https://www.ddxsku.com/"
    target_url = root_url + "full.html"
    data = collect_url(url_open(target_url), root_url)
    get_and_save_data(data)

if __name__ == "__main__":
    main()
The site is served over https but the links in the pages are http, so I tweaked the code a bit (a scheme-normalizing helper sketch follows the code):
#1. Third-party libraries required: requests, bs4
import os, requests, re
from time import sleep
from bs4 import BeautifulSoup
from random import uniform

# Fetch and decode a page
def url_open(url):
    headers = {}
    headers["User-Agent"] = "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/76.0.3809.100 Safari/537.36"
    response = requests.get(url, headers=headers)
    response.encoding = "utf-8"  # the site is encoded as utf-8
    html = response.text
    return html

# Collect the target links
def collect_url(html, root_url):
    print("Collecting links to all completed novels on the site.....")
    novel_name_all = []
    novel_url_all = []
    soup = BeautifulSoup(html, "html.parser")
    totle_pages = int(soup.find("div", class_="pagelink").em.text.split("/")[1])  # total page count; the em text looks like "1/53" (index stripped by the forum, reconstructed)
    #print(totle_pages)
    print("Total pages: " + str(totle_pages))
    # Open the listing pages one by one and collect the novel links
    for page in range(1, totle_pages + 1):
        url = root_url + 'modules/article/articlelist.php?fullflag=1&page={}'.format(page)
        print("Opening " + url)
        html = url_open(url)
        # Collect the novel links on the current page
        p_novel_url = fr'<a href="(.+?/xiaoshuo/.+\.html)">'
        print(p_novel_url)
        novel_url_temp = re.findall(p_novel_url, html)
        print(len(novel_url_temp))
        # Append each link to the overall list and grab the novel's title
        for novel_url in novel_url_temp:
            novel_url_all.append(novel_url)
            # Get the novel's title
            p_novel_name = fr'{novel_url}">(.+)</a>'
            novel_name_temp = re.findall(p_novel_name, html)
            novel_name_all.append(novel_name_temp[0])  # findall returns a list; keep the single match (index reconstructed)
        break  # debug aid to cut runtime (delete this line to crawl the whole site)
    data = [novel_name_all, novel_url_all]  # pack the lists so both can be returned (bracket expression stripped by the forum, reconstructed)
    print("Collection finished, moving on to downloading the novel contents.....")
    sleep(1)
    return data

# Fetch and save the novel contents
def get_and_save_data(data):
    novel_name_all = data[0]
    novel_url_all = data[1]
    i = -1  # index used to look up the novel's title
    for novel_url in novel_url_all:
        i += 1
        novel_name = novel_name_all[i]  # the novel's title (index reconstructed)
        print()
        print("Downloading novel:《%s》" % novel_name)
        print()
        html_1 = url_open(novel_url)
        soup_1 = BeautifulSoup(html_1, "html.parser")
        chapters_url = soup_1.find("p", class_="btnlinks").a["href"]
        # Get the URLs of all chapters
        html_2 = url_open(chapters_url)
        soup_2 = BeautifulSoup(html_2, "html.parser")
        chapters_url_all = soup_2.find_all("td", class_="L")
        # Open each chapter page and save its content
        for each in chapters_url_all:
            chapters_url = each.a["href"]
            html = url_open(chapters_url)
            soup = BeautifulSoup(html, "html.parser")
            chapters_name = soup.find("dd").h1.text  # chapter title
            chapters_name = re.sub(' +', ' ', chapters_name).strip()  # collapse repeated spaces in the title
            print("Downloading《%s》:%s" % (novel_name, chapters_name))
            # Chapter content
            contents = soup.find("dd", id="contents").text
            with open("%s.txt" % novel_name, "a", encoding="utf-8") as g:
                g.write("\n"*3 + chapters_name + str("\n")*3)
                g.write(" " + contents)
            slee_time = uniform(0.35, 0.75)
            sleep(slee_time)
        print("Novel %s has finished downloading" % novel_name)
        print("Moving on to the next novel")
        sleep(2)
        break  # debug aid to cut runtime (delete this line to crawl the whole site)

# Main program
def main():
    # Set the working directory
    path = r'C:\Users\Administrator\Desktop\test'
    if os.getcwd() != path:
        if not os.path.exists(path):
            os.mkdir(path)
            os.chdir(path)
        else:
            os.chdir(path)
    root_url = "https://www.ddxsku.com/"
    target_url = root_url + "full.html"
    data = collect_url(url_open(target_url), root_url)
    get_and_save_data(data)

if __name__ == "__main__":
    main()
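To handle the http links on an https site more explicitly, the scheme could also be normalized before each request. A sketch using urllib.parse (not part of the posted code; force_https is a hypothetical helper name):

from urllib.parse import urlsplit, urlunsplit

def force_https(url):
    # Rewrite http:// links to https:// so every request goes straight to the TLS endpoint.
    parts = urlsplit(url)
    if parts.scheme == "http":
        parts = parts._replace(scheme="https")
    return urlunsplit(parts)

# e.g. html_1 = url_open(force_https(novel_url))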
Thanks for the support, everyone. As for why I want to write a 12306 ticket-grabbing tool: partly to exercise my hands-on skills and see how a large site handles anti-scraping, so I can build up experience, and partly because I do occasionally need it myself, plus I have an idea I'd like to put to the test.