Sharing a Novel-Site Crawler Project
The new version of the code has been posted. Quite some time has passed, so the new code's approach and style differ slightly from the old code, and because it was written in a hurry it is not commented. Leave a comment if anything is unclear.
For some reason the novel_name line (line 33 of the new code) does not display correctly; the forum appears to have stripped the href pattern out of the regex. The intended line should read roughly as reconstructed below.
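A plausible reconstruction of that line, assuming the anchor whose text is the novel title points back at the index page itself (the exact href pattern could not be recovered from the post):

novel_name = re.search(fr'<a href="{novel_index_url}">(.+?)</a>', novel_index_response.text).group(1)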
Exams last week cost me a week, so there was no release then. This time I'm sharing a novel scraper that, in principle, can crawl every completed novel on the site. If you take the code, please use it for learning only and don't put load on the server (see the small throttling sketch after this introduction). Thanks!
As always, if you have any comments or suggestions about the code, feel free to raise them; if anything is unclear, leave a comment and I'll reply as soon as I see it.
One more heads-up: I'm currently working on 12306 and hope to build a ticket-grabbing tool through my own effort. The two core parts, remaining-ticket queries and simulated login, are basically solved; once it's finished I'll share it here as well. Stay tuned!
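Since the post asks readers not to put load on the server, here is a minimal sketch of a polite request helper in the same spirit as the random delays already used in the old code. The function name, delay range, and retry count are my own choices and not part of the original scripts:

import requests
from time import sleep
from random import uniform

HEADERS = {"User-Agent": "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/78.0.3904.87 Safari/537.36"}

def polite_get(url, min_delay=1.0, max_delay=2.5, retries=3):
    """Fetch a page with a random pause between requests and a few retries."""
    for attempt in range(retries):
        try:
            response = requests.get(url, headers=HEADERS, timeout=10)
            response.encoding = "utf-8"           # the site serves utf-8
            sleep(uniform(min_delay, max_delay))  # be gentle with the server
            return response.text
        except requests.RequestException:
            sleep(2 * (attempt + 1))              # back off a little before retrying
    return None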
New code:
# -*- coding: UTF-8 -*-
# !/usr/bin/env python3
# Author:Murphy
#Blog :www.moyo1.cn
import os
import re
import requests
from time import sleep
from bs4 import BeautifulSoup
class CollectNovel(object):
def __init__(self):
self.novel_data = {}
self.start_url = "https://www.ddxsku.com/full.html"
self.headers = {"User-Agent": "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/78.0.3904.87 Safari/537.36"}
def collect_url(self):
print("Collecting novel basic data.....")
start_response = requests.get(self.start_url, headers=self.headers)
total_page = re.search(r'<em id="pagestats">1/(.+)</em>', start_response.text).group(1)
        novel_navigation_urls = ["https://www.ddxsku.com/modules/article/articlelist.php?fullflag=1&page=%d" % page for page in range(1, int(total_page) + 1)]  # comprehension stripped by the forum; rebuilt from the listing URL used in the old code below
for novel_navigation_url in novel_navigation_urls:
novel_navigation_response = requests.get(novel_navigation_url, headers=self.headers)
novel_index_urls = re.findall('<td class="L"><a href="(.+)" title=".+" target="_blank">.+</a></td>', novel_navigation_response.text)
for novel_index_url in novel_index_urls:
novel_index_response = requests.get(novel_index_url, headers=self.headers)
novel_index_response.encoding = "utf-8"
                novel_name = re.search(fr'<a href="{novel_index_url}">(.+?)</a>', novel_index_response.text).group(1)  # href pattern stripped by the forum; best-guess reconstruction (see the note above the code)
novel_author = re.search(r'<dd><h3>作者:(.+)</h3><br>.+</h3></dd>', novel_index_response.text).group(1)
                self.novel_data[novel_name] = [("novel_author", novel_author)]  # keyed by title so multiple novels accumulate (indexing stripped by the forum, reconstructed)
print("Collecting novel:《%s》--%s" % (novel_name, novel_author))
index_soup = BeautifulSoup(novel_index_response.text, "html.parser")
novel_text_urls = index_soup.find_all("td", class_="L")
for each in novel_text_urls:
chapters_title = each.text
chapters_url = each.a["href"]
                    self.novel_data[novel_name].append((chapters_title, chapters_url))
sleep(1)
                # break  # debug aid to cut runtime; delete to crawl the whole site.
            break  # debug aid to cut runtime; delete to crawl the whole site.
def novel_copy(self):
self.collect_url()
if self.novel_data:
for name in self.novel_data:
count = 0
print("Downloading:《%s》" % name, end="\n"*2)
                work_path = r"C:/Users/Administrator/Desktop/NovelCopy/%s-%s" % (name, self.novel_data[name][0][1])  # folder named "Title-Author" (indexing stripped by the forum, reconstructed)
if not os.path.exists(work_path):
os.makedirs(work_path)
os.chdir(work_path)
else:
os.chdir(work_path)
                for chapter_data in self.novel_data[name][1:]:  # skip the ("novel_author", ...) tuple at index 0 (indexing reconstructed)
count += 1
print("Downloading:《%s》--%s" % (name, chapter_data))
chapter_response = requests.get(chapter_data, headers=self.headers)
chapter_response.encoding = "utf-8"
chapter_soup = BeautifulSoup(chapter_response.text, "html.parser")
chapter_text = chapter_soup.find("dd", id="contents")
                    with open("%d-%s.txt" % (count, chapter_data[0]), "w", encoding="utf-8") as f:
f.write(chapter_text.text)
sleep(2)
print()
break
else:
print("Collect data failed")
if __name__ == "__main__":
novel = CollectNovel()
novel.novel_copy()
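One thing to watch in the new code: chapter titles are written straight into file names, and Windows rejects characters such as \ / : * ? " < > | there, which raises an "illegal character" error. A small helper like the following (my addition, not part of the original script) avoids that:

import re

def safe_filename(name):
    # Replace characters that Windows does not allow in file names.
    return re.sub(r'[\\/:*?"<>|]', "_", name).strip()

# e.g. open("%d-%s.txt" % (count, safe_filename(chapter_data[0])), "w", encoding="utf-8")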
Old code:
#1. Third-party libraries required: requests, bs4
import os, requests, re
from time import sleep
from bs4 import BeautifulSoup
from random import uniform

# Fetch and decode a page
def url_open(url):
    headers = {}
    headers["User-Agent"] = "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/76.0.3809.100 Safari/537.36"
    response = requests.get(url, headers=headers)
    response.encoding = "utf-8"  # the site is encoded as utf-8
    html = response.text
    return html

# Collect the target links
def collect_url(html, root_url):
    print("Collecting links to all completed novels on the site.....")
    novel_name_all = []
    novel_url_all = []
    soup = BeautifulSoup(html, "html.parser")
    totle_pages = int(soup.find("div", class_="pagelink").em.text.split("/")[1])  # total page count; the em text looks like "1/53" (index stripped by the forum, reconstructed)
    #print(totle_pages)
    # Open the listing pages one by one and collect the novel links
    for page in range(1, totle_pages + 1):
        url = root_url + 'modules/article/articlelist.php?fullflag=1&page={}'.format(page)
        #print(url)
        html = url_open(url)
        # Collect the novel links on the current page
        # Note: the hrefs on the site start with http:// while root_url is https://, so this pattern can match nothing; see the modified version further down.
        p_novel_url = fr'<a href="({root_url}xiaoshuo/.+.html)">'
        novel_url_temp = re.findall(p_novel_url, html)
        # Append each link to the overall list and grab the novel's title
        for novel_url in novel_url_temp:
            novel_url_all.append(novel_url)
            # Get the novel's title
            p_novel_name = fr'{novel_url}">(.+)</a>'
            novel_name_temp = re.findall(p_novel_name, html)
            novel_name_all.append(novel_name_temp[0])  # findall returns a list; keep the single match (index reconstructed)
        break  # debug aid to cut runtime (delete this line to crawl the whole site)
    data = [novel_name_all, novel_url_all]  # pack the lists so both can be returned (bracket expression stripped by the forum, reconstructed)
    print("Collection finished, moving on to downloading the novel contents.....")
    sleep(1)
    return data

# Fetch and save the novel contents
def get_and_save_data(data):
    novel_name_all = data[0]
    novel_url_all = data[1]
    i = -1  # index used to look up the novel's title
    for novel_url in novel_url_all:
        i += 1
        novel_name = novel_name_all[i]  # the novel's title (index reconstructed)
        print()
        print("Downloading novel:《%s》" % novel_name)
        print()
        html_1 = url_open(novel_url)
        soup_1 = BeautifulSoup(html_1, "html.parser")
        chapters_url = soup_1.find("p", class_="btnlinks").a["href"]
        # Get the URLs of all chapters
        html_2 = url_open(chapters_url)
        soup_2 = BeautifulSoup(html_2, "html.parser")
        chapters_url_all = soup_2.find_all("td", class_="L")
        # Open each chapter page and save its content
        for each in chapters_url_all:
            chapters_url = each.a["href"]
            html = url_open(chapters_url)
            soup = BeautifulSoup(html, "html.parser")
            chapters_name = soup.find("dd").h1.text  # chapter title
            print("Downloading《%s》:%s" % (novel_name, chapters_name))
            # Chapter content
            contents = soup.find("dd", id="contents").text
            with open("%s.txt" % novel_name, "a", encoding="utf-8") as g:
                g.write("\n"*3 + " " + chapters_name + str("\n")*3)
                g.write(" " + contents)
            slee_time = uniform(0.35, 0.75)
            sleep(slee_time)
        print("Novel %s has finished downloading" % novel_name)
        print("Moving on to the next novel")
        sleep(2)
        break  # debug aid to cut runtime (delete this line to crawl the whole site)

# Main program
def main():
    # Set the working directory
    path = r'C:\Users\Administrator\Desktop\test'
    if os.getcwd() != path:
        if not os.path.exists(path):
            os.mkdir(path)
            os.chdir(path)
        else:
            os.chdir(path)
    root_url = "https://www.ddxsku.com/"
    target_url = root_url + "full.html"
    data = collect_url(url_open(target_url), root_url)
    get_and_save_data(data)

if __name__ == "__main__":
    main()
The site is served over https but the links in the pages are http, so I tweaked the code a bit (a scheme-normalizing helper sketch follows the code):
#1. Third-party libraries required: requests, bs4
import os, requests, re
from time import sleep
from bs4 import BeautifulSoup
from random import uniform

# Fetch and decode a page
def url_open(url):
    headers = {}
    headers["User-Agent"] = "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/76.0.3809.100 Safari/537.36"
    response = requests.get(url, headers=headers)
    response.encoding = "utf-8"  # the site is encoded as utf-8
    html = response.text
    return html

# Collect the target links
def collect_url(html, root_url):
    print("Collecting links to all completed novels on the site.....")
    novel_name_all = []
    novel_url_all = []
    soup = BeautifulSoup(html, "html.parser")
    totle_pages = int(soup.find("div", class_="pagelink").em.text.split("/")[1])  # total page count; the em text looks like "1/53" (index stripped by the forum, reconstructed)
    #print(totle_pages)
    print("Total pages: " + str(totle_pages))
    # Open the listing pages one by one and collect the novel links
    for page in range(1, totle_pages + 1):
        url = root_url + 'modules/article/articlelist.php?fullflag=1&page={}'.format(page)
        print("Opening " + url)
        html = url_open(url)
        # Collect the novel links on the current page
        p_novel_url = fr'<a href="(.+?/xiaoshuo/.+\.html)">'
        print(p_novel_url)
        novel_url_temp = re.findall(p_novel_url, html)
        print(len(novel_url_temp))
        # Append each link to the overall list and grab the novel's title
        for novel_url in novel_url_temp:
            novel_url_all.append(novel_url)
            # Get the novel's title
            p_novel_name = fr'{novel_url}">(.+)</a>'
            novel_name_temp = re.findall(p_novel_name, html)
            novel_name_all.append(novel_name_temp[0])  # findall returns a list; keep the single match (index reconstructed)
        break  # debug aid to cut runtime (delete this line to crawl the whole site)
    data = [novel_name_all, novel_url_all]  # pack the lists so both can be returned (bracket expression stripped by the forum, reconstructed)
    print("Collection finished, moving on to downloading the novel contents.....")
    sleep(1)
    return data

# Fetch and save the novel contents
def get_and_save_data(data):
    novel_name_all = data[0]
    novel_url_all = data[1]
    i = -1  # index used to look up the novel's title
    for novel_url in novel_url_all:
        i += 1
        novel_name = novel_name_all[i]  # the novel's title (index reconstructed)
        print()
        print("Downloading novel:《%s》" % novel_name)
        print()
        html_1 = url_open(novel_url)
        soup_1 = BeautifulSoup(html_1, "html.parser")
        chapters_url = soup_1.find("p", class_="btnlinks").a["href"]
        # Get the URLs of all chapters
        html_2 = url_open(chapters_url)
        soup_2 = BeautifulSoup(html_2, "html.parser")
        chapters_url_all = soup_2.find_all("td", class_="L")
        # Open each chapter page and save its content
        for each in chapters_url_all:
            chapters_url = each.a["href"]
            html = url_open(chapters_url)
            soup = BeautifulSoup(html, "html.parser")
            chapters_name = soup.find("dd").h1.text  # chapter title
            chapters_name = re.sub(' +', ' ', chapters_name).strip()  # collapse repeated spaces in the title
            print("Downloading《%s》:%s" % (novel_name, chapters_name))
            # Chapter content
            contents = soup.find("dd", id="contents").text
            with open("%s.txt" % novel_name, "a", encoding="utf-8") as g:
                g.write("\n"*3 + chapters_name + str("\n")*3)
                g.write(" " + contents)
            slee_time = uniform(0.35, 0.75)
            sleep(slee_time)
        print("Novel %s has finished downloading" % novel_name)
        print("Moving on to the next novel")
        sleep(2)
        break  # debug aid to cut runtime (delete this line to crawl the whole site)

# Main program
def main():
    # Set the working directory
    path = r'C:\Users\Administrator\Desktop\test'
    if os.getcwd() != path:
        if not os.path.exists(path):
            os.mkdir(path)
            os.chdir(path)
        else:
            os.chdir(path)
    root_url = "https://www.ddxsku.com/"
    target_url = root_url + "full.html"
    data = collect_url(url_open(target_url), root_url)
    get_and_save_data(data)

if __name__ == "__main__":
    main()
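To handle the http links on an https site more explicitly, the scheme could also be normalized before each request. A sketch using urllib.parse (not part of the posted code; force_https is a hypothetical helper name):

from urllib.parse import urlsplit, urlunsplit

def force_https(url):
    # Rewrite http:// links to https:// so every request goes straight to the TLS endpoint.
    parts = urlsplit(url)
    if parts.scheme == "http":
        parts = parts._replace(scheme="https")
    return urlunsplit(parts)

# e.g. html_1 = url_open(force_https(novel_url))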
Thanks for the support, everyone. As for why I want to write a 12306 ticket-grabbing tool: partly to exercise my hands-on skills and see how a large site handles anti-scraping, so I can build up experience, and partly because I do occasionally need it myself, plus I have an idea I'd like to put to the test.