小说网站爬虫作品分享

zqc8 · 发表于 2019-11-3 15:26

本帖最后由 zqc8 于 2020-3-22 22:30 编辑

新版代码已更新。由于时间过去已久，新代码的思路/写法与旧代码略有不同；且由于时间仓促，代码未作注释，如有问题请留言。

不知由于何因，新版代码 33行novel_name代码无法正常显示，正确代码为

由于上周考试，所以耽误了一周，没有发布分享，这次跟大家分享的是关于小说爬取的，理论上同样可以爬取全站已经完结的小说。拿到代码请合理学习，不要给服务器带来压力，谢谢！

同样的，如果对本代码有任何意见或者建议，欢迎提出，有不懂的欢迎留言评论，作者看到后会尽快回复！
另外跟大家预告下，目前作者正在爬取12306，想凭借自己的努力做一款抢票软件，目前余票查询、模拟登录两大核心部分已基本解决，开发完毕我会一并分享给大家，敬请期待！
新版代码：

[Python] 纯文本查看 复制代码

01

02

03

04

05

06

07

08

09

10

11

12

13

14

15

16

17

18

19

20

21

22

23

24

25

26

27

28

29

30

31

32

33

34

35

36

37

38

39

40

41

42

43

44

45

46

47

48

49

50

51

52

53

54

55

56

57

58

59

60

61

62

63

64

65

66

67

68

69

70

71

72

73

74

75

76

77

78

79

80

81

# -*- coding: UTF-8 -*-
# ！/usr/bin/env python3
# Author:  Murphy
#  Blog :  www.moyo1.cn
 
import os
import re
import requests
from time import sleep
from bs4 import BeautifulSoup
 
 
class CollectNovel(object):
    """Crawler that saves finished novels from www.ddxsku.com as text files.

    ``collect_url`` walks the listing of finished novels and fills
    ``novel_data``; ``novel_copy`` then downloads the chapters and writes one
    ``<n>-<chapter title>.txt`` file per chapter.

    Attributes:
        novel_data: Maps a novel name to a list whose first item is
            ``("novel_author", <author>)`` and whose remaining items are
            ``(<chapter title>, <chapter url>)`` tuples.
        start_url: Listing page the crawl starts from.
        headers: HTTP headers sent with every request.
        save_root: Directory under which one folder per novel is created.
        timeout: Timeout in seconds applied to every HTTP request.
    """

    def __init__(self, save_root=r"C:/Users/Administrator/Desktop/NovelCopy", timeout=10):
        """Set up the crawl configuration; no network access happens here.

        Args:
            save_root: Directory that receives one sub-folder per novel.
            timeout: Timeout in seconds for each HTTP request.
        """
        self.novel_data = {}
        self.start_url = "https://www.ddxsku.com/full.html"
        self.headers = {"User-Agent": "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/78.0.3904.87 Safari/537.36"}
        self.save_root = save_root
        self.timeout = timeout

    @staticmethod
    def _safe_filename(text):
        """Return *text* with characters that Windows forbids in file names replaced by "_"."""
        cleaned = re.sub(r'[\\/:*?"<>|\x00-\x1f]', "_", text.strip())
        # Windows also rejects names that end with a dot or a space.
        cleaned = cleaned.rstrip(". ")
        return cleaned or "_"

    def collect_url(self):
        """Fill ``self.novel_data`` with the author and chapter links of each novel.

        A novel whose index page cannot be parsed is skipped with a message
        instead of aborting the whole crawl. If the total page count cannot be
        read from the start page, ``novel_data`` is left empty.
        """
        print("Collecting novel basic data.....")
        start_response = requests.get(self.start_url, headers=self.headers, timeout=self.timeout)
        page_match = re.search(r'<em id="pagestats">1/(\d+)</em>', start_response.text)
        if page_match is None:
            # novel_copy reports the failure when novel_data stays empty.
            return
        total_page = int(page_match.group(1))
        # Same https scheme as start_url.
        novel_navigation_urls = [
            "https://www.ddxsku.com/modules/article/articlelist.php?fullflag=1&page=%d" % i
            for i in range(1, total_page + 1)
        ]

        for novel_navigation_url in novel_navigation_urls:
            novel_navigation_response = requests.get(novel_navigation_url, headers=self.headers, timeout=self.timeout)
            # [^"]+ and .+? keep each match inside a single link even when
            # several links share one line.
            novel_index_urls = re.findall(
                r'<td class="L"><a href="([^"]+)" title="[^"]+" target="_blank">.+?</a></td>',
                novel_navigation_response.text,
            )

            for novel_index_url in novel_index_urls:
                novel_index_response = requests.get(novel_index_url, headers=self.headers, timeout=self.timeout)
                novel_index_response.encoding = "utf-8"
                index_html = novel_index_response.text

                # NOTE(review): the author states that this novel_name pattern
                # was garbled when the code was posted; it is kept unchanged
                # here - confirm the real pattern against the live page.
                name_match = re.search(r'.+<a >(.+)</a>.+', index_html)
                author_match = re.search(r'<dd><h3>作者：(.+)</h3><br>.+</h3></dd>', index_html)
                if name_match is None or author_match is None:
                    print("Skip unparsable novel index:  %s" % novel_index_url)
                    sleep(1)
                    continue

                novel_name = name_match.group(1)
                novel_author = author_match.group(1)
                # Add an entry instead of rebinding the dict, so novels that
                # were collected earlier are not thrown away.
                self.novel_data[novel_name] = [("novel_author", novel_author)]
                print("Collecting novel:  《%s》--%s" % (novel_name, novel_author))

                index_soup = BeautifulSoup(index_html, "html.parser")
                for each in index_soup.find_all("td", class_="L"):
                    # A cell without a link has nothing to download.
                    if each.a is None or not each.a.get("href"):
                        continue
                    self.novel_data[novel_name].append((each.text, each.a["href"]))
                sleep(1)
                # break  # Debug aid: uncomment to stop after the first novel.
            break  # Debug limiter to shorten the run; remove it to crawl every listing page.

    def novel_copy(self):
        """Collect the novel data, then download the chapters of the first novel.

        Each chapter is written to ``<save_root>/<name>-<author>/<n>-<title>.txt``.
        Files are addressed by full path, so the process working directory is
        not changed.
        """
        self.collect_url()
        if not self.novel_data:
            print("Collect data failed")
            return

        for name in self.novel_data:
            print("Downloading:  《%s》" % name, end="\n"*2)

            author = self.novel_data[name][0][1]
            work_path = os.path.join(self.save_root, self._safe_filename("%s-%s" % (name, author)))
            os.makedirs(work_path, exist_ok=True)

            chapters = self.novel_data[name][1:]
            for count, (chapter_title, chapter_url) in enumerate(chapters, start=1):
                print("Downloading:  《%s》--%s" % (name, chapter_title))
                chapter_response = requests.get(chapter_url, headers=self.headers, timeout=self.timeout)
                chapter_response.encoding = "utf-8"

                chapter_soup = BeautifulSoup(chapter_response.text, "html.parser")
                chapter_text = chapter_soup.find("dd", id="contents")
                if chapter_text is None:
                    print("Skip chapter without content:  %s" % chapter_url)
                else:
                    file_name = "%d-%s.txt" % (count, self._safe_filename(chapter_title))
                    with open(os.path.join(work_path, file_name), "w", encoding="utf-8") as f:
                        f.write(chapter_text.text)
                sleep(2)
            print()
            # Only the first collected novel is downloaded; remove this break
            # to download all of them.
            break


if __name__ == "__main__":
    novel = CollectNovel()
    novel.novel_copy()

旧版代码：

[Python] 纯文本查看 复制代码

001

002

003

004

005

006

007

008

009

010

011

012

013

014

015

016

017

018

019

020

021

022

023

024

025

026

027

028

029

030

031

032

033

034

035

036

037

038

039

040

041

042

043

044

045

046

047

048

049

050

051

052

053

054

055

056

057

058

059

060

061

062

063

064

065

066

067

068

069

070

071

072

073

074

075

076

077

078

079

080

081

082

083

084

085

086

087

088

089

090

091

092

093

094

095

096

097

098

099

100

101

102

103

104

105

106

107

108

109

110

111

112

113

114

115

116

117

118

119

120

121

#1.需要安装的第三方库:requests,bs4
 
import os,requests,re
from time import sleep
from bs4 import BeautifulSoup
from random import uniform
 
#网址解析
def url_open(url, timeout=10):
    """Fetch *url* and return the response body as text decoded as UTF-8.

    Args:
        url: Address of the page to download.
        timeout: Seconds to wait for the server before ``requests`` gives up.
            Without a timeout a stalled connection would block forever.

    Returns:
        The page HTML as a string.
    """
    headers = {
        "User-Agent": "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/76.0.3809.100 Safari/537.36",
    }
    response = requests.get(url, headers=headers, timeout=timeout)
    response.encoding = "utf-8"  # The site's pages are encoded as UTF-8.
    return response.text
 
 
#目标链接收集
def collect_url(html,root_url):
    """Collect the names and URLs of the finished novels listed on the site.

    Args:
        html: HTML of the start page; only its ``div.pagelink`` element is
            read, to find the total number of listing pages.
        root_url: Site root ending in "/", e.g. "https://www.ddxsku.com/".

    Returns:
        ``[novel_name_all, novel_url_all]`` - two parallel lists holding the
        novel names and the matching novel URLs.
    """
    print("正在收集全站已完结的小说链接.....")
    novel_name_all = []
    novel_url_all = []

    soup = BeautifulSoup(html,"html.parser")
    # The total page count is the part after "/" in the pager text.
    totle_pages = int((soup.find("div",class_="pagelink").em.text).split("/")[1])

    # root_url is escaped so its "." only matches a literal dot, and either
    # scheme is accepted: a reply in this thread reports that the pages link
    # with http even though the site is opened over https.
    scheme_match = re.match(r'https?://', root_url)
    if scheme_match:
        root_pattern = r'https?://' + re.escape(root_url[scheme_match.end():])
    else:
        root_pattern = re.escape(root_url)
    # One pattern captures the novel URL and its link text together, so no
    # second search built from the (unescaped) URL is needed.
    p_novel = re.compile(r'<a href="(' + root_pattern + r'xiaoshuo/[^"]+\.html)">(.+?)</a>')

    # Open the listing pages one by one and collect the novel links.
    for page in range(1,totle_pages+1):
        url = root_url + 'modules/article/articlelist.php?fullflag=1&page={}'.format(page)
        page_html = url_open(url)
        for novel_url, novel_name in p_novel.findall(page_html):
            novel_url_all.append(novel_url)
            novel_name_all.append(novel_name)

        break  # Debug limiter to shorten the run; remove it to crawl the whole site.

    data = [novel_name_all,novel_url_all]  # Packed together so both lists can be returned.

    print("收集工作已完成，准备进入小说内容下载.....")
    sleep(1)

    return data
 
 
#小说内容获取与保存
def get_and_save_data(data):
    """Download the chapters of the first collected novel into ``<novel name>.txt``.

    The file is opened in append mode in the current working directory and
    receives each chapter title followed by the chapter text.

    Args:
        data: ``[novel_name_all, novel_url_all]`` - two parallel lists of
            novel names and novel URLs, as returned by ``collect_url``.
    """
    novel_name_all = data[0]
    novel_url_all = data[1]

    for novel_name, novel_url in zip(novel_name_all, novel_url_all):
        print()
        print("正在下载小说：《%s》"%novel_name)
        print()

        # Characters that Windows forbids in file names would make open() fail.
        file_name = re.sub(r'[\\/:*?"<>|]', "_", novel_name) + ".txt"

        html_1 = url_open(novel_url)
        soup_1 = BeautifulSoup(html_1, "html.parser")
        chapters_url = soup_1.find("p", class_="btnlinks").a["href"]

        # Collect the URLs of all chapters.
        html_2 = url_open(chapters_url)
        soup_2 = BeautifulSoup(html_2, "html.parser")
        chapters_url_all = soup_2.find_all("td", class_="L")

        # Open the chapter pages one by one and save their content.
        for each in chapters_url_all:
            if each.a is None:
                continue  # A cell without a link has nothing to download.
            chapter_url = each.a["href"]
            soup = BeautifulSoup(url_open(chapter_url),"html.parser")

            title_tag = soup.find("dd")
            contents_tag = soup.find("dd",id="contents")
            if title_tag is None or title_tag.h1 is None or contents_tag is None:
                print("跳过无法解析的章节：%s"%chapter_url)
            else:
                chapters_name = title_tag.h1.text  # Chapter title.
                print("正在下载《%s》:%s"%(novel_name,chapters_name))

                contents = contents_tag.text
                with open(file_name,"a",encoding="utf-8") as g:
                    g.write("\n"*3 + "                               "+chapters_name+str("\n")*3)
                    g.write("    "+contents)

            # Random short pause between chapter requests.
            sleep(uniform(0.35,0.75))

        print("小说%s已下载完毕"%novel_name)
        print("准备进入下一部小说下载")
        sleep(2)

        break  # Debug limiter to shorten the run; remove it to crawl the whole site.
 
 
#主程序
def main(path=r'C:\Users\Administrator\Desktop\test'):
    """Run the crawler: enter the working directory, collect links, download.

    Args:
        path: Working directory the novel files are written to. It is created
            (including missing parent directories) when it does not exist.
    """
    # makedirs with exist_ok covers both "already there" and "parent missing".
    os.makedirs(path, exist_ok=True)
    os.chdir(path)
    root_url = "https://www.ddxsku.com/"
    target_url = root_url + "full.html"
    data = collect_url(url_open(target_url),root_url)
    get_and_save_data(data)


if __name__ == "__main__":
    main()

ttyp · 发表于 2020-3-19 16:27

网站是https的，链接是http的，修改了一下

[Python] 纯文本查看 复制代码

001

002

003

004

005

006

007

008

009

010

011

012

013

014

015

016

017

018

019

020

021

022

023

024

025

026

027

028

029

030

031

032

033

034

035

036

037

038

039

040

041

042

043

044

045

046

047

048

049

050

051

052

053

054

055

056

057

058

059

060

061

062

063

064

065

066

067

068

069

070

071

072

073

074

075

076

077

078

079

080

081

082

083

084

085

086

087

088

089

090

091

092

093

094

095

096

097

098

099

100

101

102

103

104

105

106

107

108

109

110

111

112

113

114

115

116

117

118

119

120

121

122

123

124

125

126

127

128

129

#1.需要安装的第三方库:requests,bs4
  
import os,requests,re
from time import sleep
from bs4 import BeautifulSoup
from random import uniform
  
#网址解析
def url_open(url, timeout=10):
    """Fetch *url* and return the response body as text decoded as UTF-8.

    Args:
        url: Address of the page to download.
        timeout: Seconds to wait for the server before ``requests`` gives up.
            Without a timeout a stalled connection would block forever.

    Returns:
        The page HTML as a string.
    """
    headers = {
        "User-Agent": "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/76.0.3809.100 Safari/537.36",
    }
    response = requests.get(url, headers=headers, timeout=timeout)
    response.encoding = "utf-8"  # The site's pages are encoded as UTF-8.
    return response.text
  
  
#目标链接收集
def collect_url(html,root_url):
    """Collect the names and URLs of the finished novels listed on the site.

    Args:
        html: HTML of the start page; only its ``div.pagelink`` element is
            read, to find the total number of listing pages.
        root_url: Site root ending in "/", used to build the listing page URLs.

    Returns:
        ``[novel_name_all, novel_url_all]`` - two parallel lists holding the
        novel names and the matching novel URLs.
    """
    print("正在收集全站已完结的小说链接.....")
    novel_name_all = []
    novel_url_all = []

    soup = BeautifulSoup(html,"html.parser")
    # The total page count is the part after "/" in the pager text.
    totle_pages = int((soup.find("div",class_="pagelink").em.text).split("/")[1])
    print("总页数：" + str(totle_pages))

    # One pattern captures the novel URL and its link text together, so the
    # URL never has to be pasted (unescaped) into a second pattern. [^"]+ and
    # .+? keep each match inside a single link.
    p_novel_url = r'<a href="([^"]+?/xiaoshuo/[^"]+\.html)">(.+?)</a>'

    # Open the listing pages one by one and collect the novel links.
    for page in range(1,totle_pages+1):
        url = root_url + 'modules/article/articlelist.php?fullflag=1&page={}'.format(page)

        print("正在打开" + url)

        page_html = url_open(url)

        print(p_novel_url)
        novel_links = re.findall(p_novel_url,page_html)

        print(len(novel_links))

        for novel_url, novel_name in novel_links:
            novel_url_all.append(novel_url)
            novel_name_all.append(novel_name)

        break  # Debug limiter to shorten the run; remove it to crawl the whole site.

    data = [novel_name_all,novel_url_all]  # Packed together so both lists can be returned.

    print("收集工作已完成，准备进入小说内容下载.....")
    sleep(1)

    return data
  
  
#小说内容获取与保存
def get_and_save_data(data):
    """Download the chapters of the first collected novel into ``<novel name>.txt``.

    The file is opened in append mode in the current working directory and
    receives each chapter title followed by the chapter text.

    Args:
        data: ``[novel_name_all, novel_url_all]`` - two parallel lists of
            novel names and novel URLs, as returned by ``collect_url``.
    """
    novel_name_all = data[0]
    novel_url_all = data[1]

    for novel_name, novel_url in zip(novel_name_all, novel_url_all):
        print()
        print("正在下载小说：《%s》"%novel_name)
        print()

        # Characters that Windows forbids in file names would make open() fail.
        file_name = re.sub(r'[\\/:*?"<>|]', "_", novel_name) + ".txt"

        html_1 = url_open(novel_url)
        soup_1 = BeautifulSoup(html_1, "html.parser")
        chapters_url = soup_1.find("p", class_="btnlinks").a["href"]

        # Collect the URLs of all chapters.
        html_2 = url_open(chapters_url)
        soup_2 = BeautifulSoup(html_2, "html.parser")
        chapters_url_all = soup_2.find_all("td", class_="L")

        # Open the chapter pages one by one and save their content.
        for each in chapters_url_all:
            if each.a is None:
                continue  # A cell without a link has nothing to download.
            chapter_url = each.a["href"]
            soup = BeautifulSoup(url_open(chapter_url),"html.parser")

            title_tag = soup.find("dd")
            contents_tag = soup.find("dd",id="contents")
            if title_tag is None or title_tag.h1 is None or contents_tag is None:
                print("跳过无法解析的章节：%s"%chapter_url)
            else:
                # Collapse runs of spaces in the chapter title and trim it.
                chapters_name = re.sub(' +',' ',title_tag.h1.text).strip()
                print("正在下载《%s》:%s"%(novel_name,chapters_name))

                contents = contents_tag.text
                with open(file_name,"a",encoding="utf-8") as g:
                    g.write("\n"*3 +chapters_name+str("\n")*3)
                    g.write("    "+contents)

            # Random short pause between chapter requests.
            sleep(uniform(0.35,0.75))

        print("小说%s已下载完毕"%novel_name)
        print("准备进入下一部小说下载")
        sleep(2)

        break  # Debug limiter to shorten the run; remove it to crawl the whole site.
  
  
#主程序
def main(path=r'C:\Users\Administrator\Desktop\test'):
    """Run the crawler: enter the working directory, collect links, download.

    Args:
        path: Working directory the novel files are written to. It is created
            (including missing parent directories) when it does not exist.
    """
    # makedirs with exist_ok covers both "already there" and "parent missing".
    os.makedirs(path, exist_ok=True)
    os.chdir(path)
    root_url = "https://www.ddxsku.com/"
    target_url = root_url + "full.html"
    data = collect_url(url_open(target_url),root_url)
    get_and_save_data(data)


if __name__ == "__main__":
    main()

zqc8 · 发表于 2020-3-22 22:19

本帖最后由 zqc8 于 2020-3-22 22:26 编辑

新版代码已更新，由于时间仓促，不作注释，如有问题留言

不知由于何因，新版代码 33行novel_name代码无法正常显示，正确代码为

[Python] 纯文本查看 复制代码

01

02

03

04

05

06

07

08

09

10

11

12

13

14

15

16

17

18

19

20

21

22

23

24

25

26

27

28

29

30

31

32

33

34

35

36

37

38

39

40

41

42

43

44

45

46

47

48

49

50

51

52

53

54

55

56

57

58

59

60

61

62

63

64

65

66

67

68

69

70

71

72

73

74

75

76

77

78

79

80

81

# -*- coding: UTF-8 -*-
# ！/usr/bin/env python3
# Author:  Murphy
#  Blog :  www.moyo1.cn
 
import os
import re
import requests
from time import sleep
from bs4 import BeautifulSoup
 
 
class CollectNovel(object):
    """Crawler that saves finished novels from www.ddxsku.com as text files.

    ``collect_url`` walks the listing of finished novels and fills
    ``novel_data``; ``novel_copy`` then downloads the chapters and writes one
    ``<n>-<chapter title>.txt`` file per chapter.

    Attributes:
        novel_data: Maps a novel name to a list whose first item is
            ``("novel_author", <author>)`` and whose remaining items are
            ``(<chapter title>, <chapter url>)`` tuples.
        start_url: Listing page the crawl starts from.
        headers: HTTP headers sent with every request.
        save_root: Directory under which one folder per novel is created.
        timeout: Timeout in seconds applied to every HTTP request.
    """

    def __init__(self, save_root=r"C:/Users/Administrator/Desktop/NovelCopy", timeout=10):
        """Set up the crawl configuration; no network access happens here.

        Args:
            save_root: Directory that receives one sub-folder per novel.
            timeout: Timeout in seconds for each HTTP request.
        """
        self.novel_data = {}
        self.start_url = "https://www.ddxsku.com/full.html"
        self.headers = {"User-Agent": "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/78.0.3904.87 Safari/537.36"}
        self.save_root = save_root
        self.timeout = timeout

    @staticmethod
    def _safe_filename(text):
        """Return *text* with characters that Windows forbids in file names replaced by "_"."""
        cleaned = re.sub(r'[\\/:*?"<>|\x00-\x1f]', "_", text.strip())
        # Windows also rejects names that end with a dot or a space.
        cleaned = cleaned.rstrip(". ")
        return cleaned or "_"

    def collect_url(self):
        """Fill ``self.novel_data`` with the author and chapter links of each novel.

        A novel whose index page cannot be parsed is skipped with a message
        instead of aborting the whole crawl. If the total page count cannot be
        read from the start page, ``novel_data`` is left empty.
        """
        print("Collecting novel basic data.....")
        start_response = requests.get(self.start_url, headers=self.headers, timeout=self.timeout)
        page_match = re.search(r'<em id="pagestats">1/(\d+)</em>', start_response.text)
        if page_match is None:
            # novel_copy reports the failure when novel_data stays empty.
            return
        total_page = int(page_match.group(1))
        # Same https scheme as start_url.
        novel_navigation_urls = [
            "https://www.ddxsku.com/modules/article/articlelist.php?fullflag=1&page=%d" % i
            for i in range(1, total_page + 1)
        ]

        for novel_navigation_url in novel_navigation_urls:
            novel_navigation_response = requests.get(novel_navigation_url, headers=self.headers, timeout=self.timeout)
            # [^"]+ and .+? keep each match inside a single link even when
            # several links share one line.
            novel_index_urls = re.findall(
                r'<td class="L"><a href="([^"]+)" title="[^"]+" target="_blank">.+?</a></td>',
                novel_navigation_response.text,
            )

            for novel_index_url in novel_index_urls:
                novel_index_response = requests.get(novel_index_url, headers=self.headers, timeout=self.timeout)
                novel_index_response.encoding = "utf-8"
                index_html = novel_index_response.text

                # NOTE(review): the author states that this novel_name pattern
                # was garbled when the code was posted; it is kept unchanged
                # here - confirm the real pattern against the live page.
                name_match = re.search(r'.+<a >(.+)</a>.+', index_html)
                author_match = re.search(r'<dd><h3>作者：(.+)</h3><br>.+</h3></dd>', index_html)
                if name_match is None or author_match is None:
                    print("Skip unparsable novel index:  %s" % novel_index_url)
                    sleep(1)
                    continue

                novel_name = name_match.group(1)
                novel_author = author_match.group(1)
                # Add an entry instead of rebinding the dict, so novels that
                # were collected earlier are not thrown away.
                self.novel_data[novel_name] = [("novel_author", novel_author)]
                print("Collecting novel:  《%s》--%s" % (novel_name, novel_author))

                index_soup = BeautifulSoup(index_html, "html.parser")
                for each in index_soup.find_all("td", class_="L"):
                    # A cell without a link has nothing to download.
                    if each.a is None or not each.a.get("href"):
                        continue
                    self.novel_data[novel_name].append((each.text, each.a["href"]))
                sleep(1)
                # break  # Debug aid: uncomment to stop after the first novel.
            break  # Debug limiter to shorten the run; remove it to crawl every listing page.

    def novel_copy(self):
        """Collect the novel data, then download the chapters of the first novel.

        Each chapter is written to ``<save_root>/<name>-<author>/<n>-<title>.txt``.
        Files are addressed by full path, so the process working directory is
        not changed.
        """
        self.collect_url()
        if not self.novel_data:
            print("Collect data failed")
            return

        for name in self.novel_data:
            print("Downloading:  《%s》" % name, end="\n"*2)

            author = self.novel_data[name][0][1]
            work_path = os.path.join(self.save_root, self._safe_filename("%s-%s" % (name, author)))
            os.makedirs(work_path, exist_ok=True)

            chapters = self.novel_data[name][1:]
            for count, (chapter_title, chapter_url) in enumerate(chapters, start=1):
                print("Downloading:  《%s》--%s" % (name, chapter_title))
                chapter_response = requests.get(chapter_url, headers=self.headers, timeout=self.timeout)
                chapter_response.encoding = "utf-8"

                chapter_soup = BeautifulSoup(chapter_response.text, "html.parser")
                chapter_text = chapter_soup.find("dd", id="contents")
                if chapter_text is None:
                    print("Skip chapter without content:  %s" % chapter_url)
                else:
                    file_name = "%d-%s.txt" % (count, self._safe_filename(chapter_title))
                    with open(os.path.join(work_path, file_name), "w", encoding="utf-8") as f:
                        f.write(chapter_text.text)
                sleep(2)
            print()
            # Only the first collected novel is downloaded; remove this break
            # to download all of them.
            break


if __name__ == "__main__":
    novel = CollectNovel()
    novel.novel_copy()

yonghermit · 发表于 2019-11-3 15:51

支持支持

xzgxp · 发表于 2019-11-3 16:20

不错，谢谢分享

fys2008 · 发表于 2019-11-3 17:00

谢谢分享，楼主辛苦了

zqc8 · 发表于 2019-11-3 17:11

谢谢大家的支持。至于为什么要写12306抢票软件，一方面是想锻炼下自己的动手能力，看看大网站是如何处理反爬的，为自己积累经验；另一方面本人有时候确实会用得到，而且我还有个不错的想法等待验证。

mei251617 · 发表于 2019-11-3 17:20

谢谢分享，楼主辛苦了

YuLoo · 发表于 2019-11-3 18:47

好漂亮的代码

zcmrp · 发表于 2019-11-3 19:59

求救，怎么提示字符非法

万丅冧 · 发表于 2019-11-3 21:48

感谢分享

topvip · 发表于 2019-11-3 23:17

小白来学习了，感谢分享！

帐号		自动登录	找回密码
密码			注册[Register]

[Python 转载] 小说网站爬虫作品分享

免费评分

浏览过的版块