龟仔龟龟 posted on 2020-2-20 14:04

Python source code for scraping the Daocaoren Shuwu (稻草人书屋) novel site

Last edited by 龟仔龟龟 on 2020-4-7 12:06

## Code sharing
I've recently been learning **Python** web scraping, so I tried batch-downloading novels from the Daocaoren Shuwu novel site. The source code is below, for study and exchange only.
Thanks to (https://www.52pojie.cn/forum.php?mod=redirect&goto=findpost&ptid=1111047&pid=30202451) for the suggestions.
Packaged as an executable (Windows, macOS, Linux): https://www.52pojie.cn/thread-1115415-1-1.html
**Possible errors**
**`AttributeError: 'NoneType' object has no attribute 'get'`**
**Just rerun the program. This error is caused by network problems: some page was not fetched, or was fetched incompletely, so the expected element cannot be parsed and the lookup returns None.**
**It can also mean your IP has been restricted because you tripped one of the site's anti-scraping mechanisms; wait a while before scraping again.**
**I scraped too aggressively and got blocked myself:**
`requests.exceptions.ConnectionError: ('Connection aborted.', RemoteDisconnected('Remote end closed connection without response',))`
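
A simple way to make the script more tolerant of these transient failures is to retry failed requests and to check that the XPath lookup actually returned something before using it. This is only a sketch, not part of the original script; `get_html_with_retry` and its parameters are made up for illustration:
```python
import time
import requests
from lxml import etree

def get_html_with_retry(url, headers, retries=3, delay=5, encoding='utf-8'):
    # Fetch a page, backing off and retrying on network errors
    for attempt in range(retries):
        try:
            r = requests.get(url, headers=headers, timeout=10)
            r.encoding = encoding
            return r.text
        except requests.exceptions.RequestException:
            # ConnectionError / RemoteDisconnected usually means the site dropped us
            time.sleep(delay)
    return None

# Only parse the page if it actually came back, and only use the first
# XPath match if the list is non-empty:
# html = get_html_with_retry(target_url, headers)
# if html:
#     hrefs = etree.HTML(html).xpath('//*[@class="col-md-6 text-center"]/a/@href')
#     if hrefs:
#         download_url = 'https://www.daocaorenshuwu.com' + hrefs[0]
```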

**Source code updated 2020/4/7**
```python
#!/usr/bin/python3
import requests
from lxml import etree
import os

def getHtml(url, encoding='utf-8'):
    r = requests.get(url=url, headers=headers)
    r.encoding = encoding
    return r.text

def parseHtml(source):
    # Parse the book list page: book titles, per-book page URLs and the author name
    global author
    ehtml = etree.HTML(source)
    booknames = ehtml.xpath('//*[@class="col-md-6 col-xs-12"]/a/text()')
    urls = ehtml.xpath('//*[@class="col-md-6 col-xs-12"]/a/@href')
    # The author name is used as the sub-directory name
    author = ehtml.xpath('//*[@class="col-md-12"]/text()')[0].strip()
    # The extracted hrefs are relative; turn them into absolute URLs
    for num in range(len(urls)):
        urls[num] = 'https://www.daocaorenshuwu.com' + urls[num]
    return booknames, urls

def downloader():
    # Visit every book page and collect its download link
    for url in urls:
        r = getHtml(url, encoding='utf-8')
        ehtml = etree.HTML(r)
        download_url = 'https://www.daocaorenshuwu.com' + \
            ehtml.xpath('//*[@class="col-md-6 text-center"]/a/@href')[0]
        download_urls.append(download_url)
    if not os.path.exists(root):
        os.mkdir(root)
    root2 = root + author
    if not os.path.exists(root2):
        os.mkdir(root2)
    for index in range(len(booknames)):
        # Strip characters that are not allowed in file names
        bookname = booknames[index].replace('/', '-').replace('[', '').replace(']', '')
        fullpath = root2 + '/' + bookname + '.zip'
        if not os.path.exists(fullpath):
            print(bookname + ' downloading...')
            final_link = download_urls[index] + '&verifications=' + verification_code
            try:
                res = requests.post(url=final_link, headers=headers)
                # The site answers '验证码错误' ("verification code error") when the code is wrong
                if res.status_code == 200 and res.text[:5] != '验证码错误':
                    with open(fullpath, 'wb') as f:
                        f.write(res.content)
                    print(bookname + ' finished')
                else:
                    print('Download failed, please check the verification code!')
            except Exception:
                print('Download error')
                continue


if __name__ == '__main__':
    headers = {'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Ubuntu Chromium/80.0.3987.149 Chrome/80.0.3987.149 Safari/537.36'}
    download_urls = []
    target_url = input("Enter the link you want to download; it has to look like https://www.daocaorenshuwu.com/txt/8015.html\n")
    verification_code = input("Enter the verification code:\n")
    root = input("Enter the directory to save to, e.g. /home/hello_world/Desktop/: ")
    if root[-1] != '/':
        root = root + '/'
    source = getHtml(target_url)
    booknames, urls = parseHtml(source)
    downloader()
```

---

**Old source code**
```python
import requests
from bs4 import BeautifulSoup
import os

s = requests.Session()
# root is the directory the novels are saved to
root = 'D:\\E-book\\literature\\'

# Pretend to be a regular browser
headers = {"User-Agent": "Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 "
                         "(KHTML, like Gecko) Chrome/76.0.3809.87 Safari/537.36",
           "Referer": "https://www.daocaorenshuwu.com/"}

# Start from one novel's introduction page and collect the introduction page
# URLs of all of this author's works.
# Replace the URL below with your own; it must follow the same format,
# e.g. 'https://www.daocaorenshuwu.com/txt/11205.html'
r = s.get('https://www.daocaorenshuwu.com/txt/10182.html', headers=headers)
r.encoding = r.apparent_encoding

# Parse the page with BeautifulSoup
soup = BeautifulSoup(r.text, 'lxml')

# Extract the introduction page of every work by this author
book_divs = soup.find_all('div', 'col-md-6 col-xs-12')

# List of introduction page URLs
introduction_url = []

for div in book_divs:
    # The extracted URL is a relative path; convert it to an absolute one
    part_url = div.find('a').get('href')
    complete_url = 'https://www.daocaorenshuwu.com' + part_url
    introduction_url.append(complete_url)

# Visit every introduction page in turn to get the download verification page
for url in introduction_url:

    # rstrip() removes a trailing \n from the URL
    r = s.get(url=url.rstrip(), headers=headers)
    r.encoding = r.apparent_encoding

    # Parse the page with BeautifulSoup
    soup = BeautifulSoup(r.text, 'lxml')

    # soup_name is kept around to extract the book title later
    soup_name = soup

    # Extract the download verification URL
    href = soup.find('div', 'col-md-6 text-center').find('a').get('href')

    # Build the POST URL that passes verification
    # (found with the browser developer tools).
    # The number after 'verification' is the code you get by following the
    # site's WeChat official account
    download_url = 'https://www.daocaorenshuwu.com' + href + '&verification=12022018'

    # Extract the book title for the file name
    soup_name = soup_name.find('a', 'media-right')
    soup_name = soup_name.find('img')
    novel_name = soup_name.get('alt')

    # path is the full path the novel is saved to
    path = root + novel_name + '.zip'

    # Download the novel
    print(novel_name + ' downloading...')
    if not os.path.exists(root):
        os.mkdir(root)
    if not os.path.exists(path):
        res = requests.post(url=download_url, headers=headers)
        # The site answers '验证码错误' ("verification code error") when the code is wrong
        if res.status_code == 200 and res.text[:5] != '验证码错误':
            with open(path, 'wb') as f:
                f.write(res.content)
                print(novel_name + ' finished')
```
## Full-site download source code
```python
#!/usr/bin/python3
import requests
from bs4 import BeautifulSoup
import os
import time

# Collect author information
def get_writer_list(writer_url):
    # Extract every author's name and page URL from one list page
    r = s.get(url=writer_url, headers=headers)
    r.encoding = r.apparent_encoding
    soup = BeautifulSoup(r.text, 'lxml')
    list1 = soup.find_all('div', 'col-md-2 col-sm-2 col-xs-4 b10')
    for ele in list1:
        url = 'https:' + ele.a['href']
        name = ele.a.img['title']
        print('Collecting info for author *' + name + '*')
        writer_lists[name] = url

# Follow an author page down to the download introduction page
def get_introduce(url):
    # Grab the URL of one of this author's books
    r = s.get(url=url, headers=headers)
    r.encoding = r.apparent_encoding
    soup = BeautifulSoup(r.text, 'lxml')
    url1 = 'https://www.daocaorenshuwu.com' + \
        soup.find('div', 'col-md-12 mb10').a['href']

    # Find that book's download introduction page
    res = s.get(url=url1, headers=headers)
    res.encoding = res.apparent_encoding
    soup = BeautifulSoup(res.text, 'lxml')
    url2 = 'https://www.daocaorenshuwu.com' + \
        soup.find('a', 'hidden').get('href')
    return url2

# Build the final download links for all of an author's books
def get_download_url(url2):
    r = s.get(url=url2, headers=headers)
    r.encoding = r.apparent_encoding
    soup = BeautifulSoup(r.text, 'lxml')
    list1 = []
    book_divs = soup.find_all('div', 'col-md-6 col-xs-12')

    # Collect the introduction page of every novel by this author
    for div in book_divs:
        complete_url = 'https://www.daocaorenshuwu.com' + div.find('a').get('href')
        list1.append(complete_url)

    novel_names = []
    download_urls = []
    for url in list1:
        try:
            # Extract the book title
            r = s.get(url=url.rstrip(), headers=headers)
            r.encoding = r.apparent_encoding
            soup = BeautifulSoup(r.text, 'lxml')
            soup_name = soup

            # Build the final download URL
            download_url = 'https://www.daocaorenshuwu.com' + soup.find('div', 'col-md-6 text-center').a.get('href')
            download_urls.append(download_url)
            soup_name = soup_name.find('a', 'media-right')
            novel_name = soup_name.find('img').get('alt')
            novel_names.append(novel_name)
        except Exception:
            continue
    return download_urls, novel_names

def main():
    # Collect all author list pages (60 pages in total)
    for i in range(1, 61):
        short_url = 'https://www.daocaorenshuwu.com/writer/list_81_' + str(i) + '.html'
        writer_urls.append(short_url)

    for writer_url in writer_urls:
        get_writer_list(writer_url)

    print('Author info collected!' + '*' * 100)

    # For each author page, collect the book names and download URLs
    for name, url in writer_lists.items():
        url2 = get_introduce(url)
        download_urls, novel_names = get_download_url(url2)
        path = root + name + '/'

        # One folder per author
        if not os.path.exists(path):
            try:
                os.mkdir(path)
            except Exception:
                continue
        for i in range(len(novel_names)):
            # Strip characters that are not allowed in file names
            novel_name = novel_names[i].replace('/', '-').replace('[', '').replace(']', '')
            full_path = path + novel_name + '.zip'
            if not os.path.exists(full_path):
                # Download the book
                try:
                    final_link = download_urls[i] + '&verification=' + secret
                    print(novel_name + ' downloading...')
                    start = time.time()
                    r = s.post(final_link, headers=headers)
                    # The site answers '验证码错误' ("verification code error") when the code is wrong
                    if r.status_code == 200 and r.text[:5] != '验证码错误':
                        with open(full_path, 'wb') as f:
                            f.write(r.content)
                        end = time.time()
                        print(novel_name + ' finished, took ' + str(end - start) + ' seconds')
                except Exception:
                    print('Download error, skipping')
                    continue

if __name__ == '__main__':
    print('''╭━╮╭━╮╱╱╱╱╭╮╱╱╱╭╮╱╱╱╱╱╱╭━━━╮╭━━━╮
┃┃╰╯┃┃╱╱╱╱┃┃╱╱╱┃┃╱╱╱╱╱╱┃╭━━╯┃╭━╮┃╱╱╱╱╱╱╱╱╱╭╮
┃╭╮╭╮┣━━┳━╯┣━━╮┃╰━┳╮╱╭╮┃╰━━╮╰╯╭╯┃╭━━╮╭━━╮╱╰╯╭╮╭━━╮
┃┃┃┃┃┃╭╮┃╭╮┃┃━┫┃╭╮┃┃╱┃┃╰━━╮┃╭━╯╭╯┃╭╮┃┃╭╮┃╱╭╮┣┫┃┃━┫
┃┃┃┃┃┃╭╮┃╰╯┃┃━┫┃╰╯┃╰━╯┃╭━━╯┃┃┃╰━╮┃╰╯┃┃╰╯┃╱┃┃┃┃┃┃━┫
╰╯╰╯╰┻╯╰┻━━┻━━╯╰━━┻━╮╭╯╰━━━╯╰━━━╯┃╭━╯╰━━╯╱┃┃╰╯╰━━╯
╱╱╱╱╱╱╱╱╱╱╱╱╱╱╱╱╱╱╭━╯┃╱╱╱╱╱╱╱╱╱╱╱┃┃╱╱╱╱╱╱╭╯┃
╱╱╱╱╱╱╱╱╱╱╱╱╱╱╱╱╱╱╰━━╯╱╱╱╱╱╱╱╱╱╱╱╰╯╱╱╱╱╱╱╰━╯
╭━━┳╮╭┳╮╭━━━┳━━┳╮╭━━┳╮╭┳╮╭━━┳╮╭┳╮
┃╭╮┃┃┃┣┫┣━━┃┃╭╮┣┫┃╭╮┃┃┃┣┫┃╭╮┃┃┃┣┫
┃╰╯┃╰╯┃┃┃┃━━┫╭╮┃┃┃╰╯┃╰╯┃┃┃╰╯┃╰╯┃┃
╰━╮┣━━┻╯╰━━━┻╯╰┻╯╰━╮┣━━┻╯╰━╮┣━━┻╯
╭━╯┃╱╱╱╱╱╱╱╱╱╱╱╱╱╭━╯┃╱╱╱╱╭━╯┃
╰━━╯╱╱╱╱╱╱╱╱╱╱╱╱╱╰━━╯╱╱╱╱╰━━╯''')
    print("A download tool for the Daocaoren Shuwu novel site, Mac/Linux edition\n")
    headers = {"User-Agent": "Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 "
                             "(KHTML, like Gecko) Chrome/76.0.3809.87 Safari/537.36",
               "Referer": "https://www.daocaorenshuwu.com/"}

    print("**Automatically downloads every author's works from Daocaoren Shuwu, grouped by author**")
    print('**Best run through a proxy or on an overseas server, otherwise it can take a long time**')
    print('**Do not use this for illegal purposes or commercially; you bear the consequences**')
    root = input("Enter the root directory to save to, e.g. /Users/bobmaster/Documents: ")
    if root[-1] != '/':
        root = root + '/'
    secret = input('Enter the site download authorization code (default 12022018); if it stops working later, get a new one yourself: ')
    # Workaround for when your IP gets banned: route requests through a proxy
    #proxies = {
    #    'http': 'socks5://user:pass@host:port',
    #    'https': 'socks5://user:pass@host:port',
    #}
    if secret == '':
        secret = str(12022018)
    s = requests.Session()
    writer_lists = {}
    writer_urls = []
    main()
```
**The full-site script is best run on an overseas server; running it from inside China is much slower.**
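
If your IP does get blocked, the commented-out `proxies` block in the script above hints at the fix: route the session's traffic through a proxy. A minimal sketch, assuming a SOCKS5 proxy (host/port/credentials are placeholders) and that the `requests[socks]` extra is installed:
```python
import requests

s = requests.Session()
# Send every request through a SOCKS5 proxy; needs `pip install requests[socks]`
s.proxies = {
    'http': 'socks5://user:pass@host:port',
    'https': 'socks5://user:pass@host:port',
}
# All subsequent s.get() / s.post() calls in the script then go through the proxy
```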
## Screenshots
![](https://fly.myroad.fun/picture/forum/total_site/Snipaste_2020-02-25_20-06-13.png)
![](https://fly.myroad.fun/picture/forum/total_site/Snipaste_2020-02-26_18-21-37.png)
![](https://fly.myroad.fun/picture/forum/total_site/Snipaste_2020-02-26_18-23-15.png)
**Download demo on a server**
![](https://fly.myroad.fun/picture/forum/total_site/Snipaste_2020-02-26_17-47-23.png)
![](https://fly.myroad.fun/picture/forum/total_site/Snipaste_2020-02-26_18-32-05.png)

wkfy posted on 2020-2-25 16:01

Did the OP learn Python from 【风变编程】? The BeautifulSoup matching can be made more concise. I'm also a beginner; BS4 used to scare me to death, but now I know there are simpler ways to use it. Here it is for the OP's reference.

```python
# -*- coding:utf-8 -*-
import requests as CC
from bs4 import BeautifulSoup

headers = {
    "User-Agent": "Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 "
                  "(KHTML, like Gecko) Chrome/76.0.3809.87 Safari/537.36",
    "Referer": "https://www.daocaorenshuwu.com/"
}


def Down_zip(name, url):
    date = CC.get(url)
    print(date.url)
    if date.status_code == 200:
        zip_dat = date.content
        with open(name + '.zip', 'wb') as f:
            f.write(zip_dat)
        print(name, 'download finished!')


# Rebuild the download URL with the verification code appended
def Fomat_url(url):
    r1 = CC.get(url, headers=headers)
    r1.encoding = r1.apparent_encoding
    soup1 = BeautifulSoup(r1.text, 'lxml')
    d_url = 'https://www.daocaorenshuwu.com' + soup1.find('div', class_='col-md-6 text-center').a['href'] + '&verification=12022018'
    return d_url


def main():
    # user_url = input('Paste the URL: ')  # e.g. 'https://www.daocaorenshuwu.com/txt/10182.html'
    try:
        r = CC.get('https://www.daocaorenshuwu.com/txt/10182.html', headers=headers)
        r.encoding = r.apparent_encoding
        soup = BeautifulSoup(r.text, 'lxml')
        list1 = soup.find_all('div', class_='col-md-6 col-xs-12')  # grab each book entry
        for i in list1:
            name = i.string
            url_list = 'https://www.daocaorenshuwu.com' + i.a['href']
            d_url = Fomat_url(url_list)
            Down_zip(name, d_url)
    except:
        print(name, 'download failed!')


if __name__ == '__main__':
    main()
```

lihaisanhui posted on 2020-2-20 14:17

If you open files with `with`, you don't need to call close().
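
For example, a minimal sketch using the names from the scripts above:
```python
# The context manager closes the file automatically, even if an exception is raised
with open(fullpath, 'wb') as f:
    f.write(res.content)
```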

shubaowang posted on 2020-2-20 14:38

Thanks for sharing, I'll take a look and learn from it.

52cavid posted on 2020-2-20 14:45

Thanks OP, learning from this.

whistlelee posted on 2020-2-20 14:45

This looks impressive~~

netsill posted on 2020-2-20 14:46

Thanks, learning in progress.

pinhai posted on 2020-2-20 14:49

Thanks for sharing; the code is very concise and worth learning from.

大鱼爱吃猫 posted on 2020-2-20 14:55

Binge-reading novels gets addictive before you know it. Thanks for the technical share.

ooo1 posted on 2020-2-20 14:56

Thanks, I'll study this.

hfxiang posted on 2020-2-20 15:15

I'm just about to start learning Python, but is the OP's code written for 2.x or 3.x?