Python source code for scraping the Daocaoren Shuwu (稻草人书屋) novel site
## Code
I've been learning a bit of **Python** web scraping lately, so I tried batch-downloading novels from the Daocaoren Shuwu novel site. The source code is below, for study and exchange only.
Thanks to [this reply](https://www.52pojie.cn/forum.php?mod=redirect&goto=findpost&ptid=1111047&pid=30202451) for the suggestions.
Packaged as executables (Windows, macOS, Linux): https://www.52pojie.cn/thread-1115415-1-1.html
**Possible errors**
**`AttributeError: 'NoneType' object has no attribute 'get'`**
**Just run the program again. This error comes from network problems: a page failed to download (or came back incomplete), so the expected element could not be found and the parser returned `None`.**
**It may also mean your IP has been restricted because you triggered one of the site's anti-scraping mechanisms; wait a while before crawling again!**
**I crawled too aggressively myself and got blocked:**
`requests.exceptions.ConnectionError: ('Connection aborted.', RemoteDisconnected('Remote end closed connection without response',))`
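Both failure modes above (an incomplete page that parses to `None`, and temporary IP throttling) can be softened by retrying the fetch with a pause between attempts. A minimal sketch, not part of the original scripts; the helper name and the retry/delay values are arbitrary:

```python
import time
import requests

def get_html_with_retry(url, headers, retries=3, delay=5, encoding='utf-8'):
    """Hypothetical helper: retry a failed fetch a few times, pausing between attempts."""
    for attempt in range(retries):
        try:
            r = requests.get(url, headers=headers, timeout=10)
            r.raise_for_status()
            r.encoding = encoding
            return r.text
        except requests.RequestException:
            # Network error or bad status code: wait a bit before retrying,
            # which also helps when the site is rate-limiting your IP.
            time.sleep(delay)
    return None  # the caller should check for None before parsing
```

A caller that still gets `None` after a few retries should probably stop and wait rather than hammer the site further.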
**Source code updated on 2020/4/7**
```python
#!/usr/bin/python3
import requests
from lxml import etree
import os


def getHtml(url, encoding='utf-8'):
    # Fetch a page and return its decoded text
    r = requests.get(url=url, headers=headers)
    r.encoding = encoding
    return r.text


def parseHtml(source):
    # Extract the book titles, their introduction-page URLs and the author name
    global author
    ehtml = etree.HTML(source)
    booknames = ehtml.xpath('//*[@class="col-md-6 col-xs-12"]/a/text()')
    urls = ehtml.xpath('//*[@class="col-md-6 col-xs-12"]/a/@href')
    author = ehtml.xpath('//*[@class="col-md-12"]/text()')
    for num in range(len(urls)):
        # The hrefs are relative, turn them into absolute URLs
        urls[num] = 'https://www.daocaorenshuwu.com' + urls[num]
    return booknames, urls


def downloader():
    # Collect the download link from every book's introduction page
    for url in urls:
        r = getHtml(url, encoding='utf-8')
        ehtml = etree.HTML(r)
        download_url = 'https://www.daocaorenshuwu.com' + \
            ehtml.xpath('//*[@class="col-md-6 text-center"]/a/@href')[0]
        download_urls.append(download_url)
    if not os.path.exists(root):
        os.mkdir(root)
    # One sub-directory per author
    root2 = root + author[0]
    if not os.path.exists(root2):
        os.mkdir(root2)
    for index in range(len(booknames)):
        # Strip characters that are not allowed in file names
        bookname = booknames[index].replace('/', '-').replace('[', '').replace(']', '')
        fullpath = root2 + '/' + bookname + '.zip'
        if not os.path.exists(fullpath):
            print(bookname + '下载中')
            final_link = download_urls[index] + '&verifications=' + verification_code
            try:
                res = requests.post(url=final_link, headers=headers)
                if res.status_code == 200 and res.text[:5] != '验证码错误':
                    with open(fullpath, 'wb') as f:
                        f.write(res.content)
                    print(bookname + '下载完成')
                else:
                    print('下载出错请检查验证码!!!')
            except Exception:
                print('下载出错')
                continue


if __name__ == '__main__':
    headers = {'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Ubuntu Chromium/80.0.3987.149 Chrome/80.0.3987.149 Safari/537.36'}
    download_urls = []
    target_url = input("请输入你要下载的链接,要满足这种形式https://www.daocaorenshuwu.com/txt/8015.html\n")
    verification_code = input("请输入验证码:\n")
    root = input(r"请输入内容保存位置如(/home/hello_world/Desktop/): ")
    if root[-1] != '/':
        root = root + '/'
    source = getHtml(target_url)
    booknames, urls = parseHtml(source)
    downloader()
```
---
**Old source code**
```python
import requests
from bs4 import BeautifulSoup
import os

s = requests.Session()
# root is the directory the novels are saved under
root = r'D:\\E-book\\literature\\'
# Pretend to be a normal browser
headers = {"User-Agent": "Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/76.0.3809.87 Safari/537.36",
           "Referer": "https://www.daocaorenshuwu.com/"}
# Starting from one novel's introduction page, collect the introduction-page
# URLs of all of this author's works
# url = 'https://www.daocaorenshuwu.com/txt/11205.html'  -- replace with a URL of the same form
r = s.get('https://www.daocaorenshuwu.com/txt/10182.html', headers=headers)
r.encoding = r.apparent_encoding
# Parse the page with BeautifulSoup
soup = BeautifulSoup(r.text, 'lxml')
# Extract the introduction pages of all of this author's works
soup = soup.find_all('div', 'col-md-6 col-xs-12')
# Build a list of introduction-page URLs
introduction_url = []
for i in range(len(soup)):
    # The extracted URLs are relative paths,
    # turn them into absolute ones
    part_url = soup[i].find('a').get('href')
    complete_url = 'https://www.daocaorenshuwu.com' + part_url
    introduction_url.append(complete_url)
# Visit each introduction page in turn
# to get the download-verification page address
for url in introduction_url:
    # rstrip() removes a trailing \n from the URL
    r = s.get(url=url.rstrip(), headers=headers)
    r.encoding = r.apparent_encoding
    # Parse the page with BeautifulSoup
    soup = BeautifulSoup(r.text, 'lxml')
    # soup_name is kept around to extract the book title later
    soup_name = soup
    # Extract the download-verification URL
    soup = soup.find('div', 'col-md-6 text-center')
    soup = soup.find('a')
    soup = soup.get('href')
    # Build the POST URL that passes the verification
    # (found with the browser developer tools; the number after
    # verification is the code obtained via the site's WeChat account)
    download_url = 'https://www.daocaorenshuwu.com' + soup + '&verification=12022018'
    # Extract the novel title, used as the file name
    soup_name = soup_name.find('a', 'media-right')
    soup_name = soup_name.find('img')
    novel_name = soup_name.get('alt')
    # path is the full path the novel is saved to
    path = root + novel_name + '.zip'
    # Download the novel
    print(novel_name + '下载中')
    if not os.path.exists(root):
        os.mkdir(root)
    if not os.path.exists(path):
        res = requests.post(url=download_url, headers=headers)
        if res.status_code == 200 and res.text[:5] != '验证码错误':
            with open(path, 'wb') as f:
                f.write(res.content)
            print(novel_name + '下载完成')
```
## Full-site download source code
```python
#!/usr/bin/python3
import requests
from bs4 import BeautifulSoup
import os
import time


def get_writer_list(writer_url):
    # Collect author names and author-page URLs from one listing page
    r = s.get(url=writer_url, headers=headers)
    r.encoding = r.apparent_encoding
    soup = BeautifulSoup(r.text, 'lxml')
    list1 = soup.find_all('div', 'col-md-2 col-sm-2 col-xs-4 b10')
    for ele in list1:
        url = 'https:' + ele.a['href']
        name = ele.a.img['title']
        print('收集作者--' + '*' + name + '*' + '的信息中')
        writer_lists[name] = url


def get_introduce(url):
    # Starting from the author page, reach the page that lists this author's downloads
    # First take the URL of one of the author's books
    r = s.get(url=url, headers=headers)
    r.encoding = r.apparent_encoding
    soup = BeautifulSoup(r.text, 'lxml')
    url1 = 'https://www.daocaorenshuwu.com' + \
        soup.find('div', 'col-md-12 mb10').a['href']
    # Then find that book's download-introduction page
    res = s.get(url=url1, headers=headers)
    res.encoding = res.apparent_encoding
    soup = BeautifulSoup(res.text, 'lxml')
    url2 = 'https://www.daocaorenshuwu.com' + \
        soup.find('a', 'hidden').get('href')
    return url2


def get_download_url(url2):
    # Collect the final download URLs and the novel titles
    r = s.get(url=url2, headers=headers)
    r.encoding = r.apparent_encoding
    soup = BeautifulSoup(r.text, 'lxml')
    list1 = []
    soup = soup.find_all('div', 'col-md-6 col-xs-12')
    # Introduction-page URL of every novel by this author
    for i in range(len(soup)):
        complete_url = 'https://www.daocaorenshuwu.com' + soup[i].find('a').get('href')
        list1.append(complete_url)
    novel_names = []
    download_urls = []
    for url in list1:
        try:
            # Extract the book title and build the final download URL
            r = s.get(url=url.rstrip(), headers=headers)
            r.encoding = r.apparent_encoding
            soup = BeautifulSoup(r.text, 'lxml')
            soup_name = soup
            download_url = 'https://www.daocaorenshuwu.com' + soup.find('div', 'col-md-6 text-center').a.get('href')
            download_urls.append(download_url)
            soup_name = soup_name.find('a', 'media-right')
            novel_name = soup_name.find('img').get('alt')
            novel_names.append(novel_name)
        except Exception:
            continue
    return download_urls, novel_names


def main():
    # Collect all author listing pages
    for i in range(1, 61):
        short_url = 'https://www.daocaorenshuwu.com/writer/list_81_' + str(i) + '.html'
        writer_urls.append(short_url)
    for writer_url in writer_urls:
        get_writer_list(writer_url)
    print('作者信息收集完毕!' + '*' * 100)
    num = 0
    # For every author, collect the novel titles and download URLs, then download
    for name, url in writer_lists.items():
        download_urls = []
        novel_names = []
        url2 = get_introduce(url)
        download_urls, novel_names = get_download_url(url2)
        path = root + name + '/'
        # One folder per author
        if not os.path.exists(path):
            try:
                os.mkdir(path)
            except Exception:
                continue
        for i in range(len(novel_names)):
            # Strip characters that are not allowed in file names
            novel_names[i] = novel_names[i].replace('/', '-').replace('[', '').replace(']', '')
            full_path = path + novel_names[i] + '.zip'
            if not os.path.exists(full_path):
                # Download the book
                try:
                    if num == 0:
                        # The verification code is only appended to the first
                        # download request of the run
                        download_urls[i] = download_urls[i] + '&verification=' + secret
                        num += 1
                    print(novel_names[i] + '下载中')
                    start = time.time()
                    r = s.post(download_urls[i], headers=headers)
                    if r.status_code == 200 and r.text[:5] != '验证码错误':
                        with open(full_path, 'wb') as f:
                            f.write(r.content)
                        end = time.time()
                        print(novel_names[i] + '下载完毕,耗时' + str(end - start) + '秒')
                except Exception:
                    print('下载出错,跳过')
                    continue


if __name__ == '__main__':
    print(r'╭━╮╭━╮╱╱╱╱╭╮╱╱╱╭╮╱╱╱╱╱╱╭━━━╮╭━━━╮\
┃┃╰╯┃┃╱╱╱╱┃┃╱╱╱┃┃╱╱╱╱╱╱┃╭━━╯┃╭━╮┃╱╱╱╱╱╱╱╱╱╭╮\
┃╭╮╭╮┣━━┳━╯┣━━╮┃╰━┳╮╱╭╮┃╰━━╮╰╯╭╯┃╭━━╮╭━━╮╱╰╯╭╮╭━━╮\
┃┃┃┃┃┃╭╮┃╭╮┃┃━┫┃╭╮┃┃╱┃┃╰━━╮┃╭━╯╭╯┃╭╮┃┃╭╮┃╱╭╮┣┫┃┃━┫\
┃┃┃┃┃┃╭╮┃╰╯┃┃━┫┃╰╯┃╰━╯┃╭━━╯┃┃┃╰━╮┃╰╯┃┃╰╯┃╱┃┃┃┃┃┃━┫\
╰╯╰╯╰┻╯╰┻━━┻━━╯╰━━┻━╮╭╯╰━━━╯╰━━━╯┃╭━╯╰━━╯╱┃┃╰╯╰━━╯\
╱╱╱╱╱╱╱╱╱╱╱╱╱╱╱╱╱╱╭━╯┃╱╱╱╱╱╱╱╱╱╱╱┃┃╱╱╱╱╱╱╭╯┃\
╱╱╱╱╱╱╱╱╱╱╱╱╱╱╱╱╱╱╰━━╯╱╱╱╱╱╱╱╱╱╱╱╰╯╱╱╱╱╱╱╰━╯\
╭━━┳╮╭┳╮╭━━━┳━━┳╮╭━━┳╮╭┳╮╭━━┳╮╭┳╮\
┃╭╮┃┃┃┣┫┣━━┃┃╭╮┣┫┃╭╮┃┃┃┣┫┃╭╮┃┃┃┣┫\
┃╰╯┃╰╯┃┃┃┃━━┫╭╮┃┃┃╰╯┃╰╯┃┃┃╰╯┃╰╯┃┃\
╰━╮┣━━┻╯╰━━━┻╯╰┻╯╰━╮┣━━┻╯╰━╮┣━━┻╯\
╭━╯┃╱╱╱╱╱╱╱╱╱╱╱╱╱╭━╯┃╱╱╱╱╭━╯┃\
╰━━╯╱╱╱╱╱╱╱╱╱╱╱╱╱╰━━╯╱╱╱╱╰━━╯')
print("此工具为稻草人书屋小说站实用下载工具Mac/Linux版\n")
headers = {"User-Agent":"Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 \
(KHTML, like Gecko) Chrome/76.0.3809.87 Safari/537.36",
"Referer":"https://www.daocaorenshuwu.com/"}
print('**自动下载稻草人书屋所有作者作品,并按作者进行分类**')
print('**最好挂代{过}{滤}理/或者在海外服务器上跑不然时间可能比较长**')
print('**切勿用于非法目的和商业用途,违者后果自负**')
root = input(r"请输入你想文件保存的根目录如(/Users/bobmaster/Documents): ")
secret = input('请输入小说站下载授权密码,(默认为12022018)后期如有失效,自行获取: ')
#用于ip被封之后的解决方式
#proxies = {
# 'http': 'socks5://user:pass@host:port',
# 'https': 'socks5://user:pass@host:port',
#}
if (secret == ''):
secret = str(12022018)
s = requests.Session()
writer_lists = {}
writer_urls = []
main()
```
**For the full-site script, running on an overseas server is recommended; downloading from inside China is much slower.**
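If an overseas server isn't an option, the commented-out `proxies` block in the full-site script hints at the alternative: route the requests session through a proxy. A minimal sketch, assuming a local SOCKS5 proxy (the address is a placeholder; SOCKS URLs need the `requests[socks]` extra installed):

```python
import requests

# Placeholder proxy address; replace with your own.
# SOCKS5 support requires: pip install requests[socks]
proxies = {
    'http': 'socks5://127.0.0.1:1080',
    'https': 'socks5://127.0.0.1:1080',
}

s = requests.Session()
s.proxies.update(proxies)  # every request made through this session now goes via the proxy
# r = s.get('https://www.daocaorenshuwu.com/', timeout=10)
```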
## Screenshots
![](https://fly.myroad.fun/picture/forum/total_site/Snipaste_2020-02-25_20-06-13.png)
![](https://fly.myroad.fun/picture/forum/total_site/Snipaste_2020-02-26_18-21-37.png)
![](https://fly.myroad.fun/picture/forum/total_site/Snipaste_2020-02-26_18-23-15.png)
**Download demo on a server**
![](https://fly.myroad.fun/picture/forum/total_site/Snipaste_2020-02-26_17-47-23.png)
![](https://fly.myroad.fun/picture/forum/total_site/Snipaste_2020-02-26_18-32-05.png)
Did the OP learn Python from 风变编程? The BeautifulSoup matching could be a bit more concise. I'm also just learning; BS4 used to scare me half to death, but now I know it has simpler usage. Here's my version for the OP's reference:
```python
# -*- coding:utf-8 -*-
import requests as CC
from bs4 import BeautifulSoup

headers = {
    "User-Agent": "Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/76.0.3809.87 Safari/537.36",
    "Referer": "https://www.daocaorenshuwu.com/"
}


def Down_zip(name, url):
    # Download the zip archive and save it under the book's name
    date = CC.get(url)
    print(date.url)
    if date.status_code == 200:
        zip_dat = date.content
        f = open(name + '.zip', 'wb')
        f.write(zip_dat)
        f.close()
        print(name, '下载完成!')


def Fomat_url(url):
    # Rebuild the final download URL, appending the verification code
    r1 = CC.get(url, headers=headers)
    r1.encoding = r1.apparent_encoding
    soup1 = BeautifulSoup(r1.text, 'lxml')
    d_url = 'https://www.daocaorenshuwu.com' + soup1.find('div', class_='col-md-6 text-center').a['href'] + '&verification=12022018'
    return d_url


def main():
    # user_url = input('请粘贴网址:')  # e.g. 'https://www.daocaorenshuwu.com/txt/10182.html'
    try:
        r = CC.get('https://www.daocaorenshuwu.com/txt/10182.html', headers=headers)
        r.encoding = r.apparent_encoding
        soup = BeautifulSoup(r.text, 'lxml')
        list1 = soup.find_all('div', class_='col-md-6 col-xs-12')  # one entry per book link
        for i in list1:
            name = i.string
            url_list = 'https://www.daocaorenshuwu.com' + i.a['href']
            d_url = Fomat_url(url_list)
            Down_zip(name, d_url)
    except:
        print('下载失败!')


if __name__ == '__main__':
    main()
```

If you open files with `with`, you don't need to call close() (a short example is at the bottom of the thread).

Thanks for sharing, I'll take a look and learn from it.

Thanks OP, learning from this.

Looks impressive~~

Thanks, still learning.
Thanks for sharing, the code is very concise and worth learning from.

Binge-reading novels gets addictive before you know it. Thanks for the technical share.

Thanks, I'll study it.

I'm just about to start learning Python; is the OP's code for 2.x or 3.x?
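On the `with` suggestion in the replies above: a `with` block closes the file automatically when the block exits, even if the write raises, so the explicit `close()` can be dropped. A tiny sketch using the names from the scripts above:

```python
# The file is closed automatically when the block exits, even on error.
with open(novel_name + '.zip', 'wb') as f:
    f.write(res.content)
```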