Code Share
I've recently been learning Python web scraping, so I tried batch-downloading novels from the Daocaoren Shuwu novel site. The source code is posted below, for learning and exchange only.
Thanks to post #19 for the suggestion.
Packaged as executables (Windows, macOS, Linux): https://www.52pojie.cn/thread-1115415-1-1.html
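(If you want to build an executable from the source yourself, a tool such as PyInstaller can do it. This is only a suggestion, not necessarily how the posted builds were made, and the script name is a placeholder: pyinstaller -F downloader.py)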
Possible errors
AttributeError: 'NoneType' object has no attribute 'get'
Just run the program again. This error happens when, for network reasons, a page fails to download or comes back incomplete, so the expected element cannot be parsed and the lookup returns None.
It may also mean your IP has been restricted because you tripped one of the site's anti-scraping mechanisms; wait a while before scraping again!
I scraped too aggressively myself and got banned:
requests.exceptions.ConnectionError: ('Connection aborted.', RemoteDisconnected('Remote end closed connection without response',))
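If you'd rather not restart the whole run by hand after such an error, one workaround is to retry the page fetch a few times with a timeout before giving up. A minimal sketch, modelled on the getHtml() helper in the updated source below; it assumes the same global headers as the script, and the retry count, delay and timeout values are arbitrary:

import time
import requests

def getHtml(url, encoding='utf-8', retries=3, delay=5):
    # Retry a few times so one flaky page does not make the whole
    # run die later with a NoneType error
    for attempt in range(retries):
        try:
            r = requests.get(url=url, headers=headers, timeout=10)
            r.encoding = encoding
            return r.text
        except requests.exceptions.RequestException:
            time.sleep(delay)
    return ''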
2020/4/7: updated source code
#!/usr/bin/python3
import requests
from lxml import etree
import os


def getHtml(url, encoding='utf-8'):
    r = requests.get(url=url, headers=headers)
    r.encoding = encoding
    return r.text


def parseHtml(source):
    global author
    ehtml = etree.HTML(source)
    # Book titles, introduction-page URLs and the author name on the page
    booknames = ehtml.xpath('//*[@class="col-md-6 col-xs-12"]/a/text()')
    urls = ehtml.xpath('//*[@class="col-md-6 col-xs-12"]/a/@href')
    author = ehtml.xpath('//*[@class="col-md-12"][1]/text()')[0][3:]
    # The extracted URLs are relative, so prepend the site root
    for num in range(len(urls)):
        urls[num] = 'https://www.daocaorenshuwu.com' + urls[num]
    return booknames, urls


def downloader():
    for url in urls:
        r = getHtml(url, encoding='utf-8')
        ehtml = etree.HTML(r)
        download_url = 'https://www.daocaorenshuwu.com' + \
            ehtml.xpath('//*[@class="col-md-6 text-center"]/a[1]/@href')[0]
        download_urls.append(download_url)
    if not os.path.exists(root):
        os.mkdir(root)
    root2 = root + author
    if not os.path.exists(root2):
        os.mkdir(root2)
    for index in range(len(booknames)):
        # Strip characters that are awkward or illegal in file names
        booknames[index] = booknames[index].replace('/', '-')
        booknames[index] = booknames[index].replace('[', '')
        booknames[index] = booknames[index].replace(']', '')
        fullpath = root2 + '/' + booknames[index] + '.zip'
        if not os.path.exists(fullpath):
            print(booknames[index] + '下载中')
            final_link = download_urls[index] + '&verifications=' + verification_code
            try:
                res = requests.post(url=final_link, headers=headers)
                if res.status_code == 200 and res.text[:5] != '验证码错误':
                    with open(fullpath, 'wb') as f:
                        f.write(res.content)
                    print(booknames[index] + '下载完成')
                else:
                    print('下载出错请检查验证码!!!')
            except Exception:
                print('下载出错')
                continue


if __name__ == '__main__':
    headers = {'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Ubuntu Chromium/80.0.3987.149 Chrome/80.0.3987.149 Safari/537.36'}
    download_urls = []
    target_url = input("请输入你要下载的链接,要满足这种形式https://www.daocaorenshuwu.com/txt/8015.html\n")
    verification_code = input("请输入验证码:\n")
    root = input(r"请输入内容保存位置如(/home/hello_world/Desktop/): ")
    if root[-1] != '/':
        root = root + '/'
    source = getHtml(target_url)
    booknames, urls = parseHtml(source)
    downloader()
Old source code
import requests
from bs4 import BeautifulSoup
import os

s = requests.Session()
# root is the directory the novels are saved under
root = r'D:\\E-book\\literature\\'
# Pretend to be a normal browser
headers = {"User-Agent": "Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 \
(KHTML, like Gecko) Chrome/76.0.3809.87 Safari/537.36",
           "Referer": "https://www.daocaorenshuwu.com/"}
# Starting from one novel's introduction page, collect the introduction-page
# URLs of every work by the same author
# url = 'https://www.daocaorenshuwu.com/txt/11205.html'  Replace the URL below yourself; it must follow this format
r = s.get('https://www.daocaorenshuwu.com/txt/10182.html', headers=headers)
r.encoding = r.apparent_encoding
# Parse the page with BeautifulSoup
soup = BeautifulSoup(r.text, 'lxml')
# Extract the introduction-page URLs of all works by this author
soup = soup.find_all('div', 'col-md-6 col-xs-12')
# Collect the introduction-page URLs in a list
introduction_url = []
for i in range(len(soup)):
    soup[i] = soup[i].find('a')
    # The extracted URLs are relative paths,
    # so convert them to absolute URLs
    part_url = soup[i].get('href')
    complete_url = 'https://www.daocaorenshuwu.com' + part_url
    introduction_url.append(complete_url)
# Visit each introduction page in the list in turn
# to get the address of the download-verification page
for url in introduction_url:
    # rstrip() removes the trailing \n from the URL
    r = s.get(url=url.rstrip(), headers=headers)
    r.encoding = r.apparent_encoding
    # Parse the page with BeautifulSoup
    soup = BeautifulSoup(r.text, 'lxml')
    # soup_name is kept to extract the book title later
    soup_name = soup
    # Extract the verification URL of the novel's download page
    soup = soup.find('div', 'col-md-6 text-center')
    soup = soup.find('a')
    soup = soup.get('href')
    # Build a POST URL that passes the verification check
    # (worked out with the browser developer tools);
    # the number after verification is the code you get by following the site's WeChat account
    download_url = 'https://www.daocaorenshuwu.com' + soup + '&verification=12022018'
    # Extract the novel title for the file name
    soup_name = soup_name.find('a', 'media-right')
    soup_name = soup_name.find('img')
    novel_name = soup_name.get('alt')
    # path is the full save path of the novel
    path = root + novel_name + '.zip'
    # Download the novel
    print(novel_name + '下载中')
    if not os.path.exists(root):
        os.mkdir(root)
    if not os.path.exists(path):
        res = requests.post(url=download_url, headers=headers)
        if res.status_code == 200 and res.text[:5] != '验证码错误':
            with open(path, 'wb') as f:
                f.write(res.content)
            print(novel_name + '下载完成')
Full-site download source code
#!/usr/bin/python3
import requests
from bs4 import BeautifulSoup
import os
import time


# Collect author information
def get_writer_list(writer_url):
    # Extract the author entries on one listing page
    r = s.get(url=writer_url, headers=headers)
    r.encoding = r.apparent_encoding
    soup = BeautifulSoup(r.text, 'lxml')
    list1 = soup.find_all('div', 'col-md-2 col-sm-2 col-xs-4 b10')
    for ele in list1:
        url = 'https:' + ele.a['href']
        name = ele.a.img['title']
        print('收集作者--' + '*' + name + '*' + '的信息中')
        writer_lists[name] = url


# This step reaches a novel's download-introduction page
def get_introduce(url):
    # Take the URL of one of this author's books
    r = s.get(url=url, headers=headers)
    r.encoding = r.apparent_encoding
    soup = BeautifulSoup(r.text, 'lxml')
    url1 = 'https://www.daocaorenshuwu.com' + \
        soup.find('div', 'col-md-12 mb10').a['href']
    # Find the download-introduction page of that book
    res = s.get(url=url1, headers=headers)
    res.encoding = res.apparent_encoding
    soup = BeautifulSoup(res.text, 'lxml')
    url2 = 'https://www.daocaorenshuwu.com' + \
        soup.find('a', 'hidden').get('href')
    return url2


# This step obtains the final download links
def get_download_url(url2):
    r = s.get(url=url2, headers=headers)
    r.encoding = r.apparent_encoding
    soup = BeautifulSoup(r.text, 'lxml')
    list1 = []
    soup = soup.find_all('div', 'col-md-6 col-xs-12')
    # Extract the download addresses of all of this author's novels
    for i in range(len(soup)):
        complete_url = 'https://www.daocaorenshuwu.com' + soup[i].find('a').get('href')
        list1.append(complete_url)
    novel_names = []
    download_urls = []
    for url in list1:
        try:
            # Extract the book title
            r = s.get(url=url.rstrip(), headers=headers)
            r.encoding = r.apparent_encoding
            soup = BeautifulSoup(r.text, 'lxml')
            soup_name = soup
            # Build the final download URL
            download_url = 'https://www.daocaorenshuwu.com' + soup.find('div', 'col-md-6 text-center').a.get('href')
            download_urls.append(download_url)
            soup_name = soup_name.find('a', 'media-right')
            novel_name = soup_name.find('img').get('alt')
            novel_names.append(novel_name)
        except Exception:
            continue
    return download_urls, novel_names


def main():
    # Collect the information of every author (60 listing pages)
    for i in range(1, 61):
        short_url = 'https://www.daocaorenshuwu.com/writer/list_81_' + str(i) + '.html'
        writer_urls.append(short_url)
    for writer_url in writer_urls:
        get_writer_list(writer_url)
    print('作者信息收集完毕!' + '*' * 100)
    num = 0
    # From each author's page, extract the URLs of their works and the download URLs
    for name, url in writer_lists.items():
        download_urls = []
        novel_names = []
        url2 = get_introduce(url)
        download_urls, novel_names = get_download_url(url2)
        path = root + name + '/'
        # Create a separate folder for each author
        if not os.path.exists(path):
            try:
                os.mkdir(path)
            except Exception:
                continue
        for i in range(len(novel_names)):
            # Strip characters that are not allowed in file names
            novel_names[i] = novel_names[i].replace('/', '-')
            novel_names[i] = novel_names[i].replace('[', '')
            novel_names[i] = novel_names[i].replace(']', '')
            full_path = path + novel_names[i] + '.zip'
            if not os.path.exists(full_path):
                # Download the book
                try:
                    # The verification code only needs to be sent with the first request
                    if num == 0:
                        download_urls[i] = download_urls[i] + '&verification=' + secret
                        num += 1
                    print(novel_names[i] + '下载中')
                    start = time.time()
                    r = s.post(download_urls[i], headers=headers)
                    if r.status_code == 200 and r.text[:5] != '验证码错误':
                        with open(full_path, 'wb') as f:
                            f.write(r.content)
                        end = time.time()
                        print(novel_names[i] + '下载完毕,耗时' + str(end - start) + '秒')
                except Exception:
                    print('下载出错,跳过')
                    continue


if __name__ == '__main__':
    print(r'''╭━╮╭━╮╱╱╱╱╭╮╱╱╱╭╮╱╱╱╱╱╱╭━━━╮╭━━━╮
┃┃╰╯┃┃╱╱╱╱┃┃╱╱╱┃┃╱╱╱╱╱╱┃╭━━╯┃╭━╮┃╱╱╱╱╱╱╱╱╱╭╮
┃╭╮╭╮┣━━┳━╯┣━━╮┃╰━┳╮╱╭╮┃╰━━╮╰╯╭╯┃╭━━╮╭━━╮╱╰╯╭╮╭━━╮
┃┃┃┃┃┃╭╮┃╭╮┃┃━┫┃╭╮┃┃╱┃┃╰━━╮┃╭━╯╭╯┃╭╮┃┃╭╮┃╱╭╮┣┫┃┃━┫
┃┃┃┃┃┃╭╮┃╰╯┃┃━┫┃╰╯┃╰━╯┃╭━━╯┃┃┃╰━╮┃╰╯┃┃╰╯┃╱┃┃┃┃┃┃━┫
╰╯╰╯╰┻╯╰┻━━┻━━╯╰━━┻━╮╭╯╰━━━╯╰━━━╯┃╭━╯╰━━╯╱┃┃╰╯╰━━╯
╱╱╱╱╱╱╱╱╱╱╱╱╱╱╱╱╱╱╭━╯┃╱╱╱╱╱╱╱╱╱╱╱┃┃╱╱╱╱╱╱╭╯┃
╱╱╱╱╱╱╱╱╱╱╱╱╱╱╱╱╱╱╰━━╯╱╱╱╱╱╱╱╱╱╱╱╰╯╱╱╱╱╱╱╰━╯
╭━━┳╮╭┳╮╭━━━┳━━┳╮╭━━┳╮╭┳╮╭━━┳╮╭┳╮
┃╭╮┃┃┃┣┫┣━━┃┃╭╮┣┫┃╭╮┃┃┃┣┫┃╭╮┃┃┃┣┫
┃╰╯┃╰╯┃┃┃┃━━┫╭╮┃┃┃╰╯┃╰╯┃┃┃╰╯┃╰╯┃┃
╰━╮┣━━┻╯╰━━━┻╯╰┻╯╰━╮┣━━┻╯╰━╮┣━━┻╯
╭━╯┃╱╱╱╱╱╱╱╱╱╱╱╱╱╭━╯┃╱╱╱╱╭━╯┃
╰━━╯╱╱╱╱╱╱╱╱╱╱╱╱╱╰━━╯╱╱╱╱╰━━╯''')
    print("此工具为稻草人书屋小说站实用下载工具Mac/Linux版\n")
    headers = {"User-Agent": "Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 \
(KHTML, like Gecko) Chrome/76.0.3809.87 Safari/537.36",
               "Referer": "https://www.daocaorenshuwu.com/"}
    print('**自动下载稻草人书屋所有作者作品,并按作者进行分类**')
    print('**最好挂代理/或者在海外服务器上跑不然时间可能比较长**')
    print('**切勿用于非法目的和商业用途,违者后果自负**')
    root = input(r"请输入你想文件保存的根目录如(/Users/bobmaster/Documents): ")
    # Make sure the root directory ends with '/' (same fix as in the updated script)
    if root[-1] != '/':
        root = root + '/'
    secret = input('请输入小说站下载授权密码,(默认为12022018)后期如有失效,自行获取: ')
    # Workaround for when your IP has been banned
    # proxies = {
    #     'http': 'socks5://user:pass@host:port',
    #     'https': 'socks5://user:pass@host:port',
    # }
    if secret == '':
        secret = str(12022018)
    s = requests.Session()
    writer_lists = {}
    writer_urls = []
    main()
For the full-site script, I recommend running it on an overseas server; from mainland China it is very slow.
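If you have to run it locally, another option is to route the session through the commented-out proxies dict in the source. A minimal sketch, assuming you have a working SOCKS5 proxy and have installed the SOCKS extra (pip install requests[socks]); the user/pass/host/port values are placeholders you need to fill in:

import requests

proxies = {
    'http': 'socks5://user:pass@host:port',
    'https': 'socks5://user:pass@host:port',
}
s = requests.Session()
s.proxies.update(proxies)  # every s.get()/s.post() on this session now goes through the proxy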
Screenshots
Server download demo