Python: crawling an ebook site and transferring the downloads to Baidu Netdisk
Book site: http://mebook.cc/
For the Baidu Netdisk transfer part I went through a lot of code samples online, but most of them no longer work, so I had to analyze the process myself; it took the better part of a day to get it working.
First, a screenshot of the result:
The code comes in two parts: mebook.py crawls the book pages, and Commonly.py handles saving to Baidu Netdisk. Put both files in the same folder. They are kept as separate modules so Commonly.py can be reused later (see the standalone example after Commonly.py below).
mebook.py
import requests, re
from lxml import etree
import Commonly


def gethtml(url, encode):  # fetch the page source
    global headers
    headers = {
        'accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8',
        'user-agent': Commonly.get_user_agent_pc()
    }
    r = requests.get(url, headers=headers)
    if r.status_code != 200:
        print('Error', url)
    else:
        r.encoding = encode
        return r.text


if __name__ == '__main__':
    # BDUSS    = taken from your Baidu cookies
    # STOKEN   = taken from your Baidu cookies
    # bdstoken = taken from your Baidu cookies
    BDUSS = ''
    STOKEN = ''
    bdstoken = ''
    savepath = 'mybook'
    url = 'http://mebook.cc/'
    html = gethtml(url, 'utf-8')
    ehtml = etree.HTML(html)
    MenuUrlList = ehtml.xpath('//*[@class="sub-menu"]/li/a/@href')
    MenuTitle = ehtml.xpath('//*[@class="sub-menu"]/li/a/text()')
    # print the category menu, three entries per line
    tmp = ''
    c = 0
    for i in range(0, len(MenuUrlList)):
        if len(str(i)) == 1:
            id = ' ' + str(i)
        else:
            id = str(i)
        if c < 2:
            tmp = tmp + 'ID:' + id + ' ' + MenuTitle[i] + '\t\t'
            c += 1
        else:
            print(tmp + 'ID:' + id + ' ' + MenuTitle[i] + '\t\t')
            tmp = ''
            c = 0
    print(tmp)
    val = input('请输入需要获取分类ID:')
    html = gethtml(MenuUrlList[int(val)], 'utf-8')
    page = re.findall('共 \d+ 页', html)[0].replace('共 ', '').replace(' 页', '')
    for i in range(1, int(page) + 1):
        if i == 1:
            html = gethtml(MenuUrlList[int(val)], 'utf-8')
        else:
            html = gethtml(MenuUrlList[int(val)] + '/page/' + str(i), 'utf-8')
        ehtml = etree.HTML(html)
        listurl = ehtml.xpath('//*[@class="list"]/li/div/h2/a/@href')
        count = 1
        for j in listurl:
            zhtml = gethtml(j, 'utf-8')
            ezhtml = etree.HTML(zhtml)
            downlink = ''.join(ezhtml.xpath('//*[@class="downbtn"]/@href'))
            bookname = ''.join(ezhtml.xpath('//*[@class="downbtn"]/@title'))
            try:
                thtml = gethtml(downlink, 'utf-8')
                try:
                    bdpsw = re.findall('百度网盘密码:[ ]{0,1}\w{4}', thtml)[0].replace('百度网盘密码:', '').replace(' ', '')
                    downurl = re.findall('https{0,1}://pan\.baidu\.com/.*?"', thtml)[0].replace('"', '')
                    print('共【' + page + '】页/当前第【' + str(i) + '】页/第【' + str(count) + '】本, ' + bookname + ' 百度网盘分享链接:' + downurl + ' 提取码:' + bdpsw + ' ' + Commonly.bdsave(downurl, bdpsw, savepath, BDUSS, STOKEN, bdstoken))
                    count += 1
                except:
                    pass
            except:
                pass
Commonly.py
import random, requests, re
# PC user-agent strings for request headers
user_agent_pc = [
    # Chrome
    'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/39.0.2171.71 Safari/537.36',
    'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.11 (KHTML, like Gecko) Chrome/23.0.1271.64 Safari/537.11',
    'Mozilla/5.0 (Windows; U; Windows NT 6.1; en-US) AppleWebKit/534.16 (KHTML, like Gecko) Chrome/10.0.648.133 Safari/534.16',
    # Firefox
    'Mozilla/5.0 (Windows NT 6.1; WOW64; rv:34.0) Gecko/20100101 Firefox/34.0',
    'Mozilla/5.0 (X11; U; Linux x86_64; zh-CN; rv:1.9.2.10) Gecko/20100922 Ubuntu/10.10 (maverick) Firefox/3.6.10',
    # Opera
    'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/39.0.2171.95 Safari/537.36 OPR/26.0.1656.60',
    # QQ Browser
    'Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; WOW64; Trident/5.0; SLCC2; .NET CLR 2.0.50727; .NET CLR 3.5.30729; .NET CLR 3.0.30729; Media Center PC 6.0; .NET4.0C; .NET4.0E; QQBrowser/7.0.3698.400)',
    # Sogou Browser
    'Mozilla/5.0 (Windows NT 5.1) AppleWebKit/535.11 (KHTML, like Gecko) Chrome/17.0.963.84 Safari/535.11 SE 2.X MetaSr 1.0',
    # 360 Browser
    'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/30.0.1599.101 Safari/537.36',
    'Mozilla/5.0 (Windows NT 6.1; WOW64; Trident/7.0; rv:11.0) like Gecko',
    # UC Browser
    'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/38.0.2125.122 UBrowser/4.0.3214.0 Safari/537.36',
]
# pick a random PC user-agent
def get_user_agent_pc():
    return random.choice(user_agent_pc)
# transfer a Baidu Netdisk share into your own netdisk
# furl     = share link
# verify   = extraction code
# savepath = target directory in your netdisk
# BDUSS    = taken from your Baidu cookies
# STOKEN   = taken from your Baidu cookies
# bdstoken = taken from your Baidu cookies
def bdsave(furl, verify, savepath, BDUSS, STOKEN, bdstoken):
    s = requests.Session()
    s.cookies['BDUSS'] = BDUSS
    s.cookies['STOKEN'] = STOKEN
    surl = furl.split('/')[-1]
    headers = {
        'User-Agent': get_user_agent_pc(),
        'Referer': 'https://pan.baidu.com/share/init?surl=' + surl
    }
    req = s.get(furl, headers=headers)
    req.encoding = 'utf-8'
    if '侵权、色情、反动、低俗' in req.text:
        return '此链接分享内容可能因为涉及侵权、色情、反动、低俗等信息,无法访问!'
    else:
        purl = 'https://pan.baidu.com/api/report/user?channel=chunlei&web=1&app_id=250528&bdstoken=' + bdstoken + '&logid=MTU2NTI1MzM3OTY2MDAuMjI4MDU3NjY2NTk5MDkwODg=&clienttype=0'
        data = {'timestamp': '1565244533', 'action': 'web_home'}
        req = s.post(purl, data=data, headers=headers)
        # submit the extraction code
        purl = 'https://pan.baidu.com/share/verify?surl=' + surl + '&t=1565244999152&channel=chunlei&web=1&app_id=250528&bdstoken=' + bdstoken + '&logid=MTU2NTI1MzM3OTY2MDAuMjI4MDU3NjY2NTk5MDkwODg=&clienttype=0'
        data = {'pwd': verify, 'vcode': '', 'vcode_str': ''}
        req = s.post(purl, data=data, headers=headers)
        rinfo = re.findall('"errno":[-]{0,1}\d+', req.text)[0].replace('"errno":', '')
        if rinfo == '-12':
            return '提取码错误。'
        else:
            # re-request the share page; it now exposes shareid, uk, fs_id and app_id
            req = s.get(furl, headers=headers)
            req.encoding = 'utf-8'
            shareid = re.findall('"shareid":\d+', req.text)[0].replace('"shareid":', '')
            uk = re.findall('uk=\d+', req.text)[0].replace('uk=', '')
            fsidlist = ','.join(re.findall('"fs_id":\d+', req.text)).replace('"fs_id":', '')
            app_id = re.findall('"app_id":"\d+"', req.text)[0].replace('"app_id":', '').replace('"', '')
            # print('shareid:', shareid, 'uk:', uk, 'fs_id:', fsidlist, 'app_id:', app_id)
            purl = 'https://pan.baidu.com/share/transfer?shareid=' + shareid + '&from=' + uk + '&channel=chunlei&web=1&app_id=' + app_id + '&bdstoken=' + bdstoken + '&logid=MTU2NTI1MzM3OTY2MDAuMjI4MDU3NjY2NTk5MDkwODg=&clienttype=0'
            data = {
                'fsidlist': '[' + fsidlist + ']',
                'path': '/' + savepath
            }
            req = s.post(purl, data=data, headers=headers)
            req.encoding = 'utf-8'
            try:
                zcinfo = re.findall('\[\{"errno":[-]{0,1}\d+', req.text)[0].replace('[{"errno":', '')
            except:
                return 'Cookies失效,请更新BDUSS、STOKEN、bdstoken后再试!'
            else:
                info = {
                    "0": "转存成功。",
                    "-1": "由于您分享了违反相关法律法规的文件,分享功能已被禁用,之前分享出去的文件不受影响。",
                    "-2": "用户不存在,请刷新页面后重试。",
                    "-3": "文件不存在,请刷新页面后重试。",
                    "-4": "登录信息有误,请重新登录试试。",
                    "-5": "host_key和user_key无效。",
                    "-6": "请重新登录。",
                    "-7": "该分享已删除或已取消。",
                    "-8": "该分享已经过期。",
                    "-9": "访问密码错误。",
                    "-10": "分享外链已经达到最大上限100000条,不能再次分享。",
                    "-11": "验证cookie无效。",
                    "-14": "对不起,短信分享每天限制20条,你今天已经分享完,请明天再来分享吧!",
                    "-15": "对不起,邮件分享每天限制20封,你今天已经分享完,请明天再来分享吧!",
                    "-16": "对不起,该文件已经限制分享!",
                    "-17": "文件分享超过限制。",
                    "-21": "预置文件无法进行相关操作。",
                    "-30": "文件已存在。",
                    "-31": "文件保存失败。",
                    "-33": "一次支持操作999个,减点试试吧。",
                    "-32": "未知结果。",
                    "-70": "你分享的文件中包含病毒或疑似病毒,为了你和他人的数据安全,换个文件分享吧。",
                    "2": "参数错误。",
                    "3": "未登录或帐号无效。",
                    "4": "存储好像出问题了,请稍候再试。",
                    "108": "文件名有敏感词,优化一下吧。",
                    "110": "分享次数超出限制,可以到“我的分享”中查看已分享的文件链接。",
                    "114": "当前任务不存在,保存失败。",
                    "115": "该文件禁止分享。",
                    "112": '页面已过期,请刷新后重试。',
                    "9100": '你的帐号存在违规行为,已被冻结。',
                    "9200": '你的帐号存在违规行为,已被冻结。',
                    "9300": '你的帐号存在违规行为,该功能暂被冻结。',
                    "9400": '你的帐号异常,需验证后才能使用该功能。',
                    "9500": '你的帐号存在安全风险,已进入保护模式,请修改密码后使用。'
                }
                # map Baidu's errno to its message; fall back to the raw code for anything unlisted
                return info.get(zcinfo, '未知错误,errno:' + zcinfo)
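Since Commonly.py is a standalone module, bdsave can also be called on its own when you only want to transfer a single share link. A minimal sketch follows; the share link, extraction code and cookie values are placeholders you have to fill in yourself:

import Commonly

# placeholders: fill in your own cookie values
BDUSS = ''
STOKEN = ''
bdstoken = ''

# transfer one share into the /mybook folder of your netdisk
result = Commonly.bdsave('https://pan.baidu.com/s/1xxxxxxx',  # share link (placeholder)
                         'abcd',                              # extraction code (placeholder)
                         'mybook',                            # target folder
                         BDUSS, STOKEN, bdstoken)
print(result)  # prints the message mapped from Baidu's errno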
祖高域 posted at 2019-8-8 21:50
Nice work, though wouldn't a Tampermonkey script be more convenient for this?

null119 replied at 2019-8-8 21:53
The crawler grabs the share links and transfers them to the netdisk automatically, and everything can then be downloaded in one batch with PanDownload. That is the simplest and quickest workflow I can think of.

Where is the finished build, though? I don't see a download link.