# -*- coding: utf-8 -*-
# Source: forum post, last edited by qiujw on 2022-8-17 15:22.
import time
import requests
requests.packages.urllib3.disable_warnings()
import re
import random
# Lanzou Cloud changes its pages frequently. When the rules for locating the
# download link stop matching, update the regular expressions in
# var_domian_reg_arr and var_query_reg_arr below.

# Patterns for the download domain (raw strings, so backslashes are not
# treated as escapes by Python).
var_domian_reg_arr = [
    # var pototo = 'https://develope.lanzoug.com/file/';
    r"var\s*pototo\s*=\s*'(.+?)'",
    # var cppat = 'https://develope.lanzoug.com/file/';
    r"var\s*cppat\s*=\s*'(.+?)'",
]

# Patterns for the download token (the query-string part of the link).
var_query_reg_arr = [
    # var spototo = '?xxxx';
    r"var\s*spototo\s*=\s*'(.+?)'",
    # submit.href = cppat + '?xxx'
    # Earlier, stricter form: r"submit.href\s*=\s*cppat\s*\+\s*'(.+?)'"
    r"submit.href\s*=\s*.*\s*\+?\s*'(.+?)'",
]

# Download domain used when none of the domain patterns match.
default_domain = "https://develope.lanzoug.com/file/"

# Request timeout passed to requests.get().
timeout = 50
# Resolve the real download address behind a Lanzou Cloud share link.
def get_lanzou_download_url(url, retryTimes=3):
    """Resolve a Lanzou Cloud share link to its direct download URL.

    The share page is requested in its ``/tp/`` (mobile) form. The download
    domain and the download token are then extracted from the page text with
    the module-level pattern lists ``var_domian_reg_arr`` and
    ``var_query_reg_arr``.

    Args:
        url: Share link. ``/tp`` is inserted in front of the last path
            segment when the link does not already contain ``/tp/``.
        retryTimes: Number of additional attempts made after a failed
            request, a non-OK status code, or an empty page body. A random
            pause (``wait_some_time``) separates consecutive attempts.

    Returns:
        The download domain concatenated with the download token, or ``None``
        when every attempt failed or no token was found in the page.

    Raises:
        ValueError: If ``url`` contains no ``/`` at all (from ``str.rindex``).
    """
    # Add "/tp" so that the mobile version of the page is requested.
    if '/tp/' not in url:
        last_slash = url.rindex('/')
        url = url[:last_slash] + '/tp' + url[last_slash:]

    while True:
        # Headers are rebuilt for every attempt so that each request carries
        # a fresh random X-Forwarded-For address.
        headers = {
            'origin': 'https://www.lanzous.com',
            'accept-language': 'zh-CN,zh;q=0.9',
            'X-Forwarded-For': '.'.join(str(random.randint(0, 255)) for _ in range(4)),
            'User-Agent': 'Mozilla/5.0 (Linux; Android 8.0.0; SM-G955U Build/R16NW) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/87.0.4280.141 Mobile Safari/537.36'
        }

        # Request the download page.
        resp = None
        try:
            resp = requests.get(url, headers=headers, timeout=timeout, verify=False)
        except Exception as err:
            # Best effort: any request failure is reported and then retried,
            # so callers only ever see a URL or None.
            print(err)

        page_content = None
        if resp is not None and responseStatusOK(resp):
            resp.encoding = "utf-8"
            page_content = resp.text
            if not page_content:
                print('Status: %u, Url: %s, content is empty' % (resp.status_code, resp.url))

        if page_content:
            break

        # This attempt failed; give up once the retry budget is exhausted.
        retryTimes -= 1
        if retryTimes < 0:
            return None
        wait_some_time()

    # Download domain (falls back to default_domain when no pattern matches).
    down_domain = get_reg_value_by_content(page_content, var_domian_reg_arr, default_domain)
    # Download token.
    download_query = get_reg_value_by_content(page_content, var_query_reg_arr)

    if not (down_domain and download_query):
        print("Url: %s, down_domain:%s download_query: %s" % (resp.url, down_domain, download_query))
        return None
    return down_domain + download_query
def get_reg_value_by_content(page_content, reg_arr, default_val=None):
    """Return the first non-empty capture found by a list of patterns.

    Args:
        page_content: Text to search.
        reg_arr: Regular expressions tried in order; each must define
            capture group 1.
        default_val: Value returned when no pattern yields a non-empty
            group 1.

    Returns:
        Group 1 of the first pattern that matches with a non-empty capture,
        otherwise ``default_val``.
    """
    for pattern in reg_arr:
        found = re.search(pattern, page_content)
        if not found:
            continue
        captured = found.group(1)
        # A match whose group is empty does not count; try the next pattern.
        if captured:
            return captured
    return default_val
def responseStatusOK(resp):
    """Report whether a response exists and carries an OK status code.

    A short diagnostic line is printed for every rejected response.

    Args:
        resp: Response object exposing ``status_code`` and ``url``, or
            ``None`` when the request produced no response.

    Returns:
        ``True`` when ``resp`` is not ``None`` and its status code equals
        ``requests.codes.OK``; ``False`` otherwise.
    """
    if resp is None:
        # There is no response object, hence no URL to report. Reading
        # resp.url here would raise AttributeError.
        print('resp is None')
        return False
    if resp.status_code != requests.codes.OK:
        print('Status: %u, Url: %s' % (resp.status_code, resp.url))
        return False
    return True
def wait_some_time(minMls=1000, maxMls=5000):
    """Sleep for a random duration and return how long was slept.

    Args:
        minMls: Lower bound of the pause in milliseconds (inclusive).
        maxMls: Upper bound of the pause in milliseconds (inclusive).

    Returns:
        The pause that was slept, in seconds.
    """
    delay_ms = random.randint(minMls, maxMls)
    sleep_seconds = delay_ms / 1000
    time.sleep(sleep_seconds)
    return sleep_seconds
if __name__ == '__main__':
    # Demo: resolve a sample share link and print the result
    # (None is printed when the link could not be resolved).
    print(get_lanzou_download_url('https://wwd.lanzouw.com/i269V065bmza'))