MikoSec posted on 2019-10-20 18:41

[Original Source Code] [Python] Crawling AiPan tools with Python

I just set up a fresh Windows install to learn reverse engineering and need some tools. Being lazy, I didn't want to hunt down and download every tool one by one. I could have used the 52pojie virtual machine, but it isn't as comfortable to work with as a physical machine (I installed it anyway).
AiPan happens to host quite a few tools, so, well...
Start Python.
Write code.
Run the code.
Get the tools.

Since AiPan states in big letters that multithreaded downloads are not allowed (if I understand it correctly), I didn't write a multithreaded version:
AiPan restricts multithreaded download access; please download with a single thread, as concurrent connections will get your access blocked.
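
Because requests have to stay strictly sequential, a small random pause between downloads keeps the crawler polite. Here is a minimal sketch of the idea (the helper name and delay range are illustrative, not part of the original script; the improved version later in this thread adds a similar random delay):

import random
import time

import requests

def polite_get(url, min_wait=0.5, max_wait=5.0):
    # Fetch one URL, then sleep a random interval so the next request never overlaps with this one.
    response = requests.get(url, timeout=30)
    time.sleep(random.uniform(min_wait, max_wait))
    return response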

Libraries you need to install (see the install command below):
requests -- HTTP request library
bs4 -- HTML parsing library
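
Both come from PyPI; since the script builds its BeautifulSoup objects with the 'lxml' parser, install that package as well:

pip install requests beautifulsoup4 lxml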

Here is the source code (for coder players):

#-*- coding: utf-8 -*-
import os
import time

import requests

from bs4 import BeautifulSoup

def download(downurl, path, filename):  # download one file
    start = time.time()  # start time

    if not os.path.exists(path):
        os.makedirs(path)
    if path[-1] != os.sep:
        path += os.sep
    file = path + filename
    size = 0

    response = requests.get(downurl, stream=True)
    if response.status_code != 200:
        print(f" url => {downurl}\tstatus_code => {response.status_code}")
        return

    chunk_size = 1024
    content_size = int(response.headers["content-length"])

    print(": %0.2f MB" % (content_size / chunk_size / 1024))
    with open(file, "wb") as f:
        for data in response.iter_content(chunk_size):
            f.write(data)
            size += len(data)
            print("\r: %s>%.2f%%" % ("=" * int(size * 50 / content_size), float(size / content_size * 100)), end="")

    end = time.time()  # end time
    print("Using Time: %.2fs" % (end - start))

def main():
    url = "https://down.52pojie.cn/Tools/"  # AiPan Tools URL
    if not os.path.exists("Tools"):
        os.mkdir("Tools")
    os.chdir("Tools")
    headers = {"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/77.0.3865.120 Safari/537.36"}
    req = requests.get(url, headers=headers)
    soup = BeautifulSoup(req.text, 'lxml')
    for i in soup.find_all("td", class_="link"):  # directory entries
        tooldir = i.text
        dir_url = url + tooldir
        print(dir_url)  # directory URL
        req = requests.get(dir_url)
        req.encoding = "utf-8"
        soup1 = BeautifulSoup(req.text, 'lxml')
        for j in soup1.find_all("td", class_="link"):  # file entries
            path = tooldir
            filename = j.text
            downurl = dir_url + filename
            print(f" Path => {path}\tFileName => {filename}")
            download(downurl, path, filename)

main()
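
One thing to watch: the script collects every td element with class "link" from the index pages, and the improved version later in this thread explicitly skips a "Parent directory/" entry found in those cells. If you want to sanity-check what the parser actually sees for a given directory, here is a small hedged sketch (the filtering logic is illustrative, not part of the original script):

import requests
from bs4 import BeautifulSoup

headers = {"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64)"}
req = requests.get("https://down.52pojie.cn/Tools/", headers=headers)
soup = BeautifulSoup(req.text, "lxml")
entries = [td.text for td in soup.find_all("td", class_="link")]
# Entries ending in "/" are subdirectories; everything else is a downloadable file.
subdirs = [e for e in entries if e.endswith("/") and e != "Parent directory/"]
files = [e for e in entries if not e.endswith("/")]
print("dirs:", subdirs)
print("files:", files)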

And here is the packaged exe (for regular players):

Link: https://pan.baidu.com/s/11xc6ENUELIWaQNIJsbxojA
Extraction code: d3ig

Online resolver for Baidu Netdisk share links:
https://www.baiduwp.com/

www.52pojie.cn posted on 2019-10-21 11:06

Scraping 52pojie from 52pojie itself — now that's the boss spirit.

ixsec posted on 2019-10-20 19:29

Killing the zero-reply streak.
By the way, IDM works pretty well for downloading~~

chomosuke posted on 2019-10-20 21:32

The tool works nicely, but the Baidu pan link seems to be dead.

coradong1985 posted on 2019-10-20 23:17

chomosuke posted on 2019-10-25 18:15

Last edited by chomosuke on 2019-10-28 20:34

I've improved it a bit; code:

https://github.com/Maemo8086/Python_AiPan_Crawler


#!/usr/bin/python
# -*- coding: utf-8 -*-

'''
MIT License

Copyright (c) 2019 Maemo8086
Copyright (c) 2019 MikoSec

Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:

The above copyright notice and this permission notice shall be included in all
copies or substantial portions of the Software.

THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
SOFTWARE.
'''

import os
import random
import requests
import sys
import time
from bs4 import BeautifulSoup
from collections import deque


def download(file_url, path, filename):  # download one file
    global headers
    global download_exception

    display_max = 64

    if path:
        if len(path) > display_max:
            display_path = '...' + path[-display_max:]
        else:
            display_path = path
    else:
        display_path = '/'

    if len(filename) > display_max:
        display_filename = '...' + filename[-display_max:]
    else:
        display_filename = filename

    print()
    print(f'\r Path => {display_path}\tFile Name => {display_filename}')
    sys.stdout.flush()

    delay = False
    if delay:
        wait = round(random.uniform(0, 5), 2)
        print(f'\r Waiting {wait} seconds...')
        sys.stdout.flush()
        time.sleep(wait)

    start = time.time()  # start time

    if path:
        if not os.path.exists(path):
            os.makedirs(path)
        if path[-1] != os.sep:
            path += os.sep
    full_path = path + filename

    try:
        response = requests.get(file_url, headers=headers, stream=True, timeout=30)
    except:
        download_exception.append((file_url, path, filename))
        print(f'\r Download request for *{display_filename}* has failed.')
        return

    if response.status_code != 200:
        response.close()
        download_exception.append((file_url, path, filename))
        print(f'\r Download request for *{display_filename}* has failed.\tstatus_code => {response.status_code}')
        return

    try:
        content_size = int(response.headers['content-length'])
    except:
        response.close()
        download_exception.append((file_url, path, filename))
        print(f'\r Download request for *{display_filename}* has failed.\tMissing or invalid content-length.')
        return

    if content_size < 0:
        response.close()
        download_exception.append((file_url, path, filename))
        print(f'\r Download request for *{display_filename}* has failed.\tInvalid content-length range.')
        return

    print(' %0.2f MB' % (content_size / 1024 ** 2))
    sys.stdout.flush()

    if os.path.exists(full_path):
        if os.path.getsize(full_path) == content_size:  # skip if an identically sized copy already exists
            response.close()
            print(' Same sized file exists, skipping...')
            return
        else:
            print(' Overwriting existing copy.')

    chunk_size = 1024
    size = 0
    try:
        with open(full_path, 'wb', buffering=1) as f:
            for data in response.iter_content(chunk_size):
                f.write(data)
                size += len(data)
                print(
                    '\r %s>%.2f%%' % (
                        '=' * int(size * 50 / content_size), float(size / content_size * 100)), end='')
    except:
        download_exception.append((file_url, path, filename))
        if os.path.exists(full_path):
            os.remove(full_path)
        print(f'\r Download *{display_filename}* has failed.')
        return
    finally:
        response.close()
        end = time.time()  # end time
        print('\rTime elapsed: %.2fs' % (end - start))


def recursive_fetch(soup, part_url):
    global url
    global headers

    for i in soup.find_all('td', class_='link'):  # each entry is either a file or a subdirectory
        if i.text == 'Parent directory/':
            continue

        if i.text[-1] != '/':
            path = part_url
            filename = i.text
            file_url = part_url + filename
            download(file_url, path, filename)
        else:
            dir_url = part_url + i.text
            print()
            print(f'\r Searching under {dir_url}')

            execute = True
            while execute:
                wait = round(random.uniform(0, 5), 2)
                print(f'\r Waiting {wait} seconds...')
                sys.stdout.flush()
                time.sleep(wait)

                execute = False
                try:
                    with requests.get(dir_url, headers=headers, timeout=30) as req:
                        req.encoding = req.apparent_encoding
                        soup1 = BeautifulSoup(req.text, 'lxml')
                except:
                    execute = True
                    print(f'\r URL request *{dir_url}* has failed, retrying...')

            recursive_fetch(soup1, dir_url)


def main():
    global url
    global headers
    global download_exception

    print(
        '''
        Python AiPan Crawler

        Authors: Maemo8086, MikoSec
        Source: https://github.com/Maemo8086/Python_AiPan_Crawler

        A Python-based downloader for AiPan, the 52pojie forum's tool mirror.
        Built on the requests and bs4 libraries.
        It is recommended to change the User-Agent before use.
        '''
    )

    directory = 'AiPan'
    if not os.path.exists(directory):
        os.mkdir(directory)
    os.chdir(directory)

    try:
        with requests.get(url, headers=headers, timeout=30) as req:
            req.encoding = req.apparent_encoding
            soup = BeautifulSoup(req.text, 'lxml')
    except:
        print(f'\r URL request *{url}* has failed.')
        return

    recursive_fetch(soup, url)

    while download_exception:
        print()
        print(f'\r Retrying {len(download_exception)} failed downloads...')

        wait = round(random.uniform(10, 30), 2)
        print(f'\r Waiting {wait} seconds...')
        sys.stdout.flush()
        time.sleep(wait)

        download_exception_copy = download_exception.copy()
        download_exception.clear()
        while download_exception_copy:
            file_url, path, filename = download_exception_copy.pop()
            file_url = file_url.strip('\\')
            path = path.strip('\\')
            filename = filename.strip('\\')
            download(file_url, path, filename)


url = 'https://down.52pojie.cn/'  # AiPan root URL
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/77.0.3865.120 Safari/537.36'}

download_exception = deque()
main()
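
As the banner suggests, swap in your own User-Agent before running; and since recursive_fetch starts from whatever url holds, you can also narrow the crawl to a single section instead of the whole AiPan root. A hedged example (the Tools path is borrowed from the first script, and the UA string below is a placeholder to replace):

url = 'https://down.52pojie.cn/Tools/'  # start the crawl from the Tools section only
headers = {
    'User-Agent': 'your own browser User-Agent string here'}  # placeholder; copy the UA from your browser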


I haven't written Python in years, so please point out anything I got wrong.

18603867890 posted on 2020-8-1 16:29

And here is the packaged exe (for regular players):

Link: https://pan.baidu.com/s/11xc6ENUELIWaQNIJsbxojA
Extraction code: d3ig

This link has expired.