学了几天python,看到了别人封装多线程下载代码,大佬的帖子:https://www.52pojie.cn/thread-1615108-1-1.html
我发现里面没有注释,就自己读了一下,加上注释分享给大家;文末还附上了我模仿这段代码写的一个小例子。如下是我的理解:
import re
import time
from queue import Empty, Queue
from threading import Thread

import requests
q = Queue(100000) # 队列的数量为100000个,超过这个数量应该会报错
class FastRequests:
def __init__( # 这个像Java的的那个类的直接调用
# 下面的infos是表示要下载的链接,同时也存入了title,这是个字典
self, infos, threads=20, headers={
'User-Agent':'Mozilla/5.0 (X11; Linux aarch64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/88.0.4324.188 Safari/537.36 CrKey/1.54.250320 Edg/99.0.4844.74',
'Cookie': ''
}
): # self相当于Java里面的this,赋值成员变量
self.threads = threads # 线程数 20
for info in infos: # 把全部链接放进队列里面
q.put(info)
self.headres = headers
def run(self):
for i in range(self.threads): # 这里是循环20次这个操作,达到20线程的效果
t = Consumer(self.headres) # 循环调用函数,并且将headers传入
t.start()
class Consumer(Thread): # 继承Thread的方法
def __init__(self, headers): # 将变量赋值
Thread.__init__(self)
self.headers = headers
self.size = 0
self.time = 0
def run(self): # 重写方法,在这里就是实现多线程的操作
while True:
if q.qsize() == 0: # 如果队列的大小为0则直接退出循环
break
self.download(q.get()) # 从队列里面获得下载链接,并且get会减小q的大小
def validateTitle(self, title): # 这似乎是改变下载的标题的,但是没用到
rstr = r"[\/\\\:\*\?\"\<\>\|]" # '/ \ : * ? " < > |'
new_title = re.sub(rstr, "_", title) # 替换为下划线
return new_title
def sizeFormat(self, size, is_disk=False, precision=2): # 这个似乎是获得文件大小的,但是也没用上
formats = ['KB', 'MB', 'GB', 'TB', 'PB', 'EB', 'ZB', 'YB']
unit = 1000.0 if is_disk else 1024.0
if not (isinstance(size, float) or isinstance(size, int)):
raise TypeError('a float number or an integer number is required!')
if size < 0:
raise ValueError('number must be non-negative')
for i in formats:
size /= unit
if size < unit:
return f'{round(size, precision)}{i}'
return f'{round(size, precision)}{i}'
def download(self, info): # 用来下载文件的
title = info['title']
link = info['link']
if title == '':
title = self.validateTitle(link.split('/')[-1])
start_time = time.time()
response = requests.get(url=link, headers=self.headers, stream=True).content
end_time = time.time()
self.time = end_time - start_time
self.size += response.__sizeof__()
with open(title, 'wb') as f:
f.write(response)
f.close()
print(f'{title} {self.sizeFormat(self.size)} 耗时:{round(self.time,3)}s')
if __name__ == '__main__':
info1 = {
'title': '1.ts',
'link':'https://1252524126.vod2.myqcloud.com/9764a7a5vodtransgzp1252524126/215eee7e5285890804441012426/drm/v.f230.ts?start=0&end=2920399&type=mpegts'
}
info2 = {
'title': '2.ts',
'link':'https://1252524126.vod2.myqcloud.com/9764a7a5vodtransgzp1252524126/215eee7e5285890804441012426/drm/v.f230.ts?start=2920400&end=4720703&type=mpegts'
}
fr = FastRequests(infos=[info1, info2])
fr.run()
此外我还尝试模仿了一下代码,多线程解析网页,果然成功了
import requests
import re
import random
from threading import Thread
from queue import Queue
q = Queue(1000) # 创建一个1000大小的队列
class Parse(Thread): #
def __init__(self, header):
Thread.__init__(self)
self.header = header # 这个是将随机的headers复制
def run(self): # 这里是多线程的实现
if q.qsize() == 0:
return
response = requests.get(q.get(), headers=header)
response.encoding = 'utf-8'
try: # 这是一点异常处理
lib = re.findall('<a href=".*?"><img src="(.*?)" alt=".*?"></a>', response.text, re.S)[0]
src = 'http:' + lib
print(src)
except:
print('未找到链接')
class GetHead:
def run(self):
ulist = [ # 定义的随机数列表
'Mozilla/5.0 (Macintosh; U; Intel Mac OS X 10_6_8; en-us) AppleWebKit/534.50 (KHTML, like Gecko)Version/5.1 Safari/534.50',
'Mozilla/5.0 (Windows; U; Windows NT 6.1; en-us) AppleWebKit/534.50 (KHTML, like Gecko) Version/5.1 Safari/534.50',
'Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; Trident/5.0',
'Mozilla/5.0 (Macintosh; Intel Mac OS X 10.6; rv:2.0.1) Gecko/20100101 Firefox/4.0.1',
'Mozilla/5.0 (Windows NT 6.1; rv:2.0.1) Gecko/20100101 Firefox/4.0.1',
'Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1; Trident/4.0; SE 2.X MetaSr 1.0; SE 2.X MetaSr 1.0; .NET CLR 2.0.50727; SE 2.X MetaSr 1.0)',
'Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1; 360SE)',
'Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1; 360SE)']
ul = random.choice(ulist) # 获取随机数
hhhh = {'user-agent': ul} # 这个就是随机的headers
return hhhh # 将随机的headers返回
if __name__ == '__main__':
url = 'http://www.zdqx.com/xzj/89972.html'
header = {
'user-agent': 'Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1; 360SE)'
}
response = requests.get(url, headers=header)
response.encoding = 'utf-8'
src_page = re.findall('<option value="(.*?)" >.*?</option>', response.text) # 前面这些是获取网页的链接的操作
G = GetHead()
for i in src_page: # 将网页存进队列里面
aa = 'http:' + i
q.put(aa)
for i in range(20): # 设置20个线程同时对队列里面的内容操作
t = Parse(G.run()) # 随机获取headers
t.start()