本帖最后由 okij12589 于 2022-9-29 19:05 编辑
在练习 Python 多线程的过程中,我尝试开启多个线程来执行采集任务,
但每次运行都只有一个线程在工作;参考网上的解决方案检查后,线程的参数都没有问题。
在排查问题时发现,加入 time.sleep(1) 后多线程即恢复正常,
查阅了大量资料仍未找到原因,请大神指导。
[Python] 纯文本查看 复制代码
import threading
import requests
import parsel
import time
# Shared result lists that get_content() appends to.
all_proxy = []  # every proxy scraped from the listing pages
can_proxy = []  # the subset that answered with HTTP 200

# Listing pages to scrape: page 1 and page 2.
urls = [f"https://www.kuaidaili.com/free/inha/{page_no}/" for page_no in (1, 2)]

# Request headers sent with every HTTP call.
header = dict()
header["User-Agent"] = "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:104.0) Gecko/20100101 Firefox/104.0"
# Fetch one listing page and probe every proxy found on it
def get_content(url):
    """Scrape the proxies listed on ``url`` and record which ones respond.

    The page is downloaded, the first and second table columns under
    ``#list`` are read as IP and port, and each resulting proxy is appended
    to the module-level ``all_proxy`` list.  Every proxy is then probed by
    requesting ``url`` through it with a one-second timeout; proxies that
    answer with status 200 are also appended to ``can_proxy``.

    Args:
        url: Address of the listing page to scrape; it is also the address
            requested through each proxy during the probe.

    Raises:
        requests.RequestException: If the listing page itself cannot be
            fetched (including a timeout).  Failures of individual proxy
            probes are reported on stdout and do not raise.
    """
    print(url)
    # A timeout keeps a stalled server from blocking this worker forever;
    # requests waits indefinitely when none is given.
    res = requests.get(url, headers=header, timeout=10)
    selector = parsel.Selector(res.text)
    ip_list = selector.css("#list tbody tr td:nth-child(1)::text").getall()
    port_list = selector.css("#list tbody tr td:nth-child(2)::text").getall()
    for ip, port in zip(ip_list, port_list):
        # Scraped text nodes may carry surrounding whitespace, which would
        # produce an invalid proxy URL.
        proxy = ip.strip() + ":" + port.strip()
        # NOTE(review): the "https" entry makes requests talk TLS to the
        # proxy itself; the requests documentation's example uses an
        # "http://" proxy URL for both keys.  Left unchanged -- confirm
        # which scheme these proxies actually expect.
        proxy_dict = {
            "http": "http://" + proxy,
            "https": "https://" + proxy
        }
        all_proxy.append(proxy_dict)
        try:
            resource = requests.get(url=url, headers=header, proxies=proxy_dict, timeout=1)
        except requests.RequestException:
            # Only request failures mean "proxy unusable"; KeyboardInterrupt
            # and SystemExit are no longer swallowed.
            print(proxy + " 不可用")
            continue
        print(resource)
        if resource.status_code == 200:
            print(proxy + " 可用!!!!!!!!!")
            can_proxy.append(proxy_dict)
# Single-threaded run
def single_thread():
    """Scrape every page in ``urls`` sequentially on the calling thread."""
    print("单线程开始")
    for page_url in urls:
        get_content(page_url)
    print("单线程结束")
# Multi-threaded run
def multi_thread():
    """Scrape every page in ``urls`` using one thread per page.

    The workers are started one second apart, and the call returns only
    after every worker has finished.
    """
    print("多线程开始")
    workers = [
        threading.Thread(target=get_content, args=(page_url,))
        for page_url in urls
    ]
    for worker in workers:
        worker.start()
        # NOTE(review): the pasted source lost its indentation; this pause is
        # assumed to sit inside the start loop (staggering the workers).
        # Confirm against the original script.
        time.sleep(1)
    for worker in workers:
        worker.join()
    print("多线程结束")
if __name__ == "__main__":
    # To time the sequential variant instead, wrap single_thread() with the
    # same time.time() bookkeeping and print the result as "单线程用时:".
    started_at = time.time()
    multi_thread()
    elapsed = time.time() - started_at
    print("多线程用时:", elapsed)

    # Summary of what the workers collected.
    total_count = len(all_proxy)
    usable_count = len(can_proxy)
    print("总共采集", total_count, "个")
    print("总共采集到", usable_count, "个可用")
    print(can_proxy)
|