[Python]
def verify_ips(httpIp, httpsIp, ip_valid_list):
    try:
        if httpIp and httpsIp:
            s.get('https://www.baidu.com', headers={
                "User-Agent": ua.random,
                'Connection': 'close',
            }, proxies={
                'http': httpIp,
                'https': httpsIp
            })
            ip_valid_list.append(ip)
            print(ip, "valid")
    except BaseException as e:
        print(e)
httpIp and httpsIp are proxies I scraped from https://www.xicidaili.com/nn/, but the request keeps failing with this error:

HTTPSConnectionPool(host='www.baidu.com', port=443): Max retries exceeded with url: / (Caused by ProxyError('Cannot connect to proxy.', RemoteDisconnected('Remote end closed connection without response')))
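
As far as I understand, this error means the proxy accepted the connection and then closed it without sending anything back, which with free proxies from these lists usually means that particular proxy is dead rather than something being wrong in the code itself. The quick check sketched below is only an idea I am experimenting with, not part of my script; the 5-second timeout, the test URL and the exception list are my own guesses:

[Python]
# Sketch: fail fast on dead proxies instead of hanging on them.
# timeout=5 and the exception list are assumptions, not settings from my script.
import requests

def check_proxy(http_ip, https_ip):
    try:
        r = requests.get('https://www.baidu.com',
                         proxies={'http': http_ip, 'https': https_ip},
                         timeout=5)                # give up after 5 seconds
        return r.status_code == 200                # treat any 200 response as "alive"
    except (requests.exceptions.ProxyError,
            requests.exceptions.ConnectTimeout,
            requests.exceptions.ReadTimeout):
        return False                               # unreachable or too slow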
I have already tried all the usual fixes suggested online:
[Python]
ua = UserAgent()
requests.urllib3.disable_warnings()
requests.adapters.DEFAULT_RETRIES = 5
requests.urllib3.PoolManager(num_pools=10000)
s = requests.session()
s.keep_alive = False
but none of them makes any difference. Hoping someone here can help.
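
One thing I am not sure about is whether requests.adapters.DEFAULT_RETRIES even affects a session created afterwards; as far as I know, the documented way to get retries is to mount an HTTPAdapter with a urllib3 Retry object on the session. Just a rough sketch of what I mean, with arbitrary retry numbers:

[Python]
# Sketch only: configure retries on the session itself.
# total=3, backoff_factor=0.5 and the status list are arbitrary placeholders.
import requests
from requests.adapters import HTTPAdapter
from urllib3.util.retry import Retry

s = requests.Session()
retries = Retry(total=3, backoff_factor=0.5,
                status_forcelist=[500, 502, 503, 504])
adapter = HTTPAdapter(max_retries=retries)
s.mount('http://', adapter)     # applied to every http:// request on this session
s.mount('https://', adapter)    # and to every https:// request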
Here is the full source code:
[Python]
import requests
from lxml import etree
import time
from multiprocessing import Pool
import multiprocessing
import sys
from fake_useragent import UserAgent

ua = UserAgent()
requests.urllib3.disable_warnings()
requests.adapters.DEFAULT_RETRIES = 5
requests.urllib3.PoolManager(num_pools=10000)
s = requests.session()
s.keep_alive = False


def input_urls():
    for item in get_proxy_arr:
        print("Crawling", item['url'])
        try:
            for i in range(1, TotalPage):
                get_single(item, str(i))
                print('Crawling page ' + str(i) + '\r', end="")
                time.sleep(3)
        except Exception as e:
            print("Exited abnormally:", e)


def get_single(item, index):
    r = requests.get(item['url'] + index, headers={
        "User-Agent": ua.random,
    })
    if r.status_code == 503:
        print('Your IP has been banned because of too many requests')
        sys.exit(0)
    content = etree.HTML(r.text)
    if item['name'] == "kuaidaili":
        ip = content.xpath(".//td[@data-title='IP']/text()")
        duankou = content.xpath(".//td[@data-title='PORT']/text()")
        protocol = content.xpath(".//td[@data-title='类型']/text()")
    elif item['name'] == "xicidaili":
        ip = content.xpath('//table[@id="ip_list"]/tr/td[2]/text()')
        duankou = content.xpath('//table[@id="ip_list"]/tr/td[3]/text()')
        protocol = content.xpath('//table[@id="ip_list"]/tr/td[6]/text()')
    for i in range(0, len(ip)):
        ip_list.append(protocol[i].lower() + "://" + ip[i] + ":" + duankou[i])


def verify_ips(httpIp, httpsIp, ip_valid_list):
    try:
        if httpIp and httpsIp:
            s.get('https://www.baidu.com', headers={
                "User-Agent": ua.random,
                'Connection': 'close',
            }, proxies={
                'http': httpIp,
                'https': httpsIp
            })
            ip_valid_list.append(httpIp)
            ip_valid_list.append(httpsIp)
            print("valid")
    except Exception:
        print("invalid")


if __name__ == "__main__":
    print(
        """
        When the program finishes, an ip_proxies_valid.txt file will be generated in the current folder.
        """
    )
    get_proxy_arr = [
        {
            'name': "xicidaili",
            'url': "https://www.xicidaili.com/nn/"
        },
        {
            'name': "kuaidaili",
            'url': "https://www.kuaidaili.com/free/inha/"
        },
    ]
    ip_list = []
    TotalPage = 3  # how many pages of each site to crawl
    mlist = multiprocessing.Manager()
    ip_valid_list = mlist.list()
    input_urls()
    print("Crawled " + str(len(ip_list)) + " IPs in total, now verifying them")
    http_ip_arr = []
    https_ip_arr = []
    for ip in ip_list:
        if "http:" in ip:
            http_ip_arr.append(ip.strip("\n"))
        elif "https:" in ip:
            https_ip_arr.append(ip.strip("\n"))
    print("Starting verification!")
    p = Pool(15)
    # use the shorter list length to avoid an IndexError when the two lists differ
    for index in range(0, min(len(http_ip_arr), len(https_ip_arr))):
        p.apply_async(verify_ips, (http_ip_arr[index], https_ip_arr[index], ip_valid_list))
        # verify_ips(http_ip_arr[index], https_ip_arr[index], ip_valid_list)
    p.close()
    p.join()
    f = open('ip_proxies_valid.txt', 'a')
    for ip in ip_valid_list:  # write out the verified proxies, not the raw list
        f.write(ip)
        if ip != ip_valid_list[-1]:
            f.write('\n')
    f.close()
    print("Done")