Last edited by double07 on 2021-5-5 11:06
[Python]
import re
import time
import random
import requests
from lxml import etree


# ======================================================================================================= #
# Automatically scrape and validate usable free proxies to build an IP proxy pool
def get_ip_list():
    proxies_list = []
    for page in range(1, 2):  # choose how many pages to crawl
        print('========== fetching proxy list page {} =========='.format(str(page)))
        base_url = 'https://www.kuaidaili.com/free/inha/{}/'.format(str(page))
        headers = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/78.0.3904.108 Safari/537.36'}
        response = requests.get(base_url, headers=headers)
        data = response.text
        html_data = etree.HTML(data)
        parse_list = html_data.xpath('//table[@class="table table-bordered table-striped"]/tbody/tr')
        for tr in parse_list:
            # each row holds one proxy: IP in td[1], port in td[2]
            ip_num = tr.xpath('./td[1]/text()')
            ip_port = tr.xpath('./td[2]/text()')
            dict_proxies = ip_num[0] + ':' + ip_port[0]
            proxies_list.append(dict_proxies)
        time.sleep(0.5)
    return proxies_list
def check_ip(proxies_list):
    headers = {
        'Referer': 'https://baidu.com/',
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/78.0.3904.108 Safari/537.36'
    }
    can_use = []
    for proxy in proxies_list:
        try:
            proxy_host = "http://" + proxy
            proxies = {"http": proxy_host}
            # note: a 0.1 s timeout is very strict and will reject most free proxies
            response = requests.get('https://sf.taobao.com', headers=headers, proxies=proxies, timeout=0.1)
            if response.status_code == 200:
                can_use.append(proxy_host)
            else:
                print('proxy not usable')
        except Exception as e:
            print(e)
    print('got %d usable proxies in total' % len(can_use))
    return can_use
def get_random_ip(can_use):
    random_proxylist = []
    for ip in can_use:
        random_proxylist.append(ip)
    proxy_ip = random.choice(random_proxylist)
    proxies = {'http': proxy_ip}
    return proxies
# ======================================================================================================= #
# End of the proxy-pool section
p = 0
curPage = 1
link_list = ['https://sf-item.taobao.com/sf_item/640824628883.htm', 'https://sf-item.taobao.com/sf_item/641502301303.htm', 'https://sf-item.taobao.com/sf_item/641843794084.htm', 'https://sf-item.taobao.com/sf_item/642251459421.htm', 'https://sf-item.taobao.com/sf_item/642193923780.htm', 'https://sf-item.taobao.com/sf_item/642194171464.htm', 'https://sf-item.taobao.com/sf_item/642254059021.htm', 'https://sf-item.taobao.com/sf_item/642633234548.htm', 'https://sf-item.taobao.com/sf_item/641674674943.htm', 'https://sf-item.taobao.com/sf_item/641860786177.htm', 'https://sf-item.taobao.com/sf_item/641833404613.htm', 'https://sf-item.taobao.com/sf_item/642906910580.htm', 'https://sf-item.taobao.com/sf_item/642930454182.htm', 'https://sf-item.taobao.com/sf_item/643246399434.htm', 'https://sf-item.taobao.com/sf_item/643107141190.htm', 'https://sf-item.taobao.com/sf_item/641423917301.htm', 'https://sf-item.taobao.com/sf_item/641126976926.htm', 'https://sf-item.taobao.com/sf_item/640899648258.htm', 'https://sf-item.taobao.com/sf_item/641285745911.htm', 'https://sf-item.taobao.com/sf_item/641870471695.htm', 'https://sf-item.taobao.com/sf_item/641051516853.htm', 'https://sf-item.taobao.com/sf_item/641299249850.htm', 'https://sf-item.taobao.com/sf_item/640900456736.htm', 'https://sf-item.taobao.com/sf_item/641435389754.htm', 'https://sf-item.taobao.com/sf_item/642820317425.htm', 'https://sf-item.taobao.com/sf_item/642186087592.htm', 'https://sf-item.taobao.com/sf_item/642151179689.htm', 'https://sf-item.taobao.com/sf_item/640825988107.htm', 'https://sf-item.taobao.com/sf_item/644021779506.htm', 'https://sf-item.taobao.com/sf_item/642666508870.htm', 'https://sf-item.taobao.com/sf_item/643049745347.htm', 'https://sf-item.taobao.com/sf_item/642662660309.htm', 'https://sf-item.taobao.com/sf_item/642664828068.htm', 'https://sf-item.taobao.com/sf_item/642663848703.htm', 'https://sf-item.taobao.com/sf_item/643740591181.htm', 'https://sf-item.taobao.com/sf_item/643395154262.htm', 'https://sf-item.taobao.com/sf_item/643398210037.htm', 'https://sf-item.taobao.com/sf_item/643047449184.htm', 'https://sf-item.taobao.com/sf_item/643738223811.htm', 'https://sf-item.taobao.com/sf_item/643050809962.htm']
header = {
    'authority': 'sf.taobao.com',
    'cache-control': 'max-age=0',
    'sec-ch-ua': '"Google Chrome";v="89", "Chromium";v="89", ";Not A Brand";v="99"',
    'sec-ch-ua-mobile': '?0',
    'upgrade-insecure-requests': '1',
    'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/89.0.4389.114 Safari/537.36',
    'accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9',
    'sec-fetch-site': 'none',
    'sec-fetch-mode': 'navigate',
    'sec-fetch-user': '?1',
    'sec-fetch-dest': 'document',
    'accept-language': 'zh-CN,zh;q=0.9',
    'cookie': '',
}
params = (
    ('spm', 'a213w.7398504.filter.63.76d93a49nDn8tk'),
    ('auction_source', '0'),
    ('province', '%D6%D8%C7%EC'),
    ('sorder', '1'),
    ('st_param', '-1'),
    ('auction_start_seg', '-1'),
)
# fetch a listing page
def gethtml(url):
    response = requests.get(url, headers=header, params=params)
    r_response = response.content.decode('gbk')
    return r_response


# fetch an item detail page through a random proxy from the pool
def gethtml_detail(url):
    proxies = get_random_ip(can_use)
    print(proxies)
    response = requests.get(url, headers=header, params=params, proxies=proxies)
    r_response = response.content.decode('gbk')
    return r_response
# extract item detail links from a listing page
def parse_url(html):
    ult = re.findall(r'(sf-item[\S]+)\?', html)  # detail links
    for i in range(len(ult)):
        detai_url = "https://" + ult[i].replace('"', "")  # item detail page
        link_list.append(detai_url)
    return link_list


# extract the notice link from an item detail page
def parse_url_detail(r):
    html = etree.HTML(r)
    final_link = "https:" + html.xpath('//*[@id="J_NoticeDetail"]/@data-from')[0].strip()
    return final_link
# build the paginated listing URLs
def next_page():
    url_np = 'https://sf.taobao.com/item_list.htm?spm=a213w.7398504.pagination.1.7c773a49a7C9Lp&auction_source=0&province=%D6%D8%C7%EC&st_param=-1&auction_start_seg=-1&page={}'
    url_list = [url_np.format(i + 1) for i in range(0, curPage)]
    return url_list
# main routine
def run_AL():
    # alternative: crawl the listing pages instead of using the hard-coded link_list
    # page = next_page()
    # for i in page:
    #     html = gethtml(i)
    #     content = parse_url(html)
    #     time.sleep(2)
    for u in link_list[1:4]:
        html_detail = gethtml_detail(u)
        print(html_detail)
        parse = parse_url_detail(html_detail)
        print(parse)
        # time.sleep(2)


if __name__ == '__main__':
    proxies_list = get_ip_list()
    can_use = check_ip(proxies_list)
    run_AL()
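One detail worth noting (my own observation about requests, not something confirmed for this particular script): requests only routes a request through a proxy whose key in the proxies dict matches the URL scheme, so a dict containing only 'http' is ignored when fetching https:// pages such as the sf-item links above. A minimal sketch, using a placeholder proxy address and a test endpoint purely for illustration:

[Python]
import requests

# hypothetical proxy address, for illustration only
proxy_host = "http://1.2.3.4:8080"

# cover both plain-HTTP and HTTPS requests; with only the "http" key,
# an https:// URL would bypass the proxy entirely
proxies = {"http": proxy_host, "https": proxy_host}

resp = requests.get("https://httpbin.org/ip", proxies=proxies, timeout=5)
print(resp.text)  # should report the proxy's IP if the proxy is actually used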
I want to scrape the detailed data behind each of these Taobao listing links. Because there are many links, sending too many requests gets my IP banned, so I built the proxy pool above. With the pool in place, why do I still get the error below? Any pointers would be appreciated (the code can be run as-is). A rough diagnostic sketch follows the traceback.
[Python]
Traceback (most recent call last):
  File "C:\Program Files\JetBrains\PyCharm 2020.3.3\plugins\python\helpers\pydev\pydevd.py", line 1483, in _exec
    pydev_imports.execfile(file, globals, locals)  # execute the script
  File "C:\Program Files\JetBrains\PyCharm 2020.3.3\plugins\python\helpers\pydev\_pydev_imps\_pydev_execfile.py", line 18, in execfile
    exec(compile(contents+"\n", file, 'exec'), glob, loc)
  File "C:/Users/Administrator/Desktop/Python/test3.py", line 152, in <module>
    run_AL()
  File "C:/Users/Administrator/Desktop/Python/test3.py", line 142, in run_AL
    html_detail = gethtml_detail(u)
  File "C:/Users/Administrator/Desktop/Python/test3.py", line 104, in gethtml_detail
    r_response = response.content.decode('gbk')
UnicodeDecodeError: 'gbk' codec can't decode byte 0xad in position 27: illegal multibyte sequence
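The UnicodeDecodeError itself suggests the bytes coming back are simply not GBK (for example a redirect to a login or anti-bot page, or a body in another encoding), rather than a problem with the pool. Below is a minimal diagnostic sketch, assuming one inspects the response before decoding; the URL is taken from link_list above, everything else is a hypothetical check and not a confirmed fix:

[Python]
import requests

# one of the detail links from link_list above
url = 'https://sf-item.taobao.com/sf_item/640824628883.htm'
headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/89.0.4389.114 Safari/537.36'}

resp = requests.get(url, headers=headers, timeout=10)

print(resp.status_code)                   # non-200 usually means a block or redirect page
print(resp.headers.get('Content-Type'))   # charset advertised by the server, if any
print(resp.apparent_encoding)             # requests' guess based on the body bytes

# decode defensively instead of assuming GBK outright
text = resp.content.decode(resp.apparent_encoding or 'gbk', errors='replace')
print(text[:500])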