Python: error message when running this code?
Last edited by double07 on 2021-5-5 11:06

```
import re
import time
import random
import requests
from lxml import etree


# ===== grab and validate usable free proxies to build a proxy pool ===== #
def get_ip_list():
    proxies_list = []
    for page in range(1, 2):  # choose how many pages to crawl
        print('========== fetching proxies, page {} =========='.format(page))
        base_url = 'https://www.kuaidaili.com/free/inha/{}/'.format(page)
        headers = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/78.0.3904.108 Safari/537.36'}
        response = requests.get(base_url, headers=headers)
        data = response.text
        html_data = etree.HTML(data)
        parse_list = html_data.xpath('//table[@class="table table-bordered table-striped"]/tbody/tr')
        for tr in parse_list:
            ip_num = tr.xpath('./td[1]/text()')[0]   # first column: IP
            ip_port = tr.xpath('./td[2]/text()')[0]  # second column: port
            proxies_list.append(ip_num + ':' + ip_port)
        time.sleep(0.5)
    return proxies_list
def check_ip(proxies_list):
    headers = {
        'Referer': 'https://baidu.com/',
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/78.0.3904.108 Safari/537.36'
    }
    can_use = []
    for proxy in proxies_list:
        try:
            proxy_host = "http://" + proxy
            # register the proxy for both schemes, otherwise requests to https URLs bypass it
            proxies = {"http": proxy_host, "https": proxy_host}
            # 0.1 s is a very tight timeout; most free proxies will fail it
            response = requests.get('https://sf.taobao.com', headers=headers, proxies=proxies, timeout=0.1)
            if response.status_code == 200:
                can_use.append(proxy_host)
            else:
                print('proxy not usable')
        except Exception as e:
            print(e)
    print('got %d usable proxies in total' % len(can_use))
    return can_use
def get_random_ip(can_use):
    # pick one of the validated proxies at random
    proxy_ip = random.choice(can_use)
    return {'http': proxy_ip, 'https': proxy_ip}

# ===== scraping the Taobao auction pages ===== #
p = 0
curPage = 1
link_list = ['https://sf-item.taobao.com/sf_item/640824628883.htm', 'https://sf-item.taobao.com/sf_item/641502301303.htm', 'https://sf-item.taobao.com/sf_item/641843794084.htm', 'https://sf-item.taobao.com/sf_item/642251459421.htm', 'https://sf-item.taobao.com/sf_item/642193923780.htm', 'https://sf-item.taobao.com/sf_item/642194171464.htm', 'https://sf-item.taobao.com/sf_item/642254059021.htm', 'https://sf-item.taobao.com/sf_item/642633234548.htm', 'https://sf-item.taobao.com/sf_item/641674674943.htm', 'https://sf-item.taobao.com/sf_item/641860786177.htm', 'https://sf-item.taobao.com/sf_item/641833404613.htm', 'https://sf-item.taobao.com/sf_item/642906910580.htm', 'https://sf-item.taobao.com/sf_item/642930454182.htm', 'https://sf-item.taobao.com/sf_item/643246399434.htm', 'https://sf-item.taobao.com/sf_item/643107141190.htm', 'https://sf-item.taobao.com/sf_item/641423917301.htm', 'https://sf-item.taobao.com/sf_item/641126976926.htm', 'https://sf-item.taobao.com/sf_item/640899648258.htm', 'https://sf-item.taobao.com/sf_item/641285745911.htm', 'https://sf-item.taobao.com/sf_item/641870471695.htm', 'https://sf-item.taobao.com/sf_item/641051516853.htm', 'https://sf-item.taobao.com/sf_item/641299249850.htm', 'https://sf-item.taobao.com/sf_item/640900456736.htm', 'https://sf-item.taobao.com/sf_item/641435389754.htm', 'https://sf-item.taobao.com/sf_item/642820317425.htm', 'https://sf-item.taobao.com/sf_item/642186087592.htm', 'https://sf-item.taobao.com/sf_item/642151179689.htm', 'https://sf-item.taobao.com/sf_item/640825988107.htm', 'https://sf-item.taobao.com/sf_item/644021779506.htm', 'https://sf-item.taobao.com/sf_item/642666508870.htm', 'https://sf-item.taobao.com/sf_item/643049745347.htm', 'https://sf-item.taobao.com/sf_item/642662660309.htm', 'https://sf-item.taobao.com/sf_item/642664828068.htm', 'https://sf-item.taobao.com/sf_item/642663848703.htm', 'https://sf-item.taobao.com/sf_item/643740591181.htm', 'https://sf-item.taobao.com/sf_item/643395154262.htm', 'https://sf-item.taobao.com/sf_item/643398210037.htm', 'https://sf-item.taobao.com/sf_item/643047449184.htm', 'https://sf-item.taobao.com/sf_item/643738223811.htm', 'https://sf-item.taobao.com/sf_item/643050809962.htm']
header = {
    'authority': 'sf.taobao.com',
    'cache-control': 'max-age=0',
    'sec-ch-ua': '"Google Chrome";v="89", "Chromium";v="89", ";Not A Brand";v="99"',
    'sec-ch-ua-mobile': '?0',
    'upgrade-insecure-requests': '1',
    'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/89.0.4389.114 Safari/537.36',
    'accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9',
    'sec-fetch-site': 'none',
    'sec-fetch-mode': 'navigate',
    'sec-fetch-user': '?1',
    'sec-fetch-dest': 'document',
    'accept-language': 'zh-CN,zh;q=0.9',
    'cookie': '',
}
params = (
    ('spm', 'a213w.7398504.filter.63.76d93a49nDn8tk'),
    ('auction_source', '0'),
    ('province', '%D6%D8%C7%EC'),
    ('sorder', '1'),
    ('st_param', '-1'),
    ('auction_start_seg', '-1'),
)
# fetch a listing page
def gethtml(url):
    response = requests.get(url, headers=header, params=params)
    r_response = response.content.decode('gbk')
    return r_response


# fetch an item detail page through a random proxy
def gethtml_detail(url):
    proxies = get_random_ip(can_use)
    print(proxies)
    response = requests.get(url, headers=header, params=params, proxies=proxies)
    r_response = response.content.decode('gbk')  # this is the line that raises UnicodeDecodeError
    return r_response
# extract the detail-page links from a listing page
def parse_url(html):
    ult = re.findall(r'(sf-item[\S]+)\?', html)  # detail links
    for i in range(len(ult)):
        detai_url = "https://" + ult[i].replace('"', "")  # item detail URL
        link_list.append(detai_url)
    return link_list


def parse_url_detail(r):
    html = etree.HTML(r)
    # xpath() returns a list, so take the first match before stripping
    final_link = "https:" + html.xpath('//*[@id="J_NoticeDetail"]/@data-from')[0].strip()
    return final_link
# pagination
def next_page():
    url_np = 'https://sf.taobao.com/item_list.htm?spm=a213w.7398504.pagination.1.7c773a49a7C9Lp&auction_source=0&province=%D6%D8%C7%EC&st_param=-1&auction_start_seg=-1&page={}'
    # the page range below is a placeholder; the original line was left incomplete
    url_list = [url_np.format(page) for page in range(1, curPage + 1)]
    return url_list
# main routine
def run_AL():
    # page = next_page()
    # p = 0
    # for i in page:
    #     html = gethtml(i)
    #     content = parse_url(html)
    #     # print(content)
    #     time.sleep(2)
    for u in link_list:
        html_detail = gethtml_detail(u)
        print(html_detail)
        parse = parse_url_detail(html_detail)
        print(parse)
        # time.sleep(2)
if __name__ == '__main__':
    proxies_list = get_ip_list()
    can_use = check_ip(proxies_list)
    run_AL()
```
I want to scrape the detail data behind each of these Taobao links. Because there are many links and too many requests get my IP banned, I built the proxy pool above. With the pool in place, why do I still get the error below? Any pointers would be appreciated (the code can be run as-is):

```
Traceback (most recent call last):
File "C:\Program Files\JetBrains\PyCharm 2020.3.3\plugins\python\helpers\pydev\pydevd.py", line 1483, in _exec
pydev_imports.execfile(file, globals, locals)# execute the script
File "C:\Program Files\JetBrains\PyCharm 2020.3.3\plugins\python\helpers\pydev\_pydev_imps\_pydev_execfile.py", line 18, in execfile
exec(compile(contents+"\n", file, 'exec'), glob, loc)
File "C:/Users/Administrator/Desktop/Python/test3.py", line 152, in <module>
run_AL()
File "C:/Users/Administrator/Desktop/Python/test3.py", line 142, in run_AL
html_detail = gethtml_detail(u)
File "C:/Users/Administrator/Desktop/Python/test3.py", line 104, in gethtml_detail
r_response = response.content.decode('gbk')
UnicodeDecodeError: 'gbk' codec can't decode byte 0xad in position 27: illegal multibyte sequence
```

Last edited by Ping-High on 2021-5-5 00:02
A method I found online that seems to solve at least part of the problem:
"Open the file with the encoding it was actually saved in; the chardet library can detect a file's encoding."
For example: file = open("file.txt", encoding='utf-8'). Could this be a character-encoding issue?
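
(A minimal sketch of that suggestion, assuming a local file named file.txt and the chardet package installed; the variable names are placeholders:)

```
import chardet

# Read the raw bytes first, let chardet guess the encoding, then decode with that guess.
with open("file.txt", "rb") as f:
    raw = f.read()
guess = chardet.detect(raw)  # e.g. {'encoding': 'utf-8', 'confidence': 0.99, ...}
text = raw.decode(guess["encoding"] or "utf-8")
print(guess, text[:100])
```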

高维可破 posted on 2021-5-5 00:10
The chardet module detects the content returned by these links as `utf-8`.
On line 104, `response.content.decode('gbk')` needs to be changed to `response.content.decode('utf-8')`.

tywolf posted on 2021-5-5 00:14
Kuaidaili's free proxies are rubbish quality. There is a pretty good proxy-pool project on GitHub that aggregates seven or eight proxy providers and stores them in Redis; you can clone it and host it on your own server.

Just change response.content.decode('gbk') to response.content.decode(), since the default is already utf-8.
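
(For reference, requests can make a similar guess itself via response.apparent_encoding; a minimal sketch of a decoding helper along the lines of the suggestions above, where decode_body is a hypothetical name:)

```
import requests

def decode_body(response):
    # decode_body is a hypothetical helper: try the encoding requests guesses from
    # the body (apparent_encoding), then fall back to UTF-8 with replacement so a
    # single bad byte does not abort the whole crawl.
    enc = response.apparent_encoding or "utf-8"
    try:
        return response.content.decode(enc)
    except (UnicodeDecodeError, LookupError):
        return response.content.decode("utf-8", errors="replace")

# usage: html = decode_body(requests.get("https://example.com", timeout=10))
```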
Last edited by double07 on 2021-5-5 09:53

tywolf posted on 2021-5-5 00:14
Kuaidaili's free proxies are rubbish quality. There is a pretty good proxy-pool project on GitHub that aggregates seven or eight proxy provi ...

I also suspect kuaidaili is the problem, but the proxies did pass validation, so can they really still fail?
Just asking, could you share the GitHub link?
Last edited by double07 on 2021-5-5 09:58

高维可破 posted on 2021-5-5 00:10
The chardet module detects the content returned by these links as `utf-8`.
On line 104, `response.content.decode( ...

I tried utf-8 and it still doesn't work. Also, the page I want to scrape is GBK-encoded. If I get "codec can't decode", could it be because I'm being blocked by anti-scraping? Would the proxy pool also get caught by anti-scraping, or is something else wrong?
double07 posted on 2021-5-5 09:51
I tried utf-8 and it still doesn't work. Also, the page I want to scrape is GBK-encoded. If I get "codec can't decode", could it be ...

The encoding varies: in my tests I saw both `GB2312` and `utf-8`.
Detect it dynamically with the `chardet` module, right before decoding, like this:
```
encodingInfo = chardet.detect(response.content)
r_response = response.content.decode(encodingInfo['encoding'])
```
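
(For reference, a minimal self-contained sketch of how that detection could replace the hard-coded 'gbk' in the OP's gethtml_detail; fetch_text and the timeout are placeholders, and it requires requests and chardet:)

```
import chardet
import requests

def fetch_text(url, proxies=None):
    # Hypothetical rewrite of gethtml_detail: detect the encoding of each
    # response instead of hard-coding 'gbk'.
    response = requests.get(url, proxies=proxies, timeout=10)
    info = chardet.detect(response.content)
    return response.content.decode(info["encoding"] or "utf-8", errors="replace")

# usage: print(fetch_text("https://sf-item.taobao.com/sf_item/640824628883.htm")[:200])
```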

高维可破 posted on 2021-5-5 10:01
The encoding varies: in my tests I saw both `GB2312` and `utf-8`.
Detect it dynamically with the `chardet` module, adding the detec ...
If the encoding changes, it is probably because the site's anti-scraping kicks in and redirects to a different page. That said, can't even a proxy pool get past the anti-scraping?
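
(A minimal diagnostic sketch, assuming chardet is installed: it prints where each request actually lands and what the body's detected encoding is, to check whether some links are being redirected to a verification page rather than the item page:)

```
import chardet
import requests

# Two sample links taken from the list in the original post.
sample_links = [
    "https://sf-item.taobao.com/sf_item/640824628883.htm",
    "https://sf-item.taobao.com/sf_item/641502301303.htm",
]
for url in sample_links:
    r = requests.get(url, timeout=10)
    guess = chardet.detect(r.content)
    # r.url shows the final URL after redirects; a sudden change plus a different
    # encoding usually means an anti-bot / verification page was served.
    print(r.status_code, guess["encoding"], r.url)
```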