[Help] Python: error message when running the code?

double07 posted on 2021-5-4 23:29
Last edited by double07 on 2021-5-5 11:06

[Python] code:
import re
import time
import random
import requests
from lxml import etree
# ======================================================================================================= # automatically scrape and verify usable free proxies, building an IP proxy pool
def get_ip_list():
    proxies_list = []  # collect across pages; initializing this inside the loop would discard earlier pages
    for page in range(1, 2):  # choose how many pages to scrape
        print('==========fetching IPs on page {}============'.format(str(page)))
        base_url = 'https://www.kuaidaili.com/free/inha/{}/'.format(str(page))
        headers = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/78.0.3904.108 Safari/537.36'}
        response = requests.get(base_url, headers=headers)
        data = response.text
        html_data = etree.HTML(data)
        parse_list = html_data.xpath('//table[@class="table table-bordered table-striped"]/tbody/tr')
        for tr in parse_list:
            ip_num = tr.xpath('./td[1]/text()')   # IP address column
            ip_port = tr.xpath('./td[2]/text()')  # port column
            proxies_list.append(ip_num[0] + ':' + ip_port[0])
            time.sleep(0.5)
    return proxies_list  # return after the loop, not inside it, so every page is kept


def check_ip(proxies_list):
    headers = {
        'Referer': 'https://baidu.com/',
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/78.0.3904.108 Safari/537.36'
    }
    can_use = []
    for proxy in proxies_list:
        try:
            proxy_host = "http://" + proxy
            # note: only an "http" mapping is set here, so requests will not route
            # https:// URLs through this proxy and the check below goes out directly
            proxies = {"http": proxy_host}
            # timeout=0.1 is extremely aggressive and will reject almost every free proxy
            response = requests.get('https://sf.taobao.com', headers=headers, proxies=proxies, timeout=0.1)
            if response.status_code == 200:
                can_use.append(proxy_host)
            else:
                print('not usable')
        except Exception as e:
            print(e)
    print('got %d usable IPs in total' % len(can_use))
    return can_use


def get_random_ip(can_use):
    # pick one verified proxy at random; copying the list first is unnecessary
    proxy_ip = random.choice(can_use)
    proxies = {'http': proxy_ip}
    return proxies

# ======================================================================================================= # end of the proxy-pool section
p = 0
curPage = 1
link_list = ['https://sf-item.taobao.com/sf_item/640824628883.htm', 'https://sf-item.taobao.com/sf_item/641502301303.htm', 'https://sf-item.taobao.com/sf_item/641843794084.htm', 'https://sf-item.taobao.com/sf_item/642251459421.htm', 'https://sf-item.taobao.com/sf_item/642193923780.htm', 'https://sf-item.taobao.com/sf_item/642194171464.htm', 'https://sf-item.taobao.com/sf_item/642254059021.htm', 'https://sf-item.taobao.com/sf_item/642633234548.htm', 'https://sf-item.taobao.com/sf_item/641674674943.htm', 'https://sf-item.taobao.com/sf_item/641860786177.htm', 'https://sf-item.taobao.com/sf_item/641833404613.htm', 'https://sf-item.taobao.com/sf_item/642906910580.htm', 'https://sf-item.taobao.com/sf_item/642930454182.htm', 'https://sf-item.taobao.com/sf_item/643246399434.htm', 'https://sf-item.taobao.com/sf_item/643107141190.htm', 'https://sf-item.taobao.com/sf_item/641423917301.htm', 'https://sf-item.taobao.com/sf_item/641126976926.htm', 'https://sf-item.taobao.com/sf_item/640899648258.htm', 'https://sf-item.taobao.com/sf_item/641285745911.htm', 'https://sf-item.taobao.com/sf_item/641870471695.htm', 'https://sf-item.taobao.com/sf_item/641051516853.htm', 'https://sf-item.taobao.com/sf_item/641299249850.htm', 'https://sf-item.taobao.com/sf_item/640900456736.htm', 'https://sf-item.taobao.com/sf_item/641435389754.htm', 'https://sf-item.taobao.com/sf_item/642820317425.htm', 'https://sf-item.taobao.com/sf_item/642186087592.htm', 'https://sf-item.taobao.com/sf_item/642151179689.htm', 'https://sf-item.taobao.com/sf_item/640825988107.htm', 'https://sf-item.taobao.com/sf_item/644021779506.htm', 'https://sf-item.taobao.com/sf_item/642666508870.htm', 'https://sf-item.taobao.com/sf_item/643049745347.htm', 'https://sf-item.taobao.com/sf_item/642662660309.htm', 'https://sf-item.taobao.com/sf_item/642664828068.htm', 'https://sf-item.taobao.com/sf_item/642663848703.htm', 'https://sf-item.taobao.com/sf_item/643740591181.htm', 'https://sf-item.taobao.com/sf_item/643395154262.htm', 'https://sf-item.taobao.com/sf_item/643398210037.htm', 'https://sf-item.taobao.com/sf_item/643047449184.htm', 'https://sf-item.taobao.com/sf_item/643738223811.htm', 'https://sf-item.taobao.com/sf_item/643050809962.htm']

header = {
    'authority': 'sf.taobao.com',
    'cache-control': 'max-age=0',
    'sec-ch-ua': '"Google Chrome";v="89", "Chromium";v="89", ";Not A Brand";v="99"',
    'sec-ch-ua-mobile': '?0',
    'upgrade-insecure-requests': '1',
    'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/89.0.4389.114 Safari/537.36',
    'accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9',
    'sec-fetch-site': 'none',
    'sec-fetch-mode': 'navigate',
    'sec-fetch-user': '?1',
    'sec-fetch-dest': 'document',
    'accept-language': 'zh-CN,zh;q=0.9',
    'cookie': '',
}

params = (
    ('spm', 'a213w.7398504.filter.63.76d93a49nDn8tk'),
    ('auction_source', '0'),
    ('province', '%D6%D8%C7%EC'),
    ('sorder', '1'),
    ('st_param', '-1'),
    ('auction_start_seg', '-1'),
)



# fetch page content
def gethtml(url):
    response = requests.get(url, headers=header, params=params)
    r_response = response.content.decode('gbk')
    return r_response


def gethtml_detail(url):
    proxies = get_random_ip(can_use)  # can_use is the global proxy list built in __main__
    print(proxies)
    response = requests.get(url, headers=header, params=params, proxies=proxies)
    r_response = response.content.decode('gbk')  # the traceback below points at this decode
    return r_response


# parse page data
def parse_url(html):
    ult = re.findall(r'(sf-item[\S]+)\?', html)  # detail links
    for i in range(len(ult)):
        detail_url = "https://" + ult[i].replace('"', "")  # property detail page
        link_list.append(detail_url)
    return link_list


def parse_url_detail(r):
    html = etree.HTML(r)
    final_link = "https:" + html.xpath('//*[@id="J_NoticeDetail"]/@data-from')[0].strip()
    return final_link


# pagination
def next_page():
    url_np = 'https://sf.taobao.com/item_list.htm?spm=a213w.7398504.pagination.1.7c773a49a7C9Lp&auction_source=0&province=%D6%D8%C7%EC&st_param=-1&auction_start_seg=-1&page={}'
    url_list = [url_np.format(i + 1) for i in range(0, curPage)]
    return url_list


# main program
def run_AL():
    # page = next_page()
    # for i in page:
    #     html = gethtml(i)
    #     content = parse_url(html)
    #     time.sleep(2)
    for u in link_list[1:4]:  # test on a few detail links
        html_detail = gethtml_detail(u)
        print(html_detail)
        parse = parse_url_detail(html_detail)
        print(parse)


if __name__ == '__main__':
    proxies_list = get_ip_list()
    can_use = check_ip(proxies_list)
    run_AL()



I want to scrape the detail data behind each of these Taobao links. Since there are many links, too many requests get the IP banned, so I built an IP proxy pool. With the pool in place, why do I still get the error message below? Advice from the experts appreciated (the code can be run as-is):
[Python] code:
Traceback (most recent call last):
  File "C:\Program Files\JetBrains\PyCharm 2020.3.3\plugins\python\helpers\pydev\pydevd.py", line 1483, in _exec
    pydev_imports.execfile(file, globals, locals)  # execute the script
  File "C:\Program Files\JetBrains\PyCharm 2020.3.3\plugins\python\helpers\pydev\_pydev_imps\_pydev_execfile.py", line 18, in execfile
    exec(compile(contents+"\n", file, 'exec'), glob, loc)
  File "C:/Users/Administrator/Desktop/Python/test3.py", line 152, in <module>
    run_AL()
  File "C:/Users/Administrator/Desktop/Python/test3.py", line 142, in run_AL
    html_detail = gethtml_detail(u)
  File "C:/Users/Administrator/Desktop/Python/test3.py", line 104, in gethtml_detail
    r_response = response.content.decode('gbk')
UnicodeDecodeError: 'gbk' codec can't decode byte 0xad in position 27: illegal multibyte sequence


Ping-High posted on 2021-5-4 23:59
Last edited by Ping-High on 2021-5-5 00:02

A method found online; it seems to solve one kind of this problem:
open the file with the encoding it was actually saved in. The chardet library can detect a file's encoding type.
For example:
[Python] code:
file = open("file.txt",encoding='utf-8')
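Applied to raw bytes instead of a file, the same idea looks roughly like this (a minimal sketch assuming the chardet package is installed; the sample string is made up for illustration):

[Python] code:
import chardet

raw = '重庆的司法拍卖房产信息'.encode('gbk')  # made-up sample bytes, just for illustration
guess = chardet.detect(raw)  # e.g. {'encoding': 'GB2312', 'confidence': ..., 'language': ...}
encoding = guess['encoding'] or 'utf-8'  # detect() can return None on very short input
print(guess)
print(raw.decode(encoding))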
魉魍魅魑 posted on 2021-5-4 23:59
高维可破 posted on 2021-5-5 00:10

The chardet module detects that the content returned by these links is encoded as utf-8.
On line 104, response.content.decode('gbk') needs to be changed to response.content.decode('utf-8').

tywolf posted on 2021-5-5 00:14
Kuaidaili's free proxies are terrible quality. There is a fairly good proxy-pool project on GitHub that aggregates seven or eight proxy providers in total and uses Redis for storage; you can clone it and host it on your own server.
ZenoMiao posted on 2021-5-5 07:39
Just change response.content.decode('gbk') to response.content.decode(),
since the default is already utf-8.
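For reference, bytes.decode() with no argument really is UTF-8:

[Python] code:
# bytes.decode() defaults to 'utf-8' with strict error handling
data = '你好'.encode('utf-8')
assert data.decode() == data.decode('utf-8') == '你好'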
OP | double07 posted on 2021-5-5 09:50
Last edited by double07 on 2021-5-5 09:53
tywolf posted on 2021-5-5 00:14
Kuaidaili's free proxies are terrible quality. There is a fairly good proxy-pool project on GitHub that aggregates seven or eight proxy pro ...

I also suspect Kuaidaili is the problem, but the proxies were verified before use; could they still fail?

Just asking, can you share that GitHub link?
OP | double07 posted on 2021-5-5 09:51
Last edited by double07 on 2021-5-5 09:58
高维可破 posted on 2021-5-5 00:10
The chardet module detects that the content returned by these links is encoded as `utf-8`.
On line 104, `response.content.decode( ...

Boss, I tried utf-8 and it didn't work either. Also, the page I want to scrape is encoded in gbk. If it reports "codec can't decode", could that be because I'm hitting anti-scraping? Would the proxy pool also get caught by anti-scraping, or did I do something else wrong?
高维可破 posted on 2021-5-5 10:01
double07 posted on 2021-5-5 09:51
Boss, I tried utf-8 and it didn't work either. Also, the page I want to scrape is encoded in gbk. If it reports "codec can't decode", could that be ...

The encoding varies; in my tests I saw both GB2312 and utf-8.
Use the chardet module to detect it dynamically; add the detection right before decoding, like this:

import chardet

encodingInfo = chardet.detect(response.content)
r_response = response.content.decode(encodingInfo['encoding'])
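Folded into the script above, that suggestion could look like this (a sketch; decode_response is a hypothetical helper, not part of the original code, and errors='replace' only masks undecodable bytes rather than fixing their cause):

[Python] code:
import chardet

def decode_response(response):
    # detect the payload encoding instead of hard-coding 'gbk'
    guess = chardet.detect(response.content)
    encoding = guess['encoding'] or response.apparent_encoding or 'utf-8'
    return response.content.decode(encoding, errors='replace')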
OP | double07 posted on 2021-5-5 11:03
高维可破 posted on 2021-5-5 10:01
The encoding varies; in my tests I saw both `GB2312` and `utf-8`.
Use the `chardet` module to detect it dynamically; add the detec ...

The encoding probably changes because the site's anti-scraping kicks in and redirects to a different page. So even a proxy pool can't escape the anti-scraping?
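One way to test that theory is to look at what actually comes back before decoding; a minimal diagnostic sketch (url and header stand in for the values already defined in the script above):

[Python] code:
import requests

response = requests.get(url, headers=header, allow_redirects=True)
print(response.status_code)                   # 200, or a block/captcha status
print(response.url)                           # final URL after any redirect
print([r.url for r in response.history])      # the redirect chain, if any
print(response.headers.get('Content-Type'))   # declared charset, e.g. text/html;charset=gbk
print(response.apparent_encoding)             # encoding guessed from the body bytes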