python爬虫数据乱码？

double07 发表于 2021-6-21 09:52

最近研究房产数据，把安居客的数据抓了一把，但发现部分数据有乱码，不知如何改进代码？import time
import chardet
import pandas as pd
import requests
from lxml import etree

p = 0
data_list = []
st = time.strftime("%Y-%m-%d", time.localtime())

header = {
'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.77 Safari/537.36',
'cookie':
}

def gethtml_detail(url):
retry_count = 4
proxy = get_proxy().get("proxy")
while retry_count > 0:
   try:
         response = requests.get(url, headers=header, proxies={"http": "http://{}".format(proxy)})
         encodingInfo = chardet.detect(response.content)
         r_response = response.content.decode(encodingInfo['encoding'], 'ignore').replace("变卖价", "起拍价")
         return r_response
   except Exception:
         retry_count -= 1
         delete_proxy(proxy)
return None

def area_link(html):
html = etree.HTML(html)
link_list = []
links = html.xpath('//*[@id="__layout"]/div/section/section/div/section/div/ul/li/a/@href')# 链接列表
for i in links:
   detail_url = i# 区域http链接
   link_list.append(detail_url)
return link_list

def area_second_content_link(html):
html = etree.HTML(html)
link_list = []
links = html.xpath('//*[@id="__layout"]/div/section/section/section/div/a/@href')# 链接列表
for i in links:
   detail_url = i# 区域http链接
   link_list.append(detail_url)
return link_list

# 调用代{过}{滤}理API
def get_proxy():
return requests.get("http://127.0.0.1:5010/get/").json()

def delete_proxy(proxy):
requests.get("http://127.0.0.1:5010/delete/?proxy={}".format(proxy))

# 主程序
def main():
global p
# url = 'https://chongqing.anjuke.com/community'
# html = gethtml_detail(url)
# area_links = area_link(html)
# for i in area_links:
i='https://chongqing.anjuke.com/community/jiangbei/'
p_lst = i + "p{}" + "/"
url_list =
for i in url_list:
   html = gethtml_detail(i)
   lst = area_second_content_link(html)
   for i in lst:
         html_detail = gethtml_detail(i)
         html = etree.HTML(html_detail)
         lst = {}
         lst['索引'] = ''
         lst['小区名称'] = html.xpath('/html/body/div/div/div/a/@title').strip()
         lst['小区地址'] = html.xpath('/html/body/div/div/div/h1/span/text()').strip()
         lst['物业费'] = html.xpath('//*[@id="basic-infos-box"]/dl/dd/text()').strip()
         lst['竣工时间'] = html.xpath('//*[@id="basic-infos-box"]/dl/dd/text()').strip()
         lst['容积率'] = html.xpath('//*[@id="basic-infos-box"]/dl/dd/text()').strip()
         lst['绿化率'] = html.xpath('//*[@id="basic-infos-box"]/dl/dd/text()').strip()
         lst['开发商'] = html.xpath('//*[@id="basic-infos-box"]/dl/dd/text()').strip()
         lst['物业公司'] = html.xpath('//*[@id="basic-infos-box"]/dl/dd/text()').strip()
         lst['所属商圈'] = html.xpath('//*[@id="basic-infos-box"]/dl/dd/text()').strip()
         lst['小区id'] = eval(html.xpath('/html/body/div/div/div/a/@href').split("/")[-1])
         lst['小区链接'] = html.xpath('/html/body/div/div/div/a/@href').strip()
         data_list.append(lst)
         df = pd.DataFrame(data_list)
         for i in df.index:
            df['索引'].at = i + 1
         df.to_excel("C:/Users/Administrator/Desktop/Python/安居客/重庆住宅小区数据" + st + ".xlsx", index=False)
         p = p + 1
         print('第%s条数据已保存' % p)
         time.sleep(4)

if __name__ == '__main__':
main()

choujie1689 发表于 2021-6-21 10:04

数据.encode("latin1").decode("gbk",errors='ignore')
试试，可能是编码问题

magicianly 发表于 2021-6-21 10:31

本帖最后由 magicianly 于 2021-6-21 10:33 编辑

第24行代码，你在取解析编码的时候出现了错误，就解析的编码就有可能变成ISO。。。。什么什么的，或者WINDOW什么的，这个我测试过

charset = re.search(b'charset=(.+?)"', response.content).group(1)
return self.content.decode(charset, errors='ignore')

建议你改成这个

nmjk1234 发表于 2021-6-21 11:08

谢谢楼主的分享

double07 发表于 2021-6-21 11:31

15820394839 发表于 2021-6-21 10:04
数据.encode("latin1").decode("gbk",errors='ignore')
试试，可能是编码问题

谢谢，加在25行，但没成功

double07 发表于 2021-6-21 11:33

magicianly 发表于 2021-6-21 10:31
第24行代码，你在取解析编码的时候出现了错误，就解析的编码就有可能变成ISO。。。。什么什么的，或者WINDO ...

这样调整？但报错
def gethtml_detail(url):
retry_count = 4
proxy = get_proxy().get("proxy")
while retry_count > 0:
   try:
         response = requests.get(url, headers=header, proxies={"http": "http://{}".format(proxy)})
         charset = re.search(b'charset=(.+?)"', response.content).group(1)
         return self.content.decode(charset, errors='ignore')
   except Exception:
         retry_count -= 1
         delete_proxy(proxy)
return None

magicianly 发表于 2021-6-21 14:15

本帖最后由 magicianly 于 2021-6-21 14:22 编辑

```python
def get_proxy():
return requests.get("http://127.0.0.1:5010/get/").json()

def delete_proxy(proxy):
requests.get("http://127.0.0.1:5010/delete/?proxy={}".format(proxy))

header = {
'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.77 Safari/537.36',
'cookie': ''
}

def gethtml_detail(url):
retry_count = 4
while retry_count > 0:
   try:
         proxy = get_proxy().get("proxy")
         response = requests.get(url, headers=header, proxies={"http": "http://{}".format(proxy)})
         charset = re.search(b'charset=(.+?)"', response.content).group(1)
         return response.content.decode(charset.decode('utf-8'), errors='ignore')
   except Exception:
         retry_count -= 1
         delete_proxy(proxy)
return None
```

云尚天下 发表于 2021-6-21 14:20

试试万能编码 *******.encoding = *******.apparent_encoding

magicianly 发表于 2021-6-21 16:36

云尚天下发表于 2021-6-21 14:20
试试万能编码 *******.encoding = *******.apparent_encoding

这样有可能会用系统的解码代码解码，就有可能会造成乱码

magicianly 发表于 2021-6-21 16:38

magicianly 发表于 2021-6-21 14:15
```python
def get_proxy():
return requests.get("http://127.0.0.1:5010/get/").json()

我觉得没啥可需要怎么理解的吧，无非就是获取二进制源代码的charset 获取出来以后，解码就不需要说了嘛

页: [1] 2

吾爱破解 - 52pojie.cn's Archiver

python爬虫数据乱码？