尝试解决58同城数字加密

Simonl 发表于 2020-5-1 18:12

最近无聊爬58的时候，发现它的数字是加密的，网上查了下是字体加密，我也尝试了下网上的方法，但是还是每次刷新了就不一样了，有比较高级的方法可以下载字体之后再分析排序，我就觉得太复杂了，我就用自己的笨办法来解密，大佬请无视。
首先，我们把数据爬下来：
import requests
from bs4 import BeautifulSoup
from openpyxl import Workbook

def _squash(text):
    """Return *text* with every space and newline removed."""
    return text.replace(' ', '').replace('\n', '')


def scrape_listings(url, out_path='58住房信息.xlsx'):
    """Scrape one 58.com rental listing page into an .xlsx workbook.

    The numeric characters in the scraped text are still font-obfuscated;
    this function stores them exactly as they appear in the page.

    Args:
        url: Address of the listing page to download.
        out_path: Where the workbook is written.
    """
    wb = Workbook()
    ws = wb.active
    ws.append(['详情', '大小', '地址', '来自', '价格', '链接'])

    # Parse the page once and query the same tree for both node sets.
    soup = BeautifulSoup(requests.get(url, timeout=10).text, 'lxml')
    listings = soup.find_all('div', class_='des')
    prices = soup.find_all('div', class_='money')

    for listing, money in zip(listings, prices):
        price = _squash(money.text)
        heading = listing.find('h2')
        link = heading.find('a')['href']
        detail = _squash(heading.text)
        size = _squash(listing.find('p', class_='room').text)
        address = _squash(listing.find('p', class_='infor').text)

        # Not every listing has an agent block; use an empty string rather
        # than leaving the value undefined or stale from the previous row.
        agent_tag = listing.find('div', class_='jjr')
        source = _squash(agent_tag.text) if agent_tag is not None else ''

        print('------------------')
        # Blank separator row before each listing, as in the posted script.
        ws.append([])
        # NOTE(review): the posted call was `ws.append()` with no argument;
        # the row below is reconstructed from the header columns - confirm.
        ws.append([detail, size, address, source, price, link])
        print(f'详情：{detail}\n大小：{size}\n地址：{address}\n来自：{source}\n价格：{price}\n链接：{link}')

    wb.save(out_path)


if __name__ == '__main__':
    scrape_listings('https://nn.58.com/chuzu/?PGTID=0d100000-0034-d95a-9039-6f71a0d2bcaa&ClickID=4')

然后再解密，这里解密的方法就是从0-9找到对应的数字，组成字典，然后遍历对照，代码如下：
from openpyxl import load_workbook,Workbook
# Glyph-to-digit table captured from one page load. The site changes this
# assignment on every refresh, so it has to be rebuilt for each scrape.
_DEFAULT_DIGIT_MAP = {'閏': 1, '麣': 2, '驋': 3, '龤': 4, '鑶': 5,
                      '龥': 6, '餼': 7, '鸺': 8, '齤': 9, '龒': 0}


def jiemi(x, mapping=None):
    """Decode font-obfuscated digits in *x*.

    Each character found in the mapping is replaced by the string form of
    its mapped value; every other character is kept unchanged.

    Args:
        x: Text to decode.
        mapping: Optional character-to-digit table. When omitted, the
            built-in table captured from a single page load is used.

    Returns:
        The decoded string.

    Raises:
        TypeError: If *x* is not iterable (for example ``None``).
    """
    table = _DEFAULT_DIGIT_MAP if mapping is None else mapping
    # Look each character up individually; fall back to the character itself.
    return ''.join(str(table.get(ch, ch)) for ch in x)

def decrypt_workbook(src_path='58住房信息.xlsx', dst_path='58住房信息1.xlsx'):
    """Copy a scraped workbook, decoding every text cell with ``jiemi``.

    Args:
        src_path: Workbook produced by the scraping step.
        dst_path: Workbook that receives the decoded rows. openpyxl only
            writes the xlsx format; a relative name is saved relative to
            the current working directory.
    """
    wb = load_workbook(src_path)   # workbook holding the scraped data
    ws = wb.active
    wb2 = Workbook()               # workbook receiving the decoded data
    ws2 = wb2.active

    for row in ws.rows:            # ws.rows yields one tuple of cells per row
        decoded_row = []
        for cell in row:
            print('------------')
            value = cell.value
            if isinstance(value, str):
                value = jiemi(value)
                print(value)
            # Non-text cells (e.g. None from the blank separator rows) are
            # passed through unchanged so later columns stay aligned.
            decoded_row.append(value)
        ws2.append(decoded_row)
        print(decoded_row)

    wb2.save(dst_path)


if __name__ == '__main__':
    decrypt_workbook()

老飞机 发表于 2020-5-1 20:35

# @Time : 2020/3/18 12:43
# @AuThor : xx
# @file : 58.py
# @Software: PyCharm

from lxml import etree
import requests , re , random
from fontTools.ttLib import TTFont
from bs4 import BeautifulSoup
import csv,base64

CSV_HEADER = ['房名标题', '价格', '租赁方式', '房屋类型', '朝向楼层', '所在小区', '所属区域', '详细地址', '电话']

# Output file and csv writer; both are set by main() before scraping starts.
file = None
write = None

UA = [
    "Mozilla/5.0 (Macintosh; U; Intel Mac OS X 10_6_8; en-us) AppleWebKit/534.50 (KHTML, like Gecko) Version/5.1 Safari/534.50",
    "Mozilla/5.0 (Windows; U; Windows NT 6.1; en-us) AppleWebKit/534.50 (KHTML, like Gecko) Version/5.1 Safari/534.50",
    "Mozilla/5.0 (Macintosh; Intel Mac OS X 10.6; rv:2.0.1) Gecko/20100101 Firefox/4.0.1",
    "Mozilla/5.0 (Windows NT 6.1; rv:2.0.1) Gecko/20100101 Firefox/4.0.1",
]

# NOTE(review): the posted script referenced `ip` without defining it;
# presumably a list of proxy URLs - confirm. Empty means "no proxy".
ip = []

# Matches the base64 payload of the font embedded in a detail page.
_FONT_RE = re.compile(r"charset=utf-8;base64,(.*?)'\)")

# NOTE(review): the patterns below are kept exactly as posted. They look
# truncated (HTML tags appear to have been stripped when the code was
# pasted), so a trailing lazy group captures only empty strings. Restore
# them against the live page markup before relying on these fields.
_PRICE_PATTERN = 'class="f36 strongbox">(.*?)'
_MODE_PATTERN = '<li>租赁方式：(.*?)'
_TYPE_PATTERN = 'class="strongbox">(.*?)'
_ORIENTATION_PATTERN = '(.*?)'
_PHONE_PATTERN = '(.*?)'


def get_html(url):
    """Download *url* and return its HTML text.

    Returns:
        The page text, or ``None`` when the URL is an ad link, the site
        answered with its captcha page, or the status code is not 200.
    """
    if 'https://e.58.com/all/zhiding.html' in url:
        print('广告链接')
        return None

    headers = {"User-Agent": random.choice(UA)}

    proxies = None
    if ip:
        proxy = random.choice(ip)
        proxies = {'http': proxy, 'https': proxy}
        print('正在使用', proxies)

    response = requests.get(url, headers=headers, proxies=proxies, timeout=10)
    response.encoding = 'utf-8'

    # Test the decoded body, not the Response object itself.
    if '访问过于频繁，本次访问做以下验证码校验' in response.text:
        print('请打开这个网站完成人机验证：', url)
        return None

    if response.status_code == 200:
        print(url)
        return response.text
    return None


def _scrape_detail(url):
    """Fetch one detail page, save its embedded font and hand the fields to poyi."""
    page = get_html(url)
    if page is None:
        return

    zhuan = etree.HTML(page)
    bs4_parser = BeautifulSoup(page, 'html.parser')

    font_match = _FONT_RE.search(page)
    if font_match is None:
        raise ValueError('no embedded base64 font found in ' + url)
    with open('58.ttf', 'wb') as f:
        f.write(base64.b64decode(font_match.group(1)))

    title = str(zhuan.xpath('//div[@class="house-title"]/h1/text()'))
    Price = str(re.findall(_PRICE_PATTERN, page))  # price
    Renting = Price + '元/月' + str(bs4_parser.find(class_='instructions').text)
    mode = str(re.findall(_MODE_PATTERN, page))  # rental mode
    Renting_Type = str(re.findall(_TYPE_PATTERN, page))  # house type
    Orientation = str(re.findall(_ORIENTATION_PATTERN, page))  # orientation / floor
    # NOTE(review): village and region use the same XPath as posted - confirm.
    village = str(zhuan.xpath('//ul[@class="f14"]/li/span/a[@class="c_333 ah"]/text()'))
    region = str(zhuan.xpath('//ul[@class="f14"]/li/span/a[@class="c_333 ah"]/text()'))
    address = str(bs4_parser.find(class_='dz').text)  # detailed address
    phone = str(re.findall(_PHONE_PATTERN, page))
    poyi(title, Renting, mode, Renting_Type, Orientation, village, region, address, phone)


def html_data():
    """Walk the listing pages and process every detail link found on them.

    Errors are reported and skipped per page and per detail link, so one
    bad page does not abort the rest of the run.
    """
    for ic in range(1, 70):
        try:
            # NOTE(review): the path segment is fixed at pn2 and only ClickID
            # varies, so every iteration appears to request the same listing
            # page - confirm whether the page number was meant to follow ic.
            listing_page = get_html('https://xn.58.com/chuzu/pn2/?PGTID=0d3090a7-0080-454a-f3e5-d7bf08c919d6&ClickID=' + str(ic))
            if listing_page is None:
                continue
            href = etree.HTML(listing_page).xpath('//div[@class="des"]/h2/a/@href')
        except Exception as j:
            print('解析函数出错：', j)
            continue

        for url in href:
            try:
                _scrape_detail(url)
            except Exception as j:
                print('解析函数出错：', j)


def poyi(title, Renting, mode, Renting_Type, Orientation, village, region, address, phone):
    """Substitute font-obfuscated code points in the fields, then save and print them.

    Reads the font previously written to ``58.ttf``, builds a table from
    ``hex(code point)`` to ``glyph id - 1``, applies it to every field,
    writes the row through the module-level csv writer (when one is set)
    and prints the result.
    """
    font = TTFont('58.ttf')
    font.saveXML('test.xml')
    gly_list = font.getReverseGlyphMap()   # glyph name -> glyph id
    cmap = font.getBestCmap()              # code point -> glyph name

    # NOTE(review): reconstructed from the posted loops, whose subscripts
    # were lost; assumes glyph id minus one is the digit - confirm.
    dic = {hex(codepoint): int(gly_list[glyph_name]) - 1
           for codepoint, glyph_name in cmap.items()}

    fields = [title, Renting, mode, Renting_Type, Orientation,
              village, region, address, phone]
    for key, value in dic.items():
        fields = [text.replace(key, str(value)).replace(';', '') for text in fields]
    (title, Renting, mode, Renting_Type, Orientation,
     village, region, address, phone) = fields

    if write is not None:
        # Column order follows CSV_HEADER.
        write.writerow(fields)

    print('*'*80, '分割线', '*'*80)
    print('房名标题：', title)
    print('价格', Renting)
    print('租赁方式', mode)
    print('房屋类型', Renting_Type.replace('&nbsp', ' ').replace(' ', ''))
    print('朝向楼层', Orientation.replace('&nbsp', ' '))
    print('所在小区', village)
    print('所属区域', region.replace("', '", ' '))
    print('详细地址', address.replace('\n', '').replace(' ', ''))
    print('电话', phone)


def main():
    """Open the output CSV, write the header row and run the scraper."""
    global file, write
    # newline='' is what the csv module expects; utf-8-sig keeps the Chinese
    # text independent of the platform's default encoding.
    with open('租房信息.csv', 'w', newline='', encoding='utf-8-sig') as file:
        write = csv.writer(file)
        write.writerow(CSV_HEADER)
        try:
            html_data()
        except Exception as g:
            print('出现错误：', g)


if __name__ == '__main__':
    main()

YuanFang0w0 发表于 2022-3-14 01:20

提供一个思路，其实跟你的差不多，也是创建字典，不过是编码字典：爬取到价格以后，提取出里面的数字，对提取到的数字进行 UTF-8 编码，获取编码后的文本，然后保存为字典 {编码: 数字}，同样把 0-9 都解析出来就行了。之后每次爬取到价格，都先进行编码，再按编码后的结果在字典中查找，翻译出数字。这种方式应该是不会变的，你可以试试。我只是提供一个思路，并没有去实践！改天有时间了我会去实践一下！

王祖蓝天 发表于 2020-7-6 17:53

58招聘信息怎么导出，有成品吗？

1911167165 发表于 2020-7-9 14:26

怎么整看不懂

zyt072594 发表于 2020-9-5 03:10

楼主数字解密出来了吗？告诉一声我哦

青衫桑 发表于 2020-9-29 21:25

我一开始以为只是一一对应关系，刷新之后我傻了

古道瘦马 发表于 2020-11-25 19:13

求出一期破虚拟号码，可有偿{:1_926:}

李冰0112 发表于 2020-12-2 08:59

58现在对外显示的都是虚拟号了，不是老板真实的电话

pythonSpider 发表于 2020-12-2 14:30

没怎么看明白

pythonSpider 发表于 2020-12-2 14:40

本帖最后由 pythonSpider 于 2020-12-2 15:24 编辑

看明白了

页: [1] 2

吾爱破解 - 52pojie.cn's Archiver

尝试解决58同城数字加密