# (Forum paste header "[Python] plain-text view / copy code" -- not part of the script.)
# @Time : 2020/3/18 12:43
# @Author : xx
# @File : 58.py
# @Software: PyCharm
from lxml import etree
import requests , re , random
from fontTools.ttLib import TTFont
from bs4 import BeautifulSoup
import csv,base64
# Output CSV shared by the whole script: `write` is the csv writer that
# poyi() uses to append one row per scraped listing.
#
# newline='' is required by the csv module; without it the writer's own
# '\r\n' line endings are translated again and every row is followed by a
# blank line on Windows.
# An explicit encoding keeps the output independent of the machine's locale
# (a locale codec such as gbk cannot encode every character found in scraped
# pages); the BOM written by 'utf-8-sig' lets spreadsheet tools detect UTF-8.
file = open('租房信息.csv', 'w+', newline='', encoding='utf-8-sig')
write = csv.writer(file)
# Header row; the column order matches the row written by poyi().
write.writerow(['房名标题', '价格', '租赁方式', '房屋类型', '朝向楼层', '所在小区', '所属区域', '详细地址', '电话'])
# Pool of desktop browser User-Agent strings; get_html() picks one at random
# for every request.
UA = [
    "Mozilla/5.0 (Macintosh; U; Intel Mac OS X 10_6_8; en-us) AppleWebKit/534.50 (KHTML, like Gecko) Version/5.1 Safari/534.50",
    "Mozilla/5.0 (Windows; U; Windows NT 6.1; en-us) AppleWebKit/534.50 (KHTML, like Gecko) Version/5.1 Safari/534.50",
    "Mozilla/5.0 (Macintosh; Intel Mac OS X 10.6; rv:2.0.1) Gecko/20100101 Firefox/4.0.1",
    "Mozilla/5.0 (Windows NT 6.1; rv:2.0.1) Gecko/20100101 Firefox/4.0.1",
]

# Listing-page URL template. The page number belongs in the /pnN/ path
# segment; the original hard-coded "pn2" and only appended the loop index to
# ClickID, so the same page was requested on every iteration.
_LISTING_URL = ('https://xn.58.com/chuzu/pn{page}/'
                '?PGTID=0d3090a7-0080-454a-f3e5-d7bf08c919d6&ClickID={page}')


def get_html(url):
    """Download *url* and return the page text.

    Args:
        url: Absolute URL of a listing page or a rental detail page.

    Returns:
        The response body decoded as UTF-8 when the server answers 200.
        None for ad links (which are never requested) and for any other
        status code.

    Raises:
        requests.RequestException: on network errors or when the 10 second
            timeout expires; html_data() catches and reports these.
    """
    if 'https://e.58.com/all/zhiding.html' in url:
        print('广告链接')
        return None

    headers = {"User-Agent": random.choice(UA)}

    # The original read a module-level proxy list named `ip` that is not
    # defined anywhere in this file, so every request died with NameError.
    # Look it up defensively and only report a proxy when a pool exists.
    # NOTE(review): as in the original, the chosen proxy is only printed and
    # is never passed to requests.get -- confirm whether it should be.
    proxy_pool = globals().get('ip')
    if proxy_pool:
        head_ip = {'https://': random.choice(proxy_pool)}
        print('正在使用', head_ip)

    # A timeout keeps one stalled connection from hanging the whole crawl.
    response = requests.get(url, headers=headers, timeout=10)
    response.encoding = 'utf-8'

    # Test the decoded body: `in response` iterates the Response object's
    # byte chunks and can never equal this str marker.
    if '访问过于频繁,本次访问做以下验证码校验' in response.text:
        print('请打开这个网站完成人机验证:', url)

    if response.status_code == 200:
        print(url)
        return response.text
    return None


def _as_text(matches):
    """Flatten a list of matched strings into one string.

    Keeps the original ``str(list)[2:-2]`` convention: an empty list gives
    '', a single match gives that match, and several matches come out
    separated by the literal ``', '`` that poyi() later replaces.
    """
    return str(matches)[2:-2]


def _scrape_listing(url):
    """Download one rental detail page, extract its fields and pass them to poyi()."""
    response = get_html(url)
    if response is None:
        # Ad link or non-200 answer: nothing to parse.
        return

    zhuan = etree.HTML(response)
    bs4_parser = BeautifulSoup(response, 'html.parser')

    # The page embeds a font as a base64 data URI; poyi() reads it back from
    # 58.ttf to undo the character substitution in the scraped fields.
    ziti = _as_text(re.findall(r"charset=utf-8;base64,(.*?)'\)", response))
    with open('58.ttf', 'wb') as f:
        f.write(base64.b64decode(ziti))

    title = _as_text(zhuan.xpath('//div[@class="house-title"]/h1/text()'))
    Price = _as_text(re.findall('class="f36 strongbox">(.*?)</b>', response))  # price
    Renting = Price + '元/月' + str(bs4_parser.find(class_='instructions').text)  # price + payment terms
    mode = _as_text(re.findall('<li><span class="c_888 mr_15">租赁方式:</span><span>(.*?)</span>', response))  # rental mode
    Renting_Type = _as_text(re.findall('class="strongbox">(.*?)</span>', response)[0:1])  # house type (first match only)
    Orientation = _as_text(re.findall('<span class="strongbox">(.*?)</span>', response))  # orientation / floor
    # One xpath query feeds both fields: first link is the community, the
    # remaining links are the region.
    location_links = zhuan.xpath('//ul[@class="f14"]/li/span/a[@class="c_333 ah"]/text()')
    village = _as_text(location_links[0:1])
    region = _as_text(location_links[1:])
    address = str(bs4_parser.find(class_='dz').text)  # detailed address
    phone = _as_text(re.findall('<p class="phone-num strongbox">(.*?)</p>', response))

    poyi(title, Renting, mode, Renting_Type, Orientation, village, region, address, phone)


def html_data():
    """Crawl listing pages 1-69 and process every rental linked from them.

    Errors are reported and skipped per listing, so one broken detail page
    (or an ad link) no longer discards the rest of its listing page.
    """
    for ic in range(1, 70):
        try:
            response = get_html(_LISTING_URL.format(page=ic))
            href = etree.HTML(response).xpath('//div[@class="des"]/h2/a/@href')
        except Exception as j:
            print('解析函数出错:', j)
            continue

        for url in href:
            try:
                _scrape_listing(url)
            except Exception as j:
                print('解析函数出错:', j)


def _glyph_number_map(font):
    """Map each entity prefix ('&#x' + hex code point) to its replacement number.

    The replacement is the glyph ID of the code point's glyph minus one,
    stored as a string (presumably the digit that glyph draws -- confirm
    against a live page).
    """
    glyph_ids = font.getReverseGlyphMap()
    return {
        hex(codepoint).replace('0x', '&#x'): str(int(glyph_ids[glyph_name]) - 1)
        for codepoint, glyph_name in font.getBestCmap().items()
    }


def _decode(text, mapping):
    """Replace every mapped entity in *text* and drop the ';' terminators."""
    for entity, number in mapping.items():
        text = text.replace(entity, number).replace(';', '')
    return text


def poyi(title, Renting, mode, Renting_Type, Orientation, village, region, address, phone):
    """Decode the font-obfuscated fields of one listing, then save and print them.

    Reads the font previously written to 58.ttf, substitutes every
    '&#x....;' entity found in the fields, appends one row to the CSV through
    the module-level `write` and prints the same values.

    Args:
        title, Renting, mode, Renting_Type, Orientation, village, region,
        address, phone: Raw field strings as extracted by html_data().
    """
    font = TTFont('58.ttf')
    try:
        font.saveXML('test.xml')  # debug dump of the font, kept from the original
        mapping = _glyph_number_map(font)
    finally:
        # Release the handle on 58.ttf before the next listing rewrites it.
        font.close()

    title = _decode(title, mapping)
    Renting = _decode(Renting, mapping)
    mode = _decode(mode, mapping)
    Renting_Type = _decode(Renting_Type, mapping)
    Orientation = _decode(Orientation, mapping)
    village = _decode(village, mapping)
    region = _decode(region, mapping)
    address = _decode(address, mapping)
    phone = _decode(phone, mapping)

    # Final clean-up, computed once and shared by the CSV row and the prints.
    # NOTE(review): replace(' ', ' ') is a no-op as pasted; presumably its
    # first argument was a non-breaking space or '&nbsp;' lost in the forum
    # paste -- confirm against the author's original.
    house_type = Renting_Type.replace(' ', ' ').replace(' ', '')
    orientation = Orientation.replace(' ', ' ')
    region_text = region.replace("', '", ' ')
    address_text = address.replace('\n', '').replace(' ', '')

    write.writerow([title, Renting, mode, house_type, orientation, village, region_text, address_text, phone])

    print('*' * 80, '分割线', '*' * 80)
    print('房名标题:', title)
    print('价格', Renting)
    print('租赁方式', mode)
    print('房屋类型', house_type)
    print('朝向楼层', orientation)
    print('所在小区', village)
    print('所属区域', region_text)
    print('详细地址', address_text)
    print('电话', phone)


if __name__ == '__main__':
    try:
        html_data()
    except Exception as g:
        print('出现错误:', g)
    finally:
        # Flush and close the CSV opened at the top of the file, even when
        # the crawl is interrupted.
        file.close()