本帖最后由 panison 于 2021-3-25 12:08 编辑
有一位营销部门的哥们,需要统计哈尔滨国土局的土地招拍挂信息,而从网上逐条复制粘贴信息很麻烦。当时我只会Excel,帮不了他多少忙。
在学了Python几个月后,发现可以通过Python解决这个问题。
附源代码:
[Python] 纯文本查看 复制代码
import csv
import re
from decimal import Decimal, InvalidOperation

import requests_html
# One HTTP session is shared by every request the script makes.
session = requests_html.HTMLSession()

# Query the search endpoint once, only to read the total number of result pages.
url = "http://xxgk.harbin.gov.cn/module/xxgk/search.jsp"
headers = {"user-agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:84.0) Gecko/20100101 Firefox/84.0"}
params = {"divid": "div11574", "infotypeId": "TDZPGX", "jdid": "2", "area": "002277134"}
request = session.post(url=url, headers=headers, params=params)
request.encoding = "utf-8"

# The pager text has the form "共N页"; the digits sit between the first and
# last character of the match.
pattern = r"共\d+页"
pager_match = re.search(pattern, request.text)
if pager_match is None:
    # Fail with a readable message instead of an IndexError on an empty list.
    raise RuntimeError(
        "Could not find the page count (共N页) in the response from "
        f"{url} (HTTP status {request.status_code})"
    )
pages = int(pager_match.group()[1:-1])
# Maps the title of each land-use-right transfer notice to its URL.
land_urls = dict()

# Walk every result page and keep the links whose title ends with "出让公告".
for page in range(1, pages + 1):
    params = {
        "divid": "div11574",
        "infotypeId": "TDZPGX",
        "jdid": "2",
        "area": "002277134",
        "currpage": str(page),
    }
    request_page = session.post(url=url, headers=headers, params=params)
    request_page.encoding = "utf-8"
    # Read title and href from the same <a> element. Querying the two
    # attributes separately and zipping the results pairs the wrong title
    # with the wrong link as soon as one anchor lacks either attribute.
    for anchor in request_page.html.xpath("//li/a"):
        title = anchor.attrs.get("title")
        href = anchor.attrs.get("href")
        if title is None or href is None:
            continue
        if title.endswith("出让公告"):
            # A repeated title keeps the link seen last.
            land_urls[title] = href
# Rows destined for the CSV file. The first row is the column header; one row
# per parcel is appended after it as the notices are scraped.
_header_row = [
    "地块编号", "宗地位置", "用地性质", "出让方式",
    "用地面积(平方米)", "规划建筑面积(平方米)",
    "出让年限", "容积率",
    "出让起始价(元)", "竞买保证金(元)",
    "公告网址",
]
land_bid_info = [_header_row]
def _to_yuan(amount_text):
    """Convert an amount expressed in 万元 into a plain-number string in 元.

    Decimal arithmetic is used so that fractional amounts such as "1234.5"
    convert exactly instead of raising, as ``int()`` would. An empty cell is
    returned unchanged. Text that is not a finite number is returned with
    "万元" appended so the unit of the unconverted value stays visible.
    """
    if not amount_text:
        return amount_text
    try:
        amount = Decimal(amount_text) * 10000
    except InvalidOperation:
        return amount_text + "万元"
    if not amount.is_finite():
        return amount_text + "万元"
    # normalize() drops trailing zeros; the "f" format avoids exponent notation.
    return format(amount.normalize(), "f")


def _clean_amount(cell_text):
    """Strip spaces and thousands separators (ASCII or full-width commas)."""
    return cell_text.replace(" ", "").replace(",", "").replace(",", "")


print("========================================获取土地出让记录========================================")
# A dedicated loop variable keeps the search endpoint stored in `url` intact.
for notice_url in land_urls.values():
    print("指定URL:" + "{: <64}".format(notice_url), end=",")
    request_url = session.get(url=notice_url, headers=headers)
    request_url.encoding = "utf-8"

    # The last header cell tells whether amounts are quoted in 元 or in 万元.
    money = "".join(request_url.html.xpath("//div[@id='zoom']/table/tbody/tr[1]/td[last()]/p/span/text()"))
    amounts_in_wan = "万元" in money

    # Every row after the header describes one parcel.
    # NOTE(review): the per-row and per-cell queries below start with "//";
    # this relies on requests_html evaluating them against the element's own
    # markup rather than the whole page - confirm against the library version in use.
    trs = request_url.html.xpath("//div[@id='zoom']/table/tbody/tr[position()>1]")
    for tr in trs:
        line = list()
        # All cells except the last two (starting price and bid deposit).
        for td in tr.xpath("//td[position()<last()-1]"):
            # Swap ASCII commas for full-width ones so a cell can never be
            # mistaken for two CSV fields.
            txt = "".join(td.xpath("//text()")).replace(",", ",")
            line.append(txt)

        # Starting price and bid deposit, always reported in 元.
        money_1 = _clean_amount("".join(tr.xpath("//td[position()=last()-1]/p/span/text()")))
        money_2 = _clean_amount("".join(tr.xpath("//td[position()=last()]/p/span/text()")))
        if amounts_in_wan:
            money_1 = _to_yuan(money_1)
            money_2 = _to_yuan(money_2)
        line.append(money_1)
        line.append(money_2)
        line.append(notice_url)

        # The record is complete; queue it for the CSV file.
        land_bid_info.append(line)
    print("指定URL的土地出让记录获取完毕!")
print("=====================================将土地出让记录写入csv文件:=====================================")
# Write the header row and every scraped record to the CSV file.
# csv.writer quotes any field that contains a comma, a quote or a line break,
# so such a cell can no longer split or shift a row. newline="" lets the csv
# module control line endings itself, as its documentation requires.
with open("哈尔滨土地出让公告.csv", "w", encoding="utf-8", newline="") as f:
    csv.writer(f).writerows(land_bid_info)
print("=====================================土地出让记录已写入csv文件!=====================================")
说明一下:土地招拍挂信息属于依法应当公开的信息,可以放心使用。
|