本帖最后由 double07 于 2021-8-26 11:24 编辑
请问:下列代码输出的“开拍时间”(67行)及“数据截止日期”(74行)均为文本格式,无法按日期进行筛选。应如何调整,才能让输出的时间为日期/时间格式?
[Python] 纯文本查看 复制代码
import re
import time
import chardet
import cpca
import pandas as pd
import requests
from lxml import etree
# Module-level state shared by the scraper functions below.
p = 0  # declared `global` in updata() but never modified in the visible code
n = 0  # number of listing pages scanned; reset and incremented by main()
startPrice = ''  # second argument passed to str.format() in next_page()
curPage = 23  # how many listing-page URLs next_page() generates
data_list = []  # parse_url_detail() appends one record per detail page
id_init = []
id_stock = list(id_init)  # independent copy, so id_init is never mutated

# Timestamps taken once at start-up: `st` is embedded in the output file
# name, `st2` is stored in every record under '数据截止日期'.
st = time.strftime("%Y-%m-%d %H-%M", time.localtime())
st2 = time.strftime("%Y-%m-%d", time.localtime())

# NOTE(review): the posted source left this literal unterminated
# (`cookies = {`), which is a syntax error -- presumably the real cookies were
# removed before posting. Fill in the session cookies here if the site
# requires them.
cookies = {}
# The stray trailing apostrophe that was inside the user-agent value has been
# removed.
headers = {
    "user-agent": "Mozilla/5.0 (Windows NT 10.0; WOW64; rv:60.0)Gecko/20100101 Firefox/67.0",
}
# Fetch the content of a web page
def gethtml(url):
    """Download *url* and return the response body decoded to ``str``.

    The request is sent with the module-level ``cookies`` and ``headers``.
    The body's encoding is guessed with ``chardet``; bytes that cannot be
    decoded with the guessed encoding are dropped (``'ignore'``).

    Raises:
        requests.RequestException: on connection failure, or when the server
            does not answer within the timeout.
    """
    # Without a timeout a stalled connection would block the scraper forever.
    response = requests.get(url, cookies=cookies, headers=headers, timeout=30)
    raw = response.content
    # chardet reports ``None`` when it cannot guess (e.g. an empty body);
    # bytes.decode(None, ...) would raise TypeError, so fall back to UTF-8.
    encoding = chardet.detect(raw)['encoding'] or 'utf-8'
    return raw.decode(encoding, 'ignore')
def _to_number(text):
    """Convert scraped numeric text such as ``'1,234,500'`` to a number.

    Thousands separators are removed first. Whole numbers come back as
    ``int``, anything else as ``float``.

    Raises:
        ValueError: if the text is not a plain decimal number.
    """
    # Replaces eval(): the text comes from a remote page and must never be
    # executed as Python code.
    cleaned = text.strip().replace(",", "")
    try:
        return int(cleaned)
    except ValueError:
        return float(cleaned)


def _labelled_price(page, label):
    """Return the raw ``J_Price`` text that follows *label* in *page*.

    Raises:
        IndexError: if the page has no price entry with that label.
    """
    pattern = (
        r'<span.*?>' + label
        + r'</span>\s*<span.*?>:\s*<em\s*class="m-i">¥</em><span\s*class="J_Price">(.*?)</span>'
    )
    return re.findall(pattern, page)[0]


def parse_url_detail(r):
    """Extract one auction record from detail-page HTML.

    The record is appended to the module-level ``data_list``, and that same
    list (holding every record parsed so far) is returned.

    '开拍时间', '结束时间' and '数据截止日期' are stored as ``pandas.Timestamp``
    objects rather than formatted strings, so that ``DataFrame.to_excel``
    writes real date cells that can be filtered by date in Excel.

    Args:
        r: HTML source of the detail page, as ``str``.

    Returns:
        The shared ``data_list``.

    Raises:
        IndexError: if a mandatory element is missing from the page.
        ValueError: if a mandatory price or timestamp is not numeric.
    """
    html = etree.HTML(r)
    lst = {}
    lst['索引'] = ''  # placeholder; updata() fills in the running index
    lst['阶段'] = html.xpath('//*[@id="page"]/div[4]/div/div/h1/span/text()')[0].strip()
    lst['标的物地址'] = html.xpath('//*[@class="pm-main clearfix"]/h1/text()')[1].strip()
    start_price = html.xpath('//*[@id="sf-price"]/div/p[1]/span/em/text()')[0]
    lst['起拍价/万元'] = round(_to_number(start_price) / 10000, 2)

    # Appraised value: try the '评 估 价' label first, then '市场价'; when
    # neither yields a number the cell stays a single blank.
    lst['评估价/万元'] = ' '
    for label in ('评 估 价', '市场价'):
        try:
            lst['评估价/万元'] = round(_to_number(_labelled_price(r, label)) / 10000, 2)
        except (IndexError, ValueError):
            continue
        break

    # Deposit
    lst['保证金/万元'] = round(_to_number(_labelled_price(r, '保证金')) / 10000, 2)
    # Bid increment
    lst['加价幅度'] = _to_number(_labelled_price(r, '加价幅度'))

    # The countdown attributes are divided by 1000 to obtain second-based
    # timestamps, which are then converted to local time.
    start_raw = html.xpath('//*[@id="sf-countdown"]/@data-start')[0].strip()
    lst['开拍时间'] = pd.Timestamp.fromtimestamp(float(start_raw) / 1000)
    end_raw = html.xpath('//*[@id="sf-countdown"]/@data-end')[0].strip()
    lst['结束时间'] = pd.Timestamp.fromtimestamp(float(end_raw) / 1000)

    lst['详情页链接'] = "https:" + html.xpath('//*[@id="page"]/link[2]/@href')[0].strip()
    lst['项目ID'] = int(re.findall(r'item_id=(\d{1,})', r)[0])  # item id
    # st2 is the module-level 'YYYY-MM-DD' string captured at start-up.
    lst['数据截止日期'] = pd.Timestamp(st2)
    data_list.append(lst)
    return data_list
def updata(i):
    """Scrape one listing page and save every auction not seen before.

    For each item id on the page that is not in the module-level ``id_list``,
    the item's detail page is fetched and parsed, and the accumulated records
    are written to the Excel output file (the file is rewritten after every
    new item, so progress survives an interruption).

    Args:
        i: URL of the listing page.
    """
    listing = gethtml(i)
    item_ids = re.findall(r'"id":(\d{1,})', listing)
    # Extracted once per page instead of once per item.
    item_urls = re.findall(r'"itemUrl":\s*"(//[^\s]*)\?.*?"', listing)
    for pos, raw_id in enumerate(item_ids):
        # int() instead of eval(): the id text comes from a remote page.
        if int(raw_id) in id_list:
            continue
        detail_html = gethtml("https:" + item_urls[pos].strip())
        records = parse_url_detail(detail_html)
        time.sleep(7)  # pause between detail-page requests
        df = pd.DataFrame(records)
        # 1-based running index. Assigning the whole column avoids chained
        # assignment (df[col].at[i] = ...), which may write to a temporary copy.
        df['索引'] = range(1, len(df) + 1)
        located = cpca.transform(df['标的物地址'])
        df['区'] = located['区']
        df['地址'] = located['地址']
        df.to_excel("C:/Users/Administrator/Desktop/Python/AL-SF/每日房源数据-调试/1-阿里_住宅最新房源" + st + ".xlsx",
                    index=False)
        print('新增第%s条数据已保存' % len(df))
    # NOTE(review): the posted source lost its indentation; this pause is
    # assumed to run once per listing page rather than once per item -- confirm.
    time.sleep(4)
# Pagination
def next_page():
    """Return the list of listing-page URLs, one entry per page.

    Each entry is the URL template formatted with the 1-based page number and
    the module-level ``startPrice``; ``curPage`` entries are produced. The
    template visible here contains no replacement fields, so ``format()``
    returns it unchanged for every page.
    """
    template = 'https://'
    pages = []
    for page_no in range(1, curPage + 1):
        pages.append(template.format(page_no, startPrice))
    return pages
# Main program
def main():
    """Scan every listing page in order and report how long each one took.

    The module-level counter ``n`` is reset to zero and then incremented once
    per page scanned.
    """
    global n
    n = 0
    for page_url in next_page():
        started = time.time()
        updata(page_url)
        elapsed = time.time() - started
        n += 1
        print('第%s页已扫描====用时%.1f秒' % (n, elapsed))
if __name__ == '__main__':
    # Load the ids that were already collected, so updata() can skip them.
    # NOTE(review): the path literal in the posted source was malformed
    # (`"...数据更新+ ".xlsx"` -- a stray `+ "` inside the string, leaving it
    # unterminated). It is repaired here on the assumption that the workbook
    # is named '数据更新.xlsx'; confirm the real file name.
    df = pd.read_excel("C:/Users/每日房源数据-调试/数据更新.xlsx",
                       sheet_name='源数据')
    id_list = df['项目ID'].tolist()
    main()
|