Python 如何把文本样式的时间格式调整为日期格式?
本帖最后由 double07 于 2021-8-26 11:24 编辑。
请问:下列代码输出的“开拍时间”(67行)及“数据截止日期”(74行)均为文本格式,无法按日期进行筛选,如何调整才能让输出的时间为日期格式?
import re
import time
import chardet
import cpca
import pandas as pd
import requests
from lxml import etree
# Module-level state shared by the crawler functions below.
p = 0
n = 0  # pages scanned so far; reset and incremented by main()
startPrice = ''
curPage = 23
data_list = []  # one dict per parsed listing, appended by parse_url_detail()
id_init = []
id_stock = list(id_init)

# Read the clock once so both stamps describe the same moment (two separate
# reads could straddle midnight and disagree on the date).
_now = time.localtime()
st = time.strftime("%Y-%m-%d %H-%M", _now)  # suffix of the output workbook name
st2 = time.strftime("%Y-%m-%d", _now)  # written to the '数据截止日期' column

# NOTE(review): the cookie entries are missing from this paste (the literal was
# left unclosed, which is a syntax error); fill in the session cookies the site
# requires before running.
cookies = {}
headers = {
    # The stray trailing apostrophe inside the original literal was removed.
    "user-agent": "Mozilla/5.0 (Windows NT 10.0; WOW64; rv:60.0)Gecko/20100101 Firefox/67.0",
}
# Fetch page content
def gethtml(url, timeout=30):
    """Download *url* and return the response body decoded to text.

    The charset is sniffed from the raw bytes with ``chardet``; bytes that
    cannot be decoded are dropped (``'ignore'`` error handler). The request is
    sent with the module-level ``cookies`` and ``headers``.

    Args:
        url: Address of the page to fetch.
        timeout: Seconds to wait on the server before ``requests`` raises a
            timeout error, so a stalled connection cannot hang the whole run.

    Returns:
        The page content as ``str``.
    """
    response = requests.get(url, cookies=cookies, headers=headers, timeout=timeout)
    raw = response.content
    # chardet reports ``encoding: None`` when it cannot guess (for example on
    # an empty body); decoding with ``None`` would raise TypeError.
    encoding = chardet.detect(raw).get('encoding') or 'utf-8'
    try:
        return raw.decode(encoding, 'ignore')
    except LookupError:
        # The detected name is not a codec this Python build knows.
        return raw.decode('utf-8', 'ignore')
def _first(matches, what):
    """Return the first item of *matches*; raise ValueError naming *what* if empty."""
    if not matches:
        raise ValueError('%s not found in detail page' % what)
    return matches[0]


def _to_number(text):
    """Parse a numeric string such as ``"1,234,567"`` into an int (or float)."""
    cleaned = text.strip().replace(",", "")
    try:
        return int(cleaned)
    except ValueError:
        return float(cleaned)


def _labelled_price(label, page):
    """Return the amount displayed next to *label* in the raw page HTML."""
    pattern = (r'<span.*?>' + label + r'</span>\s*<span.*?>:\s*<em\s*class="m-i">¥</em>'
               r'<span\s*class="J_Price">(.*?)</span>')
    return _to_number(_first(re.findall(pattern, page), label))


def _to_wan(amount):
    """Convert an amount to units of ten thousand, rounded to two decimals."""
    return round(amount / 10000, 2)


def _ms_to_day(ms_text):
    """Format a millisecond epoch timestamp (as text) as local ``MM月DD日``."""
    return time.strftime("%m月%d日", time.localtime(_to_number(ms_text) / 1000))


def parse_url_detail(r):
    """Extract one listing's fields from a detail page and record them.

    The fields are collected into a dict that is appended to the module-level
    ``data_list``; that same accumulated list is returned.

    ``xpath()`` and ``re.findall()`` both return lists, so the first match is
    taken explicitly before any string method is applied. Numbers are parsed
    with ``int``/``float`` rather than ``eval`` so that text scraped from the
    page is never executed as code.

    Args:
        r: Detail page HTML as text.

    Returns:
        The module-level ``data_list`` with the new record appended.

    Raises:
        ValueError: If a required field cannot be found or parsed. Only the
            appraisal price is optional.
    """
    html = etree.HTML(r)

    def node(path, what):
        # First XPath hit, stripped of surrounding whitespace.
        return _first(html.xpath(path), what).strip()

    lst = {}
    lst['索引'] = ''
    lst['阶段'] = node('//*[@id="page"]/div/div/div/h1/span/text()', '阶段')
    lst['标的物地址'] = node('//*[@class="pm-main clearfix"]/h1/text()', '标的物地址')
    lst['起拍价/万元'] = _to_wan(
        _to_number(node('//*[@id="sf-price"]/div/p/span/em/text()', '起拍价')))

    # Appraisal price: fall back to the market price label, then to a blank
    # cell. The key is assigned first so the column order stays stable.
    lst['评估价/万元'] = ' '
    for label in ('评 估 价', '市场价'):
        try:
            lst['评估价/万元'] = _to_wan(_labelled_price(label, r))
        except ValueError:
            continue
        break

    lst['保证金/万元'] = _to_wan(_labelled_price('保证金', r))  # deposit
    lst['加价幅度'] = _labelled_price('加价幅度', r)  # bid increment

    # The countdown attributes are divided by 1000 before time.localtime(),
    # i.e. they are treated as millisecond timestamps. The two columns are
    # written as formatted text (month and day only), not as date objects.
    lst['开拍时间'] = _ms_to_day(node('//*[@id="sf-countdown"]/@data-start', '开拍时间'))
    lst['结束时间'] = _ms_to_day(node('//*[@id="sf-countdown"]/@data-end', '结束时间'))

    lst['详情页链接'] = "https:" + node('//*[@id="page"]/link/@href', '详情页链接')
    lst['项目ID'] = int(_first(re.findall(r'item_id=(\d{1,})', r), '项目ID'))
    lst['数据截止日期'] = st2
    data_list.append(lst)
    return data_list
def updata(i):
    """Scan one listing page and save every listing that is not yet recorded.

    For each item id on the page that is absent from the module-level
    ``id_list``, the detail page is fetched and parsed, district and address
    columns are derived with ``cpca``, and the accumulated table is written to
    the Excel workbook.

    NOTE(review): the pasted source lost its subscripts (``eval(id_lst)``,
    ``re.findall(...).strip()``, ``df['索引'].at = ...``), all of which fail at
    runtime as written. Pairing the n-th id with the n-th ``itemUrl`` is the
    assumed original intent; confirm against the real page payload.

    Args:
        i: URL of the listing page to scan.
    """
    html = gethtml(i)
    item_ids = re.findall(r'"id":(\d{1,})', html)
    item_urls = re.findall(r'"itemUrl":\s*"(//[^\s]*)\?.*?"', html)
    known_ids = set(id_list)  # built once: O(1) membership test per item

    for item_id, item_url in zip(item_ids, item_urls):
        # int() replaces eval(): page text must never be executed as code.
        if int(item_id) in known_ids:
            continue

        parse = parse_url_detail(gethtml("https:" + item_url.strip()))
        time.sleep(7)  # pause between detail-page requests

        df = pd.DataFrame(parse)
        # 1-based running number for every row collected so far.
        df['索引'] = range(1, len(df) + 1)
        df2 = cpca.transform(df['标的物地址'])
        df['区'] = df2['区']
        df['地址'] = df2['地址']
        df.to_excel("C:/Users/Administrator/Desktop/Python/AL-SF/每日房源数据-调试/1-阿里_住宅最新房源" + st + ".xlsx",
                    index=False)
        print('新增第%s条数据已保存' % len(df))
        time.sleep(4)
# Pagination: next_page() returns the collection of listing-page URLs that main() iterates over.
# NOTE(review): the right-hand side of `url_list =` is missing in this paste and
# `url_np` holds only the bare 'https://' prefix, so the function cannot run as
# shown; restore the original URL-building expression before use.
def next_page():
url_np = 'https://'
url_list =
return url_list
# Main program
def main():
    """Crawl every page returned by ``next_page()`` and report per-page timing.

    Resets the module-level counter ``n``, then increments it after each page
    has been handed to ``updata`` and prints the elapsed wall-clock seconds.
    """
    global n
    n = 0
    for page_url in next_page():
        started = time.time()
        updata(page_url)
        elapsed = time.time() - started
        n += 1
        print('第%s页已扫描====用时%.1f秒' % (n, elapsed))
# Script entry point: load the project IDs already stored in the workbook, then crawl.
if __name__ == '__main__':
# while True:
# NOTE(review): the path literal below is garbled in this paste (unbalanced
# quotes around `+ ".xlsx`); restore the original file-name expression before
# running.
df = pd.read_excel("C:/Users/每日房源数据-调试/数据更新+ ".xlsx",
sheet_name='源数据')
# Module-level name read by updata() to skip listings that were already saved.
id_list = df['项目ID'].tolist()
main()
pd.to_datetime、pd.Timestamp
https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.Timestamp.html#
import datetime
# Back-fill start date: parse the date text directly into a datetime.date,
# which carries no time-of-day component.
Initial_time = datetime.datetime.strptime('2021-08-24', '%Y-%m-%d').date()
print(Initial_time, type(Initial_time))
rsnodame 发表于 2021-8-26 12:48
pd.Timestamp
https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.Timestamp.html#
有个"00:00:00"的尾巴
double07 发表于 2021-8-26 15:05
有个"00:00:00"的尾巴
日期格式的转换,time库和datetime库都可以,因为看到你用了pandas,所以可以直接用pandas的Timestamp类,能直接喂给pandas做筛选。
你的代码没仔细看,如果目的是单纯的日期格式的转换,考虑到你用的pandas,建议你用datetime库,因为pandas的时间类数据是基于datetime的。具体用法网上很多介绍
页:
[1]