Python如何把文本样式的时间格式调整为日期格式？

double07 发表于 2021-8-26 11:23

本帖最后由 double07 于 2021-8-26 11:24 编辑

请问：下列代码输出的“开拍时间”（67行）及“数据截止日期”（74行）均为文本格式，无法按日期进行筛选，如何调整才能让输出的时间为日期格式？
import re
import time

import chardet
import cpca
import pandas as pd
import requests
from lxml import etree

# --- Module-level state shared by the scraping functions below ---
p = 0
n = 0  # number of listing pages scanned so far; reset and updated by main()
startPrice = ''
curPage = 23
data_list = []  # one dict per parsed detail page, appended by parse_url_detail()
id_init = []
id_stock = []
id_stock = id_stock + id_init
# Timestamps taken once at start-up: `st` becomes part of the output file
# name, `st2` is written to the '数据截止日期' column.
st = time.strftime("%Y-%m-%d %H-%M", time.localtime())
st2 = time.strftime("%Y-%m-%d", time.localtime())

# The cookie values were left out of the posted code and the dict literal was
# never closed, which is a SyntaxError. An empty dict keeps the module
# importable; fill in real cookies if the site requires a session.
cookies = {}
headers = {
    # NOTE(review): the value ends with a stray apostrophe; left untouched
    # because it is sent to the server as-is.
    "user-agent": "Mozilla/5.0 (Windows NT 10.0; WOW64; rv:60.0)Gecko/20100101 Firefox/67.0'",
}

# Fetch page content
def gethtml(url):
    """Download *url* and return its body decoded to text.

    The request is sent with the module-level ``cookies`` and ``headers``.
    The character set is detected from the raw bytes with ``chardet``;
    bytes that cannot be decoded are dropped.

    Args:
        url: Address of the page to fetch.

    Returns:
        The response body as ``str``.
    """
    # Without a timeout, requests can block forever on a stalled connection.
    response = requests.get(url, cookies=cookies, headers=headers, timeout=30)
    # chardet reports encoding=None when it cannot decide (e.g. an empty
    # body); bytes.decode(None, ...) raises TypeError, so fall back to UTF-8.
    encoding = chardet.detect(response.content)['encoding'] or 'utf-8'
    return response.content.decode(encoding, 'ignore')

def parse_url_detail(r):
    """Parse one auction detail page and append its fields to ``data_list``.

    Args:
        r: HTML source of the detail page, as text.

    Returns:
        The module-level ``data_list``, which now ends with the new record.

    Raises:
        IndexError: if a required XPath or the item id is absent from the page.
        ValueError: if the deposit or bid-increment amount cannot be found.
    """
    html = etree.HTML(r)

    def first(path):
        # xpath() returns a list of matches. The posted code called .strip()
        # on that list itself, which raises AttributeError.
        return html.xpath(path)[0].strip()

    record = {}
    record['索引'] = ''
    record['阶段'] = first('//*[@id="page"]/div/div/div/h1/span/text()')
    record['标的物地址'] = first('//*[@class="pm-main clearfix"]/h1/text()')
    record['起拍价/万元'] = round(
        _to_number(first('//*[@id="sf-price"]/div/p/span/em/text()')) / 10000, 2)

    # Appraised price, falling back to the market price; a single space is
    # stored when neither is present on the page.
    assessed = _find_amount(r, '评估价', '市场价')
    record['评估价/万元'] = ' ' if assessed is None else round(assessed / 10000, 2)

    deposit = _find_amount(r, '保证金')  # deposit
    increment = _find_amount(r, '加价幅度')  # bid increment
    if deposit is None or increment is None:
        raise ValueError('deposit or bid increment not found in detail page')
    record['保证金/万元'] = round(deposit / 10000, 2)
    record['加价幅度'] = increment

    # data-start / data-end hold timestamps in milliseconds.
    # NOTE(review): these two columns are written as text such as '08月26日',
    # so Excel cannot filter them as dates. Storing
    # datetime.date.fromtimestamp(start_ms / 1000) instead would give real
    # date cells, at the cost of changing the current output format.
    start_ms = _to_number(first('//*[@id="sf-countdown"]/@data-start'))
    record['开拍时间'] = time.strftime("%m月%d日", time.localtime(start_ms / 1000))
    end_ms = _to_number(first('//*[@id="sf-countdown"]/@data-end'))
    record['结束时间'] = time.strftime("%m月%d日", time.localtime(end_ms / 1000))

    record['详情页链接'] = "https:" + first('//*[@id="page"]/link/@href')
    record['项目ID'] = int(re.findall(r'item_id=(\d{1,})', r)[0])  # project id
    record['数据截止日期'] = st2
    data_list.append(record)
    return data_list


def _to_number(text):
    """Convert a scraped numeric string such as '1,234,500' to int or float."""
    # The posted code ran eval() on text taken from the web page, which would
    # execute arbitrary expressions from an untrusted source. Plain int/float
    # parsing yields the same values for numeric text.
    cleaned = text.replace(",", "").strip()
    try:
        return int(cleaned)
    except ValueError:
        return float(cleaned)


def _find_amount(page, *labels):
    """Return the ￥ amount after the first of *labels* found in *page*, else None."""
    for label in labels:
        # The posted patterns ended in a lazy "(.*?)", which always captures
        # an empty string at the end of a pattern; capture the digits instead.
        pattern = (r'<span.*?>' + label + r'\s*<span.*?>:\s*<em\s*class="m-i">￥'
                   r'<span\s*class="J_Price">\s*([\d,.]+)')
        found = re.search(pattern, page)
        if found is None:
            continue
        try:
            return _to_number(found.group(1))
        except ValueError:
            continue
    return None

def updata(i):
    """Scrape one listing page and export every item not already recorded.

    For each item on the page whose id is not in the module-level
    ``id_list``, the detail page is fetched and parsed, and the accumulated
    records are re-written to the Excel export.

    Args:
        i: URL of the listing page.
    """
    html = gethtml(i)
    item_ids = re.findall(r'"id":(\d{1,})', html)
    item_urls = re.findall(r'"itemUrl":\s*"(//[^\s]*)\?.*?"', html)
    # Assumes the n-th "id" and the n-th "itemUrl" on the page describe the
    # same item — TODO confirm against the page's JSON. The posted code had
    # lost its [d] subscripts and called eval()/.strip() on the whole lists.
    for item_id, item_url in zip(item_ids, item_urls):
        if int(item_id) in id_list:
            continue
        html_detail = gethtml("https:" + item_url.strip())
        records = parse_url_detail(html_detail)
        time.sleep(7)  # pause between detail-page requests
        df = pd.DataFrame(records)
        # 1-based running index. The posted loop re-used the name `i` and had
        # lost the [i] subscript on .at.
        df['索引'] = range(1, len(df) + 1)
        df2 = cpca.transform(df['标的物地址'])
        df['区'] = df2['区']
        df['地址'] = df2['地址']
        df.to_excel("C:/Users/Administrator/Desktop/Python/AL-SF/每日房源数据-调试/1-阿里_住宅最新房源" + st + ".xlsx",
                    index=False)
        print('新增第%s条数据已保存' % len(df))
        time.sleep(4)

# Pagination: returns the collection of listing-page URLs that main() iterates over.
def next_page():
# NOTE(review): the body lost its indentation in this paste, and the URL
# below looks truncated to just the scheme.
url_np = 'https://'
# NOTE(review): the right-hand side of this assignment is missing from the
# post, so the statement is a SyntaxError as shown. Presumably it built one
# URL per page from url_np; restore it from the original script.
url_list =
return url_list

# Main routine
def main():
    """Scan every listing page in turn and report how long each one took.

    Resets the module-level page counter ``n`` and increments it once per
    page processed.
    """
    global n
    n = 0
    for page_url in next_page():
        started = time.time()
        updata(page_url)
        elapsed = time.time() - started
        n += 1
        print('第%s页已扫描====用时%.1f秒' % (n, elapsed))

# Script entry: load the project IDs already recorded in the workbook, then run the scan.
if __name__ == '__main__':
# while True:
# NOTE(review): the path literal below is garbled in this paste: the extra
# double quote before .xlsx leaves the string unterminated. Presumably part of
# the path was edited out before posting; restore the real file name.
df = pd.read_excel("C:/Users/每日房源数据-调试/数据更新+ ".xlsx",
 sheet_name='源数据')
# Known project IDs; updata() tests each scraped id against this list.
id_list = df['项目ID'].tolist()
main()

byyulei 发表于 2021-8-26 11:46

pd.to_datetime

rsnodame 发表于 2021-8-26 12:48

pd.Timestamp
https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.Timestamp.html#

yyjj95 发表于 2021-8-26 14:52

import datetime

# Backfill start date: parse the text form straight into a datetime.date.
Initial_time = datetime.datetime.strptime('2021-08-24', '%Y-%m-%d').date()

print(Initial_time, type(Initial_time))

double07 发表于 2021-8-26 15:05

rsnodame 发表于 2021-8-26 12:48
pd.Timestamp
https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.Timestamp.html#

有个"00:00:00"的尾巴

rsnodame 发表于 2021-8-26 16:02

double07 发表于 2021-8-26 15:05
有个"00:00:00"的尾巴

日期格式的转换，time库和datetime库都可以，因为看到你用了pandas，所以可以直接用pandas的Timestamp类，能直接喂给pandas做筛选。
你的代码没仔细看，如果目的是单纯的日期格式的转换，考虑到你用的pandas，建议你用datetime库，因为pandas的时间类数据是基于datetime的。具体用法网上很多介绍

页: [1]

吾爱破解 - 52pojie.cn's Archiver

Python如何把文本样式的时间格式调整为日期格式？