吾爱破解 - 52pojie.cn

 找回密码
 注册[Register]

QQ登录

只需一步,快速开始

查看: 1340|回复: 0
收起左侧

[Python 原创] 找到的工具只能备份weibo到csv?自己写一个输出markdown文件!

[复制链接]
平Fan_d世界 发表于 2023-11-3 19:41
本帖最后由 平Fan_d世界 于 2023-11-3 20:20 编辑

github找到一个备份weibo的工具,https://github.com/dataabc/weibo-crawler

但是能输出的是csv或者数据库文件,没有便于阅读的内容输出
所以摸鱼写了一个导出markdown文件的py脚本。
以每天一个文件的方式导出,包括weibo发文、图片、评论、转文。
推荐使用obsidian,我照着他弄出来的。

weibo-crawler备份出的文件有:users.csv,weibodata.db,微博id.csv(如,1669879400.csv)
用SQLiteStudio之类的工具把.db数据库里的评论(comments表)导出为comments.csv;py程序里也能导出(comments_to_csv函数),就是很卡。
[SQL] 纯文本查看 复制代码
select * from comments

把comments.csv放到新建的_csv文件夹里(_csv文件夹与py脚本在同一目录)。

代码如下:
[Python] 纯文本查看 复制代码
import datetime
import os
import sqlite3

import numpy as np
import pandas as pd
from dateutil import relativedelta

# Directory that contains this script; the functions below build every
# input/output path from it.
# NOTE(review): the name shadows the builtin ``dir``; it is kept because the
# rest of the script refers to it.
dir = os.path.dirname(os.path.abspath(__file__))
# Parent of the script directory (the functions below recompute this value
# with os.path.dirname(dir) instead of reading p_dir).
p_dir = os.path.dirname(dir)
# Markdown accumulated for the day being exported: weibo_print() appends to
# it, and the export loop writes it to a file and resets it.
txt = ""
# Normalise the crawler's post CSV ("微博id.csv") into _csv/weibo.csv
def csv_to_weibocsv():
    """Rewrite the crawler's post CSV with fixed column names.

    Reads ``微博id.csv`` from the script directory, replaces its header with
    the names listed below, collapses doubled ``<br />`` tags in the two text
    columns and writes the result to ``_csv/weibo.csv`` (utf-8 with BOM).

    The DataFrame index is written on purpose: the rest of the script reads
    this file back and addresses fields by position, and those positions
    count the saved index column.

    Raises:
        FileNotFoundError: if ``微博id.csv`` does not exist.
        ValueError: if the input does not have as many columns as names below.
    """
    column_names = ["id","bid","正文","头条文章url","原始图片url","视频url","位置","日期","工具","点赞数","评论数","转发数","话题","@用户","日期2","是否原创","源用户id","源用户昵称","源微博id","源微博bid","源微博正文","源微博头条文章url","源微博原始图片url","源微博视频url","源微博位置","源微博日期","源微博工具","源微博点赞数","源微博评论数","源微博转发数","源微博话题","源微博@用户","日期3"]

    # os.path.join instead of hard-coded "\\" so the script also runs outside
    # Windows; on Windows the resulting paths are unchanged.
    out_folder = os.path.join(dir, "_csv")
    weibo = pd.read_csv(os.path.join(dir, "微博id.csv"))
    weibo.columns = column_names

    for text_column in ("正文", "源微博正文"):
        # regex=False: the pattern is a literal string, and the default of
        # ``regex`` differs between pandas versions.
        weibo[text_column] = weibo[text_column].str.replace(
            "<br /><br />", "<br />", regex=False
        )

    print("修正weibo.csv ... ")
    os.makedirs(out_folder, exist_ok=True)
    weibo.to_csv(os.path.join(out_folder, "weibo.csv"), encoding='utf-8-sig')
    print("成功!\n")

# Export the comments table to CSV (an external SQLite tool is faster)
def comments_to_csv():
    """Dump every row of the ``comments`` table to ``_csv/comments.csv``.

    The database ``weibodata.db`` is looked up in the parent of the script
    directory. The file is written without the DataFrame index so that it has
    exactly one CSV column per table column; comments_print() assigns a fixed
    list of column names to this file and would fail on an extra index column.
    The header row holds the positional labels 0..n-1.
    """
    db_path = os.path.join(os.path.dirname(dir), "weibodata.db")
    out_folder = os.path.join(dir, "_csv")

    conn = sqlite3.connect(db_path)
    try:
        rows = conn.execute('select * from comments').fetchall()
    finally:
        # Close the connection even when the query fails.
        conn.close()

    print("导出评论 comments.csv ... ")
    os.makedirs(out_folder, exist_ok=True)
    pd.DataFrame(np.array(rows)).to_csv(
        os.path.join(out_folder, "comments.csv"),
        encoding='utf-8-sig',
        index=False,
    )
    print("成功!\n")

# Export the weibo table to CSV (no longer used; csv_to_weibocsv replaces it)
def weibo_to_csv():
    """Dump every row of the ``weibo`` table to ``_csv/weibo.csv``.

    The database ``weibodata.db`` is looked up in the parent of the script
    directory. The DataFrame index is written as the first CSV column and the
    header row holds positional labels, exactly as before.
    """
    db_path = os.path.join(os.path.dirname(dir), "weibodata.db")
    out_folder = os.path.join(dir, "_csv")

    conn = sqlite3.connect(db_path)
    try:
        rows = conn.execute('select * from weibo').fetchall()
    finally:
        # Close the connection even when the query fails.
        conn.close()

    os.makedirs(out_folder, exist_ok=True)
    pd.DataFrame(np.array(rows)).to_csv(
        os.path.join(out_folder, "weibo.csv"), encoding='utf-8-sig'
    )

# Render the comments of one post: the 20 with the most likes
def comments_print(_id_):
    """Build the Markdown comment block for post ``_id_``.

    Reads ``_csv/comments.csv``, keeps the rows whose ``weibo_id`` equals
    ``_id_``, sorts them by like count (descending) and renders at most the
    first 20. The result is stored in the global ``comments_txt``; nothing is
    returned.

    Args:
        _id_: id of the post, compared with the ``weibo_id`` column as read
            by pandas.
    """
    global comments_txt
    comments_txt = ""

    comments = pd.read_csv(os.path.join(dir, "_csv", "comments.csv"))
    comments.columns = ["id","bid","weibo_id","root_id","user_id","日期","昵称","评论url","评论","图片url","点赞数"]
    comments = comments.drop_duplicates()

    top = comments[comments["weibo_id"] == _id_]
    top = top.sort_values("点赞数", ascending=False)[:20]

    exported = top["id"].notnull().sum()
    if exported < 20:
        print(exported,"个评论导出,数据不完整!")

    rendered = []
    for nickname, posted_at, text in zip(top["昵称"], top["日期"], top["评论"]):
        # The original test was ``str(date) is not None``, which is always
        # true; a real missing-value check is what it was meant to be.
        if pd.notna(posted_at):
            rendered.append(
                "> &#128172; <font size=\"2\">" + str(nickname)
                + ",发布于:" + str(posted_at)
                + "</font>\n<font size=\"2\"> " + str(text) + "</font>\n"
            )
    comments_txt = "".join(rendered)

# Render one post as Markdown
def weibo_print(_id_):
    """Append the Markdown rendering of post ``_id_`` to the global ``txt``.

    Reads ``_csv/weibo.csv`` and renders every row whose ``id`` equals
    ``_id_``, newest ``日期`` first. Each entry consists of a heading linking
    to the post, the body followed by ``pic_path_txt``, the quoted source
    post plus ``forwad_pic`` when the row is a repost, the
    repost/comment/like counters and a collapsible note containing
    ``comments_txt``.

    The globals ``pic_path_txt``, ``forwad_pic`` and ``comments_txt`` must be
    set by the caller before this function runs.

    Fields are read by position from ``itertuples()``; position 0 is the
    pandas index and position 1 the index column saved in the CSV, so the
    named columns start at position 2 (``id``).

    Args:
        _id_: id of the post to render.
    """
    global txt
    global comments_txt
    global pic_path_txt
    global forwad_pic

    weibo_name = "微博名"
    detail_url = "https://m.weibo.cn/detail/"

    posts = pd.read_csv(os.path.join(dir, "_csv", "weibo.csv"))
    # The first CSV column is the saved index, which is unique per row, so a
    # plain drop_duplicates() can never drop anything. Compare the real data
    # columns only.
    posts = posts.drop_duplicates(subset=list(posts.columns[1:]))
    posts = posts[posts["id"] == _id_]
    posts = posts.sort_values("日期", ascending=False)

    for post in posts.itertuples():
        # Heading: Markdown link to the post, then date (日期2) and client (工具).
        # str() around the client keeps a missing value from raising TypeError.
        txt = txt + "### [" + weibo_name + "](" + detail_url + str(post[2]) + ")\n<font size=\"2\">" + str(post[16]) + ",来自:" + str(post[10]) + "</font>\n"
        txt = txt + '> ' + str(post[4]) + pic_path_txt

        # 是否原创 == False marks a repost: quote the source post beneath.
        if str(post[17]) == "False":
            forwad = "\n[//@" + str(post[19]) + ":](" + detail_url + "{:.0f}".format(post[20]) + ")\n>> " + str(post[22]) + "\n"
        else:
            forwad = ""

        txt = txt + forwad + forwad_pic + "\n<font size=\"2\">转发:" + str(post[13]) + ",评论:" + str(post[12]) + ",点赞:" + str(post[11]) + "</font>\n"
        txt = txt + ">[!NOTE]- 评论\n" + comments_txt + "\n---- \n"

# Collect the image / video embeds of one post
def get_pic(_id_):
    """Build the embed links for the media of post ``_id_``.

    Reads the ``bins`` table of ``weibodata.db`` (parent of the script
    directory) and, for each row belonging to ``_id_``, emits
    ``![[img...]]`` for ``.jpg``/``.gif`` files and ``![[video...]]`` for
    everything else. The link target is the stored path from its first
    ``img`` / ``video`` component onward, with forward slashes. The links,
    followed by a newline, are stored in the global ``pic_path_txt``; nothing
    is returned.

    Rows are read by position, in the order the original code labelled them:
    id, ext, data, weibo_id, comment, path, url.

    Args:
        _id_: post id; compared as a string with the stored ``weibo_id``.
    """
    global pic_path_txt
    pic_path_txt = ""

    conn = sqlite3.connect(os.path.join(os.path.dirname(dir), "weibodata.db"))
    try:
        # NOTE(review): this still loads every row including the binary data.
        # Selecting only the needed columns with a WHERE clause would be far
        # cheaper, but the table's real column names are not visible here.
        rows = conn.execute('select * from bins').fetchall()
    finally:
        # Close the connection even when the query fails.
        conn.close()

    wanted = str(_id_)
    links = []
    # Plain iteration over the fetched rows: an empty table simply yields no
    # links instead of failing while labelling DataFrame columns.
    for row in rows:
        ext, owner, path = row[1], row[3], row[5]
        if str(owner) != wanted:
            continue
        marker = "img" if ext in (".jpg", ".gif") else "video"
        # partition() splits at the first marker only, so a later "img" or
        # "video" inside the file name no longer truncates the link.
        _, found, tail = str(path).replace("\\", "/").partition(marker)
        if not found:
            # Path has no img/video component: no link can be built for it.
            continue
        links.append("![[" + marker + tail + "]]")

    pic_path_txt = "".join(links) + "\n"

# First and last day to export (ISO "YYYY-MM-DD" strings, which compare
# chronologically as plain strings).
start_date = '2022-01-01'
end_date = '2023-01-01'
# Uncomment to dump the comments table with Python instead of an external
# SQLite tool (slow on large databases).
#comments_to_csv()
csv_to_weibocsv()

# Load the normalised post table once instead of re-reading it for every day.
# Each id is kept once: weibo_print() itself renders every row stored for an
# id, so iterating duplicate rows here would repeat the same entries.
all_posts = pd.read_csv(os.path.join(dir, "_csv", "weibo.csv"))
all_posts = all_posts.drop_duplicates(subset="id")

# Write one Markdown file per day
while start_date <= end_date:
    current_day = datetime.datetime.strptime(start_date, '%Y-%m-%d')
    next_date = (current_day + relativedelta.relativedelta(days=1)).strftime('%Y-%m-%d')
    print(start_date)

    # Half-open window [start_date, next_date): with "<=" a post whose 日期
    # is exactly the next day's date string would land in two daily files.
    day_posts = all_posts.query('日期 >= @start_date & 日期 < @next_date')
    for post in day_posts.itertuples():
        post_id = post[2]
        print("更新... ", post_id)
        # Media of the source post (源微博id) first; weibo_print() reads them
        # from forwad_pic. get_pic() always leaves its result in pic_path_txt.
        get_pic("{:.0f}".format(post[20]))
        forwad_pic = pic_path_txt
        get_pic(str(post_id))
        comments_print(post_id)
        weibo_print(post_id)
        forwad_pic = ""

    if txt != "":
        # Output layout: <script dir>/<year>/<month>/<YYYY-MM-DD>.md
        year, month = start_date.split("-")[:2]
        out_folder = os.path.join(dir, year, month)
        os.makedirs(out_folder, exist_ok=True)
        with open(os.path.join(out_folder, start_date + ".md"), 'w', encoding='utf-8') as out_file:
            out_file.write(txt)
        txt = ""

    start_date = next_date
        

文件布局是这样子的







预览效果是这样子的




免费评分

参与人数 6吾爱币 +12 热心值 +6 收起 理由
molikyle + 1 + 1 我很赞同!
苏紫方璇 + 7 + 1 欢迎分析讨论交流,吾爱破解论坛有你更精彩!
蓝董浩 + 1 + 1 我很赞同!
徐太尉府 + 1 + 1 我很赞同!
katapy + 1 + 1 用心讨论,共获提升!
hh442 + 1 + 1 热心回复!

查看全部评分

发帖前要善用论坛搜索功能,那里可能会有你要找的答案或者已经有人发布过相同内容了,请勿重复发帖。

您需要登录后才可以回帖 登录 | 注册[Register]

本版积分规则

返回列表

RSS订阅|小黑屋|处罚记录|联系我们|吾爱破解 - LCG - LSG ( 京ICP备16042023号 | 京公网安备 11010502030087号 )

GMT+8, 2024-11-24 20:59

Powered by Discuz!

Copyright © 2001-2020, Tencent Cloud.

快速回复 返回顶部 返回列表