将语雀md文件内的图片下载到本地

zhiyuckt · 发表于 2023-9-28 02:20

本帖最后由本是情~~ 于 2023-9-28 02:23 编辑

因为导出的md文件图片链接都是存放在语雀那边的，还是习惯图片下载到本地安心些

主要功能是获取md文件中的图片链接，在md文件所在目录下新建Image目录，再在其中新建一个以md文件名命名的文件夹存放图片。图片路径中含有中文时可能获取不了图片，所以识别到文件名中的中文后会将其替换为空。
可对单个md文件操作也可批量对同目录所有md文件操作（垃圾代码勿喷）

[Python] 纯文本查看 复制代码

import requests,json,sys,random,time,os,re,openpyxl,urllib.parse,urllib3,glob
urllib3.disable_warnings()

def User_gent():
    """Build a browser-like HTTP header dict with a randomly chosen User-Agent.

    Returns:
        dict: Request headers. ``user-agent`` is drawn at random from a fixed
        pool on every call; all other entries are constant.
    """
    # Every entry must end with a comma: adjacent string literals without one
    # are silently concatenated, which previously merged the last four
    # User-Agents into a single invalid value.
    agent = [
        'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/61.0.3163.100 Safari/537.36',
        'Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; Trident/5.0;',
        'Mozilla/5.0 (Macintosh; Intel Mac OS X 10.6; rv,2.0.1) Gecko/20100101 Firefox/4.0.1',
        'Opera/9.80 (Macintosh; Intel Mac OS X 10.6.8; U; en) Presto/2.8.131 Version/11.11',
        'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_7_0) AppleWebKit/535.11 (KHTML, like Gecko) Chrome/17.0.963.56 Safari/535.11',
        'Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1; 360SE)',
        'Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:58.0) Gecko/20100101 Firefox/58.0',
        'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/113.0.0.0 Safari/537.36',
        'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/113.0.0.0 Safari/537.36 Edg/113.0.1774.57',
        'Mozilla/5.0 (iPhone; CPU iPhone OS 13_2_3 like Mac OS X) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/13.0.3 Mobile/15E148 Safari/604.1',
    ]
    fackender = {
        'Cache-Control': "max-age=0",
        'sec-ch-ua-platform': "Windows",
        'Upgrade-Insecure-Requests': "1",
        'user-agent': random.choice(agent),
        'Sec-Fetch-Site': "same-origin",
        'Sec-Fetch-Mode': "navigate",
        'Accept-Encoding': "gzip, deflate",
        'Accept-Language': "zh-CN,zh;q=0.9,en;q=0.8,en-GB;q=0.7,en-US;q=0.6",
        'Connection': "close",
    }
    return fackender

# Variant used for md files whose name contains no Chinese characters.
def Secone(fileName,FilePath,Fpath):
    """Download the cdn.nlark.com images linked in a markdown file and relink them.

    Each matching URL is fetched, stored as a timestamp-named ``.png`` inside
    ``Fpath``, and replaced in the markdown text by a path under ``FilePath``.
    The markdown file is always rewritten at the end, even if nothing changed.

    Args:
        fileName: Path of the markdown file; read as UTF-8 and rewritten in place.
        FilePath: Directory prefix written into the markdown; the new link is
            this prefix, a backslash, and the generated image name.
        Fpath: Directory the images are saved into, given WITHOUT a trailing
            separator (a backslash is inserted before the image name).

    Raises:
        OSError: If the markdown file cannot be read or written.
    """
    # To support another image host, this pattern is normally the only thing
    # that has to change.
    pattern = re.compile(r"[(](https://cdn.nlark.com/.*?)[)]")
    # Running count of successfully downloaded images (for the log output).
    sus_count = 0
    with open(file=fileName, mode="r", encoding='utf-8') as f1:
        lines = f1.readlines()
    for i, line in enumerate(lines):
        # Successes on this line; added to the timestamp so that several
        # images on one line get distinct file names.
        num = 0
        for url in pattern.findall(line):
            if not url:
                continue
            try:
                # The current time (plus offset) and the line index form the name.
                timestamp = int(time.time())
                image_name = time.strftime('%Y%m%d%H%M%S', time.localtime(timestamp + num)) + str(i) + ".png"
                url = url.replace("(", "").replace(")", "")
                # NOTE: verify=False disables TLS certificate verification.
                img = requests.get(url, headers=User_gent(), verify=False, timeout=2)
                # Skip this URL (leaving the link untouched) on any non-200 reply.
                if img.status_code != 200:
                    print('第', i, '行：', '!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!', img.status_code)
                    continue
                print(Fpath)
                # 'wb' inside a with-block: the handle is always closed, and a
                # stale file of the same name is replaced instead of appended to.
                with open(Fpath + '\\' + image_name, 'wb') as target:
                    target.write(img.content)
                lines[i] = lines[i].replace(url, FilePath + '\\' + image_name)
                print('第', sus_count, '个第', i, '行：', url + '成功转为' + FilePath + '\\' + image_name)
                sus_count += 1
                num += 1
            except (requests.RequestException, OSError) as err:
                # Only network and file-system failures are expected here;
                # anything else (including Ctrl+C) is allowed to propagate.
                print("图片路径错误", url, err)
    # Save the (possibly updated) markdown back to the same file.
    with open(file=fileName, mode='w', encoding='utf-8') as f2:
        print(fileName)
        f2.writelines(lines)

# Variant used for md files whose name contains Chinese characters.
def ChineseSecone_test(fileName,FilePath,Fpath):
    """Download the cdn.nlark.com images linked in a markdown file and relink them.

    Each matching URL is fetched, stored as a timestamp-named ``.png`` inside
    ``Fpath``, and replaced in the markdown text by a path under ``FilePath``.
    The markdown file is always rewritten at the end, even if nothing changed.

    Args:
        fileName: Path of the markdown file; read as UTF-8 and rewritten in place.
        FilePath: Directory prefix written into the markdown; the new link is
            this prefix, a backslash, and the generated image name.
        Fpath: Directory the images are saved into. It must already END with a
            path separator, because the image name is appended to it directly.

    Raises:
        OSError: If the markdown file cannot be read or written.
    """
    # To support another image host, this pattern is normally the only thing
    # that has to change.
    pattern = re.compile(r"[(](https://cdn.nlark.com/.*?)[)]")
    # Running count of successfully downloaded images (for the log output).
    sus_count = 0
    with open(file=fileName, mode="r", encoding='utf-8') as f1:
        lines = f1.readlines()
    for i, line in enumerate(lines):
        # Successes on this line; added to the timestamp so that several
        # images on one line get distinct file names.
        num = 0
        for url in pattern.findall(line):
            if not url:
                continue
            try:
                # The current time (plus offset) and the line index form the name.
                timestamp = int(time.time())
                image_name = time.strftime('%Y%m%d%H%M%S', time.localtime(timestamp + num)) + str(i) + ".png"
                url = url.replace("(", "").replace(")", "")
                # NOTE: verify=False disables TLS certificate verification.
                img = requests.get(url, headers=User_gent(), verify=False, timeout=2)
                # Skip this URL (leaving the link untouched) on any non-200 reply.
                if img.status_code != 200:
                    print('第', i, '行：', '!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!', img.status_code)
                    continue
                # 'wb' inside a with-block: the handle is always closed, and a
                # stale file of the same name is replaced instead of appended to.
                with open(Fpath + image_name, 'wb') as target:
                    target.write(img.content)
                lines[i] = lines[i].replace(url, FilePath + '\\' + image_name)
                print('第', sus_count, '个第', i, '行：', url + '成功转为' + FilePath + '\\' + image_name)
                sus_count += 1
                num += 1
            except (requests.RequestException, OSError) as err:
                # Only network and file-system failures are expected here;
                # anything else (including Ctrl+C) is allowed to propagate.
                print("图片路径错误", url, err)
    # Save the (possibly updated) markdown back to the same file.
    with open(file=fileName, mode='w', encoding='utf-8') as f2:
        print(fileName)
        f2.writelines(lines)

# Detect whether a string contains Chinese characters.
def Chinese_char(chars):
    """Return True if *chars* has at least one character in U+4E00..U+9FFF."""
    cjk = re.compile(r'[\u4e00-\u9fff]')
    found = cjk.search(chars)
    return found is not None

def _md_stem(fileName):
    """Return the md file's name without its directory part or extension."""
    # Split on both separators so typed Windows paths and glob results parse
    # alike, and strip only the final extension so other dots in the path
    # (e.g. ".\a.md" or "v1.2.md") do not truncate the name.
    base = re.split(r'[\\/]', fileName)[-1]
    return os.path.splitext(base)[0]


def _localize_md(fileName, pickure, ImgDirectory, tag='', verbose=False):
    """Create the image folder for one md file and download its images into it.

    Args:
        fileName: Path of the markdown file to process.
        pickure: Absolute path of the shared image root directory.
        ImgDirectory: Name of the image root, used as the relative link prefix.
        tag: Text inserted into the folder name of Chinese-named files; batch
            mode passes the file's running number here.
        verbose: When True, print the chosen paths for non-Chinese file names.
    """
    stem = _md_stem(fileName)
    # NOTE(review): this path is relative to the current working directory,
    # not to `pickure`; when it exists the file is skipped without any output.
    # Kept exactly as in the original script - confirm the skip is intended.
    if os.path.exists(ImgDirectory + '\\' + stem):
        return
    if Chinese_char(stem):
        # Drop Chinese characters and full-width parentheses, then spaces,
        # so the folder name is free of them.
        Fstring = re.sub(r'[\u4e00-\u9fff（）（）（）]+', '', stem)
        Fstring = re.sub(r' ', '', Fstring)
        folder = Fstring + tag + '-' + 'Chinese'
        # Relative prefix that ends up inside the markdown links.
        FilePath = ImgDirectory + '\\' + folder
        # ChineseSecone_test appends the image name directly, so this path
        # must carry the trailing separator.
        Fpath = pickure + '\\' + folder + '\\'
        os.makedirs(Fpath, exist_ok=True)
        ChineseSecone_test(fileName, FilePath, Fpath)
    else:
        Fpath = pickure + '\\' + stem          # e.g. D:\test\Image\111
        FilePath = ImgDirectory + '\\' + stem  # e.g. Image\111
        if verbose:
            print('Fpath',Fpath)
            print('FilePath',FilePath)
        os.makedirs(Fpath, exist_ok=True)
        Secone(fileName, FilePath, Fpath)


if __name__=='__main__':
    # Interactive entry point: "1" processes one md file, "2" processes every
    # md file directly inside a directory.
    ImgDirectory = 'Image'              # name of the directory that holds all images
    query = input("单文件请按1，全目录按2：")
    if query == "1":
        fileName = input("输入md文件：")
        try:
            # The image root lives next to the md file.
            parent_path = os.path.abspath(os.path.join(fileName, '..'))
            pickure = parent_path + '\\' + ImgDirectory
            os.makedirs(pickure, exist_ok=True)
            _localize_md(fileName, pickure, ImgDirectory, verbose=True)
        except (OSError, UnicodeError) as err:
            print("单个md文件获取失败", err)
    elif query == "2":
        # The backslash is escaped: a bare "\t" here would print a TAB.
        Path = input("输入md文件目录所在路径（D:\\test)：")
        md_files = glob.glob(os.path.join(Path, '*.md'))
        if not md_files:
            # Nothing to do; previously this crashed on md_files[0].
            print("未找到md文件")
        else:
            # The image root lives next to the md files.
            parent_path = os.path.abspath(os.path.join(md_files[0], '..'))
            pickure = parent_path + '\\' + ImgDirectory
            os.makedirs(pickure, exist_ok=True)
            # The running number keeps folder names apart when different
            # Chinese file names reduce to the same stripped name.
            for num, fileName in enumerate(md_files, start=1):
                try:
                    _localize_md(fileName, pickure, ImgDirectory, tag=str(num))
                except (OSError, UnicodeError) as err:
                    # Report the failing file and carry on with the rest.
                    print("单个md文件获取失败", fileName, err)
    else:
        print("未输入")

效果图：

Kls673M · 发表于 2023-9-28 11:09

感谢分享，我觉得应该把这个md都导出来就更完美了

zhiyuckt · 发表于 2023-9-28 12:23

Kls673M 发表于 2023-9-28 11:09
感谢分享，我觉得应该把这个md都导出来就更完美了

你是说想批量导出知识库md文件么？github上面有这个项目的

zx52 · 发表于 2023-9-28 23:15

感谢分享

JokerDa · 发表于 2023-10-7 08:54

感谢分享，学习了！！

帐号		自动登录	找回密码
密码			注册[Register]

[Python 原创] 将语雀md文件内的图片下载到本地

免费评分