Using Python to Modify Image Paths in Markdown Files

Introduction

hackmd 是一个好地方,图片拖进去,会自动上传到imgur这个图床网站,不用再去考虑图片的事了。

但有一个问题:imgur的图片需要挂代理才能加载出来。放在自己的网站上,如果有没挂代理的人访问,就看不到图片了。

Typora是个写markdown的好软件。

比较喜欢Typora上的主题,想把hackmd上面写的笔记都下载下来,然后放到Typora里,让Typora转成html再上传到自己的网站上。

hackmd上下载下来的md文件主要需要修改的几个地方:

  1. 找到每个md文件中的图片链接,把上传到imgur上的所有图片下载到本地。
  2. 修改md文件中的图片路径,要改为本地的图片的路径。
  3. 再做一些其他的修改。

但是,md文件实在太多了,一个一个手改效率实在太低了,何不写一个Python脚本批量改?

Plan

  • 查找图片链接:使用re模块,正好复习下暑假某一天学的正则表达式。
  • 下载图片:使用requests模块,去get图片原始数据,并辅以multithreading多线程。
  • 修改md文件中的图片路径:使用字符串的replace方法。

Coding

列出当前路径下的所有文件

1
2
3
4
5
import os

# Directory holding the markdown notes exported from hackmd (placeholder value).
FILE_PATH = 'xxx'

# Only the first os.walk() entry (the files directly inside FILE_PATH) is
# needed, so take it with next() instead of materialising the whole tree.
# A missing directory yields an empty list instead of a bare IndexError.
# To process a sub-directory instead, point FILE_PATH at it.
_root, _dirs, _names = next(os.walk(FILE_PATH), (FILE_PATH, [], []))
# Join with the directory so the files can be opened from any working
# directory, not only when the script is started inside FILE_PATH.
files = [os.path.join(_root, _name) for _name in _names]

获取所有md文件中的图片链接

 1
 2
 3
 4
 5
 6
 7
 8
 9
10
11
12
13
import re

# Direct links to images hosted on imgur (where hackmd uploads dropped images).
_IMG_URL_RE = re.compile(r'https?://(?:i\.)?imgur\.com/[a-zA-Z0-9]+\.(?:png|jpe?g|gif)')

urls = []
for file in files:
    # file[-2] is a single character and can never equal 'md'; test the suffix.
    if file.endswith('.md'):
        # encoding is required because the notes contain Chinese characters
        with open(file, 'r', encoding='utf-8') as f:
            for line in f:
                # finditer: one line may embed several images (.png, .jpg, ...)
                urls.extend(match.group(0) for match in _IMG_URL_RE.finditer(line))

# Drop repeated links (keeping first-seen order) so the same image is not
# downloaded twice.
urls = list(dict.fromkeys(urls))

# print(urls)

使用requests模块下载指定url的图片

 1
 2
 3
 4
 5
 6
 7
 8
 9
10
11
12
13
14
import shutil
import requests

# Placeholder for the image directory; replace with the real path.
IMG_PATH = 'xxxx'

# The images are downloaded from imgur, so requests must go through a proxy.
# Both schemes use the same local proxy; replace "yourport" with the real port.
proxies = dict.fromkeys(('http', 'https'), 'http://localhost:yourport')

def downloadImgFromUrl(url, timeout=30):
    """Download the image at *url* into the IMG_PATH directory.

    The local file is named after the last path segment of the URL.

    Args:
        url: Direct link to the image.
        timeout: Seconds to wait for the server before giving up.

    Raises:
        requests.RequestException: on connection problems, timeouts or an
            HTTP error status.
    """
    img_name = url.split('/')[-1]  # name the file after the URL's last segment
    # stream=True leaves the body unread so it can be written chunk by chunk;
    # the with-block releases the connection even when the download fails.
    with requests.get(url, proxies=proxies, stream=True, timeout=timeout) as response:
        # Fail before creating the file so an error page is never saved as an image.
        response.raise_for_status()
        # The "f" prefix must sit outside the quotes for this to be an f-string.
        with open(f'{IMG_PATH}/{img_name}', 'wb') as out_file:
            for chunk in response.iter_content(chunk_size=8192):
                out_file.write(chunk)
    print(f'{img_name} was downloaded...')

多线程下载所有图片

 1
 2
 3
 4
 5
 6
 7
 8
 9
10
import time
import concurrent.futures

t1 = time.perf_counter()

with concurrent.futures.ThreadPoolExecutor() as executor:
    # executor.map() only re-raises a worker's exception when its results are
    # consumed, so discarding them hid every failed download. Submit each URL
    # and report failures explicitly; one bad link does not stop the rest.
    pending = {executor.submit(downloadImgFromUrl, url): url for url in urls}
    for future in concurrent.futures.as_completed(pending):
        error = future.exception()
        if error is not None:
            print(f'Failed to download {pending[future]}: {error}')

t2 = time.perf_counter()
print(f'Finished in {t2-t1} seconds')

修改md文件中图片路径

没找到什么好的in place修改文件内容的方法,只能下面这种简单粗暴的方法了。

1
2
3
4
5
6
7
8
for file in files:
    if file.endswith('.md'):
        with open(file, 'r', encoding='utf-8') as fin:
            content = fin.read()
        # Point imgur image links at the local ./img folder, keeping the file
        # name. Only direct image links are rewritten, so other imgur URLs in
        # the text stay untouched.
        content = re.sub(
            r'https?://(?:i\.)?imgur\.com/([a-zA-Z0-9]+\.(?:png|jpe?g|gif))',
            r'./img/\1',
            content,
        )
        content = re.sub(r'##### tags: `[a-z]*`\n', '', content)  # also drop the tags line
        # No in-place edit is available: write the whole file back.
        with open(file, 'w', encoding='utf-8') as fout:
            fout.write(content)

Finally

 1
 2
 3
 4
 5
 6
 7
 8
 9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
import os
import re
import time
import shutil
import requests
import concurrent.futures

# Placeholder paths: FILE_PATH is the directory scanned for files,
# IMG_PATH is the directory used for the images.
FILE_PATH, IMG_PATH = 'xxx', 'xxxx'

# Proxy settings handed to requests; both schemes share one local proxy.
# Replace "yourport" with the real port number.
proxies = dict.fromkeys(('http', 'https'), 'http://localhost:yourport')


# Download images.
def downloadImgFromUrl(url, timeout=30):
    """Download the image at *url* into the IMG_PATH directory.

    The local file is named after the last path segment of the URL.

    Args:
        url: Direct link to the image.
        timeout: Seconds to wait for the server before giving up.

    Raises:
        requests.RequestException: on connection problems, timeouts or an
            HTTP error status.
    """
    img_name = url.split('/')[-1]
    # os.path.join instead of a hard-coded backslash: "\{" is an invalid
    # escape sequence and the backslash is only a separator on Windows.
    target = os.path.join(IMG_PATH, img_name)
    # stream=True leaves the body unread so it can be written chunk by chunk;
    # the with-block releases the connection even when the download fails.
    with requests.get(url, proxies=proxies, stream=True, timeout=timeout) as response:
        # Fail before creating the file so an error page is never saved as an image.
        response.raise_for_status()
        with open(target, 'wb') as out_file:
            for chunk in response.iter_content(chunk_size=8192):
                out_file.write(chunk)
    print(f'{img_name} was downloaded...')

def findAllImgUrls(files):
    """Collect the imgur image links found in the given markdown files.

    Args:
        files: Paths of candidate files; only names ending in ".md" are read.

    Returns:
        A list of distinct image URLs in first-seen order.
    """
    # Direct links to images hosted on imgur (where hackmd uploads dropped
    # images). The dots are escaped so they only match a literal ".".
    img_url_re = re.compile(r'https?://(?:i\.)?imgur\.com/[a-zA-Z0-9]+\.(?:png|jpe?g|gif)')

    urls = []
    for file in files:
        if not file.endswith('.md'):
            continue
        with open(file, 'r', encoding='utf-8') as f:
            for line in f:
                # finditer: one line may embed several images.
                urls.extend(match.group(0) for match in img_url_re.finditer(line))
    # Drop repeated links so the same file is never written by two download
    # threads at once; dict.fromkeys keeps the first-seen order.
    return list(dict.fromkeys(urls))

def downloadAllImgs(urls):
    """Download every URL in *urls* concurrently and print the elapsed time.

    Each URL is handed to downloadImgFromUrl on a thread pool. A failed
    download is reported on stdout and does not stop the remaining ones.

    Args:
        urls: Iterable of image URLs.
    """
    t1 = time.perf_counter()

    with concurrent.futures.ThreadPoolExecutor() as executor:
        # executor.map() only re-raises a worker's exception when its results
        # are consumed, so discarding them hid every failed download.
        pending = {executor.submit(downloadImgFromUrl, url): url for url in urls}
        for future in concurrent.futures.as_completed(pending):
            error = future.exception()
            if error is not None:
                print(f'Failed to download {pending[future]}: {error}')

    t2 = time.perf_counter()
    print(f'Finished in {t2-t1} seconds')


# Modify markdown files.
def ModifyMDFile(files):
    """Rewrite the markdown files so image links point at the local ./img folder.

    For every path ending in ".md", imgur image links are replaced by
    "./img/<file name>" and hackmd "##### tags: `...`" lines are removed.
    The file is overwritten with the result.

    Args:
        files: Paths of candidate files; other extensions are left untouched.
    """
    # Only direct image links are rewritten, so other imgur URLs in the text
    # are left alone; the captured file name is kept.
    img_url_re = re.compile(r'https?://(?:i\.)?imgur\.com/([a-zA-Z0-9]+\.(?:png|jpe?g|gif))')

    for file in files:
        if not file.endswith('.md'):
            continue
        with open(file, 'r', encoding='utf-8') as fin:
            content = fin.read()
        content = img_url_re.sub(r'./img/\1', content)
        content = re.sub(r'##### tags: `[a-z]*`\n', '', content)
        with open(file, 'w', encoding='utf-8') as fout:
            fout.write(content)


if __name__ == '__main__':
    # Only the files directly inside FILE_PATH are processed, so take the
    # first os.walk() entry with next() instead of walking the whole tree.
    # A missing directory gives an empty file list rather than an IndexError.
    root, _, names = next(os.walk(FILE_PATH), (FILE_PATH, [], []))
    # os.walk() yields bare names; join them with the directory so the files
    # can be opened regardless of the current working directory.
    files = [os.path.join(root, name) for name in names]

    # The downloads are written into IMG_PATH, so make sure it exists first.
    os.makedirs(IMG_PATH, exist_ok=True)

    urls = findAllImgUrls(files)
    downloadAllImgs(urls)
    ModifyMDFile(files)

Conclusion

Python还是好用啊。

复习了正则表达式、多线程、requests请求,os模块。

可惜没找到能让Typora批量将md文件转为html文件的方法,只能自己手动一个一个转。。。

更新了一波网站的内容,太舒服了。