Ben Chuanlong Du's Blog

And let it direct your passion with reason.

Auto Rename eTrade Employee Stock Plan Release Confirmations Using pdftotext

Install pdftotext

In [ ]:
!wajig install build-essential libpoppler-cpp-dev pkg-config python3-dev
In [ ]:
!pip3 install pdftotext

Auto Rename PDF Files

In [ ]:
from typing import Union
from pathlib import Path
import shutil
import re
import pdftotext


def _extract_text(path: Path) -> str:
    """Read the PDF at ``path`` and return the text of all its pages.

    The per-page texts yielded by ``pdftotext.PDF`` are joined into a single
    string, with a run of 80 ``=`` characters between consecutive pages.

    :param path: Location of the PDF file to read.
    :return: The text of all pages joined by the separator.
    """
    page_separator = "=" * 80
    with path.open("rb") as stream:
        pages = pdftotext.PDF(stream)
    return page_separator.join(pages)


def _parse_date(text, keywords) -> str:
    date = re.search(keywords + r"\s+(\d{2}-\d{2}-\d{4})", text).group(1)
    date = date.replace("-", "")
    return date[4:] + date[:4]


def rename_pdf(path: Union[str, Path]) -> None:
    """Rename a release confirmation PDF based on its contents.

    The award number, award date and release date are extracted from the
    text of the PDF and the file is renamed, within the same directory, to
    ``<award number>_<award date>_<release date>.pdf``.

    An existing file with the target name is never overwritten: a message is
    printed and the source file is left untouched.

    :param path: Path to the PDF file to rename.
    :raises ValueError: If no award number can be found in the PDF text.
        Errors raised while extracting the text or parsing the dates
        propagate unchanged.
    """
    path = Path(path)
    text = _extract_text(path)
    match = re.search(r"Award Number\s+(\d+)", text)
    if match is None:
        raise ValueError(f"No award number found in {path}.")
    award_num = match.group(1)
    award_date = _parse_date(text, "Award Date")
    release_date = _parse_date(text, "Release Date")
    target = path.with_name(f"{award_num}_{award_date}_{release_date}.pdf")
    if target == path:
        # The file already carries its canonical name; nothing to do.
        return
    # shutil.move does not raise FileExistsError when the destination is an
    # existing file (it replaces it), so duplicates must be detected here.
    if target.exists():
        print(f"{path} has duplicate file!")
        return
    shutil.move(path, target)
In [ ]:
# Rename every PDF found directly in the current working directory
# (the glob pattern is not recursive).
pdf_paths = Path(".").glob("*.pdf")
for path in pdf_paths:
    rename_pdf(path)
In [ ]:

Comments