Install pdftotext
In [ ]:
!wajig install build-essential libpoppler-cpp-dev pkg-config python3-dev
In [ ]:
!pip3 install pdftotext
Auto Rename PDF Files
In [ ]:
from typing import Union
from pathlib import Path
import shutil
import re
import pdftotext
def _extract_text(path: Path) -> str:
    """Return the text of the PDF at ``path`` as a single string.

    The file is opened in binary mode and handed to ``pdftotext.PDF``; the
    strings produced by iterating that object (presumably one per page --
    see the pdftotext documentation) are glued together with a separator
    made of 80 ``=`` characters.

    Args:
        path: Location of the PDF file to read.

    Returns:
        The extracted text, with the separator between consecutive chunks.
    """
    page_separator = "=" * 80
    with path.open("rb") as stream:
        pages = pdftotext.PDF(stream)
        return page_separator.join(pages)
def _parse_date(text, keywords) -> str:
date = re.search(keywords + r"\s+(\d{2}-\d{2}-\d{4})", text).group(1)
date = date.replace("-", "")
return date[4:] + date[:4]
def rename_pdf(path: Union[str, Path]) -> None:
    """Rename a PDF to ``<award number>_<award date>_<release date>.pdf``.

    The three fields are read from the PDF's extracted text: the digits that
    follow "Award Number", and the dates that follow "Award Date" and
    "Release Date" as formatted by ``_parse_date``. The file stays in its
    current directory; only its name changes.

    If a different file already carries the target name, the PDF is left
    untouched and a message is printed instead of overwriting that file.

    Args:
        path: Location of the PDF, as a string or a ``Path``.

    Raises:
        ValueError: If no award number is found in the extracted text.
            Errors raised by ``_parse_date`` for a missing date propagate
            unchanged.
    """
    if isinstance(path, str):
        path = Path(path)
    text = _extract_text(path)

    award_match = re.search(r"Award Number\s+(\d+)", text)
    if award_match is None:
        raise ValueError(f"{path}: no award number found")
    award_num = award_match.group(1)
    award_date = _parse_date(text, "Award Date")
    release_date = _parse_date(text, "Release Date")

    target = path.with_name(f"{award_num}_{award_date}_{release_date}.pdf")
    if target == path:
        # The file already has the desired name; nothing to do.
        return
    # shutil.move relies on os.rename, which on POSIX silently replaces an
    # existing file, so waiting for FileExistsError would lose data there.
    if target.exists():
        print(f"{path} has duplicate file!")
        return
    try:
        shutil.move(path, target)
    except FileExistsError:
        # Still reachable on Windows, or when the target appears between the
        # existence check above and the move.
        print(f"{path} has duplicate file!")
In [ ]:
# Rename every PDF in the current working directory.
# The matches are collected (and sorted, for a deterministic order) before any
# renaming starts, so files renamed inside the loop cannot influence a
# directory listing that is still being iterated.
for path in sorted(Path().glob("*.pdf")):
    rename_pdf(path)
References
In [ ]: