Source code for jfscripts.extract_pdftext

#! /usr/bin/env python3

import argparse
import os
import re
import tempfile
import textwrap

from jfscripts import __version__
from jfscripts.utils import FilePath, Run, check_dependencies

# Shared command runner; main() configures it via run.setup().
run = Run()

# Maximum width used when wrapping extracted text; also the length of the
# page separator rule written by main().
line_length = 72

# Scratch directory that receives the per-page pdftotext output. It is
# created at import time and is not removed anywhere in the visible source.
tmp_dir = tempfile.mkdtemp()
# NOTE(review): this handle is not referenced anywhere else in the visible
# source, yet opening it in "w" mode creates or truncates ``export.txt`` in
# the current working directory at import time. Confirm whether anything
# outside this module relies on it before removing it.
output_file = open("export.txt", "w")
# Passed to check_dependencies() in main(); the entries look like
# (executable, providing package) pairs -- TODO confirm against
# jfscripts.utils.check_dependencies.
dependencies = (
    ("pdftotext", "poppler"),
    ("pdfinfo", "poppler"),
)


class Txt:
    """Line-oriented writer that also echoes every line to stdout.

    The instance can be used as a context manager so that the underlying
    file is closed deterministically; calling :meth:`close` by hand works
    as well.
    """

    def __init__(self, path):
        """Open ``path`` for writing, truncating any existing file.

        :param path: Location of the text file. It is converted with
          ``str()`` before being opened, so path-like objects are accepted.
        """
        self.path = path
        # Explicit UTF-8 so non-ASCII text is written identically regardless
        # of the platform's locale encoding.
        self.file = open(str(path), "w", encoding="utf-8")

    def __enter__(self):
        """Return the writer itself for use in a ``with`` statement."""
        return self

    def __exit__(self, exc_type, exc_value, traceback):
        """Close the file when the ``with`` block ends; never suppresses errors."""
        self.close()

    def add_line(self, line):
        """Write ``line`` plus a newline to the file and print it to stdout.

        :param str line: The text to append, without a trailing newline.
        """
        self.file.write(line + "\n")
        print(line)

    def close(self):
        """Flush and close the underlying file. Safe to call more than once."""
        self.file.close()
def get_page_count(pdf):
    """Return the number of pages that ``pdfinfo`` reports for ``pdf``.

    :param pdf: Path of the PDF file; converted with ``str()`` before it is
      handed to ``pdfinfo``.

    :return: The page count, or ``None`` when the ``pdfinfo`` output
      contains no ``Pages:`` line with a numeric value.
    :rtype: int or None
    """
    pdfinfo_stdout = run.check_output(["pdfinfo", str(pdf)])
    # Only the numeric "Pages:" line matters, so undecodable bytes in other
    # metadata fields must not abort the lookup.
    info = pdfinfo_stdout.decode("utf-8", errors="replace")
    # Anchor at the start of a line: an earlier metadata field (for example
    # a title containing "Pages:") must not be mistaken for the page count.
    match = re.search(r"^Pages:[ \t]*(\d+)", info, re.MULTILINE)
    if match:
        return int(match.group(1))
    return None
def get_text_per_page(pdf, page, txt_file):
    """Extract one page of ``pdf`` and append its cleaned text to ``txt_file``.

    The page is exported with ``pdftotext`` into the module's temporary
    directory. Lines of 20 characters or fewer are discarded, the rest is
    joined, reduced to ASCII letters, German umlauts, ``ß``, digits and
    single spaces, and re-wrapped to ``line_length`` columns.

    :param pdf: Path of the PDF file; converted with ``str()``.
    :param page: Page number to extract; used as both the first (``-f``)
      and the last (``-l``) page passed to ``pdftotext``.
    :param txt_file: Object providing ``add_line(line)``; it receives every
      wrapped line.
    """
    page = str(page)
    tmp_txt_path = os.path.join(tmp_dir, page + ".txt")
    run.check_output(
        ["pdftotext", "-f", page, "-l", page, str(pdf), tmp_txt_path]
    )

    # pdftotext writes UTF-8 unless told otherwise, so read it back as UTF-8
    # instead of whatever the locale default happens to be.
    with open(tmp_txt_path, "r", encoding="utf-8") as tmp_txt_file:
        lines = tmp_txt_file.read().splitlines()
    # The per-page file is only an intermediate; do not let it accumulate.
    os.remove(tmp_txt_path)

    # Short lines are dropped; only lines longer than 20 characters are kept.
    full_lines = [line for line in lines if len(line) > 20]

    text_of_page = " ".join(full_lines)
    # Every character outside the whitelist is removed. This includes both
    # straight and typographic apostrophes, so no separate apostrophe
    # normalisation is needed beforehand.
    text_of_page = re.sub(r"[^a-zäöüA-ZÄÖÜß0-9 ]", "", text_of_page)
    text_of_page = re.sub(r"\s+", " ", text_of_page)

    for line in textwrap.wrap(text_of_page, line_length):
        txt_file.add_line(line)
def get_parser():
    """Build the command line interface of the script.

    The parser takes one positional ``file`` argument, the boolean switches
    ``-c/--colorize`` and ``-v/--verbose``, and ``-V/--version``.

    :return: The configured parser.
    :rtype: argparse.ArgumentParser
    """
    version_banner = "%(prog)s {version}".format(version=__version__)

    # Both switches are plain ``store_true`` flags, so register them from a
    # table; the order is kept so the generated help text is unchanged.
    boolean_flags = (
        ("-c", "--colorize", "Colorize the terminal output."),
        ("-v", "--verbose", "Make the command line output more verbose."),
    )

    cli = argparse.ArgumentParser()
    cli.add_argument("file", help="A PDF file containing text")
    for short_name, long_name, description in boolean_flags:
        cli.add_argument(
            short_name, long_name, action="store_true", help=description
        )
    cli.add_argument("-V", "--version", action="version", version=version_banner)
    return cli
def main():
    """Run the command line interface.

    Parses the arguments, checks the external dependencies and writes the
    cleaned text of every page of the given PDF into a ``.txt`` file derived
    from the PDF path. Each page is introduced by a separator rule and a
    ``## Seite <n>`` heading.

    :raises SystemExit: If the page count of the PDF cannot be determined.
    """
    args = get_parser().parse_args()
    run.setup(verbose=args.verbose, colorize=args.colorize)
    check_dependencies(*dependencies)

    pdf = FilePath(args.file, absolute=True)

    # Determine the page count before the output file is opened, so a failed
    # lookup neither truncates an existing text file nor leaves an empty one.
    page_count = get_page_count(pdf)
    if page_count is None:
        raise SystemExit("Could not determine the page count of " + str(pdf))

    txt_path = pdf.new(extension="txt")
    txt_file = Txt(txt_path)
    try:
        txt_file.add_line("# " + pdf.basename)
        for i in range(1, page_count + 1):
            txt_file.add_line("")
            txt_file.add_line("-" * line_length)
            txt_file.add_line("")
            txt_file.add_line("## Seite " + str(i))
            txt_file.add_line("")
            get_text_per_page(pdf, i, txt_file)
    finally:
        # Close the handle even if a page fails, so everything written so
        # far is flushed to disk.
        txt_file.file.close()


if __name__ == "__main__":
    main()