#! /usr/bin/env python3
import argparse
import os
import re
import tempfile
import textwrap
from jfscripts import __version__
from jfscripts.utils import FilePath, Run, check_dependencies
run = Run()
line_length = 72
tmp_dir = tempfile.mkdtemp()
output_file = open("export.txt", "w")
dependencies = (
("pdftotext", "poppler"),
("pdfinfo", "poppler"),
)
[docs]
class Txt:
def __init__(self, path):
self.path = path
self.file = open(str(path), "w")
[docs]
def add_line(self, line):
self.file.write(line + "\n")
print(line)
[docs]
def get_page_count(pdf):
pdfinfo_stdout = run.check_output(["pdfinfo", str(pdf)])
match = re.search(r"Pages:\s*(.*)\n", pdfinfo_stdout.decode("utf-8"))
if match:
return int(match.group(1))
[docs]
def get_text_per_page(pdf, page, txt_file):
page = str(page)
tmp_txt_path = os.path.join(tmp_dir, page + ".txt")
run.check_output(["pdftotext", "-f", page, "-l", page, str(pdf), tmp_txt_path])
tmp_txt_file = open(tmp_txt_path, "r")
lines = tmp_txt_file.read().splitlines()
full_lines = []
for line in lines:
if len(line) > 20:
full_lines.append(line)
text_of_page = " ".join(full_lines)
text_of_page = text_of_page.replace("'", "’")
text_of_page = re.sub(r"[^a-zäöüA-ZÄÖÜß0-9 ]", "", text_of_page)
text_of_page = re.sub(r"\s+", " ", text_of_page)
wrapped_lines = textwrap.wrap(text_of_page, line_length)
for line in wrapped_lines:
txt_file.add_line(line)
[docs]
def get_parser():
"""The argument parser for the command line interface.
:return: A ArgumentParser object.
:rtype: argparse.ArgumentParser
"""
parser = argparse.ArgumentParser()
parser.add_argument(
"file",
help="A PDF file containing text",
)
parser.add_argument(
"-c",
"--colorize",
action="store_true",
help="Colorize the terminal output.",
)
parser.add_argument(
"-v",
"--verbose",
action="store_true",
help="Make the command line output more verbose.",
)
parser.add_argument(
"-V",
"--version",
action="version",
version="%(prog)s {version}".format(version=__version__),
)
return parser
[docs]
def main():
args = get_parser().parse_args()
run.setup(verbose=args.verbose, colorize=args.colorize)
check_dependencies(*dependencies)
pdf = FilePath(args.file, absolute=True)
txt_path = pdf.new(extension="txt")
txt_file = Txt(txt_path)
page_count = get_page_count(pdf)
txt_file.add_line("# " + pdf.basename)
for i in range(1, page_count + 1):
txt_file.add_line("")
txt_file.add_line("-" * line_length)
txt_file.add_line("")
txt_file.add_line("## Seite " + str(i))
txt_file.add_line("")
get_text_per_page(pdf, i, txt_file)
if __name__ == "__main__":
main()