Source code for jfscripts.extract_pdftext

#! /usr/bin/env python3

from jfscripts import __version__
from jfscripts._utils import check_dependencies, FilePath, Run
import argparse
import os
import re
import tempfile
import textwrap

# Shared command runner; main() configures it through run.setup() before any
# external program is invoked.
run = Run()

# Width used both for wrapping the extracted text and for the separator rule
# written between pages.
line_length = 72

# Scratch directory that receives the per-page text files written by
# pdftotext. It is created at import time and not removed by the visible code.
tmp_dir = tempfile.mkdtemp()
# NOTE(review): this handle is opened at import time, creating/truncating
# ``export.txt`` in the current working directory. Nothing in the visible code
# writes to it or closes it — confirm whether it is still needed.
output_file = open('export.txt', 'w')
# Pairs handed to check_dependencies() in main(); presumably
# (executable name, providing package) — verify against check_dependencies.
dependencies = (
    ('pdftotext', 'poppler'),
    ('pdfinfo', 'poppler'),
)


class Txt(object):
    """A text file whose lines are also echoed to standard output.

    The file is opened for writing (truncating existing content) when the
    object is created. Call :meth:`close` when done, or use the object as a
    context manager so the file is closed automatically.

    :param path: Path of the text file to write; converted with ``str()``.
    """

    def __init__(self, path):
        self.path = path
        # The written text may contain non-ASCII characters (e.g. umlauts);
        # an explicit encoding keeps the output independent of the locale.
        self.file = open(str(path), 'w', encoding='utf-8')

    def add_line(self, line):
        """Write ``line`` plus a newline to the file and print it.

        :param str line: The text to append, without a trailing newline.
        """
        self.file.write(line + '\n')
        print(line)

    def close(self):
        """Flush and close the underlying file. Safe to call repeatedly."""
        self.file.close()

    def __enter__(self):
        return self

    def __exit__(self, exc_type, exc_value, traceback):
        self.close()
def get_page_count(pdf):
    """Return the number of pages of a PDF file as reported by ``pdfinfo``.

    :param pdf: Path of the PDF file; converted with ``str()``.

    :return: The page count, or ``None`` if the ``pdfinfo`` output contains
      no ``Pages:`` line.
    :rtype: int
    """
    pdfinfo_stdout = run.check_output(['pdfinfo', str(pdf)])
    # Metadata fields may contain bytes that are not valid UTF-8; they must
    # not prevent the page count from being read.
    pdfinfo_text = pdfinfo_stdout.decode('utf-8', errors='replace')
    # Anchor at the start of a line and accept digits only, so that a
    # metadata value that merely contains "Pages:" is never picked up.
    match = re.search(r'^Pages:\s*(\d+)', pdfinfo_text, re.MULTILINE)
    if match:
        return int(match.group(1))
    return None
def get_text_per_page(pdf, page, txt_file):
    """Extract the text of a single PDF page and append it to ``txt_file``.

    The page is converted with ``pdftotext`` into a file inside ``tmp_dir``.
    Lines of 20 characters or fewer are dropped, the remaining lines are
    joined, reduced to letters (including German umlauts and ß), digits and
    spaces, and re-wrapped to ``line_length`` columns.

    :param pdf: Path of the PDF file; converted with ``str()``.
    :param page: The page number to extract (used for both ``-f`` and
      ``-l``).
    :param txt_file: Object with an ``add_line(line)`` method that receives
      each wrapped line.
    """
    page = str(page)
    tmp_txt_path = os.path.join(tmp_dir, page + '.txt')
    run.check_output([
        'pdftotext', '-f', page, '-l', page, str(pdf), tmp_txt_path
    ])

    # pdftotext emits UTF-8 unless told otherwise, so read it as UTF-8 instead
    # of relying on the locale, and make sure the handle is closed again.
    with open(tmp_txt_path, 'r', encoding='utf-8') as tmp_txt_file:
        lines = tmp_txt_file.read().splitlines()

    # Keep only lines longer than 20 characters.
    full_lines = [line for line in lines if len(line) > 20]
    text_of_page = ' '.join(full_lines)

    # NOTE(review): the typographic apostrophe inserted here is removed again
    # by the character whitelist below, so this currently has no visible
    # effect — confirm whether apostrophes were meant to be kept.
    text_of_page = text_of_page.replace("'", u'’')
    text_of_page = re.sub(r'[^a-zäöüA-ZÄÖÜß0-9 ]', '', text_of_page)
    # Collapse the runs of spaces left behind by removed characters.
    text_of_page = re.sub(r'\s+', ' ', text_of_page)

    for line in textwrap.wrap(text_of_page, line_length):
        txt_file.add_line(line)
def get_parser():
    """Build the argument parser of the command line interface.

    The parser accepts one positional ``file`` argument, the boolean
    switches ``-c/--colorize`` and ``-v/--verbose`` and the ``-V/--version``
    action.

    :return: The configured parser.
    :rtype: argparse.ArgumentParser
    """
    cli = argparse.ArgumentParser()
    cli.add_argument('file', help='A PDF file containing text')

    # Boolean switches share the same shape, so they are registered from a
    # table: (short flag, long flag, help text).
    switches = (
        ('-c', '--colorize', 'Colorize the terminal output.'),
        ('-v', '--verbose', 'Make the command line output more verbose.'),
    )
    for short_flag, long_flag, description in switches:
        cli.add_argument(
            short_flag,
            long_flag,
            action='store_true',
            help=description,
        )

    version_text = '%(prog)s {version}'.format(version=__version__)
    cli.add_argument('-V', '--version', action='version', version=version_text)
    return cli
def main():
    """Run the command line interface.

    Parses the arguments, checks the external dependencies and writes the
    text of every page of the given PDF into a ``.txt`` file derived from
    the PDF path (``pdf.new(extension='txt')``). Each page gets a separator
    rule and a ``## Seite <n>`` heading.

    :raises SystemExit: If the page count of the PDF cannot be determined.
    """
    args = get_parser().parse_args()
    run.setup(verbose=args.verbose, colorize=args.colorize)
    check_dependencies(*dependencies)

    pdf = FilePath(args.file, absolute=True)

    # Determine the page count before the output file is opened, so that a
    # failure does not create (or truncate) the text file.
    page_count = get_page_count(pdf)
    if page_count is None:
        raise SystemExit(
            'Could not determine the page count of ' + str(pdf)
        )

    txt_file = Txt(pdf.new(extension='txt'))
    try:
        txt_file.add_line('# ' + pdf.basename)
        for page in range(1, page_count + 1):
            txt_file.add_line('')
            txt_file.add_line('-' * line_length)
            txt_file.add_line('')
            txt_file.add_line('## Seite ' + str(page))
            txt_file.add_line('')
            get_text_per_page(pdf, page, txt_file)
    finally:
        # Close the underlying handle so the output is flushed to disk even
        # if the extraction of a page fails.
        txt_file.file.close()
# Start the command line interface only when the module is executed as a
# script, not when it is imported.
if __name__ == '__main__':
    main()