Source code for jfscripts.pdf_compress

#! /usr/bin/env python3

from jfscripts import __version__
from jfscripts import list_files
from jfscripts._utils import check_dependencies, Run, FilePath
import argparse
import multiprocessing
import os
import PyPDF2
import random
import re
import shutil
import time
import uuid


# Shared command runner; every do_* helper below goes through it.
run = Run()
# Set by main() to the State instance built from the parsed arguments.
state = None
"""The global :class:`State` object."""

# Base string of `tmp_identifier`.
identifier = 'magick'
"""To allow better assignment of the output files."""

# `identifier` plus a UUID generated at import time. collect_images() and
# cleanup() select files whose names contain this string.
tmp_identifier = '{}_{}'.format(identifier, uuid.uuid1())
"""Used for the identification of temporary files."""

# Set by main() to the namespace returned by the argument parser.
args = None
"""The argparse object."""

# Handed to check_dependencies() in main(). The tuples look like
# (executable, package name) pairs -- TODO confirm against check_dependencies.
dependencies = (
    ('convert', 'imagemagick'),
    ('identify', 'imagemagick'),
    ('pdfimages', 'poppler'),
    ('pdfinfo', 'poppler'),
    'pdftk',
    'tesseract',
)


def check_threshold(value):
    """Validate a threshold value and normalize it to a percent string.

    A single trailing percent sign is stripped before the value is parsed
    as an integer.

    :param value: The threshold, for example ``90`` or ``'90%'``.
    :type value: integer or string

    :return: A normalized threshold string (`90%`)
    :rtype: string

    :raises argparse.ArgumentTypeError: If the number is not in the range
      0-100.
    """
    percent = int(re.sub(r'%$', '', str(value)))
    if 0 <= percent <= 100:
        return '{}%'.format(percent)
    raise argparse.ArgumentTypeError(
        '{} is an invalid int value. Should be 0-100'.format(percent)
    )
def get_parser():
    """Build the argument parser for the command line interface.

    The parser defines the global options and the subcommands ``convert``,
    ``extract``, ``join``, ``samples`` and ``unify``. The chosen subcommand
    is stored under ``subcommand`` and is required.

    Long help texts are written as adjacent string literals so that no
    source indentation ends up inside the strings.

    :return: A ArgumentParser object.
    :rtype: argparse.ArgumentParser
    """
    parser = argparse.ArgumentParser(
        description='Convert and compress PDF scans. '
        'Make scans suitable for imslp.org (International Music Score '
        'Library Project). See also '
        'http://imslp.org/wiki/IMSLP:Musiknoten_beisteuern. '
        'The output files are monochrome bitmap images at a resolution of '
        '600 dpi and the compression format CCITT group 4.',
    )

    parser.add_argument(
        '-c', '--colorize',
        action='store_true',
        help='Colorize the terminal output.',
    )

    parser.add_argument(
        '-m', '--multiprocessing',
        action='store_true',
        default=False,
        help='Use multiprocessing to run commands in parallel.',
    )

    parser.add_argument(
        '-N', '--no-cleanup',
        action='store_true',
        help='Don’t clean up the temporary files.',
    )

    parser.add_argument(
        '-v', '--verbose',
        action='store_true',
        help='Make the command line output more verbose.',
    )

    parser.add_argument(
        '-V', '--version',
        action='version',
        version='%(prog)s {version}'.format(version=__version__),
    )

    subcommand = parser.add_subparsers(
        dest='subcommand',
        help='Subcommand',
    )
    subcommand.required = True

    ##
    # convert
    ##

    convert_parser = subcommand.add_parser(
        'convert', aliases=['con', 'c'],
        description='Convert scanned images (can be many image '
        'file formats or a PDF file) in monochrome bitmap images. The '
        'resulting images are compressed using the CCITT group 4 compression.'
    )

    convert_parser_color = convert_parser.add_mutually_exclusive_group()
    convert_parser_compress = convert_parser.add_mutually_exclusive_group()

    # auto_black_white
    convert_parser_color.add_argument(
        '-a', '--auto-black-white',
        action='store_true',
        help='The same as “'
        '--deskew '
        '--join '
        '--ocr '
        '--pdf '
        '--resize '
        '--trim '
        '--unify”',
    )

    # auto_color
    convert_parser_color.add_argument(
        '-C', '--auto-color',
        action='store_true',
        help='The same as “'
        '--color '
        '--deskew '
        '--join '
        '--ocr '
        '--pdf '
        '--resize '
        '--trim '
        '--unify”',
    )

    # auto_png
    convert_parser_color.add_argument(
        '-P', '--auto-png',
        action='store_true',
        help='The same as “'
        '--deskew '
        '--resize '
        '--trim”',
    )

    # backup
    convert_parser.add_argument(
        '-b', '--backup',
        action='store_true',
        help='Backup original images (add _backup.ext to filename).',
    )

    # blur
    convert_parser.add_argument(
        '--blur',
        nargs=1,
        default=False,
        help='Blur images for better jpeg2000 compression rate.',
    )

    # border
    convert_parser.add_argument(
        '-B', '--border',
        action='store_true',
        help='Frame the images with a white border.',
    )

    # color
    convert_parser.add_argument(
        '-c', '--color',
        action='store_true',
        help='The input files are colored images.',
    )

    # deskew
    convert_parser.add_argument(
        '-d', '--deskew',
        action='store_true',
        help='Straighten the images.',
    )

    # enlighten_border
    convert_parser.add_argument(
        '-e', '--enlighten-border',
        action='store_true',
        help='Enlighten the border.',
    )

    # force
    convert_parser.add_argument(
        '-f', '--force',
        action='store_true',
        help='Overwrite the output file even if it exists and it seems to be '
        'already converted.',
    )

    # join
    convert_parser.add_argument(
        '-j', '--join',
        action='store_true',
        help='Join single paged PDF files to one PDF file. This option takes '
        'only effect with the option --pdf.',
    )

    # ocr
    convert_parser.add_argument(
        '-o', '--ocr',
        action='store_true',
        default=False,
        # The trailing space keeps the two sentences apart in the help output.
        help='Perform optical character recognition (OCR) on the input '
        'files. The output format must be PDF.',
    )

    # ocr_language
    convert_parser.add_argument(
        '-l', '--ocr-language',
        nargs='+',
        help='Run tesseract --list-langs to get your installed languages.',
    )

    # pdf
    convert_parser.add_argument(
        '-p', '--pdf',
        action='store_true',
        help='Generate a PDF file.',
    )

    # png
    convert_parser.add_argument(
        '-n', '--png',
        action='store_true',
        help='Generate a PNG file.',
    )

    # quality
    convert_parser_compress.add_argument(
        '-q', '--quality',
        default=False,
        help='Compress the input images in a specific quality. The command '
        'automatically turns into the color mode.',
    )

    # resize
    convert_parser.add_argument(
        '-r', '--resize',
        action='store_true',
        help='Resize 200 percent.',
    )

    # threshold
    convert_parser_compress.add_argument(
        '-t', '--threshold',
        default='50%',
        type=check_threshold,
        help='Threshold for monochrome, black and white images, default 50 '
        'percent. Colors above the threshold will be white and below will be '
        'black.',
    )

    # trim
    convert_parser.add_argument(
        '-T', '--trim',
        action='store_true',
        help='This option removes any edges that are exactly the same color '
        'as the corner pixels.',
    )

    # unify
    convert_parser.add_argument(
        '-u', '--unify',
        action='store_true',
        help='Unify the page size of all pages in a PDF File. The output must '
        'be a joined PDF.',
    )

    convert_parser.add_argument(
        'input_files',
        help=list_files.doc_examples('%(prog)s', 'tiff'),
        nargs='+',
    )

    ##
    # extract
    ##

    extract_parser = subcommand.add_parser(
        'extract', aliases=['ex', 'e'],
        description='Extract images from a PDF file and export them in the '
        'TIFF format.'
    )

    extract_parser.add_argument(
        'input_files',
        metavar='input_file',
        help='A pdf file',
        nargs='+',
    )

    ##
    # join
    ##

    join_parser = subcommand.add_parser(
        'join', aliases=['jn', 'j'],
        description='Join the input files into a single PDF file. If the '
        'input file is not PDF file, it is converted into a monochrome CCITT '
        'Group 4 compressed PDF file.',
    )

    join_parser.add_argument(
        '-o', '--ocr',
        action='store_true',
        default=False,
        help='Perform optical character recognition (OCR) on the input files.',
    )

    join_parser.add_argument(
        '-l', '--ocr-language',
        nargs='+',
        help='Run tesseract --list-langs to get your installed languages.',
    )

    join_parser.add_argument(
        'input_files',
        help=list_files.doc_examples('%(prog)s', 'png'),
        nargs='+',
    )

    ##
    # samples
    ##

    samples_parser = subcommand.add_parser(
        'samples', aliases=['sp', 's'],
        description='Convert the same image with different threshold values '
        'to find the best threshold value.',
    )

    # No nargs here: the value is a single string, not a list.
    samples_parser.add_argument(
        'input_files',
        metavar='input_file',
        help='An image or a PDF file. The script selects randomly one page of '
        'a multipaged PDF to build the series with different threshold '
        'values.',
    )

    samples_parser.add_argument(
        '-b', '--blur',
        action='store_true',
        help='Convert images on different blur values.',
    )

    samples_parser.add_argument(
        '-q', '--quality',
        action='store_true',
        help='Compress to JPEG2000 images in different quality steps.',
    )

    samples_parser.add_argument(
        '-t', '--threshold',
        action='store_true',
        help='Convert images on different threshold values to monochrome '
        'black and white images.',
    )

    ##
    # unify
    ##

    unify_parser = subcommand.add_parser(
        'unify', aliases=['un', 'u'],
        description='Unify the page size of all pages in a PDF File.',
    )

    # No nargs here: the value is a single string, not a list.
    unify_parser.add_argument(
        'input_files',
        metavar='input_file',
        help='A PDF file',
    )

    # margin
    unify_parser.add_argument(
        '-m', '--margin',
        nargs=1,
        help='Add a margin around each page in the PDF file.',
    )

    return parser
############################################################################### # do_* functions (alphabetically sorted) ############################################################################### # do_magick_convert # do_magick_convert_pdf # do_magick_identify # do_pdfimages # do_pdfinfo_page_count # do_pdftk_cat # do_tesseract
[docs]def _do_magick_command(command): """ImageMagick version 7 introduces a new top level command named `magick`. Use this newer command if present. :return: A list of command segments """ if shutil.which('magick'): return ['magick', command] else: return [command]
[docs]def _do_magick_convert_enlighten_border(width, height): """ Build the command line arguments to enlighten the border in four regions. :param int width: The width of the image. :param int height: The height of the image. :return: Command line arguments for imagemagicks’ `convert`. :rtype: list """ border = int(round(((width + height) / 2) * 0.05)) # top # right # bottom # left r = ('{}x{}'.format(width - border, border), '{}x{}+{}'.format(border, height - border, width - border), '{}x{}+{}+{}'.format(width - border, border, border, height - border), '{}x{}+{}+{}'.format(border, height - border, 0, border)) out = [] for region in r: out += ['-region', region, '-level', '0%,30%'] return out
def do_magick_convert(input_file, output_file, threshold=None,
                      enlighten_border=False, border=False, resize=False,
                      deskew=False, trim=False, color=False, quality=75,
                      blur=False):
    """Convert an input image file using the subcommand `convert` of the
    imagemagick suite.

    :param input_file: The image to read.
    :type input_file: jfscripts._utils.FilePath
    :param output_file: The image to write. Its `extension` attribute
      decides whether JPEG2000 compression is requested in color mode.
    :type output_file: jfscripts._utils.FilePath
    :param str threshold: Threshold passed to `-threshold`. Ignored in
      color mode.
    :param bool enlighten_border: Lighten the four border regions.
    :param bool border: Frame the image with a white border.
    :param bool resize: Resize the image to 200 percent.
    :param bool deskew: Straighten the image.
    :param bool trim: Trim the edges of the image.
    :param bool color: Keep the colors instead of producing a monochrome,
      Group4 compressed image.
    :param quality: Value passed to `-quality` in color mode.
    :param blur: Value passed to `-blur`, or a false value for no blur.

    :return: The return value of `run.run` for the assembled command.
      Callers in this module read `returncode` from it.
    """
    cmd_args = _do_magick_command('convert')

    cmd_args += ['-units', 'PixelsPerInch']

    if enlighten_border:
        # The regions depend on the pixel size of the input image.
        info_input_file = do_magick_identify(input_file)
        cmd_args += _do_magick_convert_enlighten_border(
            info_input_file['width'],
            info_input_file['height'],
        )

    if resize:
        cmd_args += ['-resize', '200%']

    if deskew:
        cmd_args += ['-deskew', '40%']

    if threshold and not color:
        cmd_args += ['-threshold', threshold]

    if trim:
        cmd_args += ['-trim', '+repage']

    if border:
        # `-bordercolor` is a setting and only affects operators that come
        # after it, so it has to precede `-border` for the border to be
        # white.
        cmd_args += ['-bordercolor', '#FFFFFF', '-border', '5%']

    if blur:
        cmd_args += ['-blur', str(blur)]

    if not color:
        cmd_args += ['-compress', 'Group4', '-monochrome']
    else:
        cmd_args += ['-quality', str(quality)]

    if color and output_file.extension == 'pdf':
        cmd_args += ['-compress', 'JPEG2000']

    cmd_args += [str(input_file), str(output_file)]
    return run.run(cmd_args)
def do_magick_identify(input_file):
    """Query width, height and color count of an image.

    The `identify` tool is called once per value, each time with a
    different `-format` string. The output is decoded as UTF-8 and parsed
    as an integer.

    :param input_file: The input file.
    :type input_file: jfscripts._utils.FilePath

    :return: A dictionary with the keys `width`, `height` and `colors`.
    :rtype: dict
    """
    def query(format_string):
        command = _do_magick_command('identify')
        command += ['-format', format_string, str(input_file)]
        return int(run.check_output(command).decode('utf-8'))

    return {
        'width': query('%w'),
        'height': query('%h'),
        'colors': query('%k'),
    }
def do_pdfimages(pdf_file, state, page_number=None, use_tmp_identifier=True):
    """Extract the images of a PDF file in the TIFF format with `pdfimages`.

    The command runs in the common path of the state object, so the image
    root is given as a bare name.

    :param pdf_file: The input file.
    :type pdf_file: jfscripts._utils.FilePath
    :param state: The state object.
    :type state: jfscripts.pdf_compress.State
    :param int page_number: Extract only the page with a specific page
      number.
    :param bool use_tmp_identifier: Append the temporary identifier to the
      image root so that the extracted images can be collected and cleaned
      up later.

    :return: The return value of `run.run`.
    """
    image_root = pdf_file.basename
    if use_tmp_identifier:
        image_root = '{}_{}'.format(image_root, tmp_identifier)

    command = ['pdfimages', '-tiff']
    if page_number:
        # First and last page are the same: extract exactly one page.
        page = str(page_number)
        command += ['-f', page, '-l', page]
    command += [str(pdf_file), image_root]

    return run.run(command, cwd=state.common_path)
def do_pdfinfo_page_count(pdf_file):
    """Get the amount of pages a PDF file has.

    The number is read from the line starting with `Pages:` in the output
    of `pdfinfo`.

    :param str pdf_file: Path of the PDF file.

    :return: Page count
    :rtype: int

    :raises RuntimeError: If the output of `pdfinfo` contains no page
      count.
    """
    output = run.check_output(['pdfinfo', str(pdf_file)], encoding='utf-8')
    # Anchored at the start of a line so that the text "Pages:" inside
    # another field (for example the title) is not picked up. At least one
    # digit is required so that int() never sees an empty string.
    page_count = re.search(r'^Pages:\s*([0-9]+)', output, re.MULTILINE)
    if page_count is None:
        raise RuntimeError(
            'Could not find the page count in the pdfinfo output of '
            '{}.'.format(pdf_file)
        )
    return int(page_count.group(1))
def do_pdftk_cat(pdf_files, state):
    """Join a list of PDF files into a single PDF file using the tool
    `pdftk`.

    The joined file is written into the common path of the state object.
    Its name is the basename of the first input file followed by
    `_magick.pdf`. A success message is printed when `pdftk` exits with
    the return code 0.

    :param list pdf_files: a list of PDF files
    :param state: The state object.
    :type state: jfscripts.pdf_compress.State

    :return: None
    """
    cmd = ['pdftk'] + [str(pdf_file) for pdf_file in pdf_files]

    joined_name = '{}_magick.pdf'.format(state.first_input_file.basename)
    output_file_path = os.path.join(state.common_path, joined_name)
    cmd.extend(['cat', 'output', output_file_path])

    if run.run(cmd).returncode == 0:
        print('Successfully created: {}'.format(output_file_path))
def do_tesseract(input_file, languages=('deu', 'eng')):
    """Run `tesseract` on an image and request the output format `pdf`.

    The `base` attribute of the input file is passed as the output base.

    :param input_file: The image to recognize.
    :type input_file: jfscripts._utils.FilePath
    :param languages: Language codes that are joined with `+` and passed
      to the option `-l`. An empty or false value omits the option.
      The default is an immutable tuple so that it cannot be changed
      between calls.
    :type languages: list or tuple

    :return: The return value of `run.run`; stdout and stderr are piped.
    """
    cmd_args = ['tesseract']
    if languages:
        cmd_args += ['-l', '+'.join(languages)]
    cmd_args += [str(input_file), input_file.base, 'pdf']
    return run.run(cmd_args, stderr=run.PIPE, stdout=run.PIPE)
############################################################################### # ###############################################################################
def collect_images(state):
    """Collect all images using the temporary identifier in a common path.

    Only directory entries whose name contains the temporary identifier
    and whose size is larger than 200 bytes are returned.

    :param state: The state object.
    :type state: jfscripts.pdf_compress.State

    :return: A sorted list of image paths.
    :rtype: list
    """
    directory = state.common_path
    candidates = (
        os.path.join(directory, name)
        for name in os.listdir(directory)
        if tmp_identifier in name
    )
    return sorted(path for path in candidates if os.path.getsize(path) > 200)
def cleanup(state):
    """Delete all files using the temporary identifier in a common path.

    :param state: The state object.
    :type state: jfscripts.pdf_compress.State

    :return: None
    """
    directory = state.common_path
    temporary_names = [
        name for name in os.listdir(directory) if tmp_identifier in name
    ]
    for name in temporary_names:
        os.remove(os.path.join(directory, name))
def unify_page_size(input_file, output_file, margin=0):
    """Write a copy of a PDF file in which all pages have the same size.

    The new page size is the largest width and the largest height found in
    the input file, enlarged by the margin on every side. Each page is
    centered on a blank page of that size.

    :param input_file: Path of the PDF file to read.
    :param output_file: Path of the PDF file to write.
    :param margin: Space added around each page, in the units of the
      media box.

    :return: None
    """
    # The reader keeps using the input handle, so it stays open until the
    # output has been written. Both handles are closed on exit, also when
    # an error occurs.
    with open(str(input_file), 'rb') as input_handle:
        input_pdf = PyPDF2.PdfFileReader(input_handle)
        output_pdf = PyPDF2.PdfFileWriter()

        max_width = 0
        max_height = 0

        for page in input_pdf.pages:
            width = page.mediaBox.getWidth()
            height = page.mediaBox.getHeight()
            if width > max_width:
                max_width = width
            if height > max_height:
                max_height = height

        for page in input_pdf.pages:
            width = page.mediaBox.getWidth()
            height = page.mediaBox.getHeight()

            blank = PyPDF2.pdf.PageObject.createBlankPage(
                None,
                max_width + 2 * margin,
                max_height + 2 * margin,
            )

            # Scale factor 1; the translation centers the page.
            blank.mergeScaledTranslatedPage(
                page,
                1,
                margin + (max_width - width) / 2,
                margin + (max_height - height) / 2,
            )
            output_pdf.addPage(blank)

        with open(str(output_file), 'wb') as output_handle:
            output_pdf.write(output_handle)
############################################################################### # subcommand wrapper functions ###############################################################################
def subcommand_convert_file(arguments):
    """Manipulate one input file.

    The function takes a single tuple so that it can be used with
    `multiprocessing.Pool.map`.

    :param tuple arguments: A tuple containing two elements: The first
      element is the input_file file object and the second element is the
      state object.

    :return: The output file. With OCR enabled this is the PDF file
      written by tesseract.

    :raises RuntimeError: If the conversion or the OCR step fails.
    """
    input_file = arguments[0]
    # Read the options from the state object that travels with the task.
    # The module global `args` is only assigned in the process that ran
    # main(), so it is not available in worker processes that re-import
    # the module. It remains the fallback for callers that pass no state.
    options = arguments[1].args if len(arguments) > 1 else args

    if options.color:
        intermediate_extension = 'jp2'
    else:
        intermediate_extension = 'tiff'

    if options.pdf:
        extension = 'pdf'
    elif options.auto_png or options.png:
        extension = 'png'
    else:
        extension = intermediate_extension

    # Tesseract produces the final PDF, so convert must write an image.
    if options.ocr:
        extension = intermediate_extension

    if not options.join:
        output_file = input_file.new(extension=extension,
                                     del_substring='_' + tmp_identifier)
    else:
        output_file = input_file.new(extension=extension)

    if input_file == output_file:
        info_output_file = do_magick_identify(output_file)
        if info_output_file['colors'] == 2 and not options.force:
            print('The output file has already been converted.')
            return output_file

    # NOTE(review): the nesting of this statement was reconstructed from a
    # whitespace-collapsed source -- confirm that the backup is meant to be
    # made for every converted file.
    if options.backup:
        backup = input_file.new(append='_backup')
        shutil.copy2(str(input_file), str(backup))

    completed_process = do_magick_convert(
        input_file,
        output_file,
        threshold=options.threshold,
        enlighten_border=options.enlighten_border,
        border=options.border,
        resize=options.resize,
        deskew=options.deskew,
        trim=options.trim,
        color=options.color,
        quality=options.quality,
        blur=options.blur,
    )

    if completed_process.returncode != 0:
        raise RuntimeError('magick convert failed.')

    if options.ocr:
        if output_file.extension not in ['tiff', 'jp2']:
            raise RuntimeError('Tesseract needs a tiff or a jp2 file as '
                               'input.')
        completed_process = do_tesseract(output_file, options.ocr_language)
        if completed_process.returncode != 0:
            raise RuntimeError('tesseract failed.')
        # The intermediate image is no longer needed.
        os.remove(str(output_file))
        output_file = output_file.new(extension='pdf')

    return output_file
def subcommand_join_convert_pdf(arguments):
    """Convert one input file of the `join` subcommand into a PDF file.

    The function takes a single tuple so that it can be used with
    `multiprocessing.Pool.map`.

    :param tuple arguments: A tuple containing two elements: The first
      element is the input_file file object and the second element is the
      state object.

    :return: The PDF file that was created.

    :raises RuntimeError: If the conversion or the OCR step fails.
    """
    input_file = arguments[0]
    # Read the options from the state object that travels with the task.
    # The module global `args` is only assigned in the process that ran
    # main(), so it is not available in worker processes that re-import
    # the module. It remains the fallback for callers that pass no state.
    options = arguments[1].args if len(arguments) > 1 else args

    # Tesseract produces the final PDF, so convert must write an image.
    extension = 'tiff' if options.ocr else 'pdf'
    output_file = input_file.new(extension=extension)

    process = do_magick_convert(input_file, output_file)
    if process.returncode != 0:
        raise RuntimeError('join: convert to pdf failed.')

    if options.ocr:
        # Honor --ocr-language; without it the default languages of
        # do_tesseract() are used as before.
        if options.ocr_language:
            process = do_tesseract(output_file, options.ocr_language)
        else:
            process = do_tesseract(output_file)
        if process.returncode != 0:
            raise RuntimeError('join: ocr failed.')
        # The intermediate image is no longer needed.
        os.remove(str(output_file))
        output_file = output_file.new(extension='pdf')

    return output_file
def subcommand_samples(input_file, state):
    """Generate a list of example files with different threshold, quality
    and blur values.

    For a PDF input one randomly chosen page is extracted and used as the
    source image of all series.

    :param input_file: The input file.
    :type input_file: jfscripts._utils.FilePath
    :param state: The state object.
    :type state: jfscripts.pdf_compress.State

    :return: None

    :raises RuntimeError: If no image could be extracted from the chosen
      page of a PDF file.
    """
    args = state.args

    if state.input_is_pdf:
        page_count = do_pdfinfo_page_count(input_file)
        page_number = random.randint(1, page_count)
        print('Used page number {} of {} pages to generate a series of images '
              'with different threshold values.'
              .format(page_number, page_count))
        do_pdfimages(input_file, state, page_number)
        images = collect_images(state)
        if not images:
            raise RuntimeError(
                'No image could be extracted from page {} of {}.'
                .format(page_number, input_file)
            )
        input_file = FilePath(images[0], absolute=True)

    def sample_path(appendix, extension):
        """Build the output path of one sample file."""
        output_file = input_file.new(extension=extension, append=appendix,
                                     del_substring=tmp_identifier)
        # Drop what is left of the name of an extracted page image once
        # the temporary identifier has been removed.
        output_file = str(output_file).replace('_-000', '')
        return FilePath(output_file, absolute=True)

    if args.threshold:
        for threshold in range(40, 100, 5):
            do_magick_convert(
                input_file,
                sample_path('_threshold-{}'.format(threshold), 'tiff'),
                threshold='{}%'.format(threshold),
            )

    if args.quality:
        for quality in range(40, 100, 5):
            do_magick_convert(
                input_file,
                sample_path('_quality-{}'.format(quality), 'pdf'),
                color=True,
                quality=quality,
            )

    if args.blur:
        for blur in (1, 2, 3, 4, 5):
            do_magick_convert(
                input_file,
                sample_path('_blur-{}'.format(blur), 'pdf'),
                color=True,
                blur=blur,
                quality=100,
            )
############################################################################### # ###############################################################################
class Timer(object):
    """Class to calculate the execution time. Mainly to test the speed
    improvements of the multiprocessing implementation."""

    def __init__(self):
        now = time.time()
        # UNIX timestamp the execution began.
        self.begin = now
        # UNIX timestamp the execution ended. Equal to `begin` until
        # stop() is called.
        self.end = now

    def stop(self):
        """Stop the time calculation and return the formatted result.

        :return: The elapsed seconds with one decimal place, followed by
          the letter `s`.
        :rtype: str
        """
        self.end = time.time()
        elapsed = self.end - self.begin
        return '{:.1f}s'.format(elapsed)
class State(object):
    """This object holds runtime data for the multiprocessing environment."""

    def __init__(self, args):
        # argparse arguments
        self.args = args

        # The current working directory
        self.cwd = os.getcwd()

        # A list of all input files. A single string is wrapped in a list;
        # anything else is expanded by list_files.
        requested = self.args.input_files
        if isinstance(requested, str):
            self.input_files = [requested]
        else:
            self.input_files = list_files.list_files(requested)

        # The common path prefix of all input files. The current working
        # directory is used when the prefix is empty.
        shared_prefix = list_files.common_path(self.input_files)
        self.common_path = self.cwd if shared_prefix == '' else shared_prefix

        # The first input file.
        self.first_input_file = FilePath(self.input_files[0], absolute=True)

        # Boolean that indicates if the first file is a pdf. The extension
        # is compared case-insensitively.
        self.input_is_pdf = \
            self.first_input_file.extension.lower() == 'pdf'
def convert_file_paths(files):
    """Turn a list of file paths into a list of
    :class:`jfscripts._utils.FilePath` objects.

    Every object is created with `absolute=True`.

    :param list files: A list of file paths

    :return: a list of :class:`jfscripts._utils.FilePath` objects.
    """
    return [FilePath(path, absolute=True) for path in files]
def _process_input_files(worker, input_files, state, parallel):
    """Call `worker` with an (input_file, state) tuple for every input file.

    :param worker: A module level function that takes one tuple.
    :param list input_files: The input files to process.
    :param state: The state object that is handed to every call.
    :param bool parallel: Distribute the calls over a process pool.

    :return: The results of the calls in the order of the input files.
    :rtype: list
    """
    data = [(input_file, state) for input_file in input_files]
    if parallel:
        # map() blocks until all results are available; leaving the
        # `with` block then shuts the worker processes down.
        with multiprocessing.Pool() as pool:
            return pool.map(worker, data)
    return [worker(item) for item in data]


def main():
    """Main function.

    Parse the command line, fill the module globals `args` and `state`,
    check the external dependencies and run the selected subcommand.
    Finally the execution time is printed.

    :return: None

    :raises ValueError: If the input files do not fit the subcommand.
    """
    global args, state

    timer = Timer()

    args = get_parser().parse_args()

    run.setup(verbose=args.verbose, colorize=args.colorize)

    state = State(args)

    check_dependencies(*dependencies)

    ##
    # convert
    ##

    # `subcommand` holds the name as it was typed, so every alias has to be
    # listed. 'con' is the alias declared by the parser; 'cv' is kept for
    # backward compatibility.
    if args.subcommand in ['convert', 'con', 'cv', 'c']:

        if args.join and not args.pdf:
            args.pdf = True

        if args.auto_black_white or args.auto_color:
            args.deskew = True
            args.join = True
            args.ocr = True
            args.pdf = True
            args.trim = True
            args.unify = True

        if args.auto_black_white:
            args.resize = True

        if args.auto_png:
            args.deskew = True
            args.trim = True
            args.resize = True

        if args.auto_color:
            args.color = True

        if args.quality and not args.color:
            args.color = True

        if args.color and not args.quality:
            args.quality = 75

        # --blur is declared with nargs=1 and therefore arrives as a list.
        if args.blur:
            args.blur = args.blur[0]

        # input_is_pdf compares the extension case-insensitively.
        if state.input_is_pdf:
            if len(state.input_files) > 1:
                raise ValueError('Specify only one PDF file.')
            do_pdfimages(state.first_input_file, state)
            input_files = collect_images(state)
        else:
            input_files = state.input_files

        input_files = convert_file_paths(input_files)

        output_files = _process_input_files(
            subcommand_convert_file, input_files, state,
            args.multiprocessing,
        )

        if args.join:
            do_pdftk_cat(output_files, state)

        # NOTE(review): the nesting of this statement was reconstructed
        # from a whitespace-collapsed source -- confirm whether it belongs
        # inside the `args.join` branch.
        if args.unify and len(state.input_files) > 1:
            unify_page_size(
                state.first_input_file.new(append='_magick'),
                state.first_input_file.new(append='_unifed'),
                margin=0,
            )

        if not args.no_cleanup:
            cleanup(state)

    ##
    # extract
    ##

    elif args.subcommand in ['extract', 'ex', 'e']:
        if not state.input_is_pdf:
            raise ValueError('Specify a PDF file.')
        do_pdfimages(state.first_input_file, state, page_number=None,
                     use_tmp_identifier=False)

    ##
    # join
    ##

    elif args.subcommand in ['join', 'jn', 'j']:
        input_files = convert_file_paths(state.input_files)

        files_converted = _process_input_files(
            subcommand_join_convert_pdf, input_files, state,
            args.multiprocessing,
        )

        do_pdftk_cat(files_converted, state)

    ##
    # samples
    ##

    elif args.subcommand in ['samples', 'sp', 's']:
        # Without an explicit selection all three series are generated.
        if not (args.blur or args.quality or args.threshold):
            args.blur = True
            args.quality = True
            args.threshold = True
        subcommand_samples(state.first_input_file, state)

        if not args.no_cleanup:
            cleanup(state)

    ##
    # unify
    ##

    elif args.subcommand in ['unify', 'un', 'u']:
        if not state.input_is_pdf:
            raise ValueError('Specify a PDF file.')
        if len(state.input_files) > 1:
            raise ValueError('Specify only one PDF file.')

        # --margin is declared with nargs=1 and therefore arrives as a
        # list.
        if args.margin:
            margin = int(args.margin[0])
        else:
            margin = 0
        unify_page_size(
            state.first_input_file,
            state.first_input_file.new(append='_unifed'),
            margin,
        )

    print('Execution time: {}'.format(timer.stop()))
# Run the command line interface when the module is executed as a script.
if __name__ == '__main__': main()