- Notifications
You must be signed in to change notification settings - Fork 1.9k
/
Copy pathexample_plugin.py
68 lines (49 loc) · 1.69 KB
/
example_plugin.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
# SPDX-FileCopyrightText: 2022 James R Barlow: https://github.com/jbarlow83
# SPDX-License-Identifier: MIT
"""An example of an OCRmyPDF plugin.
This plugin adds two new command line arguments
--grayscale-ocr: converts the image to grayscale before performing OCR on it
(This is occasionally useful for images whose color confounds OCR. It only
affects the image shown to OCR. The image is not saved.)
--mono-page: converts all pages in the output file to black and white
To use this from the command line:
ocrmypdf --plugin path/to/example_plugin.py --mono-page input.pdf output.pdf
To use this as an API:
import ocrmypdf
ocrmypdf.ocr('input.pdf', 'output.pdf',
plugins=['path/to/example_plugin.py'], mono_page=True
)
"""
from __future__ import annotations

import logging

from PIL import Image

from ocrmypdf import hookimpl

# Module-level logger shared by the hook implementations in this plugin.
log = logging.getLogger(__name__)
@hookimpl
def add_options(parser):
    """Register this plugin's command line options on the parser.

    Adds two boolean flags, ``--grayscale-ocr`` and ``--mono-page``, both
    defaulting to off.

    Args:
        parser: Argument parser to extend; only ``add_argument`` is called.
    """
    parser.add_argument(
        '--grayscale-ocr',
        action='store_true',
        help=(
            "Convert the image to grayscale before performing OCR on it. "
            "Only the image shown to OCR is affected; it is not saved."
        ),
    )
    parser.add_argument(
        '--mono-page',
        action='store_true',
        help="Convert all pages in the output file to black and white.",
    )
@hookimpl
def prepare(options):
    """Do nothing; this example plugin needs no preparation step.

    Args:
        options: Parsed options; not used.
    """
    return None
@hookimpl
def validate(pdfinfo, options):
    """Do nothing; this example plugin performs no validation.

    Args:
        pdfinfo: Information about the input PDF; not used.
        options: Parsed options; not used.
    """
    return None
@hookimpl
def filter_ocr_image(page, image):
    """Return the image to use for OCR on this page.

    Args:
        page: Page context; only ``page.options.grayscale_ocr`` is read.
        image: Image object for the page; must provide ``convert``.

    Returns:
        A grayscale (mode ``'L'``) conversion of ``image`` when
        ``--grayscale-ocr`` is enabled, otherwise ``image`` itself.
    """
    # Guard clause: leave the image untouched unless the option is set.
    if not page.options.grayscale_ocr:
        return image

    log.info("graying")
    grayscale = image.convert('L')
    return grayscale
@hookimpl
def filter_page_image(page, image_filename):
    """Rewrite the page image and return the path of the file to use.

    With ``--mono-page`` the image is converted to 1-bit black and white and
    saved over ``image_filename``. Otherwise the image is re-saved as a JPEG
    whose name is ``image_filename`` with the suffix replaced by ``.jpg``.

    Args:
        page: Page context; only ``page.options.mono_page`` is read.
        image_filename: Path of the image to process. Must provide
            ``with_suffix`` (for example a ``pathlib.Path``).

    Returns:
        The path of the image file that was written.
    """
    # Modes Pillow's JPEG writer accepts as-is; anything else (palette 'P',
    # 'RGBA', 'LA', ...) has to be converted before it can be saved as JPEG.
    jpeg_modes = ('1', 'L', 'RGB', 'RGBX', 'CMYK', 'YCbCr')

    with Image.open(image_filename) as im:
        # Pillow's PNG and JPEG writers only emit resolution metadata when
        # ``dpi`` is passed to save(), so forward the source DPI explicitly
        # to keep the physical page dimensions unchanged.
        save_args = {}
        dpi = im.info.get('dpi')
        if dpi:
            save_args['dpi'] = dpi

        if page.options.mono_page:
            # convert() loads the pixel data, so overwriting the source file
            # here is safe.
            im.convert('1').save(image_filename, **save_args)
            return image_filename

        output = image_filename.with_suffix('.jpg')
        jpeg_im = im if im.mode in jpeg_modes else im.convert('RGB')
        jpeg_im.save(output, **save_args)
        return output