- Notifications
You must be signed in to change notification settings - Fork 1.9k
/
Copy pathocrmypdf_compare.py
128 lines (109 loc) · 4.02 KB
/
ocrmypdf_compare.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
# SPDX-FileCopyrightText: 2025 James R. Barlow
# SPDX-License-Identifier: MIT
"""Run OCRmyPDF on the same PDF with different options."""
from __future__ importannotations
importos
importshlex
fromioimportBytesIO
frompathlibimportPath
fromsubprocessimportcheck_output, run
fromtempfileimportTemporaryDirectory
importpikepdf
importpymupdf
importstreamlitasst
fromlxmlimportetree
fromstreamlit_pdf_viewerimportpdf_viewer
def do_column(label, suffix, d):
    """Render the input widgets for one side of the comparison.

    Draws a command-line text area and an environment-variable text area,
    then echoes the effective environment and the resolved command back to
    the page.

    Args:
        label: Human-readable name of the column, used in widget captions.
        suffix: Short string appended to the Streamlit widget keys and to the
            output file name (``output{suffix}.pdf``).
        d: Directory that holds ``input.pdf`` and receives the output file.

    Returns:
        An ``(env, args)`` tuple: a copy of ``os.environ`` with the entered
        ``KEY=VALUE`` lines applied, and the argument list of the command.
    """
    cli = st.text_area(
        f"Command line arguments for {label}",
        key=f"args{suffix}",
        value="ocrmypdf {in_} {out}",
    )
    env_text = st.text_area(f"Environment variables for {label}", key=f"env{suffix}")

    env = os.environ.copy()
    for line in env_text.splitlines():
        if not line.strip():
            # Blank or whitespace-only lines are separators, not errors.
            continue
        name, sep, value = line.partition("=")
        name = name.strip()
        if not sep or not name:
            # No "=" at all, or nothing before it: stop at the first bad line.
            st.error(f"Invalid environment variable: {line}")
            break
        env[name] = value

    placeholders = {
        "in_": os.path.join(d, "input.pdf"),
        "out": os.path.join(d, f"output{suffix}.pdf"),
    }
    # Tokenize the template first and substitute per token afterwards, so a
    # path containing whitespace or quote characters stays a single argument
    # instead of being re-split by shlex.
    args = [token.format(**placeholders) for token in shlex.split(cli)]

    with st.expander("Environment variables", expanded=bool(env_text.strip())):
        st.code('\n'.join(f"{k}={v}" for k, v in env.items()))
    st.code(shlex.join(args))
    return env, args
def _run_variant(label, args, env, output):
    """Run one command line and return True if it produced *output*."""
    if not args:
        st.error(f"No command given for {label}.")
        return False
    try:
        proc = run(args, env=env)
    except OSError as exc:
        # Raised when the executable cannot be found or started.
        st.error(f"Could not start command {label}: {exc}")
        return False
    if proc.returncode != 0:
        # A non-zero status is only reported here; whether to continue is
        # decided by the presence of the output file below.
        st.warning(f"Command {label} exited with status {proc.returncode}.")
    if not output.exists():
        st.error(f"Command {label} did not produce {output.name}.")
        return False
    return True


def main():
    """Build the Streamlit page that compares two OCRmyPDF runs.

    Flow: upload a PDF, show its metadata, collect a command line and
    environment for columns A and B, and, once the button is pressed, run
    both commands against the same input and show the extracted text of each
    output page side by side, followed by a viewer for each output PDF.

    Raises:
        FileNotFoundError: from ``check_output`` if ``gs`` is not on the PATH
            of the respective environment.
    """
    st.set_page_config(layout="wide")
    st.title("OCRmyPDF Compare")
    st.write("Run OCRmyPDF on the same PDF with different options.")
    st.warning("This is a testing tool and is not intended for production use.")

    uploaded_pdf = st.file_uploader("Upload a PDF", type=["pdf"])
    if uploaded_pdf is None:
        return
    pdf_bytes = uploaded_pdf.read()

    with pikepdf.open(BytesIO(pdf_bytes)) as p, TemporaryDirectory() as d:
        with st.expander("PDF Metadata"):
            # Read-only view: keep pikepdf from stamping itself as the editor
            # or resyncing DocumentInfo from XMP when the context exits, so
            # the docinfo shown below is the uploaded file's own.
            with p.open_metadata(
                set_pikepdf_as_editor=False, update_docinfo=False
            ) as meta:
                xml_txt = str(meta)
            parser = etree.XMLParser(remove_blank_text=True)
            tree = etree.fromstring(xml_txt, parser=parser)
            st.code(
                etree.tostring(tree, pretty_print=True).decode("utf-8"),
                language="xml",
            )
            st.write(p.docinfo)
            st.write("Number of pages:", len(p.pages))

        col1, col2 = st.columns(2)
        with col1:
            env1, args1 = do_column("A", "1", d)
        with col2:
            env2, args2 = do_column("B", "2", d)

        if not st.button("Execute and Compare"):
            return

        out1 = Path(d, "output1.pdf")
        out2 = Path(d, "output2.pdf")

        with st.spinner("Executing..."):
            Path(d, "input.pdf").write_bytes(pdf_bytes)
            # Run both variants even if the first fails so that each side
            # gets its own diagnostic, then stop before opening the outputs.
            ok_a = _run_variant("A", args1, env1, out1)
            ok_b = _run_variant("B", args2, env2, out2)
            if not (ok_a and ok_b):
                return

            for column, label, env in zip(st.columns(2), "AB", (env1, env2)):
                with column:
                    st.text(
                        f"Ghostscript version {label}: "
                        + check_output(
                            ["gs", "--version"],
                            env=env,
                            text=True,
                        )
                    )

            # Context managers close both documents before the temporary
            # directory is removed.
            with pymupdf.open(str(out1)) as doc1, pymupdf.open(str(out2)) as doc2:
                if len(doc1) != len(doc2):
                    # zip() below stops at the shorter document.
                    st.warning(
                        f"Page counts differ (A: {len(doc1)}, B: {len(doc2)}); "
                        "only the pages present in both outputs are compared."
                    )
                for number, (page1, page2) in enumerate(zip(doc1, doc2), start=1):
                    st.write(f"Page {number}")
                    col1, col2 = st.columns(2)
                    with col1, st.container(border=True):
                        st.write(page1.get_text())
                    with col2, st.container(border=True):
                        st.write(page2.get_text())

            col1, col2 = st.columns(2)
            with col1, st.expander("PDF Viewer"):
                pdf_viewer(out1)
            with col2, st.expander("PDF Viewer"):
                pdf_viewer(out2)
# Script entry point: build the page only when this file is executed directly.
if __name__ == "__main__":
    main()