davideuler committed on
Commit
91e1678
·
1 Parent(s): 016ba29

translator cli

Browse files
Files changed (1) hide show
  1. translator_cli.py +102 -0
translator_cli.py ADDED
@@ -0,0 +1,102 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
import argparse
import html
import os
import sys

import pymupdf
from deep_translator import (
    GoogleTranslator,
    ChatGptTranslator,
)
7
+
8
# Registry of translation backends: maps the translator name to the
# deep_translator class that gets instantiated for it.
TRANSLATORS = dict(
    google=GoogleTranslator,
    chatgpt=ChatGptTranslator,
)
13
+
14
def translate_pdf(input_file: str, source_lang: str, target_lang: str, layer: str = "Korean", translator_name: str = "google") -> None:
    """Translate the text of a PDF and save the result as a new PDF.

    For every text block on every page, the block's text is sent to the
    selected translator, the block's rectangle is covered with a white
    fill, and the translation is written into that same rectangle. Both
    the fill and the translated text are attached to a single Optional
    Content Group (OCG) layer that is switched on by default.

    The output is written to ``<input path without extension>-<target_lang>.pdf``
    and its path is printed when done.

    Args:
        input_file: Path to input PDF file
        source_lang: Source language code (e.g. 'en', 'fr')
        target_lang: Target language code (e.g. 'ko', 'ja')
        layer: Name of the OCG layer (default: "Korean")
        translator_name: Name of the translator to use (default: "google")

    Raises:
        ValueError: If ``translator_name`` is not a key of ``TRANSLATORS``.
    """
    # Validate the translator choice before any file is opened.
    if translator_name not in TRANSLATORS:
        raise ValueError(f"Unsupported translator: {translator_name}. Available translators: {', '.join(TRANSLATORS.keys())}")

    translator = TRANSLATORS[translator_name](source=source_lang, target=target_lang)

    # Fill color used to hide the source text.
    white = pymupdf.pdfcolor["white"]

    # This flag ensures that text will be dehyphenated after extraction.
    textflags = pymupdf.TEXT_DEHYPHENATE

    # splitext only strips an extension from the last path component, so a
    # dot inside a directory name (e.g. "docs.v1/report") is left untouched.
    output_file = os.path.splitext(input_file)[0] + f'-{target_lang}.pdf'

    # The context manager closes the document even if a translation fails.
    with pymupdf.open(input_file) as doc:
        # Define an Optional Content layer in the document.
        # Activate it by default.
        ocg_xref = doc.add_ocg(layer, on=True)

        for page in doc:
            # Extract text grouped like lines in a paragraph.
            for block in page.get_text("blocks", flags=textflags):
                bbox = block[:4]  # rectangle containing the text
                text = block[4]  # the text of this block

                # Nothing to translate in a whitespace-only block.
                if not text.strip():
                    continue

                translated = translator.translate(text)

                # If the translator returned nothing, keep the original
                # text visible instead of blanking the block.
                if not translated:
                    continue

                # Cover the source text with a white rectangle.
                page.draw_rect(bbox, color=None, fill=white, oc=ocg_xref)

                # insert_htmlbox interprets its text as HTML; escape the
                # plain-text translation so '<' and '&' render literally.
                page.insert_htmlbox(
                    bbox,
                    html.escape(translated, quote=False),
                    css="* {font-family: sans-serif;}",
                    oc=ocg_xref,
                )

        doc.subset_fonts()
        doc.ez_save(output_file)

    print(f"Translated PDF saved as: {output_file}")
74
+
75
def main():
    """Command-line entry point: parse arguments and translate one PDF.

    Can be invoked like this:
        python translator_cli.py --source english --target zh-CN "/path/to/input.pdf"

    Any exception raised by ``translate_pdf`` is reported on stderr as
    ``Error: <message>`` and the process exits with status 1.
    """
    parser = argparse.ArgumentParser(description='Translate PDF documents.')
    parser.add_argument('input_file', help='Input PDF file path')
    parser.add_argument('--source', '-s', default='en',
                        help='Source language code (default: en)')
    parser.add_argument('--target', '-t', default='ko',
                        help='Target language code (default: ko)')
    parser.add_argument('--layer', '-l', default='Korean',
                        help='Name of the OCG layer (default: Korean)')
    parser.add_argument('--translator', '-tr', default='google',
                        choices=list(TRANSLATORS),
                        help='Translator to use (default: google)')

    args = parser.parse_args()

    try:
        translate_pdf(args.input_file, args.source, args.target, args.layer, args.translator)
    except Exception as e:
        # Top-level boundary: report the failure on stderr and signal it
        # through the exit status. sys.exit is used because the bare
        # exit() helper is only injected by the site module.
        print(f"Error: {e}", file=sys.stderr)
        sys.exit(1)


if __name__ == "__main__":
    main()