File size: 4,500 Bytes
2e237ce
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
import os
import shutil
import subprocess
import sys
import tempfile
from pathlib import Path

import gradio as gr

SCRIPT_DIR = Path(__file__).resolve().parent

def run_cmd(cmd, cwd=None, env=None):
    """Execute *cmd* as a subprocess, echo its output, and append it to run.log.

    Args:
        cmd: Command and arguments as a list of strings.
        cwd: Working directory for the subprocess (defaults to the current one).
        env: Optional environment mapping forwarded to subprocess.run.

    Returns:
        The CompletedProcess for the finished command.

    Raises:
        subprocess.CalledProcessError: if the command exits non-zero.
    """
    workdir = str(cwd or os.getcwd())
    print(f"🟦 Running: {' '.join(cmd)}  (cwd={workdir})")
    result = subprocess.run(cmd, cwd=workdir, env=env, capture_output=True, text=True)

    for header, stream in (("🟩 STDOUT:", result.stdout), ("πŸŸ₯ STDERR:", result.stderr)):
        if stream:
            print(header)
            print(stream)

    # Persist a transcript to run.log inside the working directory so a failed
    # pipeline run can be inspected after the fact.
    try:
        log_path = Path(workdir) / "run.log"
        with open(log_path, "a", encoding="utf-8") as log:
            log.write(f"$ {' '.join(cmd)}\n")
            for stream in (result.stdout, result.stderr):
                if stream:
                    log.write(stream + "\n")
        print(f"🧾 Run log saved to: {log_path}")
    except Exception as e:
        # Logging is best-effort; never let it mask the command's own outcome.
        print(f"⚠️ Could not write run.log: {e}")

    if result.returncode != 0:
        # Re-raise so the Gradio layer surfaces the failure properly.
        raise subprocess.CalledProcessError(result.returncode, cmd, result.stdout, result.stderr)
    return result

def _locate_pdf_json(temp_dir: str) -> str:
    """Return the path of the JSON file written by the PDF extractor.

    The extractor writes ``<pdf_stem>_comprehensive_data.json`` into the
    working directory.  Known fixed names are checked first, then a glob
    fallback.  Raises FileNotFoundError when nothing matches.
    """
    td = Path(temp_dir)

    known_names = (
        "pdf_data.json",                    # legacy name (if ever created)
        "input_comprehensive_data.json",    # most common from your logs
        "comprehensive_data.json",          # another common alias
        "output.json",                      # generic
    )
    candidates = [td / name for name in known_names]
    for candidate in candidates:
        if candidate.exists():
            print(f"βœ… Using PDF JSON: {candidate}")
            return str(candidate)

    # Fallback: anything matching the extractor's naming pattern.
    matches = list(td.glob("*_comprehensive_data.json"))
    if matches:
        print(f"βœ… Using PDF JSON (glob): {matches[0]}")
        return str(matches[0])

    # Nothing found — report every location that was searched.
    searched = ", ".join(str(p) for p in candidates) + ", " + str(td / "*_comprehensive_data.json")
    raise FileNotFoundError(
        f"PDF JSON not found. Looked for: {searched}\nTemp dir: {temp_dir}"
    )

def process_files(pdf_file, word_file):
    """Run the full PDF-to-DOCX red-text replacement pipeline.

    Args:
        pdf_file: Filesystem path of the uploaded PDF (from gr.File).
        word_file: Filesystem path of the uploaded Word document.

    Returns:
        Path to the updated .docx in which red text has been replaced.

    Raises:
        subprocess.CalledProcessError: if any pipeline step exits non-zero
            (propagated from run_cmd so Gradio surfaces the failure).
    """
    # Create a unique temporary directory for this run
    temp_dir = tempfile.mkdtemp(prefix="hf_redtext_")
    print(f"πŸ“‚ Temp dir: {temp_dir}")

    # Define standard filenames for use in the pipeline
    pdf_path = os.path.join(temp_dir, "input.pdf")
    word_path = os.path.join(temp_dir, "input.docx")
    word_json_path = os.path.join(temp_dir, "word_data.json")
    updated_json_path = os.path.join(temp_dir, "updated_word_data.json")
    final_docx_path = os.path.join(temp_dir, "updated.docx")

    # Copy the uploaded files to the temp directory
    shutil.copy(pdf_file, pdf_path)
    print(f"πŸ“„ PDF copied to: {pdf_path}")
    shutil.copy(word_file, word_path)
    print(f"πŸ“ DOCX copied to: {word_path}")

    # Use the interpreter running this app rather than a bare "python":
    # on hosted runtimes "python" on PATH can resolve to a different
    # installation (or not exist at all).
    python = sys.executable

    # 1) PDF -> JSON  (extractor writes <stem>_comprehensive_data.json into cwd)
    run_cmd([python, str(SCRIPT_DIR / "extract_pdf_data.py"), pdf_path], cwd=temp_dir)

    # Find the JSON produced by the extractor
    pdf_json_path = _locate_pdf_json(temp_dir)

    # 2) DOCX red text -> JSON
    run_cmd([python, str(SCRIPT_DIR / "extract_red_text.py"), word_path, word_json_path], cwd=temp_dir)

    # 3) Merge JSON (uses the resolved pdf_json_path)
    run_cmd([python, str(SCRIPT_DIR / "update_docx_with_pdf.py"), word_json_path, pdf_json_path, updated_json_path], cwd=temp_dir)

    # 4) Apply updates to DOCX
    run_cmd([python, str(SCRIPT_DIR / "updated_word.py"), word_path, updated_json_path, final_docx_path], cwd=temp_dir)

    # Return the final .docx file
    return final_docx_path

# Gradio UI: two file uploads in, one downloadable file out.
pdf_input = gr.File(label="Upload PDF File", type="filepath")
word_input = gr.File(label="Upload Word File", type="filepath")
result_output = gr.File(label="Download Updated Word File")

iface = gr.Interface(
    fn=process_files,
    inputs=[pdf_input, word_input],
    outputs=result_output,
    title="Red Text Replacer",
    description=(
        "Upload a PDF and Word document. Red-colored text in the Word doc "
        "will be replaced by matching content from the PDF."
    ),
)

if __name__ == "__main__":
    iface.launch()