File size: 7,143 Bytes
ee86994
152508e
 
 
 
 
841fa30
 
3c70d1e
 
3c2969a
 
3c70d1e
 
 
 
 
 
 
152508e
 
 
 
 
 
 
841fa30
 
 
 
 
 
 
152508e
e648c77
152508e
 
 
 
 
 
 
 
 
 
 
 
 
841fa30
 
 
 
 
 
ee86994
841fa30
152508e
 
 
 
b9ef50d
152508e
 
 
 
841fa30
 
 
 
 
 
 
 
 
152508e
 
841fa30
152508e
841fa30
 
 
 
 
 
 
 
 
152508e
e648c77
 
 
 
 
 
152508e
 
 
 
f1ebe2a
 
 
 
 
 
 
 
 
152508e
f1ebe2a
 
152508e
 
 
e648c77
3c70d1e
e648c77
152508e
e648c77
 
 
 
 
 
 
 
 
 
 
 
 
27f5b0b
 
e648c77
 
 
 
 
 
 
152508e
 
 
 
 
 
 
3c70d1e
 
 
 
 
 
 
 
841fa30
 
152508e
841fa30
 
 
 
3c70d1e
841fa30
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
import gradio as gr
import requests
import base64
from PIL import Image
from io import BytesIO

print("=== DEBUG: Starting app.py ===")

# Collect the bundled example images that the UI offers as clickable examples.
import os

# Examples are expected in <HOME>/app/example_images; HOME falls back to
# /home/user when the environment variable is not set.
example_dir = os.path.join(os.environ.get('HOME', '/home/user'), 'app', 'example_images')
example_images = []
# isdir (rather than exists) so a stray regular file at this path is skipped
# instead of making os.listdir raise NotADirectoryError at import time.
if os.path.isdir(example_dir):
    # os.listdir returns entries in arbitrary order; sorting keeps the list
    # (and any slice taken from it later) identical across restarts.
    example_images = [
        os.path.join(example_dir, filename)
        for filename in sorted(os.listdir(example_dir))
        if filename.lower().endswith(('.png', '.jpg', '.jpeg', '.gif', '.bmp'))
    ]
    print(f"Found {len(example_images)} example images")

def encode_image_to_base64(image: Image.Image) -> str:
    """Serialize a PIL image as a base64-encoded JPEG ``data:`` URL.

    JPEG has no alpha channel or palette support, so saving e.g. an RGBA or
    P-mode image directly raises ``OSError``. Images with transparency are
    therefore flattened onto a white background, and any other mode besides
    RGB / L is converted to RGB before encoding.

    Args:
        image: The PIL image to encode. The caller's object is not modified;
            conversions produce new images.

    Returns:
        A string of the form ``data:image/jpeg;base64,<payload>``.
    """
    has_alpha = image.mode in ("RGBA", "LA") or (
        image.mode == "P" and "transparency" in image.info
    )
    if has_alpha:
        # Composite onto white so transparent regions do not end up as
        # whatever colour happens to be stored underneath the alpha channel.
        rgba = image.convert("RGBA")
        flattened = Image.new("RGB", rgba.size, (255, 255, 255))
        flattened.paste(rgba, mask=rgba.getchannel("A"))
        image = flattened
    elif image.mode not in ("RGB", "L"):
        image = image.convert("RGB")

    buffered = BytesIO()
    image.save(buffered, format="JPEG")
    img_str = base64.b64encode(buffered.getvalue()).decode()
    return f"data:image/jpeg;base64,{img_str}"

def _extract_tagged(text, tag):
    """Return the text inside the first ``<tag>...</tag>`` pair of *text*.

    Returns ``None`` when the opening tag is absent. When the closing tag is
    missing (for example because generation stopped early), everything after
    the opening tag is returned.
    """
    _, opening, rest = text.partition(f"<{tag}>")
    if not opening:
        return None
    return rest.partition(f"</{tag}>")[0]


def query_vllm_api(image, temperature, max_tokens=12_000):
    """Send an image to the local vLLM chat-completions endpoint and parse the reply.

    The image is downscaled so its longest side is at most 2048 px, encoded
    as a JPEG data URL, and posted as the single user message. The model's
    reply is then split into its ``<think>`` and ``<answer>`` sections.

    Args:
        image: PIL image to convert, or ``None`` if nothing was uploaded.
        temperature: Sampling temperature forwarded to the API.
        max_tokens: Maximum number of tokens the model may generate.

    Returns:
        A 3-tuple of strings ``(reasoning, answer, answer)``. If the reply
        has no ``<think>`` section, ``reasoning`` is a placeholder message;
        if it has no ``<answer>`` section, ``answer`` is the full reply.
        On failure all three elements carry the error message; when
        ``image`` is ``None`` they carry a "no image" notice instead.
        Exceptions are reported through the return value, not raised.
    """
    print(f"=== DEBUG: query_vllm_api called with image={image is not None}, temp={temperature} ===")

    if image is None:
        return "No image provided", "No image provided", "Please upload an image first."

    try:
        # Downscale oversized images to avoid huge uploads.
        max_size = 2048
        longest_side = max(image.size)
        if longest_side > max_size:
            ratio = max_size / longest_side
            # Clamp to 1 px: for very elongated images the short side would
            # otherwise truncate to 0 and make resize() fail.
            new_size = tuple(max(1, int(dim * ratio)) for dim in image.size)
            image = image.resize(new_size, Image.Resampling.LANCZOS)

        image_b64 = encode_image_to_base64(image)
        messages = [
            {
                "role": "user",
                "content": [
                    {"type": "image_url", "image_url": {"url": image_b64}}
                ],
            }
        ]

        payload = {
            "model": "numind/NuMarkdown-8B-Thinking",
            "messages": messages,
            "max_tokens": max_tokens,
            "temperature": temperature,
        }

        print("=== DEBUG: About to make vLLM API request ===")
        # NOTE(review): 60 s may be tight for a generation of up to
        # max_tokens tokens -- confirm against observed latencies.
        response = requests.post(
            "http://localhost:8000/v1/chat/completions",
            json=payload,
            timeout=60,
        )
        response.raise_for_status()
        data = response.json()

        result = data["choices"][0]["message"]["content"]

        # Parse the two sections independently so that a missing or truncated
        # <answer> does not discard a valid thinking trace, and vice versa.
        reasoning = _extract_tagged(result, "think")
        answer = _extract_tagged(result, "answer")
        if reasoning is None:
            reasoning = "No thinking trace found"
        if answer is None:
            # No answer tags: fall back to showing the full reply.
            answer = result

        return reasoning, answer, answer

    except requests.exceptions.RequestException as e:
        error_msg = f"API request failed: {e}"
        print(f"=== DEBUG: Request error: {error_msg} ===")
        return error_msg, error_msg, error_msg
    except Exception as e:
        # Boundary handler: surface any other failure (bad JSON shape, image
        # errors, ...) in the UI instead of crashing the callback.
        error_msg = f"Unexpected error: {e}"
        print(f"=== DEBUG: Unexpected error: {error_msg} ===")
        return error_msg, error_msg, error_msg

print("=== DEBUG: Creating Gradio interface ===")


# Build the UI: a header banner, an input column (temperature, button, image
# upload) and an output column with three tabs fed by query_vllm_api.
with gr.Blocks(title="NuMarkdown-8B-Thinking", theme=gr.themes.Soft(), css="""
    * {
        font-family: 'Inter', 'Segoe UI', 'Roboto', 'Helvetica Neue', Arial, sans-serif !important;
    }
""") as demo:
    # Static header: title banner, external links and a short model description.
    gr.HTML("""
    <div style="text-align: center; padding: 20px; background: linear-gradient(90deg, #667eea 0%, #764ba2 100%); border-radius: 10px; margin-bottom: 20px;">
        <h1 style="color: white; margin: 0; font-size: 2.5em; font-weight: bold;">👁️ NuMarkdown-8B-Thinking</h1>
        <p style="color: rgba(255,255,255,0.9); margin: 10px 0; font-size: 1.2em;">Upload an image to convert to Markdown!</p>
        <div style="margin-top: 15px;">
            <a href="https://nuextract.ai/" style="color: white; text-decoration: none; margin: 0 10px; font-weight: 500;">🖥️ API / Platform</a>
            <span style="color: rgba(255,255,255,0.7);">|</span>
            <a href="https://discord.gg/3tsEtJNCDe" style="color: white; text-decoration: none; margin: 0 10px; font-weight: 500;">🗣️ Discord</a>
            <span style="color: rgba(255,255,255,0.7);">|</span>
            <a href="https://github.com/numindai/NuMarkdown" style="color: white; text-decoration: none; margin: 0 10px; font-weight: 500;">🔗 GitHub</a>
            <span style="color: rgba(255,255,255,0.7);">|</span>
            <a href="https://huggingface.co/numind/NuMarkdown-8B-Thinking" style="color: white; text-decoration: none; margin: 0 10px; font-weight: 500;">🤗 Model</a>
        </div>
    </div>
    <p>NuMarkdown-8B-Thinking is the first reasoning OCR VLM. It is specifically trained to convert documents into clean Markdown files, well suited for RAG applications. It generates thinking tokens to figure out the layout of the document before generating the Markdown file. It is particularly good at understanding documents with weird layouts and complex tables.</p>
    <p>NOTE: In this space we downsize large images and restrict the maximum output of the model, so performance could improve if you run the model yourself.</p>
    """)

    with gr.Row():
        # Left column: generation controls and the image to convert.
        with gr.Column(scale=2):
            temperature = gr.Slider(0.1, 1.5, value=0.4, step=0.1, label="Temperature")
            btn = gr.Button("Generate Response", variant="primary", size="lg")
            img_in = gr.Image(type="pil", label="Upload Image")
        
        # Right column: the model's outputs.
        with gr.Column(scale=2):
            
            # Model outputs - collapsible accordion, expanded by default
            with gr.Accordion("🔍 Model Outputs", open=True):
                with gr.Tabs():
                    with gr.TabItem("🧠 Thinking Trace"):
                        thinking = gr.Textbox(
                            lines=15, 
                            max_lines=25,
                            show_label=False,
                            placeholder="The model's reasoning process will appear here..."
                        )
                    with gr.TabItem("📝 Rendered Markdown"):
                        output = gr.Markdown(label="📝 Generated Markdown")
                    with gr.TabItem("📄 Raw Markdown"):
                        raw_answer = gr.Textbox(
                            lines=15, 
                            max_lines=25,
                            show_label=False,
                            placeholder="The raw model output will appear here..."
                        )

    # query_vllm_api returns (reasoning, answer, answer); the output order
    # here maps those onto the trace, raw-text and rendered-Markdown widgets.
    btn.click(
        query_vllm_api,
        inputs=[img_in, temperature],
        outputs=[thinking, raw_answer, output],
    )

    # Add examples if we have any
    if example_images:
        gr.Examples(
            examples=example_images[:5],  # Limit to 5 examples
            inputs=img_in,
            label="📸 Try these example images"
        )

print("=== DEBUG: Gradio interface created ===")

if __name__ == "__main__":
    # Script entry point: start the Gradio server for the `demo` app.
    # Listens on every network interface at port 7860 and passes share=True
    # to launch().
    launch_options = {
        "server_name": "0.0.0.0",
        "server_port": 7860,
        "share": True,
    }
    print("=== DEBUG: About to launch Gradio ===")
    demo.launch(**launch_options)
    print("=== DEBUG: Gradio launched ===")