import gradio as gr
import torch
from transformers import AutoProcessor, Qwen2VLForConditionalGeneration
from PIL import Image
import requests
import pandas as pd
import numpy as np
import uuid
import os
import tempfile

# ──────────────────────────────────────────────────────────────
# 1. Load Qwen2-VL OCR Model & Processor (once at startup)
# ──────────────────────────────────────────────────────────────
MODEL_ID = "prithivMLmods/Qwen2-VL-OCR-2B-Instruct"

# Choose device: GPU if available, otherwise CPU
DEVICE = "cuda" if torch.cuda.is_available() else "cpu"

processor = AutoProcessor.from_pretrained(MODEL_ID, trust_remote_code=True)
model = Qwen2VLForConditionalGeneration.from_pretrained(
    MODEL_ID,
    trust_remote_code=True,
    torch_dtype=torch.float16 if DEVICE == "cuda" else torch.float32,
).to(DEVICE).eval()

# ──────────────────────────────────────────────────────────────
# 2. OCR Helper: Extract text from a single PIL image
# ──────────────────────────────────────────────────────────────
@torch.no_grad()
def run_qwen_ocr(pil_image: Image.Image) -> str:
    """
    Use Qwen2-VL to OCR the given PIL image.
    Returns a single string of the extracted text.
    """
    # Build "chat" content: first a text prompt, then the image
    user_message = [
        {"type": "text", "text": "OCR the text in the image."},
        {"type": "image", "image": pil_image},
    ]
    messages = [{"role": "user", "content": user_message}]

    # Create the full prompt
    prompt_full = processor.apply_chat_template(
        messages, tokenize=False, add_generation_prompt=True
    )
    inputs = processor(
        text=[prompt_full],
        images=[pil_image],
        return_tensors="pt",
        padding=True,
    ).to(DEVICE)

    # Generate, then decode only the newly generated tokens; decoding the
    # full sequence would echo the prompt back, and skip_special_tokens
    # already strips markup such as "<|im_end|>".
    outputs = model.generate(**inputs, max_new_tokens=1024)
    generated = outputs[:, inputs["input_ids"].shape[1]:]
    return processor.batch_decode(generated, skip_special_tokens=True)[0].strip()

# ──────────────────────────────────────────────────────────────
# 3. OpenLibrary Lookup Helper
# ──────────────────────────────────────────────────────────────
def query_openlibrary(title_text: str, author_text: str | None = None) -> dict | None:
    """
    Query OpenLibrary's search.json endpoint by title (and optional author).
    Returns a dict with keys: title, author_name, publisher, first_publish_year.
    If there are no results (or the request fails), returns None.
    """
    base_url = "https://openlibrary.org/search.json"
    params = {"title": title_text}
    if author_text:
        params["author"] = author_text
    try:
        resp = requests.get(base_url, params=params, timeout=5)
        resp.raise_for_status()
        data = resp.json()
        if data.get("docs"):
            doc = data["docs"][0]
            return {
                "title": doc.get("title", ""),
                "author_name": ", ".join(doc.get("author_name", [])),
                "publisher": ", ".join(doc.get("publisher", [])),
                "first_publish_year": doc.get("first_publish_year", ""),
            }
    except Exception as e:
        print(f"OpenLibrary query failed: {e}")
    return None
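# A minimal sanity-check sketch for the lookup helper above, assuming network
# access to openlibrary.org. It is not wired into the Gradio app; call it from
# a REPL. The title/author pair is an arbitrary example, and the printed fields
# depend on whatever OpenLibrary's live index returns as the first hit.
def _demo_openlibrary_lookup() -> None:
    meta = query_openlibrary("Dune", "Frank Herbert")
    if meta is None:
        print("No OpenLibrary match.")
    else:
        print(f"{meta['title']} by {meta['author_name']} "
              f"(first published {meta['first_publish_year']})")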
# ──────────────────────────────────────────────────────────────
# 4. Main Processing: OCR → Parse → OpenLibrary → CSV/DF
# ──────────────────────────────────────────────────────────────
def process_image_list(images: list[Image.Image]):
    """
    Takes a list of PIL images (each ideally a single book cover).
    Runs OCR on each via Qwen2-VL, parses the first two nonempty lines as
    title/author, looks up metadata once per image, and returns:
      - A pandas DataFrame of all results
      - A filepath to a CSV (written to the system temp directory)
    """
    records = []
    for pil_img in images:
        # 1) OCR
        try:
            ocr_text = run_qwen_ocr(pil_img)
        except Exception as e:
            # If the model fails, skip this image
            print(f"OCR failed on one image: {e}")
            continue

        # 2) Parse lines: first nonempty → title, second → author if present
        lines = [line.strip() for line in ocr_text.splitlines() if line.strip()]
        if not lines:
            # No text extracted; skip
            continue
        title_guess = lines[0]
        author_guess = lines[1] if len(lines) > 1 else None

        # 3) Query OpenLibrary
        meta = query_openlibrary(title_guess, author_guess)
        if meta:
            records.append(meta)
        else:
            # Fallback: record OCR guesses if no OpenLibrary match
            records.append({
                "title": title_guess,
                "author_name": author_guess or "",
                "publisher": "",
                "first_publish_year": "",
            })

    # 4) Build DataFrame (even if empty)
    df = pd.DataFrame(
        records, columns=["title", "author_name", "publisher", "first_publish_year"]
    )

    # 5) Write CSV to a uniquely named temporary file
    unique_name = f"books_{uuid.uuid4().hex}.csv"
    temp_path = os.path.join(tempfile.gettempdir(), unique_name)
    df.to_csv(temp_path, index=False)

    return df, temp_path

# ──────────────────────────────────────────────────────────────
# 5. Gradio Interface
# ──────────────────────────────────────────────────────────────
def build_interface():
    with gr.Blocks(title="Book Cover Scanner (Qwen2-VL OCR)") as demo:
        gr.Markdown(
            """
# 📚 Book Cover Scanner + Metadata Lookup

1. Upload **one or more** images, each containing a single book cover.
2. The app will OCR each cover (via Qwen2-VL), take:
   - the **first nonempty line** as a “title” guess, and
   - the **second nonempty line** (if present) as an “author” guess, then
   - query OpenLibrary once per image for metadata.
3. A table appears below with Title, Author(s), Publisher, Year.
4. Click “Download CSV” to export all results.

**Tips:**
- Use clear, high-contrast photos (text should be legible).
- For best results, crop each cover to the image frame (no extra background).
- If Qwen2-VL fails on any image, that image is skipped in the table.
"""
        )
        with gr.Row():
            # Gradio 4.x: sizing is a constructor kwarg; the old .style() API is gone
            img_in = gr.Gallery(label="Upload Book Cover(s)", elem_id="input_gallery", height="auto")
            run_button = gr.Button("OCR & Lookup")

        output_table = gr.Dataframe(
            headers=["title", "author_name", "publisher", "first_publish_year"],
            label="Detected Books + Metadata",
        )
        download_file = gr.File(label="Download CSV")

        def on_run(image_list):
            # A Gallery value is a list whose items are either raw images or
            # (image, caption) tuples, and each image may arrive as a numpy
            # array or a filepath depending on the Gradio version. Normalize
            # everything to PIL before processing.
            pil_images = []
            for item in image_list or []:
                img = item[0] if isinstance(item, (tuple, list)) else item
                if isinstance(img, np.ndarray):
                    pil_images.append(Image.fromarray(img))
                elif isinstance(img, Image.Image):
                    pil_images.append(img)
                elif isinstance(img, str) and os.path.exists(img):
                    pil_images.append(Image.open(img).convert("RGB"))
            df, csv_path = process_image_list(pil_images)
            return df, csv_path

        run_button.click(
            fn=on_run,
            inputs=[img_in],
            outputs=[output_table, download_file],
        )
    return demo

if __name__ == "__main__":
    build_interface().launch()
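# ──────────────────────────────────────────────────────────────
# Usage sketch (assumptions: this file is saved as app.py, and the
# packages below cover its imports; exact versions are not pinned):
#
#   pip install gradio torch transformers pillow requests pandas numpy
#   python app.py
#
# Gradio serves on http://127.0.0.1:7860 by default; use
# build_interface().launch(share=True) for a temporary public link.
# ──────────────────────────────────────────────────────────────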