File size: 4,572 Bytes
3acddcc
01fcf24
 
3acddcc
 
 
01fcf24
e76f3e1
 
 
 
 
f2ba171
3acddcc
a5faf3c
f2ba171
01fcf24
 
836cff2
3acddcc
 
01fcf24
836cff2
01fcf24
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
3acddcc
 
01fcf24
3acddcc
01fcf24
 
 
 
 
 
3acddcc
01fcf24
 
 
 
 
3acddcc
01fcf24
3acddcc
 
01fcf24
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
from fastapi import FastAPI, UploadFile, File, Form
from sentence_transformers import SentenceTransformer
import pdfplumber
import uuid
import chromadb
from chromadb.config import Settings
import httpx
import os

# Writable cache directory for Hugging Face / sentence-transformers downloads.
# The environment variable is kept for any library that reads it lazily, but
# this assignment executes after `sentence_transformers` has already been
# imported above, so the directory is also handed to the model loader
# explicitly via `cache_folder` instead of relying on the variable alone.
MODEL_CACHE_DIR = "/app/cache"
os.environ["TRANSFORMERS_CACHE"] = MODEL_CACHE_DIR
os.makedirs(MODEL_CACHE_DIR, exist_ok=True)

# Initialize FastAPI
app = FastAPI()

# Load SentenceTransformer model for document embeddings
model = SentenceTransformer(
    "sentence-transformers/all-MiniLM-L6-v2",
    cache_folder=MODEL_CACHE_DIR,
)

# Initialize ChromaDB (persistent store under ./chroma_storage)
chroma_client = chromadb.Client(
    Settings(chroma_db_impl="duckdb+parquet", persist_directory="./chroma_storage")
)
collection = chroma_client.get_or_create_collection(name="documents")

# RedMindGPT API details
# NOTE(review): the endpoint is plain HTTP, so the bearer token below travels
# unencrypted — confirm whether an https:// endpoint is available.
REDMIND_API_URL = "http://redmindgpt.redmindtechnologies.com/v1"
# SECURITY: the key was hard-coded in source. The REDMIND_API_KEY environment
# variable now takes precedence; the literal remains only as a fallback so
# existing deployments keep working. Rotate the key and drop the fallback.
REDMIND_API_KEY = os.environ.get("REDMIND_API_KEY", "dataset-feqz5KrqHkFRdWbh2DInt58L")

# Function to process PDF and store each page
def process_pdf_and_store(file_bytes: bytes, filename: str) -> int:
    """Embed every text-bearing page of a PDF and add it to the collection.

    Each page with extractable text becomes one collection entry holding the
    page text, its normalized embedding, a random UUID id, and metadata with
    the source ``filename`` and 1-based ``page`` number.

    Args:
        file_bytes: Raw PDF content. A bytes-like value is wrapped in an
            in-memory stream; anything else is passed to pdfplumber as-is.
        filename: Name recorded in each stored page's metadata.

    Returns:
        The number of pages that were stored.

    Raises:
        Whatever pdfplumber, the embedding model, or the collection raise;
        nothing is caught here.
    """
    # Local import so this block does not depend on the file's import header.
    import io

    # pdfplumber.open() takes a path or a file-like object; raw bytes have no
    # read()/seek(), so they must be wrapped in a BytesIO stream first.
    if isinstance(file_bytes, (bytes, bytearray, memoryview)):
        source = io.BytesIO(file_bytes)
    else:
        source = file_bytes

    stored_pages = 0
    with pdfplumber.open(source) as pdf:
        for page_number, page in enumerate(pdf.pages, start=1):
            text = page.extract_text()
            # Skip pages with no text or only whitespace: nothing to embed.
            if not text or not text.strip():
                continue
            embedding = model.encode(text, normalize_embeddings=True).tolist()
            collection.add(
                documents=[text],
                embeddings=[embedding],
                ids=[str(uuid.uuid4())],
                metadatas=[{
                    "filename": filename,
                    "page": page_number
                }]
            )
            stored_pages += 1
    return stored_pages

# Home route
@app.get("/")
def root():
    """Return a fixed status message for requests to the service root."""
    status = {"message": "Semantic Document Retrieval API with RedMindGPT is running!"}
    return status

# Upload PDF and store embeddings
@app.post("/upload-pdf/")
async def upload_pdf(file: UploadFile = File(...)):
    """Accept a PDF upload and store per-page embeddings for it.

    Args:
        file: The uploaded file; its name must end in ``.pdf`` (any case).

    Returns:
        ``{"message": ...}`` on success, or ``{"error": ...}`` when the file
        is not a PDF or processing fails.
    """
    # The upload may arrive without a filename; treat that as "not a PDF"
    # rather than crashing on None. The comparison ignores case so that
    # "REPORT.PDF" is accepted as well.
    filename = file.filename or ""
    if not filename.lower().endswith(".pdf"):
        return {"error": "Only PDF files are supported."}

    contents = await file.read()
    try:
        # NOTE: this is a synchronous call inside an async handler, so the
        # event loop is blocked while the PDF is parsed and embedded.
        process_pdf_and_store(file_bytes=contents, filename=filename)
    except Exception as e:
        # Endpoint boundary: report the failure in the response body.
        return {"error": f"Failed to process PDF: {str(e)}"}
    return {"message": f"Successfully processed and stored '{filename}'"}

# Search top K results
@app.post("/search/")
async def search_text(query: str = Form(...), top_k: int = 3):
    """Return the stored pages closest to ``query``.

    Args:
        query: Free-text search string (form field).
        top_k: Maximum number of results to return; must be at least 1.

    Returns:
        ``{"query": ..., "results": [...]}`` where each result carries the
        source ``filename``, ``page``, a ``snippet`` of at most 200 characters
        (suffixed with "..." when truncated), and a ``score``. The score is
        the value from the query result's ``distances`` field. On failure
        ``{"error": ...}`` is returned instead.
    """
    if top_k < 1:
        return {"error": "Search failed: top_k must be a positive integer."}

    try:
        # Never ask for more results than there are stored entries; some
        # Chroma releases reject such queries instead of returning fewer
        # hits (see Chroma query docs). An empty collection yields no results.
        n_results = min(top_k, collection.count())
        if n_results == 0:
            return {"query": query, "results": []}

        embedding = model.encode(query, normalize_embeddings=True).tolist()
        results = collection.query(query_embeddings=[embedding], n_results=n_results)

        hits = zip(
            results["documents"][0],
            results["metadatas"][0],
            results["distances"][0]
        )
        return {
            "query": query,
            "results": [
                {
                    "filename": metadata["filename"],
                    "page": metadata["page"],
                    "snippet": (doc[:200] + "...") if len(doc) > 200 else doc,
                    "score": score
                }
                for doc, metadata, score in hits
            ]
        }
    except Exception as e:
        # Endpoint boundary: report the failure in the response body.
        return {"error": f"Search failed: {str(e)}"}

# Search + send top result to RedMind API
@app.post("/search-and-query/")
async def search_and_query_redmind(question: str = Form(...)):
    """Find the best-matching stored page and forward it with the question.

    The single closest page is retrieved from the collection and posted to
    ``REDMIND_API_URL`` as ``"Context: <page>\\n\\nQuestion: <question>"``.

    Args:
        question: The user's question (form field).

    Returns:
        ``{"question", "top_document_snippet", "redmind_response"}`` on
        success; ``{"error": ...}`` when nothing is stored/matched or when
        retrieval or the remote call fails.
    """
    try:
        # Check for an empty collection up front: depending on the Chroma
        # release, querying it may raise rather than return an empty list,
        # which would hide the "no document" message behind a generic error.
        if collection.count() == 0:
            return {"error": "No relevant document found."}

        # Embed the question and fetch the single closest page
        embedding = model.encode(question, normalize_embeddings=True).tolist()
        results = collection.query(query_embeddings=[embedding], n_results=1)

        documents = results["documents"][0]
        if not documents:
            return {"error": "No relevant document found."}

        top_doc = documents[0]

        # Send top doc + question to RedMind
        headers = {
            "Authorization": f"Bearer {REDMIND_API_KEY}",
            "Content-Type": "application/json"
        }

        payload = {
            "input": f"Context: {top_doc}\n\nQuestion: {question}"
        }

        async with httpx.AsyncClient() as client:
            response = await client.post(REDMIND_API_URL, headers=headers, json=payload)
            response.raise_for_status()
            answer = response.json()

        # Only mark the snippet as truncated when it actually was, matching
        # the snippet format used by the /search/ endpoint.
        snippet = (top_doc[:200] + "...") if len(top_doc) > 200 else top_doc

        return {
            "question": question,
            "top_document_snippet": snippet,
            "redmind_response": answer
        }

    except Exception as e:
        # Endpoint boundary: report the failure in the response body.
        return {"error": f"RedMind integration failed: {str(e)}"}

# List all stored documents (for dev use)
@app.get("/list-docs/")
def list_documents():
    """Return whatever ``collection.peek()`` yields, or an error payload.

    NOTE(review): ``peek()`` is called with no arguments, so this presumably
    returns a sample of the collection rather than every entry — confirm
    against the Chroma documentation.
    """
    try:
        preview = collection.peek()
    except Exception as exc:
        return {"error": f"Failed to list documents: {str(exc)}"}
    return preview