Spaces:
Runtime error
Runtime error
File size: 4,572 Bytes
3acddcc 01fcf24 3acddcc 01fcf24 e76f3e1 f2ba171 3acddcc a5faf3c f2ba171 01fcf24 836cff2 3acddcc 01fcf24 836cff2 01fcf24 3acddcc 01fcf24 3acddcc 01fcf24 3acddcc 01fcf24 3acddcc 01fcf24 3acddcc 01fcf24 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 |
from fastapi import FastAPI, UploadFile, File, Form
from sentence_transformers import SentenceTransformer
import pdfplumber
import uuid
import chromadb
from chromadb.config import Settings
import httpx
import os
# Hugging Face model cache: use a directory the app is allowed to write to.
# NOTE(review): TRANSFORMERS_CACHE is assigned here, after
# `sentence_transformers` has already been imported at the top of the file, so
# libraries that read the variable at import time may not see it. The cache
# directory is therefore also passed explicitly to the model below.
HF_CACHE_DIR = "/app/cache"
os.environ["TRANSFORMERS_CACHE"] = HF_CACHE_DIR
os.makedirs(HF_CACHE_DIR, exist_ok=True)

# Initialize FastAPI
app = FastAPI()

# Load SentenceTransformer model for document embeddings
model = SentenceTransformer(
    "sentence-transformers/all-MiniLM-L6-v2",
    cache_folder=HF_CACHE_DIR,
)

# Initialize ChromaDB
# NOTE(review): `chroma_db_impl="duckdb+parquet"` is the legacy Settings-based
# configuration — confirm it is still accepted by the chromadb version pinned
# for this deployment.
chroma_client = chromadb.Client(Settings(chroma_db_impl="duckdb+parquet", persist_directory="./chroma_storage"))
collection = chroma_client.get_or_create_collection(name="documents")

# RedMindGPT API details
# Both values can be overridden through environment variables of the same
# name; the literals are kept only as backward-compatible defaults.
# NOTE(review): a credential committed to source should be rotated and then
# supplied exclusively via the environment; the default URL is plain http, so
# the bearer token is sent unencrypted — confirm whether https is available.
REDMIND_API_URL = os.environ.get(
    "REDMIND_API_URL",
    "http://redmindgpt.redmindtechnologies.com/v1",
)
REDMIND_API_KEY = os.environ.get(
    "REDMIND_API_KEY",
    "dataset-feqz5KrqHkFRdWbh2DInt58L",
)
# Function to process PDF and store each page
def process_pdf_and_store(file_bytes: bytes, filename: str) -> None:
    """Embed every page of a PDF that has text and add it to the collection.

    Each page with extractable text is encoded with the module-level
    ``model`` and stored in the module-level ``collection`` under a fresh
    UUID, together with the source filename and 1-based page number.
    Pages for which ``extract_text()`` returns nothing are skipped.

    Args:
        file_bytes: Raw PDF content. A path or an already-open binary
            file-like object is passed through to pdfplumber unchanged.
        filename: Name recorded in each stored page's metadata.

    Raises:
        Exception: Whatever pdfplumber, the embedding model or the
            collection raise; nothing is caught here.
    """
    import io  # local import: keeps this block self-contained

    # pdfplumber.open expects a path or a file-like object, not raw bytes,
    # so in-memory content has to be wrapped in a seekable buffer first.
    if isinstance(file_bytes, (bytes, bytearray, memoryview)):
        source = io.BytesIO(file_bytes)
    else:
        source = file_bytes

    with pdfplumber.open(source) as pdf:
        for page_number, page in enumerate(pdf.pages, start=1):
            text = page.extract_text()
            if not text:
                # No extractable text on this page; nothing to embed.
                continue
            embedding = model.encode(text, normalize_embeddings=True).tolist()
            uid = str(uuid.uuid4())
            collection.add(
                documents=[text],
                embeddings=[embedding],
                ids=[uid],
                metadatas=[{
                    "filename": filename,
                    "page": page_number,
                }],
            )
# Home route
@app.get("/")
def root():
    """Return a static message reporting that the API is running."""
    status = {
        "message": "Semantic Document Retrieval API with RedMindGPT is running!",
    }
    return status
# Upload PDF and store embeddings
@app.post("/upload-pdf/")
async def upload_pdf(file: UploadFile = File(...)):
    """Accept a PDF upload and store an embedding for each of its pages.

    Args:
        file: The uploaded file; its name must end in ``.pdf`` (any case).

    Returns:
        ``{"message": ...}`` on success, or ``{"error": ...}`` when the file
        is not a PDF or processing fails.
    """
    # An upload may arrive without a filename; treat that as "not a PDF"
    # instead of failing with AttributeError on None.
    filename = file.filename or ""
    if not filename.lower().endswith(".pdf"):
        return {"error": "Only PDF files are supported."}

    contents = await file.read()
    try:
        process_pdf_and_store(file_bytes=contents, filename=filename)
    except Exception as e:
        # Route boundary: report the failure to the client in the response.
        return {"error": f"Failed to process PDF: {str(e)}"}
    return {"message": f"Successfully processed and stored '{filename}'"}
# Search top K results
@app.post("/search/")
async def search_text(query: str = Form(...), top_k: int = 3):
    """Embed the query and return the closest stored pages.

    Args:
        query: Free-text search string, sent as a form field.
        top_k: Number of results requested from the collection.

    Returns:
        ``{"query": ..., "results": [...]}`` where each result carries the
        source filename, page number, a snippet of at most 200 characters
        (with ``...`` appended when truncated) and the value the collection
        reports under ``distances`` as ``score``; or ``{"error": ...}`` if
        anything raises.
    """
    try:
        query_vector = model.encode(query, normalize_embeddings=True).tolist()
        matches = collection.query(query_embeddings=[query_vector], n_results=top_k)

        # The query was issued with a single embedding, so only the first
        # entry of each result list is read.
        rows = zip(
            matches["documents"][0],
            matches["metadatas"][0],
            matches["distances"][0],
        )

        hits = []
        for text, meta, distance in rows:
            entry = {"filename": meta["filename"], "page": meta["page"]}
            if len(text) > 200:
                entry["snippet"] = text[:200] + "..."
            else:
                entry["snippet"] = text
            entry["score"] = distance
            hits.append(entry)

        return {"query": query, "results": hits}
    except Exception as err:
        return {"error": f"Search failed: {str(err)}"}
# Search + send top result to RedMind API
@app.post("/search-and-query/")
async def search_and_query_redmind(question: str = Form(...)):
    """Find the closest stored page and send it with the question to RedMind.

    Args:
        question: The user's question, sent as a form field.

    Returns:
        ``{"question", "top_document_snippet", "redmind_response"}`` on
        success; ``{"error": "No relevant document found."}`` when the
        collection returns no document; ``{"error": ...}`` for any other
        failure (embedding, lookup, HTTP error status, or JSON decoding).
    """
    try:
        # Embed the question and fetch the single nearest stored page.
        embedding = model.encode(question, normalize_embeddings=True).tolist()
        results = collection.query(query_embeddings=[embedding], n_results=1)

        documents = results["documents"]
        # Guard the outer list as well as the inner one, so an empty result
        # is reported as "no document" rather than as an IndexError.
        if not documents or not documents[0]:
            return {"error": "No relevant document found."}
        top_doc = documents[0][0]

        # Send top doc + question to RedMind
        headers = {
            "Authorization": f"Bearer {REDMIND_API_KEY}",
            "Content-Type": "application/json",
        }
        payload = {
            "input": f"Context: {top_doc}\n\nQuestion: {question}",
        }
        async with httpx.AsyncClient() as client:
            response = await client.post(REDMIND_API_URL, headers=headers, json=payload)
            response.raise_for_status()
            answer = response.json()

        # Only add the ellipsis when text was actually cut off, matching the
        # snippet format returned by the /search/ route.
        snippet = top_doc[:200] + "..." if len(top_doc) > 200 else top_doc
        return {
            "question": question,
            "top_document_snippet": snippet,
            "redmind_response": answer,
        }
    except Exception as e:
        return {"error": f"RedMind integration failed: {str(e)}"}
# List all stored documents (for dev use)
@app.get("/list-docs/")
def list_documents():
    """Return whatever ``collection.peek()`` yields, or an error payload.

    Intended for development use; any exception is reported as
    ``{"error": ...}`` instead of propagating.
    """
    try:
        preview = collection.peek()
    except Exception as err:
        return {"error": f"Failed to list documents: {str(err)}"}
    return preview
|