Spaces:

vickyvigneshmass
/

test

Runtime error

App Files Files Community

test / app.py

vickyvigneshmass

Update app.py

e76f3e1 verified 5 months ago

raw

history blame contribute delete

4.57 kB

	from fastapi import FastAPI, UploadFile, File, Form
	from sentence_transformers import SentenceTransformer
	import pdfplumber
	import uuid
	import chromadb
	from chromadb.config import Settings
	import httpx
	import os

	# Writable cache location for Hugging Face model downloads.
	CACHE_DIR = "/app/cache"
	os.makedirs(CACHE_DIR, exist_ok=True)
	# Kept for backward compatibility. NOTE: the Hugging Face libraries are
	# imported before this line runs and may already have resolved their cache
	# path, so the assignment alone can come too late; the model below is
	# therefore also handed the directory explicitly via `cache_folder`.
	os.environ["TRANSFORMERS_CACHE"] = CACHE_DIR

	# Initialize FastAPI
	app = FastAPI()

	# Load SentenceTransformer model for document embeddings
	model = SentenceTransformer(
	    "sentence-transformers/all-MiniLM-L6-v2",
	    cache_folder=CACHE_DIR,
	)

	# Initialize ChromaDB with on-disk persistence.
	# Newer chromadb releases expose `PersistentClient` and no longer accept the
	# legacy `chroma_db_impl="duckdb+parquet"` setting; older releases only know
	# the legacy form. Pick whichever the installed version supports.
	CHROMA_DIR = "./chroma_storage"
	if hasattr(chromadb, "PersistentClient"):
	    chroma_client = chromadb.PersistentClient(path=CHROMA_DIR)
	else:
	    chroma_client = chromadb.Client(
	        Settings(chroma_db_impl="duckdb+parquet", persist_directory=CHROMA_DIR)
	    )
	collection = chroma_client.get_or_create_collection(name="documents")

	# RedMindGPT API details.
	# Both values can be overridden through environment variables of the same
	# name; the literals are only fallbacks that preserve the previous behaviour.
	# SECURITY: the fallback key is committed in source and the URL is plain
	# http, so the bearer token travels unencrypted. Rotate the key and supply
	# it (and an https URL, if the service offers one) via the environment.
	REDMIND_API_URL = os.environ.get(
	    "REDMIND_API_URL", "http://redmindgpt.redmindtechnologies.com/v1"
	)
	REDMIND_API_KEY = os.environ.get(
	    "REDMIND_API_KEY", "dataset-feqz5KrqHkFRdWbh2DInt58L"
	)

	# Extract the text of every PDF page and index it in the ChromaDB collection
	def process_pdf_and_store(file_bytes: bytes, filename: str) -> None:
	    """Embed each text-bearing page of a PDF and store it in `collection`.

	    Args:
	        file_bytes: Raw content of the PDF file.
	        filename: Name recorded in each stored page's metadata.

	    Pages for which `extract_text()` returns nothing are skipped. All
	    remaining pages are embedded and inserted in a single batch, so a failure
	    while reading the PDF leaves the collection untouched instead of
	    partially populated. If no page yields text, nothing is stored.

	    Raises:
	        Whatever pdfplumber, the embedding model, or the collection raise;
	        no exception is handled here.
	    """
	    # Local import keeps this block self-contained; `io` is standard library.
	    import io

	    page_numbers = []
	    page_texts = []
	    # pdfplumber.open() expects a path or a file-like object, not raw bytes,
	    # so the uploaded content is wrapped in an in-memory stream.
	    with pdfplumber.open(io.BytesIO(file_bytes)) as pdf:
	        for page_number, page in enumerate(pdf.pages, start=1):
	            text = page.extract_text()
	            if text:
	                page_numbers.append(page_number)
	                page_texts.append(text)

	    # Avoid calling the model / collection with empty batches.
	    if not page_texts:
	        return

	    embeddings = model.encode(page_texts, normalize_embeddings=True).tolist()
	    collection.add(
	        documents=page_texts,
	        embeddings=embeddings,
	        ids=[str(uuid.uuid4()) for _ in page_texts],
	        metadatas=[
	            {"filename": filename, "page": number} for number in page_numbers
	        ],
	    )

	# Landing endpoint
	@app.get("/")
	def root():
	    """Return a fixed status message for GET "/"."""
	    status = {"message": "Semantic Document Retrieval API with RedMindGPT is running!"}
	    return status

	# Upload a PDF and store per-page embeddings
	@app.post("/upload-pdf/")
	async def upload_pdf(file: UploadFile = File(...)):
	    """Accept a PDF upload and index its pages.

	    Args:
	        file: The uploaded file; its name must end in ".pdf" (any letter case).

	    Returns:
	        {"message": ...} on success, or {"error": ...} when the file is not a
	        PDF by name or when processing raises.
	    """
	    # The filename can be missing on some uploads; treat that as "not a PDF"
	    # instead of failing with an AttributeError outside the try block.
	    filename = file.filename or ""
	    # Compare case-insensitively so "REPORT.PDF" is accepted as well.
	    if not filename.lower().endswith(".pdf"):
	        return {"error": "Only PDF files are supported."}

	    contents = await file.read()
	    try:
	        process_pdf_and_store(file_bytes=contents, filename=filename)
	    except Exception as e:
	        # Top-level boundary: report the failure to the client as before.
	        return {"error": f"Failed to process PDF: {str(e)}"}
	    return {"message": f"Successfully processed and stored '{filename}'"}

	# Semantic search over the stored pages
	@app.post("/search/")
	async def search_text(query: str = Form(...), top_k: int = 3):
	    """Embed `query` and return up to `top_k` matching stored pages.

	    Returns:
	        {"query": ..., "results": [...]} where each result carries the page's
	        "filename", "page", a "snippet" (first 200 characters followed by
	        "..." when the text is longer), and "score", which is the value the
	        collection query reports under "distances". Any exception is
	        reported as {"error": "Search failed: ..."}.
	    """
	    try:
	        query_vector = model.encode(query, normalize_embeddings=True).tolist()
	        hits = collection.query(query_embeddings=[query_vector], n_results=top_k)

	        matches = []
	        rows = zip(
	            hits["documents"][0],
	            hits["metadatas"][0],
	            hits["distances"][0],
	        )
	        for text, meta, distance in rows:
	            entry = {"filename": meta["filename"], "page": meta["page"]}
	            if len(text) > 200:
	                entry["snippet"] = text[:200] + "..."
	            else:
	                entry["snippet"] = text
	            entry["score"] = distance
	            matches.append(entry)

	        return {"query": query, "results": matches}
	    except Exception as e:
	        return {"error": f"Search failed: {str(e)}"}

	# Search + send top result to RedMind API
	@app.post("/search-and-query/")
	async def search_and_query_redmind(question: str = Form(...)):
	    """Find the closest stored page and ask RedMind the question about it.

	    The question is embedded, the single nearest document is fetched from
	    the collection, and both are POSTed to `REDMIND_API_URL` as one "input"
	    string with a bearer token.

	    Returns:
	        {"question", "top_document_snippet", "redmind_response"} on success;
	        {"error": "No relevant document found."} when the query returns no
	        document; {"error": "RedMind integration failed: ..."} on any
	        exception (including non-2xx responses from the API).
	    """
	    try:
	        # Get document embedding
	        embedding = model.encode(question, normalize_embeddings=True).tolist()
	        results = collection.query(query_embeddings=[embedding], n_results=1)

	        # Guard both levels so an empty result never surfaces as a misleading
	        # "list index out of range" integration error.
	        documents = results["documents"]
	        if not documents or not documents[0]:
	            return {"error": "No relevant document found."}

	        top_doc = documents[0][0]

	        # Send top doc + question to RedMind
	        headers = {
	            "Authorization": f"Bearer {REDMIND_API_KEY}",
	            "Content-Type": "application/json",
	        }
	        payload = {
	            "input": f"Context: {top_doc}\n\nQuestion: {question}",
	        }

	        async with httpx.AsyncClient() as client:
	            response = await client.post(REDMIND_API_URL, headers=headers, json=payload)
	            response.raise_for_status()
	            answer = response.json()

	        # Only mark the snippet as truncated when it actually was, matching
	        # the snippet format returned by /search/.
	        snippet = top_doc[:200] + "..." if len(top_doc) > 200 else top_doc

	        return {
	            "question": question,
	            "top_document_snippet": snippet,
	            "redmind_response": answer,
	        }

	    except Exception as e:
	        return {"error": f"RedMind integration failed: {str(e)}"}

	# Development helper: inspect what is stored in the collection
	@app.get("/list-docs/")
	def list_documents():
	    """Return the result of `collection.peek()`, or an error payload.

	    Any exception raised by the peek call is reported as
	    {"error": "Failed to list documents: ..."}.
	    """
	    try:
	        preview = collection.peek()
	    except Exception as e:
	        return {"error": f"Failed to list documents: {str(e)}"}
	    return preview