import os
import gradio as gr
from openai import OpenAI
import weaviate
from weaviate.classes.init import Auth
import pypdf  # Replaced PyPDF2
import docx
from langchain.text_splitter import RecursiveCharacterTextSplitter
from dotenv import load_dotenv
from prompt_template import (
    Prompt_template_translation,
    Prompt_template_LLM_Generation,
    Prompt_template_Reranker,
    Prompt_template_Wisal,
    Prompt_template_Halluciations,
    Prompt_template_paraphrasing,
    Prompt_template_Translate_to_original,
    Prompt_template_relevance,
    Prompt_template_User_document_prompt,
)
from query_utils import (
    process_query_for_rewrite,
    get_non_autism_response,
    check_answer_autism_relevance,
    get_non_autism_answer_response,
)

# ─── Configuration ─────────────────────────────────────────────────────────────
# Credentials are read from the environment (via a .env file) instead of being
# hardcoded in source.
load_dotenv()

GEMINI_API_KEY = os.getenv("GEMINI_API_KEY")
TAVILY_API_KEY = os.getenv("TAVILY_API_KEY")
OPENAI_API_KEY = os.getenv("OPENAI_API_KEY")
QDRANT_API_KEY = os.getenv("QDRANT_API_KEY")
QDRANT_URL = os.getenv("QDRANT_URL")
WEAVIATE_URL = os.getenv("WEAVIATE_URL")
WEAVIATE_API_KEY = os.getenv("WEAVIATE_API_KEY")
DEEPINFRA_API_KEY = os.getenv("DEEPINFRA_API_KEY")
DEEPINFRA_BASE_URL = os.getenv("DEEPINFRA_BASE_URL", "https://api.deepinfra.com/v1/openai")
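# A minimal .env sketch for the lookups above (placeholder values; the variable
# names are assumed to match the os.getenv() calls, and only the Weaviate and
# DeepInfra entries are actually used in this file):
#
#   WEAVIATE_URL=your-cluster.weaviate.cloud
#   WEAVIATE_API_KEY=<weaviate-api-key>
#   DEEPINFRA_API_KEY=<deepinfra-api-key>
#   DEEPINFRA_BASE_URL=https://api.deepinfra.com/v1/openai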
# DeepInfra exposes an OpenAI-compatible API, so the standard OpenAI client works.
openai = OpenAI(
    api_key=DEEPINFRA_API_KEY,
    base_url=DEEPINFRA_BASE_URL,
)

# Initialize Weaviate client
client = weaviate.connect_to_weaviate_cloud(
    cluster_url=WEAVIATE_URL,
    auth_credentials=Auth.api_key(WEAVIATE_API_KEY),
    skip_init_checks=True,  # disables the gRPC init check
)

# ─── Utility: Extract raw text ──────────────────────────────────────────────────
def extract_text(file_path: str) -> str:
    ext = os.path.splitext(file_path)[1].lower()
    if ext == ".pdf":
        text = ""
        with open(file_path, "rb") as f:
            reader = pypdf.PdfReader(f)
            for page in reader.pages:
                page_text = page.extract_text() or ""
                text += page_text + "\n"
    elif ext == ".docx":
        doc = docx.Document(file_path)
        text = "\n".join(p.text for p in doc.paragraphs)
    elif ext == ".txt":
        with open(file_path, "r", encoding="utf-8") as f:
            text = f.read()
    else:
        raise ValueError("Unsupported file format. Use PDF, DOCX, or TXT.")
    return text

# ─── Chunker & Embed ──────────────────────────────────────────────────────────
splitter = RecursiveCharacterTextSplitter(
    chunk_size=1000,
    chunk_overlap=200,
    separators=["\n\n", "\n", " "],
)

def embed_texts(texts: list[str], batch_size: int = 70) -> list[list[float]]:
    """Embed texts in batches to avoid API limits."""
    all_embeddings = []
    for i in range(0, len(texts), batch_size):
        batch = texts[i:i + batch_size]
        resp = openai.embeddings.create(
            model="Qwen/Qwen3-Embedding-8B",
            input=batch,
            encoding_format="float",
        )
        all_embeddings.extend(item.embedding for item in resp.data)
    return all_embeddings

# ─── Ingest & Index ───────────────────────────────────────────────────────────
def ingest_file(file_path: str) -> str:
    raw = extract_text(file_path)
    texts = splitter.split_text(raw)
    vectors = embed_texts(texts)
    # Batch insert into the "user" collection with the v4 client API
    with client.batch.dynamic() as batch:
        for txt, vec in zip(texts, vectors):
            batch.add_object(
                collection="user",
                properties={"text": txt},
                vector=vec,
            )
    return f"Ingested {len(texts)} chunks from {os.path.basename(file_path)}"

# ─── Query & Answer ───────────────────────────────────────────────────────────
def answer_question(question: str) -> str:
    # Process the query for rewriting and relevance checking
    corrected_query, is_autism_related, rewritten_query = process_query_for_rewrite(question)

    # If not autism-related, return the direct rejection message
    if not is_autism_related:
        return get_non_autism_response()

    # Use the corrected query for retrieval
    q_vec = embed_texts([corrected_query])[0]
    documents = client.collections.get("user")
    response = documents.query.near_vector(
        near_vector=q_vec,
        limit=5,
        return_metadata=["distance"],
    )
    hits = response.objects
    context = "\n\n".join(hit.properties["text"] for hit in hits)
    print(context)  # debug: show the retrieved context

    user_document_prompt = Prompt_template_User_document_prompt.format(
        new_query=corrected_query, document=context
    )
    chat = openai.chat.completions.create(
        model="Qwen/Qwen3-32B",
        messages=[{"role": "user", "content": user_document_prompt}],
        temperature=0,
        reasoning_effort="none",
    )
    initial_answer = chat.choices[0].message.content

    # Check whether the generated answer is sufficiently related to autism
    answer_relevance_score = check_answer_autism_relevance(initial_answer)

    # Refuse answers scoring below 50% (threshold updated for the enhanced scoring)
    if answer_relevance_score < 50:
        return get_non_autism_answer_response()

    # Sufficiently autism-related: return the answer
    return initial_answer

# ─── Gradio Interface ─────────────────────────────────────────────────────────
with gr.Blocks(title="Document Q&A with Qwen & Weaviate") as demo:
    gr.Markdown("## Upload a PDF, DOCX, or TXT and then ask away!")
    with gr.Row():
        up = gr.File(label="Select document")
        btn = gr.Button("Ingest")
    out = gr.Textbox(label="Status", interactive=False)
    btn.click(fn=lambda f: ingest_file(f.name), inputs=up, outputs=out)
    with gr.Row():
        q = gr.Textbox(placeholder="Your question...", lines=2)
        ask = gr.Button("Ask")
    ans = gr.Textbox(label="Answer", lines=6, interactive=False)
    ask.click(fn=answer_question, inputs=q, outputs=ans)

if __name__ == "__main__":
    demo.launch(debug=True)
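# Quick smoke-test sketch without the UI (hypothetical file path and question;
# run from a Python shell after the clients above have connected):
#
#   print(ingest_file("example.pdf"))
#   print(answer_question("What does the document say about early intervention?"))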