Spaces:

khalednabawi11
/

Med-Chatbot-API

Sleeping

App Files Community

khalednabawi11 committed on May 16

Commit

9b4a539

verified ·

1 Parent(s): 99639ba

Update app.py

Browse files

Files changed (1) hide show

app.py +264 -113

app.py CHANGED Viewed

@@ -1,52 +1,253 @@
 import torch
 import asyncio
 import logging
 import signal
 import uvicorn
-import os
 from fastapi import FastAPI, Request, HTTPException, status
 from pydantic import BaseModel, Field
 from langdetect import detect
 from transformers import AutoTokenizer, AutoModelForCausalLM, pipeline, GenerationConfig
-from langchain.vectorstores import Qdrant
 from langchain.embeddings import HuggingFaceEmbeddings
-from langchain.chains import RetrievalQA
-from langchain.llms import HuggingFacePipeline
 from qdrant_client import QdrantClient
-from langchain.callbacks.base import BaseCallbackHandler
 from huggingface_hub import hf_hub_download
-from contextlib import asynccontextmanager
-# Get environment variables
-QDRANT_API_KEY = os.getenv("QDRANT_API_KEY")
-QDRANT_URL = os.getenv("QDRANT_URL")
 COLLECTION_NAME = "arabic_rag_collection"
-QDRANT_URL = os.getenv("QDRANT_URL", "https://12efeef2-9f10-4402-9deb-f070977ddfc8.eu-central-1-0.aws.cloud.qdrant.io:6333")
-QDRANT_API_KEY = os.getenv("QDRANT_API_KEY", "eyJhbGciOiJIUzI1NiIsInR5cCI6IkpXVCJ9.eyJhY2Nlc3MiOiJtIn0.Jb39rYQW2rSE9RdXrjdzKY6T1RF44XjdQzCvzFkjat4")
 # === LOGGING === #
-logging.basicConfig(level=logging.DEBUG)
 logger = logging.getLogger(__name__)
-# Load model and tokenizer
 model_name = "FreedomIntelligence/Apollo-7B"
 tokenizer = AutoTokenizer.from_pretrained(model_name)
 model = AutoModelForCausalLM.from_pretrained(model_name)
 tokenizer.pad_token = tokenizer.eos_token
-# Connect to Qdrant + embedding
-embedding = HuggingFaceEmbeddings(model_name="Omartificial-Intelligence-Space/GATE-AraBert-v1")
-qdrant_client = QdrantClient(url=QDRANT_URL, api_key=QDRANT_API_KEY)
-vector_store = Qdrant(
-    client=qdrant_client,
-    collection_name=COLLECTION_NAME,
-    embeddings=embedding
-)
-# Generation settings
 generation_config = GenerationConfig(
     max_new_tokens=150,
     temperature=0.2,
@@ -56,7 +257,6 @@ generation_config = GenerationConfig(
     repetition_penalty=1.3,
 )
-# Text generation pipeline
 llm_pipeline = pipeline(
     model=model,
     tokenizer=tokenizer,
@@ -64,53 +264,20 @@ llm_pipeline = pipeline(
     generation_config=generation_config,
     device=model.device.index if model.device.type == "cuda" else -1
 )
 llm = HuggingFacePipeline(pipeline=llm_pipeline)
-retriever = vector_store.as_retriever(search_kwargs={"k": 3})
-# Set up RAG QA chain
-qa_chain = RetrievalQA.from_chain_type(
-    llm=llm,
-    retriever=retriever,
-    chain_type="stuff"
 )
-# FastAPI setup
-app = FastAPI(title="Apollo RAG Medical Chatbot")
-class Query(BaseModel):
-    question: str = Field(..., example="ما هي اسباب تساقط الشعر ؟", min_length=3)
-class TimeoutCallback(BaseCallbackHandler):
-    def __init__(self, timeout_seconds: int = 60):
-        self.timeout_seconds = timeout_seconds
-        self.start_time = None
-    async def on_llm_start(self, *args, **kwargs):
-        self.start_time = asyncio.get_event_loop().time()
-    async def on_llm_new_token(self, *args, **kwargs):
-        if asyncio.get_event_loop().time() - self.start_time > self.timeout_seconds:
-            raise TimeoutError("LLM processing timeout")
-# Prompt template
-# def generate_prompt(question: str) -> str:
-#     lang = detect(question)
-#     if lang == "ar":
-#         return f"""أجب على السؤال الطبي التالي بلغة عربية فصحى، بإجابة دقيقة ومفصلة. إذا لم تجد معلومات كافية في السياق، استخدم معرفتك الطبية السابقة.
-#  وتأكد من ان:
-# - عدم تكرار أي نقطة أو عبارة أو كلمة
-# - وضوح وسلاسة كل نقطة
-# - تجنب الحشو والعبارات الزائدة
-# السؤال: {question}
-# الإجابة:"""
-#     else:
-#         return f"""Answer the following medical question in clear English with a detailed, non-redundant response. Do not repeat ideas or restate the question. If the context lacks information, rely on prior medical knowledge.
-# Question: {question}
-# Answer:"""
 def generate_prompt(question: str) -> str:
     lang = detect(question)
     if lang == "ar":
@@ -124,23 +291,28 @@ def generate_prompt(question: str) -> str:
     else:
         return (
             "Answer the following medical question in clear English with a detailed, non-redundant response. "
-            "Do not repeat ideas, phrases, or restate the question in the answer. If the context lacks relevant "
-            "information, rely on your prior medical knowledge. If the answer involves multiple points, list them "
-            "in concise and distinct bullet points:\n"
             f"Question: {question}\nAnswer:"
         )
-# Input schema
-# class ChatRequest(BaseModel):
-#     message: str
-# # Output endpoint
-# @app.post("/chat")
-# def chat_rag(req: ChatRequest):
-#     prompt = generate_prompt(req.message)
-#     response = qa_chain.run(prompt)
-#     return {"response": response}
 # === ROUTES === #
 @app.get("/")
@@ -150,55 +322,34 @@ async def root():
 @app.post("/ask")
 async def ask(query: Query):
     try:
-        logger.debug(f"Received question: {query.question}")
-        prompt = generate_prompt(query.question)
-        timeout_callback = TimeoutCallback(timeout_seconds=60)
-        # docs = retriever.get_relevant_documents(query.question)
-        # if not docs:
-        #     logger.warning("No documents retrieved from Qdrant for the question.")
-        # else:
-        #     logger.debug(f"Retrieved documents: {[doc.page_content for doc in docs[:1]]}")
-        loop = asyncio.get_event_loop()
-        answer = await asyncio.wait_for(
-            # qa_chain.run(prompt, callbacks=[timeout_callback]),
-            loop.run_in_executor(None, qa_chain.run, query.question),
-            timeout=360
-        )
-        if not answer:
-            raise ValueError("Empty answer returned from model")
-        if 'Answer:' in answer:
-            response_text = answer.split('Answer:')[-1].strip()
-        elif 'الإجابة:' in answer:
-            response_text = answer.split('الإجابة:')[-1].strip()
         else:
-            response_text = answer.strip()
         return {
             "status": "success",
-            "answer": answer,
             "response": response_text,
             "language": detect(query.question)
         }
-    except TimeoutError as te:
-        logger.error("Request timed out", exc_info=True)
         raise HTTPException(
             status_code=status.HTTP_504_GATEWAY_TIMEOUT,
-            detail={"status": "error", "message": "Request timed out", "error": str(te)}
         )
     except Exception as e:
         logger.error(f"Unexpected error: {e}", exc_info=True)
         raise HTTPException(
             status_code=status.HTTP_500_INTERNAL_SERVER_ERROR,
-            detail={"status": "error", "message": "Internal server error", "error": str(e)}
         )
 # === ENTRYPOINT === #
@@ -208,6 +359,6 @@ if __name__ == "__main__":
         exit(0)
     signal.signal(signal.SIGINT, handle_exit)
-    import uvicorn
     uvicorn.run(app, host="0.0.0.0", port=8000)

+# import torch
+# import asyncio
+# import logging
+# import signal
+# import uvicorn
+# import os
+# from fastapi import FastAPI, Request, HTTPException, status
+# from pydantic import BaseModel, Field
+# from langdetect import detect
+# from transformers import AutoTokenizer, AutoModelForCausalLM, pipeline, GenerationConfig
+# from langchain.vectorstores import Qdrant
+# from langchain.embeddings import HuggingFaceEmbeddings
+# from langchain.chains import RetrievalQA
+# from langchain.llms import HuggingFacePipeline
+# from qdrant_client import QdrantClient
+# from langchain.callbacks.base import BaseCallbackHandler
+# from huggingface_hub import hf_hub_download
+# from contextlib import asynccontextmanager
+# # Get environment variables
+# QDRANT_API_KEY = os.getenv("QDRANT_API_KEY")
+# QDRANT_URL = os.getenv("QDRANT_URL")
+# COLLECTION_NAME = "arabic_rag_collection"
+# QDRANT_URL = os.getenv("QDRANT_URL", "https://12efeef2-9f10-4402-9deb-f070977ddfc8.eu-central-1-0.aws.cloud.qdrant.io:6333")
+# QDRANT_API_KEY = os.getenv("QDRANT_API_KEY", "eyJhbGciOiJIUzI1NiIsInR5cCI6IkpXVCJ9.eyJhY2Nlc3MiOiJtIn0.Jb39rYQW2rSE9RdXrjdzKY6T1RF44XjdQzCvzFkjat4")
+# # === LOGGING === #
+# logging.basicConfig(level=logging.DEBUG)
+# logger = logging.getLogger(__name__)
+# # Load model and tokenizer
+# model_name = "FreedomIntelligence/Apollo-7B"
+# tokenizer = AutoTokenizer.from_pretrained(model_name)
+# model = AutoModelForCausalLM.from_pretrained(model_name)
+# tokenizer.pad_token = tokenizer.eos_token
+# # Connect to Qdrant + embedding
+# embedding = HuggingFaceEmbeddings(model_name="Omartificial-Intelligence-Space/GATE-AraBert-v1")
+# qdrant_client = QdrantClient(url=QDRANT_URL, api_key=QDRANT_API_KEY)
+# vector_store = Qdrant(
+#     client=qdrant_client,
+#     collection_name=COLLECTION_NAME,
+#     embeddings=embedding
+# )
+# # Generation settings
+# generation_config = GenerationConfig(
+#     max_new_tokens=150,
+#     temperature=0.2,
+#     top_k=20,
+#     do_sample=True,
+#     top_p=0.7,
+#     repetition_penalty=1.3,
+# )
+# # Text generation pipeline
+# llm_pipeline = pipeline(
+#     model=model,
+#     tokenizer=tokenizer,
+#     task="text-generation",
+#     generation_config=generation_config,
+#     device=model.device.index if model.device.type == "cuda" else -1
+# )
+# llm = HuggingFacePipeline(pipeline=llm_pipeline)
+# retriever = vector_store.as_retriever(search_kwargs={"k": 3})
+# # Set up RAG QA chain
+# qa_chain = RetrievalQA.from_chain_type(
+#     llm=llm,
+#     retriever=retriever,
+#     chain_type="stuff"
+# )
+# # FastAPI setup
+# app = FastAPI(title="Apollo RAG Medical Chatbot")
+# class Query(BaseModel):
+#     question: str = Field(..., example="ما هي اسباب تساقط الشعر ؟", min_length=3)
+# class TimeoutCallback(BaseCallbackHandler):
+#     def __init__(self, timeout_seconds: int = 60):
+#         self.timeout_seconds = timeout_seconds
+#         self.start_time = None
+#     async def on_llm_start(self, *args, **kwargs):
+#         self.start_time = asyncio.get_event_loop().time()
+#     async def on_llm_new_token(self, *args, **kwargs):
+#         if asyncio.get_event_loop().time() - self.start_time > self.timeout_seconds:
+#             raise TimeoutError("LLM processing timeout")
+# # Prompt template
+# # def generate_prompt(question: str) -> str:
+# #     lang = detect(question)
+# #     if lang == "ar":
+# #         return f"""أجب على السؤال الطبي التالي بلغة عربية فصحى، بإجابة دقيقة ومفصلة. إذا لم تجد معلومات كافية في السياق، استخدم معرفتك الطبية السابقة.
+# #  وتأكد من ان:
+# # - عدم تكرار أي نقطة أو عبارة أو كلمة
+# # - وضوح وسلاسة كل نقطة
+# # - تجنب الحشو والعبارات الزائدة
+# # السؤال: {question}
+# # الإجابة:"""
+# #     else:
+# #         return f"""Answer the following medical question in clear English with a detailed, non-redundant response. Do not repeat ideas or restate the question. If the context lacks information, rely on prior medical knowledge.
+# # Question: {question}
+# # Answer:"""
+# def generate_prompt(question: str) -> str:
+#     lang = detect(question)
+#     if lang == "ar":
+#         return (
+#             "أجب على السؤال الطبي التالي بلغة عربية فصحى، بإجابة دقيقة ومفصلة. إذا لم تجد معلومات كافية في السياق، استخدم معرفتك الطبية السابقة. \n"
+#             "- عدم تكرار أي نقطة أو عبارة أو كلمة\n"
+#             "- وضوح وسلاسة كل نقطة\n"
+#             "- تجنب الحشو والعبارات الزائدة\n"
+#             f"\nالسؤال: {question}\nالإجابة:"
+#         )
+#     else:
+#         return (
+#             "Answer the following medical question in clear English with a detailed, non-redundant response. "
+#             "Do not repeat ideas, phrases, or restate the question in the answer. If the context lacks relevant "
+#             "information, rely on your prior medical knowledge. If the answer involves multiple points, list them "
+#             "in concise and distinct bullet points:\n"
+#             f"Question: {question}\nAnswer:"
+#         )
+# # Input schema
+# # class ChatRequest(BaseModel):
+# #     message: str
+# # # Output endpoint
+# # @app.post("/chat")
+# # def chat_rag(req: ChatRequest):
+# #     prompt = generate_prompt(req.message)
+# #     response = qa_chain.run(prompt)
+# #     return {"response": response}
+# # === ROUTES === #
+# @app.get("/")
+# async def root():
+#     return {"message": "Medical QA API is running!"}
+# @app.post("/ask")
+# async def ask(query: Query):
+#     try:
+#         logger.debug(f"Received question: {query.question}")
+#         prompt = generate_prompt(query.question)
+#         timeout_callback = TimeoutCallback(timeout_seconds=60)
+#         # docs = retriever.get_relevant_documents(query.question)
+#         # if not docs:
+#         #     logger.warning("No documents retrieved from Qdrant for the question.")
+#         # else:
+#         #     logger.debug(f"Retrieved documents: {[doc.page_content for doc in docs[:1]]}")
+#         loop = asyncio.get_event_loop()
+#         answer = await asyncio.wait_for(
+#             # qa_chain.run(prompt, callbacks=[timeout_callback]),
+#             loop.run_in_executor(None, qa_chain.run, query.question),
+#             timeout=360
+#         )
+#         if not answer:
+#             raise ValueError("Empty answer returned from model")
+#         if 'Answer:' in answer:
+#             response_text = answer.split('Answer:')[-1].strip()
+#         elif 'الإجابة:' in answer:
+#             response_text = answer.split('الإجابة:')[-1].strip()
+#         else:
+#             response_text = answer.strip()
+#         return {
+#             "status": "success",
+#             "answer": answer,
+#             "response": response_text,
+#             "language": detect(query.question)
+#         }
+#     except TimeoutError as te:
+#         logger.error("Request timed out", exc_info=True)
+#         raise HTTPException(
+#             status_code=status.HTTP_504_GATEWAY_TIMEOUT,
+#             detail={"status": "error", "message": "Request timed out", "error": str(te)}
+#         )
+#     except Exception as e:
+#         logger.error(f"Unexpected error: {e}", exc_info=True)
+#         raise HTTPException(
+#             status_code=status.HTTP_500_INTERNAL_SERVER_ERROR,
+#             detail={"status": "error", "message": "Internal server error", "error": str(e)}
+#         )
+# # === ENTRYPOINT === #
+# if __name__ == "__main__":
+#     def handle_exit(signum, frame):
+#         print("Shutting down gracefully...")
+#         exit(0)
+#     signal.signal(signal.SIGINT, handle_exit)
+#     import uvicorn
+#     uvicorn.run(app, host="0.0.0.0", port=8000)
 import torch
 import asyncio
 import logging
 import signal
 import uvicorn
+import os
 from fastapi import FastAPI, Request, HTTPException, status
 from pydantic import BaseModel, Field
 from langdetect import detect
 from transformers import AutoTokenizer, AutoModelForCausalLM, pipeline, GenerationConfig
+from langchain_community.vectorstores import Qdrant
 from langchain.embeddings import HuggingFaceEmbeddings
+from langchain_community.llms import HuggingFacePipeline
 from qdrant_client import QdrantClient
+from langchain_core.runnables import RunnableMap
 from huggingface_hub import hf_hub_download
+# === ENVIRONMENT SETUP === #
+QDRANT_API_KEY = os.getenv("QDRANT_API_KEY", "your_fallback_api_key")
+QDRANT_URL = os.getenv("QDRANT_URL", "your_fallback_qdrant_url")
 COLLECTION_NAME = "arabic_rag_collection"
 # === LOGGING === #
+logging.basicConfig(level=logging.INFO)
 logger = logging.getLogger(__name__)
+# === MODEL SETUP === #
 model_name = "FreedomIntelligence/Apollo-7B"
 tokenizer = AutoTokenizer.from_pretrained(model_name)
 model = AutoModelForCausalLM.from_pretrained(model_name)
 tokenizer.pad_token = tokenizer.eos_token
+# === GENERATION CONFIG === #
 generation_config = GenerationConfig(
     max_new_tokens=150,
     temperature=0.2,
     repetition_penalty=1.3,
 )
 llm_pipeline = pipeline(
     model=model,
     tokenizer=tokenizer,
     generation_config=generation_config,
     device=model.device.index if model.device.type == "cuda" else -1
 )
 llm = HuggingFacePipeline(pipeline=llm_pipeline)
+# === EMBEDDING + VECTOR STORE === #
+embedding = HuggingFaceEmbeddings(model_name="Omartificial-Intelligence-Space/GATE-AraBert-v1")
+qdrant_client = QdrantClient(url=QDRANT_URL, api_key=QDRANT_API_KEY)
+vector_store = Qdrant(
+    client=qdrant_client,
+    collection_name=COLLECTION_NAME,
+    embeddings=embedding
 )
+retriever = vector_store.as_retriever(search_kwargs={"k": 3})
+# === PROMPT FUNCTION === #
 def generate_prompt(question: str) -> str:
     lang = detect(question)
     if lang == "ar":
     else:
         return (
             "Answer the following medical question in clear English with a detailed, non-redundant response. "
+            "Do not repeat ideas, phrases, or restate the question. If the context lacks relevant "
+            "information, rely on prior medical knowledge.\n"
             f"Question: {question}\nAnswer:"
         )
+# === FASTAPI SETUP === #
+app = FastAPI(title="Apollo RAG Medical Chatbot")
+class Query(BaseModel):
+    question: str = Field(..., example="ما هي اسباب تساقط الشعر ؟", min_length=3)
+# === RAG PIPELINE === #
+async def async_chain(question: str):
+    prompt = generate_prompt(question)
+    docs = await retriever.aget_relevant_documents(question)
+    if not docs:
+        logger.warning("No relevant documents found in Qdrant.")
+    context = "\n".join([doc.page_content for doc in docs])
+    full_prompt = f"{context}\n\n{prompt}"
+    logger.debug(f"Prompt: {full_prompt}")
+    response = llm.invoke(full_prompt)
+    return response
 # === ROUTES === #
 @app.get("/")
 @app.post("/ask")
 async def ask(query: Query):
     try:
+        response = await asyncio.wait_for(async_chain(query.question), timeout=60)
+        if 'Answer:' in response:
+            response_text = response.split('Answer:')[-1].strip()
+        elif 'الإجابة:' in response:
+            response_text = response.split('الإجابة:')[-1].strip()
         else:
+            response_text = response.strip()
         return {
             "status": "success",
+            "answer": response,
             "response": response_text,
             "language": detect(query.question)
         }
+    except asyncio.TimeoutError:
+        logger.error("Request timed out")
         raise HTTPException(
             status_code=status.HTTP_504_GATEWAY_TIMEOUT,
+            detail="Request timed out"
         )
     except Exception as e:
         logger.error(f"Unexpected error: {e}", exc_info=True)
         raise HTTPException(
             status_code=status.HTTP_500_INTERNAL_SERVER_ERROR,
+            detail=f"Internal server error: {e}"
         )
 # === ENTRYPOINT === #
         exit(0)
     signal.signal(signal.SIGINT, handle_exit)
     uvicorn.run(app, host="0.0.0.0", port=8000)