Commit 7fc2087
Parent(s): 3d0cf77
handling issue of coreference
Files changed:
- Rag/chunking.py +29 -9
- Rag/corefrence.py +11 -0
- Rag/summarization.py +14 -129
Rag/chunking.py
CHANGED
@@ -6,6 +6,8 @@ import os
 import json
 import logging
 from Llm.llm_endpoints import get_llm_response
+from Rag.summarization import summarize_conversation
+from Rag.corefrence import resolve_coreference_in_query
 # Configuration
 API_KEY = os.getenv("GOOGLE_API_KEY")
 if API_KEY:
@@ -68,6 +70,12 @@ def query_database(collection, query_text, n_results=3):
     metadatas = results['metadatas'][0]
     return retrieved_docs, metadatas
 
+
+def enhance_query_with_history(query_text, summarized_history):
+    enhance_query = f"{query_text}*2\n\n{summarized_history}"
+    return enhance_query
+
+
 def update_conversation_history(history, user_query, bot_response):
     """
     Update and keeps track of conversation history between user and the bot
@@ -76,16 +84,27 @@ def update_conversation_history(history, user_query, bot_response):
     :param bot_response:
     :return:
     """
-    history.append({"user":user_query, "bot":bot_response})
+    history.append({"user": user_query, "bot": bot_response})
     return history
 
 
-def generate_response(conversation_history,query_text, retrieved_docs):
+def generate_response(conversation_history, query_text, retrieved_docs):
     """Generate a response using retrieved documents and the generative AI model."""
 
     context = " ".join(retrieved_docs)
     history_str = "\n".join([f"User: {turn['user']}\nBot: {turn['bot']}" for turn in conversation_history])
-    prompt = f"
+    prompt = f"""
+    Using the context below and the conversation history, answer the question:
+
+    Context:
+    {context}
+
+    Conversation History:
+    {history_str}
+
+    Question: {query_text}
+    """
+
     response = get_llm_response(prompt)
     return response
 
@@ -108,16 +127,17 @@ def main_workflow(transcripts_folder_path, collection):
         if query_text.lower() == "exit":
             print("Ending the conversation. Goodbye")
             break
-
-
-
+        query_text_with_conversation_history = enhance_query_with_history(query_text, conversation_history)
+        resolved_query = resolve_coreference_in_query(query_text_with_conversation_history, conversation_history)
+        retrived_docs, metadatas = query_database(collection, resolved_query)
+        print("-" * 50)
         print(metadatas)
-        print("-"*50)
+        print("-" * 50)
         if not retrived_docs:
             print("No relevent documents is found")
             continue
-        response = generate_response(conversation_history,query_text,retrived_docs)
-        conversation_history = update_conversation_history(conversation_history,query_text,response)
+        response = generate_response(conversation_history, query_text, retrived_docs)
+        conversation_history = update_conversation_history(conversation_history, query_text, response)
         print("\nGenerated Response:")
         print(response)
 
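Review note on the new enhance_query_with_history: the f-string f"{query_text}*2\n\n{summarized_history}" appends the literal characters "*2" to the query rather than repeating it, and main_workflow passes the raw conversation_history list (whose repr ends up in the retrieval query) even though summarize_conversation is imported and never called. A minimal sketch of what this step appears to intend, assuming the goal is to weight the current query and append a summarized history; the weight parameter and the join strategy are assumptions, not committed behavior:

# Sketch only: "weight" and the join strategy are assumptions.
from Rag.summarization import summarize_conversation


def enhance_query_with_history(query_text, conversation_history, weight=2):
    # Repeat the query text so retrieval similarity leans toward the
    # current question; the committed f"{query_text}*2" instead appends
    # the two characters "*2" verbatim.
    weighted_query = " ".join([query_text] * weight)
    # Summarize the history instead of interpolating the raw list of dicts.
    summarized_history = summarize_conversation(conversation_history)
    return f"{weighted_query}\n\n{summarized_history}"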
Rag/corefrence.py
ADDED
@@ -0,0 +1,11 @@
+from transformers import pipeline
+
+coref_pipeline = pipeline("coref-resolution", model="coref-spanbert-large")
+
+
+def resolve_coreference_in_query(query_text, conversation_history):
+    context = "\n".join([f"User: {turn['user']}\nBot: {turn['bot']}" for turn in conversation_history])
+    full_text = f"{context}\nUser: {query_text}"
+    resolved_text = coref_pipeline(full_text)
+    resolved_query = resolved_text.split("User:")[-1].strip()
+    return resolved_query
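Review note on Rag/corefrence.py: transformers.pipeline() has no built-in "coref-resolution" task and "coref-spanbert-large" is not a standard Hugging Face model id, so this module would raise as soon as it is imported; even if a pipeline did load, it would return a dict or list, not a string with a .split() method. A minimal sketch of one way to get the same effect without new dependencies, reusing the project's own LLM endpoint to rewrite pronouns; the prompt wording is illustrative, not part of the commit:

from Llm.llm_endpoints import get_llm_response


def resolve_coreference_in_query(query_text, conversation_history):
    # Render the dialogue the same way the rest of the codebase does.
    context = "\n".join(
        f"User: {turn['user']}\nBot: {turn['bot']}" for turn in conversation_history
    )
    # Ask the LLM to make the final question self-contained instead of
    # relying on a coreference pipeline task that transformers does not ship.
    prompt = (
        "Rewrite the final user question so it stands alone, replacing "
        "pronouns such as 'it', 'he', or 'they' with their referents from "
        "the conversation.\n\n"
        f"{context}\nUser: {query_text}\n\nRewritten question:"
    )
    return get_llm_response(prompt).strip()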
Rag/summarization.py
CHANGED
@@ -1,129 +1,14 @@
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-# with tqdm(total=len(docs), desc="Generating embeddings") as pbar:
-#     for i in range(0, len(docs), batch_size):
-#         batch = docs[i:i + batch_size]
-#         batch_embeddings = model.encode(batch)
-#         embeddings.extend(batch_embeddings)
-#         pbar.update(len(batch))
-#
-#     return embeddings
-#
-#
-# def initialize_chroma_client(db_path):
-#     """Initialize ChromaDB client without removing existing database"""
-#     if not os.path.exists(db_path):
-#         try:
-#             os.makedirs(db_path, mode=0o777, exist_ok=True)
-#             print(f"Created database directory at: {db_path}")
-#         except Exception as e:
-#             print(f"Error creating database directory: {str(e)}")
-#             raise
-#
-#     return chromadb.PersistentClient(path=db_path)
-#
-#
-# def get_or_create_collection(client, collection_name="transcript_collection"):
-#     """Get existing collection or create new one if it doesn't exist"""
-#     try:
-#         # Try to get existing collection
-#         collection = client.get_collection(name=collection_name)
-#         print(f"Found existing collection with {collection.count()} documents")
-#         return collection
-#     except Exception:
-#         # Create new collection if it doesn't exist
-#         collection = client.create_collection(
-#             name=collection_name,
-#             metadata={"description": "Video transcript embeddings"}
-#         )
-#         print("Created new collection")
-#         return collection
-#
-#
-# def process_new_chunks(chunks, collection):
-#     """Process and add new chunks to the collection"""
-#     # Prepare documents for insertion
-#     docs = [chunk.page_content for chunk in chunks]
-#     metadatas = [chunk.metadata for chunk in chunks]
-#
-#     # Generate new IDs starting after existing documents
-#     start_id = collection.count()
-#     ids = [str(i) for i in range(start_id, start_id + len(chunks))]
-#
-#     print(f"Generating embeddings for {len(docs)} new documents...")
-#     embeddings = get_embeddings(docs)
-#
-#     print(f"Adding {len(docs)} new documents to collection...")
-#     try:
-#         collection.add(
-#             ids=ids,
-#             documents=docs,
-#             embeddings=embeddings,
-#             metadatas=metadatas
-#         )
-#         print(f"Successfully added {len(docs)} new documents")
-#     except Exception as e:
-#         print(f"Error adding documents: {str(e)}")
-#         raise
-#
-#
-# def store_in_chroma(chunks, db_path="db"):
-#     """Store chunks in ChromaDB, handling both new and existing databases"""
-#     # Initialize client without removing existing DB
-#     client = initialize_chroma_client(db_path)
-#
-#     # Get existing collection or create new one
-#     collection = get_or_create_collection(client)
-#
-#     # Process and add new chunks
-#     process_new_chunks(chunks, collection)
-#
-#     return collection
-#
-#
-# def main():
-#     print("Starting chunking process...")
-#     chunks = split_text_to_chunks()
-#     print(f"Generated {len(chunks)} chunks")
-#
-#     try:
-#         collection = store_in_chroma(chunks)
-#         final_count = collection.count()
-#         print(f"Process complete. Collection contains {final_count} documents.")
-#         return collection
-#     except Exception as e:
-#         print(f"Process failed: {str(e)}")
-#         return None
-#
-#
-# if __name__ == "__main__":
-#     main()
-#
-# chroma_client = chromadb.Client()
-# collection = chroma_client.get_or_create_collection(name = "my_collection")
-# collection.upsert(
-#     documents = [
-#         "This is a dcouments about pineapple",
-#         "this is a document about oranges"
-#     ],
-#     ids = ['id1', 'id2']
-#
-# )
-# results = collection.query(
-#     query_texts=["This is a query document about florida"],  # Chroma will embed this for you
-#     n_results=2  # how many results to return
-# )
-#
-# print(results)
+from Llm.llm_endpoints import get_llm_response
+
+
+def summarize_conversation(conversation_history):
+    try:
+        summary_prompt = "Summarize the following conversation:\n" + "\n".join(
+            [f"User: {turn['user']}\nBot: {turn['bot']}" for turn in conversation_history])
+        summary = get_llm_response(summary_prompt)
+        print("*************************************************")
+        print(summary)
+        print("*************************************************")
+        return summary
+    except:
+        return ""
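Review note on the new summarize_conversation: the bare except: also swallows KeyboardInterrupt and SystemExit, hiding real API failures behind an empty string, and the asterisk banners are leftover debug prints. A hedged variant using except Exception plus logging; logging is already imported in Rag/chunking.py, and the logger name here is an assumption:

import logging

from Llm.llm_endpoints import get_llm_response

logger = logging.getLogger(__name__)


def summarize_conversation(conversation_history):
    # Nothing to summarize on the first turn; skip the LLM round trip.
    if not conversation_history:
        return ""
    summary_prompt = "Summarize the following conversation:\n" + "\n".join(
        f"User: {turn['user']}\nBot: {turn['bot']}" for turn in conversation_history
    )
    try:
        return get_llm_response(summary_prompt)
    except Exception:
        # except Exception keeps KeyboardInterrupt/SystemExit propagating
        # while still degrading gracefully to "no summary".
        logger.exception("Summarization failed; continuing without history summary")
        return ""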