Commit 7fc2087
Parent(s): 3d0cf77
handling issue of coreference
Files changed:
- Rag/chunking.py +29 -9
- Rag/corefrence.py +11 -0
- Rag/summarization.py +14 -129
Rag/chunking.py
CHANGED
@@ -6,6 +6,8 @@ import os
 import json
 import logging
 from Llm.llm_endpoints import get_llm_response
+from Rag.summarization import summarize_conversation
+from Rag.corefrence import resolve_coreference_in_query
 # Configuration
 API_KEY = os.getenv("GOOGLE_API_KEY")
 if API_KEY:
@@ -68,6 +70,12 @@ def query_database(collection, query_text, n_results=3):
     metadatas = results['metadatas'][0]
     return retrieved_docs, metadatas
 
+
+def enhance_query_with_history(query_text, summarized_history):
+    enhance_query = f"{query_text}*2\n\n{summarized_history}"
+    return enhance_query
+
+
 def update_conversation_history(history, user_query, bot_response):
     """
     Update and keeps track of conversation history between user and the bot
@@ -76,16 +84,27 @@ def update_conversation_history(history, user_query, bot_response):
     :param bot_response:
     :return:
     """
-    history.append({"user":user_query, "bot":bot_response})
+    history.append({"user": user_query, "bot": bot_response})
     return history
 
 
-def generate_response(conversation_history,query_text, retrieved_docs):
+def generate_response(conversation_history, query_text, retrieved_docs):
     """Generate a response using retrieved documents and the generative AI model."""
 
     context = " ".join(retrieved_docs)
     history_str = "\n".join([f"User: {turn['user']}\nBot: {turn['bot']}" for turn in conversation_history])
-    prompt = f"
+    prompt = f"""
+    Using the context below and the conversation history, answer the question:
+
+    Context:
+    {context}
+
+    Conversation History:
+    {history_str}
+
+    Question: {query_text}
+    """
+
     response = get_llm_response(prompt)
     return response
 
@@ -108,16 +127,17 @@ def main_workflow(transcripts_folder_path, collection):
         if query_text.lower() == "exit":
             print("Ending the conversation. Goodbye")
             break
-
-
-
+        query_text_with_conversation_history = enhance_query_with_history(query_text, conversation_history)
+        resolved_query = resolve_coreference_in_query(query_text_with_conversation_history, conversation_history)
+        retrived_docs, metadatas = query_database(collection, resolved_query)
+        print("-" * 50)
         print(metadatas)
-        print("-"*50)
+        print("-" * 50)
         if not retrived_docs:
             print("No relevent documents is found")
             continue
-        response = generate_response(conversation_history,query_text,retrived_docs)
-        conversation_history = update_conversation_history(conversation_history,query_text,response)
+        response = generate_response(conversation_history, query_text, retrived_docs)
+        conversation_history = update_conversation_history(conversation_history, query_text, response)
         print("\nGenerated Response:")
         print(response)
 
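Review note on the new enhance_query_with_history: the f-string f"{query_text}*2\n\n{summarized_history}" appends the literal characters "*2" to the query rather than repeating it, and main_workflow passes the raw conversation_history list (whose repr ends up in the retrieval query) even though summarize_conversation is imported and never called. A minimal sketch of what this step appears to intend, assuming the goal is to weight the current query and append a summarized history; the weight parameter and the join strategy are assumptions, not committed behavior:

# Sketch only: "weight" and the join strategy are assumptions.
from Rag.summarization import summarize_conversation


def enhance_query_with_history(query_text, conversation_history, weight=2):
    # Repeat the query text so retrieval similarity leans toward the
    # current question; the committed f"{query_text}*2" instead appends
    # the two characters "*2" verbatim.
    weighted_query = " ".join([query_text] * weight)
    # Summarize the history instead of interpolating the raw list of dicts.
    summarized_history = summarize_conversation(conversation_history)
    return f"{weighted_query}\n\n{summarized_history}"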
Rag/corefrence.py
ADDED
@@ -0,0 +1,11 @@
+from transformers import pipeline
+
+coref_pipeline = pipeline("coref-resolution", model="coref-spanbert-large")
+
+
+def resolve_coreference_in_query(query_text, conversation_history):
+    context = "\n".join([f"User: {turn['user']}\nBot: {turn['bot']}" for turn in conversation_history])
+    full_text = f"{context}\nUser: {query_text}"
+    resolved_text = coref_pipeline(full_text)
+    resolved_query = resolved_text.split("User:")[-1].strip()
+    return resolved_query
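Review note on Rag/corefrence.py: transformers.pipeline() has no built-in "coref-resolution" task and "coref-spanbert-large" is not a standard Hugging Face model id, so this module would raise as soon as it is imported; even if a pipeline did load, it would return a dict or list, not a string with a .split() method. A minimal sketch of one way to get the same effect without new dependencies, reusing the project's own LLM endpoint to rewrite pronouns; the prompt wording is illustrative, not part of the commit:

from Llm.llm_endpoints import get_llm_response


def resolve_coreference_in_query(query_text, conversation_history):
    # Render the dialogue the same way the rest of the codebase does.
    context = "\n".join(
        f"User: {turn['user']}\nBot: {turn['bot']}" for turn in conversation_history
    )
    # Ask the LLM to make the final question self-contained instead of
    # relying on a coreference pipeline task that transformers does not ship.
    prompt = (
        "Rewrite the final user question so it stands alone, replacing "
        "pronouns such as 'it', 'he', or 'they' with their referents from "
        "the conversation.\n\n"
        f"{context}\nUser: {query_text}\n\nRewritten question:"
    )
    return get_llm_response(prompt).strip()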
Rag/summarization.py
CHANGED
@@ -1,129 +1,14 @@
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-# with tqdm(total=len(docs), desc="Generating embeddings") as pbar:
-#     for i in range(0, len(docs), batch_size):
-#         batch = docs[i:i + batch_size]
-#         batch_embeddings = model.encode(batch)
-#         embeddings.extend(batch_embeddings)
-#         pbar.update(len(batch))
-#
-#     return embeddings
-#
-#
-# def initialize_chroma_client(db_path):
-#     """Initialize ChromaDB client without removing existing database"""
-#     if not os.path.exists(db_path):
-#         try:
-#             os.makedirs(db_path, mode=0o777, exist_ok=True)
-#             print(f"Created database directory at: {db_path}")
-#         except Exception as e:
-#             print(f"Error creating database directory: {str(e)}")
-#             raise
-#
-#     return chromadb.PersistentClient(path=db_path)
-#
-#
-# def get_or_create_collection(client, collection_name="transcript_collection"):
-#     """Get existing collection or create new one if it doesn't exist"""
-#     try:
-#         # Try to get existing collection
-#         collection = client.get_collection(name=collection_name)
-#         print(f"Found existing collection with {collection.count()} documents")
-#         return collection
-#     except Exception:
-#         # Create new collection if it doesn't exist
-#         collection = client.create_collection(
-#             name=collection_name,
-#             metadata={"description": "Video transcript embeddings"}
-#         )
-#         print("Created new collection")
-#         return collection
-#
-#
-# def process_new_chunks(chunks, collection):
-#     """Process and add new chunks to the collection"""
-#     # Prepare documents for insertion
-#     docs = [chunk.page_content for chunk in chunks]
-#     metadatas = [chunk.metadata for chunk in chunks]
-#
-#     # Generate new IDs starting after existing documents
-#     start_id = collection.count()
-#     ids = [str(i) for i in range(start_id, start_id + len(chunks))]
-#
-#     print(f"Generating embeddings for {len(docs)} new documents...")
-#     embeddings = get_embeddings(docs)
-#
-#     print(f"Adding {len(docs)} new documents to collection...")
-#     try:
-#         collection.add(
-#             ids=ids,
-#             documents=docs,
-#             embeddings=embeddings,
-#             metadatas=metadatas
-#         )
-#         print(f"Successfully added {len(docs)} new documents")
-#     except Exception as e:
-#         print(f"Error adding documents: {str(e)}")
-#         raise
-#
-#
-# def store_in_chroma(chunks, db_path="db"):
-#     """Store chunks in ChromaDB, handling both new and existing databases"""
-#     # Initialize client without removing existing DB
-#     client = initialize_chroma_client(db_path)
-#
-#     # Get existing collection or create new one
-#     collection = get_or_create_collection(client)
-#
-#     # Process and add new chunks
-#     process_new_chunks(chunks, collection)
-#
-#     return collection
-#
-#
-# def main():
-#     print("Starting chunking process...")
-#     chunks = split_text_to_chunks()
-#     print(f"Generated {len(chunks)} chunks")
-#
-#     try:
-#         collection = store_in_chroma(chunks)
-#         final_count = collection.count()
-#         print(f"Process complete. Collection contains {final_count} documents.")
-#         return collection
-#     except Exception as e:
-#         print(f"Process failed: {str(e)}")
-#         return None
-#
-#
-# if __name__ == "__main__":
-#     main()
-#
-# chroma_client = chromadb.Client()
-# collection = chroma_client.get_or_create_collection(name = "my_collection")
-# collection.upsert(
-#     documents = [
-#         "This is a dcouments about pineapple",
-#         "this is a document about oranges"
-#     ],
-#     ids = ['id1', 'id2']
-#
-# )
-# results = collection.query(
-#     query_texts=["This is a query document about florida"],  # Chroma will embed this for you
-#     n_results=2  # how many results to return
-# )
-#
-# print(results)
+from Llm.llm_endpoints import get_llm_response
+
+
+def summarize_conversation(conversation_history):
+    try:
+        summary_prompt = "Summarize the following conversation:\n" + "\n".join(
+            [f"User: {turn['user']}\nBot: {turn['bot']}" for turn in conversation_history])
+        summary = get_llm_response(summary_prompt)
+        print("*************************************************")
+        print(summary)
+        print("*************************************************")
+        return summary
+    except:
+        return ""
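Review note on the new summarize_conversation: the bare except: also swallows KeyboardInterrupt and SystemExit, hiding real API failures behind an empty string, and the asterisk banners are leftover debug prints. A hedged variant using except Exception plus logging; logging is already imported in Rag/chunking.py, and the logger name here is an assumption:

import logging

from Llm.llm_endpoints import get_llm_response

logger = logging.getLogger(__name__)


def summarize_conversation(conversation_history):
    # Nothing to summarize on the first turn; skip the LLM round trip.
    if not conversation_history:
        return ""
    summary_prompt = "Summarize the following conversation:\n" + "\n".join(
        f"User: {turn['user']}\nBot: {turn['bot']}" for turn in conversation_history
    )
    try:
        return get_llm_response(summary_prompt)
    except Exception:
        # except Exception keeps KeyboardInterrupt/SystemExit propagating
        # while still degrading gracefully to "no summary".
        logger.exception("Summarization failed; continuing without history summary")
        return ""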