Nightwing11 committed
Commit 7fc2087 · 1 Parent(s): 3d0cf77

Handling issue of coreference

Files changed (3)
  1. Rag/chunking.py +29 -9
  2. Rag/corefrence.py +11 -0
  3. Rag/summarization.py +14 -129
Rag/chunking.py CHANGED
@@ -6,6 +6,8 @@ import os
 import json
 import logging
 from Llm.llm_endpoints import get_llm_response
+from Rag.summarization import summarize_conversation
+from Rag.corefrence import resolve_coreference_in_query
 # Configuration
 API_KEY = os.getenv("GOOGLE_API_KEY")
 if API_KEY:
@@ -68,6 +70,12 @@ def query_database(collection, query_text, n_results=3):
     metadatas = results['metadatas'][0]
     return retrieved_docs, metadatas
 
+
+def enhance_query_with_history(query_text, summarized_history):
+    enhance_query = f"{query_text}*2\n\n{summarized_history}"
+    return enhance_query
+
+
 def update_conversation_history(history, user_query, bot_response):
     """
     Update and keeps track of conversation history between user and the bot
@@ -76,16 +84,27 @@ def update_conversation_history(history, user_query, bot_response):
     :param bot_response:
     :return:
     """
-    history.append({"user":user_query, "bot":bot_response})
+    history.append({"user": user_query, "bot": bot_response})
     return history
 
 
-def generate_response(conversation_history,query_text, retrieved_docs):
+def generate_response(conversation_history, query_text, retrieved_docs):
     """Generate a response using retrieved documents and the generative AI model."""
 
     context = " ".join(retrieved_docs)
     history_str = "\n".join([f"User: {turn['user']}\nBot: {turn['bot']}" for turn in conversation_history])
-    prompt = f"Using the context below, answer the question:\n\nContext:\n{context}\n\nQuestion: {query_text}"
+    prompt = f"""
+    Using the context below and the conversation history, answer the question:
+
+    Context:
+    {context}
+
+    Conversation History:
+    {history_str}
+
+    Question: {query_text}
+    """
+
     response = get_llm_response(prompt)
     return response
 
@@ -108,16 +127,17 @@ def main_workflow(transcripts_folder_path, collection):
         if query_text.lower() == "exit":
             print("Ending the conversation. Goodbye")
             break
-
-        retrived_docs, metadatas = query_database(collection, query_text)
-        print("-"*50)
+        query_text_with_conversation_history = enhance_query_with_history(query_text, conversation_history)
+        resolved_query = resolve_coreference_in_query(query_text_with_conversation_history, conversation_history)
+        retrived_docs, metadatas = query_database(collection, resolved_query)
+        print("-" * 50)
         print(metadatas)
-        print("-"*50)
+        print("-" * 50)
         if not retrived_docs:
             print("No relevent documents is found")
             continue
-        response = generate_response(conversation_history,query_text,retrived_docs)
-        conversation_history = update_conversation_history(conversation_history,query_text,response)
+        response = generate_response(conversation_history, query_text, retrived_docs)
+        conversation_history = update_conversation_history(conversation_history, query_text, response)
         print("\nGenerated Response:")
         print(response)
 
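Note on the new helper: the commit imports summarize_conversation but never calls it, and main_workflow passes the raw conversation_history list to enhance_query_with_history even though that parameter is named summarized_history; the committed f-string also embeds a literal "*2" in the query. A minimal sketch of how the pieces were presumably meant to compose (a hypothetical rewrite, not the committed code):

    from Rag.summarization import summarize_conversation

    def enhance_query_with_history(query_text, conversation_history):
        # Condense the raw turn list into a short summary string first.
        summarized_history = summarize_conversation(conversation_history)
        # Keep the question first; the literal "*2" from the committed
        # f-string looks like a typo and is dropped in this sketch.
        return f"{query_text}\n\n{summarized_history}"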
Rag/corefrence.py ADDED
@@ -0,0 +1,11 @@
+from transformers import pipeline
+
+coref_pipeline = pipeline("coref-resolution", model="coref-spanbert-large")
+
+
+def resolve_coreference_in_query(query_text, conversation_history):
+    context = "\n".join([f"User: {turn['user']}\nBot: {turn['bot']}" for turn in conversation_history])
+    full_text = f"{context}\nUser: {query_text}"
+    resolved_text = coref_pipeline(full_text)
+    resolved_query = resolved_text.split("User:")[-1].strip()
+    return resolved_query
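Note on the pipeline call: transformers' pipeline() does not ship a "coref-resolution" task, so this line would raise as soon as the module is imported; "coref-spanbert-large" is the name of AllenNLP's SpanBERT coreference model. A minimal sketch of the same resolution step using AllenNLP instead (assumes the allennlp and allennlp-models packages; treat the exact archive URL as an assumption):

    from allennlp.predictors.predictor import Predictor

    # AllenNLP's SpanBERT coreference model (assumed archive URL).
    coref_predictor = Predictor.from_path(
        "https://storage.googleapis.com/allennlp-public-models/coref-spanbert-large-2021.03.10.tar.gz"
    )

    def resolve_coreference_in_query(query_text, conversation_history):
        context = "\n".join(f"User: {turn['user']}\nBot: {turn['bot']}" for turn in conversation_history)
        full_text = f"{context}\nUser: {query_text}"
        # coref_resolved() returns the input text with each mention
        # replaced by its antecedent, so the final "User:" segment is
        # the query with pronouns spelled out.
        resolved_text = coref_predictor.coref_resolved(full_text)
        return resolved_text.split("User:")[-1].strip()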
Rag/summarization.py CHANGED
@@ -1,129 +1,14 @@
-# from Rag.chunking import split_text_to_chunks
-# from tqdm import tqdm
-# import numpy as np
-# import chromadb
-# from sentence_transformers import SentenceTransformer
-# import os
-# import shutil
-#
-#
-# def get_embeddings(docs, batch_size=32):
-#     """Generate embeddings for documents using sentence transformer"""
-#     model = SentenceTransformer('all-MiniLM-L6-v2', device='cpu')
-#     embeddings = []
-#
-#     with tqdm(total=len(docs), desc="Generating embeddings") as pbar:
-#         for i in range(0, len(docs), batch_size):
-#             batch = docs[i:i + batch_size]
-#             batch_embeddings = model.encode(batch)
-#             embeddings.extend(batch_embeddings)
-#             pbar.update(len(batch))
-#
-#     return embeddings
-#
-#
-# def initialize_chroma_client(db_path):
-#     """Initialize ChromaDB client without removing existing database"""
-#     if not os.path.exists(db_path):
-#         try:
-#             os.makedirs(db_path, mode=0o777, exist_ok=True)
-#             print(f"Created database directory at: {db_path}")
-#         except Exception as e:
-#             print(f"Error creating database directory: {str(e)}")
-#             raise
-#
-#     return chromadb.PersistentClient(path=db_path)
-#
-#
-# def get_or_create_collection(client, collection_name="transcript_collection"):
-#     """Get existing collection or create new one if it doesn't exist"""
-#     try:
-#         # Try to get existing collection
-#         collection = client.get_collection(name=collection_name)
-#         print(f"Found existing collection with {collection.count()} documents")
-#         return collection
-#     except Exception:
-#         # Create new collection if it doesn't exist
-#         collection = client.create_collection(
-#             name=collection_name,
-#             metadata={"description": "Video transcript embeddings"}
-#         )
-#         print("Created new collection")
-#         return collection
-#
-#
-# def process_new_chunks(chunks, collection):
-#     """Process and add new chunks to the collection"""
-#     # Prepare documents for insertion
-#     docs = [chunk.page_content for chunk in chunks]
-#     metadatas = [chunk.metadata for chunk in chunks]
-#
-#     # Generate new IDs starting after existing documents
-#     start_id = collection.count()
-#     ids = [str(i) for i in range(start_id, start_id + len(chunks))]
-#
-#     print(f"Generating embeddings for {len(docs)} new documents...")
-#     embeddings = get_embeddings(docs)
-#
-#     print(f"Adding {len(docs)} new documents to collection...")
-#     try:
-#         collection.add(
-#             ids=ids,
-#             documents=docs,
-#             embeddings=embeddings,
-#             metadatas=metadatas
-#         )
-#         print(f"Successfully added {len(docs)} new documents")
-#     except Exception as e:
-#         print(f"Error adding documents: {str(e)}")
-#         raise
-#
-#
-# def store_in_chroma(chunks, db_path="db"):
-#     """Store chunks in ChromaDB, handling both new and existing databases"""
-#     # Initialize client without removing existing DB
-#     client = initialize_chroma_client(db_path)
-#
-#     # Get existing collection or create new one
-#     collection = get_or_create_collection(client)
-#
-#     # Process and add new chunks
-#     process_new_chunks(chunks, collection)
-#
-#     return collection
-#
-#
-# def main():
-#     print("Starting chunking process...")
-#     chunks = split_text_to_chunks()
-#     print(f"Generated {len(chunks)} chunks")
-#
-#     try:
-#         collection = store_in_chroma(chunks)
-#         final_count = collection.count()
-#         print(f"Process complete. Collection contains {final_count} documents.")
-#         return collection
-#     except Exception as e:
-#         print(f"Process failed: {str(e)}")
-#         return None
-#
-#
-# if __name__ == "__main__":
-#     main()
-#
-# chroma_client = chromadb.Client()
-# collection = chroma_client.get_or_create_collection(name = "my_collection")
-# collection.upsert(
-#     documents = [
-#         "This is a dcouments about pineapple",
-#         "this is a document about oranges"
-#     ],
-#     ids = ['id1', 'id2']
-#
-# )
-# results = collection.query(
-#     query_texts=["This is a query document about florida"],  # Chroma will embed this for you
-#     n_results=2  # how many results to return
-# )
-#
-# print(results)
+from Llm.llm_endpoints import get_llm_response
+
+
+def summarize_conversation(conversation_history):
+    try:
+        summary_prompt = "Summarize the following conversation:\n" + "\n".join(
+            [f"User: {turn['user']}\nBot: {turn['bot']}" for turn in conversation_history])
+        summary = get_llm_response(summary_prompt)
+        print("*************************************************")
+        print(summary)
+        print("*************************************************")
+        return summary
+    except:
+        return ""
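Note on error handling: the bare except: turns any failure (a malformed turn dict, an LLM endpoint error) into a silent empty summary. A hedged hardening sketch, assuming the standard logging module is acceptable in this project:

    import logging

    from Llm.llm_endpoints import get_llm_response

    logger = logging.getLogger(__name__)

    def summarize_conversation(conversation_history):
        if not conversation_history:
            return ""
        summary_prompt = "Summarize the following conversation:\n" + "\n".join(
            f"User: {turn['user']}\nBot: {turn['bot']}" for turn in conversation_history
        )
        try:
            return get_llm_response(summary_prompt)
        except Exception:
            # Log the traceback instead of swallowing it silently.
            logger.exception("Conversation summarization failed")
            return ""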