Nightwing11 committed on
Commit
6961452
Β·
1 Parent(s): 7fc2087

YouTube video source added

Browse files
.gitignore CHANGED
@@ -129,7 +129,11 @@ Rag/chromadb.db/
129
 
130
  # mkdocs documentation
131
  /site
132
-
 
 
 
 
133
  # mypy
134
  .mypy_cache/
135
  .dmypy.json
 
129
 
130
  # mkdocs documentation
131
  /site
132
+ __pycache__/
133
+ *.pyc
134
+ *.pyo
135
+ *.pyd
136
+ .env
137
  # mypy
138
  .mypy_cache/
139
  .dmypy.json
Dockerfile ADDED
@@ -0,0 +1,17 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
# Base image: slim official Python runtime.
FROM python:3.9-slim

# All subsequent commands run relative to /app inside the image.
WORKDIR /app

# Install dependencies first so Docker layer caching skips this step
# when only the application code changes.
COPY requirements.txt .
RUN pip install --no-cache-dir -r requirements.txt

# Bring in the rest of the application source.
COPY . .

# Launch the RAG package as a module.
CMD ["python", "-m", "Rag"]
Example/__init__.py ADDED
File without changes
Example/rag_example.py ADDED
@@ -0,0 +1,10 @@
 
 
 
 
 
 
 
 
 
 
 
1
"""Example entry point: run the RAG main workflow against a persistent ChromaDB store."""
import chromadb

from Rag.rag_pipeline import main_workflow

# NOTE(review): absolute, machine-specific paths — consider environment
# variables or CLI arguments before sharing this example.
transcripts_folder_path = '/home/nightwing/Codes/Xyzbot/Data/transcripts'
chromadb_path = "/home/nightwing/Codes/Xyzbot/Rag/chromadb.db"

# Open (or create) the on-disk vector store and the transcript collection.
client = chromadb.PersistentClient(path=chromadb_path)
collection = client.get_or_create_collection(name="yt_transcript_collection")

# Run the application
if __name__ == "__main__":
    main_workflow(transcripts_folder_path, collection)
Rag/{chunking.py β†’ rag_pipeline.py} RENAMED
@@ -3,18 +3,17 @@ from langchain.text_splitter import RecursiveCharacterTextSplitter
3
  from sentence_transformers import SentenceTransformer
4
  import google.generativeai as genai
5
  import os
6
- import json
7
  import logging
8
  from Llm.llm_endpoints import get_llm_response
9
- from Rag.summarization import summarize_conversation
10
- from Rag.corefrence import resolve_coreference_in_query
11
  # Configuration
12
  API_KEY = os.getenv("GOOGLE_API_KEY")
13
  if API_KEY:
14
  genai.configure(api_key=API_KEY)
15
 
16
  chromadb_path = "/home/nightwing/Codes/Xyzbot/Rag/chromadb.db"
17
- transcripts_folder_path = '/home/nightwing/Codes/Xyzbot/Data/transcripts'
18
  processed_files_path = "/home/nightwing/Codes/Xyzbot/Rag/Processed_folder/processed_files.json"
19
  embedding_model = SentenceTransformer('all-MiniLM-L6-v2')
20
 
@@ -88,17 +87,22 @@ def update_conversation_history(history, user_query, bot_response):
88
  return history
89
 
90
 
91
- def generate_response(conversation_history, query_text, retrieved_docs):
92
  """Generate a response using retrieved documents and the generative AI model."""
93
 
94
  context = " ".join(retrieved_docs)
95
  history_str = "\n".join([f"User: {turn['user']}\nBot: {turn['bot']}" for turn in conversation_history])
 
 
96
  prompt = f"""
97
  Using the context below and the conversation history, answer the question:
98
 
99
  Context:
100
  {context}
101
 
 
 
 
102
  Conversation History:
103
  {history_str}
104
 
@@ -106,7 +110,10 @@ def generate_response(conversation_history, query_text, retrieved_docs):
106
  """
107
 
108
  response = get_llm_response(prompt)
109
- return response
 
 
 
110
 
111
 
112
  # Main Workflow
@@ -128,20 +135,19 @@ def main_workflow(transcripts_folder_path, collection):
128
  print("Ending the conversation. Goodbye")
129
  break
130
  query_text_with_conversation_history = enhance_query_with_history(query_text, conversation_history)
131
- resolved_query = resolve_coreference_in_query(query_text_with_conversation_history, conversation_history)
132
- retrived_docs, metadatas = query_database(collection, resolved_query)
133
  print("-" * 50)
134
- print(metadatas)
 
135
  print("-" * 50)
136
  if not retrived_docs:
137
  print("No relevent documents is found")
138
  continue
139
- response = generate_response(conversation_history, query_text, retrived_docs)
140
  conversation_history = update_conversation_history(conversation_history, query_text, response)
141
  print("\nGenerated Response:")
142
  print(response)
143
 
144
 
145
- # Run the application
146
- if __name__ == "__main__":
147
- main_workflow(transcripts_folder_path, collection)
 
3
  from sentence_transformers import SentenceTransformer
4
  import google.generativeai as genai
5
  import os
 
6
  import logging
7
  from Llm.llm_endpoints import get_llm_response
8
+ from utils.get_link import get_source_link
9
+ # from Rag.corefrence import resolve_coreference_in_query
10
  # Configuration
11
  API_KEY = os.getenv("GOOGLE_API_KEY")
12
  if API_KEY:
13
  genai.configure(api_key=API_KEY)
14
 
15
  chromadb_path = "/home/nightwing/Codes/Xyzbot/Rag/chromadb.db"
16
+ # transcripts_folder_path = '/home/nightwing/Codes/Xyzbot/Data/transcripts'
17
  processed_files_path = "/home/nightwing/Codes/Xyzbot/Rag/Processed_folder/processed_files.json"
18
  embedding_model = SentenceTransformer('all-MiniLM-L6-v2')
19
 
 
87
  return history
88
 
89
 
90
+ def generate_response(conversation_history, query_text, retrieved_docs, source_links):
91
  """Generate a response using retrieved documents and the generative AI model."""
92
 
93
  context = " ".join(retrieved_docs)
94
  history_str = "\n".join([f"User: {turn['user']}\nBot: {turn['bot']}" for turn in conversation_history])
95
+ sources_str = "\n".join(source_links)
96
+
97
  prompt = f"""
98
  Using the context below and the conversation history, answer the question:
99
 
100
  Context:
101
  {context}
102
 
103
+ Conversation Sources:
104
+ {sources_str}
105
+
106
  Conversation History:
107
  {history_str}
108
 
 
110
  """
111
 
112
  response = get_llm_response(prompt)
113
+
114
+ # Append sources to the response
115
+ full_response = f"{response}\n\nSources:\n{sources_str}"
116
+ return full_response
117
 
118
 
119
  # Main Workflow
 
135
  print("Ending the conversation. Goodbye")
136
  break
137
  query_text_with_conversation_history = enhance_query_with_history(query_text, conversation_history)
138
+ # resolved_query = resolve_coreference_in_query(query_text_with_conversation_history, conversation_history)
139
+ retrived_docs, metadatas = query_database(collection, query_text_with_conversation_history)
140
  print("-" * 50)
141
+ source_link = get_source_link(metadatas)
142
+ print(source_link)
143
  print("-" * 50)
144
  if not retrived_docs:
145
  print("No relevent documents is found")
146
  continue
147
+ response = generate_response(conversation_history, query_text, retrived_docs, source_link)
148
  conversation_history = update_conversation_history(conversation_history, query_text, response)
149
  print("\nGenerated Response:")
150
  print(response)
151
 
152
 
153
+
 
 
requirements.txt CHANGED
@@ -13,3 +13,4 @@ flask_cors
13
  sentence_transformers
14
  tqdm
15
  torch
 
 
13
  sentence_transformers
14
  tqdm
15
  torch
16
+ transformers
utils/__init__.py ADDED
File without changes
{Rag β†’ utils}/corefrence.py RENAMED
@@ -1,6 +1,6 @@
1
  from transformers import pipeline
2
 
3
- coref_pipeline = pipeline("coref-resolution", model="coref-spanbert-large")
4
 
5
 
6
  def resolve_coreference_in_query(query_text, conversation_history):
 
1
  from transformers import pipeline
2
 
3
+ coref_pipeline = pipeline("coref-resolution", model="coref-roberta-large")
4
 
5
 
6
  def resolve_coreference_in_query(query_text, conversation_history):
utils/get_link.py ADDED
@@ -0,0 +1,11 @@
 
 
 
 
 
 
 
 
 
 
 
 
1
def get_source_link(metadatas):
    """Build YouTube watch URLs from retrieval metadata.

    Each metadata dict is expected to carry a 'source' key holding a
    transcript filename of the form '<video_id>.txt' — TODO confirm
    against the code that writes the collection metadata.

    Args:
        metadatas: iterable of dicts, each with a 'source' filename entry.

    Returns:
        list[str]: one 'https://www.youtube.com/watch?v=<id>' URL per entry.
    """
    base_url = 'https://www.youtube.com/watch?v='
    yt_links = []
    for metadata in metadatas:
        # Drop the '.txt' suffix to recover the raw video id.
        video_id = metadata['source'].split('.txt')[0]
        # Bug fix: build each URL from the constant base. The original
        # reused and mutated a single `link` variable across iterations,
        # so every URL after the first had all previous ids concatenated.
        yt_links.append(base_url + video_id)
    return yt_links
{Rag β†’ utils}/summarization.py RENAMED
File without changes