Spaces:
Sleeping
Sleeping
Commit
·
6961452
1
Parent(s):
7fc2087
Yt video source added
Browse files- .gitignore +5 -1
- Dockerfile +17 -0
- Example/__init__.py +0 -0
- Example/rag_example.py +10 -0
- Rag/{chunking.py → rag_pipeline.py} +19 -13
- requirements.txt +1 -0
- utils/__init__.py +0 -0
- {Rag → utils}/corefrence.py +1 -1
- utils/get_link.py +11 -0
- {Rag → utils}/summarization.py +0 -0
.gitignore
CHANGED
@@ -129,7 +129,11 @@ Rag/chromadb.db/
|
|
129 |
|
130 |
# mkdocs documentation
|
131 |
/site
|
132 |
-
|
|
|
|
|
|
|
|
|
133 |
# mypy
|
134 |
.mypy_cache/
|
135 |
.dmypy.json
|
|
|
129 |
|
130 |
# mkdocs documentation
|
131 |
/site
|
132 |
+
__pycache__/
|
133 |
+
*.pyc
|
134 |
+
*.pyo
|
135 |
+
*.pyd
|
136 |
+
.env
|
137 |
# mypy
|
138 |
.mypy_cache/
|
139 |
.dmypy.json
|
Dockerfile
ADDED
@@ -0,0 +1,17 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
# Use an official Python runtime as a base image
FROM python:3.9-slim

# Set the working directory in the container
WORKDIR /app

# Copy the requirements file into the container first, so the dependency
# layer below is cached and only rebuilt when requirements.txt changes
COPY requirements.txt .

# Install dependencies (--no-cache-dir keeps the image smaller)
RUN pip install --no-cache-dir -r requirements.txt

# Copy the rest of your application
COPY . .

# Command to run your application (executes the Rag package as a module)
CMD ["python", "-m", "Rag"]
Example/__init__.py
ADDED
File without changes
|
Example/rag_example.py
ADDED
@@ -0,0 +1,10 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
"""Minimal example: run the RAG workflow against a persistent ChromaDB store.

Opens (or creates) the YouTube-transcript collection and hands it, together
with the transcripts folder, to ``main_workflow``.
"""
import chromadb

from Rag.rag_pipeline import main_workflow

# NOTE(review): absolute, machine-specific paths — consider reading these
# from environment variables or a config file before sharing this example.
transcripts_folder_path = '/home/nightwing/Codes/Xyzbot/Data/transcripts'
chromadb_path = "/home/nightwing/Codes/Xyzbot/Rag/chromadb.db"

# Persistent client so stored embeddings survive across runs.
client = chromadb.PersistentClient(path=chromadb_path)
collection = client.get_or_create_collection(name="yt_transcript_collection")

# Run the application
if __name__ == "__main__":
    main_workflow(transcripts_folder_path, collection)
Rag/{chunking.py → rag_pipeline.py}
RENAMED
@@ -3,18 +3,17 @@ from langchain.text_splitter import RecursiveCharacterTextSplitter
|
|
3 |
from sentence_transformers import SentenceTransformer
|
4 |
import google.generativeai as genai
|
5 |
import os
|
6 |
-
import json
|
7 |
import logging
|
8 |
from Llm.llm_endpoints import get_llm_response
|
9 |
-
from
|
10 |
-
from Rag.corefrence import resolve_coreference_in_query
|
11 |
# Configuration
|
12 |
API_KEY = os.getenv("GOOGLE_API_KEY")
|
13 |
if API_KEY:
|
14 |
genai.configure(api_key=API_KEY)
|
15 |
|
16 |
chromadb_path = "/home/nightwing/Codes/Xyzbot/Rag/chromadb.db"
|
17 |
-
transcripts_folder_path = '/home/nightwing/Codes/Xyzbot/Data/transcripts'
|
18 |
processed_files_path = "/home/nightwing/Codes/Xyzbot/Rag/Processed_folder/processed_files.json"
|
19 |
embedding_model = SentenceTransformer('all-MiniLM-L6-v2')
|
20 |
|
@@ -88,17 +87,22 @@ def update_conversation_history(history, user_query, bot_response):
|
|
88 |
return history
|
89 |
|
90 |
|
91 |
-
def generate_response(conversation_history, query_text, retrieved_docs):
|
92 |
"""Generate a response using retrieved documents and the generative AI model."""
|
93 |
|
94 |
context = " ".join(retrieved_docs)
|
95 |
history_str = "\n".join([f"User: {turn['user']}\nBot: {turn['bot']}" for turn in conversation_history])
|
|
|
|
|
96 |
prompt = f"""
|
97 |
Using the context below and the conversation history, answer the question:
|
98 |
|
99 |
Context:
|
100 |
{context}
|
101 |
|
|
|
|
|
|
|
102 |
Conversation History:
|
103 |
{history_str}
|
104 |
|
@@ -106,7 +110,10 @@ def generate_response(conversation_history, query_text, retrieved_docs):
|
|
106 |
"""
|
107 |
|
108 |
response = get_llm_response(prompt)
|
109 |
-
|
|
|
|
|
|
|
110 |
|
111 |
|
112 |
# Main Workflow
|
@@ -128,20 +135,19 @@ def main_workflow(transcripts_folder_path, collection):
|
|
128 |
print("Ending the conversation. Goodbye")
|
129 |
break
|
130 |
query_text_with_conversation_history = enhance_query_with_history(query_text, conversation_history)
|
131 |
-
resolved_query = resolve_coreference_in_query(query_text_with_conversation_history, conversation_history)
|
132 |
-
retrived_docs, metadatas = query_database(collection,
|
133 |
print("-" * 50)
|
134 |
-
|
|
|
135 |
print("-" * 50)
|
136 |
if not retrived_docs:
|
137 |
print("No relevent documents is found")
|
138 |
continue
|
139 |
-
response = generate_response(conversation_history, query_text, retrived_docs)
|
140 |
conversation_history = update_conversation_history(conversation_history, query_text, response)
|
141 |
print("\nGenerated Response:")
|
142 |
print(response)
|
143 |
|
144 |
|
145 |
-
|
146 |
-
if __name__ == "__main__":
|
147 |
-
main_workflow(transcripts_folder_path, collection)
|
|
|
3 |
from sentence_transformers import SentenceTransformer
|
4 |
import google.generativeai as genai
|
5 |
import os
|
|
|
6 |
import logging
|
7 |
from Llm.llm_endpoints import get_llm_response
|
8 |
+
from utils.get_link import get_source_link
|
9 |
+
# from Rag.corefrence import resolve_coreference_in_query
|
10 |
# Configuration
|
11 |
API_KEY = os.getenv("GOOGLE_API_KEY")
|
12 |
if API_KEY:
|
13 |
genai.configure(api_key=API_KEY)
|
14 |
|
15 |
chromadb_path = "/home/nightwing/Codes/Xyzbot/Rag/chromadb.db"
|
16 |
+
# transcripts_folder_path = '/home/nightwing/Codes/Xyzbot/Data/transcripts'
|
17 |
processed_files_path = "/home/nightwing/Codes/Xyzbot/Rag/Processed_folder/processed_files.json"
|
18 |
embedding_model = SentenceTransformer('all-MiniLM-L6-v2')
|
19 |
|
|
|
87 |
return history
|
88 |
|
89 |
|
90 |
+
def generate_response(conversation_history, query_text, retrieved_docs, source_links):
|
91 |
"""Generate a response using retrieved documents and the generative AI model."""
|
92 |
|
93 |
context = " ".join(retrieved_docs)
|
94 |
history_str = "\n".join([f"User: {turn['user']}\nBot: {turn['bot']}" for turn in conversation_history])
|
95 |
+
sources_str = "\n".join(source_links)
|
96 |
+
|
97 |
prompt = f"""
|
98 |
Using the context below and the conversation history, answer the question:
|
99 |
|
100 |
Context:
|
101 |
{context}
|
102 |
|
103 |
+
Conversation Sources:
|
104 |
+
{sources_str}
|
105 |
+
|
106 |
Conversation History:
|
107 |
{history_str}
|
108 |
|
|
|
110 |
"""
|
111 |
|
112 |
response = get_llm_response(prompt)
|
113 |
+
|
114 |
+
# Append sources to the response
|
115 |
+
full_response = f"{response}\n\nSources:\n{sources_str}"
|
116 |
+
return full_response
|
117 |
|
118 |
|
119 |
# Main Workflow
|
|
|
135 |
print("Ending the conversation. Goodbye")
|
136 |
break
|
137 |
query_text_with_conversation_history = enhance_query_with_history(query_text, conversation_history)
|
138 |
+
# resolved_query = resolve_coreference_in_query(query_text_with_conversation_history, conversation_history)
|
139 |
+
retrived_docs, metadatas = query_database(collection, query_text_with_conversation_history)
|
140 |
print("-" * 50)
|
141 |
+
source_link = get_source_link(metadatas)
|
142 |
+
print(source_link)
|
143 |
print("-" * 50)
|
144 |
if not retrived_docs:
|
145 |
print("No relevent documents is found")
|
146 |
continue
|
147 |
+
response = generate_response(conversation_history, query_text, retrived_docs, source_link)
|
148 |
conversation_history = update_conversation_history(conversation_history, query_text, response)
|
149 |
print("\nGenerated Response:")
|
150 |
print(response)
|
151 |
|
152 |
|
153 |
+
|
|
|
|
requirements.txt
CHANGED
@@ -13,3 +13,4 @@ flask_cors
|
|
13 |
sentence_transformers
|
14 |
tqdm
|
15 |
torch
|
|
|
|
13 |
sentence_transformers
|
14 |
tqdm
|
15 |
torch
|
16 |
+
transformers
|
utils/__init__.py
ADDED
File without changes
|
{Rag → utils}/corefrence.py
RENAMED
@@ -1,6 +1,6 @@
|
|
1 |
from transformers import pipeline
|
2 |
|
3 |
-
coref_pipeline = pipeline("coref-resolution", model="coref-
|
4 |
|
5 |
|
6 |
def resolve_coreference_in_query(query_text, conversation_history):
|
|
|
1 |
from transformers import pipeline
|
2 |
|
3 |
+
coref_pipeline = pipeline("coref-resolution", model="coref-roberta-large")
|
4 |
|
5 |
|
6 |
def resolve_coreference_in_query(query_text, conversation_history):
|
utils/get_link.py
ADDED
@@ -0,0 +1,11 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def get_source_link(metadatas):
    """Build YouTube watch URLs from transcript metadata.

    Each metadata dict is expected to carry a ``'source'`` key holding the
    transcript filename of the form ``<video_id>.txt``; the video id is
    everything before the ``.txt`` suffix.

    Args:
        metadatas: iterable of dicts, each with a ``'source'`` filename.

    Returns:
        list[str]: one full YouTube link per metadata entry.
    """
    base_url = 'https://www.youtube.com/watch?v='
    yt_links = []
    for metadata in metadatas:
        # Everything before '.txt' is the raw video id.
        video_id = metadata['source'].split('.txt')[0]
        # Bug fix: build each URL from the constant base instead of reusing
        # and mutating a shared accumulator — the original appended every
        # earlier video id into each subsequent link.
        yt_links.append(base_url + video_id)
    return yt_links
|
{Rag → utils}/summarization.py
RENAMED
File without changes
|