Commit d6feee3: Simple workflow complete
Parent(s): 7d9257e
Data/Data/transcripts/3-ukCGQJk2c_20250121201324.txt (ADDED)
The diff for this file is too large to render; see the raw diff.
Rag/chunking.py (CHANGED)

@@ -1,10 +1,106 @@
 import chromadb
 from langchain.text_splitter import RecursiveCharacterTextSplitter
-
 from sentence_transformers import SentenceTransformer
 import google.generativeai as genai
 import os
 import json
 import logging
-from dotenv import load_dotenv
 from LLM.llm_endpoints import get_llm_response
+# Configuration
+API_KEY = os.getenv("GOOGLE_API_KEY")
+if API_KEY:
+    genai.configure(api_key=API_KEY)
+
+chromadb_path = "/home/nightwing/Codes/Xyzbot/Rag/chromadb.db"
+transcripts_folder_path = '/home/nightwing/Codes/Xyzbot/Data/transcripts'
+processed_files_path = "/home/nightwing/Codes/Xyzbot/Rag/Processed_folder/processed_files.json"
+embedding_model = SentenceTransformer('all-MiniLM-L6-v2')
+
+client = chromadb.PersistentClient(path=chromadb_path)
+collection = client.get_or_create_collection(name="yt_transcript_collection")
+
+# Logging
+logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(message)s')
+
+
+# Helper Functions
+def split_text_to_chunks(docs, chunk_size=1000, chunk_overlap=200):
+    """Split text into manageable chunks."""
+    text_splitter = RecursiveCharacterTextSplitter(chunk_size=chunk_size, chunk_overlap=chunk_overlap)
+    chunks = text_splitter.split_text(docs)
+    return chunks
+
+
+def get_new_files(transcripts_folder_path, collection):
+    """Find new transcript files that haven't been processed yet."""
+    all_files = [f for f in os.listdir(transcripts_folder_path) if f.endswith(".txt")]
+    existing_files = [meta["source"] for meta in collection.get()['metadatas']]
+    return [f for f in all_files if f not in existing_files]
+
+
+def process_and_add_new_files(transcripts_folder_path, collection):
+    """Process and add new transcript files to the vector database."""
+    new_files = get_new_files(transcripts_folder_path, collection)
+    if not new_files:
+        return False
+
+    for new_file in new_files:
+        file_path = os.path.join(transcripts_folder_path, new_file)
+        with open(file_path, 'r') as f:
+            content = f.read()
+
+        chunks = split_text_to_chunks(content)
+        embeddings = embedding_model.encode(chunks).tolist()
+
+        ids = [f"{new_file}_chunk_{i}" for i in range(len(chunks))]
+        metadata = [{"source": new_file} for _ in range(len(chunks))]
+        collection.upsert(documents=chunks, embeddings=embeddings, metadatas=metadata, ids=ids)
+
+        logging.info(f"Added {new_file} to the database")
+    return True
+
+
+def query_database(collection, query_text, n_results=3):
+    """Retrieve the most relevant chunks for the query."""
+    query_embeddings = embedding_model.encode(query_text).tolist()
+    results = collection.query(query_embeddings=query_embeddings, n_results=n_results)
+    retrieved_docs = results['documents'][0]
+    metadatas = results['metadatas'][0]
+    return retrieved_docs, metadatas
+
+
+def generate_response(query_text, retrieved_docs):
+    """Generate a response using retrieved documents and the generative AI model."""
+    context = " ".join(retrieved_docs)
+    prompt = f"Using the context below, answer the question:\n\nContext:\n{context}\n\nQuestion: {query_text}"
+    response = get_llm_response(prompt)
+    return response
+
+
+# Main Workflow
+def main_workflow(transcripts_folder_path, collection):
+    """Run the full RAG workflow."""
+    # Process new files
+    new_files_added = process_and_add_new_files(transcripts_folder_path, collection)
+    if new_files_added:
+        logging.info("New transcripts added to the database.")
+    else:
+        logging.info("No new files found. Using existing database.")
+
+    # User query
+    query_text = input("Enter your query: ")
+    retrieved_docs, metadatas = query_database(collection, query_text)
+
+    if not retrieved_docs:
+        print("No relevant documents found.")
+        return
+
+    # Generate response
+    response = generate_response(query_text, retrieved_docs)
+    print("\nGenerated Response:")
+    print(response)
+
+
+# Run the application
+if __name__ == "__main__":
+    main_workflow(transcripts_folder_path, collection)
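
Note: since the commit removes the dotenv import, GOOGLE_API_KEY is now read straight from the process environment and must be exported in the shell (export GOOGLE_API_KEY=...) before the script starts. Also, processed_files_path is defined but never referenced; new-file detection instead compares the folder listing against the "source" metadata already stored in the collection.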
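For a quick, non-interactive check of the new retrieval path, something like the sketch below could be run from the repo root. This is a minimal sketch, not part of the commit: it assumes the repo root is on PYTHONPATH (so both the Rag and LLM packages import), that the hard-coded ChromaDB path is usable, and that at least one transcript has been ingested; the file name sanity_check.py and the query string are hypothetical.

    # sanity_check.py (hypothetical) - exercise retrieval without the input() prompt
    from Rag.chunking import collection, query_database, generate_response

    query = "What is this video about?"  # hypothetical query text
    docs, metas = query_database(collection, query, n_results=3)
    for doc, meta in zip(docs, metas):
        print(f"[{meta['source']}] {doc[:80]}")  # preview each retrieved chunk
    if docs:
        print(generate_response(query, docs))

Because query_database encodes a single string, the flat embedding is cast by ChromaDB to a one-query batch, which is why the function indexes results['documents'][0] and results['metadatas'][0].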