Nightwing11 committed on
Commit
d6feee3
·
1 Parent(s): 7d9257e

Simple workflow complete

Browse files
Data/Data/transcripts/3-ukCGQJk2c_20250121201324.txt ADDED
The diff for this file is too large to render. See raw diff
 
Rag/chunking.py CHANGED
@@ -1,10 +1,106 @@
1
  import chromadb
2
  from langchain.text_splitter import RecursiveCharacterTextSplitter
3
-
4
  from sentence_transformers import SentenceTransformer
5
  import google.generativeai as genai
6
  import os
7
  import json
8
  import logging
9
- from dotenv import load_dotenv
10
  from LLM.llm_endpoints import get_llm_response
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
  import chromadb
2
  from langchain.text_splitter import RecursiveCharacterTextSplitter
 
3
  from sentence_transformers import SentenceTransformer
4
  import google.generativeai as genai
5
  import os
6
  import json
7
  import logging
 
8
  from LLM.llm_endpoints import get_llm_response
9
# Logging is configured first so any startup warnings use the intended
# format (calling logging.warning() before basicConfig would implicitly
# configure the root logger and make this call a no-op).
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(message)s')

# Configuration: the Google generative AI key is optional at import time,
# but downstream LLM calls will fail without it, so warn instead of
# failing silently.
API_KEY = os.getenv("GOOGLE_API_KEY")
if API_KEY:
    genai.configure(api_key=API_KEY)
else:
    logging.warning("GOOGLE_API_KEY is not set; generative AI requests will fail.")

# Paths may be overridden via environment variables; the defaults keep the
# original hard-coded locations for backward compatibility.
chromadb_path = os.getenv("CHROMADB_PATH", "/home/nightwing/Codes/Xyzbot/Rag/chromadb.db")
transcripts_folder_path = os.getenv("TRANSCRIPTS_FOLDER", '/home/nightwing/Codes/Xyzbot/Data/transcripts')
processed_files_path = os.getenv("PROCESSED_FILES_PATH", "/home/nightwing/Codes/Xyzbot/Rag/Processed_folder/processed_files.json")

# Sentence embedding model shared by document indexing and query encoding.
embedding_model = SentenceTransformer('all-MiniLM-L6-v2')

# Persistent Chroma vector store; the collection is created on first run.
client = chromadb.PersistentClient(path=chromadb_path)
collection = client.get_or_create_collection(name="yt_transcript_collection")
24
+
25
+
26
# Helper Functions
def split_text_to_chunks(docs, chunk_size=1000, chunk_overlap=200):
    """Break a document string into overlapping chunks for embedding.

    Args:
        docs: Full text to split.
        chunk_size: Maximum characters per chunk.
        chunk_overlap: Characters shared between consecutive chunks.

    Returns:
        A list of chunk strings.
    """
    splitter = RecursiveCharacterTextSplitter(
        chunk_size=chunk_size,
        chunk_overlap=chunk_overlap,
    )
    return splitter.split_text(docs)
32
+
33
+
34
def get_new_files(transcripts_folder_path, collection):
    """Return transcript filenames not yet indexed in the collection.

    Args:
        transcripts_folder_path: Directory containing ``.txt`` transcripts.
        collection: Chroma collection whose metadatas carry a ``source`` key.

    Returns:
        List of ``.txt`` filenames with no corresponding chunk metadata.
    """
    all_files = [f for f in os.listdir(transcripts_folder_path) if f.endswith(".txt")]
    # Build a set once for O(1) membership tests instead of scanning a
    # list per file; tolerate a missing/None metadatas entry (empty store).
    metadatas = collection.get().get('metadatas') or []
    existing_files = {meta["source"] for meta in metadatas if meta}
    return [f for f in all_files if f not in existing_files]
39
+
40
+
41
def process_and_add_new_files(transcripts_folder_path, collection):
    """Embed and upsert any transcript files not yet in the vector store.

    Args:
        transcripts_folder_path: Directory containing ``.txt`` transcripts.
        collection: Chroma collection receiving chunk embeddings.

    Returns:
        True if at least one new file was found, False otherwise.
    """
    new_files = get_new_files(transcripts_folder_path, collection)
    if not new_files:
        return False

    for new_file in new_files:
        file_path = os.path.join(transcripts_folder_path, new_file)
        # Read explicitly as UTF-8 rather than relying on the
        # platform-default encoding, which varies across systems.
        with open(file_path, 'r', encoding='utf-8') as f:
            content = f.read()

        chunks = split_text_to_chunks(content)
        if not chunks:
            # An empty transcript yields no chunks; skip the upsert.
            logging.info(f"Skipped empty transcript {new_file}")
            continue
        embeddings = embedding_model.encode(chunks).tolist()

        # Deterministic per-chunk ids make re-runs upsert instead of duplicate.
        ids = [f"{new_file}_chunk_{i}" for i in range(len(chunks))]
        metadata = [{"source": new_file} for _ in range(len(chunks))]
        collection.upsert(documents=chunks, embeddings=embeddings, metadatas=metadata, ids=ids)

        logging.info(f"Added {new_file} to the database")
    return True
61
+
62
+
63
def query_database(collection, query_text, n_results=3):
    """Fetch the stored chunks most similar to the query.

    Args:
        collection: Chroma collection to search.
        query_text: Natural-language query string.
        n_results: Maximum number of chunks to retrieve.

    Returns:
        Tuple of (retrieved document strings, their metadata dicts).
    """
    embedded_query = embedding_model.encode(query_text).tolist()
    results = collection.query(query_embeddings=embedded_query, n_results=n_results)
    # Chroma returns one result list per query; take the first (only) one.
    docs = results['documents'][0]
    metas = results['metadatas'][0]
    return docs, metas
70
+
71
+
72
def generate_response(query_text, retrieved_docs):
    """Answer the query with the LLM, grounded in the retrieved chunks.

    Args:
        query_text: The user's question.
        retrieved_docs: Chunk strings supplied as context.

    Returns:
        The LLM's response text.
    """
    context = " ".join(retrieved_docs)
    prompt = (
        f"Using the context below, answer the question:\n\n"
        f"Context:\n{context}\n\nQuestion: {query_text}"
    )
    return get_llm_response(prompt)
78
+
79
+
80
# Main Workflow
def main_workflow(transcripts_folder_path, collection):
    """Run the full RAG workflow."""
    # Index any transcripts that are not yet in the vector store.
    if process_and_add_new_files(transcripts_folder_path, collection):
        logging.info("New transcripts added to the database.")
    else:
        logging.info("No new files found. Using existing database.")

    # Prompt the user, then retrieve the most relevant chunks.
    query_text = input("Enter your query: ")
    retrieved_docs, metadatas = query_database(collection, query_text)

    if not retrieved_docs:
        print("No relevant documents found.")
        return

    # Ground the LLM's answer in the retrieved context.
    answer = generate_response(query_text, retrieved_docs)
    print("\nGenerated Response:")
    print(answer)
102
+
103
+
104
# Run the application
if __name__ == "__main__":
    # Entry point: uses the module-level transcripts folder and collection.
    main_workflow(transcripts_folder_path, collection)