Angel committed
Commit 8b78e1d · 2 Parent(s): e9d37a0 9132b79

Merge pull request #8 from Angel-dash/yt_rag

Data/yt_transcript.py CHANGED
@@ -3,7 +3,6 @@ from youtube_transcript_api import YouTubeTranscriptApi
  from Data.get_video_link import video_links_main
  import os
  from datetime import datetime
-
  transcripts = []

  import os
@@ -109,8 +108,3 @@ def all_video_transcript_pipeline():
  print(f"Total transcripts loaded: {len(video_transcripts)}")
  return video_transcripts

-
- # if __name__ == '__main__':
- # full_transcripts = all_video_transcript_pipeline()
- # print("this is full transcripts of all the youtube videos")
- # print(full_transcripts)
 
Rag/chunking.py CHANGED
@@ -1,13 +1,7 @@
- from langchain_google_genai import GoogleGenerativeAI, GoogleGenerativeAIEmbeddings
  from langchain.text_splitter import RecursiveCharacterTextSplitter
- from langchain_community.vectorstores import Chroma
  from langchain.chains import ConversationalRetrievalChain
  from langchain_community.document_loaders import TextLoader
  from langchain.schema import Document
- from langchain.memory import ConversationBufferMemory
- import google.generativeai as genai
- import os
- from typing import Dict, List
  import os
  import sys
  from Data.yt_transcript import all_video_transcript_pipeline
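
The split_text_to_chunks() helper that Rag/embeddings.py imports from this module is not shown in this diff. Purely as an illustration of the interface the new file relies on (Document objects exposing page_content and metadata), a hypothetical sketch using the imports that remain here might look like the following; the chunk sizes and the exact transcript format are assumptions.

def split_text_to_chunks(chunk_size=1000, chunk_overlap=100):
    # Hypothetical sketch; the actual implementation lives in Rag/chunking.py.
    transcripts = all_video_transcript_pipeline()
    splitter = RecursiveCharacterTextSplitter(chunk_size=chunk_size, chunk_overlap=chunk_overlap)
    # create_documents returns langchain Document objects, which is the shape
    # (page_content + metadata) that generate_embeddings and store_in_chroma expect.
    return splitter.create_documents([str(t) for t in transcripts])
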
Rag/embeddings.py ADDED
@@ -0,0 +1,92 @@
+ from langchain_google_genai import GoogleGenerativeAIEmbeddings
+ from langchain_chroma import Chroma
+ from Rag.chunking import split_text_to_chunks
+ from tqdm import tqdm
+ import numpy as np
+ from chromadb.config import Settings
+ import chromadb
+ from sentence_transformers import SentenceTransformer
+
+ all_chunks = split_text_to_chunks()
+
+
+ def generate_embeddings(splits, batch_size=32):
+     model = SentenceTransformer('all-MiniLM-L6-v2', device='cpu')
+     texts = [chunk.page_content for chunk in splits]
+     chunks_embeddings = []
+     with tqdm(total=len(texts), desc="Generating embeddings") as pbar:
+         for i in range(0, len(texts), batch_size):
+             batch = texts[i:i + batch_size]
+             batch_embeddings = model.encode(batch)
+             chunks_embeddings.extend(batch_embeddings)
+             pbar.update(len(batch))
+
+     return np.array(chunks_embeddings)
+
+
+ def store_in_chroma(chunks, embeddings):
+     # Initialize Chroma client
+     client = chromadb.Client(Settings(
+         persist_directory="db"  # This will store the database on disk
+     ))
+
+     # Create or get collection
+     collection = client.create_collection(
+         name="transcript_collection",
+         metadata={"description": "Video transcript embeddings"}
+     )
+
+     # Prepare data for insertion
+     ids = [str(i) for i in range(len(chunks))]
+     documents = [chunk.page_content for chunk in chunks]
+     metadatas = [chunk.metadata for chunk in chunks]
+
+     # Add data to collection
+     with tqdm(total=len(documents), desc="Storing in Chroma") as pbar:
+         # You might want to batch this too if dealing with very large datasets
+         collection.add(
+             ids=ids,
+             documents=documents,
+             embeddings=embeddings.tolist(),
+             metadatas=metadatas
+         )
+         pbar.update(len(documents))
+
+     return collection
+
+
+ def main():
+     # Get your chunks from your existing code
+     all_chunks = split_text_to_chunks()
+
+     print(f"Starting embedding generation for {len(all_chunks)} chunks...")
+
+     # Generate embeddings
+     embeddings = generate_embeddings(all_chunks)
+
+     print("Embeddings generated. Starting storage...")
+
+     # Store in ChromaDB
+     collection = store_in_chroma(all_chunks, embeddings)
+
+     print(f"Process complete. Collection contains {collection.count()} documents.")
+
+     return collection
+
+
+ if __name__ == "__main__":
+     main()
+
+
+ def store_embeddings_in_chroma(chunk_embeddings):
+     vector_db = Chroma(
+         collection_name='transcript_knowledge_base',
+         embedding_function=GoogleGenerativeAIEmbeddings(),
+     )
+     for chunk in chunk_embeddings:
+         vector_db.add_texts(chunk['text'], embeddings=chunk['embedding'])
+     return vector_db
+
+
+ transcripts_embeddings = generate_embeddings(all_chunks)
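
For context, here is a minimal retrieval sketch (not part of this commit) showing how the collection created by store_in_chroma could be queried. It assumes the same "db" persist directory, "transcript_collection" name, and 'all-MiniLM-L6-v2' model used above; the query text is only a placeholder.

import chromadb
from chromadb.config import Settings
from sentence_transformers import SentenceTransformer

# Reopen the client and collection written by store_in_chroma (assumed names).
client = chromadb.Client(Settings(persist_directory="db"))
collection = client.get_collection(name="transcript_collection")

# Embed the query with the same model used for the documents.
model = SentenceTransformer('all-MiniLM-L6-v2', device='cpu')
query_embedding = model.encode(["example: what topics does the first video cover?"]).tolist()

# Nearest-neighbour lookup over the stored transcript chunks.
results = collection.query(query_embeddings=query_embedding, n_results=3)
for doc in results["documents"][0]:
    print(doc[:200])
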
requirements.txt CHANGED
@@ -10,4 +10,6 @@ chromadb
  pypdf
  flask
  flask_cors
- chromadb
+ sentence_transformers
+ tqdm
+ torch