Nightwing11 commited on
Commit
59eb93e
·
1 Parent(s): 71e0395

Delete Rag

Browse files
Rag/Processed_folder/processed_files.json DELETED
@@ -1 +0,0 @@
1
- ["VOfwbcveP84_20241225194621.txt", "In9Bq4EJMZw_20241225194705.txt", "DkS1pkKpILY_20241225194325.txt", "ajneRM-ET1Q_20241225194311.txt", "K4Ze-Sp6aUE_20241225194709.txt", "n28W4AmvMDE_20241225194626.txt", "UIy-WQCZd4M_20241225194819.txt", "etbfLTHD_VU_20241225194439.txt", "PVmQOLYckKQ_20241225194814.txt", "F9KrZd_-ge0_20241225194812.txt", "xjEFo3a1AnI_20241225194539.txt", "szqPAPKE5tQ_20241225194712.txt", "3_auLYOilb8_20241225194826.txt", "acgz0C-z-gc_20241225194817.txt", "zVCaYyUWWSw_20241225194412.txt", "doupx8SAs5Y_20241225194603.txt", "wAZn9dF3XTo_20241225194423.txt", "2XGREPnlI8U_20241225194659.txt", "UNCwdFxPtE8_20241225194521.txt", "at37Y8rKDlA_20241225194513.txt", "oL3SkPV1_Ik_20241225194837.txt", "nOgypsWKjm4_20241225194440.txt", "rW9QKc-iFoY_20241225194751.txt", "CQlTmOFM4Qs_20241225194550.txt", "tR73Ny4Dt9s_20241225194413.txt", "t1F7EEGPQwo_20241225194649.txt", "ccrbE0QHy94_20241225194608.txt", "SyWC8ZFVxGo_20241225194333.txt", "zlc4VrDx_qk_20241225194800.txt", "8qaBpM73NSk_20241225194409.txt", "sxgCC4H1dl8_20241225194524.txt", "RBK5KLA5Jjg_20241225194446.txt", "slUCmZJDXrk_20241225194627.txt", "h2aWYjSA1Jc_20241225194702.txt", "Ov4yyK15-K8_20241225194230.txt", "juD99_sPWGU_20241225194340.txt", "q1Ss8sTbFBY_20241225194647.txt", "X8Hw8zeCDTA_20241225194518.txt", "UChhXiFPRgg_20241225194443.txt", "pq6WHJzOkno_20241225194415.txt", "2Ds1m5gflCI_20241225194849.txt", "jGZ1mR9uLU0_20241225194808.txt", "VAEzZeaV5zM_20241225194347.txt", "EhlIkzJwPlk_20241225194656.txt", "HiyzzcuaAac_20241225194255.txt", "C3X0bUAiluE_20241225194259.txt", "kG5Qb9sr0YQ_20241225194810.txt", "wRsX_ZkzxvQ_20241225194619.txt", "U2BPitASUh0_20241225194358.txt", "Wcs2PFz5q6g_20241225194327.txt", "CuzL1qxUyHw_20241225194312.txt", "q37ARYnRDGc_20241225194623.txt", "cp9GXl9Qk_s_20241225194735.txt", "XT_6Lvkhxvo_20241225194342.txt", "bUr_9fgfnto_20241225194256.txt", "LTGGyQS1fZE_20241225194305.txt", "mAlt_HKX4as_20241225194420.txt", "SZSRgyl7pyQ_20241225194418.txt", 
"RI112zW8GDw_20241225194356.txt", "ycOBZZeVeAc_20241225194707.txt", "6YLdlK2hYnw_20241225194328.txt", "p4ZfkezDTXQ_20241225194615.txt", "LVxL_p_kToc_20241225194558.txt", "HXzTbCEqCJc_20241225194710.txt", "yOoVz9E9kfQ_20241225194901.txt", "C5KpIXjpzdY_20241225194400.txt", "__RAXBLt1iM_20241225194430.txt", "8N7mdkrXgbc_20241225194338.txt", "JnlSDaBjCCU_20241225194450.txt", "IOl28gj_RXw_20241225194431.txt", "Nr5xb-QCBGA_20241225194354.txt", "GzvzWO0NU50_20241225194605.txt", "DtmwtjOoSYU_20241225194633.txt", "CrtR12PBKb0_20241225194632.txt", "gMRph_BvHB4_20241225194516.txt", "QpoaNklmRPc_20241225194248.txt", "9tRohh0gErM_20241225194353.txt", "Xu1FMCxoEFc_20241225194346.txt", "15R2pMqU2ok_20241225194406.txt", "eIxVfln02Ss_20241225194335.txt", "0Dtt95_xabw_20241225194252.txt", "3ZGItIAUQmI_20241225194719.txt", "uxZFl4BDOGk_20241225194757.txt", "hvPGfcAgk9Y_20241225194754.txt", "HYVeP4F0GNU_20241225194559.txt", "z5W74QC3v2I_20241225194308.txt", "31wjVhCcI5Y_20241225194426.txt", "BMTt8gSl13s_20241225194836.txt", "aQDOU3hPci0_20241225194501.txt", "tkH2-_jMCSk_20241225194543.txt", "ntfcfJ28eiU_20241225194522.txt", "S8nPJU9xkNw_20241225194748.txt", "fcxjwA4C4Cw_20241225194553.txt", "iMvtHqLmEkI_20241225194855.txt", "099hgtRoUZw_20241225194436.txt", "4RFEkGKKhdE_20241225194907.txt", "eJU6Df_ffAE_20241225194635.txt", "nqNEtdHVUjM_20241225194437.txt", "1SXDXdngX2M_20241225194316.txt", "X4QE6t-MkYE_20241225194642.txt", "79p1X_7rAMo_20241225194630.txt", "6RZbGrq9BxE_20241225194306.txt", "pkJi9Raxikg_20241225194824.txt", "QbMxDZeB8Ks_20241225194247.txt", "RgAcOqVRfYA_20241225194657.txt", "ncSoor2Iw8k_20241225194833.txt", "i_DEPeCKxs8_20241225194235.txt", "FE0lTEUa7EY_20241225194753.txt", "gE0_8AjTFaM_20241225194852.txt", "kgr22uMsJ5o_20241225194317.txt", "ufsIA5NARIo_20241225194535.txt", "CyDLbrZK75U_20241225194434.txt", "7TkGDj4LaOU_20241225194244.txt", "XLr2RKoD-oY_20241225194738.txt", "yb5zpo5WDG4_20241225194645.txt", "a9yFKPmPZ90_20241225194556.txt", 
"TG8VM5-CTfw_20241225194636.txt", "eMqWH3LYiII_20241225194351.txt", "CVh3_8e5u8I_20241225194246.txt", "SuR0DaYoe0Y_20241225194302.txt", "FLxIoNguGRU_20241225194233.txt", "GA89kjVY6Ik_20241225194854.txt", "qJ3uV7coZbA_20241225194453.txt", "EQ3GjpGq5Y8_20241225194405.txt", "yOJvm_ri_hk_20241225194555.txt", "cwakOgHIT0E_20241225194421.txt", "DTCmprPCDqc_20241225194733.txt", "qPKd99Pa2iU_20241225194500.txt", "nm1TxQj9IsQ_20241225194611.txt", "LRM5LutB538_20241225194857.txt", "xTtM2AvCRyA_20241225194643.txt", "62lVH-6xYGY_20241225194250.txt", "Rxmv7rT9leo_20241225194417.txt", "ulHrUVV3Kq4_20241225194452.txt", "bGixnNGvSkg_20241225194231.txt", "1CxJVdeyltw_20241225194614.txt", "wgUjIRtote8_20241225194726.txt", "qPKd99Pa2iU_20241225194503.txt", "S_SrHS8FvMM_20241225194807.txt", "xX6hiEmDmxs_20241225194227.txt", "uXs-zPc63kM_20241225194449.txt", "4AwyVTHEU3s_20241225194904.txt", "xaE9XyMMAHY_20241225194848.txt", "hFL6qRIJZ_Y_20241225194428.txt", "FOi5s3OUogo_20241225194245.txt", "cS7cNaBrkxo_20241225194624.txt", "kpTJqwIfHcM_20241225194654.txt", "yixIc1Ai6jM_20241225194829.txt", "vfRtLI6cJrk_20241225194324.txt", "GLgKkG44MGo_20241225194729.txt", "KPlJcD-o-4Q_20241225194617.txt", "AtChcxeaukQ_20241225194646.txt", "tLS6t3FVOTI_20241225194714.txt", "GqPGXG5TlZw_20241225194541.txt", "UF0nqolsNZc_20241225194727.txt", "7R3-3HR6-u4_20241225194519.txt", "tLRCS48Ens4_20241225194447.txt", "V0Sdgn0_kFM_20241225194740.txt", "G1VUSu6sGoU_20241225194251.txt", "m_OazsImOiI_20241225194322.txt", "Og56hmAspV8_20241225194258.txt", "dFR_wFN23ZY_20241225194640.txt", "q-H_A_dQUxQ_20241225194303.txt", "KVjfFN89qvQ_20241225194314.txt", "zU5EYw06wtw_20241225194349.txt", "Z7MU6zrAXsM_20241225194442.txt", "LYYyQcAJZfk_20241225194508.txt", "E7W4OQfJWdw_20241225194717.txt", "azb3Ih68awQ_20241225194505.txt", "ouCWNRvPk20_20241225194401.txt", "uwWOc_RqTBA_20241225194858.txt", "pZX8ikmWvEU_20241225194510.txt", "n9IxomBusuw_20241225194545.txt", "BwyZIWeBpRw_20241225194534.txt", 
"XY0rBdaDXD8_20241225194226.txt", "1Wo6SqLNmLk_20241225194845.txt", "ddq8JIMhz7c_20241225194529.txt", "VQLU7gpk_X8_20241225194821.txt", "jC8Pu9HBd48_20241225194321.txt", "rZkMpVLcVsg_20241225194319.txt", "gbQFSMayJxk_20241225194736.txt", "F54qXuTpgfM_20241225194843.txt", "p3JLaF_4Tz8_20241225194537.txt", "FeRgqJVALMQ_20241225194433.txt", "hF32FvBH4gI_20241225194332.txt", "CDUetQMKM6g_20241225194454.txt", "wG3UFHR1o48_20241225194229.txt", "6P8hrzjnetU_20241225194336.txt", "WFcYF_pxLgA_20241225194458.txt", "77CdVSpnUX4_20241225194746.txt", "VOfwbcveP84_20241225194742.txt", "VRvn3Oj5r3E_20241225194839.txt", "Gf-kC30SLtc_20241225194846.txt", "S8jWFcDGz4Y_20241225194805.txt", "x3MgDtZovks_20241225194526.txt", "lIo9FcrljDk_20241225194309.txt", "-e9ErUozQo4_20241225194903.txt", "aXvDEmo6uS4_20241225194629.txt", "3gtvNYa3Nd8_20241225194531.txt", "5tYR7e5Wpyc_20241225194238.txt", "OadokY8fcAA_20241225194601.txt", "O640yAgq5f8_20241225194744.txt", "zbpb1wd-wvs_20241225194827.txt", "gXvuJu1kt48_20241225194638.txt", "zEYE-vcVKy8_20241225194547.txt", "Ky-ZJ9SS-x4_20241225194240.txt", "0RYyQRQFgFk_20241225194532.txt", "4F_RBc1akC8_20241225194724.txt", "nDLb8_wgX50_20241225194540.txt", "tcueMCe-0zo_20241225194236.txt", "K-TW2Chpz4k_20241225194330.txt", "XcvhERcZpWw_20241225194731.txt", "Ze2pc6NwsHQ_20241225194704.txt", "_ltcLEM-5HU_20241225194612.txt", "jouFvyRZntk_20241225194507.txt", "uWV9a3zEaL4_20241225194823.txt", "-OBCwiPPfEU_20241225194747.txt", "dzOvi0Aa2EA_20241225194301.txt", "K9lORz2_XSU_20241225194527.txt", "j2sMqSDLd4k_20241225194407.txt", "oNkDA2F7CjM_20241225194651.txt", "50BZQRT1dAg_20241225194403.txt", "q8CHXefn7B4_20241225194411.txt", "Jy4rJcYmtUM_20241225194344.txt", "QmOF0crdyRU_20241225194456.txt", "6ZrlsVx85ek_20241225194758.txt", "CD0bRU1e1ZM_20241225194425.txt", "IAnhFUUCq6c_20241225194804.txt", "Phm-Alz1Zjo_20241225194906.txt", "csubiPlvFWk_20241225194606.txt", "GpgqXCkRO-w_20241225194701.txt", "W5zqC5cYcS0_20241225194241.txt", 
"T65RDBiB5Hs_20241225194715.txt", "6I5I56uVvLw_20241225194801.txt", "i5611OvTFGM_20241225194548.txt", "wTBSGgbIvsY_20241225194552.txt", "O1YRwWmue4Y_20241225194815.txt", "29n0WG317tM_20241225194511.txt", "xmhsWAqP_0Y_20241225194851.txt", "x4m_PdFbu-s_20241225194722.txt"]
 
 
Rag/__init__.py DELETED
File without changes
Rag/chromadb.db/01e34d25-3e37-4b52-8953-794b0e9b61dd/header.bin DELETED
@@ -1,3 +0,0 @@
1
- version https://git-lfs.github.com/spec/v1
2
- oid sha256:0b5a01d1e5da2cd59bebe66372da65fee337e4d2d160c1170c240dae082bc3f5
3
- size 100
 
 
 
 
Rag/chromadb.db/b338c320-325e-4f10-8e0c-ce336d2e26c9/header.bin DELETED
@@ -1,3 +0,0 @@
1
- version https://git-lfs.github.com/spec/v1
2
- oid sha256:d9fc8728a7f84f5a3f3e775af521ea3d3062f3b0d229f88d76086b8805199074
3
- size 100
 
 
 
 
Rag/rag_pipeline.py DELETED
@@ -1,197 +0,0 @@
1
- import chromadb
2
- from langchain.text_splitter import RecursiveCharacterTextSplitter
3
- from sentence_transformers import SentenceTransformer
4
- import google.generativeai as genai
5
- import os
6
- import logging
7
- from concurrent.futures import ProcessPoolExecutor, as_completed
8
- from tqdm import tqdm
9
# Shared sentence-embedding model used across the whole pipeline.
embedding_model = SentenceTransformer('all-MiniLM-L6-v2')

# Configuration: the Gemini API key is optional at import time; genai is
# only configured when GOOGLE_API_KEY is present in the environment.
API_KEY = os.getenv("GOOGLE_API_KEY")
if API_KEY:
    genai.configure(api_key=API_KEY)

# Logging setup
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(message)s')
17
-
18
def initialize_chroma_client(chromadb_path):
    """Open a persistent ChromaDB store and return the transcript collection.

    Args:
        chromadb_path: Filesystem path of the persistent ChromaDB database.

    Returns:
        The ``yt_transcript_collection`` collection (created on first use).
    """
    persistent_client = chromadb.PersistentClient(path=chromadb_path)
    # embedding_function is deliberately left unset: embeddings are computed
    # externally and passed in explicitly when documents are added.
    return persistent_client.get_or_create_collection(
        name="yt_transcript_collection",
        embedding_function=None,
    )
26
-
27
def split_text_to_chunks(text, chunk_size=1000, chunk_overlap=200):
    """Split ``text`` into overlapping chunks suitable for embedding.

    Args:
        text: Raw transcript text.
        chunk_size: Maximum characters per chunk.
        chunk_overlap: Characters shared between adjacent chunks.

    Returns:
        List of chunk strings produced by LangChain's recursive splitter.
    """
    splitter = RecursiveCharacterTextSplitter(
        chunk_size=chunk_size,
        chunk_overlap=chunk_overlap,
    )
    return splitter.split_text(text)
34
-
35
def get_new_files(transcripts_folder_path, collection):
    """Return transcript filenames on disk that are not yet in the collection.

    Args:
        transcripts_folder_path: Folder containing ``.txt`` transcript files.
        collection: ChromaDB collection whose ``get()['metadatas']`` entries
            carry a ``"source"`` filename.

    Returns:
        Sorted list of new ``.txt`` filenames (deterministic order; the
        original returned an unordered set difference).
    """
    # .get("source") instead of ["source"]: a metadata row without the key
    # must not abort the scan with a KeyError.
    metadatas = collection.get().get('metadatas') or []
    existing_files = {meta.get("source") for meta in metadatas}
    all_files = {f for f in os.listdir(transcripts_folder_path) if f.endswith(".txt")}
    return sorted(all_files - existing_files)
40
-
41
def process_and_add_new_files(transcripts_folder_path, collection, embedding_model=None):
    """Embed and add only the transcript files not yet in the vector database.

    Args:
        transcripts_folder_path: Folder containing ``.txt`` transcripts.
        collection: ChromaDB collection to add documents to.
        embedding_model: SentenceTransformer used to encode chunks. Defaults
            to a fresh ``all-MiniLM-L6-v2`` so the original two-argument
            call sites keep working (the parameter used to be required).

    Returns:
        True if at least one new file was found (even if some failed),
        False when there was nothing to process.

    Fixes: chunk IDs previously used the literal prefix ``"(unknown)"``, so
    IDs collided between files; log messages printed the same placeholder.
    Both now use the actual source filename.
    """
    if embedding_model is None:
        embedding_model = SentenceTransformer('all-MiniLM-L6-v2')

    new_files = get_new_files(transcripts_folder_path, collection)

    if not new_files:
        logging.info("No new files to process")
        return False

    logging.info(f"Found {len(new_files)} new files to process")

    for filename in new_files:
        try:
            file_path = os.path.join(transcripts_folder_path, filename)
            with open(file_path, 'r', encoding='utf-8') as f:
                content = f.read()

            # Split into chunks
            chunks = split_text_to_chunks(content)

            # Generate embeddings for new chunks only
            logging.info(f"Generating embeddings for {filename}")
            embeddings = embedding_model.encode(chunks, show_progress_bar=True)

            # Prepare metadata and IDs — the filename prefix keeps IDs
            # unique across files (was a constant "(unknown)" before).
            metadata = [{"source": filename} for _ in range(len(chunks))]
            ids = [f"{filename}_chunk_{i}" for i in range(len(chunks))]

            # Add to database
            collection.add(
                documents=chunks,
                embeddings=embeddings.tolist(),
                metadatas=metadata,
                ids=ids,
            )
            logging.info(f"Successfully processed and added {filename} to database")
        except Exception as e:
            # Best-effort batch: one bad file must not abort the others.
            logging.error(f"Error processing {filename}: {str(e)}")
            continue

    return True
81
-
82
def get_source_link(metadatas):
    """Convert metadata dicts into de-duplicated ``"Source: ..."`` labels.

    Args:
        metadatas: Iterable of metadata dicts; entries without a
            ``'source'`` key are skipped.

    Returns:
        Unique labels in first-seen order. The original returned
        ``list(set(...))``, whose order is nondeterministic between runs;
        ``dict.fromkeys`` de-duplicates while keeping a stable order.
    """
    labels = [f"Source: {meta['source']}" for meta in metadatas if 'source' in meta]
    return list(dict.fromkeys(labels))
89
-
90
def query_database(collection, query_text, embedding_model, n_results=3):
    """Embed ``query_text`` and fetch the closest documents from ChromaDB.

    Args:
        collection: ChromaDB collection to search.
        query_text: Natural-language query string.
        embedding_model: Model whose ``encode`` produces the query vector.
        n_results: Number of nearest documents to retrieve.

    Returns:
        Tuple ``(documents, metadatas)`` for the single query.
    """
    embedded_query = embedding_model.encode(query_text).tolist()
    hits = collection.query(
        query_embeddings=embedded_query,
        n_results=n_results,
    )
    # Index [0]: query() batches answers per query; we issued exactly one.
    return hits['documents'][0], hits['metadatas'][0]
98
-
99
def enhance_query_with_history(query_text, conversation_history):
    """Append the most recent conversation turns to the query text.

    Args:
        query_text: The user's current question.
        conversation_history: List of ``{"user": ..., "bot": ...}`` turns.

    Returns:
        The query followed by a transcript of at most the last three turns.
    """
    recent_turns = conversation_history[-3:]  # only the last 3 turns matter
    transcript = "\n".join(
        f"User: {turn['user']}\nBot: {turn['bot']}" for turn in recent_turns
    )
    return f"{query_text}\n\nPrevious conversation:\n{transcript}"
106
-
107
def update_conversation_history(history, user_query, bot_response):
    """Record one user/bot exchange.

    Mutates ``history`` in place by appending the new turn and returns the
    same list for caller convenience.
    """
    turn = {"user": user_query, "bot": bot_response}
    history.append(turn)
    return history
111
-
112
def generate_response(conversation_history, query_text, retrieved_docs, source_links):
    """Build a RAG prompt from context, history and sources, then ask the LLM.

    Args:
        conversation_history: Full list of ``{"user", "bot"}`` turns.
        query_text: The user's current question.
        retrieved_docs: Document chunks retrieved from the vector store.
        source_links: ``"Source: ..."`` labels for the retrieved chunks.

    Returns:
        The LLM answer followed by a ``Sources:`` footer.
    """
    context = " ".join(retrieved_docs)
    history_str = "\n".join(
        f"User: {turn['user']}\nBot: {turn['bot']}" for turn in conversation_history
    )
    sources_str = "\n".join(source_links)

    # Prompt template — edit here to tune answer style for your use case.
    prompt = f"""
    Context from documents: {context}

    Previous conversation:
    {history_str}

    Current question: {query_text}

    Based on the context provided, please answer the question.
    Include relevant information from the source documents.
    If you're not sure about something, say so.

    Sources: {sources_str}
    """

    answer = get_llm_response(prompt)  # Implement this function based on your LLM choice
    return f"{answer}\n\nSources:\n{sources_str}"
139
-
140
def get_llm_response(prompt):
    """Send ``prompt`` to the Gemini model and return its text reply.

    NOTE(review): assumes ``genai.configure()`` ran at import time — confirm
    GOOGLE_API_KEY is set in the environment before calling.
    """
    gemini = genai.GenerativeModel('gemini-pro')
    reply = gemini.generate_content(prompt)
    return reply.text
147
-
148
def main_workflow(transcripts_folder_path, chromadb_path):
    """Run the interactive RAG loop: ingest new transcripts, then answer queries.

    Args:
        transcripts_folder_path: Folder of ``.txt`` transcript files.
        chromadb_path: Path of the persistent ChromaDB database.

    Fix: ``process_and_add_new_files`` takes the embedding model as its
    third argument; the original call omitted it, which raised a
    ``TypeError`` before any query could be served.
    """
    # Initialize components
    collection = initialize_chroma_client(chromadb_path)
    embedding_model = SentenceTransformer('all-MiniLM-L6-v2')

    # Only process new files (pass the embedding model through — bug fix).
    new_files_added = process_and_add_new_files(
        transcripts_folder_path, collection, embedding_model
    )

    if not new_files_added:
        logging.info("Using existing database - no new files to process")

    conversation_history = []

    while True:
        query_text = input("\nEnter your query (or type 'exit' to end): ").strip()
        if query_text.lower() == 'exit':
            break

        # Enhance query with conversation history
        enhanced_query = enhance_query_with_history(query_text, conversation_history)

        # Get relevant documents
        retrieved_docs, metadatas = query_database(
            collection,
            enhanced_query,
            embedding_model,
        )

        # Get source links
        source_links = get_source_link(metadatas)

        # Generate response
        response = generate_response(
            conversation_history,
            query_text,
            retrieved_docs,
            source_links,
        )

        # Update conversation history
        conversation_history = update_conversation_history(
            conversation_history,
            query_text,
            response,
        )

        # Print response
        print("\nGenerated Response:")
        print(response)