Nightwing11 committed
Commit a6a76ee · 1 Parent(s): 59eb93e

Upload 18 files

Rag/Processed_folder/processed_files.json ADDED
@@ -0,0 +1 @@
+ ["VOfwbcveP84_20241225194621.txt", "In9Bq4EJMZw_20241225194705.txt", "DkS1pkKpILY_20241225194325.txt", "ajneRM-ET1Q_20241225194311.txt", "K4Ze-Sp6aUE_20241225194709.txt", "n28W4AmvMDE_20241225194626.txt", "UIy-WQCZd4M_20241225194819.txt", "etbfLTHD_VU_20241225194439.txt", "PVmQOLYckKQ_20241225194814.txt", "F9KrZd_-ge0_20241225194812.txt", "xjEFo3a1AnI_20241225194539.txt", "szqPAPKE5tQ_20241225194712.txt", "3_auLYOilb8_20241225194826.txt", "acgz0C-z-gc_20241225194817.txt", "zVCaYyUWWSw_20241225194412.txt", "doupx8SAs5Y_20241225194603.txt", "wAZn9dF3XTo_20241225194423.txt", "2XGREPnlI8U_20241225194659.txt", "UNCwdFxPtE8_20241225194521.txt", "at37Y8rKDlA_20241225194513.txt", "oL3SkPV1_Ik_20241225194837.txt", "nOgypsWKjm4_20241225194440.txt", "rW9QKc-iFoY_20241225194751.txt", "CQlTmOFM4Qs_20241225194550.txt", "tR73Ny4Dt9s_20241225194413.txt", "t1F7EEGPQwo_20241225194649.txt", "ccrbE0QHy94_20241225194608.txt", "SyWC8ZFVxGo_20241225194333.txt", "zlc4VrDx_qk_20241225194800.txt", "8qaBpM73NSk_20241225194409.txt", "sxgCC4H1dl8_20241225194524.txt", "RBK5KLA5Jjg_20241225194446.txt", "slUCmZJDXrk_20241225194627.txt", "h2aWYjSA1Jc_20241225194702.txt", "Ov4yyK15-K8_20241225194230.txt", "juD99_sPWGU_20241225194340.txt", "q1Ss8sTbFBY_20241225194647.txt", "X8Hw8zeCDTA_20241225194518.txt", "UChhXiFPRgg_20241225194443.txt", "pq6WHJzOkno_20241225194415.txt", "2Ds1m5gflCI_20241225194849.txt", "jGZ1mR9uLU0_20241225194808.txt", "VAEzZeaV5zM_20241225194347.txt", "EhlIkzJwPlk_20241225194656.txt", "HiyzzcuaAac_20241225194255.txt", "C3X0bUAiluE_20241225194259.txt", "kG5Qb9sr0YQ_20241225194810.txt", "wRsX_ZkzxvQ_20241225194619.txt", "U2BPitASUh0_20241225194358.txt", "Wcs2PFz5q6g_20241225194327.txt", "CuzL1qxUyHw_20241225194312.txt", "q37ARYnRDGc_20241225194623.txt", "cp9GXl9Qk_s_20241225194735.txt", "XT_6Lvkhxvo_20241225194342.txt", "bUr_9fgfnto_20241225194256.txt", "LTGGyQS1fZE_20241225194305.txt", "mAlt_HKX4as_20241225194420.txt", "SZSRgyl7pyQ_20241225194418.txt", "RI112zW8GDw_20241225194356.txt", "ycOBZZeVeAc_20241225194707.txt", "6YLdlK2hYnw_20241225194328.txt", "p4ZfkezDTXQ_20241225194615.txt", "LVxL_p_kToc_20241225194558.txt", "HXzTbCEqCJc_20241225194710.txt", "yOoVz9E9kfQ_20241225194901.txt", "C5KpIXjpzdY_20241225194400.txt", "__RAXBLt1iM_20241225194430.txt", "8N7mdkrXgbc_20241225194338.txt", "JnlSDaBjCCU_20241225194450.txt", "IOl28gj_RXw_20241225194431.txt", "Nr5xb-QCBGA_20241225194354.txt", "GzvzWO0NU50_20241225194605.txt", "DtmwtjOoSYU_20241225194633.txt", "CrtR12PBKb0_20241225194632.txt", "gMRph_BvHB4_20241225194516.txt", "QpoaNklmRPc_20241225194248.txt", "9tRohh0gErM_20241225194353.txt", "Xu1FMCxoEFc_20241225194346.txt", "15R2pMqU2ok_20241225194406.txt", "eIxVfln02Ss_20241225194335.txt", "0Dtt95_xabw_20241225194252.txt", "3ZGItIAUQmI_20241225194719.txt", "uxZFl4BDOGk_20241225194757.txt", "hvPGfcAgk9Y_20241225194754.txt", "HYVeP4F0GNU_20241225194559.txt", "z5W74QC3v2I_20241225194308.txt", "31wjVhCcI5Y_20241225194426.txt", "BMTt8gSl13s_20241225194836.txt", "aQDOU3hPci0_20241225194501.txt", "tkH2-_jMCSk_20241225194543.txt", "ntfcfJ28eiU_20241225194522.txt", "S8nPJU9xkNw_20241225194748.txt", "fcxjwA4C4Cw_20241225194553.txt", "iMvtHqLmEkI_20241225194855.txt", "099hgtRoUZw_20241225194436.txt", "4RFEkGKKhdE_20241225194907.txt", "eJU6Df_ffAE_20241225194635.txt", "nqNEtdHVUjM_20241225194437.txt", "1SXDXdngX2M_20241225194316.txt", "X4QE6t-MkYE_20241225194642.txt", "79p1X_7rAMo_20241225194630.txt", "6RZbGrq9BxE_20241225194306.txt", "pkJi9Raxikg_20241225194824.txt", "QbMxDZeB8Ks_20241225194247.txt", 
"RgAcOqVRfYA_20241225194657.txt", "ncSoor2Iw8k_20241225194833.txt", "i_DEPeCKxs8_20241225194235.txt", "FE0lTEUa7EY_20241225194753.txt", "gE0_8AjTFaM_20241225194852.txt", "kgr22uMsJ5o_20241225194317.txt", "ufsIA5NARIo_20241225194535.txt", "CyDLbrZK75U_20241225194434.txt", "7TkGDj4LaOU_20241225194244.txt", "XLr2RKoD-oY_20241225194738.txt", "yb5zpo5WDG4_20241225194645.txt", "a9yFKPmPZ90_20241225194556.txt", "TG8VM5-CTfw_20241225194636.txt", "eMqWH3LYiII_20241225194351.txt", "CVh3_8e5u8I_20241225194246.txt", "SuR0DaYoe0Y_20241225194302.txt", "FLxIoNguGRU_20241225194233.txt", "GA89kjVY6Ik_20241225194854.txt", "qJ3uV7coZbA_20241225194453.txt", "EQ3GjpGq5Y8_20241225194405.txt", "yOJvm_ri_hk_20241225194555.txt", "cwakOgHIT0E_20241225194421.txt", "DTCmprPCDqc_20241225194733.txt", "qPKd99Pa2iU_20241225194500.txt", "nm1TxQj9IsQ_20241225194611.txt", "LRM5LutB538_20241225194857.txt", "xTtM2AvCRyA_20241225194643.txt", "62lVH-6xYGY_20241225194250.txt", "Rxmv7rT9leo_20241225194417.txt", "ulHrUVV3Kq4_20241225194452.txt", "bGixnNGvSkg_20241225194231.txt", "1CxJVdeyltw_20241225194614.txt", "wgUjIRtote8_20241225194726.txt", "qPKd99Pa2iU_20241225194503.txt", "S_SrHS8FvMM_20241225194807.txt", "xX6hiEmDmxs_20241225194227.txt", "uXs-zPc63kM_20241225194449.txt", "4AwyVTHEU3s_20241225194904.txt", "xaE9XyMMAHY_20241225194848.txt", "hFL6qRIJZ_Y_20241225194428.txt", "FOi5s3OUogo_20241225194245.txt", "cS7cNaBrkxo_20241225194624.txt", "kpTJqwIfHcM_20241225194654.txt", "yixIc1Ai6jM_20241225194829.txt", "vfRtLI6cJrk_20241225194324.txt", "GLgKkG44MGo_20241225194729.txt", "KPlJcD-o-4Q_20241225194617.txt", "AtChcxeaukQ_20241225194646.txt", "tLS6t3FVOTI_20241225194714.txt", "GqPGXG5TlZw_20241225194541.txt", "UF0nqolsNZc_20241225194727.txt", "7R3-3HR6-u4_20241225194519.txt", "tLRCS48Ens4_20241225194447.txt", "V0Sdgn0_kFM_20241225194740.txt", "G1VUSu6sGoU_20241225194251.txt", "m_OazsImOiI_20241225194322.txt", "Og56hmAspV8_20241225194258.txt", "dFR_wFN23ZY_20241225194640.txt", "q-H_A_dQUxQ_20241225194303.txt", "KVjfFN89qvQ_20241225194314.txt", "zU5EYw06wtw_20241225194349.txt", "Z7MU6zrAXsM_20241225194442.txt", "LYYyQcAJZfk_20241225194508.txt", "E7W4OQfJWdw_20241225194717.txt", "azb3Ih68awQ_20241225194505.txt", "ouCWNRvPk20_20241225194401.txt", "uwWOc_RqTBA_20241225194858.txt", "pZX8ikmWvEU_20241225194510.txt", "n9IxomBusuw_20241225194545.txt", "BwyZIWeBpRw_20241225194534.txt", "XY0rBdaDXD8_20241225194226.txt", "1Wo6SqLNmLk_20241225194845.txt", "ddq8JIMhz7c_20241225194529.txt", "VQLU7gpk_X8_20241225194821.txt", "jC8Pu9HBd48_20241225194321.txt", "rZkMpVLcVsg_20241225194319.txt", "gbQFSMayJxk_20241225194736.txt", "F54qXuTpgfM_20241225194843.txt", "p3JLaF_4Tz8_20241225194537.txt", "FeRgqJVALMQ_20241225194433.txt", "hF32FvBH4gI_20241225194332.txt", "CDUetQMKM6g_20241225194454.txt", "wG3UFHR1o48_20241225194229.txt", "6P8hrzjnetU_20241225194336.txt", "WFcYF_pxLgA_20241225194458.txt", "77CdVSpnUX4_20241225194746.txt", "VOfwbcveP84_20241225194742.txt", "VRvn3Oj5r3E_20241225194839.txt", "Gf-kC30SLtc_20241225194846.txt", "S8jWFcDGz4Y_20241225194805.txt", "x3MgDtZovks_20241225194526.txt", "lIo9FcrljDk_20241225194309.txt", "-e9ErUozQo4_20241225194903.txt", "aXvDEmo6uS4_20241225194629.txt", "3gtvNYa3Nd8_20241225194531.txt", "5tYR7e5Wpyc_20241225194238.txt", "OadokY8fcAA_20241225194601.txt", "O640yAgq5f8_20241225194744.txt", "zbpb1wd-wvs_20241225194827.txt", "gXvuJu1kt48_20241225194638.txt", "zEYE-vcVKy8_20241225194547.txt", "Ky-ZJ9SS-x4_20241225194240.txt", "0RYyQRQFgFk_20241225194532.txt", "4F_RBc1akC8_20241225194724.txt", 
"nDLb8_wgX50_20241225194540.txt", "tcueMCe-0zo_20241225194236.txt", "K-TW2Chpz4k_20241225194330.txt", "XcvhERcZpWw_20241225194731.txt", "Ze2pc6NwsHQ_20241225194704.txt", "_ltcLEM-5HU_20241225194612.txt", "jouFvyRZntk_20241225194507.txt", "uWV9a3zEaL4_20241225194823.txt", "-OBCwiPPfEU_20241225194747.txt", "dzOvi0Aa2EA_20241225194301.txt", "K9lORz2_XSU_20241225194527.txt", "j2sMqSDLd4k_20241225194407.txt", "oNkDA2F7CjM_20241225194651.txt", "50BZQRT1dAg_20241225194403.txt", "q8CHXefn7B4_20241225194411.txt", "Jy4rJcYmtUM_20241225194344.txt", "QmOF0crdyRU_20241225194456.txt", "6ZrlsVx85ek_20241225194758.txt", "CD0bRU1e1ZM_20241225194425.txt", "IAnhFUUCq6c_20241225194804.txt", "Phm-Alz1Zjo_20241225194906.txt", "csubiPlvFWk_20241225194606.txt", "GpgqXCkRO-w_20241225194701.txt", "W5zqC5cYcS0_20241225194241.txt", "T65RDBiB5Hs_20241225194715.txt", "6I5I56uVvLw_20241225194801.txt", "i5611OvTFGM_20241225194548.txt", "wTBSGgbIvsY_20241225194552.txt", "O1YRwWmue4Y_20241225194815.txt", "29n0WG317tM_20241225194511.txt", "xmhsWAqP_0Y_20241225194851.txt", "x4m_PdFbu-s_20241225194722.txt"]
Rag/__init__.py ADDED
File without changes
Rag/__pycache__/__init__.cpython-311.pyc ADDED
Binary file (153 Bytes)
 
Rag/__pycache__/__init__.cpython-39.pyc ADDED
Binary file (111 Bytes)
 
Rag/__pycache__/rag_pipeline.cpython-311.pyc ADDED
Binary file (11.6 kB)
 
Rag/chromadb.db/01e34d25-3e37-4b52-8953-794b0e9b61dd/header.bin ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:0b5a01d1e5da2cd59bebe66372da65fee337e4d2d160c1170c240dae082bc3f5
+ size 100
Rag/chromadb.db/b338c320-325e-4f10-8e0c-ce336d2e26c9/header.bin ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:d9fc8728a7f84f5a3f3e775af521ea3d3062f3b0d229f88d76086b8805199074
+ size 100
Rag/error_log.txt ADDED
@@ -0,0 +1,8 @@
+ Traceback (most recent call last):
+   File "/home/nightwing/Codes/Xyzbot/Rag/chunking.py", line 52, in split_text_to_chunks
+     chunks = text_splitter.split_documents(docs)
+              ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+   File "/home/nightwing/anaconda3/envs/xyzbot/lib/python3.11/site-packages/langchain_text_splitters/base.py", line 94, in split_documents
+     texts.append(doc.page_content)
+                  ^^^^^^^^^^^^^^^^
+ AttributeError: 'str' object has no attribute 'page_content'
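
For context on the log above: LangChain's split_documents() iterates Document objects (reading doc.page_content), so passing raw strings raises this AttributeError. A minimal sketch of the two ways to chunk a plain transcript string, assuming langchain_core is available for the Document import; the rag_pipeline.py added below takes the split_text() route:

from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_core.documents import Document  # import path assumed; older layouts use langchain.schema

splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=200)
raw_text = "..."  # placeholder for a transcript string read from disk

# Option 1: split_text() works directly on a plain string.
chunks = splitter.split_text(raw_text)

# Option 2: split_documents() expects Document objects, so wrap the string first.
doc_chunks = splitter.split_documents([Document(page_content=raw_text)])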
Rag/rag_pipeline.py ADDED
@@ -0,0 +1,183 @@
+ import chromadb
+ from langchain.text_splitter import RecursiveCharacterTextSplitter
+ from sentence_transformers import SentenceTransformer
+ import google.generativeai as genai
+ import os
+ import logging
+ from concurrent.futures import ProcessPoolExecutor, as_completed
+ from Llm.llm_endpoints import get_llm_response
+ from utils.get_link import get_source_link
+ from Prompts.huberman_prompt import huberman_prompt
+ from tqdm import tqdm
+ # Configuration
+ API_KEY = os.getenv("GOOGLE_API_KEY")
+ if API_KEY:
+     genai.configure(api_key=API_KEY)
+
+ chromadb_path = "app/Rag/chromadb.db"
+ embedding_model = SentenceTransformer('all-MiniLM-L6-v2')
+
+ # Logging
+ logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(message)s')
+
+
+ # Helper Functions
+ def split_text_to_chunks(docs, chunk_size=1000, chunk_overlap=200):
+     """Split text into manageable chunks."""
+     text_splitter = RecursiveCharacterTextSplitter(chunk_size=chunk_size, chunk_overlap=chunk_overlap)
+     chunks = text_splitter.split_text(docs)
+     return chunks
+
+
+ def get_new_files(transcripts_folder_path, collection):
+     """Find new transcript files that haven't been processed yet."""
+     all_files = [f for f in os.listdir(transcripts_folder_path) if f.endswith(".txt")]
+     existing_files = [meta["source"] for meta in collection.get()['metadatas']]
+     return [f for f in all_files if f not in existing_files]
+
+
+ def process_single_file(file_path):
+     """Process a single file and return its chunks."""
+     with open(file_path, 'r') as f:
+         content = f.read()
+     chunks = split_text_to_chunks(content)
+     return chunks, os.path.basename(file_path)
+
+
+ def batch_embed_chunks(chunks, batch_size=32):
+     """Embed chunks in batches."""
+     embeddings = []
+     for i in tqdm(range(0, len(chunks), batch_size), desc="Embedding chunks"):
+         batch = chunks[i:i + batch_size]
+         batch_embeddings = embedding_model.encode(batch, show_progress_bar=True)
+         embeddings.extend(batch_embeddings.tolist())
+     return embeddings
+
+
+ def process_and_add_new_files(transcripts_folder_path, collection):
+     """Process and add new transcript files to the vector database."""
+     new_files = get_new_files(transcripts_folder_path, collection)
+     if not new_files:
+         logging.info("No new files to process")
+         return False
+
+     # Use a reasonable number of workers (capped at 8)
+     n_workers = min(8, len(new_files))
+     logging.info(f"Using {n_workers} workers for processing")
+
+     all_chunks = []
+     all_metadata = []
+     all_ids = []
+
+     # Process files in parallel
+     with ProcessPoolExecutor(max_workers=n_workers) as executor:
+         futures = {
+             executor.submit(process_single_file, os.path.join(transcripts_folder_path, file)): file
+             for file in new_files
+         }
+
+         for future in as_completed(futures):
+             file = futures[future]
+             try:
+                 chunks, filename = future.result()
+                 file_metadata = [{"source": filename} for _ in range(len(chunks))]
+                 file_ids = [f"{filename}_chunk_{i}" for i in range(len(chunks))]
+
+                 all_chunks.extend(chunks)
+                 all_metadata.extend(file_metadata)
+                 all_ids.extend(file_ids)
+
+                 logging.info(f"Processed {filename}")
+             except Exception as e:
+                 logging.error(f"Error processing {file}: {str(e)}")
+                 continue
+
+     # Process embeddings in batches
+     logging.info(f"Generating embeddings for {len(all_chunks)} chunks")
+     embeddings = batch_embed_chunks(all_chunks)
+
+     # Add to database in batches
+     batch_size = 500
+     for i in range(0, len(all_chunks), batch_size):
+         end_idx = min(i + batch_size, len(all_chunks))
+         collection.upsert(
+             documents=all_chunks[i:end_idx],
+             embeddings=embeddings[i:end_idx],
+             metadatas=all_metadata[i:end_idx],
+             ids=all_ids[i:end_idx]
+         )
+         logging.info(f"Added batch {i // batch_size + 1} to database")
+
+     logging.info(f"Successfully processed {len(new_files)} files")
+     return True
+
+
+ def query_database(collection, query_text, n_results=3):
+     """Retrieve the most relevant chunks for the query."""
+     query_embeddings = embedding_model.encode(query_text).tolist()
+     results = collection.query(query_embeddings=query_embeddings, n_results=n_results)
+     retrieved_docs = results['documents'][0]
+     metadatas = results['metadatas'][0]
+     return retrieved_docs, metadatas
+
+
+ def enhance_query_with_history(query_text, summarized_history):
+     enhanced_query = f"{query_text}\n\n{summarized_history}"
+     return enhanced_query
+
+
+ def update_conversation_history(history, user_query, bot_response):
+     """Update and keep track of the conversation history between the user and the bot."""
+     history.append({"user": user_query, "bot": bot_response})
+     return history
+
+
+ def generate_response(conversation_history, query_text, retrieved_docs, source_links):
+     """Generate a response using retrieved documents and the generative AI model."""
+     context = " ".join(retrieved_docs)
+     history_str = "\n".join([f"User: {turn['user']}\nBot: {turn['bot']}" for turn in conversation_history])
+     sources_str = "\n".join(source_links)
+
+     prompt = huberman_prompt.format(
+         context=context,
+         sources=sources_str,
+         history=history_str,
+         question=query_text
+     )
+
+     response = get_llm_response(prompt)
+     full_response = f"{response}\n\nSources:\n{sources_str}"
+     return full_response
+
+
+ def main_workflow(transcripts_folder_path, collection):
+     """Run the full RAG workflow."""
+     new_files_added = process_and_add_new_files(transcripts_folder_path, collection)
+     if new_files_added:
+         logging.info("New transcripts added to the database.")
+     else:
+         logging.info("No new files found. Using existing database.")
+
+     conversation_history = []
+
+     while True:
+         query_text = input("\nEnter your query (or type 'exit' to end): ").strip()
+         if query_text.lower() == "exit":
+             print("Ending the conversation. Goodbye")
+             break
+
+         query_text_with_conversation_history = enhance_query_with_history(query_text, conversation_history)
+         retrieved_docs, metadatas = query_database(collection, query_text_with_conversation_history)
+         print("-" * 50)
+         source_link = get_source_link(metadatas)
+         print(source_link)
+         print("-" * 50)
+
+         if not retrieved_docs:
+             print("No relevant documents found")
+             continue
+
+         response = generate_response(conversation_history, query_text, retrieved_docs, source_link)
+         conversation_history = update_conversation_history(conversation_history, query_text, response)
+         print("\nGenerated Response:")
+         print(response)
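
Note: rag_pipeline.py defines main_workflow but creates no ChromaDB client or entry point itself. A minimal usage sketch under assumed names follows; the "transcripts" collection name and the Rag/Processed_folder transcripts path are illustrative placeholders, not part of this commit:

import chromadb
from Rag.rag_pipeline import main_workflow

# Reuse the on-disk store referenced by chromadb_path in rag_pipeline.py;
# collection name and folder path below are assumptions for illustration.
client = chromadb.PersistentClient(path="app/Rag/chromadb.db")
collection = client.get_or_create_collection(name="transcripts")

main_workflow("Rag/Processed_folder", collection)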