Nightwing11 committed on
Commit
edebf8f
·
1 Parent(s): 11f956d

Upload 18 files

Browse files
.gitattributes CHANGED
@@ -33,3 +33,4 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
33
  *.zip filter=lfs diff=lfs merge=lfs -text
34
  *.zst filter=lfs diff=lfs merge=lfs -text
35
  *tfevents* filter=lfs diff=lfs merge=lfs -text
 
 
33
  *.zip filter=lfs diff=lfs merge=lfs -text
34
  *.zst filter=lfs diff=lfs merge=lfs -text
35
  *tfevents* filter=lfs diff=lfs merge=lfs -text
36
+ Rag/chromadb.db/chroma.sqlite3 filter=lfs diff=lfs merge=lfs -text
Rag/Processed_folder/processed_files.json ADDED
@@ -0,0 +1 @@
 
 
1
+ ["VOfwbcveP84_20241225194621.txt", "In9Bq4EJMZw_20241225194705.txt", "DkS1pkKpILY_20241225194325.txt", "ajneRM-ET1Q_20241225194311.txt", "K4Ze-Sp6aUE_20241225194709.txt", "n28W4AmvMDE_20241225194626.txt", "UIy-WQCZd4M_20241225194819.txt", "etbfLTHD_VU_20241225194439.txt", "PVmQOLYckKQ_20241225194814.txt", "F9KrZd_-ge0_20241225194812.txt", "xjEFo3a1AnI_20241225194539.txt", "szqPAPKE5tQ_20241225194712.txt", "3_auLYOilb8_20241225194826.txt", "acgz0C-z-gc_20241225194817.txt", "zVCaYyUWWSw_20241225194412.txt", "doupx8SAs5Y_20241225194603.txt", "wAZn9dF3XTo_20241225194423.txt", "2XGREPnlI8U_20241225194659.txt", "UNCwdFxPtE8_20241225194521.txt", "at37Y8rKDlA_20241225194513.txt", "oL3SkPV1_Ik_20241225194837.txt", "nOgypsWKjm4_20241225194440.txt", "rW9QKc-iFoY_20241225194751.txt", "CQlTmOFM4Qs_20241225194550.txt", "tR73Ny4Dt9s_20241225194413.txt", "t1F7EEGPQwo_20241225194649.txt", "ccrbE0QHy94_20241225194608.txt", "SyWC8ZFVxGo_20241225194333.txt", "zlc4VrDx_qk_20241225194800.txt", "8qaBpM73NSk_20241225194409.txt", "sxgCC4H1dl8_20241225194524.txt", "RBK5KLA5Jjg_20241225194446.txt", "slUCmZJDXrk_20241225194627.txt", "h2aWYjSA1Jc_20241225194702.txt", "Ov4yyK15-K8_20241225194230.txt", "juD99_sPWGU_20241225194340.txt", "q1Ss8sTbFBY_20241225194647.txt", "X8Hw8zeCDTA_20241225194518.txt", "UChhXiFPRgg_20241225194443.txt", "pq6WHJzOkno_20241225194415.txt", "2Ds1m5gflCI_20241225194849.txt", "jGZ1mR9uLU0_20241225194808.txt", "VAEzZeaV5zM_20241225194347.txt", "EhlIkzJwPlk_20241225194656.txt", "HiyzzcuaAac_20241225194255.txt", "C3X0bUAiluE_20241225194259.txt", "kG5Qb9sr0YQ_20241225194810.txt", "wRsX_ZkzxvQ_20241225194619.txt", "U2BPitASUh0_20241225194358.txt", "Wcs2PFz5q6g_20241225194327.txt", "CuzL1qxUyHw_20241225194312.txt", "q37ARYnRDGc_20241225194623.txt", "cp9GXl9Qk_s_20241225194735.txt", "XT_6Lvkhxvo_20241225194342.txt", "bUr_9fgfnto_20241225194256.txt", "LTGGyQS1fZE_20241225194305.txt", "mAlt_HKX4as_20241225194420.txt", "SZSRgyl7pyQ_20241225194418.txt", 
"RI112zW8GDw_20241225194356.txt", "ycOBZZeVeAc_20241225194707.txt", "6YLdlK2hYnw_20241225194328.txt", "p4ZfkezDTXQ_20241225194615.txt", "LVxL_p_kToc_20241225194558.txt", "HXzTbCEqCJc_20241225194710.txt", "yOoVz9E9kfQ_20241225194901.txt", "C5KpIXjpzdY_20241225194400.txt", "__RAXBLt1iM_20241225194430.txt", "8N7mdkrXgbc_20241225194338.txt", "JnlSDaBjCCU_20241225194450.txt", "IOl28gj_RXw_20241225194431.txt", "Nr5xb-QCBGA_20241225194354.txt", "GzvzWO0NU50_20241225194605.txt", "DtmwtjOoSYU_20241225194633.txt", "CrtR12PBKb0_20241225194632.txt", "gMRph_BvHB4_20241225194516.txt", "QpoaNklmRPc_20241225194248.txt", "9tRohh0gErM_20241225194353.txt", "Xu1FMCxoEFc_20241225194346.txt", "15R2pMqU2ok_20241225194406.txt", "eIxVfln02Ss_20241225194335.txt", "0Dtt95_xabw_20241225194252.txt", "3ZGItIAUQmI_20241225194719.txt", "uxZFl4BDOGk_20241225194757.txt", "hvPGfcAgk9Y_20241225194754.txt", "HYVeP4F0GNU_20241225194559.txt", "z5W74QC3v2I_20241225194308.txt", "31wjVhCcI5Y_20241225194426.txt", "BMTt8gSl13s_20241225194836.txt", "aQDOU3hPci0_20241225194501.txt", "tkH2-_jMCSk_20241225194543.txt", "ntfcfJ28eiU_20241225194522.txt", "S8nPJU9xkNw_20241225194748.txt", "fcxjwA4C4Cw_20241225194553.txt", "iMvtHqLmEkI_20241225194855.txt", "099hgtRoUZw_20241225194436.txt", "4RFEkGKKhdE_20241225194907.txt", "eJU6Df_ffAE_20241225194635.txt", "nqNEtdHVUjM_20241225194437.txt", "1SXDXdngX2M_20241225194316.txt", "X4QE6t-MkYE_20241225194642.txt", "79p1X_7rAMo_20241225194630.txt", "6RZbGrq9BxE_20241225194306.txt", "pkJi9Raxikg_20241225194824.txt", "QbMxDZeB8Ks_20241225194247.txt", "RgAcOqVRfYA_20241225194657.txt", "ncSoor2Iw8k_20241225194833.txt", "i_DEPeCKxs8_20241225194235.txt", "FE0lTEUa7EY_20241225194753.txt", "gE0_8AjTFaM_20241225194852.txt", "kgr22uMsJ5o_20241225194317.txt", "ufsIA5NARIo_20241225194535.txt", "CyDLbrZK75U_20241225194434.txt", "7TkGDj4LaOU_20241225194244.txt", "XLr2RKoD-oY_20241225194738.txt", "yb5zpo5WDG4_20241225194645.txt", "a9yFKPmPZ90_20241225194556.txt", 
"TG8VM5-CTfw_20241225194636.txt", "eMqWH3LYiII_20241225194351.txt", "CVh3_8e5u8I_20241225194246.txt", "SuR0DaYoe0Y_20241225194302.txt", "FLxIoNguGRU_20241225194233.txt", "GA89kjVY6Ik_20241225194854.txt", "qJ3uV7coZbA_20241225194453.txt", "EQ3GjpGq5Y8_20241225194405.txt", "yOJvm_ri_hk_20241225194555.txt", "cwakOgHIT0E_20241225194421.txt", "DTCmprPCDqc_20241225194733.txt", "qPKd99Pa2iU_20241225194500.txt", "nm1TxQj9IsQ_20241225194611.txt", "LRM5LutB538_20241225194857.txt", "xTtM2AvCRyA_20241225194643.txt", "62lVH-6xYGY_20241225194250.txt", "Rxmv7rT9leo_20241225194417.txt", "ulHrUVV3Kq4_20241225194452.txt", "bGixnNGvSkg_20241225194231.txt", "1CxJVdeyltw_20241225194614.txt", "wgUjIRtote8_20241225194726.txt", "qPKd99Pa2iU_20241225194503.txt", "S_SrHS8FvMM_20241225194807.txt", "xX6hiEmDmxs_20241225194227.txt", "uXs-zPc63kM_20241225194449.txt", "4AwyVTHEU3s_20241225194904.txt", "xaE9XyMMAHY_20241225194848.txt", "hFL6qRIJZ_Y_20241225194428.txt", "FOi5s3OUogo_20241225194245.txt", "cS7cNaBrkxo_20241225194624.txt", "kpTJqwIfHcM_20241225194654.txt", "yixIc1Ai6jM_20241225194829.txt", "vfRtLI6cJrk_20241225194324.txt", "GLgKkG44MGo_20241225194729.txt", "KPlJcD-o-4Q_20241225194617.txt", "AtChcxeaukQ_20241225194646.txt", "tLS6t3FVOTI_20241225194714.txt", "GqPGXG5TlZw_20241225194541.txt", "UF0nqolsNZc_20241225194727.txt", "7R3-3HR6-u4_20241225194519.txt", "tLRCS48Ens4_20241225194447.txt", "V0Sdgn0_kFM_20241225194740.txt", "G1VUSu6sGoU_20241225194251.txt", "m_OazsImOiI_20241225194322.txt", "Og56hmAspV8_20241225194258.txt", "dFR_wFN23ZY_20241225194640.txt", "q-H_A_dQUxQ_20241225194303.txt", "KVjfFN89qvQ_20241225194314.txt", "zU5EYw06wtw_20241225194349.txt", "Z7MU6zrAXsM_20241225194442.txt", "LYYyQcAJZfk_20241225194508.txt", "E7W4OQfJWdw_20241225194717.txt", "azb3Ih68awQ_20241225194505.txt", "ouCWNRvPk20_20241225194401.txt", "uwWOc_RqTBA_20241225194858.txt", "pZX8ikmWvEU_20241225194510.txt", "n9IxomBusuw_20241225194545.txt", "BwyZIWeBpRw_20241225194534.txt", 
"XY0rBdaDXD8_20241225194226.txt", "1Wo6SqLNmLk_20241225194845.txt", "ddq8JIMhz7c_20241225194529.txt", "VQLU7gpk_X8_20241225194821.txt", "jC8Pu9HBd48_20241225194321.txt", "rZkMpVLcVsg_20241225194319.txt", "gbQFSMayJxk_20241225194736.txt", "F54qXuTpgfM_20241225194843.txt", "p3JLaF_4Tz8_20241225194537.txt", "FeRgqJVALMQ_20241225194433.txt", "hF32FvBH4gI_20241225194332.txt", "CDUetQMKM6g_20241225194454.txt", "wG3UFHR1o48_20241225194229.txt", "6P8hrzjnetU_20241225194336.txt", "WFcYF_pxLgA_20241225194458.txt", "77CdVSpnUX4_20241225194746.txt", "VOfwbcveP84_20241225194742.txt", "VRvn3Oj5r3E_20241225194839.txt", "Gf-kC30SLtc_20241225194846.txt", "S8jWFcDGz4Y_20241225194805.txt", "x3MgDtZovks_20241225194526.txt", "lIo9FcrljDk_20241225194309.txt", "-e9ErUozQo4_20241225194903.txt", "aXvDEmo6uS4_20241225194629.txt", "3gtvNYa3Nd8_20241225194531.txt", "5tYR7e5Wpyc_20241225194238.txt", "OadokY8fcAA_20241225194601.txt", "O640yAgq5f8_20241225194744.txt", "zbpb1wd-wvs_20241225194827.txt", "gXvuJu1kt48_20241225194638.txt", "zEYE-vcVKy8_20241225194547.txt", "Ky-ZJ9SS-x4_20241225194240.txt", "0RYyQRQFgFk_20241225194532.txt", "4F_RBc1akC8_20241225194724.txt", "nDLb8_wgX50_20241225194540.txt", "tcueMCe-0zo_20241225194236.txt", "K-TW2Chpz4k_20241225194330.txt", "XcvhERcZpWw_20241225194731.txt", "Ze2pc6NwsHQ_20241225194704.txt", "_ltcLEM-5HU_20241225194612.txt", "jouFvyRZntk_20241225194507.txt", "uWV9a3zEaL4_20241225194823.txt", "-OBCwiPPfEU_20241225194747.txt", "dzOvi0Aa2EA_20241225194301.txt", "K9lORz2_XSU_20241225194527.txt", "j2sMqSDLd4k_20241225194407.txt", "oNkDA2F7CjM_20241225194651.txt", "50BZQRT1dAg_20241225194403.txt", "q8CHXefn7B4_20241225194411.txt", "Jy4rJcYmtUM_20241225194344.txt", "QmOF0crdyRU_20241225194456.txt", "6ZrlsVx85ek_20241225194758.txt", "CD0bRU1e1ZM_20241225194425.txt", "IAnhFUUCq6c_20241225194804.txt", "Phm-Alz1Zjo_20241225194906.txt", "csubiPlvFWk_20241225194606.txt", "GpgqXCkRO-w_20241225194701.txt", "W5zqC5cYcS0_20241225194241.txt", 
"T65RDBiB5Hs_20241225194715.txt", "6I5I56uVvLw_20241225194801.txt", "i5611OvTFGM_20241225194548.txt", "wTBSGgbIvsY_20241225194552.txt", "O1YRwWmue4Y_20241225194815.txt", "29n0WG317tM_20241225194511.txt", "xmhsWAqP_0Y_20241225194851.txt", "x4m_PdFbu-s_20241225194722.txt"]
Rag/__init__.py ADDED
File without changes
Rag/__pycache__/__init__.cpython-311.pyc ADDED
Binary file (153 Bytes). View file
 
Rag/__pycache__/__init__.cpython-39.pyc ADDED
Binary file (111 Bytes). View file
 
Rag/__pycache__/rag_pipeline.cpython-311.pyc ADDED
Binary file (8.26 kB). View file
 
Rag/chromadb.db/01e34d25-3e37-4b52-8953-794b0e9b61dd/header.bin ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:0b5a01d1e5da2cd59bebe66372da65fee337e4d2d160c1170c240dae082bc3f5
3
+ size 100
Rag/chromadb.db/b338c320-325e-4f10-8e0c-ce336d2e26c9/header.bin ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:d9fc8728a7f84f5a3f3e775af521ea3d3062f3b0d229f88d76086b8805199074
3
+ size 100
Rag/error_log.txt ADDED
@@ -0,0 +1,8 @@
 
 
 
 
 
 
 
 
 
1
+ Traceback (most recent call last):
2
+ File "/home/nightwing/Codes/Xyzbot/Rag/chunking.py", line 52, in split_text_to_chunks
3
+ chunks = text_splitter.split_documents(docs)
4
+ ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
5
+ File "/home/nightwing/anaconda3/envs/xyzbot/lib/python3.11/site-packages/langchain_text_splitters/base.py", line 94, in split_documents
6
+ texts.append(doc.page_content)
7
+ ^^^^^^^^^^^^^^^^
8
+ AttributeError: 'str' object has no attribute 'page_content'
Rag/rag_pipeline.py ADDED
@@ -0,0 +1,147 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import chromadb
2
+ from langchain.text_splitter import RecursiveCharacterTextSplitter
3
+ from sentence_transformers import SentenceTransformer
4
+ import google.generativeai as genai
5
+ import os
6
+ import logging
7
+ from Llm.llm_endpoints import get_llm_response
8
+ from utils.get_link import get_source_link
9
+ # from utils.corefrence import resolve_coreferences
10
+ from Prompts.huberman_prompt import huberman_prompt
11
# Configuration
# The Gemini client is configured only when GOOGLE_API_KEY is set in the
# environment; otherwise genai is left unconfigured.
API_KEY = os.environ.get("GOOGLE_API_KEY")
if API_KEY:
    genai.configure(api_key=API_KEY)

chromadb_path = "app/Rag/chromadb.db"
# transcripts_folder_path = '/home/nightwing/Codes/Xyzbot/Data/transcripts'
# processed_files_path = "/home/nightwing/Codes/Xyzbot/Rag/Processed_folder/processed_files.json"

# Shared sentence-embedding model, created once at import time and used by
# the ingestion and query helpers in this module.
embedding_model = SentenceTransformer('all-MiniLM-L6-v2')

# client = chromadb.PersistentClient(path=chromadb_path)
# collection = client.get_or_create_collection(name="yt_transcript_collection")

# Logging
logging.basicConfig(format='%(asctime)s - %(message)s', level=logging.INFO)
26
+
27
+
28
+ # Helper Functions
29
def split_text_to_chunks(docs, chunk_size=1000, chunk_overlap=200):
    """Break a text string into overlapping pieces.

    :param docs: the raw text handed to the splitter's ``split_text``.
    :param chunk_size: forwarded to ``RecursiveCharacterTextSplitter``.
    :param chunk_overlap: forwarded to ``RecursiveCharacterTextSplitter``.
    :return: the value returned by ``split_text`` for ``docs``.
    """
    splitter_options = {"chunk_size": chunk_size, "chunk_overlap": chunk_overlap}
    return RecursiveCharacterTextSplitter(**splitter_options).split_text(docs)
34
+
35
+
36
def get_new_files(transcripts_folder_path, collection):
    """Find transcript files that are not yet present in the collection.

    A file counts as already processed when its name appears as the
    ``"source"`` value of any metadata entry returned by ``collection.get``.

    :param transcripts_folder_path: directory scanned for ``*.txt`` files.
    :param collection: object exposing ``get(include=...)`` that returns a
        mapping with a ``'metadatas'`` entry (a Chroma collection in this file).
    :return: list of ``.txt`` file names, in ``os.listdir`` order, whose name
        is not recorded as a source in the collection.
    """
    all_files = [f for f in os.listdir(transcripts_folder_path) if f.endswith(".txt")]

    # Only metadata is needed here, so avoid pulling every stored document.
    metadatas = collection.get(include=["metadatas"])['metadatas'] or []

    # A set gives O(1) membership tests; entries that are None or lack a
    # "source" key are ignored instead of raising.
    existing_files = {meta["source"] for meta in metadatas if meta and "source" in meta}

    return [f for f in all_files if f not in existing_files]
41
+
42
+
43
def process_and_add_new_files(transcripts_folder_path, collection):
    """Chunk, embed and upsert every not-yet-stored transcript file.

    :param transcripts_folder_path: directory containing ``*.txt`` transcripts.
    :param collection: vector-store collection that receives the chunks via
        ``upsert`` (and is queried by ``get_new_files``).
    :return: ``True`` if at least one file's chunks were written to the
        collection, ``False`` otherwise.
    """
    new_files = get_new_files(transcripts_folder_path, collection)
    if not new_files:
        return False

    added_any = False
    for new_file in new_files:
        file_path = os.path.join(transcripts_folder_path, new_file)
        # Explicit encoding so the result does not depend on the platform default.
        with open(file_path, 'r', encoding='utf-8') as f:
            content = f.read()

        chunks = split_text_to_chunks(content)
        if not chunks:
            # An empty transcript yields no chunks; upserting empty lists
            # would be rejected by the store, so skip the file instead.
            logging.warning("Skipping %s: no text to index", new_file)
            continue

        embeddings = embedding_model.encode(chunks).tolist()

        # One id and one metadata record per chunk, all pointing at the file.
        ids = [f"{new_file}_chunk_{i}" for i in range(len(chunks))]
        metadata = [{"source": new_file} for _ in chunks]
        collection.upsert(documents=chunks, embeddings=embeddings, metadatas=metadata, ids=ids)

        logging.info("Added %s to the database", new_file)
        added_any = True

    return added_any
63
+
64
+
65
def query_database(collection, query_text, n_results=3):
    """Look up the stored chunks closest to ``query_text``.

    :param collection: object whose ``query`` method accepts
        ``query_embeddings`` and ``n_results``.
    :param query_text: text encoded with the module-level ``embedding_model``.
    :param n_results: number of matches requested from the collection.
    :return: tuple ``(documents, metadatas)`` taken from the first row of the
        query result.
    """
    vector = embedding_model.encode(query_text).tolist()
    hits = collection.query(n_results=n_results, query_embeddings=vector)
    # A single query was submitted, so only the first result row is used.
    return hits['documents'][0], hits['metadatas'][0]
72
+
73
+
74
def enhance_query_with_history(query_text, summarized_history):
    """Combine the current query with prior conversation for retrieval.

    The query is repeated twice so it carries more weight than the history
    in the combined text. (The previous implementation placed ``*2`` outside
    the f-string braces, which appended the literal characters ``*2`` to the
    query rather than repeating it.)

    :param query_text: the user's current question.
    :param summarized_history: either a string summary, or a sequence of
        turn dicts with ``"user"`` and ``"bot"`` keys; may be empty.
    :return: ``query_text`` unchanged when there is no history; otherwise the
        doubled query, a blank line, and the history text.
    """
    if not summarized_history:
        # Nothing to add: avoid embedding noise such as "[]" with the query.
        return query_text

    if isinstance(summarized_history, str):
        history_text = summarized_history
    else:
        # Render turn dicts as readable text instead of a Python list repr.
        history_text = "\n".join(
            f"User: {turn['user']}\nBot: {turn['bot']}" for turn in summarized_history
        )

    return f"{query_text} {query_text}\n\n{history_text}"
77
+
78
+
79
def update_conversation_history(history, user_query, bot_response):
    """Record one user/bot exchange at the end of the running history.

    :param history: list of earlier turns; it is modified in place.
    :param user_query: text stored under the ``"user"`` key.
    :param bot_response: text stored under the ``"bot"`` key.
    :return: the same ``history`` list, now including the new turn.
    """
    turn = dict(user=user_query, bot=bot_response)
    history.append(turn)
    return history
89
+
90
+
91
def generate_response(conversation_history, query_text, retrieved_docs, source_links):
    """Build the prompt, call the LLM, and append the source list.

    :param conversation_history: iterable of turn dicts with ``"user"`` and
        ``"bot"`` keys, rendered one turn after another into the prompt.
    :param query_text: the question passed to the prompt template.
    :param retrieved_docs: iterable of strings joined with spaces as context.
    :param source_links: iterable of strings joined with newlines as sources.
    :return: the LLM response followed by a ``Sources:`` section.
    """
    joined_context = " ".join(retrieved_docs)

    rendered_turns = []
    for turn in conversation_history:
        rendered_turns.append(f"User: {turn['user']}\nBot: {turn['bot']}")
    joined_history = "\n".join(rendered_turns)

    sources_str = "\n".join(source_links)

    filled_prompt = huberman_prompt.format(
        context=joined_context,
        sources=sources_str,
        history=joined_history,
        question=query_text,
    )
    answer = get_llm_response(filled_prompt)

    # The sources are repeated after the answer text.
    return f"{answer}\n\nSources:\n{sources_str}"
110
+
111
+
112
+ # Main Workflow
113
def main_workflow(transcripts_folder_path, collection):
    """Run the interactive RAG loop on the console.

    First ingests any new transcripts, then repeatedly reads a query from
    stdin, retrieves matching chunks, generates a response and prints it.
    The loop ends when the user types ``exit`` or stdin is closed.

    :param transcripts_folder_path: directory scanned for new transcripts.
    :param collection: vector-store collection used for ingestion and queries.
    :return: ``None``.
    """
    # Process new files
    if process_and_add_new_files(transcripts_folder_path, collection):
        logging.info("New transcripts added to the database.")
    else:
        logging.info("No new files found. Using existing database.")

    # Initialize conversation history
    conversation_history = []

    while True:
        try:
            query_text = input("\nEnter your query(or type 'exit' to end):").strip()
        except EOFError:
            # stdin was closed (e.g. piped input ran out): end the session
            # the same way as an explicit "exit" instead of crashing.
            print("Ending the conversation. Goodbye")
            break

        if query_text.lower() == "exit":
            print("Ending the conversation. Goodbye")
            break
        if not query_text:
            # Nothing was typed; do not embed or send an empty query.
            continue

        query_text_with_conversation_history = enhance_query_with_history(query_text, conversation_history)
        retrived_docs, metadatas = query_database(collection, query_text_with_conversation_history)

        # Check for an empty retrieval before resolving source links, so the
        # link helper is never called with nothing to resolve.
        if not retrived_docs:
            print("No relevent documents is found")
            continue

        print("-" * 50)
        source_link = get_source_link(metadatas)
        print(source_link)
        print("-" * 50)

        response = generate_response(conversation_history, query_text, retrived_docs, source_link)
        conversation_history = update_conversation_history(conversation_history, query_text, response)
        print("\nGenerated Response:")
        print(response)
145
+
146
+
147
+