Nightwing11 committed
Commit a6a76ee · 1 Parent(s): 59eb93e

Upload 18 files

Rag/Processed_folder/processed_files.json ADDED
@@ -0,0 +1 @@
+ ["VOfwbcveP84_20241225194621.txt", "In9Bq4EJMZw_20241225194705.txt", "DkS1pkKpILY_20241225194325.txt", "ajneRM-ET1Q_20241225194311.txt", "K4Ze-Sp6aUE_20241225194709.txt", "n28W4AmvMDE_20241225194626.txt", "UIy-WQCZd4M_20241225194819.txt", "etbfLTHD_VU_20241225194439.txt", "PVmQOLYckKQ_20241225194814.txt", "F9KrZd_-ge0_20241225194812.txt", "xjEFo3a1AnI_20241225194539.txt", "szqPAPKE5tQ_20241225194712.txt", "3_auLYOilb8_20241225194826.txt", "acgz0C-z-gc_20241225194817.txt", "zVCaYyUWWSw_20241225194412.txt", "doupx8SAs5Y_20241225194603.txt", "wAZn9dF3XTo_20241225194423.txt", "2XGREPnlI8U_20241225194659.txt", "UNCwdFxPtE8_20241225194521.txt", "at37Y8rKDlA_20241225194513.txt", "oL3SkPV1_Ik_20241225194837.txt", "nOgypsWKjm4_20241225194440.txt", "rW9QKc-iFoY_20241225194751.txt", "CQlTmOFM4Qs_20241225194550.txt", "tR73Ny4Dt9s_20241225194413.txt", "t1F7EEGPQwo_20241225194649.txt", "ccrbE0QHy94_20241225194608.txt", "SyWC8ZFVxGo_20241225194333.txt", "zlc4VrDx_qk_20241225194800.txt", "8qaBpM73NSk_20241225194409.txt", "sxgCC4H1dl8_20241225194524.txt", "RBK5KLA5Jjg_20241225194446.txt", "slUCmZJDXrk_20241225194627.txt", "h2aWYjSA1Jc_20241225194702.txt", "Ov4yyK15-K8_20241225194230.txt", "juD99_sPWGU_20241225194340.txt", "q1Ss8sTbFBY_20241225194647.txt", "X8Hw8zeCDTA_20241225194518.txt", "UChhXiFPRgg_20241225194443.txt", "pq6WHJzOkno_20241225194415.txt", "2Ds1m5gflCI_20241225194849.txt", "jGZ1mR9uLU0_20241225194808.txt", "VAEzZeaV5zM_20241225194347.txt", "EhlIkzJwPlk_20241225194656.txt", "HiyzzcuaAac_20241225194255.txt", "C3X0bUAiluE_20241225194259.txt", "kG5Qb9sr0YQ_20241225194810.txt", "wRsX_ZkzxvQ_20241225194619.txt", "U2BPitASUh0_20241225194358.txt", "Wcs2PFz5q6g_20241225194327.txt", "CuzL1qxUyHw_20241225194312.txt", "q37ARYnRDGc_20241225194623.txt", "cp9GXl9Qk_s_20241225194735.txt", "XT_6Lvkhxvo_20241225194342.txt", "bUr_9fgfnto_20241225194256.txt", "LTGGyQS1fZE_20241225194305.txt", "mAlt_HKX4as_20241225194420.txt", "SZSRgyl7pyQ_20241225194418.txt", "RI112zW8GDw_20241225194356.txt", "ycOBZZeVeAc_20241225194707.txt", "6YLdlK2hYnw_20241225194328.txt", "p4ZfkezDTXQ_20241225194615.txt", "LVxL_p_kToc_20241225194558.txt", "HXzTbCEqCJc_20241225194710.txt", "yOoVz9E9kfQ_20241225194901.txt", "C5KpIXjpzdY_20241225194400.txt", "__RAXBLt1iM_20241225194430.txt", "8N7mdkrXgbc_20241225194338.txt", "JnlSDaBjCCU_20241225194450.txt", "IOl28gj_RXw_20241225194431.txt", "Nr5xb-QCBGA_20241225194354.txt", "GzvzWO0NU50_20241225194605.txt", "DtmwtjOoSYU_20241225194633.txt", "CrtR12PBKb0_20241225194632.txt", "gMRph_BvHB4_20241225194516.txt", "QpoaNklmRPc_20241225194248.txt", "9tRohh0gErM_20241225194353.txt", "Xu1FMCxoEFc_20241225194346.txt", "15R2pMqU2ok_20241225194406.txt", "eIxVfln02Ss_20241225194335.txt", "0Dtt95_xabw_20241225194252.txt", "3ZGItIAUQmI_20241225194719.txt", "uxZFl4BDOGk_20241225194757.txt", "hvPGfcAgk9Y_20241225194754.txt", "HYVeP4F0GNU_20241225194559.txt", "z5W74QC3v2I_20241225194308.txt", "31wjVhCcI5Y_20241225194426.txt", "BMTt8gSl13s_20241225194836.txt", "aQDOU3hPci0_20241225194501.txt", "tkH2-_jMCSk_20241225194543.txt", "ntfcfJ28eiU_20241225194522.txt", "S8nPJU9xkNw_20241225194748.txt", "fcxjwA4C4Cw_20241225194553.txt", "iMvtHqLmEkI_20241225194855.txt", "099hgtRoUZw_20241225194436.txt", "4RFEkGKKhdE_20241225194907.txt", "eJU6Df_ffAE_20241225194635.txt", "nqNEtdHVUjM_20241225194437.txt", "1SXDXdngX2M_20241225194316.txt", "X4QE6t-MkYE_20241225194642.txt", "79p1X_7rAMo_20241225194630.txt", "6RZbGrq9BxE_20241225194306.txt", "pkJi9Raxikg_20241225194824.txt", "QbMxDZeB8Ks_20241225194247.txt", 
"RgAcOqVRfYA_20241225194657.txt", "ncSoor2Iw8k_20241225194833.txt", "i_DEPeCKxs8_20241225194235.txt", "FE0lTEUa7EY_20241225194753.txt", "gE0_8AjTFaM_20241225194852.txt", "kgr22uMsJ5o_20241225194317.txt", "ufsIA5NARIo_20241225194535.txt", "CyDLbrZK75U_20241225194434.txt", "7TkGDj4LaOU_20241225194244.txt", "XLr2RKoD-oY_20241225194738.txt", "yb5zpo5WDG4_20241225194645.txt", "a9yFKPmPZ90_20241225194556.txt", "TG8VM5-CTfw_20241225194636.txt", "eMqWH3LYiII_20241225194351.txt", "CVh3_8e5u8I_20241225194246.txt", "SuR0DaYoe0Y_20241225194302.txt", "FLxIoNguGRU_20241225194233.txt", "GA89kjVY6Ik_20241225194854.txt", "qJ3uV7coZbA_20241225194453.txt", "EQ3GjpGq5Y8_20241225194405.txt", "yOJvm_ri_hk_20241225194555.txt", "cwakOgHIT0E_20241225194421.txt", "DTCmprPCDqc_20241225194733.txt", "qPKd99Pa2iU_20241225194500.txt", "nm1TxQj9IsQ_20241225194611.txt", "LRM5LutB538_20241225194857.txt", "xTtM2AvCRyA_20241225194643.txt", "62lVH-6xYGY_20241225194250.txt", "Rxmv7rT9leo_20241225194417.txt", "ulHrUVV3Kq4_20241225194452.txt", "bGixnNGvSkg_20241225194231.txt", "1CxJVdeyltw_20241225194614.txt", "wgUjIRtote8_20241225194726.txt", "qPKd99Pa2iU_20241225194503.txt", "S_SrHS8FvMM_20241225194807.txt", "xX6hiEmDmxs_20241225194227.txt", "uXs-zPc63kM_20241225194449.txt", "4AwyVTHEU3s_20241225194904.txt", "xaE9XyMMAHY_20241225194848.txt", "hFL6qRIJZ_Y_20241225194428.txt", "FOi5s3OUogo_20241225194245.txt", "cS7cNaBrkxo_20241225194624.txt", "kpTJqwIfHcM_20241225194654.txt", "yixIc1Ai6jM_20241225194829.txt", "vfRtLI6cJrk_20241225194324.txt", "GLgKkG44MGo_20241225194729.txt", "KPlJcD-o-4Q_20241225194617.txt", "AtChcxeaukQ_20241225194646.txt", "tLS6t3FVOTI_20241225194714.txt", "GqPGXG5TlZw_20241225194541.txt", "UF0nqolsNZc_20241225194727.txt", "7R3-3HR6-u4_20241225194519.txt", "tLRCS48Ens4_20241225194447.txt", "V0Sdgn0_kFM_20241225194740.txt", "G1VUSu6sGoU_20241225194251.txt", "m_OazsImOiI_20241225194322.txt", "Og56hmAspV8_20241225194258.txt", "dFR_wFN23ZY_20241225194640.txt", "q-H_A_dQUxQ_20241225194303.txt", "KVjfFN89qvQ_20241225194314.txt", "zU5EYw06wtw_20241225194349.txt", "Z7MU6zrAXsM_20241225194442.txt", "LYYyQcAJZfk_20241225194508.txt", "E7W4OQfJWdw_20241225194717.txt", "azb3Ih68awQ_20241225194505.txt", "ouCWNRvPk20_20241225194401.txt", "uwWOc_RqTBA_20241225194858.txt", "pZX8ikmWvEU_20241225194510.txt", "n9IxomBusuw_20241225194545.txt", "BwyZIWeBpRw_20241225194534.txt", "XY0rBdaDXD8_20241225194226.txt", "1Wo6SqLNmLk_20241225194845.txt", "ddq8JIMhz7c_20241225194529.txt", "VQLU7gpk_X8_20241225194821.txt", "jC8Pu9HBd48_20241225194321.txt", "rZkMpVLcVsg_20241225194319.txt", "gbQFSMayJxk_20241225194736.txt", "F54qXuTpgfM_20241225194843.txt", "p3JLaF_4Tz8_20241225194537.txt", "FeRgqJVALMQ_20241225194433.txt", "hF32FvBH4gI_20241225194332.txt", "CDUetQMKM6g_20241225194454.txt", "wG3UFHR1o48_20241225194229.txt", "6P8hrzjnetU_20241225194336.txt", "WFcYF_pxLgA_20241225194458.txt", "77CdVSpnUX4_20241225194746.txt", "VOfwbcveP84_20241225194742.txt", "VRvn3Oj5r3E_20241225194839.txt", "Gf-kC30SLtc_20241225194846.txt", "S8jWFcDGz4Y_20241225194805.txt", "x3MgDtZovks_20241225194526.txt", "lIo9FcrljDk_20241225194309.txt", "-e9ErUozQo4_20241225194903.txt", "aXvDEmo6uS4_20241225194629.txt", "3gtvNYa3Nd8_20241225194531.txt", "5tYR7e5Wpyc_20241225194238.txt", "OadokY8fcAA_20241225194601.txt", "O640yAgq5f8_20241225194744.txt", "zbpb1wd-wvs_20241225194827.txt", "gXvuJu1kt48_20241225194638.txt", "zEYE-vcVKy8_20241225194547.txt", "Ky-ZJ9SS-x4_20241225194240.txt", "0RYyQRQFgFk_20241225194532.txt", "4F_RBc1akC8_20241225194724.txt", 
"nDLb8_wgX50_20241225194540.txt", "tcueMCe-0zo_20241225194236.txt", "K-TW2Chpz4k_20241225194330.txt", "XcvhERcZpWw_20241225194731.txt", "Ze2pc6NwsHQ_20241225194704.txt", "_ltcLEM-5HU_20241225194612.txt", "jouFvyRZntk_20241225194507.txt", "uWV9a3zEaL4_20241225194823.txt", "-OBCwiPPfEU_20241225194747.txt", "dzOvi0Aa2EA_20241225194301.txt", "K9lORz2_XSU_20241225194527.txt", "j2sMqSDLd4k_20241225194407.txt", "oNkDA2F7CjM_20241225194651.txt", "50BZQRT1dAg_20241225194403.txt", "q8CHXefn7B4_20241225194411.txt", "Jy4rJcYmtUM_20241225194344.txt", "QmOF0crdyRU_20241225194456.txt", "6ZrlsVx85ek_20241225194758.txt", "CD0bRU1e1ZM_20241225194425.txt", "IAnhFUUCq6c_20241225194804.txt", "Phm-Alz1Zjo_20241225194906.txt", "csubiPlvFWk_20241225194606.txt", "GpgqXCkRO-w_20241225194701.txt", "W5zqC5cYcS0_20241225194241.txt", "T65RDBiB5Hs_20241225194715.txt", "6I5I56uVvLw_20241225194801.txt", "i5611OvTFGM_20241225194548.txt", "wTBSGgbIvsY_20241225194552.txt", "O1YRwWmue4Y_20241225194815.txt", "29n0WG317tM_20241225194511.txt", "xmhsWAqP_0Y_20241225194851.txt", "x4m_PdFbu-s_20241225194722.txt"]
Rag/__init__.py ADDED
File without changes
Rag/__pycache__/__init__.cpython-311.pyc ADDED
Binary file (153 Bytes)
 
Rag/__pycache__/__init__.cpython-39.pyc ADDED
Binary file (111 Bytes)
 
Rag/__pycache__/rag_pipeline.cpython-311.pyc ADDED
Binary file (11.6 kB)
 
Rag/chromadb.db/01e34d25-3e37-4b52-8953-794b0e9b61dd/header.bin ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:0b5a01d1e5da2cd59bebe66372da65fee337e4d2d160c1170c240dae082bc3f5
+ size 100
Rag/chromadb.db/b338c320-325e-4f10-8e0c-ce336d2e26c9/header.bin ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:d9fc8728a7f84f5a3f3e775af521ea3d3062f3b0d229f88d76086b8805199074
+ size 100
Rag/error_log.txt ADDED
@@ -0,0 +1,8 @@
+ Traceback (most recent call last):
+   File "/home/nightwing/Codes/Xyzbot/Rag/chunking.py", line 52, in split_text_to_chunks
+     chunks = text_splitter.split_documents(docs)
+              ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+   File "/home/nightwing/anaconda3/envs/xyzbot/lib/python3.11/site-packages/langchain_text_splitters/base.py", line 94, in split_documents
+     texts.append(doc.page_content)
+                  ^^^^^^^^^^^^^^^^
+ AttributeError: 'str' object has no attribute 'page_content'
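
For context on the log above: LangChain's split_documents() iterates Document objects (reading doc.page_content), so passing raw strings raises this AttributeError. A minimal sketch of the two ways to chunk a plain transcript string, assuming langchain_core is available for the Document import; the rag_pipeline.py added below takes the split_text() route:

from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_core.documents import Document  # import path assumed; older layouts use langchain.schema

splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=200)
raw_text = "..."  # placeholder for a transcript string read from disk

# Option 1: split_text() works directly on a plain string.
chunks = splitter.split_text(raw_text)

# Option 2: split_documents() expects Document objects, so wrap the string first.
doc_chunks = splitter.split_documents([Document(page_content=raw_text)])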
Rag/rag_pipeline.py ADDED
@@ -0,0 +1,183 @@
+ import chromadb
+ from langchain.text_splitter import RecursiveCharacterTextSplitter
+ from sentence_transformers import SentenceTransformer
+ import google.generativeai as genai
+ import os
+ import logging
+ from concurrent.futures import ProcessPoolExecutor, as_completed
+ from Llm.llm_endpoints import get_llm_response
+ from utils.get_link import get_source_link
+ from Prompts.huberman_prompt import huberman_prompt
+ from tqdm import tqdm
+ # Configuration
+ API_KEY = os.getenv("GOOGLE_API_KEY")
+ if API_KEY:
+     genai.configure(api_key=API_KEY)
+
+ chromadb_path = "app/Rag/chromadb.db"
+ embedding_model = SentenceTransformer('all-MiniLM-L6-v2')
+
+ # Logging
+ logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(message)s')
+
+
+ # Helper Functions
+ def split_text_to_chunks(docs, chunk_size=1000, chunk_overlap=200):
+     """Split text into manageable chunks."""
+     text_splitter = RecursiveCharacterTextSplitter(chunk_size=chunk_size, chunk_overlap=chunk_overlap)
+     chunks = text_splitter.split_text(docs)
+     return chunks
+
+
+ def get_new_files(transcripts_folder_path, collection):
+     """Find new transcript files that haven't been processed yet."""
+     all_files = [f for f in os.listdir(transcripts_folder_path) if f.endswith(".txt")]
+     existing_files = [meta["source"] for meta in collection.get()['metadatas']]
+     return [f for f in all_files if f not in existing_files]
+
+
+ def process_single_file(file_path):
+     """Process a single file and return its chunks."""
+     with open(file_path, 'r') as f:
+         content = f.read()
+     chunks = split_text_to_chunks(content)
+     return chunks, os.path.basename(file_path)
+
+
+ def batch_embed_chunks(chunks, batch_size=32):
+     """Embed chunks in batches."""
+     embeddings = []
+     for i in tqdm(range(0, len(chunks), batch_size), desc="Embedding chunks"):
+         batch = chunks[i:i + batch_size]
+         batch_embeddings = embedding_model.encode(batch, show_progress_bar=True)
+         embeddings.extend(batch_embeddings.tolist())
+     return embeddings
+
+
+ def process_and_add_new_files(transcripts_folder_path, collection):
+     """Process and add new transcript files to the vector database."""
+     new_files = get_new_files(transcripts_folder_path, collection)
+     if not new_files:
+         logging.info("No new files to process")
+         return False
+
+     # Use a reasonable number of workers (capped at 8)
+     n_workers = min(8, len(new_files))
+     logging.info(f"Using {n_workers} workers for processing")
+
+     all_chunks = []
+     all_metadata = []
+     all_ids = []
+
+     # Process files in parallel
+     with ProcessPoolExecutor(max_workers=n_workers) as executor:
+         futures = {
+             executor.submit(process_single_file, os.path.join(transcripts_folder_path, file)): file
+             for file in new_files
+         }
+
+         for future in as_completed(futures):
+             file = futures[future]
+             try:
+                 chunks, filename = future.result()
+                 file_metadata = [{"source": filename} for _ in range(len(chunks))]
+                 file_ids = [f"{filename}_chunk_{i}" for i in range(len(chunks))]
+
+                 all_chunks.extend(chunks)
+                 all_metadata.extend(file_metadata)
+                 all_ids.extend(file_ids)
+
+                 logging.info(f"Processed {filename}")
+             except Exception as e:
+                 logging.error(f"Error processing {file}: {str(e)}")
+                 continue
+
+     # Process embeddings in batches
+     logging.info(f"Generating embeddings for {len(all_chunks)} chunks")
+     embeddings = batch_embed_chunks(all_chunks)
+
+     # Add to database in batches
+     batch_size = 500
+     for i in range(0, len(all_chunks), batch_size):
+         end_idx = min(i + batch_size, len(all_chunks))
+         collection.upsert(
+             documents=all_chunks[i:end_idx],
+             embeddings=embeddings[i:end_idx],
+             metadatas=all_metadata[i:end_idx],
+             ids=all_ids[i:end_idx]
+         )
+         logging.info(f"Added batch {i // batch_size + 1} to database")
+
+     logging.info(f"Successfully processed {len(new_files)} files")
+     return True
+
+
+ def query_database(collection, query_text, n_results=3):
+     """Retrieve the most relevant chunks for the query."""
+     query_embeddings = embedding_model.encode(query_text).tolist()
+     results = collection.query(query_embeddings=query_embeddings, n_results=n_results)
+     retrieved_docs = results['documents'][0]
+     metadatas = results['metadatas'][0]
+     return retrieved_docs, metadatas
+
+
+ def enhance_query_with_history(query_text, summarized_history):
+     enhanced_query = f"{query_text}\n\n{summarized_history}"
+     return enhanced_query
+
+
+ def update_conversation_history(history, user_query, bot_response):
+     """Update and keep track of the conversation history between the user and the bot."""
+     history.append({"user": user_query, "bot": bot_response})
+     return history
+
+
+ def generate_response(conversation_history, query_text, retrieved_docs, source_links):
+     """Generate a response using retrieved documents and the generative AI model."""
+     context = " ".join(retrieved_docs)
+     history_str = "\n".join([f"User: {turn['user']}\nBot: {turn['bot']}" for turn in conversation_history])
+     sources_str = "\n".join(source_links)
+
+     prompt = huberman_prompt.format(
+         context=context,
+         sources=sources_str,
+         history=history_str,
+         question=query_text
+     )
+
+     response = get_llm_response(prompt)
+     full_response = f"{response}\n\nSources:\n{sources_str}"
+     return full_response
+
+
+ def main_workflow(transcripts_folder_path, collection):
+     """Run the full RAG workflow."""
+     new_files_added = process_and_add_new_files(transcripts_folder_path, collection)
+     if new_files_added:
+         logging.info("New transcripts added to the database.")
+     else:
+         logging.info("No new files found. Using existing database.")
+
+     conversation_history = []
+
+     while True:
+         query_text = input("\nEnter your query (or type 'exit' to end): ").strip()
+         if query_text.lower() == "exit":
+             print("Ending the conversation. Goodbye")
+             break
+
+         query_text_with_conversation_history = enhance_query_with_history(query_text, conversation_history)
+         retrieved_docs, metadatas = query_database(collection, query_text_with_conversation_history)
+         print("-" * 50)
+         source_link = get_source_link(metadatas)
+         print(source_link)
+         print("-" * 50)
+
+         if not retrieved_docs:
+             print("No relevant documents found")
+             continue
+
+         response = generate_response(conversation_history, query_text, retrieved_docs, source_link)
+         conversation_history = update_conversation_history(conversation_history, query_text, response)
+         print("\nGenerated Response:")
+         print(response)
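
Note: rag_pipeline.py defines main_workflow but creates no ChromaDB client or entry point itself. A minimal usage sketch under assumed names follows; the "transcripts" collection name and the Rag/Processed_folder transcripts path are illustrative placeholders, not part of this commit:

import chromadb
from Rag.rag_pipeline import main_workflow

# Reuse the on-disk store referenced by chromadb_path in rag_pipeline.py;
# collection name and folder path below are assumptions for illustration.
client = chromadb.PersistentClient(path="app/Rag/chromadb.db")
collection = client.get_or_create_collection(name="transcripts")

main_workflow("Rag/Processed_folder", collection)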