Nightwing11 committed
Commit 075d061
2 Parent(s): fba61a7 100e32f

Resolved merge conflict in README.md
.dockerignore ADDED
@@ -0,0 +1,23 @@
1
+ # Ignore version control
2
+ .git
3
+ .gitignore
4
+
5
+ # Ignore notebooks
6
+ Notebook/
7
+
8
+ # Ignore databases and logs
9
+ **/*.db
10
+ **/*.sqlite3
11
+ **/chromadb.db
12
+ **/error_log.txt
13
+
14
+ # Ignore cache
15
+ **/__pycache__/
16
+ **/*.pyc
17
+ **/*.pyo
18
+ **/*.pyd
19
+
20
+ # Ignore environment files
21
+ .env
22
+ venv/
23
+ .venv/
.gitignore ADDED
@@ -0,0 +1,255 @@
1
+ tmp
2
+ .idea
3
+ models
4
+
5
+ stanford-ner-2015-04-20.zip
6
+ stanford-ner-2015-04-20
7
+ *.pyc
8
+
9
+ ### Python template
10
+ # Byte-compiled / optimized / DLL files
11
+ __pycache__/
12
+ *.py[cod]
13
+ *$py.class
14
+
15
+ # C extensions
16
+ *.so
17
+
18
+ # Distribution / packaging
19
+ .Python
20
+ build/
21
+ develop-eggs/
22
+ dist/
23
+ downloads/
24
+ eggs/
25
+ .eggs/
26
+ lib/
27
+ lib64/
28
+ parts/
29
+ sdist/
30
+ var/
31
+ wheels/
32
+ pip-wheel-metadata/
33
+ share/python-wheels/
34
+ *.egg-info/
35
+ .installed.cfg
36
+ *.egg
37
+ MANIFEST
38
+
39
+ # PyInstaller
40
+ # Usually these files are written by a python script from a template
41
+ # before PyInstaller builds the exe, so as to inject date/other infos into it.
42
+ *.manifest
43
+ *.spec
44
+
45
+ # Installer logs
46
+ pip-log.txt
47
+ pip-delete-this-directory.txt
48
+
49
+ # Unit test / coverage reports
50
+ htmlcov/
51
+ .tox/
52
+ .nox/
53
+ .coverage
54
+ .coverage.*
55
+ .cache
56
+ nosetests.xml
57
+ coverage.xml
58
+ *.cover
59
+ .hypothesis/
60
+ .pytest_cache/
61
+
62
+ # Translations
63
+ *.mo
64
+ *.pot
65
+
66
+ service.log.*
67
+
68
+ # Django stuff:
69
+ *.log
70
+ local_settings.py
71
+ db.sqlite3
72
+ db.sqlite3-journal
73
+
74
+ # Flask stuff:
75
+ instance/
76
+ .webassets-cache
77
+
78
+ # Scrapy stuff:
79
+ .scrapy
80
+
81
+ # Sphinx documentation
82
+ docs/_build/
83
+
84
+ # PyBuilder
85
+ target/
86
+
87
+ # Jupyter Notebook
88
+ .ipynb_checkpoints
89
+
90
+ # IPython
91
+ profile_default/
92
+ ipython_config.py
93
+
94
+ # pyenv
95
+ .python-version
96
+
97
+ # pipenv
98
+ # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
99
+ # However, in case of collaboration, if having platform-specific dependencies or dependencies
100
+ # having no cross-platform support, pipenv may install dependencies that don't work, or not
101
+ # install all needed dependencies.
102
+ #Pipfile.lock
103
+
104
+ # celery beat schedule file
105
+ celerybeat-schedule
106
+
107
+ # SageMath parsed files
108
+ *.sage.py
109
+
110
+ # Environments
111
+ .env
112
+ .venv
113
+ env/
114
+ venv/
115
+ ENV/
116
+ env.bak/
117
+ venv.bak/
118
+ Data/transcripts/
119
+ Data/videolinks/
120
+ Rag/db/
121
+ Rag/db/chroma.sqlite3
122
+ Rag/chromadb.db/
123
+ # Spyder project settings
124
+ .spyderproject
125
+ .spyproject
126
+
127
+ # Rope project settings
128
+ .ropeproject
129
+
130
+ # mkdocs documentation
131
+ /site
132
+ __pycache__/
133
+ *.pyc
134
+ *.pyo
135
+ *.pyd
136
+ .env
137
+ # mypy
138
+ .mypy_cache/
139
+ .dmypy.json
140
+ dmypy.json
141
+
142
+ # Pyre type checker
143
+ .pyre/
144
+
145
+ ### JetBrains template
146
+ # Covers JetBrains IDEs: IntelliJ, RubyMine, PhpStorm, AppCode, PyCharm, CLion, Android Studio and WebStorm
147
+ # Reference: https://intellij-support.jetbrains.com/hc/en-us/articles/206544839
148
+
149
+ # User-specific stuff
150
+ .idea/**/workspace.xml
151
+ .idea/**/tasks.xml
152
+ .idea/**/usage.statistics.xml
153
+ .idea/**/dictionaries
154
+ .idea/**/shelf
155
+
156
+ # Generated files
157
+ .idea/**/contentModel.xml
158
+
159
+ # Sensitive or high-churn files
160
+ .idea/**/dataSources/
161
+ .idea/**/dataSources.ids
162
+ .idea/**/dataSources.local.xml
163
+ .idea/**/sqlDataSources.xml
164
+ .idea/**/dynamic.xml
165
+ .idea/**/uiDesigner.xml
166
+ .idea/**/dbnavigator.xml
167
+
168
+ # Gradle
169
+ .idea/**/gradle.xml
170
+ .idea/**/libraries
171
+
172
+ # Gradle and Maven with auto-import
173
+ # When using Gradle or Maven with auto-import, you should exclude module files,
174
+ # since they will be recreated, and may cause churn. Uncomment if using
175
+ # auto-import.
176
+ # .idea/modules.xml
177
+ # .idea/*.iml
178
+ # .idea/modules
179
+ # *.iml
180
+ # *.ipr
181
+
182
+ # CMake
183
+ cmake-build-*/
184
+
185
+ #
186
+ Mongo Explorer plugin
187
+ .idea/**/mongoSettings.xml
188
+
189
+ # File-based project format
190
+ *.iws
191
+
192
+ # IntelliJ
193
+ out/
194
+
195
+ # mpeltonen/sbt-idea plugin
196
+ .idea_modules/
197
+
198
+ # JIRA plugin
199
+ atlassian-ide-plugin.xml
200
+
201
+ # Cursive Clojure plugin
202
+ .idea/replstate.xml
203
+
204
+ # Crashlytics plugin (for Android Studio and IntelliJ)
205
+ com_crashlytics_export_strings.xml
206
+ crashlytics.properties
207
+ crashlytics-build.properties
208
+ fabric.properties
209
+
210
+ # Editor-based Rest Client
211
+ .idea/httpRequests
212
+
213
+ # Android studio 3.1+ serialized cache file
214
+ .idea/caches/build_file_checksums.ser
215
+
216
+ ### VirtualEnv template
217
+ # Virtualenv
218
+ # http://iamzed.com/2009/05/07/a-primer-on-virtualenv/
219
+ .Python
220
+ [Bb]in
221
+ [Ii]nclude
222
+ [Ll]ib
223
+ [Ll]ib64
224
+ [Ll]ocal
225
+ [Ss]cripts
226
+ pyvenv.cfg
227
+ .venv
228
+ pip-selfcheck.json
229
+
230
+ files
231
+ Files
232
+ *.tmp
233
+ .vscode
234
+ my_virtual_environment
235
+ dist
236
+ crf_py_utils.egg-info
237
+ build
238
+ datas
239
+ tests/data
240
+ venv
241
+ create_docker_image.sh
242
+
243
+ anydonebert/data
244
+
245
+ results
246
+ train_test_split
247
+
248
+ anydonebert/models/sbert.net_models_paraphrase-distilroberta-base-v1
249
+ anydonebert/models/sbert.net_models_paraphrase-distilroberta-base-v2
250
+ resources/conll_files/*
251
+ resources/test_xml_files/*
252
+ resources/xml_files/*
253
+ config.ini
254
+ flowcess/commons/settings.py
255
+
.gradio/certificate.pem ADDED
@@ -0,0 +1,31 @@
1
+ -----BEGIN CERTIFICATE-----
2
+ MIIFazCCA1OgAwIBAgIRAIIQz7DSQONZRGPgu2OCiwAwDQYJKoZIhvcNAQELBQAw
3
+ TzELMAkGA1UEBhMCVVMxKTAnBgNVBAoTIEludGVybmV0IFNlY3VyaXR5IFJlc2Vh
4
+ cmNoIEdyb3VwMRUwEwYDVQQDEwxJU1JHIFJvb3QgWDEwHhcNMTUwNjA0MTEwNDM4
5
+ WhcNMzUwNjA0MTEwNDM4WjBPMQswCQYDVQQGEwJVUzEpMCcGA1UEChMgSW50ZXJu
6
+ ZXQgU2VjdXJpdHkgUmVzZWFyY2ggR3JvdXAxFTATBgNVBAMTDElTUkcgUm9vdCBY
7
+ MTCCAiIwDQYJKoZIhvcNAQEBBQADggIPADCCAgoCggIBAK3oJHP0FDfzm54rVygc
8
+ h77ct984kIxuPOZXoHj3dcKi/vVqbvYATyjb3miGbESTtrFj/RQSa78f0uoxmyF+
9
+ 0TM8ukj13Xnfs7j/EvEhmkvBioZxaUpmZmyPfjxwv60pIgbz5MDmgK7iS4+3mX6U
10
+ A5/TR5d8mUgjU+g4rk8Kb4Mu0UlXjIB0ttov0DiNewNwIRt18jA8+o+u3dpjq+sW
11
+ T8KOEUt+zwvo/7V3LvSye0rgTBIlDHCNAymg4VMk7BPZ7hm/ELNKjD+Jo2FR3qyH
12
+ B5T0Y3HsLuJvW5iB4YlcNHlsdu87kGJ55tukmi8mxdAQ4Q7e2RCOFvu396j3x+UC
13
+ B5iPNgiV5+I3lg02dZ77DnKxHZu8A/lJBdiB3QW0KtZB6awBdpUKD9jf1b0SHzUv
14
+ KBds0pjBqAlkd25HN7rOrFleaJ1/ctaJxQZBKT5ZPt0m9STJEadao0xAH0ahmbWn
15
+ OlFuhjuefXKnEgV4We0+UXgVCwOPjdAvBbI+e0ocS3MFEvzG6uBQE3xDk3SzynTn
16
+ jh8BCNAw1FtxNrQHusEwMFxIt4I7mKZ9YIqioymCzLq9gwQbooMDQaHWBfEbwrbw
17
+ qHyGO0aoSCqI3Haadr8faqU9GY/rOPNk3sgrDQoo//fb4hVC1CLQJ13hef4Y53CI
18
+ rU7m2Ys6xt0nUW7/vGT1M0NPAgMBAAGjQjBAMA4GA1UdDwEB/wQEAwIBBjAPBgNV
19
+ HRMBAf8EBTADAQH/MB0GA1UdDgQWBBR5tFnme7bl5AFzgAiIyBpY9umbbjANBgkq
20
+ hkiG9w0BAQsFAAOCAgEAVR9YqbyyqFDQDLHYGmkgJykIrGF1XIpu+ILlaS/V9lZL
21
+ ubhzEFnTIZd+50xx+7LSYK05qAvqFyFWhfFQDlnrzuBZ6brJFe+GnY+EgPbk6ZGQ
22
+ 3BebYhtF8GaV0nxvwuo77x/Py9auJ/GpsMiu/X1+mvoiBOv/2X/qkSsisRcOj/KK
23
+ NFtY2PwByVS5uCbMiogziUwthDyC3+6WVwW6LLv3xLfHTjuCvjHIInNzktHCgKQ5
24
+ ORAzI4JMPJ+GslWYHb4phowim57iaztXOoJwTdwJx4nLCgdNbOhdjsnvzqvHu7Ur
25
+ TkXWStAmzOVyyghqpZXjFaH3pO3JLF+l+/+sKAIuvtd7u+Nxe5AW0wdeRlN8NwdC
26
+ jNPElpzVmbUq4JUagEiuTDkHzsxHpFKVK7q4+63SM1N95R1NbdWhscdCb+ZAJzVc
27
+ oyi3B43njTOQ5yOf+1CceWxG1bQVs5ZufpsMljq4Ui0/1lvh+wjChP4kqKOJ2qxq
28
+ 4RgqsahDYVvTH9w7jXbyLeiNdd8XM2w9U/t7y0Ff/9yi0GE44Za4rF2LN9d11TPA
29
+ mRGunUHBcnWEvgJBQl9nJEiU0Zsnvgc/ubhPgXRR4Xq37Z0j4r7g1SgEEzwxA57d
30
+ emyPxgcYxn/eR44/KJ4EBs+lVDR3veyJm+kXQ99b21/+jh5Xos1AnX5iItreGCc=
31
+ -----END CERTIFICATE-----
Data/__init__.py ADDED
File without changes
Data/get_video_link.py ADDED
@@ -0,0 +1,152 @@
1
+ import os
2
+ import requests
3
+ from dotenv import load_dotenv
4
+ from Data.new_video_added import get_new_video_url
5
+ from datetime import datetime
6
+ import json
7
+ from pathlib import Path
8
+ load_dotenv()
9
+
10
+ api_key = os.getenv('API_KEY')
11
+ CURRENT_DIR = os.path.dirname(os.path.abspath(__file__))
12
+ BASE_URL = "https://www.googleapis.com/youtube/v3"
13
+ BASE_DIR = os.path.dirname(os.path.abspath(__file__))
14
+ channel = "https://www.youtube.com/@hubermanlab/videos"
15
+ new_video_added = False
16
+ # video_links_folder_name = os.path.join(BASE_DIR, "videolinks")
17
+ PROJECT_ROOT = Path(__file__).resolve().parent.parent
18
+ # print("THIS IS BASE DIR:", BASE_DIR)
19
+ # print("THIS is current dir:", CURRENT_DIR)
20
+ # video_links_folder_name = os.path.join(CURRENT_DIR, "videolinks")
21
+ video_links_folder_name = os.path.join(PROJECT_ROOT, "Data", "video_links")
22
+
23
+ def ensure_directories():
24
+ if not os.path.exists(video_links_folder_name):
25
+ os.makedirs(video_links_folder_name)
26
+ print(f"Directory {video_links_folder_name} created")
27
+
28
+
29
+ def get_chanel_id(chanel_name):
30
+ url = f"{BASE_URL}/search"
31
+ params = {
32
+ "part": "snippet",
33
+ "q": chanel_name,
34
+ "type": "channel",
35
+ "key": api_key
36
+ }
37
+ response = requests.get(url, params=params)
38
+ response_data = response.json()
39
+ if "items" in response_data and len(response_data["items"]) > 0:
40
+ return response_data["items"][0]["snippet"]["channelId"]
41
+ else:
42
+ return None
43
+
44
+
45
+ def get_video_links(channel_id):
46
+ url = f"{BASE_URL}/search"
47
+ video_links = []
48
+ next_page_token = None
49
+
50
+ while True:
51
+ params = {
52
+ "part": "snippet",
53
+ "channelId": channel_id,
54
+ "maxResults": 50,
55
+ "type": "video",
56
+ "key": api_key,
57
+ }
58
+ if next_page_token:
59
+ params["pageToken"] = next_page_token
60
+
61
+ response = requests.get(url, params=params)
62
+ response_data = response.json()
63
+
64
+ if "items" not in response_data:
65
+ break
66
+
67
+ for item in response_data["items"]:
68
+ video_id = item["id"]["videoId"]
69
+ video_links.append(f"https://www.youtube.com/watch?v={video_id}")
70
+
71
+ next_page_token = response_data.get("nextPageToken")
72
+ if not next_page_token:
73
+ break
74
+
75
+ return video_links
76
+
77
+
78
+ def save_video_links(video_links):
79
+ if not os.path.exists(video_links_folder_name):
80
+ os.makedirs(video_links_folder_name)
81
+ timestamp = datetime.now().strftime("%Y%m%d%H%M%S")
82
+ filename = f"video_links_{timestamp}.json"
83
+ filepath = os.path.join(video_links_folder_name, filename)
84
+ with open(filepath, 'w') as file:
85
+ json.dump(video_links, file)
86
+ print(f"{len(video_links)} video links saved successfully to {filename}")
87
+
88
+
89
+ def load_video_links():
90
+ """
91
+ Load the most recent video links file based on timestamp in the filename.
92
+ """
93
+ # List all files in the current directory
94
+ if not os.path.exists(video_links_folder_name):
95
+ print(f"{video_links_folder_name} does not exist")
96
+ files = [f for f in os.listdir(video_links_folder_name) if f.startswith("video_links_") and f.endswith(".json")]
97
+
98
+ if not files:
99
+ print("No video links file found.")
100
+ return []
101
+
102
+ # Sort files by the timestamp in their names (descending)
103
+ files.sort(key=lambda x: datetime.strptime(x[len("video_links_"):-len(".json")], "%Y%m%d%H%M%S"), reverse=True)
104
+
105
+ # Load the most recent file
106
+ latest_file = files[0]
107
+ filepath = os.path.join(video_links_folder_name, latest_file)
108
+ try:
109
+ with open(filepath, 'r') as file:
110
+ video_links = json.load(file)
111
+ print(f"{len(video_links)} video links loaded successfully from {latest_file}.")
112
+ return video_links
113
+ except Exception as e:
114
+ print(f"Error loading {latest_file}: {e}")
115
+ return []
116
+
117
+
118
+ def video_links_main():
119
+ ensure_directories()
120
+ video_links = load_video_links()
121
+ if video_links:
122
+ print(f"Using {len(video_links)} saved video links")
123
+ else:
124
+ channel_name = input("Enter the YouTube channel name: ")
125
+ channel_id = get_chanel_id(channel_name)
126
+
127
+ if channel_id:
128
+ print(f"Fetching videos for channel: {channel_name} (ID: {channel_id})")
129
+ video_links = get_video_links(channel_id)
130
+ save_video_links(video_links)
131
+ else:
132
+ print("Failed to fetch video links")
133
+ # for link in video_links:
134
+ # # print(link)
135
+ new_video_url = get_new_video_url(channel)
136
+ # new_video_url = new_video_url[:3]
137
+ new_videos = [url for url in new_video_url if url not in video_links]
138
+
139
+ if new_videos:
140
+ print(f"{len(new_videos)} new videos found")
141
+ video_links.extend(new_videos)
142
+ save_video_links(video_links)
143
+ new_video_added = True
144
+ else:
145
+ print("No new videos found")
146
+ new_video_added = False
147
+ # print(new_video_added)
148
+ return video_links, new_video_added, new_videos
149
+
150
+
151
+ if __name__ == "__main__":
152
+ video_links_main()
Data/new_video_added.py ADDED
@@ -0,0 +1,22 @@
1
+ import requests
2
+ import re
3
+
4
+
5
+
6
+ def get_new_video_url(channel):
7
+ """
8
+ Fetch all video URLs from the given YouTube channel page.
9
+ """
10
+ try:
11
+ html = requests.get(channel).text
12
+ # Extract all video IDs from the HTML
13
+ video_ids = re.findall(r'(?<="videoId":").*?(?=")', html)
14
+ video_urls = [f"https://www.youtube.com/watch?v={video_id}" for video_id in video_ids]
15
+
16
+ # Remove duplicates while preserving order
17
+ video_urls = list(dict.fromkeys(video_urls))
18
+ print(f"Fetched {len(video_urls)} video URLs from the channel.")
19
+ return video_urls
20
+ except Exception as e:
21
+ print(f"Error fetching video URLs: {e}")
22
+ return []
Data/yt_transcript.py ADDED
@@ -0,0 +1,94 @@
1
+ from youtube_transcript_api import YouTubeTranscriptApi
2
+ from Data.get_video_link import video_links_main
3
+ from pathlib import Path
4
+ from datetime import datetime
5
+
6
+ # Dynamically get the root directory of the project
7
+ PROJECT_ROOT = Path(__file__).resolve().parent.parent # Moves up from /Data/
8
+ TRANSCRIPTS_FOLDER = PROJECT_ROOT / "Data" / "transcripts"
9
+
10
+ def save_transcript(video_id, transcript_text):
11
+ """
12
+ Saves transcripts to the local folder
13
+ """
14
+ # Ensure the transcripts folder exists
15
+ TRANSCRIPTS_FOLDER.mkdir(parents=True, exist_ok=True)
16
+
17
+ timestamp = datetime.now().strftime("%Y%m%d%H%M%S")
18
+ filename = f"{video_id}_{timestamp}.txt"
19
+ file_path = TRANSCRIPTS_FOLDER / filename
20
+
21
+ file_path.write_text('\n'.join(transcript_text), encoding="utf-8")
22
+ return file_path
23
+
24
+
25
+ def get_video_id(video_links_list):
26
+ return [link.replace("https://www.youtube.com/watch?v=", "") for link in video_links_list]
27
+
28
+
29
+ def fetch_yt_transcript(video_ids):
30
+ """
31
+ Fetches YouTube transcripts using video IDs.
32
+ """
33
+ video_transcripts = {}
34
+
35
+ for video_id in video_ids:
36
+ print(f"Fetching transcript for: {video_id}")
37
+ try:
38
+ output = YouTubeTranscriptApi.get_transcript(video_id)
39
+ transcript_text = [item['text'] for item in output]
40
+
41
+ # Save transcript and get file path
42
+ file_path = save_transcript(video_id, transcript_text)
43
+ video_transcripts[video_id] = {
44
+ 'text': transcript_text,
45
+ 'file_path': str(file_path)
46
+ }
47
+ print(f"Transcript saved to: {file_path}")
48
+
49
+ except Exception as e:
50
+ print(f"Transcript not found for video {video_id}: {e}")
51
+ video_transcripts[video_id] = {
52
+ 'text': [],
53
+ 'file_path': None
54
+ }
55
+
56
+ return video_transcripts
57
+
58
+
59
+ def all_video_transcript_pipeline():
60
+ """
61
+ Handles fetching and storing transcripts, checking for new videos.
62
+ """
63
+ print(f"Looking for transcripts in: {TRANSCRIPTS_FOLDER}")
64
+ video_links_list, new_video_added, new_videos_link = video_links_main()
65
+ video_transcripts = {}
66
+
67
+ # Always load existing transcripts
68
+ if TRANSCRIPTS_FOLDER.exists():
69
+ existing_files = list(TRANSCRIPTS_FOLDER.glob("*.txt"))
70
+ print(f"Found {len(existing_files)} transcript files.")
71
+
72
+ for file in existing_files:
73
+ video_id = file.stem.split("_")[0] # Extract video ID
74
+ try:
75
+ transcript_text = file.read_text(encoding="utf-8").splitlines()
76
+ video_transcripts[video_id] = {
77
+ 'text': transcript_text,
78
+ 'file_path': str(file)
79
+ }
80
+ print(f"Loaded transcript for video: {video_id}")
81
+ except Exception as e:
82
+ print(f"Error loading transcript {file.name}: {e}")
83
+ else:
84
+ print(f"Transcripts folder not found at: {TRANSCRIPTS_FOLDER}, creating it.")
85
+ TRANSCRIPTS_FOLDER.mkdir(parents=True, exist_ok=True)
86
+
87
+ # Fetch new transcripts if needed
88
+ if new_video_added and new_videos_link:
89
+ print("New videos detected... Fetching transcripts.")
90
+ new_video_ids = [url.split("v=")[1] for url in new_videos_link] # Extract video IDs
91
+ new_transcripts = fetch_yt_transcript(new_video_ids)
+ video_transcripts.update(new_transcripts)  # merge newly fetched transcripts into the loaded set
92
+
93
+ print(f"Total transcripts loaded: {len(video_transcripts)}")
94
+
Dockerfile ADDED
@@ -0,0 +1,49 @@
1
+ # Declare build arguments at the top (for initial stage)
2
+ ARG USER_UID=1000
3
+ ARG USER_GID=1000
4
+
5
+ # Stage 1: Build dependencies
6
+ FROM python:3.11-slim AS builder
7
+ WORKDIR /app
8
+ RUN apt-get update && \
9
+ apt-get install -y --no-install-recommends \
10
+ build-essential \
11
+ git && \
12
+ rm -rf /var/lib/apt/lists/*
13
+ RUN python -m venv /opt/venv
14
+ ENV PATH="/opt/venv/bin:$PATH"
15
+ COPY requirements.txt .
16
+ RUN pip install --no-cache-dir -r requirements.txt
17
+
18
+ # Stage 2: Final image
19
+ FROM python:3.11-slim
20
+
21
+ # Re-declare build arguments for this stage
22
+ ARG USER_UID=1000
23
+ ARG USER_GID=1000
24
+
25
+ COPY --from=builder /opt/venv /opt/venv
26
+ ENV PATH="/opt/venv/bin:$PATH"
27
+ WORKDIR /app
28
+ RUN apt-get update && \
29
+ apt-get install -y --no-install-recommends \
30
+ libgomp1 && \
31
+ rm -rf /var/lib/apt/lists/*
32
+
33
+ COPY . .
34
+
35
+ # Create the group and user first
36
+ RUN groupadd -g ${USER_GID} appuser && \
37
+ useradd -m -u ${USER_UID} -g appuser appuser
38
+
39
+ # Create directories and set permissions
40
+ RUN mkdir -p /app/Rag/chromadb.db && \
41
+ mkdir -p /app/Data && \
42
+ chown -R appuser:appuser /app
43
+
44
+ USER appuser
45
+
46
+ # Make sure your Python code uses this path for ChromaDB
47
+ ENV CHROMA_PERSISTENCE_DIRECTORY=/app/Rag/chromadb.db
48
+
49
+ CMD ["python", "-m", "ui.app"]
Example/__init__.py ADDED
File without changes
Example/rag_example.py ADDED
@@ -0,0 +1,18 @@
1
+ import sys
2
+ import chromadb
3
+ from pathlib import Path
4
+ PROJECT_ROOT = Path(__file__).resolve().parent.parent
5
+ # transcripts_folder_path = '/home/nightwing/Codes/Xyzbot/Data/transcripts'
6
+ # transcripts_folder_path = 'Data/transcripts'
7
+ transcripts_folder_path = PROJECT_ROOT / "Data" / "transcripts"
8
+ chromadb_path = PROJECT_ROOT / "Rag" / "chromadb.db"
9
+ client = chromadb.PersistentClient(path=str(chromadb_path))
10
+ collection = client.get_or_create_collection(name="yt_transcript_collection")
11
+ sys.path.append(str(PROJECT_ROOT))
12
+ sys.path.append(str(PROJECT_ROOT / "Rag"))
13
+ # print("Python path:", sys.path)
14
+ from Rag.rag_pipeline import main_workflow
15
+
16
+ # Run the application
17
+ if __name__ == "__main__":
18
+ main_workflow(transcripts_folder_path, collection)
Llm/__init__.py ADDED
File without changes
Llm/llm_endpoints.py ADDED
@@ -0,0 +1,14 @@
1
+ from dotenv import load_dotenv
2
+ import os
3
+ import google.generativeai as genai
4
+
5
+
6
+ # Configure the Generative AI model with the API key from the environment
7
+ load_dotenv()
8
+ genai.configure(api_key=os.getenv("GEMINI_API_KEY"))
9
+ gemini_model = genai.GenerativeModel("models/gemini-1.5-flash")
10
+
11
+ # Function to get a response from the generative model
12
+ def get_llm_response(prompt: str) -> str:
13
+ response = gemini_model.generate_content(prompt)
14
+ return response.text
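+
+ # Illustrative usage (assumes GEMINI_API_KEY is set in the environment or a .env file):
+ # if __name__ == "__main__":
+ #     print(get_llm_response("Give one practical, research-backed tip for better sleep."))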
Prompts/__init__.py ADDED
File without changes
Prompts/huberman_prompt.py ADDED
@@ -0,0 +1,20 @@
1
+ huberman_prompt = """
2
+ You are Dr. Andrew Huberman, an expert neuroscientist and educator known for your clear, engaging, and scientifically accurate explanations. When answering, please consider the following:
3
+ 1. Provide a clear and concise summary of the scientific concepts involved.
4
+ 2. Highlight any relevant research or studies.
5
+ 3. Offer actionable insights or practical advice.
6
+
7
+ Context:
8
+ {context}
9
+
10
+ Sources:
11
+ {sources}
12
+
13
+ Conversation History:
14
+ {history}
15
+
16
+ Question:
17
+ {question}
18
+
19
+ Please respond in a manner that is informative, research-backed, and reflective of your unique style.
20
+ """
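+
+ # Note: Rag/rag_pipeline.py fills these placeholders via str.format, i.e.
+ # huberman_prompt.format(context=context, sources=sources_str, history=history_str, question=query_text)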
Prompts/summary_prompt.py ADDED
@@ -0,0 +1,10 @@
1
+ summary_prompts = """
2
+ #System
3
+ You are an AI agent whose job is to summarize the conversation between the AI bot and the user.
+ Here is the conversation history:
5
+ {{}}
6
+
7
+ #Output format
8
+
9
+
10
+ """
README.md CHANGED
@@ -12,3 +12,102 @@ short_description: a bot
12
  ---
13
 
14
 Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
15
+ # Andrew Huberman RAG-Based AI Chatbot
16
+
17
+ ## Overview
18
+ Xyzbot is an AI chatbot that extracts and synthesizes insights from Andrew Huberman's YouTube videos. It automatically retrieves video transcripts, updates its knowledge base in ChromaDB, and provides citation-linked responses.
19
+
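+ At query time the pipeline embeds the question, retrieves the closest transcript chunks from ChromaDB, and asks Gemini to answer with links to the matching episodes. A minimal sketch of that flow (illustrative, not a public API; assumes the ChromaDB store has already been populated, `GEMINI_API_KEY` is set, and the code is run from the project root; see `Rag/rag_pipeline.py` and `Example/rag_example.py`):
+
+ ```python
+ import chromadb
+ from Rag.rag_pipeline import query_database, generate_response
+ from utils.get_link import get_source_link
+
+ client = chromadb.PersistentClient(path="Rag/chromadb.db")
+ collection = client.get_or_create_collection(name="yt_transcript_collection")
+
+ question = "What does Dr. Huberman recommend for better sleep?"
+ docs, metadatas = query_database(collection, question)  # top transcript chunks
+ links = get_source_link(metadatas)                       # matching episode URLs
+ print(generate_response([], question, docs, links))      # [] = empty conversation history
+ ```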
20
+ ## 🚀 Key Features
21
+ - Mimics Andrew Huberman's insights using YouTube video transcripts
22
+ - Automatic transcript retrieval and knowledge base updates
23
+ - RAG-powered response generation with direct video citations
24
+ - Interactive Gradio chat interface
25
+ - Docker-based deployment for easy scalability
26
+
27
+ ## 🛠 Tech Stack
28
+ - Backend: Python, LangChain, Google Gemini API
29
+ - Frontend: Gradio
30
+ - Database: ChromaDB
31
+ - Deployment: Docker
32
+
33
+ ## 📂 Project Structure
34
+ ```
35
+ 📦 Xyzbot
36
+ ├── 📂 Data
37
+ ├── 📂 Example
38
+ ├── 📂 Llm
39
+ ├── 📂 Notebook
40
+ ├── 📂 Prompts
41
+ ├── 📂 Rag
42
+ │ ├── chromadb.db
43
+ │ └── 📂 Processed_folder
44
+ ├── 📂 utils
45
+ ├── Dockerfile
46
+ └── pyproject.toml
47
+ ```
48
+
49
+ ## 🔧 Prerequisites
50
+ - Python 3.11 (see `pyproject.toml` and the Dockerfile)
51
+ - Docker (optional)
52
+
53
+ ## 🔑 API Keys Required
54
+ 1. Google Gemini API Key
55
+ 2. YouTube API Key
56
+
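+ A minimal `.env` example (illustrative values; these are the variable names read by `Data/get_video_link.py`, `Llm/llm_endpoints.py`, `Rag/rag_pipeline.py`, and the Docker run commands below):
+
+ ```bash
+ # YouTube Data API key, used to fetch the channel's video links
+ API_KEY=your_youtube_api_key
+ # Gemini key, used for response generation
+ GEMINI_API_KEY=your_gemini_api_key
+ # Same Google/Gemini key, read by the RAG pipeline and passed to Docker
+ GOOGLE_API_KEY=your_gemini_api_key
+ ```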
57
+ ## 🚀 Installation
58
+
59
+ ### Local Setup
60
+ 1. Clone the repository
61
+ ```bash
62
+ git clone https://github.com/Angel-dash/Xyzbot.git
63
+ cd Xyzbot
64
+ ```
65
+
66
+ 2. Create virtual environment
67
+ ```bash
68
+ python3 -m venv venv
69
+ source venv/bin/activate
70
+ pip install -r requirements.txt
71
+ ```
72
+
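+ 3. (Optional) Run the bundled setup script, which installs the requirements plus the spaCy model and Coreferee data used by `utils/corefrence.py`
+ ```bash
+ bash setup.sh
+ ```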
73
+ ### Docker Setup
74
+
75
+ #### Option 1: Build Locally
76
+ ```bash
77
+ docker build -t xyzbot:v1.0 .
78
+ docker run -it \
79
+ -v $(pwd)/Rag:/app/Rag:rw \
80
+ -e GOOGLE_API_KEY=your_api_key \
81
+ xyzbot:v1.0
82
+ ```
83
+
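+ The Dockerfile also declares optional `USER_UID`/`USER_GID` build arguments (default 1000), so the container user can be matched to your host user when mounting `Rag/` read-write, for example:
+
+ ```bash
+ docker build --build-arg USER_UID=$(id -u) --build-arg USER_GID=$(id -g) -t xyzbot:v1.0 .
+ ```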
84
+ #### Option 2: Pull from Docker Hub
85
+ ```bash
86
+ docker pull angeldash/xyzbot:v1.0
87
+ docker run -it \
88
+ -v $(pwd)/Rag:/app/Rag:rw \
89
+ -e GOOGLE_API_KEY=your_api_key \
90
+ angeldash/xyzbot:v1.0
91
+ ```
92
+
93
+ ## 🖥️ Running the Application
94
+ ```bash
95
+ python -m ui.app
96
+ ```
97
+
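+ To query the pipeline from a terminal instead of the web UI, run the bundled example script, which starts the interactive console loop from `Rag/rag_pipeline.py`:
+ ```bash
+ python Example/rag_example.py
+ ```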
98
+ ## 📈 Future Roadmap
99
+ - Fine-tuned LLM response generation
100
+ - Real-time multi-channel monitoring
101
+ - Enhanced citation formatting
102
+ - AI agent conversation handling
103
+ - Performance optimization
104
+
105
+ ## 📜 License
106
+ MIT License
107
+
108
+ ## 🤝 Contributing
109
+ Contributions are welcome! Open an issue or submit a pull request.
110
+
111
+ ---
112
+ **Author:** Angel Dash | **GitHub:** [@Angel-dash](https://github.com/Angel-dash)
113
+
Rag/Processed_folder/processed_files.json ADDED
@@ -0,0 +1 @@
1
+ ["VOfwbcveP84_20241225194621.txt", "In9Bq4EJMZw_20241225194705.txt", "DkS1pkKpILY_20241225194325.txt", "ajneRM-ET1Q_20241225194311.txt", "K4Ze-Sp6aUE_20241225194709.txt", "n28W4AmvMDE_20241225194626.txt", "UIy-WQCZd4M_20241225194819.txt", "etbfLTHD_VU_20241225194439.txt", "PVmQOLYckKQ_20241225194814.txt", "F9KrZd_-ge0_20241225194812.txt", "xjEFo3a1AnI_20241225194539.txt", "szqPAPKE5tQ_20241225194712.txt", "3_auLYOilb8_20241225194826.txt", "acgz0C-z-gc_20241225194817.txt", "zVCaYyUWWSw_20241225194412.txt", "doupx8SAs5Y_20241225194603.txt", "wAZn9dF3XTo_20241225194423.txt", "2XGREPnlI8U_20241225194659.txt", "UNCwdFxPtE8_20241225194521.txt", "at37Y8rKDlA_20241225194513.txt", "oL3SkPV1_Ik_20241225194837.txt", "nOgypsWKjm4_20241225194440.txt", "rW9QKc-iFoY_20241225194751.txt", "CQlTmOFM4Qs_20241225194550.txt", "tR73Ny4Dt9s_20241225194413.txt", "t1F7EEGPQwo_20241225194649.txt", "ccrbE0QHy94_20241225194608.txt", "SyWC8ZFVxGo_20241225194333.txt", "zlc4VrDx_qk_20241225194800.txt", "8qaBpM73NSk_20241225194409.txt", "sxgCC4H1dl8_20241225194524.txt", "RBK5KLA5Jjg_20241225194446.txt", "slUCmZJDXrk_20241225194627.txt", "h2aWYjSA1Jc_20241225194702.txt", "Ov4yyK15-K8_20241225194230.txt", "juD99_sPWGU_20241225194340.txt", "q1Ss8sTbFBY_20241225194647.txt", "X8Hw8zeCDTA_20241225194518.txt", "UChhXiFPRgg_20241225194443.txt", "pq6WHJzOkno_20241225194415.txt", "2Ds1m5gflCI_20241225194849.txt", "jGZ1mR9uLU0_20241225194808.txt", "VAEzZeaV5zM_20241225194347.txt", "EhlIkzJwPlk_20241225194656.txt", "HiyzzcuaAac_20241225194255.txt", "C3X0bUAiluE_20241225194259.txt", "kG5Qb9sr0YQ_20241225194810.txt", "wRsX_ZkzxvQ_20241225194619.txt", "U2BPitASUh0_20241225194358.txt", "Wcs2PFz5q6g_20241225194327.txt", "CuzL1qxUyHw_20241225194312.txt", "q37ARYnRDGc_20241225194623.txt", "cp9GXl9Qk_s_20241225194735.txt", "XT_6Lvkhxvo_20241225194342.txt", "bUr_9fgfnto_20241225194256.txt", "LTGGyQS1fZE_20241225194305.txt", "mAlt_HKX4as_20241225194420.txt", "SZSRgyl7pyQ_20241225194418.txt", "RI112zW8GDw_20241225194356.txt", "ycOBZZeVeAc_20241225194707.txt", "6YLdlK2hYnw_20241225194328.txt", "p4ZfkezDTXQ_20241225194615.txt", "LVxL_p_kToc_20241225194558.txt", "HXzTbCEqCJc_20241225194710.txt", "yOoVz9E9kfQ_20241225194901.txt", "C5KpIXjpzdY_20241225194400.txt", "__RAXBLt1iM_20241225194430.txt", "8N7mdkrXgbc_20241225194338.txt", "JnlSDaBjCCU_20241225194450.txt", "IOl28gj_RXw_20241225194431.txt", "Nr5xb-QCBGA_20241225194354.txt", "GzvzWO0NU50_20241225194605.txt", "DtmwtjOoSYU_20241225194633.txt", "CrtR12PBKb0_20241225194632.txt", "gMRph_BvHB4_20241225194516.txt", "QpoaNklmRPc_20241225194248.txt", "9tRohh0gErM_20241225194353.txt", "Xu1FMCxoEFc_20241225194346.txt", "15R2pMqU2ok_20241225194406.txt", "eIxVfln02Ss_20241225194335.txt", "0Dtt95_xabw_20241225194252.txt", "3ZGItIAUQmI_20241225194719.txt", "uxZFl4BDOGk_20241225194757.txt", "hvPGfcAgk9Y_20241225194754.txt", "HYVeP4F0GNU_20241225194559.txt", "z5W74QC3v2I_20241225194308.txt", "31wjVhCcI5Y_20241225194426.txt", "BMTt8gSl13s_20241225194836.txt", "aQDOU3hPci0_20241225194501.txt", "tkH2-_jMCSk_20241225194543.txt", "ntfcfJ28eiU_20241225194522.txt", "S8nPJU9xkNw_20241225194748.txt", "fcxjwA4C4Cw_20241225194553.txt", "iMvtHqLmEkI_20241225194855.txt", "099hgtRoUZw_20241225194436.txt", "4RFEkGKKhdE_20241225194907.txt", "eJU6Df_ffAE_20241225194635.txt", "nqNEtdHVUjM_20241225194437.txt", "1SXDXdngX2M_20241225194316.txt", "X4QE6t-MkYE_20241225194642.txt", "79p1X_7rAMo_20241225194630.txt", "6RZbGrq9BxE_20241225194306.txt", "pkJi9Raxikg_20241225194824.txt", "QbMxDZeB8Ks_20241225194247.txt", 
"RgAcOqVRfYA_20241225194657.txt", "ncSoor2Iw8k_20241225194833.txt", "i_DEPeCKxs8_20241225194235.txt", "FE0lTEUa7EY_20241225194753.txt", "gE0_8AjTFaM_20241225194852.txt", "kgr22uMsJ5o_20241225194317.txt", "ufsIA5NARIo_20241225194535.txt", "CyDLbrZK75U_20241225194434.txt", "7TkGDj4LaOU_20241225194244.txt", "XLr2RKoD-oY_20241225194738.txt", "yb5zpo5WDG4_20241225194645.txt", "a9yFKPmPZ90_20241225194556.txt", "TG8VM5-CTfw_20241225194636.txt", "eMqWH3LYiII_20241225194351.txt", "CVh3_8e5u8I_20241225194246.txt", "SuR0DaYoe0Y_20241225194302.txt", "FLxIoNguGRU_20241225194233.txt", "GA89kjVY6Ik_20241225194854.txt", "qJ3uV7coZbA_20241225194453.txt", "EQ3GjpGq5Y8_20241225194405.txt", "yOJvm_ri_hk_20241225194555.txt", "cwakOgHIT0E_20241225194421.txt", "DTCmprPCDqc_20241225194733.txt", "qPKd99Pa2iU_20241225194500.txt", "nm1TxQj9IsQ_20241225194611.txt", "LRM5LutB538_20241225194857.txt", "xTtM2AvCRyA_20241225194643.txt", "62lVH-6xYGY_20241225194250.txt", "Rxmv7rT9leo_20241225194417.txt", "ulHrUVV3Kq4_20241225194452.txt", "bGixnNGvSkg_20241225194231.txt", "1CxJVdeyltw_20241225194614.txt", "wgUjIRtote8_20241225194726.txt", "qPKd99Pa2iU_20241225194503.txt", "S_SrHS8FvMM_20241225194807.txt", "xX6hiEmDmxs_20241225194227.txt", "uXs-zPc63kM_20241225194449.txt", "4AwyVTHEU3s_20241225194904.txt", "xaE9XyMMAHY_20241225194848.txt", "hFL6qRIJZ_Y_20241225194428.txt", "FOi5s3OUogo_20241225194245.txt", "cS7cNaBrkxo_20241225194624.txt", "kpTJqwIfHcM_20241225194654.txt", "yixIc1Ai6jM_20241225194829.txt", "vfRtLI6cJrk_20241225194324.txt", "GLgKkG44MGo_20241225194729.txt", "KPlJcD-o-4Q_20241225194617.txt", "AtChcxeaukQ_20241225194646.txt", "tLS6t3FVOTI_20241225194714.txt", "GqPGXG5TlZw_20241225194541.txt", "UF0nqolsNZc_20241225194727.txt", "7R3-3HR6-u4_20241225194519.txt", "tLRCS48Ens4_20241225194447.txt", "V0Sdgn0_kFM_20241225194740.txt", "G1VUSu6sGoU_20241225194251.txt", "m_OazsImOiI_20241225194322.txt", "Og56hmAspV8_20241225194258.txt", "dFR_wFN23ZY_20241225194640.txt", "q-H_A_dQUxQ_20241225194303.txt", "KVjfFN89qvQ_20241225194314.txt", "zU5EYw06wtw_20241225194349.txt", "Z7MU6zrAXsM_20241225194442.txt", "LYYyQcAJZfk_20241225194508.txt", "E7W4OQfJWdw_20241225194717.txt", "azb3Ih68awQ_20241225194505.txt", "ouCWNRvPk20_20241225194401.txt", "uwWOc_RqTBA_20241225194858.txt", "pZX8ikmWvEU_20241225194510.txt", "n9IxomBusuw_20241225194545.txt", "BwyZIWeBpRw_20241225194534.txt", "XY0rBdaDXD8_20241225194226.txt", "1Wo6SqLNmLk_20241225194845.txt", "ddq8JIMhz7c_20241225194529.txt", "VQLU7gpk_X8_20241225194821.txt", "jC8Pu9HBd48_20241225194321.txt", "rZkMpVLcVsg_20241225194319.txt", "gbQFSMayJxk_20241225194736.txt", "F54qXuTpgfM_20241225194843.txt", "p3JLaF_4Tz8_20241225194537.txt", "FeRgqJVALMQ_20241225194433.txt", "hF32FvBH4gI_20241225194332.txt", "CDUetQMKM6g_20241225194454.txt", "wG3UFHR1o48_20241225194229.txt", "6P8hrzjnetU_20241225194336.txt", "WFcYF_pxLgA_20241225194458.txt", "77CdVSpnUX4_20241225194746.txt", "VOfwbcveP84_20241225194742.txt", "VRvn3Oj5r3E_20241225194839.txt", "Gf-kC30SLtc_20241225194846.txt", "S8jWFcDGz4Y_20241225194805.txt", "x3MgDtZovks_20241225194526.txt", "lIo9FcrljDk_20241225194309.txt", "-e9ErUozQo4_20241225194903.txt", "aXvDEmo6uS4_20241225194629.txt", "3gtvNYa3Nd8_20241225194531.txt", "5tYR7e5Wpyc_20241225194238.txt", "OadokY8fcAA_20241225194601.txt", "O640yAgq5f8_20241225194744.txt", "zbpb1wd-wvs_20241225194827.txt", "gXvuJu1kt48_20241225194638.txt", "zEYE-vcVKy8_20241225194547.txt", "Ky-ZJ9SS-x4_20241225194240.txt", "0RYyQRQFgFk_20241225194532.txt", "4F_RBc1akC8_20241225194724.txt", 
"nDLb8_wgX50_20241225194540.txt", "tcueMCe-0zo_20241225194236.txt", "K-TW2Chpz4k_20241225194330.txt", "XcvhERcZpWw_20241225194731.txt", "Ze2pc6NwsHQ_20241225194704.txt", "_ltcLEM-5HU_20241225194612.txt", "jouFvyRZntk_20241225194507.txt", "uWV9a3zEaL4_20241225194823.txt", "-OBCwiPPfEU_20241225194747.txt", "dzOvi0Aa2EA_20241225194301.txt", "K9lORz2_XSU_20241225194527.txt", "j2sMqSDLd4k_20241225194407.txt", "oNkDA2F7CjM_20241225194651.txt", "50BZQRT1dAg_20241225194403.txt", "q8CHXefn7B4_20241225194411.txt", "Jy4rJcYmtUM_20241225194344.txt", "QmOF0crdyRU_20241225194456.txt", "6ZrlsVx85ek_20241225194758.txt", "CD0bRU1e1ZM_20241225194425.txt", "IAnhFUUCq6c_20241225194804.txt", "Phm-Alz1Zjo_20241225194906.txt", "csubiPlvFWk_20241225194606.txt", "GpgqXCkRO-w_20241225194701.txt", "W5zqC5cYcS0_20241225194241.txt", "T65RDBiB5Hs_20241225194715.txt", "6I5I56uVvLw_20241225194801.txt", "i5611OvTFGM_20241225194548.txt", "wTBSGgbIvsY_20241225194552.txt", "O1YRwWmue4Y_20241225194815.txt", "29n0WG317tM_20241225194511.txt", "xmhsWAqP_0Y_20241225194851.txt", "x4m_PdFbu-s_20241225194722.txt"]
Rag/__init__.py ADDED
File without changes
Rag/rag_pipeline.py ADDED
@@ -0,0 +1,183 @@
1
+ import chromadb
2
+ from langchain.text_splitter import RecursiveCharacterTextSplitter
3
+ from sentence_transformers import SentenceTransformer
4
+ import google.generativeai as genai
5
+ import os
6
+ import logging
7
+ from concurrent.futures import ProcessPoolExecutor, as_completed
8
+ from Llm.llm_endpoints import get_llm_response
9
+ from utils.get_link import get_source_link
10
+ from Prompts.huberman_prompt import huberman_prompt
11
+ from tqdm import tqdm
12
+ # Configuration
13
+ API_KEY = os.getenv("GOOGLE_API_KEY")
14
+ if API_KEY:
15
+ genai.configure(api_key=API_KEY)
16
+
17
+ chromadb_path = "app/Rag/chromadb.db"
18
+ embedding_model = SentenceTransformer('all-MiniLM-L6-v2')
19
+
20
+ # Logging
21
+ logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(message)s')
22
+
23
+
24
+ # Helper Functions
25
+ def split_text_to_chunks(docs, chunk_size=1000, chunk_overlap=200):
26
+ """Split text into manageable chunks."""
27
+ text_splitter = RecursiveCharacterTextSplitter(chunk_size=chunk_size, chunk_overlap=chunk_overlap)
28
+ chunks = text_splitter.split_text(docs)
29
+ return chunks
30
+
31
+
32
+ def get_new_files(transcripts_folder_path, collection):
33
+ """Find new transcript files that haven't been processed yet."""
34
+ all_files = [f for f in os.listdir(transcripts_folder_path) if f.endswith(".txt")]
35
+ existing_files = [meta["source"] for meta in collection.get()['metadatas']]
36
+ return [f for f in all_files if f not in existing_files]
37
+
38
+
39
+ def process_single_file(file_path):
40
+ """Process a single file and return its chunks."""
41
+ with open(file_path, 'r') as f:
42
+ content = f.read()
43
+ chunks = split_text_to_chunks(content)
44
+ return chunks, os.path.basename(file_path)
45
+
46
+
47
+ def batch_embed_chunks(chunks, batch_size=32):
48
+ """Embed chunks in batches."""
49
+ embeddings = []
50
+ for i in tqdm(range(0, len(chunks), batch_size), desc="Embedding chunks"):
51
+ batch = chunks[i:i + batch_size]
52
+ batch_embeddings = embedding_model.encode(batch, show_progress_bar=True)
53
+ embeddings.extend(batch_embeddings.tolist())
54
+ return embeddings
55
+
56
+
57
+ def process_and_add_new_files(transcripts_folder_path, collection):
58
+ """Process and add new transcript files to the vector database."""
59
+ new_files = get_new_files(transcripts_folder_path, collection)
60
+ if not new_files:
61
+ logging.info("No new files to process")
62
+ return False
63
+
64
+ # Use a reasonable number of workers (4 is usually a good default)
65
+ n_workers = min(4, len(new_files))
66
+ logging.info(f"Using {n_workers} workers for processing")
67
+
68
+ all_chunks = []
69
+ all_metadata = []
70
+ all_ids = []
71
+
72
+ # Process files in parallel
73
+ with ProcessPoolExecutor(max_workers=n_workers) as executor:
74
+ futures = {
75
+ executor.submit(process_single_file, os.path.join(transcripts_folder_path, file)): file
76
+ for file in new_files
77
+ }
78
+
79
+ for future in as_completed(futures):
80
+ file = futures[future]
81
+ try:
82
+ chunks, filename = future.result()
83
+ file_metadata = [{"source": filename} for _ in range(len(chunks))]
84
+ file_ids = [f"{filename}_chunk_{i}" for i in range(len(chunks))]
85
+
86
+ all_chunks.extend(chunks)
87
+ all_metadata.extend(file_metadata)
88
+ all_ids.extend(file_ids)
89
+
90
+ logging.info(f"Processed {filename}")
91
+ except Exception as e:
92
+ logging.error(f"Error processing {file}: {str(e)}")
93
+ continue
94
+
95
+ # Process embeddings in batches
96
+ logging.info(f"Generating embeddings for {len(all_chunks)} chunks")
97
+ embeddings = batch_embed_chunks(all_chunks)
98
+
99
+ # Add to database in batches
100
+ batch_size = 500
101
+ for i in range(0, len(all_chunks), batch_size):
102
+ end_idx = min(i + batch_size, len(all_chunks))
103
+ collection.upsert(
104
+ documents=all_chunks[i:end_idx],
105
+ embeddings=embeddings[i:end_idx],
106
+ metadatas=all_metadata[i:end_idx],
107
+ ids=all_ids[i:end_idx]
108
+ )
109
+ logging.info(f"Added batch {i // batch_size + 1} to database")
110
+
111
+ logging.info(f"Successfully processed {len(new_files)} files")
112
+ return True
113
+
114
+
115
+ def query_database(collection, query_text, n_results=3):
116
+ """Retrieve the most relevant chunks for the query."""
117
+ query_embeddings = embedding_model.encode(query_text).tolist()
118
+ results = collection.query(query_embeddings=query_embeddings, n_results=n_results)
119
+ retrieved_docs = results['documents'][0]
120
+ metadatas = results['metadatas'][0]
121
+ return retrieved_docs, metadatas
122
+
123
+
124
+ def enhance_query_with_history(query_text, summarized_history):
125
+ enhance_query = f"{query_text}\n\n{summarized_history}"
126
+ return enhance_query
127
+
128
+
129
+ def update_conversation_history(history, user_query, bot_response):
130
+ """Update and keeps track of conversation history between user and the bot."""
131
+ history.append({"user": user_query, "bot": bot_response})
132
+ return history
133
+
134
+
135
+ def generate_response(conversation_history, query_text, retrieved_docs, source_links):
136
+ """Generate a response using retrieved documents and the generative AI model."""
137
+ context = " ".join(retrieved_docs)
138
+ history_str = "\n".join([f"User: {turn['user']}\nBot: {turn['bot']}" for turn in conversation_history])
139
+ sources_str = "\n".join(source_links)
140
+
141
+ prompt = huberman_prompt.format(
142
+ context=context,
143
+ sources=sources_str,
144
+ history=history_str,
145
+ question=query_text
146
+ )
147
+
148
+ response = get_llm_response(prompt)
149
+ full_response = f"{response}\n\nSources:\n{sources_str}"
150
+ return full_response
151
+
152
+
153
+ def main_workflow(transcripts_folder_path, collection):
154
+ """Run the full RAG workflow."""
155
+ new_files_added = process_and_add_new_files(transcripts_folder_path, collection)
156
+ if new_files_added:
157
+ logging.info("New transcripts added to the database.")
158
+ else:
159
+ logging.info("No new files found. Using existing database.")
160
+
161
+ conversation_history = []
162
+
163
+ while True:
164
+ query_text = input("\nEnter your query (or type 'exit' to end): ").strip()
165
+ if query_text.lower() == "exit":
166
+ print("Ending the conversation. Goodbye")
167
+ break
168
+
169
+ query_text_with_conversation_history = enhance_query_with_history(query_text, conversation_history)
170
+ retrived_docs, metadatas = query_database(collection, query_text_with_conversation_history)
171
+ print("-" * 50)
172
+ source_link = get_source_link(metadatas)
173
+ print(source_link)
174
+ print("-" * 50)
175
+
176
+ if not retrived_docs:
177
+ print("No relevant documents found")
178
+ continue
179
+
180
+ response = generate_response(conversation_history, query_text, retrived_docs, source_link)
181
+ conversation_history = update_conversation_history(conversation_history, query_text, response)
182
+ print("\nGenerated Response:")
183
+ print(response)
poetry.lock ADDED
The diff for this file is too large to render.
 
pyproject.toml ADDED
@@ -0,0 +1,34 @@
1
+ [project]
2
+ name = "xyzbot"
3
+ version = "0.1.0"
4
+ description = "A rag application"
5
+ authors = [
6
+ {name = "Angel", email = "njlghmr@gmail.com"}
7
+ ]
8
+ license = {text = "MIT"}
9
+ readme = "README.md"
10
+ requires-python =">=3.11,<3.12"
11
+ dependencies = [
12
+ "pyarrow (>=19.0.0,<20.0.0)",
13
+ "pandas (>=2.2.3,<3.0.0)",
14
+ "pendulum (>=3.0.0,<4.0.0)",
15
+ "google-generativeai (>=0.8.4,<0.9.0)",
16
+ "langchain (>=0.3.16,<0.4.0)",
17
+ "langchain-openai (>=0.3.3,<0.4.0)",
18
+ "langchain-chroma (>=0.2.1,<0.3.0)",
19
+ "langchain-community (>=0.3.16,<0.4.0)",
20
+ "chromadb (>=0.4.14)",
21
+ "pypdf (==4.2.0)",
22
+ "flask (==3.0.1)",
23
+ "flask-cors (==3.0.10)",
24
+ "sentence-transformers (==3.3.1)",
25
+ "tqdm (==4.67.1)",
26
+ "torch (==2.5.1)",
27
+ "transformers (==4.46.3)",
28
+ "pydantic (>=2.7.4,<3.0.0)"
29
+ ]
30
+
31
+
32
+ [build-system]
33
+ requires = ["poetry-core>=2.0.0,<3.0.0"]
34
+ build-backend = "poetry.core.masonry.api"
requirements.in ADDED
@@ -0,0 +1,18 @@
1
+ pyarrow
2
+ pandas[performance, parquet, aws]
3
+ pendulum
4
+ google.generativeai
5
+ langchain
6
+ langchain_openai
7
+ langchain_chroma
8
+ langchain_community
9
+ chromadb==0.4.8
10
+ pypdf
11
+ flask
12
+ flask_cors
13
+ sentence_transformers
14
+ tqdm
15
+ torch
16
+ transformers
17
+ spacy==3.5.0
18
+ coreferee==1.4.1
requirements.txt ADDED
@@ -0,0 +1,20 @@
1
+ # Core dependencies
2
+ langchain>=0.3.16,<0.4.0
3
+ langchain_openai
4
+ langchain_chroma
5
+ langchain-community>=0.3.16,<0.4.0
6
+ chromadb>=0.4.14
7
+ flask==3.0.1
8
+ flask_cors==3.0.10
9
+ google.generativeai
10
+ pydantic>=2.7.4,<3.0.0
11
+ streamlit
12
+ # PDF Processing
13
+ pypdf==4.2.0
14
+
15
+ # ML/AI Dependencies (with CPU-only versions)
16
+ sentence_transformers==2.3.1
17
+ --extra-index-url https://download.pytorch.org/whl/cpu
18
+ torch==2.1.0+cpu
19
+
20
+ gradio
setup.sh ADDED
@@ -0,0 +1,10 @@
1
+ #!/usr/bin/env bash
+ # Install Python dependencies
2
+ pip install -r requirements.txt
3
+
4
+ # Download spaCy model
5
+ python -m spacy download en_core_web_sm
6
+
7
+ # Install Coreferee for English
8
+ python -m coreferee install en
9
+
10
+ echo "Setup completed successfully!"
ui/__init__.py ADDED
File without changes
ui/app.py ADDED
@@ -0,0 +1,147 @@
1
+ import gradio as gr
2
+ import chromadb
3
+ from typing import List, Dict
4
+ import sys
5
+ from pathlib import Path
6
+
7
+ project_root = Path(__file__).resolve().parent.parent
8
+ sys.path.append(str(project_root))
9
+ sys.path.append(str(project_root / "Rag"))
10
+ sys.path.append(str(project_root / "Data"))
11
+ sys.path.append(str(project_root / "Data" / "transcripts"))
12
+ sys.path.append(str(project_root / "Data" / "video_links"))
13
+ sys.path.append(str(project_root / "Llm"))
14
+ sys.path.append(str(project_root / "Prompts"))
15
+ sys.path.append(str(project_root / "utils"))
16
+ from Rag.rag_pipeline import (
17
+ query_database,
18
+ generate_response,
19
+ enhance_query_with_history,
20
+ update_conversation_history,
21
+ process_and_add_new_files
22
+ )
23
+
24
+ INTRODUCTION = """
25
+ # 🧠 Welcome to HubermanBot!
26
+
27
+ I am your AI assistant trained on Andrew Huberman's podcast content. My knowledge base includes detailed information about:
28
+
29
+ - 🎯 Peak Performance & Focus
30
+ - 😴 Sleep Science & Optimization
31
+ - 🏋️ Physical Fitness & Recovery
32
+ - 🧘 Mental Health & Stress Management
33
+ - 🧪 Neuroscience & Biology
34
+ - 💪 Habit Formation & Behavior Change
35
+
36
+ For each response, I'll provide:
37
+ - Detailed answers based on podcast content
38
+ - Direct source links to specific episodes
39
+ - Scientific context when available
40
+
41
+ Ask me anything about these topics, and I'll help you find relevant information from the Huberman Lab Podcast!
42
+
43
+ Example questions you might ask:
44
+ - "What does Dr. Huberman recommend for better sleep?"
45
+ - "How can I improve my focus and concentration?"
46
+ - "What are the best practices for morning routines?"
47
+ """
48
+
49
+
50
+ def format_youtube_url(filename: str) -> str:
51
+ """Convert filename to YouTube URL"""
52
+ # Extract video ID by removing the timestamp and .txt extension
53
+ video_id = filename.split('_')[0]
54
+ return f"https://www.youtube.com/watch?v={video_id}"
55
+
56
+
57
+ class RAGChatInterface:
58
+ def __init__(self, transcripts_folder_path: str, collection):
59
+ self.transcripts_folder_path = transcripts_folder_path
60
+ self.collection = collection
61
+ self.conversation_history: List[Dict[str, str]] = []
62
+
63
+ def process_query(self, message: str, history: List[List[str]]) -> str:
64
+ """Process a single query and return the response"""
65
+ # Convert Gradio history format to our conversation history format
66
+ self.conversation_history = [
67
+ {"user": user_msg, "bot": bot_msg}
68
+ for user_msg, bot_msg in history
69
+ ]
70
+
71
+ # Enhance query with conversation history
72
+ query_with_history = enhance_query_with_history(message, self.conversation_history)
73
+
74
+ # Get relevant documents
75
+ retrieved_docs, metadatas = query_database(self.collection, query_with_history)
76
+
77
+ if not retrieved_docs:
78
+ return "I apologize, but I couldn't find any relevant information about that in my knowledge base. Could you try rephrasing your question or ask about a different topic covered in the Huberman Lab Podcast?"
79
+
80
+ # Generate response
81
+ source_links = [meta["source"] for meta in metadatas]
82
+ response = generate_response(
83
+ self.conversation_history,
84
+ message,
85
+ retrieved_docs,
86
+ source_links
87
+ )
88
+
89
+ # Remove duplicate sources and convert to YouTube URLs
90
+ unique_sources = list(set(source_links))
91
+ youtube_urls = [format_youtube_url(source) for source in unique_sources]
92
+
93
+ # Format response with markdown for better readability
94
+ formatted_response = f"{response}\n\n---\n📚 **Source Episodes:**\n"
95
+ for url in youtube_urls:
96
+ formatted_response += f"- {url}\n"
97
+
98
+ return formatted_response
99
+
100
+
101
+ def create_interface(transcripts_folder_path: str, collection) -> gr.Interface:
102
+ """Create and configure the Gradio interface"""
103
+ # Initialize the RAG chat interface
104
+ rag_chat = RAGChatInterface(transcripts_folder_path, collection)
105
+
106
+ # Create the Gradio interface with custom styling
107
+ interface = gr.ChatInterface(
108
+ fn=rag_chat.process_query,
109
+ title="🧠 HubermanBot - Your Neuroscience & Wellness AI Assistant",
110
+ description=INTRODUCTION,
111
+ examples=[
112
+ "What are Dr. Huberman's top recommendations for better sleep?",
113
+ "How does sunlight exposure affect our circadian rhythm?",
114
+ "What supplements does Dr. Huberman recommend for focus?",
115
+ "What are the best practices for morning routines according to Dr. Huberman?",
116
+ "How can I optimize my workout recovery based on neuroscience?",
117
+ ],
118
+ theme=gr.themes.Soft(
119
+ primary_hue="indigo",
120
+ secondary_hue="blue",
121
+ )
122
+ )
123
+
124
+ return interface
125
+
126
+
127
+ def main():
128
+ # Get absolute path for ChromaDB
129
+ project_root = Path(__file__).parent.parent
130
+ chromadb_path = project_root / "Rag" / "chromadb.db"
131
+
132
+ client = chromadb.PersistentClient(path=str(chromadb_path))
133
+ collection = client.get_or_create_collection(name="yt_transcript_collection")
134
+
135
+ # Use absolute path for transcripts folder too
136
+ transcripts_folder_path = project_root / "Data" / "transcripts"
137
+
138
+ # Process any new files
139
+ process_and_add_new_files(str(transcripts_folder_path), collection)
140
+
141
+ # Create and launch the interface
142
+ interface = create_interface(str(transcripts_folder_path), collection)
143
+ interface.launch(share=True, server_port=7860)
144
+
145
+
146
+ if __name__ == "__main__":
147
+ main()
utils/__init__.py ADDED
File without changes
utils/corefrence.py ADDED
@@ -0,0 +1,52 @@
1
+ import spacy
2
+ from spacy.tokens import Doc
3
+ import coreferee
4
+
5
+ # Load spaCy model
6
+ nlp = spacy.load('en_core_web_sm')
7
+ nlp.add_pipe("coreferee")
8
+
9
+ # Register the custom extension attribute
10
+ Doc.set_extension('resolved_text', default=None, force=True)
11
+
12
+
13
+ def resolve_coreferences(query_text, conversation_history):
14
+ """
15
+ Resolve coreferences in the given text using spaCy and coreferee.
16
+
17
+ Args:
18
+ query_text (str): The current query to resolve
19
+ conversation_history (list): List of dictionaries containing previous conversation turns
20
+
21
+ Returns:
22
+ str: Text with resolved coreferences
23
+ """
24
+ # Combine conversation history and current query
25
+ combined_text = []
26
+ for turn in conversation_history:
27
+ combined_text.append(f"User: {turn['user']}")
28
+ combined_text.append(f"Bot: {turn['bot']}")
29
+ combined_text.append(f"User: {query_text}")
30
+ text = "\n".join(combined_text)
31
+
32
+ # Process the text
33
+ doc = nlp(text)
34
+
35
+ # Get all tokens and their potential antecedents
36
+ resolved_tokens = list(doc)
37
+
38
+ # Resolve coreferences
39
+ for chain in doc._.coref_chains:
40
+ for mention in chain:
41
+ if mention.root_index != chain.most_specific.root_index:
42
+ # Replace mention with its antecedent
43
+ resolved_tokens[mention.root_index] = doc[chain.most_specific.root_index]
44
+
45
+ # Reconstruct the text with resolved references
46
+ resolved_text = "".join([token.text_with_ws if isinstance(token, spacy.tokens.Token)
47
+ else token.text + " " for token in resolved_tokens])
48
+
49
+ # Extract the resolved query (last line)
50
+ resolved_query = resolved_text.split('\n')[-1].replace("User: ", "").strip()
51
+
52
+ return resolved_query
utils/get_link.py ADDED
@@ -0,0 +1,11 @@
+ def get_source_link(metadatas):
+     """Build YouTube watch URLs from chunk metadata, where 'source' is a
+     transcript filename of the form '<video_id>_<timestamp>.txt'."""
+     base_link = 'https://www.youtube.com/watch?v='
+     yt_link = []
+     for metadata in metadatas:
+         source = metadata['source']
+         # Strip the '.txt' extension and the timestamp suffix to recover the video ID
+         video_id = source.split('.txt')[0].split('_')[0]
+         yt_link.append(base_link + video_id)
+     return yt_link
utils/summarization.py ADDED
@@ -0,0 +1,14 @@
1
+ from Llm.llm_endpoints import get_llm_response
2
+
3
+
4
+ def summarize_conversation(conversation_history):
5
+ try:
6
+ summary_prompt = "Summarize the following conversation:\n" + "\n".join(
7
+ [f"User: {turn['user']}\nBot: {turn['bot']}" for turn in conversation_history])
8
+ summary = get_llm_response(summary_prompt)
9
+ print("*************************************************")
10
+ print(summary)
11
+ print("*************************************************")
12
+ return summary
13
+ except Exception as e:
+ print(f"Error while summarizing conversation: {e}")
+ return ""