Nightwing11 committed
Commit 075d061
2 Parent(s): fba61a7 100e32f

Resolved merge conflict in README.md
.dockerignore ADDED
@@ -0,0 +1,23 @@
1
+ # Ignore version control
2
+ .git
3
+ .gitignore
4
+
5
+ # Ignore notebooks
6
+ Notebook/
7
+
8
+ # Ignore databases and logs
9
+ **/*.db
10
+ **/*.sqlite3
11
+ **/chromadb.db
12
+ **/error_log.txt
13
+
14
+ # Ignore cache
15
+ **/__pycache__/
16
+ **/*.pyc
17
+ **/*.pyo
18
+ **/*.pyd
19
+
20
+ # Ignore environment files
21
+ .env
22
+ venv/
23
+ .venv/
.gitignore ADDED
@@ -0,0 +1,255 @@
1
+ tmp
2
+ .idea
3
+ models
4
+
5
+ stanford-ner-2015-04-20.zip
6
+ stanford-ner-2015-04-20
7
+ *.pyc
8
+
9
+ ### Python template
10
+ # Byte-compiled / optimized / DLL files
11
+ __pycache__/
12
+ *.py[cod]
13
+ *$py.class
14
+
15
+ # C extensions
16
+ *.so
17
+
18
+ # Distribution / packaging
19
+ .Python
20
+ build/
21
+ develop-eggs/
22
+ dist/
23
+ downloads/
24
+ eggs/
25
+ .eggs/
26
+ lib/
27
+ lib64/
28
+ parts/
29
+ sdist/
30
+ var/
31
+ wheels/
32
+ pip-wheel-metadata/
33
+ share/python-wheels/
34
+ *.egg-info/
35
+ .installed.cfg
36
+ *.egg
37
+ MANIFEST
38
+
39
+ # PyInstaller
40
+ # Usually these files are written by a python script from a template
41
+ # before PyInstaller builds the exe, so as to inject date/other infos into it.
42
+ *.manifest
43
+ *.spec
44
+
45
+ # Installer logs
46
+ pip-log.txt
47
+ pip-delete-this-directory.txt
48
+
49
+ # Unit test / coverage reports
50
+ htmlcov/
51
+ .tox/
52
+ .nox/
53
+ .coverage
54
+ .coverage.*
55
+ .cache
56
+ nosetests.xml
57
+ coverage.xml
58
+ *.cover
59
+ .hypothesis/
60
+ .pytest_cache/
61
+
62
+ # Translations
63
+ *.mo
64
+ *.pot
65
+
66
+ service.log.*
67
+
68
+ # Django stuff:
69
+ *.log
70
+ local_settings.py
71
+ db.sqlite3
72
+ db.sqlite3-journal
73
+
74
+ # Flask stuff:
75
+ instance/
76
+ .webassets-cache
77
+
78
+ # Scrapy stuff:
79
+ .scrapy
80
+
81
+ # Sphinx documentation
82
+ docs/_build/
83
+
84
+ # PyBuilder
85
+ target/
86
+
87
+ # Jupyter Notebook
88
+ .ipynb_checkpoints
89
+
90
+ # IPython
91
+ profile_default/
92
+ ipython_config.py
93
+
94
+ # pyenv
95
+ .python-version
96
+
97
+ # pipenv
98
+ # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
99
+ # However, in case of collaboration, if having platform-specific dependencies or dependencies
100
+ # having no cross-platform support, pipenv may install dependencies that don't work, or not
101
+ # install all needed dependencies.
102
+ #Pipfile.lock
103
+
104
+ # celery beat schedule file
105
+ celerybeat-schedule
106
+
107
+ # SageMath parsed files
108
+ *.sage.py
109
+
110
+ # Environments
111
+ .env
112
+ .venv
113
+ env/
114
+ venv/
115
+ ENV/
116
+ env.bak/
117
+ venv.bak/
118
+ Data/transcripts/
119
+ Data/videolinks/
120
+ Rag/db/
121
+ Rag/db/chroma.sqlite3
122
+ Rag/chromadb.db/
123
+ # Spyder project settings
124
+ .spyderproject
125
+ .spyproject
126
+
127
+ # Rope project settings
128
+ .ropeproject
129
+
130
+ # mkdocs documentation
131
+ /site
132
+ __pycache__/
133
+ *.pyc
134
+ *.pyo
135
+ *.pyd
136
+ .env
137
+ # mypy
138
+ .mypy_cache/
139
+ .dmypy.json
140
+ dmypy.json
141
+
142
+ # Pyre type checker
143
+ .pyre/
144
+
145
+ ### JetBrains template
146
+ # Covers JetBrains IDEs: IntelliJ, RubyMine, PhpStorm, AppCode, PyCharm, CLion, Android Studio and WebStorm
147
+ # Reference: https://intellij-support.jetbrains.com/hc/en-us/articles/206544839
148
+
149
+ # User-specific stuff
150
+ .idea/**/workspace.xml
151
+ .idea/**/tasks.xml
152
+ .idea/**/usage.statistics.xml
153
+ .idea/**/dictionaries
154
+ .idea/**/shelf
155
+
156
+ # Generated files
157
+ .idea/**/contentModel.xml
158
+
159
+ # Sensitive or high-churn files
160
+ .idea/**/dataSources/
161
+ .idea/**/dataSources.ids
162
+ .idea/**/dataSources.local.xml
163
+ .idea/**/sqlDataSources.xml
164
+ .idea/**/dynamic.xml
165
+ .idea/**/uiDesigner.xml
166
+ .idea/**/dbnavigator.xml
167
+
168
+ # Gradle
169
+ .idea/**/gradle.xml
170
+ .idea/**/libraries
171
+
172
+ # Gradle and Maven with auto-import
173
+ # When using Gradle or Maven with auto-import, you should exclude module files,
174
+ # since they will be recreated, and may cause churn. Uncomment if using
175
+ # auto-import.
176
+ # .idea/modules.xml
177
+ # .idea/*.iml
178
+ # .idea/modules
179
+ # *.iml
180
+ # *.ipr
181
+
182
+ # CMake
183
+ cmake-build-*/
184
+
185
+ #
186
+ Mongo Explorer plugin
187
+ .idea/**/mongoSettings.xml
188
+
189
+ # File-based project format
190
+ *.iws
191
+
192
+ # IntelliJ
193
+ out/
194
+
195
+ # mpeltonen/sbt-idea plugin
196
+ .idea_modules/
197
+
198
+ # JIRA plugin
199
+ atlassian-ide-plugin.xml
200
+
201
+ # Cursive Clojure plugin
202
+ .idea/replstate.xml
203
+
204
+ # Crashlytics plugin (for Android Studio and IntelliJ)
205
+ com_crashlytics_export_strings.xml
206
+ crashlytics.properties
207
+ crashlytics-build.properties
208
+ fabric.properties
209
+
210
+ # Editor-based Rest Client
211
+ .idea/httpRequests
212
+
213
+ # Android studio 3.1+ serialized cache file
214
+ .idea/caches/build_file_checksums.ser
215
+
216
+ ### VirtualEnv template
217
+ # Virtualenv
218
+ # http://iamzed.com/2009/05/07/a-primer-on-virtualenv/
219
+ .Python
220
+ [Bb]in
221
+ [Ii]nclude
222
+ [Ll]ib
223
+ [Ll]ib64
224
+ [Ll]ocal
225
+ [Ss]cripts
226
+ pyvenv.cfg
227
+ .venv
228
+ pip-selfcheck.json
229
+
230
+ files
231
+ Files
232
+ *.tmp
233
+ .vscode
234
+ my_virtual_environment
235
+ dist
236
+ crf_py_utils.egg-info
237
+ build
238
+ datas
239
+ tests/data
240
+ venv
241
+ create_docker_image.sh
242
+
243
+ anydonebert/data
244
+
245
+ results
246
+ train_test_split
247
+
248
+ anydonebert/models/sbert.net_models_paraphrase-distilroberta-base-v1
249
+ anydonebert/models/sbert.net_models_paraphrase-distilroberta-base-v2
250
+ resources/conll_files/*
251
+ resources/test_xml_files/*
252
+ resources/xml_files/*
253
+ config.ini
254
+ flowcess/commons/settings.py
255
+
.gradio/certificate.pem ADDED
@@ -0,0 +1,31 @@
1
+ -----BEGIN CERTIFICATE-----
2
+ MIIFazCCA1OgAwIBAgIRAIIQz7DSQONZRGPgu2OCiwAwDQYJKoZIhvcNAQELBQAw
3
+ TzELMAkGA1UEBhMCVVMxKTAnBgNVBAoTIEludGVybmV0IFNlY3VyaXR5IFJlc2Vh
4
+ cmNoIEdyb3VwMRUwEwYDVQQDEwxJU1JHIFJvb3QgWDEwHhcNMTUwNjA0MTEwNDM4
5
+ WhcNMzUwNjA0MTEwNDM4WjBPMQswCQYDVQQGEwJVUzEpMCcGA1UEChMgSW50ZXJu
6
+ ZXQgU2VjdXJpdHkgUmVzZWFyY2ggR3JvdXAxFTATBgNVBAMTDElTUkcgUm9vdCBY
7
+ MTCCAiIwDQYJKoZIhvcNAQEBBQADggIPADCCAgoCggIBAK3oJHP0FDfzm54rVygc
8
+ h77ct984kIxuPOZXoHj3dcKi/vVqbvYATyjb3miGbESTtrFj/RQSa78f0uoxmyF+
9
+ 0TM8ukj13Xnfs7j/EvEhmkvBioZxaUpmZmyPfjxwv60pIgbz5MDmgK7iS4+3mX6U
10
+ A5/TR5d8mUgjU+g4rk8Kb4Mu0UlXjIB0ttov0DiNewNwIRt18jA8+o+u3dpjq+sW
11
+ T8KOEUt+zwvo/7V3LvSye0rgTBIlDHCNAymg4VMk7BPZ7hm/ELNKjD+Jo2FR3qyH
12
+ B5T0Y3HsLuJvW5iB4YlcNHlsdu87kGJ55tukmi8mxdAQ4Q7e2RCOFvu396j3x+UC
13
+ B5iPNgiV5+I3lg02dZ77DnKxHZu8A/lJBdiB3QW0KtZB6awBdpUKD9jf1b0SHzUv
14
+ KBds0pjBqAlkd25HN7rOrFleaJ1/ctaJxQZBKT5ZPt0m9STJEadao0xAH0ahmbWn
15
+ OlFuhjuefXKnEgV4We0+UXgVCwOPjdAvBbI+e0ocS3MFEvzG6uBQE3xDk3SzynTn
16
+ jh8BCNAw1FtxNrQHusEwMFxIt4I7mKZ9YIqioymCzLq9gwQbooMDQaHWBfEbwrbw
17
+ qHyGO0aoSCqI3Haadr8faqU9GY/rOPNk3sgrDQoo//fb4hVC1CLQJ13hef4Y53CI
18
+ rU7m2Ys6xt0nUW7/vGT1M0NPAgMBAAGjQjBAMA4GA1UdDwEB/wQEAwIBBjAPBgNV
19
+ HRMBAf8EBTADAQH/MB0GA1UdDgQWBBR5tFnme7bl5AFzgAiIyBpY9umbbjANBgkq
20
+ hkiG9w0BAQsFAAOCAgEAVR9YqbyyqFDQDLHYGmkgJykIrGF1XIpu+ILlaS/V9lZL
21
+ ubhzEFnTIZd+50xx+7LSYK05qAvqFyFWhfFQDlnrzuBZ6brJFe+GnY+EgPbk6ZGQ
22
+ 3BebYhtF8GaV0nxvwuo77x/Py9auJ/GpsMiu/X1+mvoiBOv/2X/qkSsisRcOj/KK
23
+ NFtY2PwByVS5uCbMiogziUwthDyC3+6WVwW6LLv3xLfHTjuCvjHIInNzktHCgKQ5
24
+ ORAzI4JMPJ+GslWYHb4phowim57iaztXOoJwTdwJx4nLCgdNbOhdjsnvzqvHu7Ur
25
+ TkXWStAmzOVyyghqpZXjFaH3pO3JLF+l+/+sKAIuvtd7u+Nxe5AW0wdeRlN8NwdC
26
+ jNPElpzVmbUq4JUagEiuTDkHzsxHpFKVK7q4+63SM1N95R1NbdWhscdCb+ZAJzVc
27
+ oyi3B43njTOQ5yOf+1CceWxG1bQVs5ZufpsMljq4Ui0/1lvh+wjChP4kqKOJ2qxq
28
+ 4RgqsahDYVvTH9w7jXbyLeiNdd8XM2w9U/t7y0Ff/9yi0GE44Za4rF2LN9d11TPA
29
+ mRGunUHBcnWEvgJBQl9nJEiU0Zsnvgc/ubhPgXRR4Xq37Z0j4r7g1SgEEzwxA57d
30
+ emyPxgcYxn/eR44/KJ4EBs+lVDR3veyJm+kXQ99b21/+jh5Xos1AnX5iItreGCc=
31
+ -----END CERTIFICATE-----
Data/__init__.py ADDED
File without changes
Data/get_video_link.py ADDED
@@ -0,0 +1,152 @@
1
+ import os
2
+ import requests
3
+ from dotenv import load_dotenv
4
+ from Data.new_video_added import get_new_video_url
5
+ from datetime import datetime
6
+ import json
7
+ from pathlib import Path
8
+ load_dotenv()
9
+
10
+ api_key = os.getenv('API_KEY')
11
+ CURRENT_DIR = os.path.dirname(os.path.abspath(__file__))
12
+ BASE_URL = "https://www.googleapis.com/youtube/v3"
13
+ BASE_DIR = os.path.dirname(os.path.abspath(__file__))
14
+ channel = "https://www.youtube.com/@hubermanlab/videos"
15
+ new_video_added = False
16
+ # video_links_folder_name = os.path.join(BASE_DIR, "videolinks")
17
+ PROJECT_ROOT = Path(__file__).resolve().parent.parent
18
+ # print("THIS IS BASE DIR:", BASE_DIR)
19
+ # print("THIS is current dir:", CURRENT_DIR)
20
+ # video_links_folder_name = os.path.join(CURRENT_DIR, "videolinks")
21
+ video_links_folder_name = os.path.join(PROJECT_ROOT, "Data", "video_links")
22
+
23
+ def ensure_directories():
24
+ if not os.path.exists(video_links_folder_name):
25
+ os.makedirs(video_links_folder_name)
26
+ print(f"Directory {video_links_folder_name} created")
27
+
28
+
29
+ def get_chanel_id(chanel_name):
30
+ url = f"{BASE_URL}/search"
31
+ params = {
32
+ "part": "snippet",
33
+ "q": chanel_name,
34
+ "type": "channel",
35
+ "key": api_key
36
+ }
37
+ response = requests.get(url, params=params)
38
+ response_data = response.json()
39
+ if "items" in response_data and len(response_data["items"]) > 0:
40
+ return response_data["items"][0]["snippet"]["channelId"]
41
+ else:
42
+ return None
43
+
44
+
45
+ def get_video_links(channel_id):
46
+ url = f"{BASE_URL}/search"
47
+ video_links = []
48
+ next_page_token = None
49
+
50
+ while True:
51
+ params = {
52
+ "part": "snippet",
53
+ "channelId": channel_id,
54
+ "maxResults": 50,
55
+ "type": "video",
56
+ "key": api_key,
57
+ }
58
+ if next_page_token:
59
+ params["pageToken"] = next_page_token
60
+
61
+ response = requests.get(url, params=params)
62
+ response_data = response.json()
63
+
64
+ if "items" not in response_data:
65
+ break
66
+
67
+ for item in response_data["items"]:
68
+ video_id = item["id"]["videoId"]
69
+ video_links.append(f"https://www.youtube.com/watch?v={video_id}")
70
+
71
+ next_page_token = response_data.get("nextPageToken")
72
+ if not next_page_token:
73
+ break
74
+
75
+ return video_links
76
+
77
+
78
+ def save_video_links(video_links):
79
+ if not os.path.exists(video_links_folder_name):
80
+ os.makedirs(video_links_folder_name)
81
+ timestamp = datetime.now().strftime("%Y%m%d%H%M%S")
82
+ filename = f"video_links_{timestamp}.json"
83
+ filepath = os.path.join(video_links_folder_name, filename)
84
+ with open(filepath, 'w') as file:
85
+ json.dump(video_links, file)
86
+ print(f"{len(video_links)} video links saved successfully to {filename}")
87
+
88
+
89
+ def load_video_links():
90
+ """
91
+ Load the most recent video links file based on timestamp in the filename.
92
+ """
93
+ # List all files in the current directory
94
+ if not os.path.exists(video_links_folder_name):
95
+ print(f"{video_links_folder_name} does not exist")
96
+ files = [f for f in os.listdir(video_links_folder_name) if f.startswith("video_links_") and f.endswith(".json")]
97
+
98
+ if not files:
99
+ print("No video links file found.")
100
+ return []
101
+
102
+ # Sort files by the timestamp in their names (descending)
103
+ files.sort(key=lambda x: datetime.strptime(x[len("video_links_"):-len(".json")], "%Y%m%d%H%M%S"), reverse=True)
104
+
105
+ # Load the most recent file
106
+ latest_file = files[0]
107
+ filepath = os.path.join(video_links_folder_name, latest_file)
108
+ try:
109
+ with open(filepath, 'r') as file:
110
+ video_links = json.load(file)
111
+ print(f"{len(video_links)} video links loaded successfully from {latest_file}.")
112
+ return video_links
113
+ except Exception as e:
114
+ print(f"Error loading {latest_file}: {e}")
115
+ return []
116
+
117
+
118
+ def video_links_main():
119
+ ensure_directories()
120
+ video_links = load_video_links()
121
+ if video_links:
122
+ print(f"Using {len(video_links)} saved video links")
123
+ else:
124
+ channel_name = input("Enter the YouTube channel name: ")
125
+ channel_id = get_chanel_id(channel_name)
126
+
127
+ if channel_id:
128
+ print(f"Fetching videos for channel: {channel_name} (ID: {channel_id})")
129
+ video_links = get_video_links(channel_id)
130
+ save_video_links(video_links)
131
+ else:
132
+ print("Failed to fetch video links")
133
+ # for link in video_links:
134
+ # # print(link)
135
+ new_video_url = get_new_video_url(channel)
136
+ # new_video_url = new_video_url[:3]
137
+ new_videos = [url for url in new_video_url if url not in video_links]
138
+
139
+ if new_videos:
140
+ print(f"{len(new_videos)} new videos found")
141
+ video_links.extend(new_videos)
142
+ save_video_links(video_links)
143
+ new_video_added = True
144
+ else:
145
+ print("No new videos found")
146
+ new_video_added = False
147
+ # print(new_video_added)
148
+ return video_links, new_video_added, new_videos
149
+
150
+
151
+ if __name__ == "__main__":
152
+ video_links_main()
Data/new_video_added.py ADDED
@@ -0,0 +1,22 @@
1
+ import requests
2
+ import re
3
+
4
+
5
+
6
+ def get_new_video_url(channel):
7
+ """
8
+ Fetch all video URLs from the given YouTube channel page.
9
+ """
10
+ try:
11
+ html = requests.get(channel).text
12
+ # Extract all video IDs from the HTML
13
+ video_ids = re.findall(r'(?<="videoId":").*?(?=")', html)
14
+ video_urls = [f"https://www.youtube.com/watch?v={video_id}" for video_id in video_ids]
15
+
16
+ # Remove duplicates while preserving order
17
+ video_urls = list(dict.fromkeys(video_urls))
18
+ print(f"Fetched {len(video_urls)} video URLs from the channel.")
19
+ return video_urls
20
+ except Exception as e:
21
+ print(f"Error fetching video URLs: {e}")
22
+ return []
Data/yt_transcript.py ADDED
@@ -0,0 +1,94 @@
1
+ from youtube_transcript_api import YouTubeTranscriptApi
2
+ from Data.get_video_link import video_links_main
3
+ from pathlib import Path
4
+ from datetime import datetime
5
+
6
+ # Dynamically get the root directory of the project
7
+ PROJECT_ROOT = Path(__file__).resolve().parent.parent # Moves up from /Data/
8
+ TRANSCRIPTS_FOLDER = PROJECT_ROOT / "Data" / "transcripts"
9
+
10
+ def save_transcript(video_id, transcript_text):
11
+ """
12
+ Saves transcripts to the local folder
13
+ """
14
+ # Ensure the transcripts folder exists
15
+ TRANSCRIPTS_FOLDER.mkdir(parents=True, exist_ok=True)
16
+
17
+ timestamp = datetime.now().strftime("%Y%m%d%H%M%S")
18
+ filename = f"{video_id}_{timestamp}.txt"
19
+ file_path = TRANSCRIPTS_FOLDER / filename
20
+
21
+ file_path.write_text('\n'.join(transcript_text), encoding="utf-8")
22
+ return file_path
23
+
24
+
25
+ def get_video_id(video_links_list):
26
+ return [link.replace("https://www.youtube.com/watch?v=", "") for link in video_links_list]
27
+
28
+
29
+ def fetch_yt_transcript(video_ids):
30
+ """
31
+ Fetches YouTube transcripts using video IDs.
32
+ """
33
+ video_transcripts = {}
34
+
35
+ for video_id in video_ids:
36
+ print(f"Fetching transcript for: {video_id}")
37
+ try:
38
+ output = YouTubeTranscriptApi.get_transcript(video_id)
39
+ transcript_text = [item['text'] for item in output]
40
+
41
+ # Save transcript and get file path
42
+ file_path = save_transcript(video_id, transcript_text)
43
+ video_transcripts[video_id] = {
44
+ 'text': transcript_text,
45
+ 'file_path': str(file_path)
46
+ }
47
+ print(f"Transcript saved to: {file_path}")
48
+
49
+ except Exception as e:
50
+ print(f"Transcript not found for video {video_id}: {e}")
51
+ video_transcripts[video_id] = {
52
+ 'text': [],
53
+ 'file_path': None
54
+ }
55
+
56
+ return video_transcripts
57
+
58
+
59
+ def all_video_transcript_pipeline():
60
+ """
61
+ Handles fetching and storing transcripts, checking for new videos.
62
+ """
63
+ print(f"Looking for transcripts in: {TRANSCRIPTS_FOLDER}")
64
+ video_links_list, new_video_added, new_videos_link = video_links_main()
65
+ video_transcripts = {}
66
+
67
+ # Always load existing transcripts
68
+ if TRANSCRIPTS_FOLDER.exists():
69
+ existing_files = list(TRANSCRIPTS_FOLDER.glob("*.txt"))
70
+ print(f"Found {len(existing_files)} transcript files.")
71
+
72
+ for file in existing_files:
73
+ video_id = file.stem.split("_")[0] # Extract video ID
74
+ try:
75
+ transcript_text = file.read_text(encoding="utf-8").splitlines()
76
+ video_transcripts[video_id] = {
77
+ 'text': transcript_text,
78
+ 'file_path': str(file)
79
+ }
80
+ print(f"Loaded transcript for video: {video_id}")
81
+ except Exception as e:
82
+ print(f"Error loading transcript {file.name}: {e}")
83
+ else:
84
+ print(f"Transcripts folder not found at: {TRANSCRIPTS_FOLDER}, creating it.")
85
+ TRANSCRIPTS_FOLDER.mkdir(parents=True, exist_ok=True)
86
+
87
+ # Fetch new transcripts if needed
88
+ if new_video_added and new_videos_link:
89
+ print("New videos detected... Fetching transcripts.")
90
+ new_video_ids = [url.split("v=")[1] for url in new_videos_link] # Extract video IDs
91
+ new_transcripts = fetch_yt_transcript(new_video_ids)
+ video_transcripts.update(new_transcripts)  # merge newly fetched transcripts into the loaded set
92
+
93
+ print(f"Total transcripts loaded: {len(video_transcripts)}")
94
+
Dockerfile ADDED
@@ -0,0 +1,49 @@
1
+ # Declare build arguments at the top (for initial stage)
2
+ ARG USER_UID=1000
3
+ ARG USER_GID=1000
4
+
5
+ # Stage 1: Build dependencies
6
+ FROM python:3.11-slim AS builder
7
+ WORKDIR /app
8
+ RUN apt-get update && \
9
+ apt-get install -y --no-install-recommends \
10
+ build-essential \
11
+ git && \
12
+ rm -rf /var/lib/apt/lists/*
13
+ RUN python -m venv /opt/venv
14
+ ENV PATH="/opt/venv/bin:$PATH"
15
+ COPY requirements.txt .
16
+ RUN pip install --no-cache-dir -r requirements.txt
17
+
18
+ # Stage 2: Final image
19
+ FROM python:3.11-slim
20
+
21
+ # Re-declare build arguments for this stage
22
+ ARG USER_UID=1000
23
+ ARG USER_GID=1000
24
+
25
+ COPY --from=builder /opt/venv /opt/venv
26
+ ENV PATH="/opt/venv/bin:$PATH"
27
+ WORKDIR /app
28
+ RUN apt-get update && \
29
+ apt-get install -y --no-install-recommends \
30
+ libgomp1 && \
31
+ rm -rf /var/lib/apt/lists/*
32
+
33
+ COPY . .
34
+
35
+ # Create the group and user first
36
+ RUN groupadd -g ${USER_GID} appuser && \
37
+ useradd -m -u ${USER_UID} -g appuser appuser
38
+
39
+ # Create directories and set permissions
40
+ RUN mkdir -p /app/Rag/chromadb.db && \
41
+ mkdir -p /app/Data && \
42
+ chown -R appuser:appuser /app
43
+
44
+ USER appuser
45
+
46
+ # Make sure your Python code uses this path for ChromaDB
47
+ ENV CHROMA_PERSISTENCE_DIRECTORY=/app/Rag/chromadb.db
48
+
49
+ CMD ["python", "-m", "ui.app"]
Example/__init__.py ADDED
File without changes
Example/rag_example.py ADDED
@@ -0,0 +1,18 @@
1
+ import sys
2
+ import chromadb
3
+ from pathlib import Path
4
+ PROJECT_ROOT = Path(__file__).resolve().parent.parent
5
+ # transcripts_folder_path = '/home/nightwing/Codes/Xyzbot/Data/transcripts'
6
+ # transcripts_folder_path = 'Data/transcripts'
7
+ transcripts_folder_path = PROJECT_ROOT / "Data" / "transcripts"
8
+ chromadb_path = PROJECT_ROOT / "Rag" / "chromadb.db"
9
+ client = chromadb.PersistentClient(path=str(chromadb_path))
10
+ collection = client.get_or_create_collection(name="yt_transcript_collection")
11
+ sys.path.append(str(PROJECT_ROOT))
12
+ sys.path.append(str(PROJECT_ROOT / "Rag"))
13
+ # print("Python path:", sys.path)
14
+ from Rag.rag_pipeline import main_workflow
15
+
16
+ # Run the application
17
+ if __name__ == "__main__":
18
+ main_workflow(transcripts_folder_path, collection)
Llm/__init__.py ADDED
File without changes
Llm/llm_endpoints.py ADDED
@@ -0,0 +1,14 @@
1
+ from dotenv import load_dotenv
2
+ import os
3
+ import google.generativeai as genai
4
+
5
+
6
+ # Configure the Generative AI model with the API key from the environment
7
+ load_dotenv()
8
+ genai.configure(api_key=os.getenv("GEMINI_API_KEY"))
9
+ gemini_model = genai.GenerativeModel("models/gemini-1.5-flash")
10
+
11
+ # Function to get a response from the generative model
12
+ def get_llm_response(prompt: str) -> str:
13
+ response = gemini_model.generate_content(prompt)
14
+ return response.text
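+
+ # Illustrative usage (assumes GEMINI_API_KEY is set in the environment or a .env file):
+ # if __name__ == "__main__":
+ #     print(get_llm_response("Give one practical, research-backed tip for better sleep."))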
Prompts/__init__.py ADDED
File without changes
Prompts/huberman_prompt.py ADDED
@@ -0,0 +1,20 @@
1
+ huberman_prompt = """
2
+ You are Dr. Andrew Huberman, an expert neuroscientist and educator known for your clear, engaging, and scientifically accurate explanations. When answering, please consider the following:
3
+ 1. Provide a clear and concise summary of the scientific concepts involved.
4
+ 2. Highlight any relevant research or studies.
5
+ 3. Offer actionable insights or practical advice.
6
+
7
+ Context:
8
+ {context}
9
+
10
+ Sources:
11
+ {sources}
12
+
13
+ Conversation History:
14
+ {history}
15
+
16
+ Question:
17
+ {question}
18
+
19
+ Please respond in a manner that is informative, research-backed, and reflective of your unique style.
20
+ """
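+
+ # Note: Rag/rag_pipeline.py fills these placeholders via str.format, i.e.
+ # huberman_prompt.format(context=context, sources=sources_str, history=history_str, question=query_text)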
Prompts/summary_prompt.py ADDED
@@ -0,0 +1,10 @@
1
+ summary_prompts = """
2
+ #System
3
+ You are an AI agent whose job is to summarize the conversation between the AI bot and the user.
+ Here is the conversation history:
5
+ {{}}
6
+
7
+ #Output format
8
+
9
+
10
+ """
README.md CHANGED
@@ -12,3 +12,102 @@ short_description: a bot
12
  ---
13
 
14
 Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
15
+ # Andrew Huberman RAG-Based AI Chatbot
16
+
17
+ ## Overview
18
+ Xyzbot is an AI chatbot that extracts and synthesizes insights from Andrew Huberman's YouTube videos. It automatically retrieves video transcripts, updates its knowledge base in ChromaDB, and provides citation-linked responses.
19
+
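+ At query time the pipeline embeds the question, retrieves the closest transcript chunks from ChromaDB, and asks Gemini to answer with links to the matching episodes. A minimal sketch of that flow (illustrative, not a public API; assumes the ChromaDB store has already been populated, `GEMINI_API_KEY` is set, and the code is run from the project root; see `Rag/rag_pipeline.py` and `Example/rag_example.py`):
+
+ ```python
+ import chromadb
+ from Rag.rag_pipeline import query_database, generate_response
+ from utils.get_link import get_source_link
+
+ client = chromadb.PersistentClient(path="Rag/chromadb.db")
+ collection = client.get_or_create_collection(name="yt_transcript_collection")
+
+ question = "What does Dr. Huberman recommend for better sleep?"
+ docs, metadatas = query_database(collection, question)  # top transcript chunks
+ links = get_source_link(metadatas)                       # matching episode URLs
+ print(generate_response([], question, docs, links))      # [] = empty conversation history
+ ```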
20
+ ## 🚀 Key Features
21
+ - Mimics Andrew Huberman's insights using YouTube video transcripts
22
+ - Automatic transcript retrieval and knowledge base updates
23
+ - RAG-powered response generation with direct video citations
24
+ - Interactive Gradio chat interface
25
+ - Docker-based deployment for easy scalability
26
+
27
+ ## 🛠 Tech Stack
28
+ - Backend: Python, LangChain, Google Gemini API
29
+ - Frontend: Gradio
30
+ - Database: ChromaDB
31
+ - Deployment: Docker
32
+
33
+ ## 📂 Project Structure
34
+ ```
35
+ 📦 Xyzbot
36
+ ├── 📂 Data
37
+ ├── 📂 Example
38
+ ├── 📂 Llm
39
+ ├── 📂 Notebook
40
+ ├── 📂 Prompts
41
+ ├── 📂 Rag
42
+ │ ├── chromadb.db
43
+ │ └── 📂 Processed_folder
44
+ ├── 📂 utils
45
+ ├── Dockerfile
46
+ └── pyproject.toml
47
+ ```
48
+
49
+ ## 🔧 Prerequisites
50
+ - Python 3.11 (see `pyproject.toml` and the Dockerfile)
51
+ - Docker (optional)
52
+
53
+ ## 🔑 API Keys Required
54
+ 1. Google Gemini API Key
55
+ 2. YouTube API Key
56
+
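+ A minimal `.env` example (illustrative values; these are the variable names read by `Data/get_video_link.py`, `Llm/llm_endpoints.py`, `Rag/rag_pipeline.py`, and the Docker run commands below):
+
+ ```bash
+ # YouTube Data API key, used to fetch the channel's video links
+ API_KEY=your_youtube_api_key
+ # Gemini key, used for response generation
+ GEMINI_API_KEY=your_gemini_api_key
+ # Same Google/Gemini key, read by the RAG pipeline and passed to Docker
+ GOOGLE_API_KEY=your_gemini_api_key
+ ```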
57
+ ## 🚀 Installation
58
+
59
+ ### Local Setup
60
+ 1. Clone the repository
61
+ ```bash
62
+ git clone https://github.com/Angel-dash/Xyzbot.git
63
+ cd Xyzbot
64
+ ```
65
+
66
+ 2. Create virtual environment
67
+ ```bash
68
+ python3 -m venv venv
69
+ source venv/bin/activate
70
+ pip install -r requirements.txt
71
+ ```
72
+
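+ 3. (Optional) Run the bundled setup script, which installs the requirements plus the spaCy model and Coreferee data used by `utils/corefrence.py`
+ ```bash
+ bash setup.sh
+ ```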
73
+ ### Docker Setup
74
+
75
+ #### Option 1: Build Locally
76
+ ```bash
77
+ docker build -t xyzbot:v1.0 .
78
+ docker run -it \
79
+ -v $(pwd)/Rag:/app/Rag:rw \
80
+ -e GOOGLE_API_KEY=your_api_key \
81
+ xyzbot:v1.0
82
+ ```
83
+
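+ The Dockerfile also declares optional `USER_UID`/`USER_GID` build arguments (default 1000), so the container user can be matched to your host user when mounting `Rag/` read-write, for example:
+
+ ```bash
+ docker build --build-arg USER_UID=$(id -u) --build-arg USER_GID=$(id -g) -t xyzbot:v1.0 .
+ ```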
84
+ #### Option 2: Pull from Docker Hub
85
+ ```bash
86
+ docker pull angeldash/xyzbot:v1.0
87
+ docker run -it \
88
+ -v $(pwd)/Rag:/app/Rag:rw \
89
+ -e GOOGLE_API_KEY=your_api_key \
90
+ angeldash/xyzbot:v1.0
91
+ ```
92
+
93
+ ## 🖥️ Running the Application
94
+ ```bash
95
+ python -m ui.app
96
+ ```
97
+
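+ To query the pipeline from a terminal instead of the web UI, run the bundled example script, which starts the interactive console loop from `Rag/rag_pipeline.py`:
+ ```bash
+ python Example/rag_example.py
+ ```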
98
+ ## 📈 Future Roadmap
99
+ - Fine-tuned LLM response generation
100
+ - Real-time multi-channel monitoring
101
+ - Enhanced citation formatting
102
+ - AI agent conversation handling
103
+ - Performance optimization
104
+
105
+ ## 📜 License
106
+ MIT License
107
+
108
+ ## 🤝 Contributing
109
+ Contributions are welcome! Open an issue or submit a pull request.
110
+
111
+ ---
112
+ **Author:** Angel Dash | **GitHub:** [@Angel-dash](https://github.com/Angel-dash)
113
+
Rag/Processed_folder/processed_files.json ADDED
@@ -0,0 +1 @@
1
+ ["VOfwbcveP84_20241225194621.txt", "In9Bq4EJMZw_20241225194705.txt", "DkS1pkKpILY_20241225194325.txt", "ajneRM-ET1Q_20241225194311.txt", "K4Ze-Sp6aUE_20241225194709.txt", "n28W4AmvMDE_20241225194626.txt", "UIy-WQCZd4M_20241225194819.txt", "etbfLTHD_VU_20241225194439.txt", "PVmQOLYckKQ_20241225194814.txt", "F9KrZd_-ge0_20241225194812.txt", "xjEFo3a1AnI_20241225194539.txt", "szqPAPKE5tQ_20241225194712.txt", "3_auLYOilb8_20241225194826.txt", "acgz0C-z-gc_20241225194817.txt", "zVCaYyUWWSw_20241225194412.txt", "doupx8SAs5Y_20241225194603.txt", "wAZn9dF3XTo_20241225194423.txt", "2XGREPnlI8U_20241225194659.txt", "UNCwdFxPtE8_20241225194521.txt", "at37Y8rKDlA_20241225194513.txt", "oL3SkPV1_Ik_20241225194837.txt", "nOgypsWKjm4_20241225194440.txt", "rW9QKc-iFoY_20241225194751.txt", "CQlTmOFM4Qs_20241225194550.txt", "tR73Ny4Dt9s_20241225194413.txt", "t1F7EEGPQwo_20241225194649.txt", "ccrbE0QHy94_20241225194608.txt", "SyWC8ZFVxGo_20241225194333.txt", "zlc4VrDx_qk_20241225194800.txt", "8qaBpM73NSk_20241225194409.txt", "sxgCC4H1dl8_20241225194524.txt", "RBK5KLA5Jjg_20241225194446.txt", "slUCmZJDXrk_20241225194627.txt", "h2aWYjSA1Jc_20241225194702.txt", "Ov4yyK15-K8_20241225194230.txt", "juD99_sPWGU_20241225194340.txt", "q1Ss8sTbFBY_20241225194647.txt", "X8Hw8zeCDTA_20241225194518.txt", "UChhXiFPRgg_20241225194443.txt", "pq6WHJzOkno_20241225194415.txt", "2Ds1m5gflCI_20241225194849.txt", "jGZ1mR9uLU0_20241225194808.txt", "VAEzZeaV5zM_20241225194347.txt", "EhlIkzJwPlk_20241225194656.txt", "HiyzzcuaAac_20241225194255.txt", "C3X0bUAiluE_20241225194259.txt", "kG5Qb9sr0YQ_20241225194810.txt", "wRsX_ZkzxvQ_20241225194619.txt", "U2BPitASUh0_20241225194358.txt", "Wcs2PFz5q6g_20241225194327.txt", "CuzL1qxUyHw_20241225194312.txt", "q37ARYnRDGc_20241225194623.txt", "cp9GXl9Qk_s_20241225194735.txt", "XT_6Lvkhxvo_20241225194342.txt", "bUr_9fgfnto_20241225194256.txt", "LTGGyQS1fZE_20241225194305.txt", "mAlt_HKX4as_20241225194420.txt", "SZSRgyl7pyQ_20241225194418.txt", "RI112zW8GDw_20241225194356.txt", "ycOBZZeVeAc_20241225194707.txt", "6YLdlK2hYnw_20241225194328.txt", "p4ZfkezDTXQ_20241225194615.txt", "LVxL_p_kToc_20241225194558.txt", "HXzTbCEqCJc_20241225194710.txt", "yOoVz9E9kfQ_20241225194901.txt", "C5KpIXjpzdY_20241225194400.txt", "__RAXBLt1iM_20241225194430.txt", "8N7mdkrXgbc_20241225194338.txt", "JnlSDaBjCCU_20241225194450.txt", "IOl28gj_RXw_20241225194431.txt", "Nr5xb-QCBGA_20241225194354.txt", "GzvzWO0NU50_20241225194605.txt", "DtmwtjOoSYU_20241225194633.txt", "CrtR12PBKb0_20241225194632.txt", "gMRph_BvHB4_20241225194516.txt", "QpoaNklmRPc_20241225194248.txt", "9tRohh0gErM_20241225194353.txt", "Xu1FMCxoEFc_20241225194346.txt", "15R2pMqU2ok_20241225194406.txt", "eIxVfln02Ss_20241225194335.txt", "0Dtt95_xabw_20241225194252.txt", "3ZGItIAUQmI_20241225194719.txt", "uxZFl4BDOGk_20241225194757.txt", "hvPGfcAgk9Y_20241225194754.txt", "HYVeP4F0GNU_20241225194559.txt", "z5W74QC3v2I_20241225194308.txt", "31wjVhCcI5Y_20241225194426.txt", "BMTt8gSl13s_20241225194836.txt", "aQDOU3hPci0_20241225194501.txt", "tkH2-_jMCSk_20241225194543.txt", "ntfcfJ28eiU_20241225194522.txt", "S8nPJU9xkNw_20241225194748.txt", "fcxjwA4C4Cw_20241225194553.txt", "iMvtHqLmEkI_20241225194855.txt", "099hgtRoUZw_20241225194436.txt", "4RFEkGKKhdE_20241225194907.txt", "eJU6Df_ffAE_20241225194635.txt", "nqNEtdHVUjM_20241225194437.txt", "1SXDXdngX2M_20241225194316.txt", "X4QE6t-MkYE_20241225194642.txt", "79p1X_7rAMo_20241225194630.txt", "6RZbGrq9BxE_20241225194306.txt", "pkJi9Raxikg_20241225194824.txt", "QbMxDZeB8Ks_20241225194247.txt", 
"RgAcOqVRfYA_20241225194657.txt", "ncSoor2Iw8k_20241225194833.txt", "i_DEPeCKxs8_20241225194235.txt", "FE0lTEUa7EY_20241225194753.txt", "gE0_8AjTFaM_20241225194852.txt", "kgr22uMsJ5o_20241225194317.txt", "ufsIA5NARIo_20241225194535.txt", "CyDLbrZK75U_20241225194434.txt", "7TkGDj4LaOU_20241225194244.txt", "XLr2RKoD-oY_20241225194738.txt", "yb5zpo5WDG4_20241225194645.txt", "a9yFKPmPZ90_20241225194556.txt", "TG8VM5-CTfw_20241225194636.txt", "eMqWH3LYiII_20241225194351.txt", "CVh3_8e5u8I_20241225194246.txt", "SuR0DaYoe0Y_20241225194302.txt", "FLxIoNguGRU_20241225194233.txt", "GA89kjVY6Ik_20241225194854.txt", "qJ3uV7coZbA_20241225194453.txt", "EQ3GjpGq5Y8_20241225194405.txt", "yOJvm_ri_hk_20241225194555.txt", "cwakOgHIT0E_20241225194421.txt", "DTCmprPCDqc_20241225194733.txt", "qPKd99Pa2iU_20241225194500.txt", "nm1TxQj9IsQ_20241225194611.txt", "LRM5LutB538_20241225194857.txt", "xTtM2AvCRyA_20241225194643.txt", "62lVH-6xYGY_20241225194250.txt", "Rxmv7rT9leo_20241225194417.txt", "ulHrUVV3Kq4_20241225194452.txt", "bGixnNGvSkg_20241225194231.txt", "1CxJVdeyltw_20241225194614.txt", "wgUjIRtote8_20241225194726.txt", "qPKd99Pa2iU_20241225194503.txt", "S_SrHS8FvMM_20241225194807.txt", "xX6hiEmDmxs_20241225194227.txt", "uXs-zPc63kM_20241225194449.txt", "4AwyVTHEU3s_20241225194904.txt", "xaE9XyMMAHY_20241225194848.txt", "hFL6qRIJZ_Y_20241225194428.txt", "FOi5s3OUogo_20241225194245.txt", "cS7cNaBrkxo_20241225194624.txt", "kpTJqwIfHcM_20241225194654.txt", "yixIc1Ai6jM_20241225194829.txt", "vfRtLI6cJrk_20241225194324.txt", "GLgKkG44MGo_20241225194729.txt", "KPlJcD-o-4Q_20241225194617.txt", "AtChcxeaukQ_20241225194646.txt", "tLS6t3FVOTI_20241225194714.txt", "GqPGXG5TlZw_20241225194541.txt", "UF0nqolsNZc_20241225194727.txt", "7R3-3HR6-u4_20241225194519.txt", "tLRCS48Ens4_20241225194447.txt", "V0Sdgn0_kFM_20241225194740.txt", "G1VUSu6sGoU_20241225194251.txt", "m_OazsImOiI_20241225194322.txt", "Og56hmAspV8_20241225194258.txt", "dFR_wFN23ZY_20241225194640.txt", "q-H_A_dQUxQ_20241225194303.txt", "KVjfFN89qvQ_20241225194314.txt", "zU5EYw06wtw_20241225194349.txt", "Z7MU6zrAXsM_20241225194442.txt", "LYYyQcAJZfk_20241225194508.txt", "E7W4OQfJWdw_20241225194717.txt", "azb3Ih68awQ_20241225194505.txt", "ouCWNRvPk20_20241225194401.txt", "uwWOc_RqTBA_20241225194858.txt", "pZX8ikmWvEU_20241225194510.txt", "n9IxomBusuw_20241225194545.txt", "BwyZIWeBpRw_20241225194534.txt", "XY0rBdaDXD8_20241225194226.txt", "1Wo6SqLNmLk_20241225194845.txt", "ddq8JIMhz7c_20241225194529.txt", "VQLU7gpk_X8_20241225194821.txt", "jC8Pu9HBd48_20241225194321.txt", "rZkMpVLcVsg_20241225194319.txt", "gbQFSMayJxk_20241225194736.txt", "F54qXuTpgfM_20241225194843.txt", "p3JLaF_4Tz8_20241225194537.txt", "FeRgqJVALMQ_20241225194433.txt", "hF32FvBH4gI_20241225194332.txt", "CDUetQMKM6g_20241225194454.txt", "wG3UFHR1o48_20241225194229.txt", "6P8hrzjnetU_20241225194336.txt", "WFcYF_pxLgA_20241225194458.txt", "77CdVSpnUX4_20241225194746.txt", "VOfwbcveP84_20241225194742.txt", "VRvn3Oj5r3E_20241225194839.txt", "Gf-kC30SLtc_20241225194846.txt", "S8jWFcDGz4Y_20241225194805.txt", "x3MgDtZovks_20241225194526.txt", "lIo9FcrljDk_20241225194309.txt", "-e9ErUozQo4_20241225194903.txt", "aXvDEmo6uS4_20241225194629.txt", "3gtvNYa3Nd8_20241225194531.txt", "5tYR7e5Wpyc_20241225194238.txt", "OadokY8fcAA_20241225194601.txt", "O640yAgq5f8_20241225194744.txt", "zbpb1wd-wvs_20241225194827.txt", "gXvuJu1kt48_20241225194638.txt", "zEYE-vcVKy8_20241225194547.txt", "Ky-ZJ9SS-x4_20241225194240.txt", "0RYyQRQFgFk_20241225194532.txt", "4F_RBc1akC8_20241225194724.txt", 
"nDLb8_wgX50_20241225194540.txt", "tcueMCe-0zo_20241225194236.txt", "K-TW2Chpz4k_20241225194330.txt", "XcvhERcZpWw_20241225194731.txt", "Ze2pc6NwsHQ_20241225194704.txt", "_ltcLEM-5HU_20241225194612.txt", "jouFvyRZntk_20241225194507.txt", "uWV9a3zEaL4_20241225194823.txt", "-OBCwiPPfEU_20241225194747.txt", "dzOvi0Aa2EA_20241225194301.txt", "K9lORz2_XSU_20241225194527.txt", "j2sMqSDLd4k_20241225194407.txt", "oNkDA2F7CjM_20241225194651.txt", "50BZQRT1dAg_20241225194403.txt", "q8CHXefn7B4_20241225194411.txt", "Jy4rJcYmtUM_20241225194344.txt", "QmOF0crdyRU_20241225194456.txt", "6ZrlsVx85ek_20241225194758.txt", "CD0bRU1e1ZM_20241225194425.txt", "IAnhFUUCq6c_20241225194804.txt", "Phm-Alz1Zjo_20241225194906.txt", "csubiPlvFWk_20241225194606.txt", "GpgqXCkRO-w_20241225194701.txt", "W5zqC5cYcS0_20241225194241.txt", "T65RDBiB5Hs_20241225194715.txt", "6I5I56uVvLw_20241225194801.txt", "i5611OvTFGM_20241225194548.txt", "wTBSGgbIvsY_20241225194552.txt", "O1YRwWmue4Y_20241225194815.txt", "29n0WG317tM_20241225194511.txt", "xmhsWAqP_0Y_20241225194851.txt", "x4m_PdFbu-s_20241225194722.txt"]
Rag/__init__.py ADDED
File without changes
Rag/rag_pipeline.py ADDED
@@ -0,0 +1,183 @@
1
+ import chromadb
2
+ from langchain.text_splitter import RecursiveCharacterTextSplitter
3
+ from sentence_transformers import SentenceTransformer
4
+ import google.generativeai as genai
5
+ import os
6
+ import logging
7
+ from concurrent.futures import ProcessPoolExecutor, as_completed
8
+ from Llm.llm_endpoints import get_llm_response
9
+ from utils.get_link import get_source_link
10
+ from Prompts.huberman_prompt import huberman_prompt
11
+ from tqdm import tqdm
12
+ # Configuration
13
+ API_KEY = os.getenv("GOOGLE_API_KEY")
14
+ if API_KEY:
15
+ genai.configure(api_key=API_KEY)
16
+
17
+ chromadb_path = "app/Rag/chromadb.db"
18
+ embedding_model = SentenceTransformer('all-MiniLM-L6-v2')
19
+
20
+ # Logging
21
+ logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(message)s')
22
+
23
+
24
+ # Helper Functions
25
+ def split_text_to_chunks(docs, chunk_size=1000, chunk_overlap=200):
26
+ """Split text into manageable chunks."""
27
+ text_splitter = RecursiveCharacterTextSplitter(chunk_size=chunk_size, chunk_overlap=chunk_overlap)
28
+ chunks = text_splitter.split_text(docs)
29
+ return chunks
30
+
31
+
32
+ def get_new_files(transcripts_folder_path, collection):
33
+ """Find new transcript files that haven't been processed yet."""
34
+ all_files = [f for f in os.listdir(transcripts_folder_path) if f.endswith(".txt")]
35
+ existing_files = [meta["source"] for meta in collection.get()['metadatas']]
36
+ return [f for f in all_files if f not in existing_files]
37
+
38
+
39
+ def process_single_file(file_path):
40
+ """Process a single file and return its chunks."""
41
+ with open(file_path, 'r') as f:
42
+ content = f.read()
43
+ chunks = split_text_to_chunks(content)
44
+ return chunks, os.path.basename(file_path)
45
+
46
+
47
+ def batch_embed_chunks(chunks, batch_size=32):
48
+ """Embed chunks in batches."""
49
+ embeddings = []
50
+ for i in tqdm(range(0, len(chunks), batch_size), desc="Embedding chunks"):
51
+ batch = chunks[i:i + batch_size]
52
+ batch_embeddings = embedding_model.encode(batch, show_progress_bar=True)
53
+ embeddings.extend(batch_embeddings.tolist())
54
+ return embeddings
55
+
56
+
57
+ def process_and_add_new_files(transcripts_folder_path, collection):
58
+ """Process and add new transcript files to the vector database."""
59
+ new_files = get_new_files(transcripts_folder_path, collection)
60
+ if not new_files:
61
+ logging.info("No new files to process")
62
+ return False
63
+
64
+ # Use a reasonable number of workers (4 is usually a good default)
65
+ n_workers = min(4, len(new_files))
66
+ logging.info(f"Using {n_workers} workers for processing")
67
+
68
+ all_chunks = []
69
+ all_metadata = []
70
+ all_ids = []
71
+
72
+ # Process files in parallel
73
+ with ProcessPoolExecutor(max_workers=n_workers) as executor:
74
+ futures = {
75
+ executor.submit(process_single_file, os.path.join(transcripts_folder_path, file)): file
76
+ for file in new_files
77
+ }
78
+
79
+ for future in as_completed(futures):
80
+ file = futures[future]
81
+ try:
82
+ chunks, filename = future.result()
83
+ file_metadata = [{"source": filename} for _ in range(len(chunks))]
84
+ file_ids = [f"{filename}_chunk_{i}" for i in range(len(chunks))]
85
+
86
+ all_chunks.extend(chunks)
87
+ all_metadata.extend(file_metadata)
88
+ all_ids.extend(file_ids)
89
+
90
+ logging.info(f"Processed {filename}")
91
+ except Exception as e:
92
+ logging.error(f"Error processing {file}: {str(e)}")
93
+ continue
94
+
95
+ # Process embeddings in batches
96
+ logging.info(f"Generating embeddings for {len(all_chunks)} chunks")
97
+ embeddings = batch_embed_chunks(all_chunks)
98
+
99
+ # Add to database in batches
100
+ batch_size = 500
101
+ for i in range(0, len(all_chunks), batch_size):
102
+ end_idx = min(i + batch_size, len(all_chunks))
103
+ collection.upsert(
104
+ documents=all_chunks[i:end_idx],
105
+ embeddings=embeddings[i:end_idx],
106
+ metadatas=all_metadata[i:end_idx],
107
+ ids=all_ids[i:end_idx]
108
+ )
109
+ logging.info(f"Added batch {i // batch_size + 1} to database")
110
+
111
+ logging.info(f"Successfully processed {len(new_files)} files")
112
+ return True
113
+
114
+
115
+ def query_database(collection, query_text, n_results=3):
116
+ """Retrieve the most relevant chunks for the query."""
117
+ query_embeddings = embedding_model.encode(query_text).tolist()
118
+ results = collection.query(query_embeddings=query_embeddings, n_results=n_results)
119
+ retrieved_docs = results['documents'][0]
120
+ metadatas = results['metadatas'][0]
121
+ return retrieved_docs, metadatas
122
+
123
+
124
+ def enhance_query_with_history(query_text, summarized_history):
125
+ enhance_query = f"{query_text}\n\n{summarized_history}"
126
+ return enhance_query
127
+
128
+
129
+ def update_conversation_history(history, user_query, bot_response):
130
+ """Update and keeps track of conversation history between user and the bot."""
131
+ history.append({"user": user_query, "bot": bot_response})
132
+ return history
133
+
134
+
135
+ def generate_response(conversation_history, query_text, retrieved_docs, source_links):
136
+ """Generate a response using retrieved documents and the generative AI model."""
137
+ context = " ".join(retrieved_docs)
138
+ history_str = "\n".join([f"User: {turn['user']}\nBot: {turn['bot']}" for turn in conversation_history])
139
+ sources_str = "\n".join(source_links)
140
+
141
+ prompt = huberman_prompt.format(
142
+ context=context,
143
+ sources=sources_str,
144
+ history=history_str,
145
+ question=query_text
146
+ )
147
+
148
+ response = get_llm_response(prompt)
149
+ full_response = f"{response}\n\nSources:\n{sources_str}"
150
+ return full_response
151
+
152
+
153
+ def main_workflow(transcripts_folder_path, collection):
154
+ """Run the full RAG workflow."""
155
+ new_files_added = process_and_add_new_files(transcripts_folder_path, collection)
156
+ if new_files_added:
157
+ logging.info("New transcripts added to the database.")
158
+ else:
159
+ logging.info("No new files found. Using existing database.")
160
+
161
+ conversation_history = []
162
+
163
+ while True:
164
+ query_text = input("\nEnter your query (or type 'exit' to end): ").strip()
165
+ if query_text.lower() == "exit":
166
+ print("Ending the conversation. Goodbye")
167
+ break
168
+
169
+ query_text_with_conversation_history = enhance_query_with_history(query_text, conversation_history)
170
+ retrived_docs, metadatas = query_database(collection, query_text_with_conversation_history)
171
+ print("-" * 50)
172
+ source_link = get_source_link(metadatas)
173
+ print(source_link)
174
+ print("-" * 50)
175
+
176
+ if not retrived_docs:
177
+ print("No relevant documents found")
178
+ continue
179
+
180
+ response = generate_response(conversation_history, query_text, retrived_docs, source_link)
181
+ conversation_history = update_conversation_history(conversation_history, query_text, response)
182
+ print("\nGenerated Response:")
183
+ print(response)
poetry.lock ADDED
The diff for this file is too large to render.
 
pyproject.toml ADDED
@@ -0,0 +1,34 @@
1
+ [project]
2
+ name = "xyzbot"
3
+ version = "0.1.0"
4
+ description = "A rag application"
5
+ authors = [
6
+ {name = "Angel", email = "njlghmr@gmail.com"}
7
+ ]
8
+ license = {text = "MIT"}
9
+ readme = "README.md"
10
+ requires-python =">=3.11,<3.12"
11
+ dependencies = [
12
+ "pyarrow (>=19.0.0,<20.0.0)",
13
+ "pandas (>=2.2.3,<3.0.0)",
14
+ "pendulum (>=3.0.0,<4.0.0)",
15
+ "google-generativeai (>=0.8.4,<0.9.0)",
16
+ "langchain (>=0.3.16,<0.4.0)",
17
+ "langchain-openai (>=0.3.3,<0.4.0)",
18
+ "langchain-chroma (>=0.2.1,<0.3.0)",
19
+ "langchain-community (>=0.3.16,<0.4.0)",
20
+ "chromadb (>=0.4.14)",
21
+ "pypdf (==4.2.0)",
22
+ "flask (==3.0.1)",
23
+ "flask-cors (==3.0.10)",
24
+ "sentence-transformers (==3.3.1)",
25
+ "tqdm (==4.67.1)",
26
+ "torch (==2.5.1)",
27
+ "transformers (==4.46.3)",
28
+ "pydantic (>=2.7.4,<3.0.0)"
29
+ ]
30
+
31
+
32
+ [build-system]
33
+ requires = ["poetry-core>=2.0.0,<3.0.0"]
34
+ build-backend = "poetry.core.masonry.api"
requirements.in ADDED
@@ -0,0 +1,18 @@
1
+ pyarrow
2
+ pandas[performance, parquet, aws]
3
+ pendulum
4
+ google.generativeai
5
+ langchain
6
+ langchain_openai
7
+ langchain_chroma
8
+ langchain_community
9
+ chromadb==0.4.8
10
+ pypdf
11
+ flask
12
+ flask_cors
13
+ sentence_transformers
14
+ tqdm
15
+ torch
16
+ transformers
17
+ spacy==3.5.0
18
+ coreferee==1.4.1
requirements.txt ADDED
@@ -0,0 +1,20 @@
1
+ # Core dependencies
2
+ langchain>=0.3.16,<0.4.0
3
+ langchain_openai
4
+ langchain_chroma
5
+ langchain-community>=0.3.16,<0.4.0
6
+ chromadb>=0.4.14
7
+ flask==3.0.1
8
+ flask_cors==3.0.10
9
+ google.generativeai
10
+ pydantic>=2.7.4,<3.0.0
11
+ streamlit
12
+ # PDF Processing
13
+ pypdf==4.2.0
14
+
15
+ # ML/AI Dependencies (with CPU-only versions)
16
+ sentence_transformers==2.3.1
17
+ --extra-index-url https://download.pytorch.org/whl/cpu
18
+ torch==2.1.0+cpu
19
+
20
+ gradio
setup.sh ADDED
@@ -0,0 +1,10 @@
1
+ #!/usr/bin/env bash
+ # Install Python dependencies
2
+ pip install -r requirements.txt
3
+
4
+ # Download spaCy model
5
+ python -m spacy download en_core_web_sm
6
+
7
+ # Install Coreferee for English
8
+ python -m coreferee install en
9
+
10
+ echo "Setup completed successfully!"
ui/__init__.py ADDED
File without changes
ui/app.py ADDED
@@ -0,0 +1,147 @@
1
+ import gradio as gr
2
+ import chromadb
3
+ from typing import List, Dict
4
+ import sys
5
+ from pathlib import Path
6
+
7
+ project_root = Path(__file__).resolve().parent.parent
8
+ sys.path.append(str(project_root))
9
+ sys.path.append(str(project_root / "Rag"))
10
+ sys.path.append(str(project_root / "Data"))
11
+ sys.path.append(str(project_root / "Data" / "transcripts"))
12
+ sys.path.append(str(project_root / "Data" / "video_links"))
13
+ sys.path.append(str(project_root / "Llm"))
14
+ sys.path.append(str(project_root / "Prompts"))
15
+ sys.path.append(str(project_root / "utils"))
16
+ from Rag.rag_pipeline import (
17
+ query_database,
18
+ generate_response,
19
+ enhance_query_with_history,
20
+ update_conversation_history,
21
+ process_and_add_new_files
22
+ )
23
+
24
+ INTRODUCTION = """
25
+ # 🧠 Welcome to HubermanBot!
26
+
27
+ I am your AI assistant trained on Andrew Huberman's podcast content. My knowledge base includes detailed information about:
28
+
29
+ - 🎯 Peak Performance & Focus
30
+ - 😴 Sleep Science & Optimization
31
+ - 🏋️ Physical Fitness & Recovery
32
+ - 🧘 Mental Health & Stress Management
33
+ - 🧪 Neuroscience & Biology
34
+ - 💪 Habit Formation & Behavior Change
35
+
36
+ For each response, I'll provide:
37
+ - Detailed answers based on podcast content
38
+ - Direct source links to specific episodes
39
+ - Scientific context when available
40
+
41
+ Ask me anything about these topics, and I'll help you find relevant information from the Huberman Lab Podcast!
42
+
43
+ Example questions you might ask:
44
+ - "What does Dr. Huberman recommend for better sleep?"
45
+ - "How can I improve my focus and concentration?"
46
+ - "What are the best practices for morning routines?"
47
+ """
48
+
49
+
50
+ def format_youtube_url(filename: str) -> str:
51
+ """Convert filename to YouTube URL"""
52
+ # Extract video ID by removing the timestamp and .txt extension
53
+ video_id = filename.split('_')[0]
54
+ return f"https://www.youtube.com/watch?v={video_id}"
55
+
56
+
57
+ class RAGChatInterface:
58
+ def __init__(self, transcripts_folder_path: str, collection):
59
+ self.transcripts_folder_path = transcripts_folder_path
60
+ self.collection = collection
61
+ self.conversation_history: List[Dict[str, str]] = []
62
+
63
+ def process_query(self, message: str, history: List[List[str]]) -> str:
64
+ """Process a single query and return the response"""
65
+ # Convert Gradio history format to our conversation history format
66
+ self.conversation_history = [
67
+ {"user": user_msg, "bot": bot_msg}
68
+ for user_msg, bot_msg in history
69
+ ]
70
+
71
+ # Enhance query with conversation history
72
+ query_with_history = enhance_query_with_history(message, self.conversation_history)
73
+
74
+ # Get relevant documents
75
+ retrieved_docs, metadatas = query_database(self.collection, query_with_history)
76
+
77
+ if not retrieved_docs:
78
+ return "I apologize, but I couldn't find any relevant information about that in my knowledge base. Could you try rephrasing your question or ask about a different topic covered in the Huberman Lab Podcast?"
79
+
80
+ # Generate response
81
+ source_links = [meta["source"] for meta in metadatas]
82
+ response = generate_response(
83
+ self.conversation_history,
84
+ message,
85
+ retrieved_docs,
86
+ source_links
87
+ )
88
+
89
+ # Remove duplicate sources and convert to YouTube URLs
90
+ unique_sources = list(set(source_links))
91
+ youtube_urls = [format_youtube_url(source) for source in unique_sources]
92
+
93
+ # Format response with markdown for better readability
94
+ formatted_response = f"{response}\n\n---\n📚 **Source Episodes:**\n"
95
+ for url in youtube_urls:
96
+ formatted_response += f"- {url}\n"
97
+
98
+ return formatted_response
99
+
100
+
101
+ def create_interface(transcripts_folder_path: str, collection) -> gr.Interface:
102
+ """Create and configure the Gradio interface"""
103
+ # Initialize the RAG chat interface
104
+ rag_chat = RAGChatInterface(transcripts_folder_path, collection)
105
+
106
+ # Create the Gradio interface with custom styling
107
+ interface = gr.ChatInterface(
108
+ fn=rag_chat.process_query,
109
+ title="🧠 HubermanBot - Your Neuroscience & Wellness AI Assistant",
110
+ description=INTRODUCTION,
111
+ examples=[
112
+ "What are Dr. Huberman's top recommendations for better sleep?",
113
+ "How does sunlight exposure affect our circadian rhythm?",
114
+ "What supplements does Dr. Huberman recommend for focus?",
115
+ "What are the best practices for morning routines according to Dr. Huberman?",
116
+ "How can I optimize my workout recovery based on neuroscience?",
117
+ ],
118
+ theme=gr.themes.Soft(
119
+ primary_hue="indigo",
120
+ secondary_hue="blue",
121
+ )
122
+ )
123
+
124
+ return interface
125
+
126
+
127
+ def main():
128
+ # Get absolute path for ChromaDB
129
+ project_root = Path(__file__).parent.parent
130
+ chromadb_path = project_root / "Rag" / "chromadb.db"
131
+
132
+ client = chromadb.PersistentClient(path=str(chromadb_path))
133
+ collection = client.get_or_create_collection(name="yt_transcript_collection")
134
+
135
+ # Use absolute path for transcripts folder too
136
+ transcripts_folder_path = project_root / "Data" / "transcripts"
137
+
138
+ # Process any new files
139
+ process_and_add_new_files(str(transcripts_folder_path), collection)
140
+
141
+ # Create and launch the interface
142
+ interface = create_interface(str(transcripts_folder_path), collection)
143
+ interface.launch(share=True, server_port=7860)
144
+
145
+
146
+ if __name__ == "__main__":
147
+ main()
utils/__init__.py ADDED
File without changes
utils/corefrence.py ADDED
@@ -0,0 +1,52 @@
1
+ import spacy
2
+ from spacy.tokens import Doc
3
+ import coreferee
4
+
5
+ # Load spaCy model
6
+ nlp = spacy.load('en_core_web_sm')
7
+ nlp.add_pipe("coreferee")
8
+
9
+ # Register the custom extension attribute
10
+ Doc.set_extension('resolved_text', default=None, force=True)
11
+
12
+
13
+ def resolve_coreferences(query_text, conversation_history):
14
+ """
15
+ Resolve coreferences in the given text using spaCy and coreferee.
16
+
17
+ Args:
18
+ query_text (str): The current query to resolve
19
+ conversation_history (list): List of dictionaries containing previous conversation turns
20
+
21
+ Returns:
22
+ str: Text with resolved coreferences
23
+ """
24
+ # Combine conversation history and current query
25
+ combined_text = []
26
+ for turn in conversation_history:
27
+ combined_text.append(f"User: {turn['user']}")
28
+ combined_text.append(f"Bot: {turn['bot']}")
29
+ combined_text.append(f"User: {query_text}")
30
+ text = "\n".join(combined_text)
31
+
32
+ # Process the text
33
+ doc = nlp(text)
34
+
35
+ # Get all tokens and their potential antecedents
36
+ resolved_tokens = list(doc)
37
+
38
+ # Resolve coreferences
39
+ for chain in doc._.coref_chains:
40
+ for mention in chain:
41
+ if mention.root_index != chain.most_specific.root_index:
42
+ # Replace mention with its antecedent
43
+ resolved_tokens[mention.root_index] = doc[chain.most_specific.root_index]
44
+
45
+ # Reconstruct the text with resolved references
46
+ resolved_text = "".join([token.text_with_ws if isinstance(token, spacy.tokens.Token)
47
+ else token.text + " " for token in resolved_tokens])
48
+
49
+ # Extract the resolved query (last line)
50
+ resolved_query = resolved_text.split('\n')[-1].replace("User: ", "").strip()
51
+
52
+ return resolved_query
utils/get_link.py ADDED
@@ -0,0 +1,11 @@
+ def get_source_link(metadatas):
+     """Build YouTube watch URLs from chunk metadata, where 'source' is a
+     transcript filename of the form '<video_id>_<timestamp>.txt'."""
+     base_link = 'https://www.youtube.com/watch?v='
+     yt_link = []
+     for metadata in metadatas:
+         source = metadata['source']
+         # Strip the '.txt' extension and the timestamp suffix to recover the video ID
+         video_id = source.split('.txt')[0].split('_')[0]
+         yt_link.append(base_link + video_id)
+     return yt_link
utils/summarization.py ADDED
@@ -0,0 +1,14 @@
1
+ from Llm.llm_endpoints import get_llm_response
2
+
3
+
4
+ def summarize_conversation(conversation_history):
5
+ try:
6
+ summary_prompt = "Summarize the following conversation:\n" + "\n".join(
7
+ [f"User: {turn['user']}\nBot: {turn['bot']}" for turn in conversation_history])
8
+ summary = get_llm_response(summary_prompt)
9
+ print("*************************************************")
10
+ print(summary)
11
+ print("*************************************************")
12
+ return summary
13
+ except Exception as e:
+ print(f"Error while summarizing conversation: {e}")
+ return ""