Resolved merge conflict in README.md
- .dockerignore +23 -0
- .gitignore +255 -0
- .gradio/certificate.pem +31 -0
- Data/__init__.py +0 -0
- Data/get_video_link.py +152 -0
- Data/new_video_added.py +22 -0
- Data/yt_transcript.py +94 -0
- Dockerfile +49 -0
- Example/__init__.py +0 -0
- Example/rag_example.py +18 -0
- Llm/__init__.py +0 -0
- Llm/llm_endpoints.py +14 -0
- Prompts/__init__.py +0 -0
- Prompts/huberman_prompt.py +20 -0
- Prompts/summary_prompt.py +10 -0
- README.md +99 -0
- Rag/Processed_folder/processed_files.json +1 -0
- Rag/__init__.py +0 -0
- Rag/rag_pipeline.py +183 -0
- poetry.lock +0 -0
- pyproject.toml +34 -0
- requirements.in +18 -0
- requirements.txt +20 -0
- setup.sh +10 -0
- ui/__init__.py +0 -0
- ui/app.py +147 -0
- utils/__init__.py +0 -0
- utils/corefrence.py +52 -0
- utils/get_link.py +11 -0
- utils/summarization.py +14 -0
.dockerignore
ADDED
@@ -0,0 +1,23 @@
# Ignore version control
.git
.gitignore

# Ignore notebooks
Notebook/

# Ignore databases and logs
**/*.db
**/*.sqlite3
**/chromadb.db
**/error_log.txt

# Ignore cache
**/__pycache__/
**/*.pyc
**/*.pyo
**/*.pyd

# Ignore environment files
.env
venv/
.venv/
.gitignore
ADDED
@@ -0,0 +1,255 @@
tmp
.idea
models

stanford-ner-2015-04-20.zip
stanford-ner-2015-04-20
*.pyc

### Python template
# Byte-compiled / optimized / DLL files
__pycache__/
*.py[cod]
*$py.class

# C extensions
*.so

# Distribution / packaging
.Python
build/
develop-eggs/
dist/
downloads/
eggs/
.eggs/
lib/
lib64/
parts/
sdist/
var/
wheels/
pip-wheel-metadata/
share/python-wheels/
*.egg-info/
.installed.cfg
*.egg
MANIFEST

# PyInstaller
# Usually these files are written by a python script from a template
# before PyInstaller builds the exe, so as to inject date/other infos into it.
*.manifest
*.spec

# Installer logs
pip-log.txt
pip-delete-this-directory.txt

# Unit test / coverage reports
htmlcov/
.tox/
.nox/
.coverage
.coverage.*
.cache
nosetests.xml
coverage.xml
*.cover
.hypothesis/
.pytest_cache/

# Translations
*.mo
*.pot

service.log.*

# Django stuff:
*.log
local_settings.py
db.sqlite3
db.sqlite3-journal

# Flask stuff:
instance/
.webassets-cache

# Scrapy stuff:
.scrapy

# Sphinx documentation
docs/_build/

# PyBuilder
target/

# Jupyter Notebook
.ipynb_checkpoints

# IPython
profile_default/
ipython_config.py

# pyenv
.python-version

# pipenv
# According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
# However, in case of collaboration, if having platform-specific dependencies or dependencies
# having no cross-platform support, pipenv may install dependencies that don't work, or not
# install all needed dependencies.
#Pipfile.lock

# celery beat schedule file
celerybeat-schedule

# SageMath parsed files
*.sage.py

# Environments
.env
.venv
env/
venv/
ENV/
env.bak/
venv.bak/
Data/transcripts/
Data/videolinks/
Rag/db/
Rag/db/chroma.sqlite3
Rag/chromadb.db/
# Spyder project settings
.spyderproject
.spyproject

# Rope project settings
.ropeproject

# mkdocs documentation
/site
__pycache__/
*.pyc
*.pyo
*.pyd
.env
# mypy
.mypy_cache/
.dmypy.json
dmypy.json

# Pyre type checker
.pyre/

### JetBrains template
# Covers JetBrains IDEs: IntelliJ, RubyMine, PhpStorm, AppCode, PyCharm, CLion, Android Studio and WebStorm
# Reference: https://intellij-support.jetbrains.com/hc/en-us/articles/206544839

# User-specific stuff
.idea/**/workspace.xml
.idea/**/tasks.xml
.idea/**/usage.statistics.xml
.idea/**/dictionaries
.idea/**/shelf

# Generated files
.idea/**/contentModel.xml

# Sensitive or high-churn files
.idea/**/dataSources/
.idea/**/dataSources.ids
.idea/**/dataSources.local.xml
.idea/**/sqlDataSources.xml
.idea/**/dynamic.xml
.idea/**/uiDesigner.xml
.idea/**/dbnavigator.xml

# Gradle
.idea/**/gradle.xml
.idea/**/libraries

# Gradle and Maven with auto-import
# When using Gradle or Maven with auto-import, you should exclude module files,
# since they will be recreated, and may cause churn. Uncomment if using
# auto-import.
# .idea/modules.xml
# .idea/*.iml
# .idea/modules
# *.iml
# *.ipr

# CMake
cmake-build-*/

# Mongo Explorer plugin
.idea/**/mongoSettings.xml

# File-based project format
*.iws

# IntelliJ
out/

# mpeltonen/sbt-idea plugin
.idea_modules/

# JIRA plugin
atlassian-ide-plugin.xml

# Cursive Clojure plugin
.idea/replstate.xml

# Crashlytics plugin (for Android Studio and IntelliJ)
com_crashlytics_export_strings.xml
crashlytics.properties
crashlytics-build.properties
fabric.properties

# Editor-based Rest Client
.idea/httpRequests

# Android studio 3.1+ serialized cache file
.idea/caches/build_file_checksums.ser

### VirtualEnv template
# Virtualenv
# http://iamzed.com/2009/05/07/a-primer-on-virtualenv/
.Python
[Bb]in
[Ii]nclude
[Ll]ib
[Ll]ib64
[Ll]ocal
[Ss]cripts
pyvenv.cfg
.venv
pip-selfcheck.json

files
Files
*.tmp
.vscode
my_virtual_environment
dist
crf_py_utils.egg-info
build
datas
tests/data
venv
create_docker_image.sh

anydonebert/data

results
train_test_split

anydonebert/models/sbert.net_models_paraphrase-distilroberta-base-v1
anydonebert/models/sbert.net_models_paraphrase-distilroberta-base-v2
resources/conll_files/*
resources/test_xml_files/*
resources/xml_files/*
config.ini
flowcess/commons/settings.py

.gradio/certificate.pem
ADDED
@@ -0,0 +1,31 @@
-----BEGIN CERTIFICATE-----
MIIFazCCA1OgAwIBAgIRAIIQz7DSQONZRGPgu2OCiwAwDQYJKoZIhvcNAQELBQAw
TzELMAkGA1UEBhMCVVMxKTAnBgNVBAoTIEludGVybmV0IFNlY3VyaXR5IFJlc2Vh
cmNoIEdyb3VwMRUwEwYDVQQDEwxJU1JHIFJvb3QgWDEwHhcNMTUwNjA0MTEwNDM4
WhcNMzUwNjA0MTEwNDM4WjBPMQswCQYDVQQGEwJVUzEpMCcGA1UEChMgSW50ZXJu
ZXQgU2VjdXJpdHkgUmVzZWFyY2ggR3JvdXAxFTATBgNVBAMTDElTUkcgUm9vdCBY
MTCCAiIwDQYJKoZIhvcNAQEBBQADggIPADCCAgoCggIBAK3oJHP0FDfzm54rVygc
h77ct984kIxuPOZXoHj3dcKi/vVqbvYATyjb3miGbESTtrFj/RQSa78f0uoxmyF+
0TM8ukj13Xnfs7j/EvEhmkvBioZxaUpmZmyPfjxwv60pIgbz5MDmgK7iS4+3mX6U
A5/TR5d8mUgjU+g4rk8Kb4Mu0UlXjIB0ttov0DiNewNwIRt18jA8+o+u3dpjq+sW
T8KOEUt+zwvo/7V3LvSye0rgTBIlDHCNAymg4VMk7BPZ7hm/ELNKjD+Jo2FR3qyH
B5T0Y3HsLuJvW5iB4YlcNHlsdu87kGJ55tukmi8mxdAQ4Q7e2RCOFvu396j3x+UC
B5iPNgiV5+I3lg02dZ77DnKxHZu8A/lJBdiB3QW0KtZB6awBdpUKD9jf1b0SHzUv
KBds0pjBqAlkd25HN7rOrFleaJ1/ctaJxQZBKT5ZPt0m9STJEadao0xAH0ahmbWn
OlFuhjuefXKnEgV4We0+UXgVCwOPjdAvBbI+e0ocS3MFEvzG6uBQE3xDk3SzynTn
jh8BCNAw1FtxNrQHusEwMFxIt4I7mKZ9YIqioymCzLq9gwQbooMDQaHWBfEbwrbw
qHyGO0aoSCqI3Haadr8faqU9GY/rOPNk3sgrDQoo//fb4hVC1CLQJ13hef4Y53CI
rU7m2Ys6xt0nUW7/vGT1M0NPAgMBAAGjQjBAMA4GA1UdDwEB/wQEAwIBBjAPBgNV
HRMBAf8EBTADAQH/MB0GA1UdDgQWBBR5tFnme7bl5AFzgAiIyBpY9umbbjANBgkq
hkiG9w0BAQsFAAOCAgEAVR9YqbyyqFDQDLHYGmkgJykIrGF1XIpu+ILlaS/V9lZL
ubhzEFnTIZd+50xx+7LSYK05qAvqFyFWhfFQDlnrzuBZ6brJFe+GnY+EgPbk6ZGQ
3BebYhtF8GaV0nxvwuo77x/Py9auJ/GpsMiu/X1+mvoiBOv/2X/qkSsisRcOj/KK
NFtY2PwByVS5uCbMiogziUwthDyC3+6WVwW6LLv3xLfHTjuCvjHIInNzktHCgKQ5
ORAzI4JMPJ+GslWYHb4phowim57iaztXOoJwTdwJx4nLCgdNbOhdjsnvzqvHu7Ur
TkXWStAmzOVyyghqpZXjFaH3pO3JLF+l+/+sKAIuvtd7u+Nxe5AW0wdeRlN8NwdC
jNPElpzVmbUq4JUagEiuTDkHzsxHpFKVK7q4+63SM1N95R1NbdWhscdCb+ZAJzVc
oyi3B43njTOQ5yOf+1CceWxG1bQVs5ZufpsMljq4Ui0/1lvh+wjChP4kqKOJ2qxq
4RgqsahDYVvTH9w7jXbyLeiNdd8XM2w9U/t7y0Ff/9yi0GE44Za4rF2LN9d11TPA
mRGunUHBcnWEvgJBQl9nJEiU0Zsnvgc/ubhPgXRR4Xq37Z0j4r7g1SgEEzwxA57d
emyPxgcYxn/eR44/KJ4EBs+lVDR3veyJm+kXQ99b21/+jh5Xos1AnX5iItreGCc=
-----END CERTIFICATE-----
Data/__init__.py
ADDED
File without changes
Data/get_video_link.py
ADDED
@@ -0,0 +1,152 @@
import os
import requests
from dotenv import load_dotenv
from Data.new_video_added import get_new_video_url
from datetime import datetime
import json
from pathlib import Path

load_dotenv()

api_key = os.getenv('API_KEY')
CURRENT_DIR = os.path.dirname(os.path.abspath(__file__))
BASE_URL = "https://www.googleapis.com/youtube/v3"
BASE_DIR = os.path.dirname(os.path.abspath(__file__))
channel = "https://www.youtube.com/@hubermanlab/videos"
new_video_added = False
PROJECT_ROOT = Path(__file__).resolve().parent.parent
video_links_folder_name = os.path.join(PROJECT_ROOT, "Data", "video_links")


def ensure_directories():
    if not os.path.exists(video_links_folder_name):
        os.makedirs(video_links_folder_name)
        print(f"Directory {video_links_folder_name} created")


def get_chanel_id(chanel_name):
    url = f"{BASE_URL}/search"
    params = {
        "part": "snippet",
        "q": chanel_name,
        "type": "channel",
        "key": api_key
    }
    response = requests.get(url, params)
    response_data = response.json()
    if "items" in response_data and len(response_data["items"]) > 0:
        return response_data["items"][0]["snippet"]["channelId"]
    else:
        return None


def get_video_links(channel_id):
    url = f"{BASE_URL}/search"
    video_links = []
    next_page_token = None

    while True:
        params = {
            "part": "snippet",
            "channelId": channel_id,
            "maxResults": 50,
            "type": "video",
            "key": api_key,
        }
        if next_page_token:
            params["pageToken"] = next_page_token

        response = requests.get(url, params=params)
        response_data = response.json()

        if "items" not in response_data:
            break

        for item in response_data["items"]:
            video_id = item["id"]["videoId"]
            video_links.append(f"https://www.youtube.com/watch?v={video_id}")

        next_page_token = response_data.get("nextPageToken")
        if not next_page_token:
            break

    return video_links


def save_video_links(video_links):
    if not os.path.exists(video_links_folder_name):
        os.makedirs(video_links_folder_name)
    timestamp = datetime.now().strftime("%Y%m%d%H%M%S")
    filename = f"video_links_{timestamp}.json"
    filepath = os.path.join(video_links_folder_name, filename)
    with open(filepath, 'w') as file:
        json.dump(video_links, file)
    print(f"{len(video_links)} video links saved successfully to {filename}")


def load_video_links():
    """
    Load the most recent video links file based on the timestamp in the filename.
    """
    if not os.path.exists(video_links_folder_name):
        print(f"{video_links_folder_name} does not exist")
        return []
    files = [f for f in os.listdir(video_links_folder_name) if f.startswith("video_links_") and f.endswith(".json")]

    if not files:
        print("No video links file found.")
        return []

    # Sort files by the timestamp in their names (descending)
    files.sort(key=lambda x: datetime.strptime(x[len("video_links_"):-len(".json")], "%Y%m%d%H%M%S"), reverse=True)

    # Load the most recent file
    latest_file = files[0]
    filepath = os.path.join(video_links_folder_name, latest_file)
    try:
        with open(filepath, 'r') as file:
            video_links = json.load(file)
        print(f"{len(video_links)} video links loaded successfully from {latest_file}.")
        return video_links
    except Exception as e:
        print(f"Error loading {latest_file}: {e}")
        return []


def video_links_main():
    ensure_directories()
    video_links = load_video_links()
    if video_links:
        print(f"Using {len(video_links)} saved video links")
    else:
        channel_name = input("Enter the YouTube channel name: ")
        channel_id = get_chanel_id(channel_name)

        if channel_id:
            print(f"Fetching videos for channel: {channel_name} (ID: {channel_id})")
            video_links = get_video_links(channel_id)
            save_video_links(video_links)
        else:
            print("Failed to fetch video links")

    new_video_url = get_new_video_url(channel)
    new_videos = [url for url in new_video_url if url not in video_links]

    if new_videos:
        print(f"{len(new_videos)} new videos found")
        video_links.extend(new_videos)
        save_video_links(video_links)
        new_video_added = True
    else:
        print("No new videos found")
        new_video_added = False
    return video_links, new_video_added, new_videos


if __name__ == "__main__":
    video_links_main()
Data/new_video_added.py
ADDED
@@ -0,0 +1,22 @@
import requests
import re


def get_new_video_url(channel):
    """
    Fetch all video URLs from the given YouTube channel page.
    """
    try:
        html = requests.get(channel).text
        # Extract all video IDs from the HTML
        video_ids = re.findall(r'(?<="videoId":").*?(?=")', html)
        video_urls = [f"https://www.youtube.com/watch?v={video_id}" for video_id in video_ids]

        # Remove duplicates while preserving order
        video_urls = list(dict.fromkeys(video_urls))
        print(f"Fetched {len(video_urls)} video URLs from the channel.")
        return video_urls
    except Exception as e:
        print(f"Error fetching video URLs: {e}")
        return []
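A minimal usage sketch for `get_new_video_url`, assuming it is run from the project root with `requests` installed; the channel URL is the one hard-coded in `Data/get_video_link.py` and is used here only for illustration.

```python
# Usage sketch (assumed to run from the repository root).
from Data.new_video_added import get_new_video_url

urls = get_new_video_url("https://www.youtube.com/@hubermanlab/videos")
print(f"{len(urls)} URLs scraped")
print(urls[:3])  # duplicates removed, original order preserved
```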
Data/yt_transcript.py
ADDED
@@ -0,0 +1,94 @@
from youtube_transcript_api import YouTubeTranscriptApi
from Data.get_video_link import video_links_main
from pathlib import Path
from datetime import datetime

# Dynamically get the root directory of the project
PROJECT_ROOT = Path(__file__).resolve().parent.parent  # Moves up from /Data/
TRANSCRIPTS_FOLDER = PROJECT_ROOT / "Data" / "transcripts"


def save_transcript(video_id, transcript_text):
    """
    Saves a transcript to the local transcripts folder.
    """
    # Ensure the transcripts folder exists
    TRANSCRIPTS_FOLDER.mkdir(parents=True, exist_ok=True)

    timestamp = datetime.now().strftime("%Y%m%d%H%M%S")
    filename = f"{video_id}_{timestamp}.txt"
    file_path = TRANSCRIPTS_FOLDER / filename

    file_path.write_text('\n'.join(transcript_text), encoding="utf-8")
    return file_path


def get_video_id(video_links_list):
    return [link.replace("https://www.youtube.com/watch?v=", "") for link in video_links_list]


def fetch_yt_transcript(video_ids):
    """
    Fetches YouTube transcripts using video IDs.
    """
    video_transcripts = {}

    for video_id in video_ids:
        print(f"Fetching transcript for: {video_id}")
        try:
            output = YouTubeTranscriptApi.get_transcript(video_id)
            transcript_text = [item['text'] for item in output]

            # Save transcript and get file path
            file_path = save_transcript(video_id, transcript_text)
            video_transcripts[video_id] = {
                'text': transcript_text,
                'file_path': str(file_path)
            }
            print(f"Transcript saved to: {file_path}")

        except Exception as e:
            print(f"Transcript not found for video: {video_id} ({e})")
            video_transcripts[video_id] = {
                'text': [],
                'file_path': None
            }

    return video_transcripts


def all_video_transcript_pipeline():
    """
    Handles fetching and storing transcripts, checking for new videos.
    """
    print(f"Looking for transcripts in: {TRANSCRIPTS_FOLDER}")
    video_links_list, new_video_added, new_videos_link = video_links_main()
    video_transcripts = {}

    # Always load existing transcripts
    if TRANSCRIPTS_FOLDER.exists():
        existing_files = list(TRANSCRIPTS_FOLDER.glob("*.txt"))
        print(f"Found {len(existing_files)} transcript files.")

        for file in existing_files:
            video_id = file.stem.split("_")[0]  # Extract video ID
            try:
                transcript_text = file.read_text(encoding="utf-8").splitlines()
                video_transcripts[video_id] = {
                    'text': transcript_text,
                    'file_path': str(file)
                }
                print(f"Loaded transcript for video: {video_id}")
            except Exception as e:
                print(f"Error loading transcript {file.name}: {e}")
    else:
        print(f"Transcripts folder not found at: {TRANSCRIPTS_FOLDER}, creating it.")
        TRANSCRIPTS_FOLDER.mkdir(parents=True, exist_ok=True)

    # Fetch new transcripts if needed
    if new_video_added and new_videos_link:
        print("New videos detected... Fetching transcripts.")
        new_video_ids = [url.split("v=")[1] for url in new_videos_link]  # Extract video IDs
        new_transcripts = fetch_yt_transcript(new_video_ids)
        # Merge the newly fetched transcripts into the loaded collection
        video_transcripts.update(new_transcripts)

    print(f"Total transcripts loaded: {len(video_transcripts)}")
    return video_transcripts
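A usage sketch for `fetch_yt_transcript`: the video ID below is taken from the processed-files list later in this commit, and any public video with captions should behave the same way. Transcripts land under `Data/transcripts/`.

```python
# Usage sketch: fetch and save the transcript for a single video ID.
from Data.yt_transcript import fetch_yt_transcript

result = fetch_yt_transcript(["wAZn9dF3XTo"])
info = result["wAZn9dF3XTo"]
print(info["file_path"])            # Data/transcripts/<id>_<timestamp>.txt, or None on failure
print(len(info["text"]), "lines")   # transcript segments, one per line
```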
Dockerfile
ADDED
@@ -0,0 +1,49 @@
# Declare build arguments at the top (for initial stage)
ARG USER_UID=1000
ARG USER_GID=1000

# Stage 1: Build dependencies
FROM python:3.11-slim AS builder
WORKDIR /app
RUN apt-get update && \
    apt-get install -y --no-install-recommends \
    build-essential \
    git && \
    rm -rf /var/lib/apt/lists/*
RUN python -m venv /opt/venv
ENV PATH="/opt/venv/bin:$PATH"
COPY requirements.txt .
RUN pip install --no-cache-dir -r requirements.txt

# Stage 2: Final image
FROM python:3.11-slim

# Re-declare build arguments for this stage
ARG USER_UID=1000
ARG USER_GID=1000

COPY --from=builder /opt/venv /opt/venv
ENV PATH="/opt/venv/bin:$PATH"
WORKDIR /app
RUN apt-get update && \
    apt-get install -y --no-install-recommends \
    libgomp1 && \
    rm -rf /var/lib/apt/lists/*

COPY . .

# Create the group and user first
RUN groupadd -g ${USER_GID} appuser && \
    useradd -m -u ${USER_UID} -g appuser appuser

# Create directories and set permissions
RUN mkdir -p /app/Rag/chromadb.db && \
    mkdir -p /app/Data && \
    chown -R appuser:appuser /app

USER appuser

# Make sure your Python code uses this path for ChromaDB
ENV CHROMA_PERSISTENCE_DIRECTORY=/app/Rag/chromadb.db

CMD ["python", "-m", "ui.app"]
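The Dockerfile exports `CHROMA_PERSISTENCE_DIRECTORY`, but the RAG pipeline below hard-codes its own path; the sketch that follows is only a suggestion for how client code could honor that variable with a local fallback, not a description of existing behavior.

```python
# Sketch only: read the ChromaDB path from CHROMA_PERSISTENCE_DIRECTORY (set in the
# Dockerfile) and fall back to the in-repo path when running outside the container.
import os
import chromadb

persist_dir = os.getenv("CHROMA_PERSISTENCE_DIRECTORY", "Rag/chromadb.db")
client = chromadb.PersistentClient(path=persist_dir)
collection = client.get_or_create_collection(name="yt_transcript_collection")
print(collection.count(), "chunks in the store")
```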
Example/__init__.py
ADDED
File without changes
Example/rag_example.py
ADDED
@@ -0,0 +1,18 @@
import sys
import chromadb
from pathlib import Path

PROJECT_ROOT = Path(__file__).resolve().parent.parent
transcripts_folder_path = PROJECT_ROOT / "Data" / "transcripts"
chromadb_path = PROJECT_ROOT / "Rag" / "chromadb.db"
client = chromadb.PersistentClient(path=str(chromadb_path))
collection = client.get_or_create_collection(name="yt_transcript_collection")
sys.path.append(str(PROJECT_ROOT))
sys.path.append(str(PROJECT_ROOT / "Rag"))
from Rag.rag_pipeline import main_workflow

# Run the application
if __name__ == "__main__":
    main_workflow(transcripts_folder_path, collection)
Llm/__init__.py
ADDED
File without changes
Llm/llm_endpoints.py
ADDED
@@ -0,0 +1,14 @@
from dotenv import load_dotenv
import os
import google.generativeai as genai


# Configure the Generative AI model with the API key from the environment
load_dotenv()
genai.configure(api_key=os.getenv("GEMINI_API_KEY"))
gemini_model = genai.GenerativeModel("models/gemini-1.5-flash")


# Function to get a response from the generative model
def get_llm_response(prompt: str) -> str:
    response = gemini_model.generate_content(prompt)
    return response.text
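A usage sketch for `get_llm_response`: the module configures Gemini at import time, so `GEMINI_API_KEY` must be present in the environment (or `.env`) before the import runs.

```python
# Usage sketch; requires GEMINI_API_KEY to be set before importing the module.
from Llm.llm_endpoints import get_llm_response

print(get_llm_response("In one sentence, what does a retrieval-augmented generation pipeline do?"))
```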
Prompts/__init__.py
ADDED
File without changes
Prompts/huberman_prompt.py
ADDED
@@ -0,0 +1,20 @@
huberman_prompt = """
You are Dr. Andrew Huberman, an expert neuroscientist and educator known for your clear, engaging, and scientifically accurate explanations. When answering, please consider the following:
1. Provide a clear and concise summary of the scientific concepts involved.
2. Highlight any relevant research or studies.
3. Offer actionable insights or practical advice.

Context:
{context}

Sources:
{sources}

Conversation History:
{history}

Question:
{question}

Please respond in a manner that is informative, research-backed, and reflective of your unique style.
"""
Prompts/summary_prompt.py
ADDED
@@ -0,0 +1,10 @@
summary_prompts = """
#System
You are an AI agent whose job is to summarize the conversation between the AI bot and the user.
Here is the conversation history:
{{}}

#Output format


"""
README.md
CHANGED
@@ -12,3 +12,102 @@ short_description: a bot
---

Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference

# Andrew Huberman RAG-Based AI Chatbot

## Overview
Xyzbot is an AI chatbot that extracts and synthesizes insights from Andrew Huberman's YouTube videos. It automatically retrieves video transcripts, updates its knowledge base in ChromaDB, and provides citation-linked responses.

## 🚀 Key Features
- Mimics Andrew Huberman's insights using YouTube video transcripts
- Automatic transcript retrieval and knowledge base updates
- RAG-powered response generation with direct video citations
- Interactive Gradio user interface
- Docker-based deployment for easy scalability

## 🛠 Tech Stack
- Backend: Python, LangChain, Google Gemini API
- Frontend: Gradio
- Database: ChromaDB
- Deployment: Docker

## 📂 Project Structure
```
📦 Xyzbot
├── 📂 Data
├── 📂 Example
├── 📂 Llm
├── 📂 Notebook
├── 📂 Prompts
├── 📂 Rag
│   ├── chromadb.db
│   └── 📂 Processed_folder
├── 📂 utils
├── Dockerfile
└── pyproject.toml
```

## 🔧 Prerequisites
- Python 3.11
- Docker (optional)

## 🔑 API Keys Required
1. Google Gemini API Key
2. YouTube API Key

## 🚀 Installation

### Local Setup
1. Clone the repository
```bash
git clone https://github.com/Angel-dash/Xyzbot.git
cd Xyzbot
```

2. Create a virtual environment and install dependencies
```bash
python3 -m venv venv
source venv/bin/activate
pip install -r requirements.txt
```

### Docker Setup

#### Option 1: Build Locally
```bash
docker build -t xyzbot:v1.0 .
docker run -it \
  -v $(pwd)/Rag:/app/Rag:rw \
  -e GOOGLE_API_KEY=your_api_key \
  xyzbot:v1.0
```

#### Option 2: Pull from Docker Hub
```bash
docker pull angeldash/xyzbot:v1.0
docker run -it \
  -v $(pwd)/Rag:/app/Rag:rw \
  -e GOOGLE_API_KEY=your_api_key \
  angeldash/xyzbot:v1.0
```

## 🖥️ Running the Application
```bash
python -m ui.app
```

## 📈 Future Roadmap
- Fine-tuned LLM response generation
- Real-time multi-channel monitoring
- Enhanced citation formatting
- AI agent conversation handling
- Performance optimization

## 📜 License
MIT License

## 🤝 Contributing
Contributions are welcome! Open an issue or submit a pull request.

---
**Author:** Angel Dash | **GitHub:** [@Angel-dash](https://github.com/Angel-dash)

ADDED
@@ -0,0 +1 @@
|
["VOfwbcveP84_20241225194621.txt", "In9Bq4EJMZw_20241225194705.txt", "DkS1pkKpILY_20241225194325.txt", "ajneRM-ET1Q_20241225194311.txt", "K4Ze-Sp6aUE_20241225194709.txt", "n28W4AmvMDE_20241225194626.txt", "UIy-WQCZd4M_20241225194819.txt", "etbfLTHD_VU_20241225194439.txt", "PVmQOLYckKQ_20241225194814.txt", "F9KrZd_-ge0_20241225194812.txt", "xjEFo3a1AnI_20241225194539.txt", "szqPAPKE5tQ_20241225194712.txt", "3_auLYOilb8_20241225194826.txt", "acgz0C-z-gc_20241225194817.txt", "zVCaYyUWWSw_20241225194412.txt", "doupx8SAs5Y_20241225194603.txt", "wAZn9dF3XTo_20241225194423.txt", "2XGREPnlI8U_20241225194659.txt", "UNCwdFxPtE8_20241225194521.txt", "at37Y8rKDlA_20241225194513.txt", "oL3SkPV1_Ik_20241225194837.txt", "nOgypsWKjm4_20241225194440.txt", "rW9QKc-iFoY_20241225194751.txt", "CQlTmOFM4Qs_20241225194550.txt", "tR73Ny4Dt9s_20241225194413.txt", "t1F7EEGPQwo_20241225194649.txt", "ccrbE0QHy94_20241225194608.txt", "SyWC8ZFVxGo_20241225194333.txt", "zlc4VrDx_qk_20241225194800.txt", "8qaBpM73NSk_20241225194409.txt", "sxgCC4H1dl8_20241225194524.txt", "RBK5KLA5Jjg_20241225194446.txt", "slUCmZJDXrk_20241225194627.txt", "h2aWYjSA1Jc_20241225194702.txt", "Ov4yyK15-K8_20241225194230.txt", "juD99_sPWGU_20241225194340.txt", "q1Ss8sTbFBY_20241225194647.txt", "X8Hw8zeCDTA_20241225194518.txt", "UChhXiFPRgg_20241225194443.txt", "pq6WHJzOkno_20241225194415.txt", "2Ds1m5gflCI_20241225194849.txt", "jGZ1mR9uLU0_20241225194808.txt", "VAEzZeaV5zM_20241225194347.txt", "EhlIkzJwPlk_20241225194656.txt", "HiyzzcuaAac_20241225194255.txt", "C3X0bUAiluE_20241225194259.txt", "kG5Qb9sr0YQ_20241225194810.txt", "wRsX_ZkzxvQ_20241225194619.txt", "U2BPitASUh0_20241225194358.txt", "Wcs2PFz5q6g_20241225194327.txt", "CuzL1qxUyHw_20241225194312.txt", "q37ARYnRDGc_20241225194623.txt", "cp9GXl9Qk_s_20241225194735.txt", "XT_6Lvkhxvo_20241225194342.txt", "bUr_9fgfnto_20241225194256.txt", "LTGGyQS1fZE_20241225194305.txt", "mAlt_HKX4as_20241225194420.txt", "SZSRgyl7pyQ_20241225194418.txt", "RI112zW8GDw_20241225194356.txt", "ycOBZZeVeAc_20241225194707.txt", "6YLdlK2hYnw_20241225194328.txt", "p4ZfkezDTXQ_20241225194615.txt", "LVxL_p_kToc_20241225194558.txt", "HXzTbCEqCJc_20241225194710.txt", "yOoVz9E9kfQ_20241225194901.txt", "C5KpIXjpzdY_20241225194400.txt", "__RAXBLt1iM_20241225194430.txt", "8N7mdkrXgbc_20241225194338.txt", "JnlSDaBjCCU_20241225194450.txt", "IOl28gj_RXw_20241225194431.txt", "Nr5xb-QCBGA_20241225194354.txt", "GzvzWO0NU50_20241225194605.txt", "DtmwtjOoSYU_20241225194633.txt", "CrtR12PBKb0_20241225194632.txt", "gMRph_BvHB4_20241225194516.txt", "QpoaNklmRPc_20241225194248.txt", "9tRohh0gErM_20241225194353.txt", "Xu1FMCxoEFc_20241225194346.txt", "15R2pMqU2ok_20241225194406.txt", "eIxVfln02Ss_20241225194335.txt", "0Dtt95_xabw_20241225194252.txt", "3ZGItIAUQmI_20241225194719.txt", "uxZFl4BDOGk_20241225194757.txt", "hvPGfcAgk9Y_20241225194754.txt", "HYVeP4F0GNU_20241225194559.txt", "z5W74QC3v2I_20241225194308.txt", "31wjVhCcI5Y_20241225194426.txt", "BMTt8gSl13s_20241225194836.txt", "aQDOU3hPci0_20241225194501.txt", "tkH2-_jMCSk_20241225194543.txt", "ntfcfJ28eiU_20241225194522.txt", "S8nPJU9xkNw_20241225194748.txt", "fcxjwA4C4Cw_20241225194553.txt", "iMvtHqLmEkI_20241225194855.txt", "099hgtRoUZw_20241225194436.txt", "4RFEkGKKhdE_20241225194907.txt", "eJU6Df_ffAE_20241225194635.txt", "nqNEtdHVUjM_20241225194437.txt", "1SXDXdngX2M_20241225194316.txt", "X4QE6t-MkYE_20241225194642.txt", "79p1X_7rAMo_20241225194630.txt", "6RZbGrq9BxE_20241225194306.txt", "pkJi9Raxikg_20241225194824.txt", "QbMxDZeB8Ks_20241225194247.txt", 
"RgAcOqVRfYA_20241225194657.txt", "ncSoor2Iw8k_20241225194833.txt", "i_DEPeCKxs8_20241225194235.txt", "FE0lTEUa7EY_20241225194753.txt", "gE0_8AjTFaM_20241225194852.txt", "kgr22uMsJ5o_20241225194317.txt", "ufsIA5NARIo_20241225194535.txt", "CyDLbrZK75U_20241225194434.txt", "7TkGDj4LaOU_20241225194244.txt", "XLr2RKoD-oY_20241225194738.txt", "yb5zpo5WDG4_20241225194645.txt", "a9yFKPmPZ90_20241225194556.txt", "TG8VM5-CTfw_20241225194636.txt", "eMqWH3LYiII_20241225194351.txt", "CVh3_8e5u8I_20241225194246.txt", "SuR0DaYoe0Y_20241225194302.txt", "FLxIoNguGRU_20241225194233.txt", "GA89kjVY6Ik_20241225194854.txt", "qJ3uV7coZbA_20241225194453.txt", "EQ3GjpGq5Y8_20241225194405.txt", "yOJvm_ri_hk_20241225194555.txt", "cwakOgHIT0E_20241225194421.txt", "DTCmprPCDqc_20241225194733.txt", "qPKd99Pa2iU_20241225194500.txt", "nm1TxQj9IsQ_20241225194611.txt", "LRM5LutB538_20241225194857.txt", "xTtM2AvCRyA_20241225194643.txt", "62lVH-6xYGY_20241225194250.txt", "Rxmv7rT9leo_20241225194417.txt", "ulHrUVV3Kq4_20241225194452.txt", "bGixnNGvSkg_20241225194231.txt", "1CxJVdeyltw_20241225194614.txt", "wgUjIRtote8_20241225194726.txt", "qPKd99Pa2iU_20241225194503.txt", "S_SrHS8FvMM_20241225194807.txt", "xX6hiEmDmxs_20241225194227.txt", "uXs-zPc63kM_20241225194449.txt", "4AwyVTHEU3s_20241225194904.txt", "xaE9XyMMAHY_20241225194848.txt", "hFL6qRIJZ_Y_20241225194428.txt", "FOi5s3OUogo_20241225194245.txt", "cS7cNaBrkxo_20241225194624.txt", "kpTJqwIfHcM_20241225194654.txt", "yixIc1Ai6jM_20241225194829.txt", "vfRtLI6cJrk_20241225194324.txt", "GLgKkG44MGo_20241225194729.txt", "KPlJcD-o-4Q_20241225194617.txt", "AtChcxeaukQ_20241225194646.txt", "tLS6t3FVOTI_20241225194714.txt", "GqPGXG5TlZw_20241225194541.txt", "UF0nqolsNZc_20241225194727.txt", "7R3-3HR6-u4_20241225194519.txt", "tLRCS48Ens4_20241225194447.txt", "V0Sdgn0_kFM_20241225194740.txt", "G1VUSu6sGoU_20241225194251.txt", "m_OazsImOiI_20241225194322.txt", "Og56hmAspV8_20241225194258.txt", "dFR_wFN23ZY_20241225194640.txt", "q-H_A_dQUxQ_20241225194303.txt", "KVjfFN89qvQ_20241225194314.txt", "zU5EYw06wtw_20241225194349.txt", "Z7MU6zrAXsM_20241225194442.txt", "LYYyQcAJZfk_20241225194508.txt", "E7W4OQfJWdw_20241225194717.txt", "azb3Ih68awQ_20241225194505.txt", "ouCWNRvPk20_20241225194401.txt", "uwWOc_RqTBA_20241225194858.txt", "pZX8ikmWvEU_20241225194510.txt", "n9IxomBusuw_20241225194545.txt", "BwyZIWeBpRw_20241225194534.txt", "XY0rBdaDXD8_20241225194226.txt", "1Wo6SqLNmLk_20241225194845.txt", "ddq8JIMhz7c_20241225194529.txt", "VQLU7gpk_X8_20241225194821.txt", "jC8Pu9HBd48_20241225194321.txt", "rZkMpVLcVsg_20241225194319.txt", "gbQFSMayJxk_20241225194736.txt", "F54qXuTpgfM_20241225194843.txt", "p3JLaF_4Tz8_20241225194537.txt", "FeRgqJVALMQ_20241225194433.txt", "hF32FvBH4gI_20241225194332.txt", "CDUetQMKM6g_20241225194454.txt", "wG3UFHR1o48_20241225194229.txt", "6P8hrzjnetU_20241225194336.txt", "WFcYF_pxLgA_20241225194458.txt", "77CdVSpnUX4_20241225194746.txt", "VOfwbcveP84_20241225194742.txt", "VRvn3Oj5r3E_20241225194839.txt", "Gf-kC30SLtc_20241225194846.txt", "S8jWFcDGz4Y_20241225194805.txt", "x3MgDtZovks_20241225194526.txt", "lIo9FcrljDk_20241225194309.txt", "-e9ErUozQo4_20241225194903.txt", "aXvDEmo6uS4_20241225194629.txt", "3gtvNYa3Nd8_20241225194531.txt", "5tYR7e5Wpyc_20241225194238.txt", "OadokY8fcAA_20241225194601.txt", "O640yAgq5f8_20241225194744.txt", "zbpb1wd-wvs_20241225194827.txt", "gXvuJu1kt48_20241225194638.txt", "zEYE-vcVKy8_20241225194547.txt", "Ky-ZJ9SS-x4_20241225194240.txt", "0RYyQRQFgFk_20241225194532.txt", "4F_RBc1akC8_20241225194724.txt", 
"nDLb8_wgX50_20241225194540.txt", "tcueMCe-0zo_20241225194236.txt", "K-TW2Chpz4k_20241225194330.txt", "XcvhERcZpWw_20241225194731.txt", "Ze2pc6NwsHQ_20241225194704.txt", "_ltcLEM-5HU_20241225194612.txt", "jouFvyRZntk_20241225194507.txt", "uWV9a3zEaL4_20241225194823.txt", "-OBCwiPPfEU_20241225194747.txt", "dzOvi0Aa2EA_20241225194301.txt", "K9lORz2_XSU_20241225194527.txt", "j2sMqSDLd4k_20241225194407.txt", "oNkDA2F7CjM_20241225194651.txt", "50BZQRT1dAg_20241225194403.txt", "q8CHXefn7B4_20241225194411.txt", "Jy4rJcYmtUM_20241225194344.txt", "QmOF0crdyRU_20241225194456.txt", "6ZrlsVx85ek_20241225194758.txt", "CD0bRU1e1ZM_20241225194425.txt", "IAnhFUUCq6c_20241225194804.txt", "Phm-Alz1Zjo_20241225194906.txt", "csubiPlvFWk_20241225194606.txt", "GpgqXCkRO-w_20241225194701.txt", "W5zqC5cYcS0_20241225194241.txt", "T65RDBiB5Hs_20241225194715.txt", "6I5I56uVvLw_20241225194801.txt", "i5611OvTFGM_20241225194548.txt", "wTBSGgbIvsY_20241225194552.txt", "O1YRwWmue4Y_20241225194815.txt", "29n0WG317tM_20241225194511.txt", "xmhsWAqP_0Y_20241225194851.txt", "x4m_PdFbu-s_20241225194722.txt"]
Rag/__init__.py
ADDED
File without changes
Rag/rag_pipeline.py
ADDED
@@ -0,0 +1,183 @@
import chromadb
from langchain.text_splitter import RecursiveCharacterTextSplitter
from sentence_transformers import SentenceTransformer
import google.generativeai as genai
import os
import logging
from concurrent.futures import ProcessPoolExecutor, as_completed
from Llm.llm_endpoints import get_llm_response
from utils.get_link import get_source_link
from Prompts.huberman_prompt import huberman_prompt
from tqdm import tqdm

# Configuration
API_KEY = os.getenv("GOOGLE_API_KEY")
if API_KEY:
    genai.configure(api_key=API_KEY)

chromadb_path = "app/Rag/chromadb.db"
embedding_model = SentenceTransformer('all-MiniLM-L6-v2')

# Logging
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(message)s')


# Helper Functions
def split_text_to_chunks(docs, chunk_size=1000, chunk_overlap=200):
    """Split text into manageable chunks."""
    text_splitter = RecursiveCharacterTextSplitter(chunk_size=chunk_size, chunk_overlap=chunk_overlap)
    chunks = text_splitter.split_text(docs)
    return chunks


def get_new_files(transcripts_folder_path, collection):
    """Find new transcript files that haven't been processed yet."""
    all_files = [f for f in os.listdir(transcripts_folder_path) if f.endswith(".txt")]
    existing_files = [meta["source"] for meta in collection.get()['metadatas']]
    return [f for f in all_files if f not in existing_files]


def process_single_file(file_path):
    """Process a single file and return its chunks."""
    with open(file_path, 'r') as f:
        content = f.read()
    chunks = split_text_to_chunks(content)
    return chunks, os.path.basename(file_path)


def batch_embed_chunks(chunks, batch_size=32):
    """Embed chunks in batches."""
    embeddings = []
    for i in tqdm(range(0, len(chunks), batch_size), desc="Embedding chunks"):
        batch = chunks[i:i + batch_size]
        batch_embeddings = embedding_model.encode(batch, show_progress_bar=True)
        embeddings.extend(batch_embeddings.tolist())
    return embeddings


def process_and_add_new_files(transcripts_folder_path, collection):
    """Process and add new transcript files to the vector database."""
    new_files = get_new_files(transcripts_folder_path, collection)
    if not new_files:
        logging.info("No new files to process")
        return False

    # Use a reasonable number of workers (4 is usually a good default)
    n_workers = min(4, len(new_files))
    logging.info(f"Using {n_workers} workers for processing")

    all_chunks = []
    all_metadata = []
    all_ids = []

    # Process files in parallel
    with ProcessPoolExecutor(max_workers=n_workers) as executor:
        futures = {
            executor.submit(process_single_file, os.path.join(transcripts_folder_path, file)): file
            for file in new_files
        }

        for future in as_completed(futures):
            file = futures[future]
            try:
                chunks, filename = future.result()
                file_metadata = [{"source": filename} for _ in range(len(chunks))]
                file_ids = [f"{filename}_chunk_{i}" for i in range(len(chunks))]

                all_chunks.extend(chunks)
                all_metadata.extend(file_metadata)
                all_ids.extend(file_ids)

                logging.info(f"Processed {filename}")
            except Exception as e:
                logging.error(f"Error processing {file}: {str(e)}")
                continue

    # Process embeddings in batches
    logging.info(f"Generating embeddings for {len(all_chunks)} chunks")
    embeddings = batch_embed_chunks(all_chunks)

    # Add to database in batches
    batch_size = 500
    for i in range(0, len(all_chunks), batch_size):
        end_idx = min(i + batch_size, len(all_chunks))
        collection.upsert(
            documents=all_chunks[i:end_idx],
            embeddings=embeddings[i:end_idx],
            metadatas=all_metadata[i:end_idx],
            ids=all_ids[i:end_idx]
        )
        logging.info(f"Added batch {i // batch_size + 1} to database")

    logging.info(f"Successfully processed {len(new_files)} files")
    return True


def query_database(collection, query_text, n_results=3):
    """Retrieve the most relevant chunks for the query."""
    query_embeddings = embedding_model.encode(query_text).tolist()
    results = collection.query(query_embeddings=query_embeddings, n_results=n_results)
    retrieved_docs = results['documents'][0]
    metadatas = results['metadatas'][0]
    return retrieved_docs, metadatas


def enhance_query_with_history(query_text, summarized_history):
    """Append the (summarized) conversation history to the query text."""
    enhanced_query = f"{query_text}\n\n{summarized_history}"
    return enhanced_query


def update_conversation_history(history, user_query, bot_response):
    """Update and keep track of the conversation history between the user and the bot."""
    history.append({"user": user_query, "bot": bot_response})
    return history


def generate_response(conversation_history, query_text, retrieved_docs, source_links):
    """Generate a response using retrieved documents and the generative AI model."""
    context = " ".join(retrieved_docs)
    history_str = "\n".join([f"User: {turn['user']}\nBot: {turn['bot']}" for turn in conversation_history])
    sources_str = "\n".join(source_links)

    prompt = huberman_prompt.format(
        context=context,
        sources=sources_str,
        history=history_str,
        question=query_text
    )

    response = get_llm_response(prompt)
    full_response = f"{response}\n\nSources:\n{sources_str}"
    return full_response


def main_workflow(transcripts_folder_path, collection):
    """Run the full RAG workflow."""
    new_files_added = process_and_add_new_files(transcripts_folder_path, collection)
    if new_files_added:
        logging.info("New transcripts added to the database.")
    else:
        logging.info("No new files found. Using existing database.")

    conversation_history = []

    while True:
        query_text = input("\nEnter your query (or type 'exit' to end): ").strip()
        if query_text.lower() == "exit":
            print("Ending the conversation. Goodbye!")
            break

        query_text_with_conversation_history = enhance_query_with_history(query_text, conversation_history)
        retrieved_docs, metadatas = query_database(collection, query_text_with_conversation_history)
        print("-" * 50)
        source_link = get_source_link(metadatas)
        print(source_link)
        print("-" * 50)

        if not retrieved_docs:
            print("No relevant documents were found.")
            continue

        response = generate_response(conversation_history, query_text, retrieved_docs, source_link)
        conversation_history = update_conversation_history(conversation_history, query_text, response)
        print("\nGenerated Response:")
        print(response)
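A one-shot query sketch against an already-populated ChromaDB store, using the pipeline's own helpers; it assumes the script runs from the project root and that `GEMINI_API_KEY` is set for the final LLM call. The question text is made up.

```python
# One-shot retrieval + generation sketch (assumes a populated Rag/chromadb.db).
import chromadb
from Rag.rag_pipeline import query_database, generate_response
from utils.get_link import get_source_link

client = chromadb.PersistentClient(path="Rag/chromadb.db")
collection = client.get_or_create_collection(name="yt_transcript_collection")

question = "What does Huberman suggest about morning light exposure?"
docs, metadatas = query_database(collection, question, n_results=3)
links = get_source_link(metadatas)
print(generate_response([], question, docs, links))
```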
poetry.lock
ADDED
The diff for this file is too large to render.
pyproject.toml
ADDED
@@ -0,0 +1,34 @@
[project]
name = "xyzbot"
version = "0.1.0"
description = "A rag application"
authors = [
    {name = "Angel njlghmr@gmail.com"}
]
license = {text = "MIT"}
readme = "README.md"
requires-python = ">=3.11,<3.12"
dependencies = [
    "pyarrow (>=19.0.0,<20.0.0)",
    "pandas (>=2.2.3,<3.0.0)",
    "pendulum (>=3.0.0,<4.0.0)",
    "google-generativeai (>=0.8.4,<0.9.0)",
    "langchain (>=0.3.16,<0.4.0)",
    "langchain-openai (>=0.3.3,<0.4.0)",
    "langchain-chroma (>=0.2.1,<0.3.0)",
    "langchain-community (>=0.3.16,<0.4.0)",
    "chromadb (>=0.4.14)",
    "pypdf (==4.2.0)",
    "flask (==3.0.1)",
    "flask-cors (==3.0.10)",
    "sentence-transformers (==3.3.1)",
    "tqdm (==4.67.1)",
    "torch (==2.5.1)",
    "transformers (==4.46.3)",
    "pydantic (>=2.7.4,<3.0.0)"
]


[build-system]
requires = ["poetry-core>=2.0.0,<3.0.0"]
build-backend = "poetry.core.masonry.api"
requirements.in
ADDED
@@ -0,0 +1,18 @@
pyarrow
pandas[performance, parquet, aws]
pendulum
google.generativeai
langchain
langchain_openai
langchain_chroma
langchain_community
chromadb==0.4.8
pypdf
flask
flask_cors
sentence_transformers
tqdm
torch
transformers
spacy==3.5.0
coreferee==1.4.1
requirements.txt
ADDED
@@ -0,0 +1,20 @@
# Core dependencies
langchain>=0.3.16,<0.4.0
langchain_openai
langchain_chroma
langchain-community>=0.3.16,<0.4.0
chromadb>=0.4.14
flask==3.0.1
flask_cors==3.0.10
google.generativeai
pydantic>=2.7.4,<3.0.0
streamlit
# PDF Processing
pypdf==4.2.0

# ML/AI Dependencies (with CPU-only versions)
sentence_transformers==2.3.1
--extra-index-url https://download.pytorch.org/whl/cpu
torch==2.1.0+cpu

gradio
setup.sh
ADDED
@@ -0,0 +1,10 @@
# Install Python dependencies
pip install -r requirements.txt

# Download spaCy model
python -m spacy download en_core_web_sm

# Install Coreferee for English
python -m coreferee install en

echo "Setup completed successfully!"
ui/__init__.py
ADDED
File without changes
ui/app.py
ADDED
@@ -0,0 +1,147 @@
import gradio as gr
import chromadb
from typing import List, Dict
import sys
from pathlib import Path

project_root = Path(__file__).resolve().parent.parent
sys.path.append(str(project_root))
sys.path.append(str(project_root / "Rag"))
sys.path.append(str(project_root / "Data"))
sys.path.append(str(project_root / "Data" / "transcripts"))
sys.path.append(str(project_root / "Data" / "video_links"))
sys.path.append(str(project_root / "Llm"))
sys.path.append(str(project_root / "Prompts"))
sys.path.append(str(project_root / "utils"))
from Rag.rag_pipeline import (
    query_database,
    generate_response,
    enhance_query_with_history,
    update_conversation_history,
    process_and_add_new_files
)

INTRODUCTION = """
# 🧠 Welcome to HubermanBot!

I am your AI assistant trained on Andrew Huberman's podcast content. My knowledge base includes detailed information about:

- 🎯 Peak Performance & Focus
- 😴 Sleep Science & Optimization
- 🏋️ Physical Fitness & Recovery
- 🧘 Mental Health & Stress Management
- 🧪 Neuroscience & Biology
- 💪 Habit Formation & Behavior Change

For each response, I'll provide:
- Detailed answers based on podcast content
- Direct source links to specific episodes
- Scientific context when available

Ask me anything about these topics, and I'll help you find relevant information from the Huberman Lab Podcast!

Example questions you might ask:
- "What does Dr. Huberman recommend for better sleep?"
- "How can I improve my focus and concentration?"
- "What are the best practices for morning routines?"
"""


def format_youtube_url(filename: str) -> str:
    """Convert filename to YouTube URL"""
    # Extract video ID by removing the timestamp and .txt extension
    video_id = filename.split('_')[0]
    return f"https://www.youtube.com/watch?v={video_id}"


class RAGChatInterface:
    def __init__(self, transcripts_folder_path: str, collection):
        self.transcripts_folder_path = transcripts_folder_path
        self.collection = collection
        self.conversation_history: List[Dict[str, str]] = []

    def process_query(self, message: str, history: List[List[str]]) -> str:
        """Process a single query and return the response"""
        # Convert Gradio history format to our conversation history format
        self.conversation_history = [
            {"user": user_msg, "bot": bot_msg}
            for user_msg, bot_msg in history
        ]

        # Enhance query with conversation history
        query_with_history = enhance_query_with_history(message, self.conversation_history)

        # Get relevant documents
        retrieved_docs, metadatas = query_database(self.collection, query_with_history)

        if not retrieved_docs:
            return "I apologize, but I couldn't find any relevant information about that in my knowledge base. Could you try rephrasing your question or ask about a different topic covered in the Huberman Lab Podcast?"

        # Generate response
        source_links = [meta["source"] for meta in metadatas]
        response = generate_response(
            self.conversation_history,
            message,
            retrieved_docs,
            source_links
        )

        # Remove duplicate sources and convert to YouTube URLs
        unique_sources = list(set(source_links))
        youtube_urls = [format_youtube_url(source) for source in unique_sources]

        # Format response with markdown for better readability
        formatted_response = f"{response}\n\n---\n📚 **Source Episodes:**\n"
        for url in youtube_urls:
            formatted_response += f"- {url}\n"

        return formatted_response


def create_interface(transcripts_folder_path: str, collection) -> gr.Interface:
    """Create and configure the Gradio interface"""
    # Initialize the RAG chat interface
    rag_chat = RAGChatInterface(transcripts_folder_path, collection)

    # Create the Gradio interface with custom styling
    interface = gr.ChatInterface(
        fn=rag_chat.process_query,
        title="🧠 HubermanBot - Your Neuroscience & Wellness AI Assistant",
        description=INTRODUCTION,
        examples=[
            "What are Dr. Huberman's top recommendations for better sleep?",
            "How does sunlight exposure affect our circadian rhythm?",
            "What supplements does Dr. Huberman recommend for focus?",
            "What are the best practices for morning routines according to Dr. Huberman?",
            "How can I optimize my workout recovery based on neuroscience?",
        ],
        theme=gr.themes.Soft(
            primary_hue="indigo",
            secondary_hue="blue",
        )
    )

    return interface


def main():
    # Get absolute path for ChromaDB
    project_root = Path(__file__).parent.parent
    chromadb_path = project_root / "Rag" / "chromadb.db"

    client = chromadb.PersistentClient(path=str(chromadb_path))
    collection = client.get_or_create_collection(name="yt_transcript_collection")

    # Use absolute path for transcripts folder too
    transcripts_folder_path = project_root / "Data" / "transcripts"

    # Process any new files
    process_and_add_new_files(str(transcripts_folder_path), collection)

    # Create and launch the interface
    interface = create_interface(str(transcripts_folder_path), collection)
    interface.launch(share=True, server_port=7860)


if __name__ == "__main__":
    main()
utils/__init__.py
ADDED
File without changes
utils/corefrence.py
ADDED
@@ -0,0 +1,52 @@
import spacy
from spacy.tokens import Doc
import coreferee

# Load spaCy model
nlp = spacy.load('en_core_web_sm')
nlp.add_pipe("coreferee")

# Register the custom extension attribute
Doc.set_extension('resolved_text', default=None, force=True)


def resolve_coreferences(query_text, conversation_history):
    """
    Resolve coreferences in the given text using spaCy and coreferee.

    Args:
        query_text (str): The current query to resolve
        conversation_history (list): List of dictionaries containing previous conversation turns

    Returns:
        str: Text with resolved coreferences
    """
    # Combine conversation history and current query
    combined_text = []
    for turn in conversation_history:
        combined_text.append(f"User: {turn['user']}")
        combined_text.append(f"Bot: {turn['bot']}")
    combined_text.append(f"User: {query_text}")
    text = "\n".join(combined_text)

    # Process the text
    doc = nlp(text)

    # Get all tokens and their potential antecedents
    resolved_tokens = list(doc)

    # Resolve coreferences
    for chain in doc._.coref_chains:
        for mention in chain:
            if mention.root_index != chain.most_specific.root_index:
                # Replace mention with its antecedent
                resolved_tokens[mention.root_index] = doc[chain.most_specific.root_index]

    # Reconstruct the text with resolved references
    resolved_text = "".join([token.text_with_ws if isinstance(token, spacy.tokens.Token)
                             else token.text + " " for token in resolved_tokens])

    # Extract the resolved query (last line)
    resolved_query = resolved_text.split('\n')[-1].replace("User: ", "").strip()

    return resolved_query
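A usage sketch for `resolve_coreferences`; it requires `en_core_web_sm` and the Coreferee English model (installed by `setup.sh`), and the conversation below is made up for illustration.

```python
# Usage sketch: "it" in the follow-up question should be rewritten to its
# antecedent ("dopamine") taken from the conversation history.
from utils.corefrence import resolve_coreferences

history = [{"user": "Tell me about dopamine.",
            "bot": "Dopamine is a neuromodulator involved in motivation."}]
print(resolve_coreferences("How does it affect motivation?", history))
```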
utils/get_link.py
ADDED
@@ -0,0 +1,11 @@
def get_source_link(metadatas):
    base_link = 'https://www.youtube.com/watch?v='
    yt_link = []
    for metadata in metadatas:
        source = metadata['source']
        # Filenames look like "<video_id>_<timestamp>.txt"; keep only the video ID
        video_id = source.split('.txt')[0].split('_')[0]
        yt_link.append(base_link + video_id)
    return yt_link
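A usage sketch for `get_source_link`; the metadata entry mirrors how the pipeline stores transcript filenames (`<video_id>_<timestamp>.txt`), with the ID taken from the processed-files list above.

```python
# Usage sketch with a metadata entry in the pipeline's stored format.
from utils.get_link import get_source_link

metadatas = [{"source": "wAZn9dF3XTo_20241225194423.txt"}]
print(get_source_link(metadatas))
# -> ['https://www.youtube.com/watch?v=wAZn9dF3XTo']
```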
utils/summarization.py
ADDED
@@ -0,0 +1,14 @@
from Llm.llm_endpoints import get_llm_response


def summarize_conversation(conversation_history):
    try:
        summary_prompt = "Summarize the following conversation:\n" + "\n".join(
            [f"User: {turn['user']}\nBot: {turn['bot']}" for turn in conversation_history])
        summary = get_llm_response(summary_prompt)
        print("*************************************************")
        print(summary)
        print("*************************************************")
        return summary
    except Exception:
        return ""
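A usage sketch for `summarize_conversation`; it calls the Gemini endpoint, so it returns an empty string when `GEMINI_API_KEY` is missing or the request fails. The history below is made up.

```python
# Usage sketch with a hypothetical one-turn history.
from utils.summarization import summarize_conversation

history = [
    {"user": "What did he say about caffeine timing?",
     "bot": "Delaying caffeine 90-120 minutes after waking was discussed."},
]
print(summarize_conversation(history))
```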