Spaces:
Sleeping
Sleeping
Commit
·
029c5e8
1
Parent(s):
0642efd
Change to prompt / Dependency issue resolved / Dockerize
Browse files- .dockerignore +36 -0
- Dockerfile +23 -0
- Example/rag_example.py +6 -1
- Prompts/huberman_prompt.py +20 -0
- Rag/rag_pipeline.py +3 -3
- poetry.lock +0 -0
- pyproject.toml +34 -0
- requirements.in +18 -0
- requirements.txt +11 -13
- setup.sh +0 -0
- utils/corefrence.py +47 -9
.dockerignore
ADDED
@@ -0,0 +1,36 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
# Ignore virtual environments and cache
|
2 |
+
__pycache__/
|
3 |
+
*.pyc
|
4 |
+
*.pyo
|
5 |
+
*.pyd
|
6 |
+
*.pyc
|
7 |
+
.venv/
|
8 |
+
venv/
|
9 |
+
__pycache__/
|
10 |
+
*.sqlite3-journal
|
11 |
+
|
12 |
+
# Ignore IDE files
|
13 |
+
.idea/
|
14 |
+
.vscode/
|
15 |
+
|
16 |
+
# Ignore development files
|
17 |
+
*.swp
|
18 |
+
*.swo
|
19 |
+
*.swn
|
20 |
+
*.swo
|
21 |
+
*.swn
|
22 |
+
*.swn
|
23 |
+
*.swo
|
24 |
+
*.swn
|
25 |
+
*.swo
|
26 |
+
*.swn
|
27 |
+
|
28 |
+
# Ignore data files
|
29 |
+
chromadb.db
|
30 |
+
chroma.sqlite3
|
31 |
+
Rag/chromadb.db
|
32 |
+
# Ignore setup scripts
|
33 |
+
setup.sh
|
34 |
+
|
35 |
+
# Ignore environment files
|
36 |
+
.env
|
Dockerfile
ADDED
@@ -0,0 +1,23 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
# Use the official Python 3.11.11 image
|
2 |
+
FROM python:3.11.11-slim
|
3 |
+
|
4 |
+
# Set the working directory in the container
|
5 |
+
WORKDIR /app
|
6 |
+
|
7 |
+
# Copy the requirements file into the container
|
8 |
+
COPY requirements.txt .
|
9 |
+
|
10 |
+
# Install Python dependencies
|
11 |
+
RUN pip install --no-cache-dir -r requirements.txt
|
12 |
+
|
13 |
+
## Download spaCy model
|
14 |
+
#RUN python -m spacy download en_core_web_sm
|
15 |
+
#
|
16 |
+
## Install Coreferee for English
|
17 |
+
#RUN python -m coreferee install en
|
18 |
+
|
19 |
+
# Copy the rest of the application code
|
20 |
+
COPY . .
|
21 |
+
|
22 |
+
# Set the main entry point
|
23 |
+
CMD ["python", "-m", "Example.rag_example"]
|
Example/rag_example.py
CHANGED
@@ -1,10 +1,15 @@
|
|
|
|
1 |
import chromadb
|
2 |
-
transcripts_folder_path = '/home/nightwing/Codes/Xyzbot/Data/transcripts'
|
|
|
3 |
chromadb_path = "/home/nightwing/Codes/Xyzbot/Rag/chromadb.db"
|
4 |
client = chromadb.PersistentClient(path=chromadb_path)
|
5 |
collection = client.get_or_create_collection(name="yt_transcript_collection")
|
|
|
|
|
6 |
from Rag.rag_pipeline import main_workflow
|
7 |
|
8 |
# Run the application
|
9 |
if __name__ == "__main__":
|
|
|
10 |
main_workflow(transcripts_folder_path, collection)
|
|
|
1 |
+
import sys

import chromadb

# NOTE(review): both paths below are absolute paths on the author's machine.
# They are kept byte-for-byte so existing behaviour is unchanged, but they will
# not exist inside the Docker image (WORKDIR /app) -- consider deriving them
# from the project root or from environment variables.
#
# The assignment of ``transcripts_folder_path`` had been commented out and
# replaced by a stray ``tr`` token, which raised ``NameError`` at import time
# and left the name undefined for the ``main_workflow`` call below.
transcripts_folder_path = '/home/nightwing/Codes/Xyzbot/Data/transcripts'
chromadb_path = "/home/nightwing/Codes/Xyzbot/Rag/chromadb.db"

# Open (or create) the on-disk Chroma store and the transcript collection.
client = chromadb.PersistentClient(path=chromadb_path)
collection = client.get_or_create_collection(name="yt_transcript_collection")

# Debug aid: show the import search path before importing the project package.
print("Python path:", sys.path)
from Rag.rag_pipeline import main_workflow

# Run the application
if __name__ == "__main__":
    main_workflow(transcripts_folder_path, collection)
|
Prompts/huberman_prompt.py
ADDED
@@ -0,0 +1,20 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
huberman_prompt = """
|
2 |
+
You are Dr. Andrew Huberman, an expert neuroscientist and educator known for your clear, engaging, and scientifically accurate explanations. When answering, please consider the following:
|
3 |
+
1. Provide a clear and concise summary of the scientific concepts involved.
|
4 |
+
2. Highlight any relevant research or studies.
|
5 |
+
3. Offer actionable insights or practical advice.
|
6 |
+
|
7 |
+
Context:
|
8 |
+
{context}
|
9 |
+
|
10 |
+
Sources:
|
11 |
+
{sources}
|
12 |
+
|
13 |
+
Conversation History:
|
14 |
+
{history}
|
15 |
+
|
16 |
+
Question:
|
17 |
+
{question}
|
18 |
+
|
19 |
+
Please respond in a manner that is informative, research-backed, and reflective of your unique style.
|
20 |
+
"""
|
Rag/rag_pipeline.py
CHANGED
@@ -6,7 +6,7 @@ import os
|
|
6 |
import logging
|
7 |
from Llm.llm_endpoints import get_llm_response
|
8 |
from utils.get_link import get_source_link
|
9 |
-
from utils.corefrence import resolve_coreferences
|
10 |
from Prompts.huberman_prompt import huberman_prompt
|
11 |
# Configuration
|
12 |
API_KEY = os.getenv("GOOGLE_API_KEY")
|
@@ -127,8 +127,8 @@ def main_workflow(transcripts_folder_path, collection):
|
|
127 |
if query_text.lower() == "exit":
|
128 |
print("Ending the conversation. Goodbye")
|
129 |
break
|
130 |
-
resolved_query = resolve_coreferences(query_text, conversation_history)
|
131 |
-
query_text_with_conversation_history = enhance_query_with_history(
|
132 |
# resolved_query = resolve_coreference_in_query(query_text_with_conversation_history, conversation_history)
|
133 |
retrived_docs, metadatas = query_database(collection, query_text_with_conversation_history)
|
134 |
print("-" * 50)
|
|
|
6 |
import logging
|
7 |
from Llm.llm_endpoints import get_llm_response
|
8 |
from utils.get_link import get_source_link
|
9 |
+
# from utils.corefrence import resolve_coreferences
|
10 |
from Prompts.huberman_prompt import huberman_prompt
|
11 |
# Configuration
|
12 |
API_KEY = os.getenv("GOOGLE_API_KEY")
|
|
|
127 |
if query_text.lower() == "exit":
|
128 |
print("Ending the conversation. Goodbye")
|
129 |
break
|
130 |
+
# resolved_query = resolve_coreferences(query_text, conversation_history)
|
131 |
+
query_text_with_conversation_history = enhance_query_with_history(query_text, conversation_history)
|
132 |
# resolved_query = resolve_coreference_in_query(query_text_with_conversation_history, conversation_history)
|
133 |
retrived_docs, metadatas = query_database(collection, query_text_with_conversation_history)
|
134 |
print("-" * 50)
|
poetry.lock
ADDED
The diff for this file is too large to render.
See raw diff
|
|
pyproject.toml
ADDED
@@ -0,0 +1,34 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
[project]
|
2 |
+
name = "xyzbot"
|
3 |
+
version = "0.1.0"
|
4 |
+
description = "A rag application"
|
5 |
+
authors = [
|
6 |
+
{name = "Angel njlghmr@gmail.com"}
|
7 |
+
]
|
8 |
+
license = {text = "MIT"}
|
9 |
+
readme = "README.md"
|
10 |
+
requires-python =">=3.11,<3.12"
|
11 |
+
dependencies = [
|
12 |
+
"pyarrow (>=19.0.0,<20.0.0)",
|
13 |
+
"pandas (>=2.2.3,<3.0.0)",
|
14 |
+
"pendulum (>=3.0.0,<4.0.0)",
|
15 |
+
"google-generativeai (>=0.8.4,<0.9.0)",
|
16 |
+
"langchain (>=0.3.16,<0.4.0)",
|
17 |
+
"langchain-openai (>=0.3.3,<0.4.0)",
|
18 |
+
"langchain-chroma (>=0.2.1,<0.3.0)",
|
19 |
+
"langchain-community (>=0.3.16,<0.4.0)",
|
20 |
+
"chromadb (>=0.4.14)",
|
21 |
+
"pypdf (==4.2.0)",
|
22 |
+
"flask (==3.0.1)",
|
23 |
+
"flask-cors (==3.0.10)",
|
24 |
+
"sentence-transformers (==3.3.1)",
|
25 |
+
"tqdm (==4.67.1)",
|
26 |
+
"torch (==2.5.1)",
|
27 |
+
"transformers (==4.46.3)",
|
28 |
+
"pydantic (>=2.7.4,<3.0.0)"
|
29 |
+
]
|
30 |
+
|
31 |
+
|
32 |
+
[build-system]
|
33 |
+
requires = ["poetry-core>=2.0.0,<3.0.0"]
|
34 |
+
build-backend = "poetry.core.masonry.api"
|
requirements.in
ADDED
@@ -0,0 +1,18 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
pyarrow
|
2 |
+
pandas[performance, parquet, aws]
|
3 |
+
pendulum
|
4 |
+
google.generativeai
|
5 |
+
langchain
|
6 |
+
langchain_openai
|
7 |
+
langchain_chroma
|
8 |
+
langchain_community
|
9 |
+
chromadb==0.4.8
|
10 |
+
pypdf
|
11 |
+
flask
|
12 |
+
flask_cors
|
13 |
+
sentence_transformers
|
14 |
+
tqdm
|
15 |
+
torch
|
16 |
+
transformers
|
17 |
+
spacy==3.5.0
|
18 |
+
coreferee==1.4.1
|
requirements.txt
CHANGED
@@ -1,18 +1,16 @@
|
|
1 |
pyarrow
|
2 |
-
pandas
|
3 |
pendulum
|
4 |
google.generativeai
|
5 |
-
langchain
|
6 |
langchain_openai
|
7 |
langchain_chroma
|
8 |
-
|
9 |
-
chromadb
|
10 |
-
pypdf
|
11 |
-
flask
|
12 |
-
flask_cors
|
13 |
-
sentence_transformers
|
14 |
-
tqdm
|
15 |
-
torch
|
16 |
-
|
17 |
-
spacy==3.5.0
|
18 |
-
coreferee==1.4.1
|
|
|
1 |
pyarrow
|
2 |
+
pandas
|
3 |
pendulum
|
4 |
google.generativeai
|
5 |
+
langchain>=0.3.16,<0.4.0
|
6 |
langchain_openai
|
7 |
langchain_chroma
|
8 |
+
langchain-community>=0.3.16,<0.4.0
|
9 |
+
chromadb>=0.4.14
|
10 |
+
pypdf==4.2.0
|
11 |
+
flask==3.0.1
|
12 |
+
flask_cors==3.0.10
|
13 |
+
sentence_transformers==3.3.1
|
14 |
+
tqdm==4.67.1
|
15 |
+
torch==2.5.1
|
16 |
+
pydantic>=2.7.4,<3.0.0
|
|
|
|
setup.sh
CHANGED
File without changes
|
utils/corefrence.py
CHANGED
@@ -1,14 +1,52 @@
|
|
1 |
import spacy
|
|
|
|
|
|
|
|
|
2 |
nlp = spacy.load('en_core_web_sm')
|
3 |
nlp.add_pipe("coreferee")
|
4 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
5 |
combined_text = []
|
6 |
for turn in conversation_history:
|
7 |
-
combined_text.append(f"User:{turn['user']}")
|
8 |
-
combined_text.append(f"Bot:{turn['Bot']}")
|
9 |
-
combined_text.append(f"User:{query_text}")
|
10 |
-
|
11 |
-
|
12 |
-
|
13 |
-
|
14 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
import spacy
|
2 |
+
from spacy.tokens import Doc
|
3 |
+
import coreferee
|
4 |
+
|
5 |
+
# Load spaCy model
|
6 |
nlp = spacy.load('en_core_web_sm')
|
7 |
nlp.add_pipe("coreferee")
|
8 |
+
|
9 |
+
# Register the custom extension attribute
|
10 |
+
Doc.set_extension('resolved_text', default=None, force=True)
|
11 |
+
|
12 |
+
|
13 |
+
def resolve_coreferences(query_text, conversation_history):
    """
    Resolve coreferences in the current query using spaCy and coreferee.

    The conversation history and the query are joined into one transcript so
    that pronouns in the query can be linked to entities mentioned in earlier
    turns. Only the query portion of the resolved transcript is returned.

    Args:
        query_text (str): The current query to resolve.
        conversation_history (list): Previous turns, each a dict read via the
            keys ``'user'`` and ``'Bot'``.

    Returns:
        str: The query with each coreferring mention's root token replaced by
        the root token text of the chain's most specific mention, stripped of
        surrounding whitespace.

    Raises:
        KeyError: If a history turn lacks the ``'user'`` or ``'Bot'`` key.
    """
    # Combine conversation history and current query into one transcript.
    transcript_lines = []
    for turn in conversation_history:
        transcript_lines.append(f"User: {turn['user']}")
        transcript_lines.append(f"Bot: {turn['Bot']}")
    transcript_lines.append(f"User: {query_text}")
    text = "\n".join(transcript_lines)

    # The query is the tail of the transcript; remember where it starts so it
    # can be cut out by character offset. This keeps multi-line queries whole
    # and does not touch a literal "User: " inside the query itself.
    query_start = len(text) - len(query_text)

    # Process the text
    doc = nlp(text)

    # Map token index -> replacement text for every non-antecedent mention.
    replacements = {}
    for chain in doc._.coref_chains:
        # coreferee exposes the best antecedent as an index into the chain's
        # mentions (``most_specific_mention_index``), not as a mention object.
        antecedent = chain.mentions[chain.most_specific_mention_index]
        antecedent_text = doc[antecedent.root_index].text
        for mention in chain:
            if mention.root_index != antecedent.root_index:
                replacements[mention.root_index] = antecedent_text

    # Rebuild only the query. Each token keeps its OWN trailing whitespace so
    # a substituted antecedent cannot fuse with the following word.
    resolved_query = "".join(
        replacements.get(token.i, token.text) + token.whitespace_
        for token in doc
        if token.idx >= query_start
    )

    return resolved_query.strip()
|