Nightwing11 commited on
Commit
029c5e8
·
1 Parent(s): 0642efd

Changed the prompt / resolved dependency issue / dockerized the app

Browse files
.dockerignore ADDED
@@ -0,0 +1,36 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
# Ignore virtual environments and cache
__pycache__/
*.pyc
*.pyo
*.pyd
.venv/
venv/
*.sqlite3-journal

# Ignore IDE files
.idea/
.vscode/

# Ignore editor swap files
*.swp
*.swo
*.swn

# Ignore data files
chromadb.db
chroma.sqlite3
Rag/chromadb.db

# Ignore setup scripts
setup.sh

# Ignore environment files
.env
Dockerfile ADDED
@@ -0,0 +1,23 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
# Use the official Python 3.11.11 image
FROM python:3.11.11-slim

# Set the working directory in the container
WORKDIR /app

# Copy only the requirements file first so the dependency layer is cached
# and not rebuilt on every source change
COPY requirements.txt .

# Install Python dependencies
RUN pip install --no-cache-dir -r requirements.txt

# Copy the rest of the application code
COPY . .

# Set the main entry point
CMD ["python", "-m", "Example.rag_example"]
Example/rag_example.py CHANGED
@@ -1,10 +1,15 @@
 
1
  import chromadb
2
- transcripts_folder_path = '/home/nightwing/Codes/Xyzbot/Data/transcripts'
 
3
  chromadb_path = "/home/nightwing/Codes/Xyzbot/Rag/chromadb.db"
4
  client = chromadb.PersistentClient(path=chromadb_path)
5
  collection = client.get_or_create_collection(name="yt_transcript_collection")
 
 
6
  from Rag.rag_pipeline import main_workflow
7
 
8
  # Run the application
9
  if __name__ == "__main__":
 
10
  main_workflow(transcripts_folder_path, collection)
 
import sys
import chromadb

# Folder containing the YouTube transcript files to index.
# NOTE(review): absolute local path — inside the Docker image this should
# point at a path that exists in the container (e.g. under /app); confirm.
transcripts_folder_path = '/home/nightwing/Codes/Xyzbot/Data/transcripts'

# Persistent Chroma store holding the transcript embeddings.
chromadb_path = "/home/nightwing/Codes/Xyzbot/Rag/chromadb.db"
client = chromadb.PersistentClient(path=chromadb_path)
collection = client.get_or_create_collection(name="yt_transcript_collection")

print("Python path:", sys.path)
from Rag.rag_pipeline import main_workflow

# Run the application
if __name__ == "__main__":
    main_workflow(transcripts_folder_path, collection)
Prompts/huberman_prompt.py ADDED
@@ -0,0 +1,20 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
# Prompt template for the RAG pipeline. The placeholders {context}, {sources},
# {history} and {question} are filled in by str.format() at query time.
huberman_prompt = """
You are Dr. Andrew Huberman, an expert neuroscientist and educator known for your clear, engaging, and scientifically accurate explanations. When answering, please consider the following:
1. Provide a clear and concise summary of the scientific concepts involved.
2. Highlight any relevant research or studies.
3. Offer actionable insights or practical advice.

Context:
{context}

Sources:
{sources}

Conversation History:
{history}

Question:
{question}

Please respond in a manner that is informative, research-backed, and reflective of your unique style.
"""
Rag/rag_pipeline.py CHANGED
@@ -6,7 +6,7 @@ import os
6
  import logging
7
  from Llm.llm_endpoints import get_llm_response
8
  from utils.get_link import get_source_link
9
- from utils.corefrence import resolve_coreferences
10
  from Prompts.huberman_prompt import huberman_prompt
11
  # Configuration
12
  API_KEY = os.getenv("GOOGLE_API_KEY")
@@ -127,8 +127,8 @@ def main_workflow(transcripts_folder_path, collection):
127
  if query_text.lower() == "exit":
128
  print("Ending the conversation. Goodbye")
129
  break
130
- resolved_query = resolve_coreferences(query_text, conversation_history)
131
- query_text_with_conversation_history = enhance_query_with_history(resolved_query, conversation_history)
132
  # resolved_query = resolve_coreference_in_query(query_text_with_conversation_history, conversation_history)
133
  retrived_docs, metadatas = query_database(collection, query_text_with_conversation_history)
134
  print("-" * 50)
 
6
  import logging
7
  from Llm.llm_endpoints import get_llm_response
8
  from utils.get_link import get_source_link
9
+ # from utils.corefrence import resolve_coreferences
10
  from Prompts.huberman_prompt import huberman_prompt
11
  # Configuration
12
  API_KEY = os.getenv("GOOGLE_API_KEY")
 
127
  if query_text.lower() == "exit":
128
  print("Ending the conversation. Goodbye")
129
  break
130
+ # resolved_query = resolve_coreferences(query_text, conversation_history)
131
+ query_text_with_conversation_history = enhance_query_with_history(query_text, conversation_history)
132
  # resolved_query = resolve_coreference_in_query(query_text_with_conversation_history, conversation_history)
133
  retrived_docs, metadatas = query_database(collection, query_text_with_conversation_history)
134
  print("-" * 50)
poetry.lock ADDED
The diff for this file is too large to render. See raw diff
 
pyproject.toml ADDED
@@ -0,0 +1,34 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
[project]
name = "xyzbot"
version = "0.1.0"
description = "A rag application"
# PEP 621 requires name and email as separate keys in the authors table.
authors = [
    {name = "Angel", email = "njlghmr@gmail.com"}
]
license = {text = "MIT"}
readme = "README.md"
requires-python = ">=3.11,<3.12"
dependencies = [
    "pyarrow (>=19.0.0,<20.0.0)",
    "pandas (>=2.2.3,<3.0.0)",
    "pendulum (>=3.0.0,<4.0.0)",
    "google-generativeai (>=0.8.4,<0.9.0)",
    "langchain (>=0.3.16,<0.4.0)",
    "langchain-openai (>=0.3.3,<0.4.0)",
    "langchain-chroma (>=0.2.1,<0.3.0)",
    "langchain-community (>=0.3.16,<0.4.0)",
    "chromadb (>=0.4.14)",
    "pypdf (==4.2.0)",
    "flask (==3.0.1)",
    "flask-cors (==3.0.10)",
    "sentence-transformers (==3.3.1)",
    "tqdm (==4.67.1)",
    "torch (==2.5.1)",
    "transformers (==4.46.3)",
    "pydantic (>=2.7.4,<3.0.0)"
]


[build-system]
requires = ["poetry-core>=2.0.0,<3.0.0"]
build-backend = "poetry.core.masonry.api"
requirements.in ADDED
@@ -0,0 +1,18 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
pyarrow
pandas[performance, parquet, aws]
pendulum
google.generativeai
langchain
langchain_openai
langchain_chroma
langchain_community
# Aligned with requirements.txt and pyproject.toml (both require >=0.4.14);
# the previous ==0.4.8 pin conflicted with them.
chromadb>=0.4.14
pypdf
flask
flask_cors
sentence_transformers
tqdm
torch
transformers
# NOTE(review): coreference resolution is disabled in this commit
# (utils/corefrence import commented out); these two pins look stale — confirm
# whether they can be dropped.
spacy==3.5.0
coreferee==1.4.1
requirements.txt CHANGED
@@ -1,18 +1,16 @@
1
  pyarrow
2
- pandas[performance, parquet, aws]
3
  pendulum
4
  google.generativeai
5
- langchain
6
  langchain_openai
7
  langchain_chroma
8
- langchain_community
9
- chromadb==0.4.8
10
- pypdf
11
- flask
12
- flask_cors
13
- sentence_transformers
14
- tqdm
15
- torch
16
- transformers
17
- spacy==3.5.0
18
- coreferee==1.4.1
 
pyarrow
pandas
pendulum
google.generativeai
langchain>=0.3.16,<0.4.0
langchain_openai
langchain_chroma
langchain-community>=0.3.16,<0.4.0
chromadb>=0.4.14
pypdf==4.2.0
flask==3.0.1
flask_cors==3.0.10
sentence_transformers==3.3.1
tqdm==4.67.1
torch==2.5.1
# Restored: pyproject.toml in this same commit pins transformers==4.46.3,
# but it had been dropped from this file.
transformers==4.46.3
pydantic>=2.7.4,<3.0.0
 
 
setup.sh CHANGED
File without changes
utils/corefrence.py CHANGED
@@ -1,14 +1,52 @@
1
  import spacy
 
 
 
 
2
  nlp = spacy.load('en_core_web_sm')
3
  nlp.add_pipe("coreferee")
4
- def resolve_corefrence(query_text, conversation_history):
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
5
  combined_text = []
6
  for turn in conversation_history:
7
- combined_text.append(f"User:{turn['user']}")
8
- combined_text.append(f"Bot:{turn['Bot']}")
9
- combined_text.append(f"User:{query_text}")
10
- combined_text = "\n".join(combined_text)
11
- doc = nlp(combined_text)
12
- resolved_text = doc._.corefrence_resolved
13
- resolved_query = resolved_text.split('\n')[-1].replace("User: ", "")
14
- return resolved_query.strip()
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
import spacy
from spacy.tokens import Doc
import coreferee  # imported for its side effect: registers the "coreferee" pipe factory

# Load spaCy model and attach the coreference-resolution component.
nlp = spacy.load('en_core_web_sm')
nlp.add_pipe("coreferee")

# Register the custom extension attribute.
# NOTE(review): 'resolved_text' is registered but never read or written in this
# module — confirm whether another module relies on it, otherwise remove.
Doc.set_extension('resolved_text', default=None, force=True)


def resolve_coreferences(query_text, conversation_history):
    """
    Resolve coreferences in the given text using spaCy and coreferee.

    Args:
        query_text (str): The current query to resolve
        conversation_history (list): List of dicts for previous conversation
            turns; each turn is expected to provide 'user' and 'Bot' keys
            (a missing key raises KeyError).

    Returns:
        str: The current query with mentions replaced by their antecedents.
    """
    # Combine conversation history and current query into one transcript so
    # the resolver can link pronouns in the query to earlier mentions.
    combined_text = []
    for turn in conversation_history:
        combined_text.append(f"User: {turn['user']}")
        combined_text.append(f"Bot: {turn['Bot']}")
    combined_text.append(f"User: {query_text}")
    text = "\n".join(combined_text)

    # Process the text
    doc = nlp(text)

    # Start from the full token sequence; mention tokens get swapped below.
    resolved_tokens = list(doc)

    # Resolve coreferences: replace each mention's root token with the root
    # token of the chain's most specific mention.
    # NOTE(review): assumes the installed coreferee version exposes
    # doc._.coref_chains, mention.root_index and chain.most_specific — confirm.
    for chain in doc._.coref_chains:
        for mention in chain:
            if mention.root_index != chain.most_specific.root_index:
                # Replace mention with its antecedent
                resolved_tokens[mention.root_index] = doc[chain.most_specific.root_index]

    # Reconstruct the text with resolved references.
    # NOTE(review): both originals and substituted antecedents are spaCy Tokens,
    # so the else-branch appears unreachable; an antecedent's text_with_ws also
    # carries the antecedent's whitespace, not the mention's.
    resolved_text = "".join([token.text_with_ws if isinstance(token, spacy.tokens.Token)
                             else token.text + " " for token in resolved_tokens])

    # Extract the resolved query (last line)
    resolved_query = resolved_text.split('\n')[-1].replace("User: ", "").strip()

    return resolved_query