Nightwing11 committed on
Commit
66f97de
·
1 Parent(s): 880d7b6

resolve rag/db

Browse files
Files changed (3) hide show
  1. .gitignore +0 -1
  2. Llm/llm_endpoints.py +14 -0
  3. Rag/chunking.py +54 -6
.gitignore CHANGED
@@ -250,5 +250,4 @@ flowcess/commons/settings.py
250
  Rag/db
251
  *.db
252
  Rag/chromadb.db/chroma.sqlite3
253
- Rag/chromadb.db/chroma.sqlite3
254
  Rag/db/*
 
250
  Rag/db
251
  *.db
252
  Rag/chromadb.db/chroma.sqlite3
 
253
  Rag/db/*
Llm/llm_endpoints.py CHANGED
@@ -0,0 +1,14 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
import os

import google.generativeai as genai
from dotenv import load_dotenv

# Load the API key from .env and configure the Gemini client once, at import
# time, so every caller of get_llm_response shares the same configured model.
load_dotenv()
genai.configure(api_key=os.getenv("GEMINI_API_KEY"))
gemini_model = genai.GenerativeModel("models/gemini-1.5-flash")


def get_llm_response(prompt: str) -> str:
    """Send *prompt* to the configured Gemini model and return its text reply."""
    return gemini_model.generate_content(prompt).text
Rag/chunking.py CHANGED
@@ -1,10 +1,58 @@
1
  import chromadb
2
  from langchain.text_splitter import RecursiveCharacterTextSplitter
3
-
4
- from sentence_transformers import SentenceTransformer
5
- import google.generativeai as genai
6
  import os
7
- import json
 
 
 
 
 
 
 
 
 
 
 
8
  import logging
9
- from dotenv import load_dotenv
10
- from LLM.llm_endpoints import get_llm_response
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
import logging
import os
import sys

# Make the project root importable BEFORE pulling in project-local modules;
# the original appended to sys.path after the Data import, which defeats it.
PROJECT_ROOT = os.path.abspath(os.path.dirname(os.path.abspath(__file__)))
sys.path.append(PROJECT_ROOT)

import chromadb
import google.generativeai as genai
from langchain.chains import ConversationalRetrievalChain
from langchain.schema import Document
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_community.document_loaders import TextLoader

from Data.yt_transcript import all_video_transcript_pipeline

logging.basicConfig(level=logging.INFO)

# Configure Gemini only when a key is present so the module still imports
# in environments without credentials.
API_KEY = os.getenv("GOOGLE_API_KEY")
if API_KEY:
    genai.configure(api_key=API_KEY)

# NOTE(review): prepare_documents() below iterates full_transcripts.items(),
# so the pipeline appears to return a dict — but TextLoader expects a file
# path. `loader` looks unused/broken; confirm before relying on it.
full_transcripts = all_video_transcript_pipeline()
loader = TextLoader(full_transcripts)
24
def prepare_documents(full_transcript):
    """Convert the transcript mapping into langchain ``Document`` objects.

    Only entries whose value is a dict containing a ``"text"`` key are kept.
    The text may be a string or a list of strings (joined with spaces); the
    mapping key is recorded as the document's ``source`` metadata.
    """
    documents = []
    for source, entry in full_transcript.items():
        if not (isinstance(entry, dict) and "text" in entry):
            continue  # skip malformed entries silently, as before
        text = entry["text"]
        if isinstance(text, list):
            text = " ".join(text)
        documents.append(Document(page_content=text, metadata={"source": source}))
    return documents
31
+
32
+
33
def split_text_to_chunks(transcripts=None):
    """Split the transcript documents into overlapping text chunks.

    Parameters
    ----------
    transcripts : dict | None
        Mapping of source id -> transcript entry (see ``prepare_documents``).
        Defaults to the module-level ``full_transcripts``, preserving the
        original zero-argument call.

    Returns
    -------
    list | None
        The chunk documents, or ``None`` when splitting failed (errors are
        logged and the traceback is written to ``error_log.txt``).
    """
    try:
        docs = prepare_documents(full_transcripts if transcripts is None else transcripts)
        # Lazy %-style args instead of an f-string, per logging best practice.
        logging.info("%d documents prepared", len(docs))
        text_splitter = RecursiveCharacterTextSplitter(
            chunk_size=1000,
            chunk_overlap=200,
            separators=['\n\n', '.', '?', '!'])
        return text_splitter.split_documents(docs)
    except Exception:
        # logging.exception records the full traceback (the original
        # logging.error(f"...") dropped it); callers test for None.
        logging.exception("Error while splitting text")
        import traceback
        with open("error_log.txt", "w") as f:
            traceback.print_exc(file=f)
        return None
50
+
51
+
52
# Build the chunks at import time; downstream modules may read all_splits.
all_splits = split_text_to_chunks()
if not all_splits:
    # split_text_to_chunks returns None (or nothing usable) on failure.
    print("Splitting failed. Check logs for details.")
else:
    print(f"Total chunks created: {len(all_splits)}")
    print(all_splits[0].metadata)
    print(all_splits[1])