#!/usr/bin/env python3
"""
RAG (Retrieval Augmented Generation) System
-------------------------------------------
This module implements a RAG system that processes PDF documents,
uses ChromaDB as a vector database, sentence-transformers for embeddings,
and Google's Gemini as the main LLM. The system follows a conversational pattern.
"""
import os
import logging
from typing import List, Dict, Any, Optional
# Document processing
from langchain_community.document_loaders import PyPDFLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
# Embeddings
from sentence_transformers import SentenceTransformer
# Vector database
import chromadb
from chromadb.utils import embedding_functions
# For Gemini LLM integration
from gemini_wrapper import GoogleGeminiWrapper
from gtts import gTTS
# Configure logging
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
logger = logging.getLogger(__name__)
class RAGSystem:
"""
A Retrieval Augmented Generation system that processes PDF documents,
stores their embeddings in a vector database, and generates responses
using the Google Gemini model.
"""
def __init__(
self,
pdf_dir: str,
gemini_api_key: str,
embedding_model_name: str = "sentence-transformers/all-MiniLM-L6-v2",
chunk_size: int = 1000,
chunk_overlap: int = 200,
db_directory: str = "./chroma_db"
):
"""
Initialize the RAG system.
Args:
pdf_dir: Directory containing PDF documents
gemini_api_key: API key for Google Gemini
embedding_model_name: Name of the sentence-transformers model
chunk_size: Size of text chunks for splitting documents
chunk_overlap: Overlap between consecutive chunks
db_directory: Directory to store the ChromaDB database
"""
self.pdf_dir = pdf_dir
self.chunk_size = chunk_size
self.chunk_overlap = chunk_overlap
self.db_directory = db_directory
        # Initialize the embedding model (kept for direct use; note that the
        # ChromaDB embedding function below loads its own copy of this model)
        logger.info(f"Loading embedding model: {embedding_model_name}")
        self.embedding_model = SentenceTransformer(embedding_model_name)
# Initialize the text splitter
self.text_splitter = RecursiveCharacterTextSplitter(
chunk_size=self.chunk_size,
chunk_overlap=self.chunk_overlap,
)
# Initialize ChromaDB
logger.info(f"Initializing ChromaDB at {db_directory}")
self.client = chromadb.PersistentClient(path=db_directory)
# Create a custom embedding function that uses sentence-transformers
self.sentence_transformer_ef = embedding_functions.SentenceTransformerEmbeddingFunction(
model_name=embedding_model_name
)
# Create or get the collection
self.collection = self.client.get_or_create_collection(
name="pdf_documents",
embedding_function=self.sentence_transformer_ef
)
# Initialize the Gemini LLM
logger.info("Initializing Google Gemini")
self.llm = GoogleGeminiWrapper(api_key=gemini_api_key)
# Load conversation history
self.conversation_history = []
def process_documents(self) -> None:
"""
Process all PDF documents in the specified directory,
split them into chunks, generate embeddings, and store in ChromaDB.
"""
logger.info(f"Processing documents from: {self.pdf_dir}")
        # Skip reprocessing if the collection is already populated
        existing_count = self.collection.count()
        if existing_count > 0:
            logger.info(f"Found {existing_count} existing document chunks in the database")
            return
        # Collect PDF files in the directory (case-insensitive extension match)
        pdf_files = [f for f in os.listdir(self.pdf_dir) if f.lower().endswith('.pdf')]
        if not pdf_files:
            logger.warning(f"No PDF files found in {self.pdf_dir}")
            return
logger.info(f"Found {len(pdf_files)} PDF files")
doc_chunks = []
metadatas = []
ids = []
chunk_idx = 0
for pdf_file in pdf_files:
pdf_path = os.path.join(self.pdf_dir, pdf_file)
logger.info(f"Processing: {pdf_path}")
            # Load the PDF; skip files that cannot be parsed rather than
            # aborting the whole run
            try:
                loader = PyPDFLoader(pdf_path)
                documents = loader.load()
            except Exception as e:
                logger.error(f"Failed to load {pdf_file}: {e}")
                continue
            # Split documents into chunks
            chunks = self.text_splitter.split_documents(documents)
            logger.info(f"Split {pdf_file} into {len(chunks)} chunks")
# Prepare data for ChromaDB
for chunk in chunks:
doc_chunks.append(chunk.page_content)
metadatas.append({
"source": pdf_file,
"page": chunk.metadata.get("page", 0),
})
ids.append(f"chunk_{chunk_idx}")
chunk_idx += 1
# Add documents to ChromaDB
if doc_chunks:
logger.info(f"Adding {len(doc_chunks)} chunks to ChromaDB")
self.collection.add(
documents=doc_chunks,
metadatas=metadatas,
ids=ids
)
logger.info("Documents successfully processed and stored")
else:
logger.warning("No document chunks were generated")
def retrieve_relevant_chunks(self, query: str, k: int = 3) -> List[Dict[str, Any]]:
"""
Retrieve the k most relevant document chunks for a given query.
Args:
query: The query text
k: Number of relevant chunks to retrieve
Returns:
List of relevant document chunks with their metadata
"""
logger.info(f"Retrieving {k} relevant chunks for query: {query}")
results = self.collection.query(
query_texts=[query],
n_results=k
)
relevant_chunks = []
if results and results["documents"] and results["documents"][0]:
for i, doc in enumerate(results["documents"][0]):
relevant_chunks.append({
"content": doc,
"metadata": results["metadatas"][0][i] if results["metadatas"] and results["metadatas"][0] else {},
"id": results["ids"][0][i] if results["ids"] and results["ids"][0] else f"unknown_{i}"
})
return relevant_chunks
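    # For reference, each entry returned by retrieve_relevant_chunks has the
    # shape below (values are illustrative):
    #   {"content": "<chunk text>",
    #    "metadata": {"source": "file.pdf", "page": 3},
    #    "id": "chunk_42"}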
def generate_response(self, query: str, k: int = 3) -> str:
"""
Generate a response for a user query using RAG.
Args:
query: User query
k: Number of relevant chunks to retrieve
Returns:
Generated response from the LLM
"""
# Retrieve relevant document chunks
relevant_chunks = self.retrieve_relevant_chunks(query, k=k)
if not relevant_chunks:
logger.warning("No relevant chunks found for the query")
return "I couldn't find relevant information to answer your question."
# Format context from retrieved chunks
context = "\n\n".join([f"Document {i+1} (from {chunk['metadata'].get('source', 'unknown')}, page {chunk['metadata'].get('page', 'unknown')}):\n{chunk['content']}"
for i, chunk in enumerate(relevant_chunks)])
# Create prompt for the LLM
prompt = f"""
You are a helpful assistant that answers questions based on the provided context.
CONTEXT:
{context}
QUESTION:
{query}
Please provide a comprehensive and accurate answer based only on the information in the provided context.
If the context doesn't contain enough information to answer the question, please say so.
"""
# Generate response using Gemini
response = self.llm.ask(prompt, max_tokens=500, temperature=0.3)
return response
    def chat(self, user_input: Optional[str] = None) -> Optional[str]:
"""
Conduct a conversation with the user using the RAG system.
Args:
user_input: User's input. If None, starts a new conversation.
Returns:
System's response or None to exit
"""
if user_input is None:
# Initialize conversation
print("RAG System Initialized. Type 'exit' or 'quit' to end the conversation.")
user_input = input("You: ")
if user_input.lower() in ['exit', 'quit']:
print("Ending conversation. Goodbye!")
return None
# Generate response using RAG
response = self.generate_response(user_input)
# Update conversation history
self.conversation_history.append({"user": user_input, "system": response})
return response
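    # chat() can also be driven programmatically (e.g. from a web UI) by
    # passing the user's text directly; a hypothetical example:
    #   reply = rag.chat("What is covered in the introduction?")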
def interactive_session(self) -> None:
"""
Start an interactive chat session with the RAG system.
"""
print("Welcome to the RAG System!")
print("Type 'exit' or 'quit' to end the conversation.")
while True:
user_input = input("\nYou: ")
if user_input.lower() in ['exit', 'quit']:
print("Ending conversation. Goodbye!")
break
response = self.generate_response(user_input)
print(f"\nRAG System: {response}")
# Helper: convert a text response to speech with gTTS (requires network access)
def text_to_speech(response: str) -> str:
    """
    Convert the given text to an MP3 file using Google Text-to-Speech.
    Args:
        response: Text to synthesize
    Returns:
        Path to the saved audio file
    """
    tts = gTTS(response)
    audio_path = "response_audio.mp3"
    tts.save(audio_path)
    return audio_path
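# A sketch of pairing text_to_speech with the RAG system (the question is
# illustrative; gTTS needs network access to synthesize speech):
#   answer = rag.generate_response("Summarize the main findings.")
#   audio_file = text_to_speech(answer)  # writes response_audio.mp3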
def main():
"""
Main function to demonstrate the RAG system.
"""
    # Prefer the Gemini API key from the environment variable
    gemini_api_key = os.getenv("GEMINI_API_KEY")
    if not gemini_api_key:
        # Fall back to a key hardcoded here. Never commit a real key to
        # source control; replace the placeholder below for local testing only.
        hardcoded_api_key = "YOUR_GEMINI_API_KEY"  # placeholder, not a real key
        # Distinguish an unset variable from one set to an empty string
        # before printing the INFO message.
        if os.getenv("GEMINI_API_KEY") is None:
            print("INFO: GEMINI_API_KEY environment variable not found. Using hardcoded API key from rag.py.")
        gemini_api_key = hardcoded_api_key
    # Final check: bail out if no usable key is available
    if not gemini_api_key or gemini_api_key == "YOUR_GEMINI_API_KEY":
        print("Error: Gemini API key is not set.")
        print("Please set the GEMINI_API_KEY environment variable, or replace the placeholder in rag.py.")
        print("To set it as an environment variable:")
        print("  export GEMINI_API_KEY='your_api_key'   # Linux/macOS")
        print("  set GEMINI_API_KEY=your_api_key        # Windows CMD")
        print("  $env:GEMINI_API_KEY='your_api_key'     # Windows PowerShell")
        return
# Set paths
current_dir = os.path.dirname(os.path.abspath(__file__))
pdf_dir = os.path.join(current_dir, "material")
db_dir = os.path.join(current_dir, "chroma_db")
# Initialize the RAG system
rag = RAGSystem(
pdf_dir=pdf_dir,
gemini_api_key=gemini_api_key,
db_directory=db_dir
)
# Process documents
rag.process_documents()
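    # Optional one-shot query before the interactive loop (the question is
    # illustrative; uncomment to try it):
    # print(rag.generate_response("What topics do these documents cover?"))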
# Start interactive session
rag.interactive_session()
if __name__ == "__main__":
main()