derm-ai / app /services /RAG_evaluation.py
muhammadnoman76's picture
update
75e2b6c
from typing import Dict, Any
import re
from datetime import datetime
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from app.services.chathistory import ChatSession
import os
# # Set NLTK data path to a writable location
# nltk_data_dir = os.path.join(os.getcwd(), "nltk_data")
# os.makedirs(nltk_data_dir, exist_ok=True)
# nltk.data.path.append(nltk_data_dir)
# # Download NLTK resources to the specified directory
# nltk.download('stopwords', download_dir=nltk_data_dir)
# nltk.download('wordnet', download_dir=nltk_data_dir)
class RAGEvaluation:
    """Build an evaluation report over one page of saved chat interactions.

    Each interaction returned by the chat session is augmented with:

    * ``accuracy`` - TF-IDF cosine similarity between the interaction's
      ``context`` and ``response``, computed over the context's vocabulary.
    * ``overall_time`` - seconds between ``rag_start_time`` and
      ``rag_end_time``.
    """

    # Matches every character that is not an ASCII letter, digit or whitespace.
    _NON_ALNUM_RE = re.compile(r'[^a-zA-Z0-9\s]')

    def __init__(self, token: str, page: int = 1, page_size: int = 5):
        """Prepare the chat session and the text-normalisation helpers.

        Args:
            token: Passed through to ``ChatSession`` as its first argument.
            page: Page number forwarded to ``get_save_details``.
            page_size: Page size forwarded to ``get_save_details``.
        """
        # NOTE(review): the second argument is the literal string
        # "session_id", not a real identifier - confirm this is intended.
        self.chat_session = ChatSession(token, "session_id")
        self.page = page
        self.page_size = page_size
        self.lemmatizer = WordNetLemmatizer()
        # Requires the NLTK 'stopwords' corpus (and 'wordnet' for the
        # lemmatizer) to be available on the NLTK data path.
        self.stop_words = set(stopwords.words('english'))

    def _preprocess_text(self, text: str) -> str:
        """Normalise *text* into a space-joined string of unique lemmas.

        Steps: lower-case, strip everything except ASCII letters, digits and
        whitespace, drop stop words, lemmatize, drop lemmas that are stop
        words, then de-duplicate while keeping first-seen order.

        Args:
            text: Raw text. ``None`` or an empty string yields ``''``.

        Returns:
            The cleaned words joined by single spaces (possibly empty).
        """
        if not text:
            return ''

        stop_words = self.stop_words
        lemmatize = self.lemmatizer.lemmatize
        words = self._NON_ALNUM_RE.sub('', text.lower()).split()

        # Stop words must be removed BEFORE lemmatizing: the lemmatizer can
        # mangle them (e.g. WordNet's default noun lemmatization turns "was"
        # into "wa"), after which they no longer match the stop-word set.
        lemmas = (lemmatize(word) for word in words if word not in stop_words)

        # Filter again after lemmatizing so a lemma that is itself a stop
        # word is still dropped. dict.fromkeys de-duplicates in order.
        unique_lemmas = dict.fromkeys(
            lemma for lemma in lemmas if lemma not in stop_words
        )
        return ' '.join(unique_lemmas)

    def _calculate_cosine_similarity(self, context: str, response: str) -> float:
        """Return the TF-IDF cosine similarity of *response* against *context*.

        The vectorizer's vocabulary is fixed to the cleaned context words, so
        response words that do not occur in the context are ignored.

        Args:
            context: Retrieved context text.
            response: Generated response text.

        Returns:
            The similarity as a plain ``float``; ``0.0`` when either text is
            empty after preprocessing or the vectorizer rejects the input.
        """
        clean_context = self._preprocess_text(context)
        clean_response = self._preprocess_text(response)

        # Nothing to compare: an empty context would give an empty
        # vocabulary, and an empty response an all-zero vector.
        if not clean_context or not clean_response:
            return 0.0

        vectorizer = TfidfVectorizer(vocabulary=clean_context.split())
        try:
            context_vector = vectorizer.fit_transform([clean_context])
            response_vector = vectorizer.transform([clean_response])
            similarity = cosine_similarity(context_vector, response_vector)[0][0]
        except ValueError:
            # Best-effort metric: treat vectorizer failures as "no overlap".
            return 0.0
        # Convert the NumPy scalar to the built-in float the signature promises.
        return float(similarity)

    def _calculate_time_difference(self, start_time: str, end_time: str) -> float:
        """Return the number of seconds from *start_time* to *end_time*.

        Args:
            start_time: ISO-8601 timestamp accepted by
                ``datetime.fromisoformat``.
            end_time: ISO-8601 timestamp accepted by
                ``datetime.fromisoformat``.

        Returns:
            ``end - start`` in seconds; negative if *end_time* is earlier.

        Raises:
            ValueError: If either string is not a valid ISO format.
            TypeError: If one timestamp is timezone-aware and the other naive.
        """
        start = datetime.fromisoformat(start_time)
        end = datetime.fromisoformat(end_time)
        return (end - start).total_seconds()

    def _process_interaction(self, interaction: Dict[str, Any]) -> Dict[str, Any]:
        """Return a shallow copy of *interaction* with evaluation metrics added.

        Args:
            interaction: Mapping that must contain ``context``, ``response``,
                ``rag_start_time`` and ``rag_end_time``.

        Returns:
            A new dict with all original keys plus ``accuracy`` and
            ``overall_time``. The input mapping is not modified.

        Raises:
            KeyError: If any of the required keys is missing.
        """
        processed = interaction.copy()
        processed['accuracy'] = self._calculate_cosine_similarity(
            interaction['context'],
            interaction['response'],
        )
        processed['overall_time'] = self._calculate_time_difference(
            interaction['rag_start_time'],
            interaction['rag_end_time'],
        )
        return processed

    def generate_evaluation_report(self) -> Dict[str, Any]:
        """Fetch the configured page of interactions and score each one.

        Returns:
            A dict with the pagination fields ``total_interactions``,
            ``page``, ``page_size`` and ``total_pages`` copied from the chat
            session's reply, and ``results`` holding every interaction
            processed by ``_process_interaction``.
        """
        raw_data = self.chat_session.get_save_details(
            page=self.page,
            page_size=self.page_size,
        )
        return {
            'total_interactions': raw_data['total_interactions'],
            'page': raw_data['page'],
            'page_size': raw_data['page_size'],
            'total_pages': raw_data['total_pages'],
            'results': [self._process_interaction(i) for i in raw_data['results']],
        }