# derm-ai/app/services/RAG_evaluation.py
from typing import Dict, Any
import re
from datetime import datetime
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from app.services.chathistory import ChatSession
import os

# Set the NLTK data path to a writable location and download the required
# resources there. Kept commented out; uncomment if the stopwords and WordNet
# corpora are not already available in the environment.
# nltk_data_dir = os.path.join(os.getcwd(), "nltk_data")
# os.makedirs(nltk_data_dir, exist_ok=True)
# nltk.data.path.append(nltk_data_dir)
# nltk.download('stopwords', download_dir=nltk_data_dir)
# nltk.download('wordnet', download_dir=nltk_data_dir)
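
# A minimal alternative sketch (not part of the original module): fetch the
# corpora lazily, downloading only when a lookup fails.
#
#     for _resource, _path in [('stopwords', 'corpora/stopwords'),
#                              ('wordnet', 'corpora/wordnet')]:
#         try:
#             nltk.data.find(_path)
#         except LookupError:
#             nltk.download(_resource)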
class RAGEvaluation:
    """Scores stored RAG interactions: lexical overlap between retrieved
    context and generated response ('accuracy'), plus end-to-end time."""

    def __init__(self, token: str, page: int = 1, page_size: int = 5):
        self.chat_session = ChatSession(token, "session_id")
        self.page = page
        self.page_size = page_size
        self.lemmatizer = WordNetLemmatizer()
        self.stop_words = set(stopwords.words('english'))
    def _preprocess_text(self, text: str) -> str:
        """Lowercase, strip punctuation, lemmatize, drop stop words, and
        de-duplicate tokens while preserving their first-seen order."""
        text = re.sub(r'[^a-zA-Z0-9\s]', '', text.lower())
        words = text.split()
        lemmatized_words = [self.lemmatizer.lemmatize(word) for word in words]
        filtered_words = [word for word in lemmatized_words if word not in self.stop_words]
        seen = set()
        cleaned_words = []
        for word in filtered_words:
            if word not in seen:
                seen.add(word)
                cleaned_words.append(word)
        return ' '.join(cleaned_words)
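
    # Illustrative example (assumes the English stop-word list and WordNet
    # data are installed; output reflects the default noun-POS lemmatization):
    #     _preprocess_text("The dogs were running and the dogs barked")
    #     -> "dog running barked"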
    def _calculate_cosine_similarity(self, context: str, response: str) -> float:
        """Cosine similarity of TF-IDF vectors, with the vocabulary restricted
        to the context's terms so the score measures how much of the context
        the response reuses. Returns 0.0 when the vocabulary is empty."""
        clean_context = self._preprocess_text(context)
        clean_response = self._preprocess_text(response)
        vectorizer = TfidfVectorizer(vocabulary=clean_context.split())
        try:
            context_vector = vectorizer.fit_transform([clean_context])
            response_vector = vectorizer.transform([clean_response])
            return cosine_similarity(context_vector, response_vector)[0][0]
        except ValueError:
            return 0.0
    def _calculate_time_difference(self, start_time: str, end_time: str) -> float:
        """Seconds elapsed between two ISO-8601 timestamp strings."""
        start = datetime.fromisoformat(start_time)
        end = datetime.fromisoformat(end_time)
        return (end - start).total_seconds()
    def _process_interaction(self, interaction: Dict[str, Any]) -> Dict[str, Any]:
        """Return a copy of the interaction enriched with 'accuracy' and
        'overall_time' metrics."""
        processed = interaction.copy()
        processed['accuracy'] = self._calculate_cosine_similarity(
            interaction['context'],
            interaction['response']
        )
        processed['overall_time'] = self._calculate_time_difference(
            interaction['rag_start_time'],
            interaction['rag_end_time']
        )
        return processed
    def generate_evaluation_report(self) -> Dict[str, Any]:
        """Fetch one page of saved interactions and return it with the
        pagination metadata and per-interaction metrics attached."""
        raw_data = self.chat_session.get_save_details(
            page=self.page,
            page_size=self.page_size
        )
        return {
            'total_interactions': raw_data['total_interactions'],
            'page': raw_data['page'],
            'page_size': raw_data['page_size'],
            'total_pages': raw_data['total_pages'],
            'results': [self._process_interaction(i) for i in raw_data['results']]
        }
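

# A minimal usage sketch (illustrative only: "<bearer-token>" is a placeholder
# and ChatSession must be able to reach its backing store):
if __name__ == "__main__":
    evaluator = RAGEvaluation(token="<bearer-token>", page=1, page_size=5)
    report = evaluator.generate_evaluation_report()
    print(f"{report['total_interactions']} interactions, "
          f"page {report['page']}/{report['total_pages']}")
    for item in report['results']:
        print(item['accuracy'], item['overall_time'])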