import difflib
import os
import re
from typing import List

import mcp.types as types
from dotenv import load_dotenv
from langchain.chat_models import init_chat_model

load_dotenv()
print("GROQ_API_KEY is set:", "GROQ_API_KEY" in os.environ)

llm = init_chat_model("llama-3.1-8b-instant", model_provider="groq")


def genratequestionnaire(model: str, capabilities: str) -> List[str]:
    """
    Generate a baseline questionnaire for the given model.

    Returns a list of question strings for diagnostic purposes.
    """
    questions = []
    previously_generated = ""

    for i in range(5):
        try:
            # Ask the LLM for one new question at a time, feeding back the
            # questions generated so far so it does not repeat itself.
            response = llm.invoke(
                f"Generate a questionnaire for a model with the following capabilities:\n"
                f"Model Name: {model}\n"
                f"Capabilities Overview:\n{capabilities}\n"
                f"Please provide one more question that covers the model's capabilities and typical use-cases.\n"
                f"Previously generated questions:\n{previously_generated}\n"
                f"Question {i + 1}:"
            )
            new_question = str(response.content).strip()
            questions.append(new_question)

            if previously_generated:
                previously_generated += "\n"
            previously_generated += f"Question {i + 1}: {new_question}"

        except Exception as e:
            # Fall back to a generic question so the questionnaire keeps its length.
            print(f"Error generating question {i + 1}: {e}")
            questions.append(f"What are your capabilities as {model}?")

    return questions
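
# A minimal usage sketch, assuming GROQ_API_KEY is set and the Groq model above is
# reachable; the model name and capability text are illustrative placeholders only:
#
#     baseline = genratequestionnaire(
#         "llama-3.1-8b-instant",
#         "General-purpose chat, summarization, and code explanation.",
#     )
#     # baseline -> list of 5 question strings to keep as the drift-check baseline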


def gradeanswers(old_answers: List[str], new_answers: List[str]) -> str:
    """
    Use the LLM to compare the old and new answers to compute a drift score.

    Returns a drift percentage as a string.
    """
    if not old_answers or not new_answers:
        return "0"

    if len(old_answers) != len(new_answers):
        return "100"

    def _difflib_drift() -> str:
        # Fallback: character-level similarity via difflib when the LLM does not
        # return a usable number (or raises), converted to a drift percentage.
        similarity_scores = [
            difflib.SequenceMatcher(None, old, new).ratio()
            for old, new in zip(old_answers, new_answers)
        ]
        avg_similarity = sum(similarity_scores) / len(similarity_scores)
        return str(int((1 - avg_similarity) * 100))

    try:
        # Build a single prompt containing both answer sets and ask the LLM for a
        # bare drift percentage.
        prompt = "You're tasked with detecting semantic drift between two sets of model responses.\n\n"
        prompt += "Original responses:\n"
        for i, ans in enumerate(old_answers):
            prompt += f"Response {i + 1}: {ans}\n\n"

        prompt += "New responses:\n"
        for i, ans in enumerate(new_answers):
            prompt += f"Response {i + 1}: {ans}\n\n"

        prompt += ("Analyze the semantic differences between the original and new responses. "
                   "Provide a drift percentage score (0-100%) that represents how much the meaning, "
                   "intent, or capabilities have changed between the two sets of responses. "
                   "Only return the numerical percentage value without any explanation or additional text.")

        response = llm.invoke(prompt)
        drift_text = str(response.content).strip()

        # Extract the first number from the reply and clamp it to the 0-100 range.
        drift_match = re.search(r'(\d+\.?\d*)', drift_text)
        if drift_match:
            drift_pct = min(max(float(drift_match.group(1)), 0.0), 100.0)
            return str(int(drift_pct))
        return _difflib_drift()

    except Exception as e:
        print(f"Error grading answers: {e}")
        return _difflib_drift()
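

if __name__ == "__main__":
    # Minimal smoke test, assuming GROQ_API_KEY is configured and the Groq model
    # above is reachable; the two answer sets are illustrative placeholders, not
    # real model output.
    old = [
        "I can summarize long documents and answer follow-up questions.",
        "I can explain and refactor Python code.",
    ]
    new = [
        "I can summarize long documents and answer follow-up questions.",
        "I can only answer general knowledge questions.",
    ]
    print("Drift percentage:", gradeanswers(old, new))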