# C2MV's picture
# Update app.py
# b9fbc5e verified
"""
CÓDIGO COMPLETO Y CORREGIDO - VERSIÓN 7.8 (Agente de Instrucciones de Lenguaje Natural)
- MEJORA MAYOR: Se ha creado un nuevo `InstructionParsingAgent` que interpreta las instrucciones
en lenguaje natural del usuario desde el cuadro de "Especificaciones Adicionales".
- FUNCIONALIDAD AVANZADA: El usuario puede especificar qué métricas usar (R2, RMSE, o ambas) y
cuántos "Top N" modelos seleccionar, simplemente escribiéndolo.
- MODELSELECTIONAGENT MEJORADO: La lógica del "Plan B" ahora es dinámica y se basa en las
instrucciones parseadas, calculando un score combinado si se especifican múltiples métricas.
- UI SIMPLIFICADA: Se han eliminado los controles estáticos de ranking, reemplazados por el
cuadro de texto de instrucciones, haciendo la interfaz más limpia y potente.
"""
import gradio as gr
from gradio_client import Client, handle_file
import pandas as pd
import json
import tempfile
import os
import re
from datetime import datetime
import plotly.graph_objects as go
import logging
import numpy as np
from smolagents import CodeAgent, InferenceClientModel
# --- CONFIGURATION AND CLIENTS ---
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

# --- LANGUAGE MODEL INITIALISATION FOR AGENTS ---
# Each remote dependency is optional: on failure the corresponding global is
# set to None and the rest of the module checks for that before using it.
try:
    hf_engine = InferenceClientModel(model_id="deepseek-ai/DeepSeek-V2-Lite-Instruct")
    logger.info("✅ Modelo de lenguaje (DeepSeek-V2-Lite) inicializado para agentes.")
except Exception:
    hf_engine = None
    logger.exception("❌ No se pudo inicializar el modelo de lenguaje para agentes.")

# `except Exception` (not a bare `except:`) so KeyboardInterrupt/SystemExit
# still propagate, and the failure reason is logged instead of being dropped.
try:
    biotech_client = Client("C2MV/BiotechU4")
    logger.info("✅ Cliente BiotechU4 inicializado.")
except Exception:
    biotech_client = None
    logger.exception("❌ No se pudo inicializar el cliente BiotechU4.")

try:
    analysis_client = Client("C2MV/Project-HF-2025-2")
    logger.info("✅ Cliente Project-HF-2025-2 inicializado.")
except Exception:
    analysis_client = None
    logger.exception("❌ No se pudo inicializar el cliente Project-HF-2025-2.")
# ============================================================================
# 🤖 SISTEMA DE AGENTES (LÓGICA ACTUALIZADA)
# ============================================================================
class LoggingAgent:
    """Accumulates timestamped agent activity and renders it as a Markdown report."""

    def __init__(self):
        self.log_entries = []
        self.start_time = datetime.now()
        logger.info("🕵️ LoggingAgent activado.")

    def register(self, agent_name, action, details=""):
        """Append one entry; `details`, when truthy, is added as a quoted second line."""
        stamp = datetime.now().strftime('%H:%M:%S')
        line = f"**{stamp} | {agent_name}:** {action}"
        if details:
            line = line + f"\n> *Detalles: {details}*"
        self.log_entries.append(line)

    def get_report(self):
        """Return the Markdown report, including seconds elapsed since the last reset."""
        if self.log_entries:
            separator = "\n\n---\n\n"
            body = separator.join(self.log_entries)
            elapsed = (datetime.now() - self.start_time).total_seconds()
            footer = f"\n\n---\n\n**Tiempo total: {elapsed:.2f} s.**"
            return "### 🕵️ Informe de Actividad de Agentes\n\n---\n\n" + body + footer
        return "### 🕵️ Informe de Actividad\n\nNo se registraron actividades."

    def clear(self):
        """Drop all entries and restart the elapsed-time clock."""
        self.log_entries = []
        self.start_time = datetime.now()
class StructureValidationAgent:
    """Checks that an uploaded file can be read into a non-empty DataFrame."""

    def __init__(self, log_agent: "LoggingAgent"):
        self.log_agent = log_agent

    def validate(self, file_obj):
        """Try to load `file_obj` and report whether it is usable.

        Args:
            file_obj: Object exposing a `.name` attribute holding the file path.

        Returns:
            A `(is_valid, message)` tuple. Read errors and empty files yield
            `False` with an explanatory message; every outcome is also
            recorded in the agent log.
        """
        try:
            path = file_obj.name
            # Case-insensitive so ".XLSX"/".XLS" are not mistaken for CSV.
            if path.lower().endswith(('.xls', '.xlsx')):
                # header=1: the column names are read from the second row.
                df = pd.read_excel(path, header=1)
            else:
                df = pd.read_csv(path)
        except Exception as e:
            self.log_agent.register("StructureValidationAgent", "Validación fallida: error al leer el archivo.", f"Error: {e}")
            return False, f"Error crítico al leer el archivo: {e}"
        if df.empty:
            self.log_agent.register("StructureValidationAgent", "Validación fallida: el archivo está vacío.")
            return False, "Validación fallida: El archivo está vacío."
        self.log_agent.register("StructureValidationAgent", "Validación de formato de archivo superada.")
        return True, "Formato de archivo básico validado."
class InstructionParsingAgent:
    """Agent that turns the user's natural-language text into a structured ranking plan."""

    _ALLOWED_METRICS = ('R2', 'RMSE')
    _DEFAULT_METRICS = ('R2',)
    _DEFAULT_TOP_N = 3

    # Static part of the prompt; the user's text and the answer cue are appended in parse().
    _PROMPT_HEADER = (
        "\n"
        "Analyze the user's instruction for a data analysis task. Extract the metrics they want to use for ranking and the number of top models to select.\n"
        'The possible metrics are "R2", "RMSE".\n'
        'Your output MUST be ONLY a valid JSON object with two keys: "metrics" (a list of strings) and "top_n" (an integer).\n'
        '- If the user mentions "R2" or "R-cuadrado", include "R2" in the metrics list.\n'
        '- If the user mentions "RMSE", include "RMSE" in the metrics list.\n'
        '- If the user mentions a number like "top 3", "los 2 mejores", or just a digit, set "top_n" to that number.\n'
        '- If no metrics are mentioned, default to ["R2"].\n'
        '- If no number is mentioned, default to 3.\n'
        'Example 1: "Usa el promedio de R2 y RMSE y elige el top 2" -> {"metrics": ["R2", "RMSE"], "top_n": 2}\n'
        'Example 2: "dame los 3 mejores modelos segun el menor RMSE" -> {"metrics": ["RMSE"], "top_n": 3}\n'
        'Example 3: "el mejor R2" -> {"metrics": ["R2"], "top_n": 1}\n'
        'Example 4: "analizar los datos" -> {"metrics": ["R2"], "top_n": 3}\n'
    )

    def __init__(self, log_agent: "LoggingAgent", llm_engine):
        self.log_agent = log_agent
        # Without an engine the agent is disabled and parse() returns the defaults.
        self.agent = CodeAgent(tools=[], model=llm_engine) if llm_engine else None

    def _defaults(self):
        """Return a fresh copy of the default plan (never a shared mutable object)."""
        return {'metrics': list(self._DEFAULT_METRICS), 'top_n': self._DEFAULT_TOP_N}

    def _normalize(self, instructions):
        """Coerce a raw parsed plan into `{'metrics': [...], 'top_n': int}`.

        Metric names are upper-cased, de-duplicated and restricted to the
        supported set; `top_n` is converted to a positive int. Anything
        unusable falls back to the corresponding default.
        """
        raw_metrics = instructions['metrics']
        if isinstance(raw_metrics, str):
            raw_metrics = [raw_metrics]
        metrics = []
        for item in raw_metrics or []:
            name = str(item).strip().upper().replace('²', '2')
            if name in self._ALLOWED_METRICS and name not in metrics:
                metrics.append(name)
        if not metrics:
            metrics = list(self._DEFAULT_METRICS)
        try:
            top_n = int(instructions['top_n'])
        except (TypeError, ValueError):
            top_n = self._DEFAULT_TOP_N
        if top_n < 1:
            top_n = self._DEFAULT_TOP_N
        return {'metrics': metrics, 'top_n': top_n}

    def parse(self, text: str):
        """Interpret `text` and return a plan dict with keys 'metrics' and 'top_n'.

        The defaults (`['R2']`, 3) are returned when no LLM is available,
        when `text` is blank, or when the model output cannot be parsed.
        This method does not raise; failures are recorded in the agent log.
        """
        if not self.agent:
            self.log_agent.register("InstructionParsingAgent", "LLM no disponible, usando defaults.")
            return self._defaults()
        if not text or not str(text).strip():
            # Nothing to interpret: skip the model call entirely.
            self.log_agent.register("InstructionParsingAgent", "Sin instrucciones del usuario, usando defaults.")
            return self._defaults()
        prompt = self._PROMPT_HEADER + f'User instruction: "{text}"\n' + "JSON Output:\n"
        try:
            response = self.agent.run(prompt)
            if isinstance(response, dict):
                # The agent may hand back an already-structured answer.
                instructions = response
            else:
                response_str = str(response)
                start, end = response_str.find('{'), response_str.rfind('}')
                if start == -1 or end < start:
                    raise ValueError("La respuesta no contiene un objeto JSON.")
                instructions = json.loads(response_str[start:end + 1])
            # Check that the output has the expected shape.
            if not isinstance(instructions, dict) or 'metrics' not in instructions or 'top_n' not in instructions:
                raise ValueError("JSON de salida no contiene las claves esperadas.")
            plan = self._normalize(instructions)
            self.log_agent.register("InstructionParsingAgent", "Instrucciones del usuario parseadas con éxito.")
            return plan
        except Exception as e:
            self.log_agent.register("InstructionParsingAgent", "Error parseando instrucciones, usando defaults.", f"Error: {e}")
            return self._defaults()
class ModelSelectionAgent:
    """Agent 3 (with configurable fallback): picks the best models, with a customisable plan B."""

    _DEFAULT_TOP_N = 3

    def __init__(self, log_agent: "LoggingAgent"):
        self.log_agent = log_agent

    def _find_column(self, df_columns, possible_names):
        """Return the first column whose name equals one of `possible_names`, ignoring case."""
        for name in possible_names:
            target = name.lower()
            for col in df_columns:
                if str(col).lower() == target:
                    return col
        return None

    def _ranking_plan(self, instructions):
        """Read `(use_r2, use_rmse, top_n)` from `instructions`, tolerating bad or missing values."""
        plan = instructions if isinstance(instructions, dict) else {}
        metrics = plan.get('metrics') or ['R2']
        if isinstance(metrics, str):
            metrics = [metrics]
        wanted = {str(m).strip().upper() for m in metrics}
        try:
            top_n = int(plan.get('top_n', self._DEFAULT_TOP_N))
        except (TypeError, ValueError):
            top_n = self._DEFAULT_TOP_N
        if top_n < 1:
            top_n = self._DEFAULT_TOP_N
        return 'R2' in wanted, 'RMSE' in wanted, top_n

    def identify_best_models(self, results_df, component, r2_threshold, rmse_threshold, instructions: dict):
        """Select the best models for `component`.

        First applies the strict filter (mean R2 >= `r2_threshold` and mean
        RMSE <= `rmse_threshold`). If nothing passes, falls back to ranking
        by the metric(s) named in `instructions['metrics']` and keeping
        `instructions['top_n']` models.

        Args:
            results_df: DataFrame with a model-name column ('model'/'modelo',
                any case) and `r2_<component>` / `rmse_<component>` columns.
            component: Component name, or 'all' to average every `r2_*` /
                `rmse_*` column per row.
            r2_threshold: Minimum acceptable mean R2.
            rmse_threshold: Maximum acceptable mean RMSE.
            instructions: Plan dict with 'metrics' and 'top_n'; missing or
                invalid entries fall back to R2 / top 3.

        Returns:
            `(models, reasoning)`: a sorted list of lower-cased model names
            (empty on error) and a human-readable explanation.
        """
        self.log_agent.register("ModelSelectionAgent", f"Iniciando identificación para: '{component}'.")

        # 1. Normalise the model column name.
        model_col = self._find_column(results_df.columns, ['model', 'modelo'])
        if model_col is None:
            return [], "Error: No se encontró la columna de nombres de modelos ('Model')."
        df_norm = results_df.rename(columns={model_col: 'Model'})

        # 2. Locate the metric columns; values are coerced to numbers because
        #    the table may carry them as strings (non-numeric cells become NaN).
        metrics_error = f"Error: No se encontraron las métricas para el componente '{component}'."
        if component != 'all':
            r2_target_col = self._find_column(df_norm.columns, [f'r2_{component}'])
            rmse_target_col = self._find_column(df_norm.columns, [f'rmse_{component}'])
            if r2_target_col is None or rmse_target_col is None:
                return [], metrics_error
            df_norm[r2_target_col] = pd.to_numeric(df_norm[r2_target_col], errors='coerce')
            df_norm[rmse_target_col] = pd.to_numeric(df_norm[rmse_target_col], errors='coerce')
        else:
            metric_cols_r2 = [c for c in df_norm.columns if 'r2_' in str(c).lower()]
            metric_cols_rmse = [c for c in df_norm.columns if 'rmse_' in str(c).lower()]
            if not metric_cols_r2 or not metric_cols_rmse:
                return [], metrics_error
            r2_target_col, rmse_target_col = 'R2_Avg', 'RMSE_Avg'
            df_norm[r2_target_col] = df_norm[metric_cols_r2].apply(pd.to_numeric, errors='coerce').mean(axis=1, skipna=True)
            df_norm[rmse_target_col] = df_norm[metric_cols_rmse].apply(pd.to_numeric, errors='coerce').mean(axis=1, skipna=True)

        # 3. Average each metric per model.
        model_performance = df_norm.groupby('Model').agg({r2_target_col: 'mean', rmse_target_col: 'mean'}).reset_index()

        # 4. Attempt 1: strict filter.
        passes = (model_performance[r2_target_col] >= r2_threshold) & (model_performance[rmse_target_col] <= rmse_threshold)
        good_models_df = model_performance[passes]
        if not good_models_df.empty:
            best_models_list = sorted(str(model).lower() for model in good_models_df['Model'].tolist())
            reasoning = f"Agente identificó **{len(best_models_list)}** modelo(s) que cumplen tus criterios: `{', '.join(best_models_list)}`."
            return best_models_list, reasoning

        # 5. Attempt 2: plan B - ranking driven by the parsed instructions.
        self.log_agent.register("ModelSelectionAgent", "Filtro primario falló. Activando fallback: 'Ranking por Instrucciones'.", f"Plan: {instructions}")
        use_r2, use_rmse, top_n = self._ranking_plan(instructions)
        if use_r2 and use_rmse:
            # Higher R2 and lower RMSE both raise the score; the epsilon avoids division by zero.
            model_performance['Score'] = model_performance[r2_target_col] / (model_performance[rmse_target_col] + 1e-9)
            sort_col, ascending, metric_name = 'Score', False, "R²/RMSE combinado"
        elif use_rmse:
            sort_col, ascending, metric_name = rmse_target_col, True, "RMSE"
        else:  # R2 by default
            sort_col, ascending, metric_name = r2_target_col, False, "R²"

        # Models without a usable ranking value must not be reported as "best".
        ranked = model_performance.dropna(subset=[sort_col]).sort_values(by=sort_col, ascending=ascending)
        if ranked.empty:
            return [], f"Error: No hay métricas numéricas válidas para clasificar los modelos del componente '{component}'."
        top_n_df = ranked.head(top_n)
        best_models_list = sorted(str(model).lower() for model in top_n_df['Model'].tolist())
        reasoning = (f"**Advertencia:** Ningún modelo cumplió con los criterios iniciales.\n\n"
                     f"Como plan B, el agente ha seleccionado los **Top {len(best_models_list)}** modelos con el mejor **{metric_name} promedio**: `{', '.join(best_models_list)}`.")
        return best_models_list, reasoning
# --- INICIALIZACIÓN DE AGENTES GLOBALES ---
# One shared activity log; every other agent records into it.
log_agent = LoggingAgent()
validation_agent = StructureValidationAgent(log_agent)
# hf_engine may be None, in which case the parser falls back to its defaults.
instruction_parser_agent = InstructionParsingAgent(log_agent, hf_engine)
model_selection_agent = ModelSelectionAgent(log_agent)
# --- FUNCIONES DEL PIPELINE ---
def create_dummy_plot(title="Esperando resultados..."):
    """Return an empty placeholder figure carrying `title` and a hint annotation."""
    hint = dict(text="Sube un archivo y ejecuta", showarrow=False)
    empty_trace = go.Scatter(x=[], y=[])
    figure = go.Figure(empty_trace)
    figure.update_layout(
        title=title,
        template="plotly_white",
        height=500,
        annotations=[hint],
    )
    return figure
def detect_experiments(file_obj):
    """Read experiment names from the first row of the uploaded Excel file.

    Args:
        file_obj: Uploaded file object exposing `.name` (its path), or a
            falsy value when nothing is uploaded.

    Returns:
        A `gr.update(...)` for the experiments checkbox group: every
        non-empty cell of the first row becomes a pre-selected choice. On a
        read error the group is emptied and disabled.
    """
    if not file_obj:
        return gr.update(choices=[], value=[])
    try:
        df_first_row = pd.read_excel(file_obj.name, header=None, nrows=1)
        exp_names = [str(name).strip() for name in df_first_row.iloc[0].dropna().tolist()]
    except Exception as e:
        # The target is a CheckboxGroup, which has no `placeholder` property,
        # so the error is surfaced through the log and a UI warning instead
        # of an update keyword the component does not accept.
        logger.exception("No se pudieron detectar los experimentos del archivo.")
        gr.Warning(f"Error: {e}")
        return gr.update(choices=[], value=[], interactive=False)
    return gr.update(choices=exp_names, value=exp_names, interactive=True)
# ... (ETAPA 1: run_base_analysis - sin cambios) ...
def run_base_analysis(file, models, exp_names_selected, component, use_de, maxfev, progress=gr.Progress()):
    """Stage 1: validate the upload and run the base analysis through the BiotechU4 client.

    Returns an 8-tuple: figure, table data, status text, an update for the
    stage-2 button, the stored results dict, the file path, the original
    parameters, and the agent log report.
    """

    def _abort(message):
        # Shared failure result: placeholder plot, stage-2 button disabled, empty state.
        return (create_dummy_plot(), None, message, gr.update(interactive=False),
                {}, None, None, log_agent.get_report())

    log_agent.clear()
    progress(0, desc="🚀 Iniciando Análisis Base...")
    if not (file and models and exp_names_selected):
        return _abort("❌ Por favor, sube un archivo y selecciona modelos/experimentos.")

    log_agent.register("Pipeline (Etapa 1)", "Iniciando Análisis Base.")
    progress(0.2, desc="Validando archivo...")
    is_valid, msg = validation_agent.validate(file)
    if not is_valid:
        return _abort(msg)

    progress(0.5, desc="Ejecutando análisis biotecnológico...")
    if not biotech_client:
        return _abort("❌ Cliente BiotechU4 no disponible.")

    try:
        exp_names_str = ",".join(exp_names_selected)
        models_lower = [str(model_name).lower() for model_name in models]
        plot_info, df_data, status = biotech_client.predict(
            file=handle_file(file.name),
            models=models_lower,
            component=component,
            use_de=use_de,
            maxfev=maxfev,
            exp_names=exp_names_str,
            api_name="/run_analysis_wrapper",
        )
        # The remote endpoint reports failures inside its status text.
        if "Error" in status:
            raise Exception(status)
    except Exception as e:
        return _abort(f"❌ Error en Análisis Base: {e}")

    progress(1, desc="🎉 Análisis Base Completado")
    final_status = "✅ Análisis Base completado. \n➡️ Ahora puedes aplicar el filtro de IA y generar el informe final."
    results_df_obj = {'data': df_data['data'], 'headers': df_data['headers']}
    if plot_info and 'plot' in plot_info:
        fig = go.Figure(json.loads(plot_info['plot']))
    else:
        fig = create_dummy_plot()
    original_params = {
        'exp_names': exp_names_selected,
        'component': component,
        'use_de': use_de,
        'maxfev': maxfev,
    }
    return (fig, df_data, final_status, gr.update(interactive=True),
            results_df_obj, file.name, original_params, log_agent.get_report())
# --- ETAPA 2: REFINAMIENTO Y REPORTE IA (ACTUALIZADA) ---
def refine_and_generate_report(baseline_results, file_path, original_params, r2_threshold, rmse_threshold, instructions_text, ia_model, detail_level, language, max_output_tokens, use_personal_key, personal_api_key, progress=gr.Progress()):
    """Stage 2: select the best models, re-run the analysis on them and build the AI report.

    Steps: parse `instructions_text` into a ranking plan, let the selection
    agent pick models from `baseline_results`, re-run the BiotechU4 analysis
    with only those models, then send the resulting table to the analysis
    client to obtain the report and code.

    Returns a 7-tuple: figure (or a no-op update), table data, report
    Markdown, generated code, status text, path of the saved report file
    (or None), and the agent log report.
    """
    progress(0, desc="🚀 Iniciando Refinamiento con IA...")
    log_agent.register("Pipeline (Etapa 2)", "Iniciando Refinamiento.")
    if not baseline_results or not file_path or not original_params:
        return gr.update(), None, None, None, "❌ No hay resultados base para refinar.", None, log_agent.get_report()
    if not biotech_client:
        # Same explicit message as stage 1, instead of a later NoneType attribute error.
        return gr.update(), None, None, None, "❌ Cliente BiotechU4 no disponible.", None, log_agent.get_report()

    progress(0.1, desc="Agente de Parseo interpretando instrucciones...")
    instructions = instruction_parser_agent.parse(instructions_text)
    log_agent.register("InstructionParsingAgent", "Instrucciones interpretadas.", f"Plan: {instructions}")

    progress(0.2, desc="Agente de Selección identificando mejores modelos...")
    results_df = pd.DataFrame(baseline_results['data'], columns=baseline_results['headers'])
    best_models, reasoning = model_selection_agent.identify_best_models(results_df, original_params['component'], r2_threshold, rmse_threshold, instructions)
    if not best_models:
        return gr.update(), baseline_results, None, None, f"🤖 Análisis del Agente:\n{reasoning}", None, log_agent.get_report()

    progress(0.4, desc="Re-ejecutando análisis con los mejores modelos...")
    try:
        exp_names_str = ",".join(original_params['exp_names'])
        final_plot_info, final_df_data, final_status = biotech_client.predict(file=handle_file(file_path), models=best_models, component=original_params['component'], use_de=original_params['use_de'], maxfev=original_params['maxfev'], exp_names=exp_names_str, api_name="/run_analysis_wrapper")
        # The remote endpoint reports failures inside its status text.
        if "Error" in final_status:
            raise Exception(final_status)
    except Exception as e:
        return gr.update(), None, None, None, f"❌ Error en el re-análisis final: {e}", None, log_agent.get_report()

    progress(0.6, desc="Generando informe IA...")
    temp_csv_file = None
    try:
        final_results_df = pd.DataFrame(final_df_data['data'], columns=final_df_data['headers'])
        # newline='' leaves line endings to the CSV writer.
        with tempfile.NamedTemporaryFile(mode='w', suffix='.csv', delete=False, encoding='utf-8', newline='') as temp_f:
            # Record the path BEFORE writing so `finally` removes the file even if to_csv fails.
            temp_csv_file = temp_f.name
            # Write through the open handle rather than reopening the path.
            final_results_df.to_csv(temp_f, index=False)
        current_analysis_client = analysis_client
        if use_personal_key and personal_api_key:
            current_analysis_client = Client("C2MV/Project-HF-2025-2", hf_token=personal_api_key)
        if current_analysis_client is None:
            raise RuntimeError("Cliente Project-HF-2025-2 no disponible.")
        chunk_update_dict = current_analysis_client.predict(files=[handle_file(temp_csv_file)], api_name="/update_chunk_column_selector")
        # Use the first offered choice as the chunk column.
        selected_chunk_column = chunk_update_dict['choices'][0][0]
        result = current_analysis_client.predict(files=[handle_file(temp_csv_file)], chunk_column=selected_chunk_column, qwen_model=ia_model, detail_level=detail_level, language=language, additional_specs="", max_output_tokens=max_output_tokens, api_name="/process_files_and_analyze")
        _, analysis_report, implementation_code, token_usage = result
    except Exception as e:
        return gr.update(), final_df_data, None, None, f"❌ Error generando informe IA: {e}", None, log_agent.get_report()
    finally:
        if temp_csv_file and os.path.exists(temp_csv_file):
            os.remove(temp_csv_file)

    progress(0.9, desc="Finalizando...")
    final_report_path = None
    if analysis_report:
        export_dir = "exported_reports"
        os.makedirs(export_dir, exist_ok=True)
        final_report_path = os.path.join(export_dir, f"report_{datetime.now().strftime('%Y%m%d_%H%M%S')}.md")
        with open(final_report_path, 'w', encoding='utf-8') as f:
            f.write(analysis_report)
    final_status = f"✅ Refinamiento y reporte completados.\n{reasoning}\nInforme IA generado con {token_usage}."
    if final_plot_info and 'plot' in final_plot_info:
        final_fig = go.Figure(json.loads(final_plot_info['plot']))
    else:
        final_fig = create_dummy_plot()
    return final_fig, final_df_data, analysis_report, implementation_code, final_status, final_report_path, log_agent.get_report()
# ... (create_dummy_excel_file y constantes sin cambios) ...
def create_dummy_excel_file():
    """Create `examples/archivo.xlsx` with synthetic data unless it already exists.

    The 'Datos' sheet holds one four-column table (Tiempo, Biomasa, Sustrato,
    Producto) per experiment, with the experiment name written in row 0
    above the first column of its table.
    """
    examples_dir = "examples"
    os.makedirs(examples_dir, exist_ok=True)
    file_path = os.path.join(examples_dir, "archivo.xlsx")
    if os.path.exists(file_path):
        return

    exp_names = ['CN 20_1', 'CN 20_2', 'CN 30_1', 'CN 40_1']
    writer = pd.ExcelWriter(file_path, engine='xlsxwriter')
    worksheet = writer.book.add_worksheet('Datos')
    writer.sheets['Datos'] = worksheet

    # Row 0: experiment labels, one every four columns.
    for position, label in enumerate(exp_names):
        worksheet.write(0, position * 4, label)

    # Rows 1+: one noisy synthetic data table per experiment.
    for position, _ in enumerate(exp_names):
        time = np.arange(0, 11, 2)
        amplitude = np.random.rand() * 20
        biomass_noise = np.random.rand(len(time)) * 0.2
        biomass = 0.2 + amplitude / (1 + np.exp(4 - 0.5 * time)) + biomass_noise
        substrate = 10 * np.exp(-0.2 * time) + np.random.rand(len(time)) * 0.3
        product = 1 * (1 - np.exp(-0.3 * time)) + np.random.rand(len(time)) * 0.1
        table = pd.DataFrame({'Tiempo': time, 'Biomasa': biomass, 'Sustrato': substrate, 'Producto': product})
        table.to_excel(writer, sheet_name='Datos', startrow=1, startcol=position * 4, index=False)

    writer.close()
# Model names offered (and pre-selected) in the "models to evaluate" checkbox group.
BIOTECH_MODELS = ['logistic', 'gompertz', 'moser', 'baranyi', 'monod', 'contois', 'andrews', 'tessier', 'richards', 'stannard', 'huang']
# Choices for the report-model dropdown; the first entry is its default value.
IA_MODELS = ["deepseek-ai/DeepSeek-V3-0324"]
# Gradio theme passed to gr.Blocks below.
theme = gr.themes.Soft(primary_hue="blue", secondary_hue="indigo", neutral_hue="slate")
# Script entry point: creates the example workbook, builds the two-stage UI and launches it.
# NOTE(review): leading indentation is missing in this copy of the file, so the nesting
# of the `with` containers below cannot be confirmed from here.
if __name__ == "__main__":
# Make sure the file referenced by gr.Examples exists before the UI is built.
create_dummy_excel_file()
with gr.Blocks(theme=theme, title="BioTech Analysis & Report Generator") as demo:
gr.Markdown("# 🧬 BioTech Analysis & Report Generator v7.8")
gr.Markdown("### Un pipeline inteligente de dos etapas: Análisis Base y Refinamiento con IA.")
# State written by stage 1 (run_base_analysis) and read by stage 2 (refine_and_generate_report).
baseline_results_state = gr.State(value=None); file_path_state = gr.State(value=None); original_params_state = gr.State(value=None)
with gr.Row():
with gr.Column(scale=1):
# --- Stage 1 inputs ---
gr.Markdown("### 1. Carga y Configuración del Análisis Base")
file_input = gr.File(label="📁 Archivo de Datos", file_types=[".xlsx", ".xls"]); gr.Examples(examples=["examples/archivo.xlsx"], inputs=[file_input])
# Starts non-interactive; detect_experiments fills and enables it after an upload.
exp_names_input = gr.CheckboxGroup(label="🔬 Experimentos a Analizar", interactive=False)
models_input = gr.CheckboxGroup(choices=BIOTECH_MODELS, value=BIOTECH_MODELS, label="📊 Modelos a Evaluar")
component_input = gr.Dropdown(['all', 'biomass', 'substrate', 'product'], value='all', label="📈 Componente a Analizar/Filtrar")
with gr.Accordion("Parámetros Avanzados", open=False):
use_de_input = gr.Checkbox(label="🧮 Usar Evolución Diferencial", value=False)
maxfev_input = gr.Slider(label="🔄 Máx. Iteraciones", minimum=10000, maximum=100000, value=50000, step=1000)
run_base_analysis_btn = gr.Button("1. Ejecutar Análisis Base", variant="secondary")
with gr.Group():
# --- Stage 2 inputs ---
gr.Markdown("### 2. Refinamiento con IA")
gr.Markdown("#### Criterios de Selección Primarios (Filtro Estricto)")
r2_threshold_slider = gr.Slider(minimum=0.0, maximum=0.99, value=0.9, step=0.01, label="R² Mínimo")
rmse_threshold_input = gr.Number(value=0.5, label="RMSE Máximo")
# --- NEW INSTRUCTIONS INPUT ---
# Free text passed to refine_and_generate_report as `instructions_text`.
additional_specs_input = gr.Textbox(label="📝 Instrucciones para Selección Avanzada (Plan B)",
placeholder="Ej: Usa R2 y RMSE y dame el top 2",
info="Si ningún modelo cumple los criterios de arriba, el agente seguirá estas instrucciones.")
with gr.Accordion("Parámetros del Informe de IA Final", open=False):
ia_model_input = gr.Dropdown(choices=IA_MODELS, value=IA_MODELS[0], label="🤖 Modelo de IA para Informe")
detail_level_input = gr.Radio(['detailed', 'summarized'], value='detailed', label="📋 Nivel de Detalle")
language_input = gr.Dropdown(['es', 'en'], value='es', label="🌐 Idioma")
max_output_tokens_input = gr.Slider(minimum=1000, maximum=32000, value=8000, step=100, label="🔢 Máx. Tokens")
use_personal_key_input = gr.Checkbox(label="Usar Token HF Personal", value=False)
# Hidden until the checkbox above is ticked (see the .change handler below).
personal_api_key_input = gr.Textbox(label="Token HF", type="password", visible=False)
# Disabled at start; run_base_analysis returns an update that enables it on success.
refine_with_ia_btn = gr.Button("2. 🤖 Aplicar Filtro y Generar Informe IA", variant="primary", interactive=False)
with gr.Column(scale=2):
# --- Outputs ---
gr.Markdown("### 3. Resultados")
status_output = gr.Textbox(label="📊 Registro de Estado", lines=5, interactive=False)
with gr.Tabs():
with gr.TabItem("📊 Visualización"): plot_output = gr.Plot()
with gr.TabItem("📋 Tabla de Modelado"): table_output = gr.Dataframe()
with gr.TabItem("📝 Informe IA"): analysis_output = gr.Markdown("El informe aparecerá aquí.")
with gr.TabItem("💻 Código"): code_output = gr.Code(language="python")
with gr.TabItem("🕵️ Registro de Agentes"): agent_log_output = gr.Markdown()
download_link_markdown = gr.Markdown("*El enlace de descarga aparecerá aquí.*")
# NOTE(review): report_output is not listed in any `outputs=` below; the download is
# exposed through download_link_markdown instead — confirm whether this component is needed.
report_output = gr.File(label="📥 Descargar Informe", interactive=False)
# Receives the saved report path from stage 2; its change event refreshes the download link.
report_path_state = gr.State(value=None)
# --- Event wiring ---
file_input.upload(fn=detect_experiments, inputs=file_input, outputs=exp_names_input)
use_personal_key_input.change(lambda x: gr.update(visible=x), inputs=use_personal_key_input, outputs=personal_api_key_input)
# The outputs list matches, in order, the 8-tuple returned by run_base_analysis.
run_base_analysis_btn.click(
fn=run_base_analysis,
inputs=[file_input, models_input, exp_names_input, component_input, use_de_input, maxfev_input],
outputs=[plot_output, table_output, status_output, refine_with_ia_btn, baseline_results_state, file_path_state, original_params_state, agent_log_output]
)
# The outputs list matches, in order, the 7-tuple returned by refine_and_generate_report.
refine_with_ia_btn.click(
fn=refine_and_generate_report,
inputs=[baseline_results_state, file_path_state, original_params_state, r2_threshold_slider, rmse_threshold_input, additional_specs_input, ia_model_input, detail_level_input, language_input, max_output_tokens_input, use_personal_key_input, personal_api_key_input],
outputs=[plot_output, table_output, analysis_output, code_output, status_output, report_path_state, agent_log_output]
)
# Builds the Markdown download link for an existing report file, or a fallback notice.
def update_dl_link(path):
if path and os.path.exists(path): return f"**¡Informe listo!** 👉 [**Descargar '{os.path.basename(path)}'**](/file={path})"
return "*No se generó ningún archivo para descargar.*"
report_path_state.change(fn=update_dl_link, inputs=report_path_state, outputs=download_link_markdown)
demo.launch(show_error=True, debug=True)