Project-HF-2025-4

Sleeping

App Files Files Community

Project-HF-2025-4 / app.py

C2MV

Update app.py

b9fbc5e verified 16 days ago

raw

history blame contribute delete

24.9 kB

	"""
	CÓDIGO COMPLETO Y CORREGIDO - VERSIÓN 7.8 (Agente de Instrucciones de Lenguaje Natural)
	- MEJORA MAYOR: Se ha creado un nuevo `InstructionParsingAgent` que interpreta las instrucciones
	en lenguaje natural del usuario desde el cuadro de "Especificaciones Adicionales".
	- FUNCIONALIDAD AVANZADA: El usuario puede especificar qué métricas usar (R2, RMSE, o ambas) y
	cuántos "Top N" modelos seleccionar, simplemente escribiéndolo.
	- MODELSELECTIONAGENT MEJORADO: La lógica del "Plan B" ahora es dinámica y se basa en las
	instrucciones parseadas, calculando un score combinado si se especifican múltiples métricas.
	- UI SIMPLIFICADA: Se han eliminado los controles estáticos de ranking, reemplazados por el
	cuadro de texto de instrucciones, haciendo la interfaz más limpia y potente.
	"""

	import gradio as gr
	from gradio_client import Client, handle_file
	import pandas as pd
	import json
	import tempfile
	import os
	import re
	from datetime import datetime
	import plotly.graph_objects as go
	import logging
	import numpy as np
	from smolagents import CodeAgent, InferenceClientModel

	# --- CONFIGURATION AND CLIENTS ---
	logging.basicConfig(level=logging.INFO)
	logger = logging.getLogger(__name__)

	# --- LANGUAGE MODEL INITIALISATION FOR THE AGENTS ---
	# Every external handle below is optional at import time: when construction
	# fails the name is bound to None so the module still loads.
	try:
	    hf_engine = InferenceClientModel(model_id="deepseek-ai/DeepSeek-V2-Lite-Instruct")
	    logger.info("✅ Modelo de lenguaje (DeepSeek-V2-Lite) inicializado para agentes.")
	except Exception:
	    hf_engine = None
	    # exc_info keeps the root cause in the log instead of discarding it.
	    logger.error("❌ No se pudo inicializar el modelo de lenguaje para agentes.", exc_info=True)

	# `except Exception` (not a bare `except`) so KeyboardInterrupt/SystemExit
	# still propagate, and the failure is logged instead of vanishing.
	try:
	    biotech_client = Client("C2MV/BiotechU4")
	    logger.info("✅ Cliente BiotechU4 inicializado.")
	except Exception as exc:
	    biotech_client = None
	    logger.error("❌ No se pudo inicializar el cliente BiotechU4: %s", exc)

	try:
	    analysis_client = Client("C2MV/Project-HF-2025-2")
	    logger.info("✅ Cliente Project-HF-2025-2 inicializado.")
	except Exception as exc:
	    analysis_client = None
	    logger.error("❌ No se pudo inicializar el cliente Project-HF-2025-2: %s", exc)

	# ============================================================================
	# 🤖 SISTEMA DE AGENTES (LÓGICA ACTUALIZADA)
	# ============================================================================

	class LoggingAgent:
	    """Collect timestamped activity entries and render them as a Markdown report.

	    Attributes:
	        log_entries: list of already formatted entry strings, oldest first.
	        start_time: moment the agent was created or last cleared; used to
	            compute the total elapsed time shown in the report.
	    """

	    def __init__(self):
	        self.log_entries, self.start_time = [], datetime.now()
	        logger.info("🕵️ LoggingAgent activado.")

	    def register(self, agent_name, action, details=""):
	        """Append one entry of the form ``HH:MM:SS \\| agent: action``.

	        Args:
	            agent_name: label of the component reporting the action.
	            action: short description of what happened.
	            details: optional extra text, appended on its own quoted line.
	        """
	        # "\\|" emits a literal backslash + pipe (same text as before) without
	        # relying on the invalid escape sequence "\|".
	        entry = f"{datetime.now().strftime('%H:%M:%S')} \\| {agent_name}: {action}"
	        if details:
	            entry += f"\n> Detalles: {details}"
	        self.log_entries.append(entry)

	    def get_report(self):
	        """Return the Markdown report, or a placeholder when nothing was logged."""
	        if not self.log_entries:
	            return "### 🕵️ Informe de Actividad\n\nNo se registraron actividades."
	        elapsed = (datetime.now() - self.start_time).total_seconds()
	        return (
	            "### 🕵️ Informe de Actividad de Agentes\n\n---\n\n"
	            + "\n\n---\n\n".join(self.log_entries)
	            + f"\n\n---\n\nTiempo total: {elapsed:.2f} s."
	        )

	    def clear(self):
	        """Drop all entries and restart the elapsed-time clock."""
	        self.log_entries, self.start_time = [], datetime.now()


	class StructureValidationAgent:
	    """Check that an uploaded file can be read as a non-empty table."""

	    # Suffixes routed to the Excel reader; anything else is read as CSV.
	    _EXCEL_SUFFIXES = ('.xls', '.xlsx')

	    def __init__(self, log_agent: "LoggingAgent"):
	        self.log_agent = log_agent

	    def validate(self, file_obj):
	        """Try to load ``file_obj`` and report whether it holds any rows.

	        Args:
	            file_obj: object exposing the file path as ``.name``.

	        Returns:
	            ``(ok, message)``: ``ok`` is False when the file cannot be read
	            or is empty; ``message`` is the user-facing explanation.
	        """
	        try:
	            path = file_obj.name
	            # Lower-cased comparison so "DATA.XLSX" is not mistaken for CSV.
	            if path.lower().endswith(self._EXCEL_SUFFIXES):
	                df = pd.read_excel(path, header=1)
	            else:
	                df = pd.read_csv(path)
	        except Exception as e:
	            # Deliberately broad: any read failure is reported to the user.
	            return False, f"Error crítico al leer el archivo: {e}"
	        if df.empty:
	            return False, "Validación fallida: El archivo está vacío."
	        self.log_agent.register("StructureValidationAgent", "Validación de formato de archivo superada.")
	        return True, "Formato de archivo básico validado."


	class InstructionParsingAgent:
	    """Turn the user's natural-language request into a structured ranking plan.

	    The plan is a dict ``{'metrics': [...], 'top_n': int}`` where ``metrics``
	    only contains "R2" and/or "RMSE" and ``top_n`` is at least 1.
	    """

	    _ALLOWED_METRICS = ('R2', 'RMSE')
	    _DEFAULT_TOP_N = 3

	    def __init__(self, log_agent: "LoggingAgent", llm_engine):
	        self.log_agent = log_agent
	        # Without an engine the agent stays None and parse() returns defaults.
	        self.agent = CodeAgent(tools=[], model=llm_engine) if llm_engine else None

	    @classmethod
	    def _normalize(cls, instructions):
	        """Validate the decoded LLM answer and coerce it to the plan schema.

	        Raises:
	            ValueError: if the answer is not a dict with both expected keys.
	        """
	        if not isinstance(instructions, dict) or 'metrics' not in instructions or 'top_n' not in instructions:
	            raise ValueError("JSON de salida no contiene las claves esperadas.")

	        raw_metrics = instructions['metrics']
	        if isinstance(raw_metrics, str):
	            # A bare string would otherwise be iterated character by character.
	            raw_metrics = [raw_metrics]
	        metrics = []
	        for item in raw_metrics or []:
	            name = str(item).strip().upper()
	            if name in cls._ALLOWED_METRICS and name not in metrics:
	                metrics.append(name)
	        if not metrics:
	            metrics = ['R2']

	        try:
	            top_n = int(instructions['top_n'])
	        except (TypeError, ValueError):
	            top_n = cls._DEFAULT_TOP_N
	        if top_n < 1:
	            top_n = cls._DEFAULT_TOP_N
	        return {'metrics': metrics, 'top_n': top_n}

	    def parse(self, text: str):
	        """Return the ranking plan for ``text``, falling back to defaults.

	        Defaults (``{'metrics': ['R2'], 'top_n': 3}``) are returned when no
	        LLM is available, when ``text`` is blank, or when the LLM answer
	        cannot be decoded or validated. Every outcome is recorded through
	        the logging agent.
	        """
	        default_instructions = {'metrics': ['R2'], 'top_n': 3}
	        if not self.agent:
	            self.log_agent.register("InstructionParsingAgent", "LLM no disponible, usando defaults.")
	            return default_instructions

	        user_text = (text or "").strip()
	        if not user_text:
	            # The prompt maps "nothing requested" to the defaults anyway, so
	            # skip the remote call entirely.
	            self.log_agent.register("InstructionParsingAgent", "Sin instrucciones del usuario, usando defaults.")
	            return default_instructions

	        # NOTE(review): user text is interpolated verbatim into the prompt of a
	        # CodeAgent; confirm how that agent is sandboxed before exposing this
	        # to untrusted users.
	        prompt = f"""
	Analyze the user's instruction for a data analysis task. Extract the metrics they want to use for ranking and the number of top models to select.
	The possible metrics are "R2", "RMSE".
	Your output MUST be ONLY a valid JSON object with two keys: "metrics" (a list of strings) and "top_n" (an integer).

	- If the user mentions "R2" or "R-cuadrado", include "R2" in the metrics list.
	- If the user mentions "RMSE", include "RMSE" in the metrics list.
	- If the user mentions a number like "top 3", "los 2 mejores", or just a digit, set "top_n" to that number.
	- If no metrics are mentioned, default to ["R2"].
	- If no number is mentioned, default to 3.

	Example 1: "Usa el promedio de R2 y RMSE y elige el top 2" -> {{"metrics": ["R2", "RMSE"], "top_n": 2}}
	Example 2: "dame los 3 mejores modelos segun el menor RMSE" -> {{"metrics": ["RMSE"], "top_n": 3}}
	Example 3: "el mejor R2" -> {{"metrics": ["R2"], "top_n": 1}}
	Example 4: "analizar los datos" -> {{"metrics": ["R2"], "top_n": 3}}

	User instruction: "{user_text}"
	JSON Output:
	"""
	        try:
	            response = self.agent.run(prompt)
	            if isinstance(response, dict):
	                # The agent may hand back an already decoded object.
	                decoded = response
	            else:
	                response_str = str(response)
	                start, end = response_str.find('{'), response_str.rfind('}')
	                if start == -1 or end < start:
	                    raise ValueError("La respuesta del LLM no contiene un objeto JSON.")
	                decoded = json.loads(response_str[start:end + 1])
	            instructions = self._normalize(decoded)
	            self.log_agent.register("InstructionParsingAgent", "Instrucciones del usuario parseadas con éxito.")
	            return instructions
	        except Exception as e:
	            # Best effort by design: any failure degrades to the default plan.
	            self.log_agent.register("InstructionParsingAgent", "Error parseando instrucciones, usando defaults.", f"Error: {e}")
	            return default_instructions


	class ModelSelectionAgent:
	    """Agent 3 (configurable fallback): pick the best models, with a customizable plan B."""

	    def __init__(self, log_agent: "LoggingAgent"):
	        self.log_agent = log_agent

	    def _find_column(self, df_columns, possible_names):
	        """Return the first column matching any candidate name, case-insensitively.

	        Candidates are tried in order; ``None`` is returned when none match.
	        """
	        for name in possible_names:
	            wanted = name.lower()
	            for col in df_columns:
	                if str(col).lower() == wanted:
	                    return col
	        return None

	    @staticmethod
	    def _coerce_top_n(value, default=3):
	        """Return ``value`` as an int >= 1, or ``default`` when that is impossible."""
	        try:
	            top_n = int(value)
	        except (TypeError, ValueError):
	            return default
	        return top_n if top_n >= 1 else default

	    def identify_best_models(self, results_df, component, r2_threshold, rmse_threshold, instructions: dict):
	        """Select models by strict thresholds, or by ranking when none qualify.

	        Args:
	            results_df: table with a model-name column ('model'/'modelo', any
	                case) and ``r2_<component>`` / ``rmse_<component>`` columns.
	            component: component suffix to evaluate, or 'all' to average every
	                ``r2_*`` and every ``rmse_*`` column per row.
	            r2_threshold: minimum mean R2 for the strict filter.
	            rmse_threshold: maximum mean RMSE for the strict filter.
	            instructions: plan B settings, ``{'metrics': [...], 'top_n': int}``.
	                Missing or malformed entries fall back to R2 / top 3.

	        Returns:
	            ``(models, reasoning)``: lower-cased model names sorted
	            alphabetically plus a user-facing explanation. ``models`` is
	            empty when the required columns are missing.
	        """
	        self.log_agent.register("ModelSelectionAgent", f"Iniciando identificación para: '{component}'.")

	        # 1. Normalize the model column name.
	        model_col = self._find_column(results_df.columns, ['model', 'modelo'])
	        if not model_col:
	            return [], "Error: No se encontró la columna de nombres de modelos ('Model')."
	        df_norm = results_df.rename(columns={model_col: 'Model'})

	        # 2. Locate (or build) the metric columns. Values are coerced to
	        #    numbers so string-typed cells cannot break the aggregation;
	        #    unparseable cells become NaN.
	        missing_metrics = f"Error: No se encontraron las métricas para el componente '{component}'."
	        if component != 'all':
	            r2_target_col = self._find_column(df_norm.columns, [f'r2_{component}'])
	            rmse_target_col = self._find_column(df_norm.columns, [f'rmse_{component}'])
	            if r2_target_col is None or rmse_target_col is None:
	                return [], missing_metrics
	            targets = [r2_target_col, rmse_target_col]
	            df_norm[targets] = df_norm[targets].apply(pd.to_numeric, errors='coerce')
	        else:
	            metric_cols_r2 = [c for c in df_norm.columns if 'r2_' in str(c).lower()]
	            metric_cols_rmse = [c for c in df_norm.columns if 'rmse_' in str(c).lower()]
	            if not metric_cols_r2 or not metric_cols_rmse:
	                return [], missing_metrics
	            numeric = df_norm[metric_cols_r2 + metric_cols_rmse].apply(pd.to_numeric, errors='coerce')
	            r2_target_col, rmse_target_col = 'R2_Avg', 'RMSE_Avg'
	            df_norm[r2_target_col] = numeric[metric_cols_r2].mean(axis=1, skipna=True)
	            df_norm[rmse_target_col] = numeric[metric_cols_rmse].mean(axis=1, skipna=True)

	        # 3. Average both metrics per model.
	        model_performance = df_norm.groupby('Model').agg({r2_target_col: 'mean', rmse_target_col: 'mean'}).reset_index()

	        # 4. Attempt 1: strict filter on both thresholds.
	        passes = (model_performance[r2_target_col] >= r2_threshold) & (model_performance[rmse_target_col] <= rmse_threshold)
	        good_models_df = model_performance[passes]
	        if not good_models_df.empty:
	            best_models_list = sorted(str(model).lower() for model in good_models_df['Model'].tolist())
	            reasoning = f"Agente identificó {len(best_models_list)} modelo(s) que cumplen tus criterios: `{', '.join(best_models_list)}`."
	            return best_models_list, reasoning

	        # 5. Attempt 2: plan B, ranking driven by the parsed instructions.
	        self.log_agent.register("ModelSelectionAgent", "Filtro primario falló. Activando fallback: 'Ranking por Instrucciones'.", f"Plan: {instructions}")

	        plan = instructions if isinstance(instructions, dict) else {}
	        requested = plan.get('metrics') or []
	        if isinstance(requested, str):
	            requested = [requested]
	        requested = {str(metric).strip().upper() for metric in requested}
	        use_r2 = 'R2' in requested
	        use_rmse = 'RMSE' in requested
	        # A non-positive top_n would make head() drop rows from the end.
	        top_n = self._coerce_top_n(plan.get('top_n'))

	        if use_r2 and use_rmse:
	            # Small epsilon avoids division by zero for a perfect RMSE.
	            model_performance['Score'] = model_performance[r2_target_col] / (model_performance[rmse_target_col] + 1e-9)
	            sort_col, ascending, metric_name = 'Score', False, "R²/RMSE combinado"
	        elif use_rmse:
	            sort_col, ascending, metric_name = rmse_target_col, True, "RMSE"
	        else:  # R2 is the default criterion.
	            sort_col, ascending, metric_name = r2_target_col, False, "R²"

	        top_n_df = model_performance.sort_values(by=sort_col, ascending=ascending).head(top_n)

	        best_models_list = sorted(str(model).lower() for model in top_n_df['Model'].tolist())
	        reasoning = (f"Advertencia: Ningún modelo cumplió con los criterios iniciales.\n\n"
	                     f"Como plan B, el agente ha seleccionado los Top {len(best_models_list)} modelos con el mejor {metric_name} promedio: `{', '.join(best_models_list)}`.")
	        return best_models_list, reasoning

	# --- GLOBAL AGENT INSTANCES ---
	# One shared LoggingAgent is handed to every other agent.
	log_agent = LoggingAgent()
	validation_agent = StructureValidationAgent(log_agent)
	instruction_parser_agent = InstructionParsingAgent(log_agent, hf_engine)
	model_selection_agent = ModelSelectionAgent(log_agent)

	# --- FUNCIONES DEL PIPELINE ---
	def create_dummy_plot(title="Esperando resultados..."):
	    """Build an empty placeholder figure carrying ``title`` and a hint annotation."""
	    hint = dict(text="Sube un archivo y ejecuta", showarrow=False)
	    placeholder = go.Figure(go.Scatter(x=[], y=[]))
	    placeholder.update_layout(
	        title=title,
	        template="plotly_white",
	        height=500,
	        annotations=[hint],
	    )
	    return placeholder

	def detect_experiments(file_obj):
	    """Read experiment names from the first row of an Excel upload.

	    Args:
	        file_obj: uploaded file exposing its path as ``.name``; may be falsy.

	    Returns:
	        A ``gr.update`` for the experiment selector: every non-empty cell of
	        the first row becomes a pre-selected choice. With no file, or when
	        the file cannot be read, the choices are cleared.
	    """
	    if not file_obj:
	        return gr.update(choices=[], value=[])
	    try:
	        first_row = pd.read_excel(file_obj.name, header=None, nrows=1)
	        exp_names = [str(name).strip() for name in first_row.iloc[0].dropna().tolist()]
	    except Exception as e:
	        # NOTE(review): `placeholder` is not passed to any other CheckboxGroup
	        # call in this file, so it may not be displayed; confirm against the
	        # Gradio version in use. The error is logged so it is never lost.
	        logger.error("No se pudieron detectar los experimentos: %s", e)
	        return gr.update(choices=[], value=[], interactive=False, placeholder=f"Error: {e}")
	    return gr.update(choices=exp_names, value=exp_names, interactive=True)

	# ... (ETAPA 1: run_base_analysis - sin cambios) ...
	def run_base_analysis(file, models, exp_names_selected, component, use_de, maxfev, progress=gr.Progress()):
	    """Stage 1: validate the upload and run the remote base analysis.

	    Args:
	        file: uploaded file exposing its path as ``.name``.
	        models: model names to evaluate (sent lower-cased).
	        exp_names_selected: experiment names, sent as a comma-joined string.
	        component: component selector forwarded to the remote endpoint.
	        use_de: flag forwarded to the remote endpoint.
	        maxfev: iteration cap forwarded to the remote endpoint.
	        progress: Gradio progress tracker.

	    Returns:
	        An 8-tuple: figure, table data, status text, update for the
	        refinement button, baseline results dict, file path, original
	        parameters dict, agent log report. On any failure the figure is a
	        placeholder, the button stays disabled and the state slots are
	        empty.
	    """
	    def _failure(message):
	        # Single place that builds the 8-slot error result.
	        return create_dummy_plot(), None, message, gr.update(interactive=False), {}, None, None, log_agent.get_report()

	    log_agent.clear()
	    progress(0, desc="🚀 Iniciando Análisis Base...")
	    if not file or not models or not exp_names_selected:
	        return _failure("❌ Por favor, sube un archivo y selecciona modelos/experimentos.")

	    log_agent.register("Pipeline (Etapa 1)", "Iniciando Análisis Base.")
	    progress(0.2, desc="Validando archivo...")
	    is_valid, msg = validation_agent.validate(file)
	    if not is_valid:
	        return _failure(msg)

	    progress(0.5, desc="Ejecutando análisis biotecnológico...")
	    if not biotech_client:
	        return _failure("❌ Cliente BiotechU4 no disponible.")

	    try:
	        exp_names_str = ",".join(exp_names_selected)
	        models_lower = [str(m).lower() for m in models]
	        plot_info, df_data, status = biotech_client.predict(
	            file=handle_file(file.name),
	            models=models_lower,
	            component=component,
	            use_de=use_de,
	            maxfev=maxfev,
	            exp_names=exp_names_str,
	            api_name="/run_analysis_wrapper",
	        )
	        if "Error" in status:
	            raise Exception(status)
	        # Unpacked inside the try so a malformed response is reported through
	        # the normal error result instead of raising.
	        results_df_obj = {'data': df_data['data'], 'headers': df_data['headers']}
	        fig = go.Figure(json.loads(plot_info['plot'])) if plot_info and 'plot' in plot_info else create_dummy_plot()
	    except Exception as e:
	        return _failure(f"❌ Error en Análisis Base: {e}")

	    progress(1, desc="🎉 Análisis Base Completado")
	    final_status = "✅ Análisis Base completado. \n➡️ Ahora puedes aplicar el filtro de IA y generar el informe final."
	    original_params = {'exp_names': exp_names_selected, 'component': component, 'use_de': use_de, 'maxfev': maxfev}
	    return fig, df_data, final_status, gr.update(interactive=True), results_df_obj, file.name, original_params, log_agent.get_report()

	# --- ETAPA 2: REFINAMIENTO Y REPORTE IA (ACTUALIZADA) ---
	def refine_and_generate_report(baseline_results, file_path, original_params, r2_threshold, rmse_threshold, instructions_text, ia_model, detail_level, language, max_output_tokens, use_personal_key, personal_api_key, progress=gr.Progress()):
	    """Stage 2: select the best models, re-run the analysis and request the AI report.

	    Args:
	        baseline_results: dict with 'data' and 'headers' from stage 1.
	        file_path: path of the originally uploaded file.
	        original_params: dict with 'exp_names', 'component', 'use_de', 'maxfev'.
	        r2_threshold: minimum R2 for the strict filter.
	        rmse_threshold: maximum RMSE for the strict filter.
	        instructions_text: free-text plan B instructions from the user.
	        ia_model, detail_level, language, max_output_tokens: forwarded to the
	            report endpoint.
	        use_personal_key: when true and ``personal_api_key`` is set, a
	            dedicated client is created with that token.
	        personal_api_key: Hugging Face token for that client.
	        progress: Gradio progress tracker.

	    Returns:
	        A 7-tuple: figure (or a no-op update), table data, report text,
	        implementation code, status text, saved report path, agent log.
	    """
	    def _abort(message, table=None):
	        # Error result: plot untouched, no report, no code, no file.
	        return gr.update(), table, None, None, message, None, log_agent.get_report()

	    progress(0, desc="🚀 Iniciando Refinamiento con IA...")
	    log_agent.register("Pipeline (Etapa 2)", "Iniciando Refinamiento.")
	    if not baseline_results or not file_path or not original_params:
	        return _abort("❌ No hay resultados base para refinar.")

	    progress(0.1, desc="Agente de Parseo interpretando instrucciones...")
	    instructions = instruction_parser_agent.parse(instructions_text)
	    log_agent.register("InstructionParsingAgent", "Instrucciones interpretadas.", f"Plan: {instructions}")

	    progress(0.2, desc="Agente de Selección identificando mejores modelos...")
	    results_df = pd.DataFrame(baseline_results['data'], columns=baseline_results['headers'])
	    best_models, reasoning = model_selection_agent.identify_best_models(results_df, original_params['component'], r2_threshold, rmse_threshold, instructions)
	    if not best_models:
	        return _abort(f"🤖 Análisis del Agente:\n{reasoning}", table=baseline_results)

	    progress(0.4, desc="Re-ejecutando análisis con los mejores modelos...")
	    if not biotech_client:
	        # Explicit message instead of an AttributeError on None.
	        return _abort("❌ Cliente BiotechU4 no disponible.")
	    try:
	        exp_names_str = ",".join(original_params['exp_names'])
	        final_plot_info, final_df_data, final_status = biotech_client.predict(
	            file=handle_file(file_path),
	            models=best_models,
	            component=original_params['component'],
	            use_de=original_params['use_de'],
	            maxfev=original_params['maxfev'],
	            exp_names=exp_names_str,
	            api_name="/run_analysis_wrapper",
	        )
	        if "Error" in final_status:
	            raise Exception(final_status)
	    except Exception as e:
	        return _abort(f"❌ Error en el re-análisis final: {e}")

	    progress(0.6, desc="Generando informe IA...")
	    temp_csv_file = None
	    try:
	        final_results_df = pd.DataFrame(final_df_data['data'], columns=final_df_data['headers'])
	        with tempfile.NamedTemporaryFile(mode='w+', suffix='.csv', delete=False, encoding='utf-8', newline='') as temp_f:
	            # Record the path before writing so `finally` removes the file
	            # even if the write fails, and write through the open handle
	            # rather than reopening the file by name.
	            temp_csv_file = temp_f.name
	            final_results_df.to_csv(temp_f, index=False)

	        current_analysis_client = analysis_client
	        if use_personal_key and personal_api_key:
	            current_analysis_client = Client("C2MV/Project-HF-2025-2", hf_token=personal_api_key)
	        if current_analysis_client is None:
	            raise RuntimeError("Cliente Project-HF-2025-2 no disponible.")

	        chunk_update_dict = current_analysis_client.predict(files=[handle_file(temp_csv_file)], api_name="/update_chunk_column_selector")
	        selected_chunk_column = chunk_update_dict['choices'][0][0]
	        result = current_analysis_client.predict(
	            files=[handle_file(temp_csv_file)],
	            chunk_column=selected_chunk_column,
	            qwen_model=ia_model,
	            detail_level=detail_level,
	            language=language,
	            additional_specs="",
	            max_output_tokens=max_output_tokens,
	            api_name="/process_files_and_analyze",
	        )
	        _, analysis_report, implementation_code, token_usage = result
	    except Exception as e:
	        return _abort(f"❌ Error generando informe IA: {e}", table=final_df_data)
	    finally:
	        if temp_csv_file and os.path.exists(temp_csv_file):
	            os.remove(temp_csv_file)

	    progress(0.9, desc="Finalizando...")
	    final_report_path = None
	    if analysis_report:
	        export_dir = "exported_reports"
	        os.makedirs(export_dir, exist_ok=True)
	        final_report_path = os.path.join(export_dir, f"report_{datetime.now().strftime('%Y%m%d_%H%M%S')}.md")
	        with open(final_report_path, 'w', encoding='utf-8') as f:
	            f.write(analysis_report)

	    final_status = f"✅ Refinamiento y reporte completados.\n{reasoning}\nInforme IA generado con {token_usage}."
	    final_fig = go.Figure(json.loads(final_plot_info['plot'])) if final_plot_info and 'plot' in final_plot_info else create_dummy_plot()
	    return final_fig, final_df_data, analysis_report, implementation_code, final_status, final_report_path, log_agent.get_report()

	# ... (create_dummy_excel_file y constantes sin cambios) ...
	def create_dummy_excel_file():
	    """Create ``examples/archivo.xlsx`` with synthetic data if it does not exist.

	    The sheet 'Datos' holds one four-column table per experiment (Tiempo,
	    Biomasa, Sustrato, Producto) starting on row 1, with the experiment
	    name written in row 0 above the first column of each table. Values
	    are random, so the file differs between runs.
	    """
	    examples_dir = "examples"
	    os.makedirs(examples_dir, exist_ok=True)
	    file_path = os.path.join(examples_dir, "archivo.xlsx")
	    if os.path.exists(file_path):
	        return

	    exp_names = ['CN 20_1', 'CN 20_2', 'CN 30_1', 'CN 40_1']
	    try:
	        # Context manager guarantees the writer is closed even on error.
	        with pd.ExcelWriter(file_path, engine='xlsxwriter') as writer:
	            worksheet = writer.book.add_worksheet('Datos')
	            writer.sheets['Datos'] = worksheet
	            for i, name in enumerate(exp_names):
	                worksheet.write(0, i * 4, name)
	            for start_col in range(0, 4 * len(exp_names), 4):
	                # `time_points` rather than `time` to avoid shadowing the
	                # stdlib module name.
	                time_points = np.arange(0, 11, 2)
	                biomass = 0.2 + (np.random.rand() * 20) / (1 + np.exp(4 - 0.5 * time_points)) + np.random.rand(len(time_points)) * 0.2
	                substrate = 10 * np.exp(-0.2 * time_points) + np.random.rand(len(time_points)) * 0.3
	                product = 1 * (1 - np.exp(-0.3 * time_points)) + np.random.rand(len(time_points)) * 0.1
	                df = pd.DataFrame({'Tiempo': time_points, 'Biomasa': biomass, 'Sustrato': substrate, 'Producto': product})
	                df.to_excel(writer, sheet_name='Datos', startrow=1, startcol=start_col, index=False)
	    except Exception:
	        # Do not leave a partial workbook behind: its mere existence would
	        # make every later call skip regeneration.
	        if os.path.exists(file_path):
	            os.remove(file_path)
	        raise

	# Model names offered in the "models to evaluate" selector.
	BIOTECH_MODELS = [
	    'logistic',
	    'gompertz',
	    'moser',
	    'baranyi',
	    'monod',
	    'contois',
	    'andrews',
	    'tessier',
	    'richards',
	    'stannard',
	    'huang',
	]
	# Model identifiers offered for the final AI report.
	IA_MODELS = ["deepseek-ai/DeepSeek-V3-0324"]
	# Visual theme shared by the whole interface.
	theme = gr.themes.Soft(
	    primary_hue="blue",
	    secondary_hue="indigo",
	    neutral_hue="slate",
	)

	if __name__ == "__main__":
	    # Entry point: make sure the example workbook exists, build the two-stage
	    # UI, wire the events and launch the app.
	    create_dummy_excel_file()
	    with gr.Blocks(theme=theme, title="BioTech Analysis & Report Generator") as demo:
	        gr.Markdown("# 🧬 BioTech Analysis & Report Generator v7.8")
	        gr.Markdown("### Un pipeline inteligente de dos etapas: Análisis Base y Refinamiento con IA.")
	        # State passed from stage 1 to stage 2.
	        baseline_results_state = gr.State(value=None)
	        file_path_state = gr.State(value=None)
	        original_params_state = gr.State(value=None)
	        with gr.Row():
	            with gr.Column(scale=1):
	                gr.Markdown("### 1. Carga y Configuración del Análisis Base")
	                file_input = gr.File(label="📁 Archivo de Datos", file_types=[".xlsx", ".xls"])
	                gr.Examples(examples=["examples/archivo.xlsx"], inputs=[file_input])
	                exp_names_input = gr.CheckboxGroup(label="🔬 Experimentos a Analizar", interactive=False)
	                models_input = gr.CheckboxGroup(choices=BIOTECH_MODELS, value=BIOTECH_MODELS, label="📊 Modelos a Evaluar")
	                component_input = gr.Dropdown(['all', 'biomass', 'substrate', 'product'], value='all', label="📈 Componente a Analizar/Filtrar")
	                with gr.Accordion("Parámetros Avanzados", open=False):
	                    use_de_input = gr.Checkbox(label="🧮 Usar Evolución Diferencial", value=False)
	                    maxfev_input = gr.Slider(label="🔄 Máx. Iteraciones", minimum=10000, maximum=100000, value=50000, step=1000)
	                run_base_analysis_btn = gr.Button("1. Ejecutar Análisis Base", variant="secondary")

	                with gr.Group():
	                    gr.Markdown("### 2. Refinamiento con IA")
	                    gr.Markdown("#### Criterios de Selección Primarios (Filtro Estricto)")
	                    r2_threshold_slider = gr.Slider(minimum=0.0, maximum=0.99, value=0.9, step=0.01, label="R² Mínimo")
	                    rmse_threshold_input = gr.Number(value=0.5, label="RMSE Máximo")

	                    # Free-text instructions consumed by the plan B ranking.
	                    additional_specs_input = gr.Textbox(
	                        label="📝 Instrucciones para Selección Avanzada (Plan B)",
	                        placeholder="Ej: Usa R2 y RMSE y dame el top 2",
	                        info="Si ningún modelo cumple los criterios de arriba, el agente seguirá estas instrucciones.",
	                    )

	                    with gr.Accordion("Parámetros del Informe de IA Final", open=False):
	                        ia_model_input = gr.Dropdown(choices=IA_MODELS, value=IA_MODELS[0], label="🤖 Modelo de IA para Informe")
	                        detail_level_input = gr.Radio(['detailed', 'summarized'], value='detailed', label="📋 Nivel de Detalle")
	                        language_input = gr.Dropdown(['es', 'en'], value='es', label="🌐 Idioma")
	                        max_output_tokens_input = gr.Slider(minimum=1000, maximum=32000, value=8000, step=100, label="🔢 Máx. Tokens")
	                        use_personal_key_input = gr.Checkbox(label="Usar Token HF Personal", value=False)
	                        personal_api_key_input = gr.Textbox(label="Token HF", type="password", visible=False)
	                # Disabled until stage 1 succeeds (enabled by run_base_analysis).
	                refine_with_ia_btn = gr.Button("2. 🤖 Aplicar Filtro y Generar Informe IA", variant="primary", interactive=False)
	            with gr.Column(scale=2):
	                gr.Markdown("### 3. Resultados")
	                status_output = gr.Textbox(label="📊 Registro de Estado", lines=5, interactive=False)
	                with gr.Tabs():
	                    with gr.TabItem("📊 Visualización"):
	                        plot_output = gr.Plot()
	                    with gr.TabItem("📋 Tabla de Modelado"):
	                        table_output = gr.Dataframe()
	                    with gr.TabItem("📝 Informe IA"):
	                        analysis_output = gr.Markdown("El informe aparecerá aquí.")
	                    with gr.TabItem("💻 Código"):
	                        code_output = gr.Code(language="python")
	                    with gr.TabItem("🕵️ Registro de Agentes"):
	                        agent_log_output = gr.Markdown()
	                download_link_markdown = gr.Markdown("El enlace de descarga aparecerá aquí.")
	                report_output = gr.File(label="📥 Descargar Informe", interactive=False)
	        report_path_state = gr.State(value=None)

	        file_input.upload(fn=detect_experiments, inputs=file_input, outputs=exp_names_input)
	        # Show the token box only when the personal-token checkbox is ticked.
	        use_personal_key_input.change(lambda x: gr.update(visible=x), inputs=use_personal_key_input, outputs=personal_api_key_input)
	        run_base_analysis_btn.click(
	            fn=run_base_analysis,
	            inputs=[file_input, models_input, exp_names_input, component_input, use_de_input, maxfev_input],
	            outputs=[plot_output, table_output, status_output, refine_with_ia_btn, baseline_results_state, file_path_state, original_params_state, agent_log_output],
	        )
	        refine_with_ia_btn.click(
	            fn=refine_and_generate_report,
	            inputs=[baseline_results_state, file_path_state, original_params_state, r2_threshold_slider, rmse_threshold_input, additional_specs_input, ia_model_input, detail_level_input, language_input, max_output_tokens_input, use_personal_key_input, personal_api_key_input],
	            outputs=[plot_output, table_output, analysis_output, code_output, status_output, report_path_state, agent_log_output],
	        )

	        def update_dl_link(path):
	            """Return the Markdown download hint and the file for the download widget."""
	            if path and os.path.exists(path):
	                return f"¡Informe listo! 👉 [Descargar '{os.path.basename(path)}'](/file={path})", path
	            return "No se generó ningún archivo para descargar.", None

	        # The File widget was previously never fed; it now receives the saved
	        # report alongside the Markdown link.
	        report_path_state.change(fn=update_dl_link, inputs=report_path_state, outputs=[download_link_markdown, report_output])

	    demo.launch(show_error=True, debug=True)