# Hugging Face Space source (page status banner at capture time: Sleeping)
import io
import json
import os
import re
import tempfile
import zipfile
from dataclasses import asdict, dataclass
from datetime import datetime
from enum import Enum
from pathlib import Path
from typing import Dict, Generator, List, Optional, Tuple, Union
from xml.sax.saxutils import escape

import gradio as gr
import matplotlib.pyplot as plt
import numpy as np
import openpyxl
import pandas as pd
import PyPDF2
from docx import Document
from docx.enum.text import WD_ALIGN_PARAGRAPH
from docx.shared import Inches, Pt, RGBColor
from openai import OpenAI
from reportlab.lib import colors
from reportlab.lib.pagesizes import A4, letter
from reportlab.lib.styles import ParagraphStyle, getSampleStyleSheet
from reportlab.lib.units import inch
from reportlab.pdfbase import pdfmetrics
from reportlab.pdfbase.ttfonts import TTFont
from reportlab.platypus import PageBreak, Paragraph, SimpleDocTemplate, Spacer, Table, TableStyle
# --- CONFIGURATION AND CONSTANTS ---
# Turn off Gradio analytics through its environment flag.
os.environ['GRADIO_ANALYTICS_ENABLED'] = 'False'

# OpenAI-compatible client pointed at the Nebius AI Studio endpoint. The API
# key is read from the NEBIUS_API_KEY environment variable.
client = OpenAI(
    base_url="https://api.studio.nebius.com/v1/",
    api_key=os.environ.get("NEBIUS_API_KEY"),
)
# UI strings keyed by language code, then by message id. Both languages must
# expose the same set of message ids (includes the grouping-column selector).
TRANSLATIONS = {
    'en': {
        'title': '🧬 Scalable Biotech Model Analyzer',
        'subtitle': 'Analyzes large sets of model fitting results using a chunking strategy',
        'upload_files': '📁 Upload fitting results (CSV/Excel)',
        'chunk_column_label': '🔬 Select Column for Grouping Experiments',
        'chunk_column_info': 'Choose the column that identifies each unique experiment. This is used for chunking.',
        'select_model': '🤖 IA Model (editable)',
        'select_language': '🌐 Language',
        'select_theme': '🎨 Theme',
        'detail_level': '📋 Analysis detail level',
        'detailed': 'Detailed',
        'summarized': 'Summarized',
        'analyze_button': '🚀 Analyze and Compare Models',
        'export_format': '📄 Export format',
        'export_button': '💾 Export Report',
        'comparative_analysis': '📊 Comparative Analysis',
        'implementation_code': '💻 Implementation Code',
        'data_format': '📋 Expected data format',
        'loading': 'Loading...',
        'error_no_api': 'Please configure NEBIUS_API_KEY in HuggingFace Space secrets',
        'error_no_files': 'Please upload fitting result files to analyze',
        'report_exported': 'Report exported successfully as',
        'additional_specs': '📝 Additional specifications for analysis',
        'additional_specs_placeholder': 'Add any specific requirements or focus areas for the analysis...',
        'output_tokens_per_chunk': '🔢 Max output tokens per chunk (1k-32k)',
        'token_info': 'ℹ️ Token usage information',
        'input_token_count': 'Input tokens used',
        'output_token_count': 'Output tokens used',
        'total_token_count': 'Total tokens used',
        'token_cost': 'Estimated cost',
        'thinking_process': '🧠 Thinking Process',
        'analysis_report': '📊 Analysis Report',
        'code_output': '💻 Implementation Code',
        'token_usage': '💰 Token Usage',
    },
    'es': {
        'title': '🧬 Analizador Escalable de Modelos Biotecnológicos',
        'subtitle': 'Analiza grandes conjuntos de datos de ajuste de modelos usando una estrategia por partes',
        'upload_files': '📁 Subir resultados de ajuste (CSV/Excel)',
        'chunk_column_label': '🔬 Seleccionar Columna para Agrupar Experimentos',
        'chunk_column_info': 'Elige la columna que identifica cada experimento único. Se usará para dividir el análisis.',
        'select_model': '🤖 Modelo IA (editable)',
        'select_language': '🌐 Idioma',
        'select_theme': '🎨 Tema',
        'detail_level': '📋 Nivel de detalle del análisis',
        'detailed': 'Detallado',
        'summarized': 'Resumido',
        'analyze_button': '🚀 Analizar y Comparar Modelos',
        'export_format': '📄 Formato de exportación',
        'export_button': '💾 Exportar Reporte',
        'comparative_analysis': '📊 Análisis Comparativo',
        'implementation_code': '💻 Código de Implementación',
        'data_format': '📋 Formato de datos esperado',
        'loading': 'Cargando...',
        'error_no_api': 'Por favor configura NEBIUS_API_KEY en los secretos del Space',
        'error_no_files': 'Por favor sube archivos con resultados de ajuste para analizar',
        'report_exported': 'Reporte exportado exitosamente como',
        'additional_specs': '📝 Especificaciones adicionales para el análisis',
        'additional_specs_placeholder': 'Agregue cualquier requerimiento específico o áreas de enfoque para el análisis...',
        'output_tokens_per_chunk': '🔢 Max tokens de salida por pieza (1k-32k)',
        'token_info': 'ℹ️ Información de uso de tokens',
        'input_token_count': 'Tokens de entrada usados',
        'output_token_count': 'Tokens de salida usados',
        'total_token_count': 'Total de tokens usados',
        'token_cost': 'Costo estimado',
        'thinking_process': '🧠 Proceso de Pensamiento',
        'analysis_report': '📊 Reporte de Análisis',
        'code_output': '💻 Código de Implementación',
        'token_usage': '💰 Uso de Tokens',
    },
}
# Gradio theme objects keyed by the theme names used in the UI.
THEMES = {
    'light': gr.themes.Soft(),
    'dark': gr.themes.Base(),
}
# Per-model limits and pricing used for token budgeting and cost estimation.
# The cost fields are multiplied directly by token counts, i.e. they are
# per-token prices.
QWEN_MODELS = {
    "Qwen/Qwen3-14B": {
        "max_context_tokens": 40960,
        "input_cost": 0.0000007,
        "output_cost": 0.0000021,
    },
    "Qwen/Qwen3-7B": {
        "max_context_tokens": 40960,
        "input_cost": 0.00000035,
        "output_cost": 0.00000105,
    },
    "Qwen/Qwen1.5-14B": {
        "max_context_tokens": 40960,
        "input_cost": 0.0000007,
        "output_cost": 0.0000021,
    },
}
# --- UTILITY CLASSES ---
class FileProcessor:
    """Decode uploaded file contents that are supplied as raw bytes.

    All methods are static: they take no instance state and can be called
    either on the class or on an instance.
    """

    @staticmethod
    def extract_text_from_pdf(pdf_file) -> str:
        """Return the text of every page of a PDF, each followed by a newline.

        Args:
            pdf_file: Raw bytes of the PDF document.

        Returns:
            The concatenated page text, or a string starting with
            ``"Error reading PDF: "`` if the document cannot be processed.
        """
        try:
            pdf_reader = PyPDF2.PdfReader(io.BytesIO(pdf_file))
            # Guard against a page yielding no text (None) so that building
            # the result cannot fail halfway through the document.
            return "".join(
                f"{page.extract_text() or ''}\n" for page in pdf_reader.pages
            )
        except Exception as e:
            return f"Error reading PDF: {str(e)}"

    @staticmethod
    def read_csv(csv_file) -> Optional[pd.DataFrame]:
        """Parse CSV bytes into a DataFrame; return None if parsing fails."""
        try:
            return pd.read_csv(io.BytesIO(csv_file))
        except Exception:
            # Best-effort by design: callers check for None.
            return None

    @staticmethod
    def read_excel(excel_file) -> Optional[pd.DataFrame]:
        """Parse Excel bytes into a DataFrame; return None if parsing fails."""
        try:
            return pd.read_excel(io.BytesIO(excel_file))
        except Exception:
            # Best-effort by design: callers check for None.
            return None

    @staticmethod
    def extract_from_zip(zip_file) -> List[Tuple[str, bytes]]:
        """Return ``(member_name, member_bytes)`` pairs for a ZIP archive.

        Members whose name starts with ``__MACOSX`` are skipped. If the
        archive cannot be read, the error is printed and whatever was
        collected so far (possibly nothing) is returned.

        Args:
            zip_file: Raw bytes of the ZIP archive.
        """
        files = []
        try:
            with zipfile.ZipFile(io.BytesIO(zip_file), 'r') as zip_ref:
                for file_name in zip_ref.namelist():
                    if not file_name.startswith('__MACOSX'):
                        files.append((file_name, zip_ref.read(file_name)))
        except Exception as e:
            print(f"Error processing ZIP: {e}")
        return files
class ReportExporter:
    """Render a Markdown-like report string to DOCX or PDF files.

    Both exporters are static methods and understand the same small subset of
    Markdown, decided line by line: ``#``/``##``/``###`` headings, fully bold
    lines (``**text**``), ``- ``/``* `` bullets and ``---``/``===`` separators.
    """

    # Localised document title and date label; unknown languages fall back to 'en'.
    _TITLE_TEXT = {
        'en': 'Comparative Analysis Report - Biotechnological Models',
        'es': 'Informe de Análisis Comparativo - Modelos Biotecnológicos',
    }
    _DATE_TEXT = {
        'en': 'Generated on',
        'es': 'Generado el',
    }

    @staticmethod
    def export_to_docx(content: str, filename: str, language: str = 'en') -> str:
        """Write *content* to a DOCX file and return its path.

        Args:
            content: Report text, one Markdown-like element per line.
            filename: Destination path of the ``.docx`` file.
            language: Language code for the title and date label.

        Returns:
            ``filename``, once the document has been saved.
        """
        doc = Document()

        # Adjust the built-in title and heading styles of this document.
        title_style = doc.styles['Title']
        title_style.font.size = Pt(24)
        title_style.font.bold = True
        heading_style = doc.styles['Heading 1']
        heading_style.font.size = Pt(18)
        heading_style.font.bold = True

        # Title and generation timestamp.
        title_text = ReportExporter._TITLE_TEXT
        date_text = ReportExporter._DATE_TEXT
        doc.add_heading(title_text.get(language, title_text['en']), 0)
        doc.add_paragraph(f"{date_text.get(language, date_text['en'])}: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}")
        doc.add_paragraph()

        for raw_line in content.split('\n'):
            line = raw_line.strip()
            # Longest heading prefix first. Only the leading marker is removed,
            # so '#' characters inside the heading text are preserved.
            if line.startswith('###'):
                doc.add_heading(line.lstrip('#').strip(), level=2)
            elif line.startswith('##'):
                doc.add_heading(line.lstrip('#').strip(), level=1)
            elif line.startswith('#'):
                doc.add_heading(line.lstrip('#').strip(), level=0)
            elif line.startswith('**') and line.endswith('**'):
                # Whole line in bold.
                run = doc.add_paragraph().add_run(line.replace('**', ''))
                run.bold = True
            elif line.startswith('- ') or line.startswith('* '):
                doc.add_paragraph(line[2:], style='List Bullet')
            elif line.startswith(tuple('0123456789')):
                # Any line starting with a digit is treated as a numbered item.
                doc.add_paragraph(line, style='List Number')
            elif line == '---' or line.startswith('==='):
                doc.add_paragraph('_' * 50)
            elif line:
                doc.add_paragraph(line)

        doc.save(filename)
        return filename

    @staticmethod
    def export_to_pdf(content: str, filename: str, language: str = 'en') -> str:
        """Write *content* to a PDF file and return its path.

        Report text is XML-escaped before being handed to ``Paragraph``,
        because ReportLab parses paragraph text as markup and a literal
        ``<`` or ``&`` would otherwise abort the export.

        Args:
            content: Report text, one Markdown-like element per line.
            filename: Destination path of the ``.pdf`` file.
            language: Language code for the title and date label.

        Returns:
            ``filename``, once the document has been built.
        """
        doc = SimpleDocTemplate(filename, pagesize=letter)
        story = []
        styles = getSampleStyleSheet()

        # Custom styles for the document title and top-level headings.
        title_style = ParagraphStyle(
            'CustomTitle',
            parent=styles['Title'],
            fontSize=24,
            textColor=colors.HexColor('#1f4788'),
            spaceAfter=30,
        )
        heading_style = ParagraphStyle(
            'CustomHeading',
            parent=styles['Heading1'],
            fontSize=16,
            textColor=colors.HexColor('#2e5090'),
            spaceAfter=12,
        )

        # Title and generation timestamp.
        title_text = ReportExporter._TITLE_TEXT
        date_text = ReportExporter._DATE_TEXT
        story.append(Paragraph(title_text.get(language, title_text['en']), title_style))
        story.append(Paragraph(f"{date_text.get(language, date_text['en'])}: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}", styles['Normal']))
        story.append(Spacer(1, 0.5*inch))

        for raw_line in content.split('\n'):
            line = raw_line.strip()
            if not line:
                story.append(Spacer(1, 0.2*inch))
            elif line.startswith('###'):
                story.append(Paragraph(escape(line.lstrip('#').strip()), styles['Heading3']))
            elif line.startswith('##'):
                story.append(Paragraph(escape(line.lstrip('#').strip()), styles['Heading2']))
            elif line.startswith('#'):
                story.append(Paragraph(escape(line.lstrip('#').strip()), heading_style))
            elif line.startswith('**') and line.endswith('**'):
                text = escape(line.replace('**', ''))
                story.append(Paragraph(f"<b>{text}</b>", styles['Normal']))
            elif line.startswith('- ') or line.startswith('* '):
                story.append(Paragraph(f"• {escape(line[2:])}", styles['Normal']))
            elif line == '---' or line.startswith('==='):
                story.append(Spacer(1, 0.3*inch))
                story.append(Paragraph("_" * 70, styles['Normal']))
                story.append(Spacer(1, 0.3*inch))
            else:
                # Replace a few emoji with text tags before writing to the PDF.
                clean_line = line.replace('📊', '[GRAPH]').replace('🎯', '[TARGET]').replace('🔍', '[SEARCH]').replace('💡', '[TIP]')
                story.append(Paragraph(escape(clean_line), styles['Normal']))

        doc.build(story)
        return filename
# --- AIAnalyzer CLASS (accepts a chunk_column) ---
class AIAnalyzer:
    """LLM-backed analysis using a 'chunk-and-stitch' (map-reduce) strategy.

    Each group of rows is analysed on its own into a JSON summary (map), then
    all summaries are merged into one Markdown report (reduce). Token usage
    and estimated cost are accumulated in ``self.token_usage``.
    """

    def __init__(self, client):
        """Store the chat-completions client and zero the usage counters."""
        self.client = client
        self.token_usage = {}
        self.reset_token_usage()

    def reset_token_usage(self):
        """Reset all token counters and the estimated cost to zero."""
        self.token_usage = {'input_tokens': 0, 'output_tokens': 0, 'total_tokens': 0, 'estimated_cost': 0.0}

    def _update_token_usage(self, model_name: str, usage):
        """Add one response's usage to the running totals.

        Models missing from ``QWEN_MODELS`` are counted with zero cost.
        A falsy ``usage`` is ignored.
        """
        if not usage:
            return
        self.token_usage['input_tokens'] += usage.prompt_tokens
        self.token_usage['output_tokens'] += usage.completion_tokens
        self.token_usage['total_tokens'] += usage.total_tokens
        model_info = QWEN_MODELS.get(model_name, {})
        input_cost = model_info.get('input_cost', 0.0)
        output_cost = model_info.get('output_cost', 0.0)
        self.token_usage['estimated_cost'] += (usage.prompt_tokens * input_cost) + (usage.completion_tokens * output_cost)

    def _calculate_safe_max_tokens(self, model_name: str, user_requested_tokens: int) -> int:
        """Clamp the requested output tokens to what the model's context allows.

        A fixed margin is reserved for the prompt; the result is never below
        100. Unknown models are assumed to have a 32768-token context.
        """
        model_info = QWEN_MODELS.get(model_name, {"max_context_tokens": 32768})
        context_limit = model_info['max_context_tokens']
        PROMPT_SAFETY_MARGIN = 8192
        max_allowable_output = context_limit - PROMPT_SAFETY_MARGIN
        return max(100, min(user_requested_tokens, max_allowable_output))

    def _analyze_single_experiment(self, experiment_df: pd.DataFrame, experiment_id: str, qwen_model: str, lang_prefix: str, max_output_tokens: int) -> Optional[Dict]:
        """Analyse one group of rows and return the model's JSON as a dict.

        This is the 'map' step. The whole group is embedded in the prompt via
        ``DataFrame.to_string()`` and the model is asked for a fixed JSON
        structure.

        Returns:
            The parsed JSON object, or None if the API call fails or the
            response is not valid JSON (details are printed).
        """
        # The prompt is deliberately very specific and includes a full example
        # of the expected JSON structure.
        prompt = f"""
{lang_prefix}
You are an expert biotechnological data analyst. Your task is to analyze the provided model fitting results for a single experiment identified as: '{experiment_id}'.
The data contains different mathematical models that were fitted to experimental data for variables like Biomass, Substrate, or Product.
DATA FOR THIS SPECIFIC EXPERIMENT ('{experiment_id}'):
```
{experiment_df.to_string()}
```
YOUR INSTRUCTIONS:
1. **Identify Best Models**: For EACH variable type present in the data (e.g., 'Biomass', 'Substrate'), determine the single best-performing model. The best model is the one with the highest R² value. If R² values are equal, use the lowest RMSE as a tie-breaker.
2. **Extract Key Information**: For each of these best models, you must extract:
- The model's name.
- The specific metrics (R², RMSE, AIC, etc.) as key-value pairs.
- All kinetic parameters and their fitted values (e.g., mu_max, Ks) as key-value pairs.
3. **Summarize All Tested Models**: Create a simple list of the names of ALL models that were tested in this experiment, regardless of their performance.
4. **Provide Biological Interpretation**: Write a brief, concise interpretation (2-3 sentences) of what the results for this specific experiment imply. For example, "The selection of the Monod model for biomass with a µ_max of 0.45 suggests rapid growth under these conditions, while the high R² indicates a strong fit."
**CRITICAL OUTPUT FORMAT**: You MUST respond ONLY with a single, valid JSON object. Do not add any explanatory text, markdown formatting, or anything else before or after the JSON structure.
Follow this EXACT JSON structure:
{{
"experiment_id": "{experiment_id}",
"best_models_by_variable": [
{{
"variable_type": "Biomass",
"model_name": "Name of the best model for Biomass",
"metrics": {{
"R2": 0.99,
"RMSE": 0.01,
"AIC": -50.2
}},
"parameters": {{
"mu_max": 0.5,
"Ks": 10.2
}}
}},
{{
"variable_type": "Substrate",
"model_name": "Name of the best model for Substrate",
"metrics": {{
"R2": 0.98,
"RMSE": 0.05
}},
"parameters": {{
"k_consumption": 1.5
}}
}}
],
"all_tested_models": ["Monod", "Logistic", "Gompertz", "First_Order"],
"interpretation": "A brief, data-driven interpretation of the kinetic behavior observed in this specific experiment."
}}
"""
        try:
            # Keep the requested output size within the model's context limit.
            safe_max_tokens = self._calculate_safe_max_tokens(qwen_model, max_output_tokens)
            response = self.client.chat.completions.create(
                model=qwen_model,
                max_tokens=safe_max_tokens,
                temperature=0.05,  # Low temperature for predictable, structured output
                response_format={"type": "json_object"},  # Ask the API for a JSON object
                messages=[
                    {"role": "system", "content": "You are a helpful assistant designed to output JSON."},
                    {"role": "user", "content": prompt}
                ]
            )
            self._update_token_usage(qwen_model, response.usage)
            content = response.choices[0].message.content
            # Parsing can still fail if the model does not follow the format.
            parsed_json = json.loads(content)
            return parsed_json
        except json.JSONDecodeError as e:
            # The response was not valid JSON; report it and skip this chunk.
            print(f"CRITICAL ERROR: Failed to decode JSON for experiment '{experiment_id}'.")
            print(f"JSONDecodeError: {e}")
            print(f"LLM Raw Output that caused the error:\n---\n{content}\n---")
            return None
        except Exception as e:
            # Any other failure (network, credentials, ...) also skips the chunk.
            print(f"API Error during single analysis for experiment '{experiment_id}': {e}")
            return None

    def _synthesize_comparative_analysis(self, individual_analyses: List[Dict], qwen_model: str, detail_level: str, lang_prefix: str, additional_specs: str, max_output_tokens: int) -> str:
        """Merge the per-group JSON analyses into one Markdown report.

        This is the 'reduce' step.

        Returns:
            The report text produced by the model, or a message starting with
            ``"CRITICAL ERROR"`` if the API call fails.
        """
        # The per-group results are handed to the model as pretty-printed JSON.
        analyses_summary = json.dumps(individual_analyses, indent=2)

        # Optional section carrying the user's extra requirements.
        user_specs_section = f"""
## User's Additional Specifications
Please pay special attention to the following user-provided requirements during your analysis:
- {additional_specs}
""" if additional_specs else ""

        # Any value other than "detailed" selects the summary instruction.
        detail_instruction = (
            "Your report must be highly detailed and exhaustive. Include multiple tables, in-depth parameter comparisons, and nuanced biological interpretations."
            if detail_level == "detailed" else
            "Your report should be a high-level summary. Focus on the main conclusions and key takeaways, using concise tables and bullet points."
        )
        prompt = f"""
{lang_prefix}
You are a Principal Scientist tasked with creating a final, consolidated report from a series of individual experimental analyses.
You have been provided with a JSON array, where each object represents the detailed analysis of one specific experiment.
{user_specs_section}
YOUR PRIMARY OBJECTIVE:
Synthesize all the provided information into a single, cohesive, and comparative analysis report. The report must be written in rich Markdown format.
{detail_instruction}
Your final report MUST contain the following sections:
### 1. Executive Summary & Experimental Inventory
- Start with a brief paragraph summarizing the scope of the experiments analyzed.
- Create a Markdown table that serves as an inventory of all experiments. The table should list each `experiment_id`, the `variable_type` (e.g., Biomass), and the `model_name` of the best-performing model for that variable.
### 2. In-Depth Comparative Analysis
- **Model Performance Matrix:** This is the most critical part. Create a Markdown table that compares the performance of all major models across all experiments. Use R² as the primary metric. Rows should be model names, and columns should be experiment IDs. This allows for a direct visual comparison of which models are robust across different conditions.
- **Parameter Trend Analysis:** Analyze how key kinetic parameters (e.g., `mu_max`, `Ks`, etc.) change across the different experimental conditions. Discuss any observable trends, correlations, or significant differences. For example: "We observed that `mu_max` consistently increased as temperature rose from Exp_A to Exp_C, suggesting a direct correlation in this range."
- **Model Selection Justification:** Discuss why certain models performed better under specific conditions, referencing the biological interpretations from the input data.
### 3. Overall Recommendations & Conclusions
- **Globally Recommended Models:** Based on the entire dataset, declare the best overall model for each primary variable type (Biomass, Substrate, etc.). Justify your choice based on consistent high performance and robustness across experiments.
- **Condition-Specific Guidelines:** Provide actionable recommendations. For example, "For experiments conducted under high pH conditions (similar to 'Exp_C'), the 'Gompertz' model is strongly recommended due to its superior fit."
- **Suggestions for Future Research:** Briefly suggest a few next steps or potential experiments to validate the findings or explore new hypotheses.
---
**INPUT DATA: JSON ARRAY OF INDIVIDUAL ANALYSES**
```json
{analyses_summary}
```
---
Now, generate the complete, final Markdown report based on these instructions.
"""
        try:
            # The final report can be long, so twice the per-chunk budget is
            # requested (still clamped to the model's context limit).
            safe_max_tokens = self._calculate_safe_max_tokens(qwen_model, max_output_tokens * 2)
            response = self.client.chat.completions.create(
                model=qwen_model,
                max_tokens=safe_max_tokens,
                temperature=0.2,  # Slightly higher than the map step to allow freer wording
                messages=[
                    {"role": "user", "content": prompt}
                ]
            )
            self._update_token_usage(qwen_model, response.usage)
            return response.choices[0].message.content
        except Exception as e:
            error_message = f"CRITICAL ERROR: Failed during the final report synthesis stage. Details: {e}"
            print(error_message)
            return error_message

    def analyze_data(self, data: pd.DataFrame, chunk_column: str, qwen_model: str, detail_level: str, language: str, additional_specs: str, max_output_tokens: int) -> Generator[Union[str, Dict], None, None]:
        """Run the full analysis as a generator of progress updates.

        Yields:
            ``str`` status messages while working, then either a dict with an
            ``"error"`` key (on failure) or a final dict with the keys
            ``"analisis_completo"`` and ``"codigo_implementacion"``.
        """
        self.reset_token_usage()
        if chunk_column not in data.columns:
            yield {"error": f"The selected chunking column '{chunk_column}' was not found in the data."}
            return

        unique_experiments = data[chunk_column].unique()
        total_groups = len(unique_experiments)
        yield f"Identified {total_groups} groups to analyze using column '{chunk_column}': {list(unique_experiments)}"

        individual_results = []
        lang_prefix = "Please respond in English. " if language == 'en' else "Por favor responde en español. "
        for i, exp_id in enumerate(unique_experiments):
            yield f"({i+1}/{total_groups}) Analyzing group: '{str(exp_id)}'..."
            # A missing group id never compares equal to itself, so '==' would
            # select no rows for it; match missing ids with isna() instead.
            if pd.isna(exp_id):
                group_mask = data[chunk_column].isna()
            else:
                group_mask = data[chunk_column] == exp_id
            experiment_df = data[group_mask]
            result = self._analyze_single_experiment(experiment_df, str(exp_id), qwen_model, lang_prefix, max_output_tokens)
            if result:
                individual_results.append(result)
                yield f"✅ Analysis for '{str(exp_id)}' complete."
            else:
                yield f"⚠️ Failed to analyze '{str(exp_id)}'. Skipping."

        if not individual_results:
            yield {"error": "Could not analyze any of the data groups. Please check data format and API status."}
            return

        yield "All groups analyzed. Synthesizing final comparative report..."
        final_analysis = self._synthesize_comparative_analysis(
            individual_results, qwen_model, detail_level, lang_prefix, additional_specs, max_output_tokens
        )
        yield "✅ Final report generated."
        yield "Generating implementation code..."
        code_result = "# Code generation is a placeholder in this version."
        yield "✅ Code generated."
        # Finally, emit the complete results dictionary.
        yield {
            "analisis_completo": final_analysis,
            "codigo_implementacion": code_result,
        }
# --- MAIN PROCESSING FUNCTION (module level, outside any class) ---
def process_files_and_analyze(files, chunk_column: str, qwen_model: str, detail_level: str, language: str, additional_specs: str, max_output_tokens: int):
    """Load the uploaded file, run the analysis and stream UI updates.

    Only the first entry of ``files`` is read. Entries may be plain path
    strings (Gradio ``type="filepath"``) or objects exposing a ``.name`` path.

    Yields:
        4-tuples ``(thinking_log, analysis, code, token_report)``. Progress
        updates carry ``gr.update()`` placeholders for the last three items;
        the last tuple carries the final results.
    """
    if not files:
        yield "Please upload a file first.", "", "", ""
        return
    if not chunk_column:
        yield "Please upload a file and select a column for grouping before analyzing.", "", "", ""
        return

    # Values that are filled in as the analysis progresses.
    thinking_log = ["### 🚀 Starting Analysis\n"]
    analysis_result, code_result, token_report = "", "", ""

    def update_log_and_yield(message):
        """Append a timestamped entry and build the tuple to yield to the UI."""
        thinking_log.append(f"- {datetime.now().strftime('%H:%M:%S')}: {message}\n")
        return "\n".join(thinking_log), gr.update(), gr.update(), gr.update()

    yield update_log_and_yield("Processing uploaded file...")
    file = files[0]
    try:
        # With type="filepath" the upload is already a path string; older
        # file-object uploads expose the path through `.name`.
        file_path = Path(file if isinstance(file, (str, os.PathLike)) else file.name)
        # Compare the extension case-insensitively so '.CSV' is read as CSV.
        if file_path.suffix.lower() == '.csv':
            df = pd.read_csv(file_path)
        else:
            df = pd.read_excel(file_path)
        yield update_log_and_yield(f"Successfully loaded data from '{file_path.name}'.")
    except Exception as e:
        yield update_log_and_yield(f"Error reading file: {e}")
        return

    analyzer = AIAnalyzer(client)
    # Each item is a status update (str), an error dict, or the final result dict.
    for item in analyzer.analyze_data(df, chunk_column, qwen_model, detail_level, language, additional_specs, max_output_tokens):
        if isinstance(item, str):
            yield update_log_and_yield(item)
        elif isinstance(item, dict) and "error" in item:
            yield update_log_and_yield(f"ANALYSIS FAILED: {item['error']}")
            return
        elif isinstance(item, dict):
            analysis_result = item["analisis_completo"]
            code_result = item["codigo_implementacion"]
            # Keep the results in the global state so they can be exported.
            app_state.current_analysis = analysis_result
            app_state.current_code = code_result

    # Build the final token report; unknown language codes fall back to English.
    t = TRANSLATIONS.get(language, TRANSLATIONS['en'])
    token_info = analyzer.token_usage
    token_report = f"""
### {t['token_info']}
- **{t['input_token_count']}:** {token_info['input_tokens']}
- **{t['output_token_count']}:** {token_info['output_tokens']}
- **{t['total_token_count']}:** {token_info['total_tokens']}
- **{t['token_cost']}:** ${token_info['estimated_cost']:.6f}
"""
    # Final yield with all results.
    yield "\n".join(thinking_log), analysis_result, code_result, token_report
# --- STATE AND UTILITY FUNCTIONS FOR THE UI ---
class AppState:
    """Mutable holder for the latest analysis results and UI language."""

    def __init__(self):
        self.current_analysis = ""  # last generated report text
        self.current_code = ""  # last generated implementation code
        self.current_language = "en"  # currently selected UI language code


# Single shared state instance used by the processing and export functions.
app_state = AppState()
app = None
def export_report(export_format: str, language: str) -> Tuple[str, Optional[str]]:
    """Export the analysis held in the global state to DOCX or PDF.

    The file is written into a freshly created temporary directory so the
    working directory is not cluttered.

    Args:
        export_format: ``"DOCX"`` or ``"PDF"``.
        language: Language code for messages and report labels; unknown codes
            fall back to English.

    Returns:
        ``(message, path)``; ``path`` is None when nothing was exported.
    """
    # Unknown language codes fall back to English instead of raising KeyError.
    t = TRANSLATIONS.get(language, TRANSLATIONS['en'])

    # 1. Nothing to export yet.
    if not app_state.current_analysis:
        error_msg = t.get('error_no_files', 'No analysis available to export.')
        return error_msg, None

    # 2. Timestamp that is embedded in the file name.
    timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")

    # 3. Temporary directory that will hold the report.
    try:
        temp_dir = tempfile.mkdtemp()
    except Exception as e:
        return f"Error creating temporary directory: {e}", None

    # 4. Build the target path and call the matching exporter.
    try:
        if export_format == "DOCX":
            filename = os.path.join(temp_dir, f"biotech_analysis_report_{timestamp}.docx")
            ReportExporter.export_to_docx(
                content=app_state.current_analysis,
                filename=filename,
                language=language
            )
        elif export_format == "PDF":
            filename = os.path.join(temp_dir, f"biotech_analysis_report_{timestamp}.pdf")
            ReportExporter.export_to_pdf(
                content=app_state.current_analysis,
                filename=filename,
                language=language
            )
        else:
            return f"Unsupported export format: {export_format}", None

        # 5. Success: report the file name and hand back its path.
        success_msg_template = t.get('report_exported', 'Report exported successfully as')
        success_msg = f"{success_msg_template} {os.path.basename(filename)}"
        return success_msg, filename
    except Exception as e:
        # 6. Any exporter failure is logged to the console and returned as a message.
        error_message = f"Error during report export to {export_format}: {str(e)}"
        print(f"EXPORT ERROR: {error_message}")
        return error_message, None
# --- COMPLETE GRADIO INTERFACE ---
def create_interface():
    """Build the Gradio Blocks UI, wire its events and return it.

    The created ``gr.Blocks`` instance is also stored in the module-level
    ``app`` global.

    Returns:
        The ``gr.Blocks`` object describing the whole application.
    """
    global app

    def update_interface_language(language):
        """Return one update per translatable component for ``language``.

        The order of the returned list must match the ``outputs`` list of
        the ``language_selector.change`` listener below.
        """
        app_state.current_language = language
        t = TRANSLATIONS[language]
        return [
            gr.update(value=f"# {t['title']}"),
            gr.update(value=t['subtitle']),
            gr.update(label=t['upload_files']),
            gr.update(label=t['chunk_column_label'], info=t['chunk_column_info']),
            gr.update(label=t['select_model']),
            gr.update(label=t['select_language']),
            gr.update(label=t['select_theme']),
            # Label and choices of the radio are sent as a single update so
            # the component appears only once in the outputs list.
            gr.update(
                label=t['detail_level'],
                choices=[(t['detailed'], "detailed"), (t['summarized'], "summarized")],
            ),
            gr.update(label=t['additional_specs'], placeholder=t['additional_specs_placeholder']),
            gr.update(label=t['output_tokens_per_chunk']),
            gr.update(value=t['analyze_button']),
            gr.update(label=t['export_format']),
            gr.update(value=t['export_button']),
            gr.update(label=t['thinking_process']),
            gr.update(label=t['analysis_report']),
            gr.update(label=t['code_output']),
            gr.update(label=t['token_usage']),
            gr.update(label=t['data_format']),
        ]

    with gr.Blocks(theme=THEMES['light'], title="Scalable Biotech Analyzer") as demo:
        with gr.Row():
            with gr.Column(scale=3):
                title_text = gr.Markdown(f"# {TRANSLATIONS['en']['title']}")
                subtitle_text = gr.Markdown(TRANSLATIONS['en']['subtitle'])
            with gr.Column(scale=1):
                language_selector = gr.Dropdown(
                    choices=[("English", "en"), ("Español", "es")],
                    value="en",
                    label="Language/Idioma",
                )
                theme_selector = gr.Dropdown(choices=["light", "dark"], value="light", label="Theme/Tema")
        with gr.Row():
            with gr.Column(scale=1):
                files_input = gr.File(
                    label=TRANSLATIONS['en']['upload_files'],
                    file_count="multiple",
                    type="filepath",
                )
                # Grouping-column selector; enabled once a file is uploaded.
                chunk_column_selector = gr.Dropdown(
                    label=TRANSLATIONS['en']['chunk_column_label'],
                    info=TRANSLATIONS['en']['chunk_column_info'],
                    interactive=False,
                )
                model_selector = gr.Textbox(
                    label=TRANSLATIONS['en']['select_model'],
                    value="deepseek-ai/DeepSeek-V3-0324",
                )
                detail_level_radio = gr.Radio(
                    choices=[("Detailed", "detailed"), ("Summarized", "summarized")],
                    value="detailed",
                    label=TRANSLATIONS['en']['detail_level'],
                )
                additional_specs = gr.Textbox(
                    label=TRANSLATIONS['en']['additional_specs'],
                    placeholder=TRANSLATIONS['en']['additional_specs_placeholder'],
                    lines=3,
                )
                output_tokens_slider = gr.Slider(
                    minimum=1000,
                    maximum=32000,
                    value=4000,
                    step=500,
                    label=TRANSLATIONS['en']['output_tokens_per_chunk'],
                )
                # Disabled until the uploaded file's columns have been read.
                analyze_btn = gr.Button(
                    TRANSLATIONS['en']['analyze_button'],
                    variant="primary",
                    interactive=False,
                )
                gr.Markdown("---")
                export_format_radio = gr.Radio(
                    choices=["DOCX", "PDF"],
                    value="PDF",
                    label=TRANSLATIONS['en']['export_format'],
                )
                export_btn = gr.Button(TRANSLATIONS['en']['export_button'])
                # Both start hidden; run_export() reveals them with the result.
                export_status = gr.Textbox(label="Export Status", visible=False)
                export_file = gr.File(label="Download Report", visible=False)
            with gr.Column(scale=2):
                thinking_output = gr.Markdown(label=TRANSLATIONS['en']['thinking_process'])
                analysis_output = gr.Markdown(label=TRANSLATIONS['en']['analysis_report'])
                code_output = gr.Code(label=TRANSLATIONS['en']['code_output'], language="python")
                token_usage_output = gr.Markdown(label=TRANSLATIONS['en']['token_usage'])
        data_format_accordion = gr.Accordion(label=TRANSLATIONS['en']['data_format'], open=False)
        with data_format_accordion:
            gr.Markdown("""...""")

        # --- UI EVENT LOGIC ---

        def update_chunk_column_selector(files):
            """Populate the grouping-column dropdown from the first file's header.

            Returns a pair of updates: one for the column dropdown and one
            for the analyze button. Both are disabled when there are no
            files or the header cannot be read.
            """
            disabled = (
                gr.update(choices=[], value=None, interactive=False),
                gr.update(interactive=False),
            )
            if not files:
                return disabled
            try:
                first = files[0]
                # Accept either a plain path string or an object exposing the
                # path through a ``name`` attribute.
                file_path = str(getattr(first, "name", first))
                # Only the header row is needed, hence nrows=0.
                if file_path.lower().endswith('.csv'):
                    header = pd.read_csv(file_path, nrows=0)
                else:
                    header = pd.read_excel(file_path, nrows=0)
                columns = header.columns.tolist()
                # Pre-select the first well-known grouping column, if present.
                default_candidates = ['Experiment', 'Experimento', 'Condition', 'Run', 'Batch', 'ID']
                default_selection = next((col for col in default_candidates if col in columns), None)
                return (
                    gr.update(choices=columns, value=default_selection, interactive=True),
                    gr.update(interactive=True),
                )
            except Exception as e:
                gr.Warning(f"Could not read columns from file: {e}")
                return disabled

        def run_export(export_format, language):
            """Run ``export_report`` and reveal the hidden result components."""
            status, report_path = export_report(export_format, language)
            return (
                gr.update(value=status, visible=True),
                gr.update(value=report_path, visible=report_path is not None),
            )

        files_input.upload(
            fn=update_chunk_column_selector,
            inputs=[files_input],
            outputs=[chunk_column_selector, analyze_btn],
        )
        # Removing the files must disable the selector and the analyze button
        # again; the handler's empty-input branch takes care of that.
        files_input.clear(
            fn=update_chunk_column_selector,
            inputs=[files_input],
            outputs=[chunk_column_selector, analyze_btn],
        )
        analyze_btn.click(
            fn=process_files_and_analyze,
            inputs=[
                files_input,
                chunk_column_selector,
                model_selector,
                detail_level_radio,
                language_selector,
                additional_specs,
                output_tokens_slider,
            ],
            outputs=[thinking_output, analysis_output, code_output, token_usage_output],
        )
        # Order must match the list returned by update_interface_language().
        language_selector.change(
            fn=update_interface_language,
            inputs=[language_selector],
            outputs=[
                title_text,
                subtitle_text,
                files_input,
                chunk_column_selector,
                model_selector,
                language_selector,
                theme_selector,
                detail_level_radio,
                additional_specs,
                output_tokens_slider,
                analyze_btn,
                export_format_radio,
                export_btn,
                thinking_output,
                analysis_output,
                code_output,
                token_usage_output,
                data_format_accordion,
            ],
        )
        export_btn.click(
            fn=run_export,
            inputs=[export_format_radio, language_selector],
            outputs=[export_status, export_file],
        )
    app = demo
    return demo
# --- FUNCIÓN PRINCIPAL DE EJECUCIÓN --- | |
def main():
    """Return the interface to launch.

    When the ``NEBIUS_API_KEY`` environment variable is set to a non-empty
    value, the full application from ``create_interface()`` is returned.
    Otherwise a minimal ``gr.Interface`` is returned whose only output is a
    "Configuration Error" textbox showing the English ``error_no_api``
    message.
    """
    api_key = os.getenv("NEBIUS_API_KEY")
    if api_key:
        return create_interface()
    return gr.Interface(lambda: TRANSLATIONS['en']['error_no_api'], [], gr.Textbox(label="Configuration Error"))
# Script entry point: build the interface and serve it on all network
# interfaces at port 7860 with request queuing enabled.
if __name__ == "__main__":
    demo = main()
    if demo:
        print("===== Application Startup =====")
        queued_demo = demo.queue()
        queued_demo.launch(
            server_name="0.0.0.0",
            server_port=7860,
            share=False,
            inbrowser=True,
        )