"""Gradio app that checks whether a Hugging Face model looks usable for
speech-based depression detection.

Two entry points are wired into the UI:

* ``test_single_model`` - loads one model, probes its processor, runs a dummy
  inference and produces a heuristic suitability score (0-15).
* ``test_recommended_models`` - runs a reduced check (score 0-8) over a fixed
  list of candidate models and ranks them.

NOTE(review): the emoji prefixes inside the user-facing strings look
mis-encoded (UTF-8 bytes decoded with a legacy code page). They are kept
exactly as found - confirm the intended characters and the file encoding
before repairing them.
"""

import warnings

import gradio as gr
import numpy as np
import torch
from transformers import (
    AutoModel,
    AutoProcessor,
    AutoFeatureExtractor,
    AutoTokenizer,
    pipeline
)

warnings.filterwarnings("ignore")

# Vocabulary size above which a model is assumed to cover several languages.
_LARGE_VOCAB_THRESHOLD = 50000
# Substrings of the model name that hint at multilingual training.
_MULTILINGUAL_KEYWORDS = ('xlsr', 'multilingual', 'cross-lingual')
# Model types treated as strong speech-representation architectures.
_SPEECH_MODEL_TYPES = ('wav2vec2', 'hubert', 'wavlm')


def _load_audio_processor(model_name):
    """Load a processor for *model_name*, falling back to a feature extractor.

    Returns:
        ``(processor, kind)`` where ``kind`` is ``"processor"`` or
        ``"feature_extractor"``; ``(None, None)`` when neither loader works.
    """
    loaders = (
        ("processor", AutoProcessor),
        ("feature_extractor", AutoFeatureExtractor),
    )
    for kind, loader in loaders:
        try:
            return loader.from_pretrained(model_name), kind
        except Exception:
            # Deliberate best effort: a failing loader just means "try the
            # next one". Exception (not a bare except) so Ctrl-C still works.
            continue
    return None, None


def _detect_multilingual(config, model_name):
    """Return why a model looks multilingual: ``'vocab'``, ``'name'`` or ``None``."""
    vocab_size = getattr(config, 'vocab_size', None)
    # vocab_size may be absent or None on audio-only configs.
    if vocab_size is not None and vocab_size > _LARGE_VOCAB_THRESHOLD:
        return 'vocab'
    lowered = model_name.lower()
    if any(keyword in lowered for keyword in _MULTILINGUAL_KEYWORDS):
        return 'name'
    return None


def _score_depression_suitability(model_name, config, supports_audio, multilingual):
    """Compute the heuristic suitability score for depression detection.

    Returns:
        ``(score, lines)`` - the numeric score and the report lines that
        explain each awarded (or withheld) component.
    """
    lowered = model_name.lower()
    score = 0
    lines = []

    # Most important: Specifically for depression/mental health detection (6 points)
    depression_keywords = ['depression', 'mental-health', 'psychological', 'mood', 'phq']
    emotion_keywords = ['emotion', 'sentiment', 'affective', 'feeling']
    if any(keyword in lowered for keyword in depression_keywords):
        score += 6
        lines.append(" ๐ŸŽฏ Specifically for depression/mental health detection (+6 points)\n")
    elif any(keyword in lowered for keyword in emotion_keywords):
        score += 3
        lines.append(" ๐Ÿ˜Š For emotion recognition, potentially applicable (+3 points)\n")

    # Basic requirement: Audio input support (2 points)
    if supports_audio:
        score += 2
        lines.append(" ๐ŸŽต Supports audio input (+2 points)\n")
    else:
        lines.append(" โŒ Does not support audio input (0 points)\n")

    # Multilingual support (2 points)
    if multilingual:
        score += 2
        lines.append(" ๐ŸŒ Supports multiple languages (+2 points)\n")

    # Architecture suitability (2 points)
    if config.model_type in _SPEECH_MODEL_TYPES:
        score += 2
        lines.append(" ๐Ÿ—๏ธ Excellent speech representation learning architecture (+2 points)\n")
    elif config.model_type == 'whisper':
        score += 1
        lines.append(" โš ๏ธ Whisper architecture needs modification for classification (+1 point)\n")

    # Check if configured for classification.
    # NOTE(review): transformers configs appear to expose ``num_labels`` with a
    # default of 2, so this bonus may be granted to almost every model -
    # confirm against the PretrainedConfig documentation.
    if hasattr(config, 'num_labels'):
        if config.num_labels == 2:
            score += 1
            lines.append(" โœ… Binary classification task configuration (likely depression detection) (+1 point)\n")
        else:
            score += 0.5
            lines.append(f" โš ๏ธ Multi-class task ({config.num_labels} classes) (+0.5 points)\n")

    # Check for training dataset clues
    daic_keywords = ['daic', 'wizard-of-oz', 'depression-detection', 'clinical']
    if any(keyword in lowered for keyword in daic_keywords):
        score += 2
        lines.append(" ๐Ÿ“Š Possibly trained on clinical depression datasets (+2 points)\n")

    return score, lines


def test_single_model(model_name):
    """Test compatibility of a single model.

    Loads the model and its processor, runs one forward pass on two seconds
    of random audio, and scores the model's suitability for depression
    detection.

    Args:
        model_name: Hugging Face model identifier typed by the user.

    Returns:
        A multi-line text report, or an error message describing why the
        model could not be tested. Never raises for ordinary failures.
    """
    if not model_name or not model_name.strip():
        return "Please enter a model name"
    # Surrounding whitespace from the text box would break the hub lookup.
    model_name = model_name.strip()

    report = [
        f"๐Ÿ” Testing Model: {model_name}\n",
        "=" * 60 + "\n\n",
    ]

    try:
        # 1. Load model
        report.append("1๏ธโƒฃ Loading Model...\n")
        model = AutoModel.from_pretrained(model_name)
        config = model.config
        report.append(" โœ… Model loaded successfully\n")
        report.append(f" ๐Ÿ“Š Model type: {config.model_type}\n")
        report.append(f" ๐Ÿ—๏ธ Model class: {model.__class__.__name__}\n\n")

        # 2. Check model architecture
        report.append("2๏ธโƒฃ Checking Model Architecture...\n")
        if hasattr(config, 'hidden_size'):
            report.append(f" ๐Ÿ”ข Hidden size: {config.hidden_size}\n")
        if hasattr(config, 'num_hidden_layers'):
            report.append(f" ๐Ÿ“š Number of layers: {config.num_hidden_layers}\n")
        if hasattr(config, 'vocab_size'):
            report.append(f" ๐Ÿ“– Vocabulary size: {config.vocab_size}\n")
        report.append("\n")

        # 3. Try to load processor
        report.append("3๏ธโƒฃ Loading Processor...\n")
        processor, kind = _load_audio_processor(model_name)
        supports_audio = processor is not None
        if kind == "processor":
            report.append(f" โœ… Processor loaded successfully: {processor.__class__.__name__}\n")
        elif kind == "feature_extractor":
            report.append(f" โœ… Feature extractor loaded successfully: {processor.__class__.__name__}\n")
        else:
            report.append(" โŒ Cannot load audio processor\n")
        report.append("\n")

        # 4. Check input requirements
        report.append("4๏ธโƒฃ Checking Input Requirements...\n")
        sampling_rate = 16000  # Default value
        if supports_audio:
            if hasattr(processor, 'sampling_rate'):
                sampling_rate = processor.sampling_rate
                report.append(f" ๐ŸŽต Sampling rate: {sampling_rate} Hz\n")
            if hasattr(processor, 'feature_size'):
                report.append(f" ๐Ÿ“ Feature dimension: {processor.feature_size}\n")
            if hasattr(processor, 'return_attention_mask'):
                report.append(f" ๐ŸŽญ Supports attention mask: {processor.return_attention_mask}\n")
        report.append("\n")

        # 5. Test inference
        report.append("5๏ธโƒฃ Testing Inference...\n")
        if supports_audio:
            try:
                # Create dummy audio data (2 seconds)
                dummy_audio = np.random.randn(sampling_rate * 2).astype(np.float32)

                # Process audio
                inputs = processor(dummy_audio, sampling_rate=sampling_rate, return_tensors="pt")

                # Model inference
                with torch.no_grad():
                    outputs = model(**inputs)

                # Check output
                if hasattr(outputs, 'last_hidden_state'):
                    shape = outputs.last_hidden_state.shape
                    report.append(f" โœ… Inference successful! Hidden state shape: {shape}\n")
                elif hasattr(outputs, 'logits'):
                    shape = outputs.logits.shape
                    report.append(f" โœ… Inference successful! Logits shape: {shape}\n")
                else:
                    report.append(f" โœ… Inference successful! Output type: {type(outputs)}\n")
            except Exception as e:
                # An inference failure is reported but does not abort the test.
                report.append(f" โŒ Inference failed: {e}\n")
        else:
            report.append(" โš ๏ธ Audio input not supported, skipping inference test\n")
        report.append("\n")

        # 6. Multilingual support check
        report.append("6๏ธโƒฃ Multilingual Support Check...\n")
        multilingual_reason = _detect_multilingual(config, model_name)
        multilingual = multilingual_reason is not None
        if multilingual_reason == 'vocab':
            report.append(f" โœ… Likely supports multiple languages (large vocabulary: {config.vocab_size})\n")
        elif multilingual_reason == 'name':
            report.append(" โœ… Supports multiple languages based on model name\n")
        else:
            report.append(" โ“ Multilingual support unclear\n")
        report.append("\n")

        # 7. Depression detection suitability scoring
        report.append("7๏ธโƒฃ Depression Detection Suitability Assessment...\n")
        max_score = 15
        score, score_lines = _score_depression_suitability(
            model_name, config, supports_audio, multilingual
        )
        report.extend(score_lines)
        report.append(f"\n๐ŸŽฏ Depression Detection Suitability Score: {score}/{max_score}\n")

        # 8. Recommendations
        report.append("\n8๏ธโƒฃ Usage Recommendations...\n")
        if score >= 12:
            report.append(" ๐ŸŒŸ Highly recommended! Specifically for depression detection, very suitable\n")
        elif score >= 8:
            report.append(" ๐Ÿ‘ Recommended, may need some fine-tuning\n")
        elif score >= 5:
            report.append(" โš ๏ธ Use with caution, may need significant modification\n")
        else:
            report.append(" โŒ Not recommended, suggest finding specialized depression detection models\n")

        # 9. Further inspection suggestions
        report.append("\n9๏ธโƒฃ Further Inspection Suggestions...\n")
        report.append(" ๐Ÿ” Check model card for training data description\n")
        report.append(" ๐Ÿ“Š Check if DAIC-WOZ or other depression datasets are mentioned\n")
        report.append(" ๐Ÿ“ Check papers or documentation for task description\n")
        report.append(" ๐Ÿงช Test with small samples to see if model output matches depression detection expectations\n")

        return "".join(report)

    except Exception as e:
        # Top-level boundary for the UI callback: turn any failure into text.
        error_msg = f"โŒ Model test failed: {e}\n"
        error_msg += "\nPossible causes:\n"
        error_msg += "โ€ข Incorrect model name\n"
        error_msg += "โ€ข Model requires special permissions\n"
        error_msg += "โ€ข Network connection issues\n"
        error_msg += "โ€ข Model architecture incompatibility\n"
        return error_msg


def test_recommended_models():
    """Test recommended model list.

    Runs a simplified check (audio support, multilingual hint, architecture)
    on a fixed list of candidate models and ranks those that loaded.

    Returns:
        A multi-line text report with one entry per model followed by the
        ranking, highest score first.
    """
    recommended_models = [
        "facebook/wav2vec2-large-xlsr-53",
        "microsoft/wavlm-large",
        "harshit345/xlsr-wav2vec-speech-emotion-recognition",
        "audeering/wav2vec2-large-robust-12-ft-emotion-msp-dim",
        "speechbrain/emotion-recognition-wav2vec2-IEMOCAP"
    ]
    total = len(recommended_models)

    report = [
        "๐Ÿ” Batch Testing Recommended Models\n",
        "=" * 60 + "\n\n",
    ]
    results = []

    for i, model_name in enumerate(recommended_models, 1):
        report.append(f"๐Ÿ“Š Testing {i}/{total}: {model_name}\n")
        report.append("-" * 50 + "\n")

        try:
            # Simplified quick test
            model = AutoModel.from_pretrained(model_name)

            # Check audio support
            processor, _ = _load_audio_processor(model_name)
            supports_audio = processor is not None

            # Check multilingual
            multilingual = _detect_multilingual(model.config, model_name) is not None

            # Calculate simplified score (maximum 3 + 2 + 3 = 8)
            score = 0
            if supports_audio:
                score += 3
            if multilingual:
                score += 2
            if model.config.model_type in _SPEECH_MODEL_TYPES:
                score += 3

            results.append({
                'name': model_name,
                'score': score,
                'audio': supports_audio,
                'multilingual': multilingual,
                'type': model.config.model_type
            })

            report.append(
                f"โœ… Loaded successfully | Audio: {'โœ…' if supports_audio else 'โŒ'} | Multilingual: {'โœ…' if multilingual else 'โŒ'} | Score: {score}/8\n\n"
            )

        except Exception as e:
            # One failing model must not stop the rest of the batch.
            report.append(f"โŒ Loading failed: {e}\n\n")

    # Sort and recommend
    results.sort(key=lambda x: x['score'], reverse=True)

    report.append("๐Ÿ† Recommendation Rankings:\n")
    report.append("=" * 40 + "\n")

    for i, model in enumerate(results, 1):
        report.append(f"{i}. {model['name']}\n")
        report.append(f" Score: {model['score']}/8 | Type: {model['type']}\n\n")

    return "".join(report)


# Create Gradio interface
with gr.Blocks(title="๐Ÿค– Depression Detection Model Compatibility Test") as app:
    gr.Markdown("""
    # ๐Ÿค– Depression Detection Model Compatibility Test Tool

    This tool helps you quickly test whether Hugging Face models are suitable for depression detection tasks.

    ## Features:
    - โœ… Check model loading compatibility
    - ๐ŸŽต Verify audio input support
    - ๐ŸŒ Assess multilingual capabilities
    - ๐Ÿ“Š Suitability scoring (0-15 points)
    - ๐Ÿ’ก Usage recommendations
    """)

    with gr.Tab("Single Model Test"):
        with gr.Row():
            model_input = gr.Textbox(
                placeholder="Enter model name, e.g.: facebook/wav2vec2-large-xlsr-53",
                label="๐Ÿ” Model Name",
                value="ireneminhee/speech-to-depression"
            )
            test_btn = gr.Button("๐Ÿš€ Start Test", variant="primary")

        result_output = gr.Textbox(
            label="๐Ÿ“‹ Test Results",
            lines=25,
            max_lines=50
        )

        test_btn.click(
            fn=test_single_model,
            inputs=[model_input],
            outputs=[result_output]
        )

    with gr.Tab("Recommended Models Batch Test"):
        gr.Markdown("""
        ### ๐ŸŒŸ Recommended Depression Detection Candidate Models

        These models perform well in speech emotion recognition and multilingual support:
        - `facebook/wav2vec2-large-xlsr-53` - Multilingual speech representation learning
        - `microsoft/wavlm-large` - Speech understanding specialized model
        - `harshit345/xlsr-wav2vec-speech-emotion-recognition` - Emotion recognition
        - `audeering/wav2vec2-large-robust-12-ft-emotion-msp-dim` - Emotion dimension recognition
        - `speechbrain/emotion-recognition-wav2vec2-IEMOCAP` - Emotion classification
        """)

        batch_test_btn = gr.Button("๐Ÿ” Batch Test Recommended Models", variant="primary")
        batch_result_output = gr.Textbox(
            label="๐Ÿ“Š Batch Test Results",
            lines=20,
            max_lines=50
        )

        batch_test_btn.click(
            fn=test_recommended_models,
            inputs=[],
            outputs=[batch_result_output]
        )

    with gr.Tab("Usage Instructions"):
        gr.Markdown("""
        ## ๐Ÿ“– Usage Instructions

        ### Scoring Criteria (Redesigned):
        - **Depression-specific model** (+6 points): Specifically for depression/mental health detection
        - **Emotion recognition model** (+3 points): For emotion recognition, potentially applicable
        - **Audio support** (+2 points): Whether the model can process audio input
        - **Multilingual support** (+2 points): Support for Chinese, English, German, Russian
        - **Architecture suitability** (+2 points): Whether model architecture is suitable for speech classification
        - **Classification configuration** (+1 point): Whether configured for classification tasks
        - **Clinical datasets** (+2 points): Whether trained on clinical depression datasets

        ### Score Interpretation:
        - **12-15 points**: ๐ŸŒŸ Highly recommended, specialized depression detection model
        - **8-11 points**: ๐Ÿ‘ Recommended, may need fine-tuning
        - **5-7 points**: โš ๏ธ Use with caution, needs modification
        - **0-4 points**: โŒ Not recommended

        ### Next Steps:
        1. Select the top 2-3 models with highest scores
        2. Conduct in-depth testing in Google Colab
        3. Fine-tune using DAIC-WOZ dataset
        4. Final evaluation with your multilingual data
        """)

# Launch application
if __name__ == "__main__":
    app.launch()