# liuyang - update model - 6dfc92e
# Configuration settings for the Whisper Transcription Space
# Model identifiers, stored as plain "<owner>/<name>" strings.
# NOTE(review): presumably Hugging Face Hub repo ids; the loading code is not
# in this file, so confirm against the caller.
WHISPER_MODEL = "distil-whisper/distil-large-v3"  # transcription model
DIARIZATION_MODEL = "pyannote/speaker-diarization-3.1"  # speaker-diarization model
# Audio processing parameters.
AUDIO_SAMPLE_RATE = 16_000  # presumably in Hz; confirm against the audio loader
AUDIO_CHANNELS = 1  # single channel
MAX_AUDIO_DURATION = 10 * 60  # seconds, i.e. 10 minutes
# Defaults applied to a transcription request.
DEFAULT_BEAM_SIZE = 5
# None stands for "auto-detect the language".
DEFAULT_LANGUAGE = None
DEFAULT_TRANSLATE = False
# Speaker diarization parameters.
MAX_SPEAKERS = 20
# None stands for "auto-detect the number of speakers".
DEFAULT_NUM_SPEAKERS = None
# Segment grouping parameters; both values are floats expressed in seconds.
MAX_SEGMENT_GAP = 1.0
MAX_SEGMENT_DURATION = 30.0
# Flash attention toggle and the dtype to use.
FLASH_ATTENTION_ENABLED = True
# The dtype is kept as its string name, not as a dtype object.
TORCH_DTYPE = "float16"
# ZeroGPU / device settings.
GPU_MEMORY_FRACTION = 0.8  # a fraction, i.e. 80%
CUDA_DEVICE = "cuda:0"  # device string
# Gradio interface settings: theme name plus two flags, both off.
GRADIO_THEME = "soft"
GRADIO_DEBUG = False
GRADIO_SHARE = False
# Environment variables.
# This holds the NAME of the variable to look up, not a token value.
HF_TOKEN_ENV_VAR = "HF_TOKEN"
# Supported audio formats, given as lowercase file extensions that include
# the leading dot.
SUPPORTED_AUDIO_FORMATS = [
    ".mp3",
    ".wav",
    ".m4a",
    ".flac",
    ".ogg",
    ".aac",
    ".wma",
    ".opus",
    ".webm",
]
# Language codes mapped to their display names.
# NOTE(review): auto-detection is keyed as "auto" here, while DEFAULT_LANGUAGE
# is None; confirm that the caller translates between the two.
SUPPORTED_LANGUAGES = {
    "auto": "Auto-detect",
    "en": "English",
    "es": "Spanish",
    "fr": "French",
    "de": "German",
    "it": "Italian",
    "pt": "Portuguese",
    "ru": "Russian",
    "ja": "Japanese",
    "ko": "Korean",
    "zh": "Chinese",
    "ar": "Arabic",
    "hi": "Hindi",
    "tr": "Turkish",
    "pl": "Polish",
    "nl": "Dutch",
    "sv": "Swedish",
    "da": "Danish",
    "no": "Norwegian",
    "fi": "Finnish",
}