diff --git a/capspeech/__init__.py b/capspeech/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391
diff --git a/capspeech/ar/README.md b/capspeech/ar/README.md
new file mode 100644
index 0000000000000000000000000000000000000000..a979ea30983323da1a58b84e24462e521006f7fc
--- /dev/null
+++ b/capspeech/ar/README.md
@@ -0,0 +1,44 @@
+# CapSpeech-AR
+
+## Pretrain
+
+```bash
+bash pretrain.sh
+```
+Make sure to change paths and keys in `pretrain.sh` to yours.
+
+## Finetune on CapTTS
+
+```bash
+bash finetune_captts.sh
+```
+Make sure to change paths and keys in `finetune_captts.sh` to yours.
+
+## Finetune on EmoCapTTS
+
+```bash
+bash finetune_emocaptts.sh
+```
+Make sure to change paths and keys in `finetune_emocaptts.sh` to yours.
+
+## Finetune on AccCapTTS
+
+```bash
+bash finetune_acccaptts.sh
+```
+Make sure to change paths and keys in `finetune_acccaptts.sh` to yours.
+
+## Finetune on CapTTS-SE
+
+```bash
+bash finetune_capttsse.sh
+```
+Make sure to change paths and keys in `finetune_capttsse.sh` to yours.
+
+
+## Finetune on AgentTTS
+
+```bash
+bash finetune_agenttts.sh
+```
+Make sure to change paths and keys in `finetune_agenttts.sh` to yours.
diff --git a/capspeech/ar/__init__.py b/capspeech/ar/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391
diff --git a/capspeech/ar/events.txt b/capspeech/ar/events.txt
new file mode 100644
index 0000000000000000000000000000000000000000..5a7fb37a73888498da3fdcafabbf6ae400016af3
--- /dev/null
+++ b/capspeech/ar/events.txt
@@ -0,0 +1,395 @@
+people whispering
+Microwave oven
+extending ladders
+mosquito buzzing
+dog whimpering
+coyote howling
+hair dryer drying
+Writing
+rapping
+machine gun shooting
+dog bow-wow
+dog howling
+barn swallow calling
+baby babbling
+Fireworks
+church bell ringing
+car horn
+cat caterwauling
+subway, metro, underground
+waterfall burbling
+lions roaring
+toilet flushing
+skateboarding
+wind
+ripping paper
+vacuum cleaner cleaning floors
+mouse squeaking
+keyboard typing
+playing timpani
+playing harp
+sheep bleating
+eletric blender running
+people slapping
+playing ukulele
+frog
+car engine knocking
+cat purring
+chainsaw
+Violin or fiddle
+people hiccup
+playing acoustic guitar
+donkey, ass braying
+playing french horn
+playing squash
+gibbon howling
+playing harmonica
+playing shofar
+hedge trimmer running
+playing washboard
+running electric fan
+splashing water
+playing bassoon
+people slurping
+playing accordion
+playing oboe
+popping popcorn
+glass breaking
+alarm clock ringing
+mouse click
+Laughter
+magpie calling
+playing snare drum
+people finger snapping
+ferret dooking
+tornado roaring
+Hi-hat
+lawn mowing
+church bells
+cat growling
+cheetah chirrup
+heart sounds, heartbeat
+firing muskets
+vehicle horn, car horn, honking
+turkey gobbling
+ice cream truck, ice cream van
+underwater bubbling
+footsteps on snow
+water drops
+people sobbing
+basketball bounce
+Applause
+playing sitar
+playing gong
+train
+coughing
+people screaming
+Gunshot or gunfire
+chinchilla barking
+cat hissing
+horse clip-clop
+engine
+people battle cry
+typing on computer keyboard
+playing clarinet
+driving motorcycle
+male singing
+singing bowl
+skiing
+driving buses
+alligators, crocodiles hissing
+people eating apple
+door slamming
+Flute
+raining
+Electric piano
+sliding door
+washing machine
+opening or closing car electric windows
+baby crying
+people babbling
+snake hissing
+brushing teeth
+playing tambourine
+Acoustic guitar
+clock tick
+playing castanets
+thunder
+playing didgeridoo
+playing synthesizer
+mouse clicking
+lathe spinning
+spraying water
+hen
+stream burbling
+door wood creaks
+sailing
+dog
+car engine idling
+bowling impact
+driving snowmobile
+toilet flush
+bird squawking
+playing timbales
+playing drum kit
+owl hooting
+striking pool
+Oboe
+duck quacking
+people belly laughing
+lighting firecrackers
+roller coaster running
+blowtorch igniting
+wood thrush calling
+Glockenspiel
+frog croaking
+playing harpsichord
+train horning
+plastic bottle crushing
+playing tabla
+fire crackling
+dog barking
+thunderstorm
+playing banjo
+swimming
+volcano explosion
+playing table tennis
+sea lion barking
+rowboat, canoe, kayak rowing
+Meow
+pouring water
+playing tympani
+rooster
+siren
+parrot talking
+Finger snapping
+playing steel guitar, slide guitar
+Trumpet
+tractor digging
+people coughing
+cat meowing
+Snare drum
+playing erhu
+crow cawing
+playing djembe
+whale calling
+mynah bird singing
+playing tennis
+chopping food
+golf driving
+tapping guitar
+playing cello
+dog growling
+elephant trumpeting
+sea waves
+police radio chatter
+lions growling
+playing lacrosse
+children shouting
+missile launch
+baby laughter
+air conditioning noise
+playing saxophone
+typing on typewriter
+printer printing
+race car, auto racing
+Bus
+pigeon, dove cooing
+playing violin, fiddle
+Double bass
+striking bowling
+fireworks banging
+Harmonica
+playing glockenspiel
+reversing beeps
+playing piano
+breathing
+people marching
+electric shaver, electric razor shaving
+chimpanzee pant-hooting
+cricket chirping
+bird chirping, tweeting
+using sewing machines
+crickets
+cow lowing
+playing cymbal
+vacuum cleaner
+playing zither
+train whistling
+goat bleating
+eating with cutlery
+black capped chickadee calling
+ambulance siren
+playing hockey
+dog baying
+Burping or eructation
+cupboard opening or closing
+air horn
+crying baby
+people eating crisps
+sloshing water
+goose honking
+orchestra
+people giggling
+warbler chirping
+child singing
+dinosaurs bellowing
+motorboat, speedboat acceleration
+airplane
+chicken clucking
+woodpecker pecking tree
+Drawer open or close
+people eating
+drinking sipping
+singing choir
+playing bass guitar
+playing bass drum
+car passing by
+playing tuning fork
+Squeak
+pig oinking
+Computer keyboard
+yodelling
+playing trombone
+clapping
+people sneezing
+pheasant crowing
+writing on blackboard with chalk
+Tambourine
+opening or closing car doors
+sharpen knife
+people whistling
+fireworks
+playing bagpipes
+chainsawing trees
+squishing water
+people farting
+playing electric guitar
+people booing
+female singing
+ocean burbling
+cattle mooing
+footsteps
+Knock
+wind rustling leaves
+cattle, bovinae cowbell
+Clarinet
+police car (siren)
+Fart
+cat
+sheep
+chopping wood
+tap dancing
+playing mandolin
+wind chime
+can opening
+playing hammond organ
+zebra braying
+scuba diving
+chirping birds
+playing steelpan
+playing theremin
+Keys jangling
+beat boxing
+firing cannon
+bouncing on trampoline
+door wood knock
+bathroom ventilation fan running
+snake rattling
+bull bellowing
+electric grinder grinding
+penguins braying
+otter growling
+civil defense siren
+wind noise
+people humming
+clock alarm
+disc scratching
+fire truck siren
+telephone bell ringing
+people sniggering
+playing bongo
+cap gun shooting
+opening or closing drawers
+cow
+hammering nails
+ice cracking
+foghorn
+rain
+playing badminton
+eagle screaming
+playing double bass
+insects
+people running
+planing timber
+cutting hair with electric trimmers
+Cello
+people clapping
+smoke detector beeping
+mouse pattering
+bee, wasp, etc. buzzing
+canary calling
+people burping
+Shatter
+baltimore oriole calling
+cuckoo bird calling
+snoring
+strike lighter
+people cheering
+playing bugle
+playing congas
+playing vibraphone
+hail
+rope skipping
+playing trumpet
+pig
+hand saw
+people gargling
+Scissors
+metronome
+chipmunk chirping
+playing flute
+fox barking
+crackling fire
+playing volleyball
+skidding
+Bass drum
+crow
+elk bugling
+Telephone
+Bark
+chicken crowing
+people nose blowing
+car engine starting
+pumping water
+Saxophone
+fly, housefly buzzing
+Cough
+people eating noodle
+francolin calling
+arc welding
+horse neighing
+Tearing
+helicopter
+playing electronic organ
+Cowbell
+railroad car, train wagon
+cell phone buzzing
+playing cornet
+sneezing
+engine accelerating, revving, vroom
+bird wings flapping
+playing marimba, xylophone
+playing guiro
+people crowd
+train wheels squealing
+slot machine
+laughing
+lip smacking
+forging swords
+Chime
+playing darts
+people shuffling
+Gong
+airplane flyby
+None
diff --git a/capspeech/ar/finetune_acccaptts.sh b/capspeech/ar/finetune_acccaptts.sh
new file mode 100644
index 0000000000000000000000000000000000000000..d859633873d4a7207ab0355a5233c1c139337045
--- /dev/null
+++ b/capspeech/ar/finetune_acccaptts.sh
@@ -0,0 +1,64 @@
+# Fine-tune on AccCapTTS. Please log in to Hugging Face first (e.g. `huggingface-cli login`).
+
+LIBRITTSR_WAV_DIR='' # directory of the downloaded LibriTTS-R wavs (passed as --librittsr_dir)
+OTHER_WAV_DIR='' # directory of the other downloaded wavs (passed as --other_dir)
+OUTPUT_DIR="./output_finetuning_acccaptts/" # output directory where checkpoints are saved
+TEMPORY_SAVE_TO_DISK="./audio_code_finetuning_acccaptts/" # directory where the DAC codec output is saved (--temporary_save_to_disk)
+SAVE_TO_DISK="./dataset_finetuning_acccaptts/" # directory where the Hugging Face metadata is saved (--save_to_disk)
+WANDB_KEY='' # your Weights & Biases key, used for logging
+
+PRETRAINED_MODEL_PATH="" # path to your pretrained model; also used for both tokenizers
+
+export CUDA_LAUNCH_BLOCKING=1 # NOTE(review): makes CUDA kernel launches synchronous (debug aid, slows training) - confirm it is needed
+export TORCH_USE_CUDA_DSA=1 # NOTE(review): presumably for device-side assertions while debugging - confirm it is needed
+
+accelerate launch ./training/finetune_captts.py \
+    --model_name_or_path ${PRETRAINED_MODEL_PATH} \
+    --feature_extractor_name "parler-tts/dac_44khZ_8kbps" \
+    --description_tokenizer_name ${PRETRAINED_MODEL_PATH} \
+    --prompt_tokenizer_name ${PRETRAINED_MODEL_PATH} \
+    --report_to "wandb" \
+    --wandb_key ${WANDB_KEY} \
+    --overwrite_output_dir true \
+    --train_dataset_name "OpenSound/CapSpeech" \
+    --train_split_name "train_SFT_AccCapTTS" \
+    --eval_dataset_name "OpenSound/CapSpeech" \
+    --eval_split_name "validation_SFT_AccCapTTS" \
+    --librittsr_dir ${LIBRITTSR_WAV_DIR} \
+    --other_dir ${OTHER_WAV_DIR} \
+    --max_eval_samples 96 \
+    --per_device_eval_batch_size 32 \
+    --target_audio_column_name "audio_path" \
+    --description_column_name "caption" \
+    --source_column_name "source" \
+    --prompt_column_name "text" \
+    --max_duration_in_seconds 20 \
+    --min_duration_in_seconds 3 \
+    --max_text_length 600 \
+    --preprocessing_num_workers 32 \
+    --do_train true \
+    --num_train_epochs 5 \
+    --gradient_accumulation_steps 6 \
+    --gradient_checkpointing false \
+    --per_device_train_batch_size 4 \
+    --learning_rate 0.0001 \
+    --adam_beta1 0.9 \
+    --adam_beta2 0.99 \
+    --weight_decay 0.01 \
+    --lr_scheduler_type "constant_with_warmup" \
+    --warmup_steps 1000 \
+    --logging_steps 200 \
+    --freeze_text_encoder true \
+    --per_device_eval_batch_size 4 \
+    --audio_encoder_per_device_batch_size 24 \
+    --dtype "float16" \
+    --seed 456 \
+    --output_dir ${OUTPUT_DIR} \
+    --temporary_save_to_disk ${TEMPORY_SAVE_TO_DISK} \
+    --save_to_disk ${SAVE_TO_DISK} \
+    --dataloader_num_workers 32 \
+    --do_eval \
+    --evaluation_strategy steps \
+    --eval_steps 500 \
+    --save_steps 500 \
+    --group_by_length true
diff --git a/capspeech/ar/finetune_agenttts.sh b/capspeech/ar/finetune_agenttts.sh
new file mode 100644
index 0000000000000000000000000000000000000000..c6fd7d72d480aacabf14ed8a93b29fcbc7f98398
--- /dev/null
+++ b/capspeech/ar/finetune_agenttts.sh
@@ -0,0 +1,61 @@
+# Fine-tune on AgentTTS. Please log in to Hugging Face first (e.g. `huggingface-cli login`).
+
+OTHER_WAV_DIR='' # directory of the downloaded capspeech-agentdb wavs (passed as --other_dir)
+OUTPUT_DIR="./output_finetuning_agenttts/" # output directory where checkpoints are saved
+TEMPORY_SAVE_TO_DISK="./audio_code_finetuning_agenttts/" # directory where the DAC codec output is saved (--temporary_save_to_disk)
+SAVE_TO_DISK="./dataset_finetuning_agenttts/" # directory where the Hugging Face metadata is saved (--save_to_disk)
+WANDB_KEY='' # your Weights & Biases key, used for logging
+PRETRAINED_MODEL_PATH="" # path to your pretrained model; used for the model and both tokenizers
+
+export CUDA_LAUNCH_BLOCKING=1 # NOTE(review): makes CUDA kernel launches synchronous (debug aid, slows training) - confirm it is needed
+export TORCH_USE_CUDA_DSA=1 # NOTE(review): presumably for device-side assertions while debugging - confirm it is needed
+
+accelerate launch ./training/finetune_captts.py \
+    --model_name_or_path ${PRETRAINED_MODEL_PATH} \
+    --feature_extractor_name "parler-tts/dac_44khZ_8kbps" \
+    --description_tokenizer_name ${PRETRAINED_MODEL_PATH} \
+    --prompt_tokenizer_name ${PRETRAINED_MODEL_PATH} \
+    --report_to "wandb" \
+    --wandb_key ${WANDB_KEY} \
+    --overwrite_output_dir true \
+    --train_dataset_name "OpenSound/CapSpeech" \
+    --train_split_name "train_AgentDB" \
+    --eval_dataset_name "OpenSound/CapSpeech" \
+    --eval_split_name "test_AgentDB" \
+    --other_dir ${OTHER_WAV_DIR} \
+    --max_eval_samples 96 \
+    --per_device_eval_batch_size 32 \
+    --target_audio_column_name "audio_path" \
+    --description_column_name "caption" \
+    --source_column_name "source" \
+    --prompt_column_name "text" \
+    --max_duration_in_seconds 20 \
+    --min_duration_in_seconds 3 \
+    --max_text_length 600 \
+    --preprocessing_num_workers 32 \
+    --do_train true \
+    --num_train_epochs 50 \
+    --gradient_accumulation_steps 6 \
+    --gradient_checkpointing false \
+    --per_device_train_batch_size 4 \
+    --learning_rate 0.0001 \
+    --adam_beta1 0.9 \
+    --adam_beta2 0.99 \
+    --weight_decay 0.01 \
+    --lr_scheduler_type "constant_with_warmup" \
+    --warmup_steps 500 \
+    --logging_steps 100 \
+    --freeze_text_encoder true \
+    --per_device_eval_batch_size 4 \
+    --audio_encoder_per_device_batch_size 24 \
+    --dtype "float16" \
+    --seed 456 \
+    --output_dir ${OUTPUT_DIR} \
+    --temporary_save_to_disk ${TEMPORY_SAVE_TO_DISK} \
+    --save_to_disk ${SAVE_TO_DISK} \
+    --dataloader_num_workers 32 \
+    --do_eval \
+    --evaluation_strategy steps \
+    --eval_steps 500 \
+    --save_steps 500 \
+    --group_by_length true
\ No newline at end of file
diff --git a/capspeech/ar/finetune_captts.sh b/capspeech/ar/finetune_captts.sh
new file mode 100644
index 0000000000000000000000000000000000000000..d4a3d9d1d014b925cfeb3399d0af3fa5bd3402f5
--- /dev/null
+++ b/capspeech/ar/finetune_captts.sh
@@ -0,0 +1,64 @@
+# Fine-tune on CapTTS. Please log in to Hugging Face first (e.g. `huggingface-cli login`).
+
+LIBRITTSR_WAV_DIR='' # directory of the downloaded LibriTTS-R wavs (passed as --librittsr_dir)
+OTHER_WAV_DIR='' # directory of the other downloaded wavs (passed as --other_dir)
+OUTPUT_DIR="./output_finetuning_captts/" # output directory where checkpoints are saved
+TEMPORY_SAVE_TO_DISK="./audio_code_finetuning_captts/" # directory where the DAC codec output is saved (--temporary_save_to_disk)
+SAVE_TO_DISK="./dataset_finetuning_captts/" # directory where the Hugging Face metadata is saved (--save_to_disk)
+WANDB_KEY='' # your Weights & Biases key, used for logging
+
+PRETRAINED_MODEL_PATH="" # path to your pretrained model; also used for both tokenizers
+
+export CUDA_LAUNCH_BLOCKING=1 # NOTE(review): makes CUDA kernel launches synchronous (debug aid, slows training) - confirm it is needed
+export TORCH_USE_CUDA_DSA=1 # NOTE(review): presumably for device-side assertions while debugging - confirm it is needed
+
+accelerate launch ./training/finetune_captts.py \
+    --model_name_or_path ${PRETRAINED_MODEL_PATH} \
+    --feature_extractor_name "parler-tts/dac_44khZ_8kbps" \
+    --description_tokenizer_name ${PRETRAINED_MODEL_PATH} \
+    --prompt_tokenizer_name ${PRETRAINED_MODEL_PATH} \
+    --report_to "wandb" \
+    --wandb_key ${WANDB_KEY} \
+    --overwrite_output_dir true \
+    --train_dataset_name "OpenSound/CapSpeech" \
+    --train_split_name "train_SFT_CapTTS" \
+    --eval_dataset_name "OpenSound/CapSpeech" \
+    --eval_split_name "validation_SFT_CapTTS" \
+    --librittsr_dir ${LIBRITTSR_WAV_DIR} \
+    --other_dir ${OTHER_WAV_DIR} \
+    --max_eval_samples 96 \
+    --per_device_eval_batch_size 32 \
+    --target_audio_column_name "audio_path" \
+    --description_column_name "caption" \
+    --source_column_name "source" \
+    --prompt_column_name "text" \
+    --max_duration_in_seconds 20 \
+    --min_duration_in_seconds 3 \
+    --max_text_length 600 \
+    --preprocessing_num_workers 32 \
+    --do_train true \
+    --num_train_epochs 5 \
+    --gradient_accumulation_steps 6 \
+    --gradient_checkpointing false \
+    --per_device_train_batch_size 4 \
+    --learning_rate 0.0001 \
+    --adam_beta1 0.9 \
+    --adam_beta2 0.99 \
+    --weight_decay 0.01 \
+    --lr_scheduler_type "constant_with_warmup" \
+    --warmup_steps 1000 \
+    --logging_steps 200 \
+    --freeze_text_encoder true \
+    --per_device_eval_batch_size 4 \
+    --audio_encoder_per_device_batch_size 24 \
+    --dtype "float16" \
+    --seed 456 \
+    --output_dir ${OUTPUT_DIR} \
+    --temporary_save_to_disk ${TEMPORY_SAVE_TO_DISK} \
+    --save_to_disk ${SAVE_TO_DISK} \
+    --dataloader_num_workers 32 \
+    --do_eval \
+    --evaluation_strategy steps \
+    --eval_steps 2000 \
+    --save_steps 2000 \
+    --group_by_length true
diff --git a/capspeech/ar/finetune_capttsse.sh b/capspeech/ar/finetune_capttsse.sh
new file mode 100644
index 0000000000000000000000000000000000000000..ee392882a1094f7a711dfccecc6b52ba1161fb72
--- /dev/null
+++ b/capspeech/ar/finetune_capttsse.sh
@@ -0,0 +1,62 @@
+# Fine-tune on CapTTS-SE. Please log in to Hugging Face first (e.g. `huggingface-cli login`).
+
+LIBRITTSRMIX_WAV_DIR='' # directory of the downloaded capspeech-sedb wavs (passed as --librittsrmix_dir)
+OUTPUT_DIR="./output_finetuning_capttsse/" # output directory where checkpoints are saved
+TEMPORY_SAVE_TO_DISK="./audio_code_finetuning_capttsse/" # directory where the DAC codec output is saved (--temporary_save_to_disk)
+SAVE_TO_DISK="./dataset_finetuning_capttsse/" # directory where the Hugging Face metadata is saved (--save_to_disk)
+WANDB_KEY='' # your Weights & Biases key, used for logging
+
+PRETRAINED_MODEL_PATH="" # path to your pretrained model; also used for both tokenizers
+
+export CUDA_LAUNCH_BLOCKING=1 # NOTE(review): makes CUDA kernel launches synchronous (debug aid, slows training) - confirm it is needed
+export TORCH_USE_CUDA_DSA=1 # NOTE(review): presumably for device-side assertions while debugging - confirm it is needed
+
+accelerate launch ./training/finetune_capttsse.py \
+    --model_name_or_path ${PRETRAINED_MODEL_PATH} \
+    --feature_extractor_name "parler-tts/dac_44khZ_8kbps" \
+    --description_tokenizer_name ${PRETRAINED_MODEL_PATH} \
+    --prompt_tokenizer_name ${PRETRAINED_MODEL_PATH} \
+    --report_to "wandb" \
+    --wandb_key ${WANDB_KEY} \
+    --overwrite_output_dir true \
+    --train_dataset_name "OpenSound/CapSpeech" \
+    --train_split_name "train_SEDB" \
+    --eval_dataset_name "OpenSound/CapSpeech" \
+    --eval_split_name "test_SEDB" \
+    --librittsrmix_dir ${LIBRITTSRMIX_WAV_DIR} \
+    --max_eval_samples 96 \
+    --per_device_eval_batch_size 32 \
+    --target_audio_column_name "audio_path" \
+    --description_column_name "caption" \
+    --source_column_name "source" \
+    --prompt_column_name "text" \
+    --max_duration_in_seconds 20 \
+    --min_duration_in_seconds 3 \
+    --max_text_length 600 \
+    --preprocessing_num_workers 32 \
+    --do_train true \
+    --num_train_epochs 50 \
+    --gradient_accumulation_steps 6 \
+    --gradient_checkpointing false \
+    --per_device_train_batch_size 4 \
+    --learning_rate 0.0001 \
+    --adam_beta1 0.9 \
+    --adam_beta2 0.99 \
+    --weight_decay 0.01 \
+    --lr_scheduler_type "constant_with_warmup" \
+    --warmup_steps 50 \
+    --logging_steps 20 \
+    --freeze_text_encoder true \
+    --per_device_eval_batch_size 4 \
+    --audio_encoder_per_device_batch_size 24 \
+    --dtype "float16" \
+    --seed 456 \
+    --output_dir ${OUTPUT_DIR} \
+    --temporary_save_to_disk ${TEMPORY_SAVE_TO_DISK} \
+    --save_to_disk ${SAVE_TO_DISK} \
+    --dataloader_num_workers 32 \
+    --do_eval \
+    --evaluation_strategy steps \
+    --eval_steps 50 \
+    --save_steps 50 \
+    --group_by_length true
\ No newline at end of file
diff --git a/capspeech/ar/finetune_emocaptts.sh b/capspeech/ar/finetune_emocaptts.sh
new file mode 100644
index 0000000000000000000000000000000000000000..dba6f73082708438042e2a301ea9beef33d04e41
--- /dev/null
+++ b/capspeech/ar/finetune_emocaptts.sh
@@ -0,0 +1,64 @@
+# Fine-tune on EmoCapTTS. Please log in to Hugging Face first (e.g. `huggingface-cli login`).
+
+LIBRITTSR_WAV_DIR='' # directory of the downloaded LibriTTS-R wavs (passed as --librittsr_dir)
+OTHER_WAV_DIR='' # directory of the other downloaded wavs (passed as --other_dir)
+OUTPUT_DIR="./output_finetuning_emocaptts/" # output directory where checkpoints are saved
+TEMPORY_SAVE_TO_DISK="./audio_code_finetuning_emocaptts/" # directory where the DAC codec output is saved (--temporary_save_to_disk)
+SAVE_TO_DISK="./dataset_finetuning_emocaptts/" # directory where the Hugging Face metadata is saved (--save_to_disk)
+WANDB_KEY='' # your Weights & Biases key, used for logging
+
+PRETRAINED_MODEL_PATH="" # path to your pretrained model; also used for both tokenizers
+
+export CUDA_LAUNCH_BLOCKING=1 # NOTE(review): makes CUDA kernel launches synchronous (debug aid, slows training) - confirm it is needed
+export TORCH_USE_CUDA_DSA=1 # NOTE(review): presumably for device-side assertions while debugging - confirm it is needed
+
+accelerate launch ./training/finetune_captts.py \
+    --model_name_or_path ${PRETRAINED_MODEL_PATH} \
+    --feature_extractor_name "parler-tts/dac_44khZ_8kbps" \
+    --description_tokenizer_name ${PRETRAINED_MODEL_PATH} \
+    --prompt_tokenizer_name ${PRETRAINED_MODEL_PATH} \
+    --report_to "wandb" \
+    --wandb_key ${WANDB_KEY} \
+    --overwrite_output_dir true \
+    --train_dataset_name "OpenSound/CapSpeech" \
+    --train_split_name "train_SFT_EmoCapTTS" \
+    --eval_dataset_name "OpenSound/CapSpeech" \
+    --eval_split_name "validation_SFT_EmoCapTTS" \
+    --librittsr_dir ${LIBRITTSR_WAV_DIR} \
+    --other_dir ${OTHER_WAV_DIR} \
+    --max_eval_samples 96 \
+    --per_device_eval_batch_size 32 \
+    --target_audio_column_name "audio_path" \
+    --description_column_name "caption" \
+    --source_column_name "source" \
+    --prompt_column_name "text" \
+    --max_duration_in_seconds 20 \
+    --min_duration_in_seconds 3 \
+    --max_text_length 600 \
+    --preprocessing_num_workers 32 \
+    --do_train true \
+    --num_train_epochs 5 \
+    --gradient_accumulation_steps 6 \
+    --gradient_checkpointing false \
+    --per_device_train_batch_size 4 \
+    --learning_rate 0.0001 \
+    --adam_beta1 0.9 \
+    --adam_beta2 0.99 \
+    --weight_decay 0.01 \
+    --lr_scheduler_type "constant_with_warmup" \
+    --warmup_steps 1000 \
+    --logging_steps 200 \
+    --freeze_text_encoder true \
+    --per_device_eval_batch_size 4 \
+    --audio_encoder_per_device_batch_size 24 \
+    --dtype "float16" \
+    --seed 456 \
+    --output_dir ${OUTPUT_DIR} \
+    --temporary_save_to_disk ${TEMPORY_SAVE_TO_DISK} \
+    --save_to_disk ${SAVE_TO_DISK} \
+    --dataloader_num_workers 32 \
+    --do_eval \
+    --evaluation_strategy steps \
+    --eval_steps 400 \
+    --save_steps 400 \
+    --group_by_length true
diff --git a/capspeech/ar/parler_tts/__init__.py b/capspeech/ar/parler_tts/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..06eefda998fc55aab8bb9f659ebb71c54d1ea496
--- /dev/null
+++ b/capspeech/ar/parler_tts/__init__.py
@@ -0,0 +1,25 @@
+__version__ = "0.2.2"
+
+
+from transformers import AutoConfig, AutoModel
+
+from .configuration_parler_tts import ParlerTTSConfig, ParlerTTSDecoderConfig
+from .dac_wrapper import DACConfig, DACModel
+from .modeling_parler_tts import (
+    ParlerTTSForCausalLM,
+    ParlerTTSForConditionalGeneration,
+    apply_delay_pattern_mask,
+    build_delay_pattern_mask,
+)
+
+from .streamer import ParlerTTSStreamer
+
+from importlib.metadata import version
+from packaging.version import Version
+
+if Version(version("transformers"))<= Version("4.44.2dev"):  # the DACConfig model-type key depends on the installed transformers version
+    AutoConfig.register("dac", DACConfig)
+else:
+    AutoConfig.register("dac_on_the_hub", DACConfig)  # NOTE(review): presumably avoids clashing with a "dac" type built into newer transformers - confirm
+
+AutoModel.register(DACConfig, DACModel)  # associate DACConfig with DACModel for AutoModel
diff --git a/capspeech/ar/parler_tts/configuration_parler_tts.py b/capspeech/ar/parler_tts/configuration_parler_tts.py
new file mode 100644
index 0000000000000000000000000000000000000000..723831c305e9272f660d947a297f24cb92e0c7c6
--- /dev/null
+++ b/capspeech/ar/parler_tts/configuration_parler_tts.py
@@ -0,0 +1,291 @@
+# coding=utf-8
+# Copyright 2024 and The HuggingFace Inc. team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+""" Parler-TTS model configuration"""
+
+from transformers import AutoConfig, logging
+from transformers.configuration_utils import PretrainedConfig
+
+from importlib.metadata import version
+from packaging.version import Version
+
+use_dac_on_the_hub = Version(version("transformers")) > Version("4.44.2dev")  # True when the installed transformers is newer than 4.44.2dev
+
+logger = logging.get_logger(__name__)
+
+PARLER_TTS_PRETRAINED_CONFIG_ARCHIVE_MAP = {  # model id -> URL of its config.json
+    "parler-tts/parler-tts-mini-v1": "https://huggingface.co/parler-tts/parler-tts-mini-v1/resolve/main/config.json",
+    # See all ParlerTTS models at https://huggingface.co/models?filter=parler_tts
+}
+
+
+class ParlerTTSDecoderConfig(PretrainedConfig):
+    r"""
+    This is the configuration class to store the configuration of a [`ParlerTTSDecoder`]. It is used to instantiate a
+    Parler-TTS decoder according to the specified arguments, defining the model architecture. Instantiating a
+    configuration with the defaults will yield a similar configuration to that of the Parler-TTS
+    [parler-tts/parler-tts-mini-v1](https://huggingface.co/parler-tts/parler-tts-mini-v1) architecture.
+
+    Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
+    documentation from [`PretrainedConfig`] for more information.
+
+
+    Args:
+        vocab_size (`int`, *optional*, defaults to 2049):
+            Vocabulary size of the ParlerTTSDecoder model. Defines the number of different tokens that can be
+            represented by the `inputs_ids` passed when calling [`ParlerTTSDecoder`].
+        hidden_size (`int`, *optional*, defaults to 1024):
+            Dimensionality of the layers and the pooler layer.
+        num_hidden_layers (`int`, *optional*, defaults to 24):
+            Number of decoder layers.
+        num_attention_heads (`int`, *optional*, defaults to 16):
+            Number of attention heads for each attention layer in the Transformer block.
+        num_key_value_heads (`int`, *optional*):
+            This is the number of key_value heads that should be used to implement Grouped Query Attention. If
+            `num_key_value_heads=num_attention_heads`, the model will use Multi Head Attention (MHA), if
+            `num_key_value_heads=1`, the model will use Multi Query Attention (MQA), otherwise GQA is used. When
+            converting a multi-head checkpoint to a GQA checkpoint, each group key and value head should be constructed
+            by meanpooling all the original heads within that group. For more details checkout [this
+            paper](https://arxiv.org/pdf/2305.13245.pdf). If it is not specified, will default to
+            `num_attention_heads`.
+        num_cross_attention_key_value_heads (`int`, *optional*):
+            This is the number of key_value heads that should be used to implement Grouped Query Attention in the cross-attention layers.
+            If it is not specified, will default to `num_key_value_heads`.
+        ffn_dim (`int`, *optional*, defaults to 4096):
+            Dimensionality of the "intermediate" (often named feed-forward) layer in the Transformer block.
+        activation_function (`str` or `function`, *optional*, defaults to `"gelu"`):
+            The non-linear activation function (function or string) in the decoder and pooler. If string, `"gelu"`,
+            `"relu"`, `"silu"` and `"gelu_new"` are supported.
+        dropout (`float`, *optional*, defaults to 0.1):
+            The dropout probability for all fully connected layers in the embeddings, text_encoder, and pooler.
+        attention_dropout (`float`, *optional*, defaults to 0.0):
+            The dropout ratio for the attention probabilities.
+        activation_dropout (`float`, *optional*, defaults to 0.0):
+            The dropout ratio for activations inside the fully connected layer.
+        max_position_embeddings (`int`, *optional*, defaults to 2048):
+            The maximum sequence length that this model might ever be used with. Typically, set this to something large
+            just in case (e.g., 512 or 1024 or 2048).
+        initializer_factor (`float`, *optional*, defaults to 0.02):
+            The standard deviation of the truncated_normal_initializer for initializing all weight matrices.
+        layerdrop (`float`, *optional*, defaults to 0.0):
+            The LayerDrop probability for the decoder. See the [LayerDrop paper](https://arxiv.org/abs/1909.11556)
+            for more details.
+        scale_embedding (`bool`, *optional*, defaults to `False`):
+            Whether to scale embeddings by sqrt(hidden_size).
+        use_cache (`bool`, *optional*, defaults to `True`):
+            Whether the model should return the last key/values attentions (not used by all models)
+        num_codebooks (`int`, *optional*, defaults to 4):
+            The number of parallel codebooks forwarded to the model.
+        tie_word_embeddings(`bool`, *optional*, defaults to `False`):
+            Whether input and output word embeddings should be tied.
+        rope_embeddings (`bool`, *optional*, defaults to `False`):
+            Whether to use ROPE or absolute positional embeddings.
+        rope_theta (`float`, *optional*, defaults to 10000.0):
+            The base period of the RoPE embeddings.
+        cross_attention_implementation_strategy (`str`, *optional*):
+            If not specified, the cross-attention implementation will be the same as `_attn_implementation`. If `always_eager`, it will always be the eager implementation. If `always_sdpa`, it will always be the sdpa implementation.
+        use_fused_lm_heads(`bool`, *optional*, defaults to `False`):
+            Whether to fuse audio LM heads instead of applying them sequentially.
+        codebook_weights(`List[int]`, *optional*):
+            Weights applied to each codebook when computing the loss. If given, must have length `num_codebooks`.
+    """
+
+    model_type = "parler_tts_decoder"
+    keys_to_ignore_at_inference = ["past_key_values"]
+
+    def __init__(
+        self,
+        vocab_size=2049,  # vocab size = 2048 (encodec vocab size) + 1 (eos)
+        max_position_embeddings=2048,
+        num_hidden_layers=24,
+        ffn_dim=4096,
+        num_attention_heads=16,
+        num_key_value_heads=None,
+        num_cross_attention_key_value_heads=None,
+        layerdrop=0.0,
+        use_cache=True,
+        activation_function="gelu",
+        hidden_size=1024,
+        dropout=0.1,
+        attention_dropout=0.0,
+        activation_dropout=0.0,
+        initializer_factor=0.02,
+        scale_embedding=False,
+        num_codebooks=4,
+        pad_token_id=2048,
+        bos_token_id=2049,
+        eos_token_id=2048,
+        tie_word_embeddings=False,
+        rope_embeddings=False,
+        rope_theta=10_000.0,
+        cross_attention_implementation_strategy=None,
+        use_fused_lm_heads=False,
+        codebook_weights=None,
+        **kwargs,
+    ):
+        self.vocab_size = vocab_size
+        self.max_position_embeddings = max_position_embeddings
+        self.hidden_size = hidden_size
+        self.ffn_dim = ffn_dim
+        self.num_hidden_layers = num_hidden_layers
+        self.num_attention_heads = num_attention_heads
+        if num_key_value_heads is None:
+            num_key_value_heads = num_attention_heads
+        self.num_key_value_heads = num_key_value_heads
+        if num_cross_attention_key_value_heads is None:
+            num_cross_attention_key_value_heads = num_key_value_heads
+        self.num_cross_attention_key_value_heads = num_cross_attention_key_value_heads
+        self.dropout = dropout
+        self.attention_dropout = attention_dropout
+        self.activation_dropout = activation_dropout
+        self.activation_function = activation_function
+        self.initializer_factor = initializer_factor
+        self.layerdrop = layerdrop
+        self.use_cache = use_cache
+        self.scale_embedding = scale_embedding  # scale factor will be sqrt(d_model) if True
+        self.num_codebooks = num_codebooks
+        self.rope_embeddings = rope_embeddings
+        self.rope_theta = rope_theta
+        self.cross_attention_implementation_strategy = cross_attention_implementation_strategy
+        self.use_fused_lm_heads = use_fused_lm_heads
+        self.codebook_weights = codebook_weights
+
+        if codebook_weights is not None and len(codebook_weights) != num_codebooks:
+            raise ValueError(f"`codebook_weights` has length {len(codebook_weights)} when it should be of length {num_codebooks}.")
+        super().__init__(
+            pad_token_id=pad_token_id,
+            bos_token_id=bos_token_id,
+            eos_token_id=eos_token_id,
+            tie_word_embeddings=tie_word_embeddings,
+            **kwargs,
+        )
+
+
+class ParlerTTSConfig(PretrainedConfig):
+    r"""
+    Composite configuration class of a [`ParlerTTSModel`]. It stores the configurations of the three sub-models
+    (text encoder, audio encoder and Parler-TTS decoder) and is used to instantiate a Parler-TTS model according
+    to them.
+
+    Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
+    documentation from [`PretrainedConfig`] for more information.
+
+    Args:
+        vocab_size (`int`, *optional*, defaults to 1024):
+            Vocabulary size of the prompt token ids. Defines the number of different tokens that can be
+            represented by the `prompt_inputs_ids`.
+        prompt_cross_attention (`bool`, *optional*, defaults to `False`):
+            Whether to use cross-attention conditioning for the prompt (as well as the description).
+        kwargs:
+            Dictionary of keyword arguments. The three entries below are required (`ValueError` otherwise):
+
+                - **text_encoder** (`dict`) -- The text encoder configuration as a dictionary. Its `model_type`
+                  entry selects the configuration class through `AutoConfig.for_model`.
+                - **audio_encoder** (`dict`) -- The audio encoder configuration as a dictionary, resolved the same
+                  way as the text encoder one.
+                - **decoder** (`dict`) -- The keyword arguments used to build the [`ParlerTTSDecoderConfig`] of
+                  the decoder.
+
+    Example:
+
+    ```python
+    >>> from transformers import (
+    ...     ParlerTTSConfig,
+    ...     ParlerTTSDecoderConfig,
+    ...     T5Config,
+    ...     EncodecConfig,
+    ...     ParlerTTSForConditionalGeneration,
+    ... )
+
+    >>> # Initializing text encoder, audio encoder, and decoder model configurations
+    >>> text_encoder_config = T5Config()
+    >>> audio_encoder_config = EncodecConfig()
+    >>> decoder_config = ParlerTTSDecoderConfig()
+
+    >>> configuration = ParlerTTSConfig.from_sub_models_config(
+    ...     text_encoder_config, audio_encoder_config, decoder_config
+    ... )
+
+    >>> # Initializing a ParlerTTSForConditionalGeneration (with random weights) from the parler-tts/parler-tts-mini-v1 style configuration
+    >>> model = ParlerTTSForConditionalGeneration(configuration)
+
+    >>> # Accessing the model configuration
+    >>> configuration = model.config
+    >>> config_text_encoder = model.config.text_encoder
+    >>> config_audio_encoder = model.config.audio_encoder
+    >>> config_decoder = model.config.decoder
+
+    >>> # Saving the model, including its configuration
+    >>> model.save_pretrained("parler_tts-model")
+
+    >>> # loading model and config from pretrained folder
+    >>> parler_tts_config = ParlerTTSConfig.from_pretrained("parler_tts-model")
+    >>> model = ParlerTTSForConditionalGeneration.from_pretrained("parler_tts-model", config=parler_tts_config)
+    ```"""
+
+    model_type = "parler_tts"
+    is_composition = True
+
+    def __init__(self, vocab_size=1024, prompt_cross_attention=False, **kwargs):
+        super().__init__(**kwargs)
+        if not all(name in kwargs for name in ("text_encoder", "audio_encoder", "decoder")):
+            raise ValueError("Config has to be initialized with text_encoder, audio_encoder and decoder config")
+
+        # every sub-config arrives as a plain dict; the encoders carry the `model_type` used to resolve their class
+        text_encoder_dict = kwargs.pop("text_encoder")
+        audio_encoder_dict = kwargs.pop("audio_encoder")
+        decoder_dict = kwargs.pop("decoder")
+        text_encoder_type = text_encoder_dict.pop("model_type")
+        audio_encoder_type = audio_encoder_dict.pop("model_type")
+
+        recorded_version = kwargs.get("transformers_version", None)
+        if recorded_version is not None and Version(recorded_version) <= Version("4.44.2dev"):
+            if use_dac_on_the_hub and audio_encoder_type == "dac":
+                # here we have to manually change the model type of DAC, based on the recorded transformers version
+                audio_encoder_type = "dac_on_the_hub"
+
+        self.vocab_size = vocab_size
+        self.prompt_cross_attention = prompt_cross_attention
+        self.text_encoder = AutoConfig.for_model(text_encoder_type, **text_encoder_dict)
+        self.audio_encoder = AutoConfig.for_model(audio_encoder_type, **audio_encoder_dict)
+        self.decoder = ParlerTTSDecoderConfig(**decoder_dict)
+        self.is_encoder_decoder = True
+
+    @classmethod
+    def from_sub_models_config(
+        cls,
+        text_encoder_config: PretrainedConfig,
+        audio_encoder_config: PretrainedConfig,
+        decoder_config: ParlerTTSDecoderConfig,
+        **kwargs,
+    ):
+        r"""
+        Build a [`ParlerTTSConfig`] (or a derived class) from already instantiated sub-model configurations.
+
+        Each sub-configuration is serialized with `to_dict()` and passed to the constructor along with `kwargs`.
+
+        Returns:
+            [`ParlerTTSConfig`]: An instance of a configuration object
+        """
+        sub_configs = {
+            "text_encoder": text_encoder_config.to_dict(),
+            "audio_encoder": audio_encoder_config.to_dict(),
+            "decoder": decoder_config.to_dict(),
+        }
+        return cls(**sub_configs, **kwargs)
+
+    @property
+    def sampling_rate(self):
+        """Sampling rate of the audio encoder; a property because the codec model might be changed on the fly."""
+        return self.audio_encoder.sampling_rate
\ No newline at end of file
diff --git a/capspeech/ar/parler_tts/dac_wrapper/__init__.py b/capspeech/ar/parler_tts/dac_wrapper/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..d4249591e7b2b58f48decd8e97443458b26bc019
--- /dev/null
+++ b/capspeech/ar/parler_tts/dac_wrapper/__init__.py
@@ -0,0 +1,2 @@
+from .configuration_dac import DACConfig
+from .modeling_dac import DACModel
diff --git a/capspeech/ar/parler_tts/dac_wrapper/configuration_dac.py b/capspeech/ar/parler_tts/dac_wrapper/configuration_dac.py
new file mode 100644
index 0000000000000000000000000000000000000000..8dc1553afbbd52f75eff29c2ab5eaf7e9d3407d6
--- /dev/null
+++ b/capspeech/ar/parler_tts/dac_wrapper/configuration_dac.py
@@ -0,0 +1,27 @@
+
+from transformers import PretrainedConfig
+from importlib.metadata import version
+from packaging.version import Version
+
+
+class DACConfig(PretrainedConfig):
+    """Configuration of the DAC audio codec wrapper.
+
+    `model_type` is `"dac"` when the installed `transformers` version is at most `4.44.2dev` and
+    `"dac_on_the_hub"` for newer versions. Every constructor argument is stored as an attribute of the same
+    name (`model_bitrate` is expressed in kbps); the remaining `kwargs` are forwarded to
+    [`PretrainedConfig`].
+    """
+
+    model_type = "dac_on_the_hub" if Version(version("transformers")) > Version("4.44.2dev") else "dac"
+
+    def __init__(
+        self, num_codebooks: int = 9, model_bitrate: int = 8, codebook_size: int = 1024,
+        latent_dim: int = 1024, frame_rate: int = 86, sampling_rate: int = 44100, **kwargs,
+    ):
+        # store the codec hyper-parameters before handing the remaining kwargs to the base class
+        self.codebook_size, self.model_bitrate = codebook_size, model_bitrate
+        self.latent_dim, self.num_codebooks = latent_dim, num_codebooks
+        self.frame_rate, self.sampling_rate = frame_rate, sampling_rate
+
+        super().__init__(**kwargs)
diff --git a/capspeech/ar/parler_tts/dac_wrapper/modeling_dac.py b/capspeech/ar/parler_tts/dac_wrapper/modeling_dac.py
new file mode 100644
index 0000000000000000000000000000000000000000..f38ed93f7c3b29363a8e7d4876e108c7ea66d98a
--- /dev/null
+++ b/capspeech/ar/parler_tts/dac_wrapper/modeling_dac.py
@@ -0,0 +1,164 @@
+import torch
+from dac.model import DAC
+from torch import nn
+
+from transformers import PreTrainedModel
+from transformers.models.encodec.modeling_encodec import EncodecDecoderOutput, EncodecEncoderOutput
+
+from .configuration_dac import DACConfig
+
+
+# model doesn't support batching yet
+
+
+class DACModel(PreTrainedModel):
+    """[`PreTrainedModel`] wrapper around the DAC audio codec (`dac.model.DAC`). `encode` and `decode` follow the
+    interface of HF encodec and return encodec output objects or plain tuples; `forward` is not implemented."""
+
+    config_class = DACConfig
+    # Set main input to 'input_values' for voice steering
+    main_input_name = "input_values"
+
+    def __init__(self, config):
+        super().__init__(config)
+
+        self.model = DAC(
+            n_codebooks=config.num_codebooks,
+            latent_dim=config.latent_dim,
+            codebook_size=config.codebook_size,
+        )
+
+        # drop the weight norm of every convolution, then re-apply it through `apply_weight_norm`
+        self.remove_weight_norm()
+        self.apply_weight_norm()
+
+    def encode(
+        self, input_values, padding_mask=None, bandwidth=None, return_dict=None, n_quantizers=None, sample_rate=None
+    ):
+        """
+        Encodes the input audio waveform into discrete codes.
+
+        Args:
+            input_values (`torch.Tensor` of shape `(batch_size, channels, sequence_length)`):
+                Float values of the input audio waveform.
+            padding_mask (`torch.Tensor` of shape `(batch_size, channels, sequence_length)`, *optional*):
+                Padding mask used to pad the `input_values`.
+                Not used yet, kept to have the same interface as HF encodec.
+            bandwidth (`float`, *optional*):
+                Not used, kept to have the same interface as HF encodec.
+            return_dict (`bool`, *optional*):
+                Whether to return an `EncodecEncoderOutput` instead of a plain tuple (default: `config.return_dict`).
+            n_quantizers (`int`, *optional*) :
+                Number of quantizers to use, by default None
+                If None, all quantizers are used.
+            sample_rate (`int`, *optional*) :
+                Signal sampling_rate
+
+        Returns:
+            The stacked codes of every encoded chunk (currently a single chunk covering the whole input) and a list
+            with one scale per chunk. Scales are not used here and are always `None`.
+
+        Raises:
+            ValueError: If the input has neither 1 nor 2 channels, or if its length does not fit the chunking.
+        """
+        _, channels, input_length = input_values.shape
+
+        if channels < 1 or channels > 2:
+            raise ValueError(f"Number of audio channels must be 1 or 2, but got {channels}")
+
+        audio_data = self.model.preprocess(input_values, sample_rate)
+
+        return_dict = return_dict if return_dict is not None else self.config.return_dict
+
+        # TODO: for now, no chunk length
+        chunk_length = None  # self.config.chunk_length
+        if chunk_length is None:
+            chunk_length = input_length
+            stride = input_length
+        else:
+            stride = self.config.chunk_stride
+
+        step = chunk_length - stride
+        if (input_length % stride) - step != 0:
+            raise ValueError(
+                "The input length is not properly padded for batched chunked decoding. Make sure to pad the input correctly."
+            )
+
+        encoded_frames = []
+        scales = []
+        for offset in range(0, input_length - step, stride):
+            frame = audio_data[:, :, offset : offset + chunk_length]
+            _, encoded_frame, _, _, _ = self.model.encode(frame, n_quantizers=n_quantizers)
+            encoded_frames.append(encoded_frame)
+            scales.append(None)  # the chunk is not rescaled, hence there is no scale to report
+
+        encoded_frames = torch.stack(encoded_frames)
+
+        if not return_dict:
+            return (encoded_frames, scales)
+
+        return EncodecEncoderOutput(encoded_frames, scales)
+
+    def decode(
+        self,
+        audio_codes,
+        audio_scales,
+        padding_mask=None,
+        return_dict=None,
+    ):
+        """
+        Decodes the given frames into an output audio waveform.
+
+        Note that the output might be a bit bigger than the input. In that case, any extra steps at the end can be
+        trimmed.
+
+        Args:
+            audio_codes (`torch.Tensor`):
+                Discrete codes computed using `model.encode`. Must hold exactly one chunk along the first
+                dimension, which is squeezed before decoding.
+            audio_scales (`torch.Tensor` of shape `(batch_size, nb_chunks)`, *optional*):
+                Not used, kept to have the same interface as HF encodec.
+            padding_mask (`torch.Tensor` of shape `(batch_size, channels, sequence_length)`):
+                Padding mask used to pad the `input_values`.
+                Not used yet, kept to have the same interface as HF encodec.
+            return_dict (`bool`, *optional*):
+                Whether to return a [`~utils.ModelOutput`] instead of a plain tuple (default: `config.return_dict`).
+
+        Raises:
+            ValueError: If `audio_codes` does not hold exactly one chunk.
+        """
+        # test against `None` (not truthiness) so that an explicit `return_dict=False` is not overridden
+        return_dict = return_dict if return_dict is not None else self.config.return_dict
+
+        # TODO: for now, no chunk length
+        if len(audio_codes) != 1:
+            raise ValueError(f"Expected one frame, got {len(audio_codes)}")
+
+        audio_values = self.model.quantizer.from_codes(audio_codes.squeeze(0))[0]
+        audio_values = self.model.decode(audio_values)
+        if not return_dict:
+            return (audio_values,)
+        return EncodecDecoderOutput(audio_values)
+
+    def forward(self, tensor):
+        raise ValueError("`DACModel.forward` not implemented yet")
+
+    def apply_weight_norm(self):
+        """Apply weight norm to all 1D (transposed) convolutions, via `parametrizations.weight_norm` if available."""
+        weight_norm = nn.utils.weight_norm
+        if hasattr(nn.utils.parametrizations, "weight_norm"):
+            weight_norm = nn.utils.parametrizations.weight_norm
+
+        def _apply_weight_norm(module):
+            if isinstance(module, (nn.Conv1d, nn.ConvTranspose1d)):
+                weight_norm(module)
+
+        self.apply(_apply_weight_norm)
+
+    def remove_weight_norm(self):
+        """Remove weight normalization from every `nn.Conv1d` and `nn.ConvTranspose1d` submodule."""
+        def _remove_weight_norm(module):
+            if isinstance(module, (nn.Conv1d, nn.ConvTranspose1d)):
+                nn.utils.remove_weight_norm(module)
+
+        self.apply(_remove_weight_norm)
diff --git a/capspeech/ar/parler_tts/logits_processors.py b/capspeech/ar/parler_tts/logits_processors.py
new file mode 100644
index 0000000000000000000000000000000000000000..06d47d754921dcb8503567f4611e8526fe60b623
--- /dev/null
+++ b/capspeech/ar/parler_tts/logits_processors.py
@@ -0,0 +1,54 @@
+from transformers import LogitsProcessor, LogitsProcessorList
+from transformers.pytorch_utils import isin_mps_friendly
+import math
+import torch
+
+class ParlerTTSLogitsProcessor(LogitsProcessor):
+    r"""This processor ensures that the delayed pattern mask constraints are respected: for each batch item, the
+    EOS logits are masked for every codebook that comes after the first codebook that has not produced EOS yet.
+
+    <Tip warning={true}>
+
+    This logits processor is exclusively compatible with Parler-TTS.
+    See the model documentation for examples.
+
+    </Tip>
+
+    Args:
+        eos_token_id (`Union[int, List[int], torch.Tensor]`):
+            The id(s) of the *end-of-sequence* token.
+        num_codebooks (`int`):
+            Number of codebooks per batch item; `scores` is indexed with `batch_size * num_codebooks` rows.
+        batch_size (`int`):
+            Number of batch items.
+        device (`str`, *optional*, defaults to `"cpu"`):
+            Device on which the internal index tensors are created.
+    """
+
+    def __init__(self, eos_token_id, num_codebooks: int, batch_size: int, device: str = "cpu"):
+        if not isinstance(eos_token_id, torch.Tensor):
+            if isinstance(eos_token_id, int):
+                eos_token_id = [eos_token_id]
+            eos_token_id = torch.tensor(eos_token_id, device=device)
+        if torch.is_floating_point(eos_token_id) or (eos_token_id < 0).any():
+            raise ValueError(f"`eos_token_id` has to be a list of positive integers, but is {eos_token_id}")
+        self.eos_token_id = eos_token_id
+        self.batch_size = batch_size
+        self.num_codebooks = num_codebooks
+        self.device = device
+        self.codebook_idx = torch.arange(batch_size * num_codebooks, device=device)
+        # per batch item: row of the first codebook without EOS so far, and row of the last codebook
+        self.first_codebooks_unfinished = torch.arange(batch_size, device=device) * num_codebooks
+        self.max_codebooks = self.first_codebooks_unfinished + num_codebooks - 1
+
+    def __call__(self, input_ids: torch.LongTensor, scores: torch.FloatTensor) -> torch.FloatTensor:
+        is_eos = isin_mps_friendly(input_ids, self.eos_token_id).sum(1)
+        # move on to the next codebook once the current first unfinished one contains an EOS token
+        finished = (is_eos[self.first_codebooks_unfinished] > 0) & (self.first_codebooks_unfinished < self.max_codebooks)
+        self.first_codebooks_unfinished = torch.where(finished, self.first_codebooks_unfinished + 1, self.first_codebooks_unfinished)
+        # every codebook higher than the first one unfinished will never be eos
+        eos_token_mask = self.codebook_idx > self.first_codebooks_unfinished.repeat_interleave(self.num_codebooks)
+        # fill column-wise: pairing the boolean row mask with the id tensor only broadcasts for a single EOS id
+        eos_ids = self.eos_token_id.reshape(-1)
+        scores[:, eos_ids] = scores[:, eos_ids].masked_fill(eos_token_mask.unsqueeze(1), -math.inf)
+        return scores
\ No newline at end of file
diff --git a/capspeech/ar/parler_tts/modeling_parler_tts.py b/capspeech/ar/parler_tts/modeling_parler_tts.py
new file mode 100644
index 0000000000000000000000000000000000000000..14bcf995cb3520e07e95657d6e4380a89cc94575
--- /dev/null
+++ b/capspeech/ar/parler_tts/modeling_parler_tts.py
@@ -0,0 +1,3788 @@
+# coding=utf-8
+# Copyright 2024 The HuggingFace Inc. team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# Modifications by Anuj Diwan, 2025:
+# - Added support for inference-time classifier-free guidance
+""" PyTorch ParlerTTS model."""
+import copy
+import inspect
+import math
+import random
+from dataclasses import dataclass
+from typing import TYPE_CHECKING, Any, Dict, List, Optional, Tuple, Union, List
+
+import torch
+import torch.nn as nn
+import torch.nn.functional as F
+from torch.nn import CrossEntropyLoss
+from transformers import AutoConfig, AutoModel, AutoModelForTextEncoding
+from transformers.activations import ACT2FN
+from transformers.cache_utils import (
+    Cache,
+    DynamicCache,
+    EncoderDecoderCache,
+    SlidingWindowCache,
+    StaticCache,
+)
+
+from transformers.generation.configuration_utils import GenerationConfig, GenerationMode
+from transformers.generation.logits_process import ClassifierFreeGuidanceLogitsProcessor, LogitsProcessorList
+
+from transformers.generation.stopping_criteria import StoppingCriteriaList
+from transformers.modeling_attn_mask_utils import (
+    AttentionMaskConverter,
+    _prepare_4d_attention_mask,
+    _prepare_4d_attention_mask_for_sdpa,
+)
+from transformers.modeling_outputs import (
+    BaseModelOutput,
+    BaseModelOutputWithPastAndCrossAttentions,
+    CausalLMOutputWithCrossAttentions,
+    ModelOutput,
+    Seq2SeqLMOutput,
+)
+from transformers.modeling_utils import PreTrainedModel
+from transformers.utils import (
+    add_start_docstrings,
+    add_start_docstrings_to_model_forward,
+    logging,
+    replace_return_docstrings,
+    is_torchdynamo_compiling,
+)
+from transformers.utils.import_utils import is_flash_attn_2_available, is_flash_attn_greater_or_equal_2_10
+
+from .configuration_parler_tts import ParlerTTSConfig, ParlerTTSDecoderConfig
+from .dac_wrapper import DACConfig, DACModel
+from .logits_processors import ParlerTTSLogitsProcessor
+
+from importlib.metadata import version
+from packaging.version import Version
+
+# Register the DAC wrapper with the auto classes. For transformers versions newer than 4.44.2dev the config is
+# registered as "dac_on_the_hub" (presumably because "dac" is then provided by transformers itself -- TODO
+# confirm), otherwise as "dac".
+is_dac_integrated_to_transformers = Version(version("transformers")) > Version("4.44.2dev")
+AutoConfig.register("dac_on_the_hub" if is_dac_integrated_to_transformers else "dac", DACConfig)
+
+AutoModel.register(DACConfig, DACModel)
+
+if TYPE_CHECKING:
+    from transformers.generation.streamers import BaseStreamer
+
+logger = logging.get_logger(__name__)
+
+# flash-attn is optional: its kernels are only imported when `is_flash_attn_2_available()` reports it
+if is_flash_attn_2_available():
+    from flash_attn import flash_attn_func, flash_attn_varlen_func
+    from flash_attn.bert_padding import index_first_axis, pad_input, unpad_input  # noqa
+else:
+    logger.warning("Flash attention 2 is not installed")
+
+_CONFIG_FOR_DOC = "ParlerTTSConfig"
+_CHECKPOINT_FOR_DOC = "parler-tts/parler-tts-mini-v1"
+# NOTE(review): the name still carries a MUSICGEN prefix although the listed checkpoint is a Parler-TTS one
+MUSICGEN_PRETRAINED_MODEL_ARCHIVE_LIST = [
+    "parler-tts/parler-tts-mini-v1",
+    # See all ParlerTTS models at https://huggingface.co/models?filter=parler_tts
+]
+
+# cache implementation name -> cache class
+NEED_SETUP_CACHE_CLASSES_MAPPING = {"static": StaticCache, "sliding_window": SlidingWindowCache}
+
+def _old_prepare_attention_mask_for_generation(
+    input_ids: torch.Tensor, pad_token_id: int, eos_token_id: int
+) -> torch.LongTensor:
+    """Return a mask that is 0 at `pad_token_id` positions and 1 elsewhere; all ones when the pad id is `None`,
+    does not occur in `input_ids`, or equals `eos_token_id`."""
+    pad_is_distinct = eos_token_id is None or pad_token_id != eos_token_id
+    pad_is_present = pad_token_id is not None and pad_token_id in input_ids
+    if not (pad_is_present and pad_is_distinct):
+        return input_ids.new_ones(input_ids.shape)
+    return input_ids.ne(pad_token_id).long()
+
+
+@dataclass
+class ParlerTTSSeq2SeqLMOutput(ModelOutput):
+    """
+    Base class for sequence-to-sequence language models outputs.
+
+    Args:
+        loss (`torch.FloatTensor` of shape `(1,)`, *optional*, returned when `labels` is provided):
+            Language modeling loss.
+        logits (`torch.FloatTensor` of shape `(batch_size, sequence_length, config.vocab_size)`):
+            Prediction scores of the language modeling head (scores for each vocabulary token before SoftMax).
+        past_key_values (`tuple(tuple(torch.FloatTensor))`, *optional*, returned when `use_cache=True` is passed or when `config.use_cache=True`):
+            Tuple of `tuple(torch.FloatTensor)` of length `config.n_layers`, with each tuple having 2 tensors of shape
+            `(batch_size, num_heads, sequence_length, embed_size_per_head)` and 2 additional tensors of shape
+            `(batch_size, num_heads, encoder_sequence_length, embed_size_per_head)`.
+
+            Contains pre-computed hidden-states (key and values in the self-attention blocks and in the cross-attention
+            blocks) that can be used (see `past_key_values` input) to speed up sequential decoding.
+        decoder_hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
+            Tuple of `torch.FloatTensor` (one for the output of the embeddings, if the model has an embedding layer, +
+            one for the output of each layer) of shape `(batch_size, sequence_length, hidden_size)`.
+
+            Hidden-states of the decoder at the output of each layer plus the initial embedding outputs.
+        decoder_attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
+            Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length, sequence_length)`.
+
+            Attentions weights of the decoder, after the attention softmax, used to compute the weighted average in the
+            self-attention heads.
+        cross_attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
+            Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length, sequence_length)`.
+
+            Attentions weights of the decoder's cross-attention layer, after the attention softmax, used to compute the
+            weighted average in the cross-attention heads.
+        encoder_last_hidden_state (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`, *optional*):
+            Sequence of hidden-states at the output of the last layer of the encoder of the model.
+        encoder_hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
+            Tuple of `torch.FloatTensor` (one for the output of the embeddings, if the model has an embedding layer, +
+            one for the output of each layer) of shape `(batch_size, sequence_length, hidden_size)`.
+
+            Hidden-states of the encoder at the output of each layer plus the initial embedding outputs.
+        encoder_attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
+            Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length, sequence_length)`.
+
+            Attentions weights of the encoder, after the attention softmax, used to compute the weighted average in the
+            self-attention heads.
+        per_codebook_losses (`List[torch.FloatTensor]`, *optional*):
+            Presumably the loss of each individual codebook -- inferred from the field name and type only, TODO
+            confirm against the code that populates it.
+    """
+
+    loss: Optional[torch.FloatTensor] = None
+    logits: torch.FloatTensor = None
+    past_key_values: Optional[Tuple[Tuple[torch.FloatTensor]]] = None
+    decoder_hidden_states: Optional[Tuple[torch.FloatTensor, ...]] = None
+    decoder_attentions: Optional[Tuple[torch.FloatTensor, ...]] = None
+    cross_attentions: Optional[Tuple[torch.FloatTensor, ...]] = None
+    encoder_last_hidden_state: Optional[torch.FloatTensor] = None
+    encoder_hidden_states: Optional[Tuple[torch.FloatTensor, ...]] = None
+    encoder_attentions: Optional[Tuple[torch.FloatTensor, ...]] = None
+    per_codebook_losses: Optional[List[torch.FloatTensor]] = None
+
+@dataclass
+class ParlerTTSCausalLMOutputWithCrossAttentions(ModelOutput):
+    """
+    Base class for causal language model (or autoregressive) outputs.
+
+    Args:
+        loss (`torch.FloatTensor` of shape `(1,)`, *optional*, returned when `labels` is provided):
+            Language modeling loss (for next-token prediction).
+        logits (`torch.FloatTensor` of shape `(batch_size, sequence_length, config.vocab_size)`):
+            Prediction scores of the language modeling head (scores for each vocabulary token before SoftMax).
+        hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
+            Tuple of `torch.FloatTensor` (one for the output of the embeddings, if the model has an embedding layer, +
+            one for the output of each layer) of shape `(batch_size, sequence_length, hidden_size)`.
+
+            Hidden-states of the model at the output of each layer plus the optional initial embedding outputs.
+        attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
+            Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length, sequence_length)`.
+
+            Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
+            heads.
+        cross_attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
+            Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length, sequence_length)`.
+
+            Cross attentions weights after the attention softmax, used to compute the weighted average in the
+            cross-attention heads.
+        past_key_values (`tuple(tuple(torch.FloatTensor))`, *optional*, returned when `use_cache=True` is passed or when `config.use_cache=True`):
+            Tuple of `torch.FloatTensor` tuples of length `config.n_layers`, with each tuple containing the cached key,
+            value states of the self-attention and the cross-attention layers if model is used in encoder-decoder
+            setting. Only relevant if `config.is_decoder = True`.
+
+            Contains pre-computed hidden-states (key and values in the attention blocks) that can be used (see
+            `past_key_values` input) to speed up sequential decoding.
+        per_codebook_losses (`List[torch.FloatTensor]`, *optional*):
+            Presumably the loss of each individual codebook (inferred from the field name only; TODO confirm).
+    """
+
+    loss: Optional[torch.FloatTensor] = None
+    logits: torch.FloatTensor = None
+    past_key_values: Optional[Tuple[Tuple[torch.FloatTensor]]] = None
+    hidden_states: Optional[Tuple[torch.FloatTensor, ...]] = None
+    attentions: Optional[Tuple[torch.FloatTensor, ...]] = None
+    cross_attentions: Optional[Tuple[torch.FloatTensor, ...]] = None
+    per_codebook_losses: Optional[List[torch.FloatTensor]] = None
+
+def apply_delay_pattern_mask(input_ids, decoder_pad_token_mask):
+    """Overlay the delay pattern mask on the decoder input ids: the mask is cropped to the length of
+    `input_ids`, then every mask position that is not -1 replaces the corresponding id, while positions
+    equal to -1 keep the value found in `input_ids`."""
+    pattern = decoder_pad_token_mask[..., : input_ids.shape[-1]]
+    keep_prediction = pattern == -1
+    return torch.where(keep_prediction, input_ids, pattern)
+
+
+def build_delay_pattern_mask(
+    input_ids: torch.LongTensor, bos_token_id: int, pad_token_id: int, max_length: int, num_codebooks: int
+):
+    """Build a delayed pattern mask to the input_ids. Each codebook is offset by the previous codebook by
+    one, giving a delayed pattern mask at the start of sequence and end of sequence. Take the example where there
+    are 4 codebooks and a max sequence length of 8, we have the delayed pattern mask of shape `(codebooks,
+    seq_len)`:
+    - [B, -1, -1, -1, -1, P, P, P]
+    - [B, B, -1, -1, -1, -1, P, P]
+    - [B, B, B, -1, -1, -1, -1, P]
+    - [B, B, B, B, -1, -1, -1, -1]
+    where B is `bos_token_id`, P is `pad_token_id` and -1 indicates that the token is valid for prediction. If we
+    include a prompt (decoder input ids), the -1 positions indicate where new tokens should be predicted.
+    Otherwise, the mask is set to the value in the prompt:
+    - [B, a, b, -1, -1, P, P, P]
+    - [B, B, c, d, -1, -1, P, P]
+    - [B, B, B, e, f, -1, -1, P]
+    - [B, B, B, B, g, h, -1, -1]
+    where a-h indicate the input prompt (decoder input ids) that are offset by 1. Now, we only override the -1
+    tokens in our prediction.
+
+    Returns a tuple `(input_ids, pattern_mask)`, both with `bsz * num_codebooks` rows: the masked ids cropped just
+    before the first position left to predict, and the full mask of length `max_length`. When
+    `max_length < 2 * num_codebooks - 1` the ids are returned unchanged together with a mask filled with -1.
+    """
+    # (bsz * num_codebooks, seq_len) -> (bsz, num_codebooks, seq_len)
+    input_ids = input_ids.reshape(-1, num_codebooks, input_ids.shape[-1])
+    bsz, num_codebooks, seq_len = input_ids.shape
+
+    input_ids_shifted = torch.full((bsz, num_codebooks, max_length), -1, dtype=torch.long, device=input_ids.device)
+
+    # we only apply the mask if we have a large enough seq len - otherwise we return as is
+    if max_length < 2 * num_codebooks - 1:
+        return input_ids.reshape(bsz * num_codebooks, -1), input_ids_shifted.reshape(bsz * num_codebooks, -1)
+
+    # fill the shifted ids with the prompt entries, offset by the codebook idx
+    for codebook in range(num_codebooks):
+        # mono channel - loop over the codebooks one-by-one
+        input_ids_shifted[:, codebook, codebook : seq_len + codebook] = input_ids[:, codebook]
+
+    # construct a pattern mask that indicates the positions of padding tokens for each codebook
+    # first fill the upper triangular part (the EOS padding)
+    eos_delay_pattern = torch.triu(
+        torch.ones((num_codebooks, max_length), dtype=torch.bool), diagonal=max_length - num_codebooks + 1
+    ).to(input_ids.device)
+    # then fill the lower triangular part (the BOS padding)
+    bos_delay_pattern = torch.tril(torch.ones((num_codebooks, max_length), dtype=torch.bool)).to(input_ids.device)
+
+    # positions covered by neither pattern keep the (shifted) prompt entry or the -1 placeholder
+    free_positions = ~(bos_delay_pattern | eos_delay_pattern)
+    input_ids = free_positions * input_ids_shifted + bos_delay_pattern * bos_token_id + eos_delay_pattern * pad_token_id
+
+    # find the first position to start generating - this is the first place we have the -1 token
+    # and will always be in the first codebook (since it has no codebook offset)
+    start_ids = (input_ids[:, 0, :] == -1).nonzero()[:, 1]
+    # reduce on the tensor itself (the builtin `min` would walk it element by element); when no token is left
+    # to fill, the entire matrix of input ids is returned
+    first_start_id = start_ids.min() if start_ids.numel() > 0 else seq_len
+
+    # (bsz, num_codebooks, seq_len) -> (bsz * num_codebooks, seq_len)
+    pattern_mask = input_ids.reshape(bsz * num_codebooks, -1)
+    input_ids = input_ids[..., :first_start_id].reshape(bsz * num_codebooks, -1)
+    return input_ids, pattern_mask
+
+
+# Copied from transformers.models.llama.modeling_llama.repeat_kv
+def repeat_kv(hidden_states: torch.Tensor, n_rep: int) -> torch.Tensor:
+    """
+    Duplicate every key/value head `n_rep` times along the head dimension, i.e. the same result as
+    `torch.repeat_interleave(hidden_states, repeats=n_rep, dim=1)`.
+
+    Args:
+        hidden_states (`torch.Tensor`): 4-D tensor laid out as `(batch, num_key_value_heads, seqlen, head_dim)`.
+        n_rep (`int`): number of copies of each head; `1` returns the input tensor itself.
+
+    Returns:
+        `torch.Tensor` of shape `(batch, num_key_value_heads * n_rep, seqlen, head_dim)`.
+    """
+    # unpacking first keeps the 4-D requirement even on the `n_rep == 1` fast path
+    bsz, kv_heads, seq_len, dim = hidden_states.shape
+    if n_rep != 1:
+        # insert a repeat axis right after the head axis, broadcast it, then fold it into the head axis
+        expanded = hidden_states.unsqueeze(2).expand(bsz, kv_heads, n_rep, seq_len, dim)
+        hidden_states = expanded.reshape(bsz, kv_heads * n_rep, seq_len, dim)
+    return hidden_states
+
+
+@dataclass
+class ParlerTTSUnconditionalInput(ModelOutput):
+    """
+    Groups the text-encoder outputs, the matching attention mask and a guidance scale in one `ModelOutput`.
+    NOTE(review): the class name suggests this is the input bundle for unconditional generation - confirm
+    against the code that builds it.
+
+    Args:
+        encoder_outputs (`Tuple[torch.FloatTensor]` of length 1, with tensor shape `(batch_size, sequence_length, hidden_size)`):
+            Sequence of hidden-states at the output of the last layer of the text encoder model.
+        attention_mask (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
+            Encoder attention mask to avoid performing attention on padding token indices. Mask values selected in
+            `[0, 1]`: 1 for tokens that are **not masked**, 0 for tokens that are **masked**.
+        guidance_scale (`float`, *optional*):
+            Guidance scale for classifier free guidance, setting the balance between the conditional logits (predicted
+            from the prompts) and the unconditional logits (predicted without prompts).
+    """
+
+    # every field defaults to None, hence the Optional annotations
+    encoder_outputs: Optional[Tuple[torch.FloatTensor]] = None
+    attention_mask: Optional[torch.LongTensor] = None
+    guidance_scale: Optional[float] = None
+
+
+# Copied from transformers.models.encoder_decoder.modeling_encoder_decoder.shift_tokens_right
+def shift_tokens_right(input_ids: torch.Tensor, pad_token_id: int, decoder_start_token_id: int):
+    """
+    Build decoder inputs from labels by moving every token one position to the right.
+
+    Args:
+        input_ids (`torch.Tensor`): token ids; the last position along dim 1 is dropped.
+        pad_token_id (`int`): value substituted for every `-100` in the shifted result.
+        decoder_start_token_id (`int`): value written into position 0 along dim 1.
+
+    Returns:
+        `torch.Tensor` with the same shape and dtype as `input_ids`.
+
+    Raises:
+        ValueError: if `decoder_start_token_id` or `pad_token_id` is `None`.
+    """
+    shifted = input_ids.new_zeros(input_ids.shape)
+    shifted[:, 1:] = input_ids[:, :-1].clone()
+
+    if decoder_start_token_id is None:
+        raise ValueError("Make sure to set the decoder_start_token_id attribute of the model's configuration.")
+    shifted[:, 0] = decoder_start_token_id
+
+    if pad_token_id is None:
+        raise ValueError("Make sure to set the pad_token_id attribute of the model's configuration.")
+
+    # labels use -100 as the "ignore" marker; swap it for the pad token in the decoder inputs
+    ignore_positions = shifted == -100
+    return shifted.masked_fill(ignore_positions, pad_token_id)
+
+
+# Copied from transformers.models.musicgen.modeling_musicgen.MusicgenSinusoidalPositionalEmbedding with Musicgen->ParlerTTS
+class ParlerTTSSinusoidalPositionalEmbedding(nn.Module):
+    """This module produces sinusoidal positional embeddings of any length.
+
+    The lookup table lives in `self.weights`, a frozen `nn.Parameter` of shape `(num_positions, embedding_dim)`.
+    `forward` rebuilds it with more rows when it is asked for positions the current table does not cover.
+
+    Args:
+        num_positions (`int`): number of rows in the initial table.
+        embedding_dim (`int`): size of each positional embedding vector.
+    """
+
+    def __init__(self, num_positions: int, embedding_dim: int):
+        super().__init__()
+        self.embedding_dim = embedding_dim
+        self.make_weights(num_positions, embedding_dim)
+
+    def make_weights(self, num_embeddings: int, embedding_dim: int):
+        """(Re)build `self.weights` with `num_embeddings` rows, keeping dtype and device of an existing table."""
+        emb_weights = self.get_embedding(num_embeddings, embedding_dim)
+        if hasattr(self, "weights"):
+            # a table already exists: keep the new one on the same dtype and device
+            emb_weights = emb_weights.to(dtype=self.weights.dtype, device=self.weights.device)
+
+        self.weights = nn.Parameter(emb_weights)
+        self.weights.requires_grad = False
+        self.weights.detach_()
+
+    @staticmethod
+    def get_embedding(num_embeddings: int, embedding_dim: int):
+        """
+        Build sinusoidal embeddings. This matches the implementation in tensor2tensor, but differs slightly from the
+        description in Section 3.5 of "Attention Is All You Need".
+
+        Returns:
+            `torch.Tensor` of shape `(num_embeddings, embedding_dim)` in the default dtype: all cosine columns first,
+            then all sine columns, plus one trailing zero column when `embedding_dim` is odd.
+        """
+        half_dim = embedding_dim // 2
+        emb = math.log(10000) / (half_dim - 1)
+        emb = torch.exp(torch.arange(half_dim, dtype=torch.int64).float() * -emb)
+        emb = torch.arange(num_embeddings, dtype=torch.int64).float().unsqueeze(1) * emb.unsqueeze(0)
+        emb = torch.cat([torch.cos(emb), torch.sin(emb)], dim=1).view(num_embeddings, -1)
+        if embedding_dim % 2 == 1:
+            # zero pad
+            emb = torch.cat([emb, torch.zeros(num_embeddings, 1)], dim=1)
+        return emb.to(torch.get_default_dtype())
+
+    @torch.no_grad()
+    def forward(self, input_ids: torch.Tensor, past_key_values_length: int = 0):
+        """
+        Look up the embeddings for positions `past_key_values_length ... past_key_values_length + seq_len - 1`.
+
+        Args:
+            input_ids (`torch.Tensor`): 3-D tensor; only its second dimension (`seq_len`) and its device are used.
+            past_key_values_length (`int`, *optional*, defaults to 0): offset added to every position index.
+
+        Returns:
+            `torch.Tensor` of shape `(seq_len, embedding_dim)`, detached from the table.
+        """
+        _, seq_len, _ = input_ids.size()
+        # Create the position ids from the input token ids.
+        position_ids = torch.arange(seq_len, device=input_ids.device) + past_key_values_length
+
+        # Expand the table if needed. The offset is included in the size check only when it is a plain int, so a
+        # tensor-valued offset never forces a host/device sync on the common path.
+        needed = seq_len
+        if isinstance(past_key_values_length, int):
+            needed += past_key_values_length
+        if needed > self.weights.size(0):
+            # the table must reach the largest requested position, i.e. offset + seq_len rows
+            self.make_weights(int(seq_len + past_key_values_length), self.embedding_dim)
+        return self.weights.index_select(0, position_ids.view(-1)).detach()
+
+
+# Copied from transformers.models.llama.modeling_llama.LlamaRotaryEmbedding with Llama->ParlerTTS
+class ParlerTTSRotaryEmbedding(nn.Module):
+    """Produces the cosine / sine tables used by rotary position embeddings (RoPE).
+
+    Args:
+        dim: size of the rotary dimension; `dim / 2` inverse frequencies are created (for even `dim`).
+        max_position_embeddings: number of positions covered by the `_cos_cached` / `_sin_cached` buffers.
+        base: base of the geometric progression of inverse frequencies.
+        device: device on which the buffers are created.
+        scaling_factor: divisor applied to the position indices of the cached tables.
+    """
+
+    def __init__(self, dim, max_position_embeddings=2048, base=10000, device=None, scaling_factor=1.0):
+        super().__init__()
+        self.scaling_factor = scaling_factor
+        self.dim = dim
+        self.max_position_embeddings = max_position_embeddings
+        self.base = base
+
+        # inverse frequencies base^(-2i / dim), one per pair of channels
+        exponents = torch.arange(0, self.dim, 2, dtype=torch.int64).float().to(device) / self.dim
+        self.register_buffer("inv_freq", 1.0 / (self.base**exponents), persistent=False)
+
+        # For BC we register cos and sin cached
+        self.max_seq_len_cached = max_position_embeddings
+        positions = torch.arange(self.max_seq_len_cached, device=device, dtype=torch.int64).type_as(self.inv_freq)
+        angles = torch.outer(positions / self.scaling_factor, self.inv_freq)
+        # Different from paper, but it uses a different permutation in order to obtain the same calculation
+        table = torch.cat((angles, angles), dim=-1)
+        default_dtype = torch.get_default_dtype()
+        for buffer_name, values in (("_cos_cached", table.cos()), ("_sin_cached", table.sin())):
+            self.register_buffer(buffer_name, values.to(default_dtype), persistent=False)
+
+    # Ignore copy
+    @torch.no_grad()
+    def forward(self, device_type, position_ids):
+        """
+        Compute cos / sin for the given positions.
+
+        Args:
+            device_type: device type string handed to `torch.autocast`; non-string values and `"mps"` fall back
+                to `"cpu"`.
+            position_ids (`torch.Tensor`): 2-D tensor of position indices, indexed as `(batch, seq_len)`.
+
+        Returns:
+            `(cos, sin)`, two float32 tensors of shape `(batch, seq_len, 2 * len(inv_freq))`.
+        """
+        batch = position_ids.shape[0]
+        freq_column = self.inv_freq[None, :, None].expand(batch, -1, 1)
+        position_row = position_ids[:, None, :]
+
+        if not isinstance(device_type, str) or device_type == "mps":
+            device_type = "cpu"
+
+        # Force float32 since bfloat16 loses precision on long contexts
+        # See https://github.com/huggingface/transformers/pull/29285
+        with torch.autocast(device_type=device_type, enabled=False):
+            angles = torch.matmul(freq_column.float(), position_row.float()).transpose(1, 2)
+            rotary = torch.cat((angles, angles), dim=-1)
+            return rotary.cos(), rotary.sin()
+
+
+def rotate_half(x):
+    """Rotates half the hidden dims of the input: returns `cat(-second_half, first_half)` over the last dim."""
+    split = x.shape[-1] // 2
+    first_half, second_half = x[..., :split], x[..., split:]
+    return torch.cat((second_half.neg(), first_half), dim=-1)
+
+
+def apply_rotary_pos_emb(x, cos, sin, unsqueeze_dim=1):
+    """Applies Rotary Position Embedding to a single tensor (call it once for queries and once for keys).
+
+    Args:
+        x (`torch.Tensor`): The tensor over which to apply the rope embeddings
+        cos (`torch.Tensor`): The cosine part of the rotary embedding.
+        sin (`torch.Tensor`): The sine part of the rotary embedding.
+        unsqueeze_dim (`int`, *optional*, defaults to 1):
+            Dimension along which `cos` and `sin` are unsqueezed so that they broadcast against `x`. For example,
+            if `cos` and `sin` have the shape [batch_size, seq_len, head_dim] and `x` has the shape
+            [batch_size, heads, seq_len, head_dim], use unsqueeze_dim=1. If `x` has the shape
+            [batch_size, seq_len, heads, head_dim], use unsqueeze_dim=2.
+    Returns:
+        `torch.Tensor`: `x` rotated using the Rotary Position Embedding, i.e. `x * cos + rotate_half(x) * sin`.
+    """
+    cos_broadcast = cos.unsqueeze(unsqueeze_dim)
+    sin_broadcast = sin.unsqueeze(unsqueeze_dim)
+    return x * cos_broadcast + rotate_half(x) * sin_broadcast
+
+
+class ParlerTTSAttention(nn.Module):
+    """Multi-headed attention from 'Attention Is All You Need' paper. Modified to use GQA and MQA.
+
+    Queries are projected to `num_heads` heads, keys and values to `num_key_value_heads` heads which are then
+    repeated `num_heads // num_key_value_heads` times before the attention product. The module acts as
+    self-attention when `forward` gets no `key_value_states` and as cross-attention when it does.
+
+    Args:
+        embed_dim (`int`): model dimension; must be divisible by `num_heads`.
+        num_heads (`int`): number of query heads.
+        num_key_value_heads (`int`): number of key/value heads; must divide `num_heads`.
+        dropout (`float`, defaults to 0.0): dropout probability on the attention probabilities (training only).
+        is_decoder (`bool`, defaults to `False`): stored on the module; a decoder without `layer_idx` logs a warning.
+        bias (`bool`, defaults to `True`): whether the four linear projections carry a bias.
+        is_causal (`bool`, defaults to `False`): stored on the module; not read by this eager forward pass.
+        rope_embeddings (`bool`, defaults to `False`): apply rotary embeddings to queries and self-attention keys.
+        layer_idx (`int`, *optional*): index of this layer inside the key/value cache.
+        config (`ParlerTTSDecoderConfig`, *optional*): stored on the module for subclasses.
+
+    Raises:
+        ValueError: if `embed_dim` is not divisible by `num_heads`, or `num_heads` is not a positive multiple of
+            `num_key_value_heads`.
+    """
+
+    def __init__(
+        self,
+        embed_dim: int,
+        num_heads: int,
+        num_key_value_heads: int,
+        dropout: float = 0.0,
+        is_decoder: bool = False,
+        bias: bool = True,
+        is_causal: bool = False,
+        rope_embeddings: bool = False,
+        layer_idx: Optional[int] = None,
+        config: Optional[ParlerTTSDecoderConfig] = None,
+    ):
+        super().__init__()
+        self.embed_dim = embed_dim
+        self.num_heads = num_heads
+        self.dropout = dropout
+        self.head_dim = embed_dim // num_heads
+        self.num_key_value_heads = num_key_value_heads
+        self.config = config
+
+        if (self.head_dim * num_heads) != self.embed_dim:
+            raise ValueError(
+                f"embed_dim must be divisible by num_heads (got `embed_dim`: {self.embed_dim}"
+                f" and `num_heads`: {num_heads})."
+            )
+        # Fail at construction time: otherwise `repeat_kv` yields fewer heads than the queries have and the
+        # mismatch only surfaces as a shape error inside the attention matmul.
+        if num_key_value_heads <= 0 or num_heads % num_key_value_heads != 0:
+            raise ValueError(
+                f"num_heads must be divisible by num_key_value_heads (got `num_heads`: {num_heads}"
+                f" and `num_key_value_heads`: {num_key_value_heads})."
+            )
+        self.num_key_value_groups = self.num_heads // self.num_key_value_heads
+        # queries are pre-multiplied by this factor in `forward`
+        self.scaling = self.head_dim**-0.5
+        self.is_decoder = is_decoder
+        self.is_causal = is_causal
+
+        if layer_idx is None and is_decoder:
+            logger.warning_once(
+                f"Instantiating a decoder {self.__class__.__name__} without passing `layer_idx` is not recommended and "
+                "will lead to errors during the forward call, if caching is used. Please make sure to provide a "
+                "`layer_idx` when creating this class."
+            )
+        self.layer_idx = layer_idx
+
+        self.k_proj = nn.Linear(embed_dim, self.num_key_value_heads * self.head_dim, bias=bias)
+        self.v_proj = nn.Linear(embed_dim, self.num_key_value_heads * self.head_dim, bias=bias)
+        self.q_proj = nn.Linear(embed_dim, embed_dim, bias=bias)
+        self.out_proj = nn.Linear(embed_dim, embed_dim, bias=bias)
+
+        self.rope_embeddings = rope_embeddings
+
+    def _shape_query(self, tensor: torch.Tensor, seq_len: int, bsz: int):
+        """Reshape `(bsz, seq_len, embed_dim)` into a contiguous `(bsz, num_heads, seq_len, head_dim)` tensor."""
+        return tensor.view(bsz, seq_len, self.num_heads, self.head_dim).transpose(1, 2).contiguous()
+
+    def _shape_key_value(self, tensor: torch.Tensor, seq_len: int, bsz: int):
+        """Same as `_shape_query` but with `num_key_value_heads` heads; `seq_len` may be -1 to infer it."""
+        return tensor.view(bsz, seq_len, self.num_key_value_heads, self.head_dim).transpose(1, 2).contiguous()
+
+    def forward(
+        self,
+        hidden_states: torch.Tensor,
+        key_value_states: Optional[torch.Tensor] = None,
+        past_key_value: Optional[EncoderDecoderCache] = None,
+        attention_mask: Optional[torch.Tensor] = None,
+        cos: Optional[torch.LongTensor] = None,
+        sin: Optional[torch.LongTensor] = None,
+        layer_head_mask: Optional[torch.Tensor] = None,
+        output_attentions: bool = False,
+        cache_position: Optional[torch.LongTensor] = None,
+    ) -> Tuple[torch.Tensor, Optional[torch.Tensor], Optional[Tuple[torch.Tensor]]]:
+        """Input shape: Batch x Time x Channel
+
+        Args:
+            hidden_states: query source, `(bsz, tgt_len, embed_dim)`.
+            key_value_states: key/value source; when given the layer runs as cross-attention.
+            past_key_value: cache holding a `self_attention_cache` and a `cross_attention_cache`.
+            attention_mask: additive 4-D mask; it is sliced to the key length on its last dimension.
+            cos, sin: rotary tables, required when `rope_embeddings` is enabled.
+            layer_head_mask: per-head multiplier of shape `(num_heads,)`.
+            output_attentions: accepted for interface compatibility; this implementation always returns the
+                attention weights.
+            cache_position: positions handed to the self-attention cache update.
+
+        Returns:
+            `(attn_output, attn_weights, past_key_value)` where `past_key_value` is the self- or cross-attention
+            sub-cache that was used (or `None` when no cache was passed).
+
+        Raises:
+            ValueError: on a wrongly sized `layer_head_mask` or an unexpected attention output shape.
+        """
+
+        # if key_value_states are provided this layer is used as a cross-attention layer
+        # for the decoder
+        is_cross_attention = key_value_states is not None
+        bsz, tgt_len = hidden_states.shape[:2]
+
+        # get query proj
+        query_states = self.q_proj(hidden_states) * self.scaling
+        query_states = self._shape_query(query_states, tgt_len, bsz)
+        if self.rope_embeddings:
+            query_states = apply_rotary_pos_emb(query_states, cos, sin)
+
+        if past_key_value is not None:
+            # read the flag before it is set below: it tells whether cross-attention k/v were cached already
+            is_updated = past_key_value.is_updated.get(self.layer_idx)
+            if is_cross_attention:
+                # after the first generated id, we can subsequently re-use all key/value_states from cache
+                past_key_value.is_updated[self.layer_idx] = True
+                past_key_value = past_key_value.cross_attention_cache
+            else:
+                past_key_value = past_key_value.self_attention_cache
+
+        # use key_value_states if cross attention
+        current_states = key_value_states if key_value_states is not None else hidden_states
+        if is_cross_attention and past_key_value and is_updated:
+            # reuse k,v, cross_attentions
+            key_states = past_key_value.key_cache[self.layer_idx]
+            value_states = past_key_value.value_cache[self.layer_idx]
+        else:
+            key_states = self._shape_key_value(self.k_proj(current_states), -1, bsz)
+            value_states = self._shape_key_value(self.v_proj(current_states), -1, bsz)
+
+            if not is_cross_attention:
+                # cached key states already have rope applied - only apply to new state
+                key_states = apply_rotary_pos_emb(key_states, cos, sin) if self.rope_embeddings else key_states
+
+            if past_key_value is not None:
+                # save all key/value_states to cache to be re-used for fast auto-regressive generation
+                cache_position = cache_position if not is_cross_attention else None
+                key_states, value_states = past_key_value.update(
+                    key_states, value_states, self.layer_idx, {"cache_position": cache_position}
+                )
+
+        # repeat k/v heads so they match the number of query heads
+        key_states = repeat_kv(key_states, self.num_key_value_groups)
+        value_states = repeat_kv(value_states, self.num_key_value_groups)
+
+        # queries were already scaled, so this is the scaled dot product
+        attn_weights = torch.matmul(query_states, key_states.transpose(2, 3))
+
+        if attention_mask is not None:  # no matter the length, we just slice it
+            causal_mask = attention_mask[:, :, :, : key_states.shape[-2]]
+            attn_weights = attn_weights + causal_mask
+
+        attn_weights = nn.functional.softmax(attn_weights, dim=-1)
+
+        if layer_head_mask is not None:
+            if layer_head_mask.size() != (self.num_heads,):
+                raise ValueError(
+                    f"Head mask for a single layer should be of size {(self.num_heads,)}, but is"
+                    f" {layer_head_mask.size()}"
+                )
+            attn_weights = layer_head_mask.view(1, -1, 1, 1) * attn_weights
+
+        # dropout is applied to a copy, so the returned `attn_weights` stay un-dropped
+        attn_probs = nn.functional.dropout(attn_weights, p=self.dropout, training=self.training)
+        attn_output = torch.matmul(attn_probs, value_states)
+
+        if attn_output.size() != (bsz, self.num_heads, tgt_len, self.head_dim):
+            raise ValueError(
+                f"`attn_output` should be of size {(bsz, self.num_heads, tgt_len, self.head_dim)}, but is"
+                f" {attn_output.size()}"
+            )
+
+        attn_output = attn_output.transpose(1, 2)
+        # Use the `embed_dim` from the config (stored in the class) rather than `hidden_state` because `attn_output` can be
+        # partitioned across GPUs when using tensor-parallelism.
+        attn_output = attn_output.reshape(bsz, tgt_len, self.embed_dim)
+
+        attn_output = self.out_proj(attn_output)
+
+        return attn_output, attn_weights, past_key_value
+
+
+def _get_unpad_data(attention_mask):
+    """
+    Derive, from a padding mask, the bookkeeping consumed by the variable-length flash-attention call.
+
+    Args:
+        attention_mask (`torch.Tensor`): mask whose non-zero entries mark real tokens; lengths are summed over the
+            last dimension.
+
+    Returns:
+        A tuple `(indices, cu_seqlens, max_seqlen_in_batch)`:
+        the flat indices of the non-zero mask entries, the int32 cumulative sequence lengths with a leading 0,
+        and the longest sequence length as a Python number.
+    """
+    lengths = attention_mask.sum(dim=-1, dtype=torch.int32)
+    flat_token_indices = torch.nonzero(attention_mask.flatten(), as_tuple=False).flatten()
+    longest = lengths.max().item()
+    # prepend a zero so that entry i is the start offset of sequence i in the packed layout
+    cumulative_lengths = F.pad(torch.cumsum(lengths, dim=0, dtype=torch.int32), (1, 0))
+    return flat_token_indices, cumulative_lengths, longest
+
+
+# Copied from transformers.models.musicgen.modeling_musicgen.MusicgenFlashAttention2 with Musicgen->ParlerTTS
+class ParlerTTSFlashAttention2(ParlerTTSAttention):
+    """
+    ParlerTTS flash attention module. This module inherits from `ParlerTTSAttention` as the weights of the module stays
+    untouched. The only required change would be on the forward pass where it needs to correctly call the public API of
+    flash attention and deal with padding tokens in case the input contains any of them.
+
+    Attention weights are never materialised by this implementation, so `forward` always returns `None` for them.
+    """
+
+    # Copied from transformers.models.llama.modeling_llama.LlamaFlashAttention2.__init__
+    def __init__(self, *args, **kwargs):
+        super().__init__(*args, **kwargs)
+
+        # TODO: Should be removed once Flash Attention for RoCm is bumped to 2.1.
+        # flash_attn<2.1 generates top-left aligned causal mask, while what is needed here is bottom-right alignement, that was made default for flash_attn>=2.1. This attribute is used to handle this difference. Reference: https://github.com/Dao-AILab/flash-attention/releases/tag/v2.1.0.
+        # Beware that with flash_attn<2.1, using q_seqlen != k_seqlen (except for the case q_seqlen == 1) produces a wrong mask (top-left).
+        self._flash_attn_uses_top_left_mask = not is_flash_attn_greater_or_equal_2_10()
+
+    def forward(
+        self,
+        hidden_states: torch.Tensor,
+        key_value_states: Optional[torch.Tensor] = None,
+        past_key_value: Optional[EncoderDecoderCache] = None,
+        attention_mask: Optional[torch.Tensor] = None,
+        cos: Optional[torch.LongTensor] = None,
+        sin: Optional[torch.LongTensor] = None,
+        layer_head_mask: Optional[torch.Tensor] = None,
+        output_attentions: bool = False,
+        cache_position: Optional[torch.LongTensor] = None,
+    ) -> Tuple[torch.Tensor, Optional[torch.Tensor], Optional[Tuple[torch.Tensor]]]:
+        """Input shape: Batch x Time x Channel
+
+        Same contract as the parent `forward`, with these differences:
+        `attention_mask` is a 2-D padding mask (see `_flash_attention_forward`), `layer_head_mask` is not applied,
+        and the returned attention weights are always `None` regardless of `output_attentions`.
+
+        Returns:
+            `(attn_output, None, past_key_value)` where `past_key_value` is the self- or cross-attention sub-cache
+            that was used (or `None` when no cache was passed).
+
+        Raises:
+            ValueError: if `past_key_value` is a `StaticCache`.
+        """
+        if isinstance(past_key_value, StaticCache):
+            raise ValueError(
+                "The `static` cache implementation is not compatible with `attn_implementation='flash_attention_2'`. "
+                "Use `attn_implementation='sdpa'` in the meantime, and open an issue at https://github.com/huggingface/transformers"
+            )
+        # if key_value_states are provided this layer is used as a cross-attention layer
+        # for the decoder
+        is_cross_attention = key_value_states is not None
+
+        bsz, tgt_len = hidden_states.shape[:2]
+
+        # get query proj - kept in the (bsz, tgt_len, num_heads, head_dim) layout that flash attention expects
+        query_states = self.q_proj(hidden_states).view(bsz, tgt_len, self.num_heads, self.head_dim)
+
+        if self.rope_embeddings:
+            # heads sit on dim 2 in this layout, hence unsqueeze_dim=2
+            query_states = apply_rotary_pos_emb(query_states, cos, sin, unsqueeze_dim=2)
+
+        if past_key_value is not None:
+            # read the flag before it is set below: it tells whether cross-attention k/v were cached already
+            is_updated = past_key_value.is_updated.get(self.layer_idx)
+            if is_cross_attention:
+                # after the first generated id, we can subsequently re-use all key/value_states from cache
+                past_key_value.is_updated[self.layer_idx] = True
+                past_key_value = past_key_value.cross_attention_cache
+            else:
+                past_key_value = past_key_value.self_attention_cache
+
+        # use key_value_states if cross attention
+        current_states = key_value_states if key_value_states is not None else hidden_states
+        if is_cross_attention and past_key_value and is_updated:
+            # reuse k,v, cross_attentions
+            key_states = past_key_value.key_cache[self.layer_idx]
+            value_states = past_key_value.value_cache[self.layer_idx]
+        else:
+            # here keys/values are (bsz, num_key_value_heads, seq_len, head_dim), the layout stored in the cache
+            key_states = self._shape_key_value(self.k_proj(current_states), -1, bsz)
+            value_states = self._shape_key_value(self.v_proj(current_states), -1, bsz)
+
+            if not is_cross_attention and self.rope_embeddings:
+                # cached key states already have rope applied - only apply to new state
+                key_states = apply_rotary_pos_emb(key_states, cos, sin)
+
+            if past_key_value is not None:
+                # save all key/value_states to cache to be re-used for fast auto-regressive generation
+                cache_position = cache_position if not is_cross_attention else None
+                key_states, value_states = past_key_value.update(
+                    key_states, value_states, self.layer_idx, {"cache_position": cache_position}
+                )
+
+        # # TODO: These transpose are quite inefficient but Flash Attention requires the layout [batch_size, sequence_length, num_heads, head_dim]
+        # #  We would need to refactor the KV cache to be able to avoid many of these transpose/reshape/view.
+        key_states = key_states.transpose(1, 2)
+        value_states = value_states.transpose(1, 2)
+
+        # In PEFT, usually we cast the layer norms in float32 for training stability reasons
+        # therefore the input hidden states gets silently casted in float32. Hence, we need
+        # cast them back in the correct dtype just to be sure everything works as expected.
+        # This might slowdown training & inference so it is recommended to not cast the LayerNorms
+        # in fp32. (LlamaRMSNorm handles it correctly)
+
+        if query_states.dtype == torch.float32 or value_states.dtype == torch.float32:
+            if torch.is_autocast_enabled():
+                target_dtype = torch.get_autocast_gpu_dtype()
+            # Handle the case where the model is quantized
+            elif hasattr(self.config, "_pre_quantization_dtype"):
+                target_dtype = self.config._pre_quantization_dtype
+            else:
+                target_dtype = self.q_proj.weight.dtype
+
+            logger.warning_once(
+                f"The input hidden states seems to be silently casted in float32, this might be related to"
+                f" the fact you have upcasted embedding or layer norm layers in float32. We will cast back the input in"
+                f" {target_dtype}."
+            )
+
+            query_states = query_states.to(target_dtype)
+            key_states = key_states.to(target_dtype)
+            value_states = value_states.to(target_dtype)
+
+        # Attention dropout must only be active while training, as in the eager and SDPA implementations.
+        attn_output = self._flash_attention_forward(
+            query_states,
+            key_states,
+            value_states,
+            attention_mask,
+            tgt_len,
+            dropout=self.dropout if self.training else 0.0,
+        )
+
+        attn_output = attn_output.reshape(bsz, tgt_len, -1)
+        attn_output = self.out_proj(attn_output)
+
+        # Flash attention does not expose the attention probabilities, so there is nothing to return even when
+        # `output_attentions=True` (previously that case hit an unbound local variable).
+        attn_weights = None
+
+        return attn_output, attn_weights, past_key_value
+
+    # Copied from transformers.models.llama.modeling_llama.LlamaFlashAttention2._flash_attention_forward
+    def _flash_attention_forward(
+        self, query_states, key_states, value_states, attention_mask, query_length, dropout=0.0, softmax_scale=None
+    ):
+        """
+        Calls the forward method of Flash Attention - if the input hidden states contain at least one padding token
+        first unpad the input, then computes the attention scores and pad the final attention scores.
+        Args:
+            query_states (`torch.Tensor`):
+                Input query states to be passed to Flash Attention API
+            key_states (`torch.Tensor`):
+                Input key states to be passed to Flash Attention API
+            value_states (`torch.Tensor`):
+                Input value states to be passed to Flash Attention API
+            attention_mask (`torch.Tensor`):
+                The padding mask - corresponds to a tensor of size `(batch_size, seq_len)` where 0 stands for the
+                position of padding tokens and 1 for the position of non-padding tokens.
+            query_length (`int`):
+                Number of query positions per batch element; also used to re-pad the output.
+            dropout (`float`):
+                Attention dropout
+            softmax_scale (`float`, *optional*):
+                The scaling of QK^T before applying softmax. Default to 1 / sqrt(head_dim)
+        Returns:
+            `torch.Tensor`: the attention output, re-padded to `query_length` when a padding mask was given.
+        """
+        if not self._flash_attn_uses_top_left_mask:
+            causal = self.is_causal
+        else:
+            # TODO: Remove the `query_length != 1` check once Flash Attention for RoCm is bumped to 2.1. For details, please see the comment in LlamaFlashAttention2 __init__.
+            causal = self.is_causal and query_length != 1
+
+        # Contains at least one padding token in the sequence
+        if attention_mask is not None:
+            batch_size = query_states.shape[0]
+            query_states, key_states, value_states, indices_q, cu_seq_lens, max_seq_lens = self._upad_input(
+                query_states, key_states, value_states, attention_mask, query_length
+            )
+
+            cu_seqlens_q, cu_seqlens_k = cu_seq_lens
+            max_seqlen_in_batch_q, max_seqlen_in_batch_k = max_seq_lens
+
+            attn_output_unpad = flash_attn_varlen_func(
+                query_states,
+                key_states,
+                value_states,
+                cu_seqlens_q=cu_seqlens_q,
+                cu_seqlens_k=cu_seqlens_k,
+                max_seqlen_q=max_seqlen_in_batch_q,
+                max_seqlen_k=max_seqlen_in_batch_k,
+                dropout_p=dropout,
+                softmax_scale=softmax_scale,
+                causal=causal,
+            )
+
+            # scatter the packed rows back to their (batch, position) slots
+            attn_output = pad_input(attn_output_unpad, indices_q, batch_size, query_length)
+        else:
+            attn_output = flash_attn_func(
+                query_states, key_states, value_states, dropout, softmax_scale=softmax_scale, causal=causal
+            )
+
+        return attn_output
+
+    # Copied from transformers.models.llama.modeling_llama.LlamaFlashAttention2._upad_input
+    def _upad_input(self, query_layer, key_layer, value_layer, attention_mask, query_length):
+        """
+        Remove padded positions from queries, keys and values for the variable-length flash-attention kernel.
+
+        Args:
+            query_layer, key_layer, value_layer (`torch.Tensor`): tensors in
+                `(batch_size, seq_len, num_heads, head_dim)` layout.
+            attention_mask (`torch.Tensor`): 2-D padding mask covering the key/value positions.
+            query_length (`int`): number of query positions per batch element.
+
+        Returns:
+            `(query_layer, key_layer, value_layer, indices_q, (cu_seqlens_q, cu_seqlens_k),
+            (max_seqlen_in_batch_q, max_seqlen_in_batch_k))` with the three layers packed along the first axis.
+        """
+        indices_k, cu_seqlens_k, max_seqlen_in_batch_k = _get_unpad_data(attention_mask)
+        batch_size, kv_seq_len, num_key_value_heads, head_dim = key_layer.shape
+
+        key_layer = index_first_axis(
+            key_layer.reshape(batch_size * kv_seq_len, num_key_value_heads, head_dim), indices_k
+        )
+        value_layer = index_first_axis(
+            value_layer.reshape(batch_size * kv_seq_len, num_key_value_heads, head_dim), indices_k
+        )
+        if query_length == kv_seq_len:
+            # queries and keys cover the same positions: reuse the key bookkeeping
+            query_layer = index_first_axis(
+                query_layer.reshape(batch_size * kv_seq_len, self.num_heads, head_dim), indices_k
+            )
+            cu_seqlens_q = cu_seqlens_k
+            max_seqlen_in_batch_q = max_seqlen_in_batch_k
+            indices_q = indices_k
+        elif query_length == 1:
+            # single-token decoding: every batch element contributes exactly one query
+            max_seqlen_in_batch_q = 1
+            cu_seqlens_q = torch.arange(
+                batch_size + 1, dtype=torch.int32, device=query_layer.device
+            )  # There is a memcpy here, that is very bad.
+            indices_q = cu_seqlens_q[:-1]
+            query_layer = query_layer.squeeze(1)
+        else:
+            # The -q_len: slice assumes left padding.
+            attention_mask = attention_mask[:, -query_length:]
+            query_layer, indices_q, cu_seqlens_q, max_seqlen_in_batch_q = unpad_input(query_layer, attention_mask)
+
+        return (
+            query_layer,
+            key_layer,
+            value_layer,
+            indices_q,
+            (cu_seqlens_q, cu_seqlens_k),
+            (max_seqlen_in_batch_q, max_seqlen_in_batch_k),
+        )
+
+
+# Copied from transformers.models.bart.modeling_bart.BartSdpaAttention with Bart->Musicgen
+class ParlerTTSSdpaAttention(ParlerTTSAttention):
+    def forward(
+        self,
+        hidden_states: torch.Tensor,
+        key_value_states: Optional[torch.Tensor] = None,
+        past_key_value: Optional[EncoderDecoderCache] = None,
+        attention_mask: Optional[torch.Tensor] = None,
+        cos: Optional[torch.LongTensor] = None,
+        sin: Optional[torch.LongTensor] = None,
+        layer_head_mask: Optional[torch.Tensor] = None,
+        output_attentions: bool = False,
+        cache_position: Optional[torch.LongTensor] = None,
+    ) -> Tuple[torch.Tensor, Optional[torch.Tensor], Optional[Tuple[torch.Tensor]]]:
+        """Input shape: Batch x Time x Channel"""
+        if output_attentions or layer_head_mask is not None:
+            # TODO: Improve this warning with e.g. `model.config._attn_implementation = "manual"` once this is implemented.
+            logger.warning_once(
+                "ParlerTTSModel is using ParlerTTSSdpaAttention, but `torch.nn.functional.scaled_dot_product_attention` does not support `output_attentions=True` or `layer_head_mask` not None. Falling back to the manual attention"
+                ' implementation, but specifying the manual implementation will be required from Transformers version v5.0.0 onwards. This warning can be removed using the argument `attn_implementation="eager"` when loading the model.'
+            )
+            return super().forward(
+                hidden_states,
+                key_value_states=key_value_states,
+                past_key_value=past_key_value,
+                attention_mask=attention_mask,
+                layer_head_mask=layer_head_mask,
+                output_attentions=output_attentions,
+                cache_position=cache_position,
+            )
+
+        # if key_value_states are provided this layer is used as a cross-attention layer
+        # for the decoder
+        is_cross_attention = key_value_states is not None
+
+        bsz, tgt_len = hidden_states.shape[:2]
+
+        # get query proj
+        query_states = self.q_proj(hidden_states)
+        query_states = self._shape_query(query_states, tgt_len, bsz)
+
+        if self.rope_embeddings:
+            query_states = apply_rotary_pos_emb(query_states, cos, sin)
+
+        if past_key_value is not None:
+            is_updated = past_key_value.is_updated.get(self.layer_idx)
+            if is_cross_attention:
+                # after the first generated id, we can subsequently re-use all key/value_states from cache
+                past_key_value.is_updated[self.layer_idx] = True
+                past_key_value = past_key_value.cross_attention_cache
+            else:
+                past_key_value = past_key_value.self_attention_cache
+
+        # use key_value_states if cross attention
+        current_states = key_value_states if key_value_states is not None else hidden_states
+        if is_cross_attention and past_key_value and is_updated:
+            # reuse k,v, cross_attentions
+            key_states = past_key_value.key_cache[self.layer_idx]
+            value_states = past_key_value.value_cache[self.layer_idx]
+        else:
+            key_states = self._shape_key_value(self.k_proj(current_states), -1, bsz)
+            value_states = self._shape_key_value(self.v_proj(current_states), -1, bsz)
+
+            if not is_cross_attention and self.rope_embeddings:
+                # cached key states already have rope applied - only apply to new state
+                key_states = apply_rotary_pos_emb(key_states, cos, sin)
+
+            if past_key_value is not None:
+                # save all key/value_states to cache to be re-used for fast auto-regressive generation
+                cache_position = cache_position if not is_cross_attention else None
+                key_states, value_states = past_key_value.update(
+                    key_states, value_states, self.layer_idx, {"cache_position": cache_position}
+                )
+
+        causal_mask = attention_mask
+        if attention_mask is not None:  # no matter the length, we just slice it
+            causal_mask = attention_mask[:, :, :, : key_states.shape[-2]]
+
+        # repeat k/v heads if n_kv_heads < n_heads
+        key_states = repeat_kv(key_states, self.num_key_value_groups)
+        value_states = repeat_kv(value_states, self.num_key_value_groups)
+
+        # We dispatch to SDPA's Flash Attention or Efficient kernels via this `is_causal` if statement instead of an inline conditional assignment
+        # in SDPA to support both torch.compile's dynamic shapes and full graph options. An inline conditional prevents dynamic shapes from compiling.
+        # The tgt_len > 1 is necessary to match with AttentionMaskConverter.to_causal_4d that does not create a causal mask in case tgt_len == 1.
+        is_causal = True if self.is_causal and causal_mask is None and tgt_len > 1 else False
+
+        # NOTE: SDPA with memory-efficient backend is currently (torch==2.1.2) bugged when using non-contiguous inputs and a custom attn_mask,
+        # but we are fine here as `_shape` do call `.contiguous()`. Reference: https://github.com/pytorch/pytorch/issues/112577
+        attn_output = torch.nn.functional.scaled_dot_product_attention(
+            query_states,
+            key_states,
+            value_states,
+            attn_mask=causal_mask,
+            dropout_p=self.dropout if self.training else 0.0,
+            # The tgt_len > 1 is necessary to match with AttentionMaskConverter.to_causal_4d that does not create a causal mask in case tgt_len == 1.
+            is_causal=is_causal,
+        )
+
+        if attn_output.size() != (bsz, self.num_heads, tgt_len, self.head_dim):
+            raise ValueError(
+                f"`attn_output` should be of size {(bsz, self.num_heads, tgt_len, self.head_dim)}, but is"
+                f" {attn_output.size()}"
+            )
+
+        attn_output = attn_output.transpose(1, 2)
+
+        # Use the `embed_dim` from the config (stored in the class) rather than `hidden_state` because `attn_output` can be
+        # partitioned across GPUs when using tensor-parallelism.
+        attn_output = attn_output.reshape(bsz, tgt_len, self.embed_dim)
+
+        attn_output = self.out_proj(attn_output)
+
+        return attn_output, None, past_key_value
+
+
+PARLERTTS_ATTENTION_CLASSES = {  # attention class per implementation name (indexed with `config._attn_implementation`)
+    "eager": ParlerTTSAttention,
+    "sdpa": ParlerTTSSdpaAttention,
+    "flash_attention_2": ParlerTTSFlashAttention2,
+}
+
+
+class ParlerTTSDecoderLayer(nn.Module):
+    """Pre-norm decoder block: self-attention, optional cross-attention, then a feed-forward network."""
+
+    def __init__(self, config: ParlerTTSDecoderConfig, layer_idx: int = None):
+        super().__init__()
+        self.embed_dim = config.hidden_size
+        # Constructor arguments shared by the self-attention and cross-attention modules.
+        shared_attn_kwargs = dict(
+            dropout=config.attention_dropout,
+            is_decoder=True,
+            bias=False,
+            rope_embeddings=config.rope_embeddings,
+            layer_idx=layer_idx,
+            config=config,
+        )
+
+        self_attn_cls = PARLERTTS_ATTENTION_CLASSES[config._attn_implementation]
+        self.self_attn = self_attn_cls(
+            embed_dim=self.embed_dim,
+            num_heads=config.num_attention_heads,
+            num_key_value_heads=config.num_key_value_heads,
+            is_causal=True,
+            **shared_attn_kwargs,
+        )
+        self.dropout = config.dropout
+        self.activation_fn = ACT2FN[config.activation_function]
+        self.activation_dropout = config.activation_dropout
+        self.self_attn_layer_norm = nn.LayerNorm(self.embed_dim)
+
+        # Cross-attention uses the self-attention implementation unless the config forces eager or sdpa.
+        strategy = config.cross_attention_implementation_strategy
+        if strategy == "always_eager":
+            cross_attn_impl = "eager"
+        elif strategy == "always_sdpa":
+            cross_attn_impl = "sdpa"
+        else:
+            cross_attn_impl = config._attn_implementation
+        self.encoder_attn = PARLERTTS_ATTENTION_CLASSES[cross_attn_impl](
+            self.embed_dim,
+            config.num_attention_heads,
+            num_key_value_heads=config.num_cross_attention_key_value_heads,
+            **shared_attn_kwargs,
+        )
+        self.encoder_attn_layer_norm = nn.LayerNorm(self.embed_dim)
+        self.fc1 = nn.Linear(self.embed_dim, config.ffn_dim, bias=False)
+        self.fc2 = nn.Linear(config.ffn_dim, self.embed_dim, bias=False)
+        self.final_layer_norm = nn.LayerNorm(self.embed_dim)
+
+    def forward(
+        self,
+        hidden_states: torch.Tensor,
+        attention_mask: Optional[torch.Tensor] = None,
+        cos: Optional[torch.LongTensor] = None,
+        sin: Optional[torch.LongTensor] = None,
+        encoder_hidden_states: Optional[torch.Tensor] = None,
+        encoder_attention_mask: Optional[torch.Tensor] = None,
+        layer_head_mask: Optional[torch.Tensor] = None,
+        cross_attn_layer_head_mask: Optional[torch.Tensor] = None,
+        past_key_value: Optional[EncoderDecoderCache] = None,
+        output_attentions: Optional[bool] = False,
+        use_cache: Optional[bool] = True,
+        cache_position: Optional[torch.LongTensor] = None,
+    ) -> torch.Tensor:
+        """
+        Run the layer: self-attention, cross-attention (only if `encoder_hidden_states` is given), feed-forward.
+
+        Args:
+            hidden_states (`torch.FloatTensor`): input to the layer of shape `(batch, seq_len, embed_dim)`
+            attention_mask (`torch.FloatTensor`): self-attention mask of size `(batch, 1, tgt_len, src_len)`
+                where padding elements are indicated by very large negative values.
+            cos, sin (`torch.Tensor`, *optional*): forwarded unchanged to both attention modules, where
+                `apply_rotary_pos_emb` consumes them if `rope_embeddings` is set.
+            encoder_hidden_states (`torch.FloatTensor`):
+                cross attention input to the layer of shape `(batch, seq_len, embed_dim)`
+            encoder_attention_mask (`torch.FloatTensor`): encoder attention mask of size
+                `(batch, 1, tgt_len, src_len)` where padding elements are indicated by very large negative values.
+            layer_head_mask (`torch.FloatTensor`): mask for self-attention heads, forwarded to `self_attn`.
+            cross_attn_layer_head_mask (`torch.FloatTensor`): mask for cross-attention heads, forwarded to
+                `encoder_attn`.
+            past_key_value (`EncoderDecoderCache`, *optional*): cache object handed to both attention modules.
+            output_attentions (`bool`, *optional*): whether to append the attention weights to the outputs.
+            use_cache (`bool`, *optional*): whether to append `present_key_value` to the outputs.
+            cache_position (`torch.LongTensor`, *optional*): forwarded to the self-attention module only.
+
+        Returns:
+            A tuple (not a bare tensor, despite the annotation): `(hidden_states,)`, followed by
+            `(self_attn_weights, cross_attn_weights)` if `output_attentions`, followed by `present_key_value`
+            if `use_cache`. `present_key_value` is the third value returned by `self_attn`, or the pair
+            `(self_attn_value, encoder_attn_value)` when cross-attention ran.
+        """
+        # Self-attention sub-block (layer norm first, residual connection around it).
+        shortcut = hidden_states
+        attn_out, self_attn_weights, present_key_value = self.self_attn(
+            hidden_states=self.self_attn_layer_norm(hidden_states),
+            past_key_value=past_key_value,
+            attention_mask=attention_mask,
+            cos=cos,
+            sin=sin,
+            layer_head_mask=layer_head_mask,
+            output_attentions=output_attentions,
+            cache_position=cache_position,
+        )
+        hidden_states = shortcut + nn.functional.dropout(attn_out, p=self.dropout, training=self.training)
+
+        # Cross-attention sub-block, skipped entirely when no encoder states are supplied.
+        cross_attn_weights = None
+        if encoder_hidden_states is not None:
+            shortcut = hidden_states
+            attn_out, cross_attn_weights, cross_attn_present_key_value = self.encoder_attn(
+                hidden_states=self.encoder_attn_layer_norm(hidden_states),
+                key_value_states=encoder_hidden_states,
+                attention_mask=encoder_attention_mask,
+                cos=cos,
+                sin=sin,
+                layer_head_mask=cross_attn_layer_head_mask,
+                past_key_value=past_key_value,
+                output_attentions=output_attentions,
+            )
+            hidden_states = shortcut + nn.functional.dropout(attn_out, p=self.dropout, training=self.training)
+            # Pair the two values: index 0 comes from self-attention, index 1 from cross-attention.
+            present_key_value = (present_key_value, cross_attn_present_key_value)
+
+        # Feed-forward sub-block (layer norm first, residual connection around it).
+        shortcut = hidden_states
+        ffn_out = self.activation_fn(self.fc1(self.final_layer_norm(hidden_states)))
+        ffn_out = nn.functional.dropout(ffn_out, p=self.activation_dropout, training=self.training)
+        ffn_out = nn.functional.dropout(self.fc2(ffn_out), p=self.dropout, training=self.training)
+        hidden_states = shortcut + ffn_out
+
+        outputs = (hidden_states,)
+        if output_attentions:
+            outputs = outputs + (self_attn_weights, cross_attn_weights)
+        if use_cache:
+            outputs = outputs + (present_key_value,)
+        return outputs
+
+
+# Copied from transformers.models.musicgen.modeling_musicgen.MusicgenPreTrainedModel with Musicgen->ParlerTTS
+class ParlerTTSPreTrainedModel(PreTrainedModel):
+    """Abstract base class that handles weight initialization and offers a simple interface for downloading and
+    loading pretrained models."""
+
+    config_class = ParlerTTSDecoderConfig
+    base_model_prefix = "model"
+    supports_gradient_checkpointing = True
+    _supports_flash_attn_2 = True
+    _supports_sdpa = True
+    _no_split_modules = ["ParlerTTSDecoderLayer", "ParlerTTSAttention"]
+    _supports_cache_class = True
+    _supports_static_cache = True
+
+    def _init_weights(self, module):
+        """Fill weights from a normal(0, std=`config.initializer_factor`); zero biases and the padding row."""
+        std = self.config.initializer_factor
+        is_projection = isinstance(module, (nn.Linear, nn.Conv1d))
+        if not is_projection and not isinstance(module, nn.Embedding):
+            return  # any other module type is left untouched
+        module.weight.data.normal_(mean=0.0, std=std)
+        if is_projection:
+            if module.bias is not None:
+                module.bias.data.zero_()
+        elif module.padding_idx is not None:
+            module.weight.data[module.padding_idx].zero_()
+
+
+MUSICGEN_START_DOCSTRING = r"""
+
+    The ParlerTTS architecture is adapted from MusicGen, proposed in [Simple and Controllable Music Generation](https://arxiv.org/abs/2306.05284) by
+    Jade Copet, Felix Kreuk, Itai Gat, Tal Remez, David Kant, Gabriel Synnaeve, Yossi Adi, Alexandre Défossez. MusicGen is an
+    encoder decoder transformer trained on the task of conditional music generation; ParlerTTS reuses that design for text-to-speech.
+
+    This model inherits from [`PreTrainedModel`]. Check the superclass documentation for the generic methods the
+    library implements for all its models (such as downloading or saving, resizing the input embeddings, pruning heads
+    etc.)
+
+    This model is also a PyTorch [torch.nn.Module](https://pytorch.org/docs/stable/nn.html#torch.nn.Module) subclass.
+    Use it as a regular PyTorch Module and refer to the PyTorch documentation for all matters related to general usage
+    and behavior.
+
+    Parameters:
+        config ([`ParlerTTSConfig`]): Model configuration class with all the parameters of the model.
+            Initializing with a config file does not load the weights associated with the model, only the
+            configuration. Check out the [`~PreTrainedModel.from_pretrained`] method to load the model weights.
+"""
+
+MUSICGEN_INPUTS_DOCSTRING = r"""
+    Args:
+        input_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`):
+            Indices of input sequence tokens in the vocabulary. Padding will be ignored by default should you provide
+            it.
+
+            Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and
+            [`PreTrainedTokenizer.__call__`] for details.
+
+            [What are input IDs?](../glossary#input-ids)
+        attention_mask (`torch.Tensor` of shape `(batch_size, sequence_length)`, *optional*):
+            Mask to avoid performing attention on padding token indices. Mask values selected in `[0, 1]`:
+
+            - 1 for tokens that are **not masked**,
+            - 0 for tokens that are **masked**.
+
+            [What are attention masks?](../glossary#attention-mask)
+        decoder_input_ids (`torch.LongTensor` of shape `(batch_size * num_codebooks, target_sequence_length)`, *optional*):
+            Indices of decoder input sequence tokens in the vocabulary, corresponding to the sequence of audio codes.
+
+            Indices can be obtained by encoding an audio prompt with an audio encoder model to predict audio codes,
+            such as with the [`EncodecModel`]. See [`EncodecModel.encode`] for details.
+
+            [What are decoder input IDs?](../glossary#decoder-input-ids)
+
+            <Tip warning={true}>
+
+            The `decoder_input_ids` will automatically be converted from shape `(batch_size * num_codebooks,
+            target_sequence_length)` to `(batch_size, num_codebooks, target_sequence_length)` in the forward pass. If
+            you obtain audio codes from an audio encoding model, such as [`EncodecModel`], ensure that the number of
+            frames is equal to 1, and that you reshape the audio codes from `(frames, batch_size, num_codebooks,
+            target_sequence_length)` to `(batch_size * num_codebooks, target_sequence_length)` prior to passing them as
+            `decoder_input_ids`.
+
+            </Tip>
+
+        decoder_attention_mask (`torch.LongTensor` of shape `(batch_size, target_sequence_length)`, *optional*):
+            Default behavior: generate a tensor that ignores pad tokens in `decoder_input_ids`. Causal mask will also
+            be used by default.
+        head_mask (`torch.Tensor` of shape `(encoder_layers, encoder_attention_heads)`, *optional*):
+            Mask to nullify selected heads of the attention modules in the encoder. Mask values selected in `[0, 1]`:
+
+            - 1 indicates the head is **not masked**,
+            - 0 indicates the head is **masked**.
+
+        decoder_head_mask (`torch.Tensor` of shape `(decoder_layers, decoder_attention_heads)`, *optional*):
+            Mask to nullify selected heads of the attention modules in the decoder. Mask values selected in `[0, 1]`:
+
+            - 1 indicates the head is **not masked**,
+            - 0 indicates the head is **masked**.
+
+        cross_attn_head_mask (`torch.Tensor` of shape `(decoder_layers, decoder_attention_heads)`, *optional*):
+            Mask to nullify selected heads of the cross-attention modules in the decoder. Mask values selected in `[0,
+            1]`:
+
+            - 1 indicates the head is **not masked**,
+            - 0 indicates the head is **masked**.
+
+        encoder_outputs (`tuple(tuple(torch.FloatTensor))`, *optional*):
+            Tuple consists of (`last_hidden_state`, *optional*: `hidden_states`, *optional*: `attentions`).
+            `last_hidden_state` (of shape `(batch_size, sequence_length, hidden_size)`, *optional*) is a sequence of
+            hidden-states at the output of the last layer of the encoder. Used in the cross-attention of the decoder.
+            TODO: it's passed through enc_to_dec_proj and optionally we concat the prompt hidden states in certain cases.
+        past_key_values (`EncoderDecoderCache` or `tuple(tuple(torch.FloatTensor))`, *optional*):
+            Pre-computed hidden-states that can be used to speed up auto-regressive (sequential) decoding. There are
+            four sets of pre-computed hidden-states: key and values states in the self-attention blocks (2) and
+            in the cross-attention blocks (2). The `past_key_values` are returned when `use_cache=True` is passed or
+            when `config.use_cache=True`
+
+            Two formats are allowed:
+            - An [`~cache_utils.EncoderDecoderCache`] instance;
+            - Tuple of `tuple(torch.FloatTensor)` of length `config.n_layers`, with each tuple having 2 tensors of shape
+            `(batch_size, num_heads, sequence_length, embed_size_per_head)` and 2 additional tensors of shape
+            `(batch_size, num_heads, encoder_sequence_length, embed_size_per_head)`.
+
+            If `past_key_values` are used, the user can optionally input only the last `decoder_input_ids` (those that
+            don't have their past key value states given to this model) of shape `(batch_size, 1)` instead of all
+            `decoder_input_ids` of shape `(batch_size, sequence_length)`.
+        inputs_embeds (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`, *optional*):
+            Optionally, instead of passing `input_ids` you can choose to directly pass an embedded representation.
+            This is useful if you want more control over how to convert `input_ids` indices into associated vectors
+            than the model's internal embedding lookup matrix.
+        decoder_inputs_embeds (`torch.FloatTensor` of shape `(batch_size, target_sequence_length, hidden_size)`, *optional*):
+            Optionally, instead of passing `decoder_input_ids` you can choose to directly pass an embedded
+            representation. If `past_key_values` is used, optionally only the last `decoder_inputs_embeds` have to be
+            input (see `past_key_values`). This is useful if you want more control over how to convert
+            `decoder_input_ids` indices into associated vectors than the model's internal embedding lookup matrix.
+
+            If `decoder_input_ids` and `decoder_inputs_embeds` are both unset, `decoder_inputs_embeds` takes the value
+            of `inputs_embeds`.
+        prompt_input_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`):
+            Indices of input prompt sequence tokens in the vocabulary. Padding will be ignored by default should you provide
+            it.
+
+            Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and
+            [`PreTrainedTokenizer.__call__`] for details.
+
+            [What are input IDs?](../glossary#input-ids)
+        prompt_attention_mask (`torch.Tensor` of shape `(batch_size, sequence_length)`, *optional*):
+            Mask to avoid performing attention on padding prompt token indices. Mask values selected in `[0, 1]`:
+
+            - 1 for tokens that are **not masked**,
+            - 0 for tokens that are **masked**.
+
+            [What are attention masks?](../glossary#attention-mask)
+        prompt_inputs_embeds (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`, *optional*):
+            Optionally, instead of passing `prompt_input_ids` you can choose to directly pass an embedded representation.
+            This is useful if you want more control over how to convert `prompt_input_ids` indices into associated vectors
+            than the model's internal embedding lookup matrix.
+        use_cache (`bool`, *optional*):
+            If set to `True`, `past_key_values` key value states are returned and can be used to speed up decoding (see
+            `past_key_values`).
+        output_attentions (`bool`, *optional*):
+            Whether or not to return the attentions tensors of all attention layers. See `attentions` under returned
+            tensors for more detail.
+        output_hidden_states (`bool`, *optional*):
+            Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors for
+            more detail.
+        return_dict (`bool`, *optional*):
+            Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple.
+        cache_position (`torch.LongTensor` of shape `(sequence_length)`, *optional*):
+            Indices depicting the position of the input sequence tokens in the sequence. It is used to update the cache
+            in the correct position and to infer the complete sequence length.
+"""
+
+MUSICGEN_DECODER_INPUTS_DOCSTRING = r"""
+    Args:
+        input_ids (`torch.LongTensor` of shape `(batch_size * num_codebooks, sequence_length)`):
+            Indices of input sequence tokens in the vocabulary, corresponding to the sequence of audio codes.
+
+            Indices can be obtained by encoding an audio prompt with an audio encoder model to predict audio codes,
+            such as with the [`EncodecModel`]. See [`EncodecModel.encode`] for details.
+
+            [What are input IDs?](../glossary#input-ids)
+
+            <Tip warning={true}>
+
+            The `input_ids` will automatically be converted from shape `(batch_size * num_codebooks,
+            target_sequence_length)` to `(batch_size, num_codebooks, target_sequence_length)` in the forward pass. If
+            you obtain audio codes from an audio encoding model, such as [`EncodecModel`], ensure that the number of
+            frames is equal to 1, and that you reshape the audio codes from `(frames, batch_size, num_codebooks,
+            target_sequence_length)` to `(batch_size * num_codebooks, target_sequence_length)` prior to passing them as
+            `input_ids`.
+
+            </Tip>
+
+        attention_mask (`torch.Tensor` of shape `(batch_size, sequence_length)`, *optional*):
+            Mask to avoid performing attention on padding token indices. Mask values selected in `[0, 1]`:
+
+            - 1 for tokens that are **not masked**,
+            - 0 for tokens that are **masked**.
+
+            [What are attention masks?](../glossary#attention-mask)
+        encoder_hidden_states (`torch.FloatTensor` of shape `(batch_size, encoder_sequence_length, hidden_size)`, *optional*):
+            Sequence of hidden-states at the output of the last layer of the encoder. Used in the cross-attention of
+            the decoder.
+        encoder_attention_mask (`torch.LongTensor` of shape `(batch_size, encoder_sequence_length)`, *optional*):
+            Mask to avoid performing cross-attention on padding tokens indices of encoder input_ids. Mask values
+            selected in `[0, 1]`:
+
+            - 1 for tokens that are **not masked**,
+            - 0 for tokens that are **masked**.
+
+            [What are attention masks?](../glossary#attention-mask)
+        prompt_hidden_states (`torch.FloatTensor` of shape `(batch_size, encoder_sequence_length, hidden_size)`, *optional*):
+            Sequence of prompt hidden-states at the output of the initial embedding layer. Concatenated to the input embeds.
+        prompt_attention_mask (`torch.LongTensor` of shape `(batch_size, encoder_sequence_length)`, *optional*):
+            Mask to avoid performing cross-attention on padding tokens indices of prompt input_ids. Mask values
+            selected in `[0, 1]`:
+
+            - 1 for tokens that are **not masked**,
+            - 0 for tokens that are **masked**.
+
+            [What are attention masks?](../glossary#attention-mask)
+        head_mask (`torch.Tensor` of shape `(decoder_layers, decoder_attention_heads)`, *optional*):
+            Mask to nullify selected heads of the attention modules. Mask values selected in `[0, 1]`:
+
+            - 1 indicates the head is **not masked**,
+            - 0 indicates the head is **masked**.
+
+        cross_attn_head_mask (`torch.Tensor` of shape `(decoder_layers, decoder_attention_heads)`, *optional*):
+            Mask to nullify selected heads of the cross-attention modules in the decoder to avoid performing
+            cross-attention on hidden heads. Mask values selected in `[0, 1]`:
+
+            - 1 indicates the head is **not masked**,
+            - 0 indicates the head is **masked**.
+
+        past_key_values (`tuple(tuple(torch.FloatTensor))`, *optional*, returned when `use_cache=True` is passed or when `config.use_cache=True`):
+            Tuple of `tuple(torch.FloatTensor)` of length `config.n_layers`, with each tuple having 2 tensors of shape
+            `(batch_size, num_heads, sequence_length, embed_size_per_head)` and 2 additional tensors of shape
+            `(batch_size, num_heads, encoder_sequence_length, embed_size_per_head)`.
+
+            Contains pre-computed hidden-states (key and values in the self-attention blocks and in the cross-attention
+            blocks) that can be used (see `past_key_values` input) to speed up sequential decoding.
+
+            If `past_key_values` are used, the user can optionally input only the last `decoder_input_ids` (those that
+            don't have their past key value states given to this model) of shape `(batch_size, 1)` instead of all
+            `decoder_input_ids` of shape `(batch_size, sequence_length)`.
+        inputs_embeds (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`, *optional*):
+            Optionally, instead of passing `input_ids` you can choose to directly pass an embedded representation.
+            This is useful if you want more control over how to convert `input_ids` indices into associated vectors
+            than the model's internal embedding lookup matrix.
+        output_attentions (`bool`, *optional*):
+            Whether or not to return the attentions tensors of all attention layers. See `attentions` under returned
+            tensors for more detail.
+        output_hidden_states (`bool`, *optional*):
+            Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors for
+            more detail.
+        return_dict (`bool`, *optional*):
+            Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple.
+"""
+
+
+class ParlerTTSDecoder(ParlerTTSPreTrainedModel):
+    """
+    Transformer decoder consisting of *config.num_hidden_layers* layers. Each layer is a [`ParlerTTSDecoderLayer`]
+    """
+
+    def __init__(self, config: ParlerTTSDecoderConfig):
+        """Build the per-codebook token embeddings, the positional encoding, the decoder layers and the final norm.
+
+        Args:
+            config: decoder configuration (model sizes, positional scheme, attention implementations).
+        """
+        super().__init__(config)
+        self.dropout = config.dropout
+        self.layerdrop = config.layerdrop
+        self.max_target_positions = config.max_position_embeddings
+        self.d_model = config.hidden_size
+        self.num_codebooks = config.num_codebooks
+        self.embed_scale = math.sqrt(config.hidden_size) if config.scale_embedding else 1.0
+        # TODO(YL): actually doesn't need the +1 if initialized correctly. Too late to change now.
+        num_embeddings = config.vocab_size + 1  # + 1 for pad token id
+        self.embed_tokens = nn.ModuleList(
+            [nn.Embedding(num_embeddings, config.hidden_size) for _ in range(config.num_codebooks)]
+        )
+        # Exactly one positional module is created: rotary embeddings or sinusoidal position embeddings.
+        self.rope_embeddings = config.rope_embeddings
+        if config.rope_embeddings:
+            head_dim = config.hidden_size // config.num_attention_heads
+            self.rotary_emb = ParlerTTSRotaryEmbedding(
+                head_dim, max_position_embeddings=config.max_position_embeddings, base=config.rope_theta
+            )
+        else:
+            self.embed_positions = ParlerTTSSinusoidalPositionalEmbedding(
+                config.max_position_embeddings, config.hidden_size
+            )
+        self.layers = nn.ModuleList(
+            [ParlerTTSDecoderLayer(config, layer_idx) for layer_idx in range(config.num_hidden_layers)]
+        )
+        self.layer_norm = nn.LayerNorm(config.hidden_size)
+        self.attn_implementation = config._attn_implementation
+        # Resolve the cross-attention implementation with the same rule `ParlerTTSDecoderLayer` applies, so the
+        # name recorded here always matches the modules the layers built (unknown strategies fall back too).
+        strategy = config.cross_attention_implementation_strategy
+        forced = {"always_eager": "eager", "always_sdpa": "sdpa"}
+        self.encoder_attn_implementation = forced.get(strategy, config._attn_implementation)
+        self.gradient_checkpointing = False
+        self.post_init()  # initialize weights and apply final processing
+
+    def get_input_embeddings(self):
+        return self.embed_tokens  # the per-codebook embedding container (an `nn.ModuleList` as built in `__init__`)
+
+    def set_input_embeddings(self, value):
+        self.embed_tokens = value  # replaces the whole per-codebook embedding container; `value` is not validated
+
+    @add_start_docstrings_to_model_forward(MUSICGEN_DECODER_INPUTS_DOCSTRING)
+    def forward(
+        self,
+        input_ids: torch.LongTensor = None,
+        attention_mask: Optional[torch.Tensor] = None,
+        encoder_hidden_states: Optional[torch.FloatTensor] = None,
+        encoder_attention_mask: Optional[torch.LongTensor] = None,
+        prompt_hidden_states: Optional[torch.FloatTensor] = None,
+        prompt_attention_mask: Optional[torch.LongTensor] = None,
+        head_mask: Optional[torch.Tensor] = None,
+        cross_attn_head_mask: Optional[torch.Tensor] = None,
+        past_key_values: Optional[Tuple[Tuple[torch.FloatTensor]]] = None,
+        inputs_embeds: Optional[torch.FloatTensor] = None,
+        position_ids: Optional[torch.LongTensor] = None,
+        use_cache: Optional[bool] = None,
+        output_attentions: Optional[bool] = None,
+        output_hidden_states: Optional[bool] = None,
+        return_dict: Optional[bool] = None,
+        cache_position=None,
+    ) -> Union[Tuple, BaseModelOutputWithPastAndCrossAttentions]:
+        output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
+        output_hidden_states = (
+            output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
+        )
+        use_cache = use_cache if use_cache is not None else self.config.use_cache
+        return_dict = return_dict if return_dict is not None else self.config.use_return_dict
+
+        # retrieve input_ids and inputs_embeds
+        if input_ids is not None and inputs_embeds is not None:
+            raise ValueError("You cannot specify both decoder_input_ids and decoder_inputs_embeds at the same time")
+        elif input_ids is not None:
+            # (bsz * codebooks, seq_len) -> (bsz, codebooks, seq_len)
+            input = input_ids.reshape(-1, self.num_codebooks, input_ids.shape[-1])
+            bsz, num_codebooks, seq_len = input.shape
+            input_shape = (bsz, seq_len)
+        elif inputs_embeds is not None:
+            input_shape = inputs_embeds.size()[:-1]
+            input = inputs_embeds[:, :, -1:]
+        else:
+            raise ValueError("You have to specify either decoder_input_ids or decoder_inputs_embeds")
+
+        if inputs_embeds is None:
+            inputs_embeds = sum([self.embed_tokens[codebook](input[:, codebook]) for codebook in range(num_codebooks)])
+
+        prepended_sequence_length = 0
+        # if prompt_hidden_states, fuse to inputs_embeds and update input shape
+        if prompt_hidden_states is not None:
+            prepended_sequence_length = prompt_hidden_states.shape[-2]
+            inputs_embeds = torch.cat([prompt_hidden_states, inputs_embeds], dim=1)
+
+        return_legacy_cache = False
+        return_self_attention_cache = False
+        if use_cache or past_key_values is not None:
+            if isinstance(past_key_values, Cache) and not isinstance(past_key_values, EncoderDecoderCache):
+                return_self_attention_cache = True
+                past_key_values = EncoderDecoderCache(past_key_values, DynamicCache())
+            elif not isinstance(past_key_values, EncoderDecoderCache):
+                return_legacy_cache = True
+                logger.warning_once(
+                    "Passing a tuple of `past_key_values` is deprecated and will be removed in Transformers v4.43.0. "
+                    "You should pass an instance of `EncoderDecoderCache` instead, e.g. "
+                    "`past_key_values=EncoderDecoderCache.from_legacy_cache(past_key_values)`."
+                )
+                past_key_values = EncoderDecoderCache.from_legacy_cache(past_key_values)
+
+        past_key_values_length = 0
+        if cache_position is not None:
+            past_key_values_length = cache_position[0]
+        elif past_key_values is not None:
+            past_key_values_length = past_key_values.get_seq_length()
+
+        if cache_position is None:
+            cache_position = torch.arange(
+                past_key_values_length, past_key_values_length + input_shape[1] + prepended_sequence_length, device=inputs_embeds.device
+            )
+
+        if position_ids is None:
+            position_ids = cache_position.unsqueeze(0)
+
+        # NOTE: 1. As it is, the masked ids from the prompt will still count in the positions embeddings
+        # NOTE: 2. we want to concatenate the prompt attention mask and the decoder attention mask
+        # i.i.f `prompt_cross_attention=False`. ParlerTTSForConditionalGeneration's taking care of setting
+        # `prompt_attention_mask=None`
+        if prompt_attention_mask is not None and attention_mask is not None:
+            attention_mask = torch.cat([prompt_attention_mask, attention_mask], dim=1)
+        elif prompt_attention_mask is not None:
+            logger.warning_once(
+                "`prompt_attention_mask` is specified but `attention_mask` is not. A full `attention_mask` will be created. Make sure this is the intended behaviour."
+            )
+            if past_key_values_length == 0:
+                attention_mask = torch.cat(
+                    [
+                        prompt_attention_mask,
+                        torch.ones(input_shape, device=self.device, dtype=prompt_attention_mask.dtype),
+                    ],
+                    dim=1,
+                )
+            else:
+                # In the generation case of `prompt_cross_attention=True`, we need to recreate an attention mask from scratch
+                # to be able to prepend the prompt attention mask.
+                # Since we generate token per token, we can recompute the generated length from the information we have.
+                generated_length = past_key_values_length - prompt_attention_mask.shape[1] + 1
+                attention_mask = torch.cat(
+                    [
+                        prompt_attention_mask,
+                        torch.ones(
+                            (input_shape[0], generated_length), device=self.device, dtype=prompt_attention_mask.dtype
+                        ),
+                    ],
+                    dim=1,
+                )
+
+        input_shape = inputs_embeds.size()[:-1]
+        cos, sin = None, None
+
+        if not self.rope_embeddings:
+            # embed positions
+            # TODO: As it is, the masked ids from the prompt will still count in the positions embeddings
+            # maybe should modify position embeddings
+            positions = self.embed_positions(inputs_embeds, past_key_values_length)
+            hidden_states = inputs_embeds + positions.to(inputs_embeds.device)
+        else:
+            hidden_states = inputs_embeds
+
+            if position_ids is None:
+                if attention_mask is not None:
+                    # masked ids will **not** count in the position embeddings
+                    position_ids = attention_mask.long().cumsum(-1) - 1
+                    position_ids.masked_fill_(attention_mask == 0, 1)
+                else:
+                    position_ids = torch.arange(
+                        past_key_values_length,
+                        input_shape[1] + past_key_values_length,
+                        dtype=torch.long,
+                        device=inputs_embeds.device,
+                    )
+                    position_ids = position_ids.unsqueeze(0)
+
+                # Some generation methods already pass only the last input ID
+                if position_ids.shape[1] > input_shape[1]:
+                    position_ids = position_ids[:, -input_shape[1] :]
+
+            cos, sin = self.rotary_emb(hidden_states.device.type, position_ids)
+            cos, sin = cos.to(hidden_states.dtype), sin.to(hidden_states.dtype)
+
+        hidden_states = nn.functional.dropout(hidden_states, p=self.dropout, training=self.training)
+
+        causal_mask = self._update_causal_mask(
+            attention_mask,
+            inputs_embeds,
+            cache_position,
+            past_key_values.self_attention_cache if past_key_values is not None else None,
+            output_attentions,
+        )
+
+        if encoder_hidden_states is not None and encoder_attention_mask is not None:
+            if self.encoder_attn_implementation == "flash_attention_2":
+                encoder_attention_mask = encoder_attention_mask if 0 in encoder_attention_mask else None
+            elif self.encoder_attn_implementation == "sdpa" and cross_attn_head_mask is None and not output_attentions:
+                # output_attentions=True & cross_attn_head_mask can not be supported when using SDPA, and we fall back on
+                # the manual implementation that requires a 4D causal mask in all cases.
+                # [bsz, seq_len] -> [bsz, 1, tgt_seq_len, src_seq_len]
+                encoder_attention_mask = _prepare_4d_attention_mask_for_sdpa(
+                    encoder_attention_mask,
+                    inputs_embeds.dtype,
+                    tgt_len=input_shape[-1],
+                )
+            else:
+                # [bsz, seq_len] -> [bsz, 1, tgt_seq_len, src_seq_len]
+                encoder_attention_mask = _prepare_4d_attention_mask(
+                    encoder_attention_mask, inputs_embeds.dtype, tgt_len=input_shape[-1]
+                )
+
+        if self.gradient_checkpointing and self.training:
+            if use_cache:
+                logger.warning_once(
+                    "`use_cache=True` is incompatible with gradient checkpointing`. Setting `use_cache=False`..."
+                )
+                use_cache = False
+        # decoder layers
+        all_hidden_states = () if output_hidden_states else None
+        all_self_attns = () if output_attentions else None
+        all_cross_attentions = () if (output_attentions and encoder_hidden_states is not None) else None
+
+        # check if head_mask/cross_attn_head_mask has a correct number of layers specified if desired
+        for attn_mask, mask_name in zip([head_mask, cross_attn_head_mask], ["head_mask", "cross_attn_head_mask"]):
+            if attn_mask is not None:
+                if attn_mask.size()[0] != len(self.layers):
+                    raise ValueError(
+                        f"The `{mask_name}` should be specified for {len(self.layers)} layers, but it is for"
+                        f" {attn_mask.size()[0]}."
+                    )
+        for idx, decoder_layer in enumerate(self.layers):
+            # add LayerDrop (see https://arxiv.org/abs/1909.11556 for description)
+            if output_hidden_states:
+                all_hidden_states += (hidden_states,)
+            dropout_probability = random.uniform(0, 1)
+            if self.training and (dropout_probability < self.layerdrop):
+                continue
+
+            if self.gradient_checkpointing and self.training:
+                layer_outputs = self._gradient_checkpointing_func(
+                    decoder_layer.forward,
+                    hidden_states,
+                    causal_mask,
+                    cos,
+                    sin,
+                    encoder_hidden_states,
+                    encoder_attention_mask,
+                    head_mask[idx] if head_mask is not None else None,
+                    cross_attn_head_mask[idx] if cross_attn_head_mask is not None else None,
+                    None,
+                    output_attentions,
+                    use_cache,
+                    cache_position,
+                )
+            else:
+                layer_outputs = decoder_layer(
+                    hidden_states,
+                    attention_mask=causal_mask,
+                    cos=cos,
+                    sin=sin,
+                    encoder_hidden_states=encoder_hidden_states,
+                    encoder_attention_mask=encoder_attention_mask,
+                    layer_head_mask=(head_mask[idx] if head_mask is not None else None),
+                    cross_attn_layer_head_mask=(
+                        cross_attn_head_mask[idx] if cross_attn_head_mask is not None else None
+                    ),
+                    past_key_value=past_key_values if use_cache else None,
+                    output_attentions=output_attentions,
+                    use_cache=use_cache,
+                    cache_position=cache_position,
+                )
+            hidden_states = layer_outputs[0]
+
+            if output_attentions:
+                all_self_attns += (layer_outputs[1],)
+
+                if encoder_hidden_states is not None:
+                    all_cross_attentions += (layer_outputs[2],)
+
+        hidden_states = self.layer_norm(hidden_states)
+
+        # add hidden states from the last decoder layer
+        if output_hidden_states:
+            all_hidden_states += (hidden_states,)
+
+        next_cache = past_key_values if use_cache else None
+        if return_self_attention_cache:
+            next_cache = past_key_values.self_attention_cache
+        if return_legacy_cache:
+            next_cache = past_key_values.to_legacy_cache()
+        if not return_dict:
+            return tuple(
+                v
+                for v in [hidden_states, next_cache, all_hidden_states, all_self_attns, all_cross_attentions]
+                if v is not None
+            )
+        return BaseModelOutputWithPastAndCrossAttentions(
+            last_hidden_state=hidden_states,
+            past_key_values=next_cache,
+            hidden_states=all_hidden_states,
+            attentions=all_self_attns,
+            cross_attentions=all_cross_attentions,
+        )
+
+    # Copied from transformers.models.llama.modeling_llama.LlamaModel._update_causal_mask
+    def _update_causal_mask(
+        self,
+        attention_mask: torch.Tensor,
+        input_tensor: torch.Tensor,
+        cache_position: torch.Tensor,
+        past_key_values: Cache,
+        output_attentions: bool,
+    ):
+        """
+        Build the self-attention mask handed to the decoder layers, or return `None` when no explicit mask is used.
+
+        Args:
+            attention_mask: Padding mask, or `None`. A 4D tensor is treated as an already inverted mask; otherwise
+                its last dimension is read as the key length and zeros mark positions that must not be attended.
+            input_tensor: Decoder inputs; provides the batch size (`shape[0]`), the query length (`shape[1]`), the
+                dtype and the device of the mask that is built.
+            cache_position: Positions of the current inputs; keys located after a query's position are masked.
+            past_key_values: Self-attention cache, or `None`. Used for the number of tokens already seen and, when
+                it is a `StaticCache`, for the key length.
+            output_attentions: When `True`, the SDPA-specific shortcuts below are not taken.
+
+        Returns:
+            One of:
+            - `None` (flash-attention-2 without any masked position, or SDPA when
+              `AttentionMaskConverter._ignore_causal_mask_sdpa` says the mask can be dropped);
+            - the `attention_mask` itself (flash-attention-2 with at least one masked position);
+            - a 4D mask holding `0` where attention is allowed and `torch.finfo(dtype).min` elsewhere, optionally
+              post-processed by `AttentionMaskConverter._unmask_unattended` (SDPA on CUDA).
+
+        Raises:
+            ValueError: If a 4D `attention_mask` has a maximum value different from 0.
+        """
+        # TODO: As of torch==2.2.0, the `attention_mask` passed to the model in `generate` is 2D and of dynamic length even when the static
+        # KV cache is used. This is an issue for torch.compile which then recaptures cudagraphs at each decode steps due to the dynamic shapes.
+        # (`recording cudagraph tree for symint key 13`, etc.), which is VERY slow. A workaround is `@torch.compiler.disable`, but this prevents using
+        # `fullgraph=True`. See more context in https://github.com/huggingface/transformers/pull/29114
+
+        # Flash Attention 2 path: the mask is passed through untouched, and only when it actually masks something.
+        if self.config._attn_implementation == "flash_attention_2":
+            if attention_mask is not None and 0.0 in attention_mask:
+                return attention_mask
+            return None
+
+        # For SDPA, when possible, we will rely on its `is_causal` argument instead of its `attn_mask` argument, in
+        # order to dispatch on Flash Attention 2. This feature is not compatible with static cache, as SDPA will fail
+        # to infer the attention mask.
+        past_seen_tokens = past_key_values.get_seq_length() if past_key_values is not None else 0
+        using_static_cache = isinstance(past_key_values, StaticCache)
+
+        # When output attentions is True, sdpa implementation's forward method calls the eager implementation's forward
+        if self.config._attn_implementation == "sdpa" and not using_static_cache and not output_attentions:
+            if AttentionMaskConverter._ignore_causal_mask_sdpa(
+                attention_mask,
+                inputs_embeds=input_tensor,
+                past_key_values_length=past_seen_tokens,
+                is_training=self.training,
+            ):
+                return None
+
+        dtype, device = input_tensor.dtype, input_tensor.device
+        # Most negative finite value of the dtype; used as the "masked" fill value.
+        min_dtype = torch.finfo(dtype).min
+        sequence_length = input_tensor.shape[1]
+        # Key length of the mask: the static cache size, else the width of the given mask, else derived from the
+        # number of cached tokens plus the current inputs.
+        if using_static_cache:
+            target_length = past_key_values.get_max_length()
+        else:
+            target_length = (
+                attention_mask.shape[-1]
+                if isinstance(attention_mask, torch.Tensor)
+                else past_seen_tokens + sequence_length + 1
+            )
+
+        if attention_mask is not None and attention_mask.dim() == 4:
+            # in this case we assume that the mask comes already in inverted form and requires no inversion or slicing
+            if attention_mask.max() != 0:
+                raise ValueError("Custom 4D attention mask should be passed in inverted form with max==0`")
+            causal_mask = attention_mask
+        else:
+            # Start fully masked, shape (sequence_length, target_length).
+            causal_mask = torch.full(
+                (sequence_length, target_length), fill_value=min_dtype, dtype=dtype, device=device
+            )
+            if sequence_length != 1:
+                # Keep the masked value strictly above the diagonal only.
+                causal_mask = torch.triu(causal_mask, diagonal=1)
+            # Multiplying by the boolean zeroes every key index that is <= the query's cache position,
+            # i.e. those keys become visible; later keys keep the masked value.
+            causal_mask *= torch.arange(target_length, device=device) > cache_position.reshape(-1, 1)
+            # Broadcast to (batch, 1, sequence_length, target_length).
+            causal_mask = causal_mask[None, None, :, :].expand(input_tensor.shape[0], 1, -1, -1)
+            if attention_mask is not None:
+                causal_mask = causal_mask.clone()  # copy to contiguous memory for in-place edit
+                mask_length = attention_mask.shape[-1]
+                # The sum is 0 exactly where the causal mask allows attention (0) and `attention_mask` is 0;
+                # those positions are then set to the masked value.
+                padding_mask = causal_mask[:, :, :, :mask_length] + attention_mask[:, None, None, :]
+                padding_mask = padding_mask == 0
+                causal_mask[:, :, :, :mask_length] = causal_mask[:, :, :, :mask_length].masked_fill(
+                    padding_mask, min_dtype
+                )
+        if (
+            self.config._attn_implementation == "sdpa"
+            and attention_mask is not None
+            and attention_mask.device.type == "cuda"
+            and not output_attentions
+        ):
+            # Attend to all tokens in fully masked rows in the causal_mask, for example the relevant first rows when
+            # using left padding. This is required by F.scaled_dot_product_attention memory-efficient attention path.
+            # Details: https://github.com/pytorch/pytorch/issues/110213
+            causal_mask = AttentionMaskConverter._unmask_unattended(causal_mask, min_dtype)
+
+        return causal_mask
+
+
+@add_start_docstrings(
+    "The bare ParlerTTS decoder model outputting raw hidden-states without any specific head on top.",
+    MUSICGEN_START_DOCSTRING,
+)
+# Copied from transformers.models.musicgen.modeling_musicgen.MusicgenModel with Musicgen->ParlerTTS
+class ParlerTTSModel(ParlerTTSPreTrainedModel):
+    def __init__(self, config: ParlerTTSDecoderConfig):
+        super().__init__(config)
+        self.decoder = ParlerTTSDecoder(config)
+        self.config = config
+        # Initialize weights and apply final processing
+        self.post_init()
+
+    def get_input_embeddings(self):
+        return self.decoder.embed_tokens
+
+    def set_input_embeddings(self, value):
+        self.decoder.embed_tokens = value
+
+    def get_decoder(self):
+        return self.decoder
+
+    @add_start_docstrings_to_model_forward(MUSICGEN_DECODER_INPUTS_DOCSTRING)
+    def forward(
+        self,
+        input_ids: torch.LongTensor = None,
+        attention_mask: Optional[torch.Tensor] = None,
+        position_ids: Optional[torch.LongTensor] = None,
+        encoder_hidden_states: Optional[torch.FloatTensor] = None,
+        encoder_attention_mask: Optional[torch.LongTensor] = None,
+        prompt_hidden_states: Optional[torch.FloatTensor] = None,
+        prompt_attention_mask: Optional[torch.LongTensor] = None,
+        head_mask: Optional[torch.Tensor] = None,
+        cross_attn_head_mask: Optional[torch.Tensor] = None,
+        past_key_values: Optional[Union[EncoderDecoderCache, Tuple[torch.FloatTensor]]] = None,
+        inputs_embeds: Optional[torch.FloatTensor] = None,
+        use_cache: Optional[bool] = None,
+        output_attentions: Optional[bool] = None,
+        output_hidden_states: Optional[bool] = None,
+        return_dict: Optional[bool] = None,
+        cache_position: Optional[torch.LongTensor] = None,
+    ) -> Union[Tuple, BaseModelOutputWithPastAndCrossAttentions]:
+        output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
+        output_hidden_states = (
+            output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
+        )
+        use_cache = use_cache if use_cache is not None else self.config.use_cache
+        return_dict = return_dict if return_dict is not None else self.config.use_return_dict
+
+        # decoder outputs consists of (dec_features, past_key_value, dec_hidden, dec_attn)
+        decoder_outputs = self.decoder(
+            input_ids=input_ids,
+            attention_mask=attention_mask,
+            position_ids=position_ids,
+            encoder_attention_mask=encoder_attention_mask,
+            encoder_hidden_states=encoder_hidden_states,
+            prompt_hidden_states=prompt_hidden_states,
+            prompt_attention_mask=prompt_attention_mask,
+            head_mask=head_mask,
+            cross_attn_head_mask=cross_attn_head_mask,
+            past_key_values=past_key_values,
+            inputs_embeds=inputs_embeds,
+            use_cache=use_cache,
+            output_attentions=output_attentions,
+            output_hidden_states=output_hidden_states,
+            return_dict=return_dict,
+            cache_position=cache_position,
+        )
+
+        if not return_dict:
+            return decoder_outputs
+
+        return BaseModelOutputWithPastAndCrossAttentions(
+            last_hidden_state=decoder_outputs.last_hidden_state,
+            past_key_values=decoder_outputs.past_key_values,
+            hidden_states=decoder_outputs.hidden_states,
+            attentions=decoder_outputs.attentions,
+            cross_attentions=decoder_outputs.cross_attentions,
+        )
+
+
+@add_start_docstrings(
+    "The Parler-TTS decoder model with a language modelling head on top.",
+    MUSICGEN_START_DOCSTRING,
+)
+class ParlerTTSForCausalLM(ParlerTTSPreTrainedModel):
+    def __init__(self, config: ParlerTTSDecoderConfig):
+        super().__init__(config)
+
+        self.model = ParlerTTSModel(config)
+
+        self.num_codebooks = config.num_codebooks
+        self.vocab_size = config.vocab_size
+        self.num_codebooks = config.num_codebooks
+        
+        self.use_fused_lm_heads = config.use_fused_lm_heads
+        if self.use_fused_lm_heads:
+            self.lm_heads = nn.Linear(config.hidden_size, config.vocab_size * config.num_codebooks, bias=False)
+        else:
+            self.lm_heads = nn.ModuleList(
+            [nn.Linear(config.hidden_size, config.vocab_size, bias=False) for _ in range(config.num_codebooks)]
+        )
+
+        # Initialize weights and apply final processing
+        self.post_init()
+
+    def get_input_embeddings(self):
+        return self.model.decoder.embed_tokens
+
+    def set_input_embeddings(self, value):
+        self.model.decoder.embed_tokens = value
+
+    def get_output_embeddings(self):
+        return self.lm_heads
+
+    def set_output_embeddings(self, new_embeddings):
+        self.lm_heads = new_embeddings
+
+    def set_decoder(self, decoder):
+        self.model.decoder = decoder
+
+    def get_decoder(self):
+        return self.model.decoder
+
+    @add_start_docstrings_to_model_forward(MUSICGEN_DECODER_INPUTS_DOCSTRING)
+    @replace_return_docstrings(output_type=ParlerTTSCausalLMOutputWithCrossAttentions, config_class=_CONFIG_FOR_DOC)
+    def forward(
+        self,
+        input_ids: torch.LongTensor = None,
+        attention_mask: Optional[torch.Tensor] = None,
+        position_ids: Optional[torch.LongTensor] = None,
+        encoder_hidden_states: Optional[torch.FloatTensor] = None,
+        encoder_attention_mask: Optional[torch.LongTensor] = None,
+        prompt_hidden_states: Optional[torch.FloatTensor] = None,
+        prompt_attention_mask: Optional[torch.LongTensor] = None,
+        head_mask: Optional[torch.Tensor] = None,
+        cross_attn_head_mask: Optional[torch.Tensor] = None,
+        past_key_values: Optional[Tuple[Tuple[torch.FloatTensor]]] = None,
+        inputs_embeds: Optional[torch.FloatTensor] = None,
+        labels: Optional[torch.LongTensor] = None,
+        use_cache: Optional[bool] = None,
+        output_attentions: Optional[bool] = None,
+        output_hidden_states: Optional[bool] = None,
+        return_dict: Optional[bool] = None,
+        cache_position: Optional[torch.LongTensor] = None,
+        loss_reduction: str = "mean",
+    ) -> Union[Tuple, ParlerTTSCausalLMOutputWithCrossAttentions]:
+        r"""
+        labels (`torch.LongTensor` of shape `(batch_size, sequence_length, num_codebooks)`, *optional*):
+            Labels for language modeling. Indices are selected in `[-100, 0, ..., config.vocab_size]`. All labels set
+            to `-100` are ignored (masked), and labels equal to `config.bos_token_id` are masked as well before the
+            loss is computed; the loss is only computed for the remaining labels in `[0, ..., config.vocab_size]`.
+            NOTE(review): no label shifting is performed in this method; any shift is presumably applied by the
+            caller - confirm.
+        loss_reduction (`str`, *optional*, defaults to `"mean"`):
+            Passed as `reduction` to `CrossEntropyLoss` for the per-codebook losses.
+        Returns:
+        """
+
+        return_dict = return_dict if return_dict is not None else self.config.use_return_dict
+
+        # Run the wrapped `ParlerTTSModel`; `labels` and `loss_reduction` are only used below, not forwarded.
+        outputs = self.model(
+            input_ids,
+            attention_mask=attention_mask,
+            position_ids=position_ids,
+            encoder_hidden_states=encoder_hidden_states,
+            encoder_attention_mask=encoder_attention_mask,
+            prompt_hidden_states=prompt_hidden_states,
+            prompt_attention_mask=prompt_attention_mask,
+            head_mask=head_mask,
+            cross_attn_head_mask=cross_attn_head_mask,
+            past_key_values=past_key_values,
+            inputs_embeds=inputs_embeds,
+            use_cache=use_cache,
+            output_attentions=output_attentions,
+            output_hidden_states=output_hidden_states,
+            return_dict=return_dict,
+            cache_position=cache_position,
+        )
+
+        hidden_states = outputs[0]
+
+        # Both branches produce logits laid out as (bsz, num_codebooks, seq_len, vocab_size)
+        if self.use_fused_lm_heads:
+            # single projection, then split the last dimension into (num_codebooks, vocab_size)
+            lm_logits = self.lm_heads(hidden_states).view(hidden_states.shape[0], -1, self.num_codebooks, self.vocab_size).transpose(1,2)
+        else:
+            # one projection per codebook, stacked on a new dimension 1
+            lm_logits = torch.stack([head(hidden_states) for head in self.lm_heads], dim=1)
+
+        loss = None
+        per_codebook_losses = None
+        if labels is not None:
+            codebook_weights = self.config.codebook_weights
+            # the decoder prepends `prompt_hidden_states` (when given) to the sequence, so only the last
+            # `labels.shape[1]` positions of the logits line up with the labels
+            logits = lm_logits[:, :, -labels.shape[1] :]
+
+            loss_fct = CrossEntropyLoss(reduction=loss_reduction)
+            loss = torch.zeros([], device=self.device)
+            
+            per_codebook_losses = []
+
+            # logits: (bsz, num_codebooks, seq_len, vocab_size), labels: (bsz, seq_len, num_codebooks)
+            # BOS labels never contribute to the loss
+            labels = labels.masked_fill(labels == self.config.bos_token_id, -100)
+
+            # we use every codebooks token AND one single EOS at the end of each codebooks:
+            # a position is kept when its input id is not EOS and its label is not -100
+            # assumes `input_ids` is (bsz, num_codebooks, seq_len) here, so that the transpose matches `labels` — TODO confirm
+            mask = (input_ids.transpose(1, 2) != self.config.eos_token_id) & ((labels != -100))
+
+            # per codebook cross-entropy
+            for codebook in range(self.config.num_codebooks):
+                # flatten batch and time so that each row is one prediction
+                codebook_logits = logits[:, codebook].contiguous().view(-1, logits.shape[-1])
+                codebook_mask = mask[..., codebook].contiguous().view(-1)
+                codebook_labels = labels[..., codebook].contiguous().view(-1)
+
+                codebook_loss = loss_fct(codebook_logits[codebook_mask], codebook_labels[codebook_mask])
+                # the unweighted loss is what gets reported per codebook
+                per_codebook_losses.append(codebook_loss)
+
+                if codebook_weights is not None:
+                    codebook_loss = codebook_loss*codebook_weights[codebook]
+                    
+                loss += codebook_loss
+
+            # normalise: weighted average when weights are configured, plain average over codebooks otherwise
+            if codebook_weights is not None:
+                loss = loss / sum(codebook_weights)
+            else:
+                loss = loss / self.config.num_codebooks
+
+        # (bsz, num_codebooks, seq_len, vocab_size) -> (bsz * num_codebooks, seq_len, vocab_size)
+        lm_logits = lm_logits.reshape(-1, *lm_logits.shape[2:])
+
+        if not return_dict:
+            # tuple output: the loss comes first and `per_codebook_losses` last, both only when a loss was computed
+            output = (lm_logits,) + outputs[1:]
+            return ((loss,) + output + (per_codebook_losses, )) if loss is not None else output
+
+        return ParlerTTSCausalLMOutputWithCrossAttentions(
+            loss=loss,
+            logits=lm_logits,
+            past_key_values=outputs.past_key_values,
+            hidden_states=outputs.hidden_states,
+            attentions=outputs.attentions,
+            cross_attentions=outputs.cross_attentions,
+            per_codebook_losses=per_codebook_losses,
+        )
+
+    def prepare_inputs_for_generation(
+        self,
+        input_ids,
+        attention_mask=None,
+        encoder_hidden_states=None,
+        encoder_attention_mask=None,
+        prompt_hidden_states=None,
+        prompt_attention_mask=None,
+        head_mask=None,
+        cross_attn_head_mask=None,
+        past_key_values=None,
+        use_cache=True,
+        delay_pattern_mask=None,
+        guidance_scale=None,
+        cache_position=None,
+        inputs_embeds=None,
+        **kwargs,
+    ):
+        if delay_pattern_mask is None:
+            input_ids, delay_pattern_mask = self.build_delay_pattern_mask(
+                input_ids,
+                bos_token_id=self.generation_config.bos_token_id,
+                pad_token_id=self.generation_config.pad_token_id,
+                max_length=self.generation_config.max_length,
+            )
+
+        # apply the delay pattern mask
+        input_ids = self.apply_delay_pattern_mask(input_ids, delay_pattern_mask)
+
+        position_ids = kwargs.get("position_ids", None)
+        if attention_mask is not None and position_ids is None:
+            # create position_ids on the fly for batch generation
+            position_ids = attention_mask.long().cumsum(-1) - 1
+            position_ids.masked_fill_(attention_mask == 0, 1)
+
+        position_ids = kwargs.get("position_ids", None)
+        if attention_mask is not None and position_ids is None:
+            # create position_ids on the fly for batch generation
+            position_ids = attention_mask.long().cumsum(-1) - 1
+            position_ids.masked_fill_(attention_mask == 0, 1)
+        
+        if guidance_scale is not None and guidance_scale > 1:
+            # for classifier free guidance we need to replicate the decoder args across the batch dim (we'll split these
+            # before sampling)
+            input_ids = input_ids.repeat((2, 1))
+            if attention_mask is not None:
+                attention_mask = attention_mask.repeat((2, 1))
+            if position_ids is not None:
+                position_ids = position_ids.repeat((2, 1))
+
+
+        if past_key_values is not None:
+            input_ids = input_ids[:, -1:]
+            if position_ids is not None:
+                position_ids = position_ids[:, -input_ids.shape[1] :]
+
+            # we only want to use prompt signal in the 1st generation step but keeping the attention mask
+            prompt_hidden_states = None
+
+        return {
+            "input_ids": input_ids.contiguous(), # `contiguous()` needed for compilation use cases
+            "attention_mask": attention_mask,
+            "position_ids": position_ids,
+            "encoder_hidden_states": encoder_hidden_states,
+            "encoder_attention_mask": encoder_attention_mask,
+            "prompt_hidden_states": prompt_hidden_states,
+            "prompt_attention_mask": prompt_attention_mask,
+            "head_mask": head_mask,
+            "cross_attn_head_mask": cross_attn_head_mask,
+            "past_key_values": past_key_values,
+            "use_cache": use_cache,
+            "cache_position": cache_position,
+            "inputs_embeds": inputs_embeds,
+        }
+
+    # Ignore copy
+    def build_delay_pattern_mask(
+        self, input_ids: torch.LongTensor, bos_token_id: int, pad_token_id: int, max_length: int = None
+    ):
+        """Build a delayed pattern mask to the input_ids. Each codebook is offset by the previous codebook by
+        one, giving a delayed pattern mask at the start of sequence and end of sequence. Take the example where there
+        are 4 codebooks and a max sequence length of 8, we have the delayed pattern mask of shape `(codebooks,
+        seq_len)`:
+        - [B, -1, -1, -1, -1, P, P, P]
+        - [B, B, -1, -1, -1, -1, P, P]
+        - [B, B, B, -1, -1, -1, -1, P]
+        - [B, B, B, B, -1, -1, -1, -1]
+        where P is the special padding token id and -1 indicates that the token is valid for prediction. If we include
+        a prompt (decoder input ids), the -1 positions indicate where new tokens should be predicted. Otherwise, the
+        mask is set to the value in the prompt:
+        - [B, a, b, -1, -1, P, P, P]
+        - [B, B, c, d, -1, -1, P, P]
+        - [B, B, B, e, f, -1, -1, P]
+        - [B, B, B, B, g, h, -1, -1]
+        where a-h indicate the input prompt (decoder input ids) that are offset by 1. Now, we only override the -1
+        tokens in our prediction.
+        """
+        max_length = max_length if max_length is not None else self.generation_config.max_length
+        return build_delay_pattern_mask(input_ids, bos_token_id, pad_token_id, max_length, self.num_codebooks)
+
+    @staticmethod
+    def apply_delay_pattern_mask(input_ids, decoder_pad_token_mask):
+        """Apply a delay pattern mask to the decoder input ids, only preserving predictions where
+        the mask is set to -1, and otherwise setting to the value detailed in the mask."""
+        return apply_delay_pattern_mask(input_ids, decoder_pad_token_mask)
+
+    @torch.no_grad()
+    def generate(
+        self,
+        inputs: Optional[torch.Tensor] = None,
+        generation_config: Optional[GenerationConfig] = None,
+        logits_processor: Optional[LogitsProcessorList] = None,
+        stopping_criteria: Optional[StoppingCriteriaList] = None,
+        synced_gpus: Optional[bool] = None,
+        streamer: Optional["BaseStreamer"] = None,
+        **kwargs,
+    ):
+        """
+        Generates sequences of token ids for models with a language modeling head.
+
+        Only greedy decoding and sampling (`num_beams == 1` and `num_beam_groups == 1`) are handled by this method;
+        any other combination raises a `ValueError`. A delay pattern mask is built over the codebooks before
+        decoding and reverted on the generated ids before they are returned.
+
+        <Tip warning={true}>
+
+        Most generation-controlling parameters are set in `generation_config` which, if not passed, will be set to the
+        model's default generation configuration. You can override any `generation_config` by passing the corresponding
+        parameters to generate(), e.g. `.generate(inputs, num_beams=4, do_sample=True)`.
+
+        For an overview of generation strategies and code examples, check out the [following
+        guide](./generation_strategies).
+
+        </Tip>
+
+        Parameters:
+            inputs (`torch.Tensor` of varying shape depending on the modality, *optional*):
+                The sequence used as a prompt for the generation or as model inputs to the encoder. If `None` the
+                method initializes it with `bos_token_id` and a batch size of 1. For decoder-only models `inputs`
+                should be in the format `input_ids`. For encoder-decoder models *inputs* can represent any of
+                `input_ids`, `input_values`, `input_features`, or `pixel_values`.
+            generation_config (`~generation.GenerationConfig`, *optional*):
+                The generation configuration to be used as base parametrization for the generation call. `**kwargs`
+                passed to generate matching the attributes of `generation_config` will override them. If
+                `generation_config` is not provided, the default will be used, which had the following loading
+                priority: 1) from the `generation_config.json` model file, if it exists; 2) from the model
+                configuration. Please note that unspecified parameters will inherit [`~generation.GenerationConfig`]'s
+                default values, whose documentation should be checked to parameterize generation.
+            logits_processor (`LogitsProcessorList`, *optional*):
+                Custom logits processors that complement the default logits processors built from arguments and
+                generation config. If a logit processor is passed that is already created with the arguments or a
+                generation config an error is thrown. This feature is intended for advanced users.
+            stopping_criteria (`StoppingCriteriaList`, *optional*):
+                Custom stopping criteria that complement the default stopping criteria built from arguments and a
+                generation config. If a stopping criteria is passed that is already created with the arguments or a
+                generation config an error is thrown. This feature is intended for advanced users.
+            synced_gpus (`bool`, *optional*, defaults to `False`):
+                Whether to continue running the while loop until max_length (needed for ZeRO stage 3)
+            streamer (`BaseStreamer`, *optional*):
+                Streamer object that will be used to stream the generated sequences. Generated tokens are passed
+                through `streamer.put(token_ids)` and the streamer is responsible for any further processing.
+            kwargs (`Dict[str, Any]`, *optional*):
+                Ad hoc parametrization of `generation_config` and/or additional model-specific kwargs that will be
+                forwarded to the `forward` function of the model. If the model is an encoder-decoder model, encoder
+                specific kwargs should not be prefixed and decoder specific kwargs should be prefixed with *decoder_*.
+
+        Return:
+            [`~utils.ModelOutput`] or `torch.LongTensor`: A [`~utils.ModelOutput`] (if `return_dict_in_generate=True`
+            or when `config.return_dict_in_generate=True`) or a `torch.LongTensor`. In both cases the generated ids
+            are reshaped to `(batch_size, num_codebooks, -1)` once the delay pattern has been reverted.
+
+                If the model is *not* an encoder-decoder model (`model.config.is_encoder_decoder=False`), the possible
+                [`~utils.ModelOutput`] types are:
+
+                    - [`~generation.GenerateDecoderOnlyOutput`],
+                    - [`~generation.GenerateBeamDecoderOnlyOutput`]
+
+                If the model is an encoder-decoder model (`model.config.is_encoder_decoder=True`), the possible
+                [`~utils.ModelOutput`] types are:
+
+                    - [`~generation.GenerateEncoderDecoderOutput`],
+                    - [`~generation.GenerateBeamEncoderDecoderOutput`]
+
+        Raises:
+            ValueError: If `num_return_sequences > 1` is requested with greedy decoding, if the generation mode is
+                neither greedy nor sampling, or if `guidance_scale > 1` is set while a
+                `ClassifierFreeGuidanceLogitsProcessor` is already present in the logits processors.
+        """
+        # 1. Handle `generation_config` and kwargs that might update it, and validate the resulting objects
+        if generation_config is None:
+            generation_config = self.generation_config
+
+        # work on a copy so that the caller's / model's generation config is never mutated by this call
+        generation_config = copy.deepcopy(generation_config)
+        model_kwargs = generation_config.update(**kwargs)  # All unused kwargs must be model kwargs
+        generation_config.validate()
+        self._validate_model_kwargs(model_kwargs.copy())
+
+        # 2. Set generation parameters if not already defined
+        logits_processor = logits_processor if logits_processor is not None else LogitsProcessorList()
+        stopping_criteria = stopping_criteria if stopping_criteria is not None else StoppingCriteriaList()
+
+        requires_attention_mask = "encoder_outputs" not in model_kwargs
+        kwargs_has_attention_mask = model_kwargs.get("attention_mask", None) is not None
+
+        # 3. Define model inputs
+        input_ids, model_input_name, model_kwargs = self._prepare_model_inputs(
+            inputs, generation_config.bos_token_id, model_kwargs
+        )
+        # the leading dimension of `input_ids` is divided by the number of codebooks to recover the batch size
+        batch_size = input_ids.shape[0] // self.num_codebooks
+        self._prepare_special_tokens(generation_config, kwargs_has_attention_mask, device=input_ids.device)
+
+        # 4. Define other model kwargs
+        model_kwargs["use_cache"] = generation_config.use_cache
+        model_kwargs["guidance_scale"] = generation_config.guidance_scale
+
+        # only build an attention mask when the caller did not provide one and no `encoder_outputs` were passed
+        if model_kwargs.get("attention_mask", None) is None and requires_attention_mask:
+            model_kwargs["attention_mask"] = self._prepare_attention_mask_for_generation(
+                input_ids, generation_config.pad_token_id, generation_config.eos_token_id
+            )
+
+        # 5. Prepare `max_length` depending on other stopping criteria.
+        input_ids_length = input_ids.shape[-1]
+        has_default_max_length = kwargs.get("max_length") is None and generation_config.max_length is not None
+        has_default_min_length = kwargs.get("min_length") is None and generation_config.min_length is not None
+        generation_config = self._prepare_generated_length(
+            generation_config=generation_config,
+            has_default_max_length=has_default_max_length,
+            has_default_min_length=has_default_min_length,
+            model_input_name=model_input_name,
+            inputs_tensor=input_ids,
+            input_ids_length=input_ids_length,
+        )
+
+        # 6. Prepare `input_ids` which will be used for auto-regressive generation
+        # Build the delay pattern mask for offsetting each codebook prediction by 1 (this behaviour is specific to Parler-TTS)
+        # The decoder start token is passed as the padding value of the pattern.
+        input_ids, delay_pattern_mask = self.build_delay_pattern_mask(
+            input_ids,
+            pad_token_id=generation_config._decoder_start_token_tensor,
+            max_length=generation_config.max_length,
+        )
+
+        if streamer is not None:
+            streamer.put(input_ids.cpu())
+
+        # stash the delay mask so that we don't have to recompute it in each forward pass
+        model_kwargs["delay_pattern_mask"] = delay_pattern_mask
+
+        # 7. determine generation mode
+        is_greedy_gen_mode = (
+            (generation_config.num_beams == 1)
+            and (generation_config.num_beam_groups == 1)
+            and generation_config.do_sample is False
+        )
+        is_sample_gen_mode = (
+            (generation_config.num_beams == 1)
+            and (generation_config.num_beam_groups == 1)
+            and generation_config.do_sample is True
+        )
+
+        # 7.5 prepare batched CFG externally (to enable coexistence with the unbatched CFG)
+        if generation_config.guidance_scale is not None and generation_config.guidance_scale > 1:
+            cfg_logits_processor_item = ClassifierFreeGuidanceLogitsProcessor(generation_config.guidance_scale)
+            # NOTE(review): cleared before `_get_logits_processor` runs, presumably so that it does not build a
+            # second CFG processor from the config - confirm. `model_kwargs["guidance_scale"]` keeps the value.
+            generation_config.guidance_scale = None
+        else:
+            cfg_logits_processor_item = None
+
+        # 8. prepare distribution pre_processing samplers
+        logits_processor = self._get_logits_processor(
+            generation_config=generation_config,
+            input_ids_seq_length=input_ids_length,
+            encoder_input_ids=input_ids,
+            prefix_allowed_tokens_fn=None,
+            logits_processor=logits_processor,
+            device=input_ids.device,
+        )
+
+        # 8.5 now prepend the cfg_logits_processor_item to the list of processors if it exists
+        if cfg_logits_processor_item is not None:
+            # Imitate the behaviour of transformers.generation.utils._merge_criteria_processor_list
+            # i.e. refuse to register two processors of the exact same type
+            for logits_processor_item in logits_processor:
+                if type(logits_processor_item) is type(cfg_logits_processor_item):
+                    raise ValueError(
+                        f"A custom logits processor of type {type(cfg_logits_processor_item)} with values {cfg_logits_processor_item} has been passed to"
+                        f" `.generate()`, but it has already been created with the values {logits_processor_item}. {logits_processor_item} has been"
+                        " created by passing the corresponding arguments to generate or by the model's config default"
+                        f" values. If you just want to change the default values of logits processor consider passing"
+                        f" them as arguments to `.generate()` instead of using a custom logits processor."
+                    )
+            logits_processor.insert(0, cfg_logits_processor_item)
+
+        # 9. prepare stopping criteria
+        stopping_criteria = self._get_stopping_criteria(
+            generation_config=generation_config, stopping_criteria=stopping_criteria
+        )
+
+        if is_greedy_gen_mode:
+            if generation_config.num_return_sequences > 1:
+                raise ValueError(
+                    "num_return_sequences has to be 1 when doing greedy search, "
+                    f"but is {generation_config.num_return_sequences}."
+                )
+
+            # 10. run greedy search (delegated to `_sample`, called without a logits warper)
+            outputs = self._sample(
+                input_ids,
+                logits_processor=logits_processor,
+                stopping_criteria=stopping_criteria,
+                generation_config=generation_config,
+                synced_gpus=synced_gpus,
+                streamer=streamer,
+                **model_kwargs,
+            )
+
+        elif is_sample_gen_mode:
+            # 10. prepare logits warper
+            logits_warper = self._get_logits_warper(generation_config, device=input_ids.device)
+
+            # expand input_ids with `num_return_sequences` additional sequences per batch
+            input_ids, model_kwargs = self._expand_inputs_for_generation(
+                input_ids=input_ids,
+                expand_size=generation_config.num_return_sequences,
+                **model_kwargs,
+            )
+
+            # 11. run sample
+            outputs = self._sample(
+                input_ids,
+                logits_processor=logits_processor,
+                logits_warper=logits_warper,
+                stopping_criteria=stopping_criteria,
+                generation_config=generation_config,
+                synced_gpus=synced_gpus,
+                streamer=streamer,
+                **model_kwargs,
+            )
+
+        else:
+            raise ValueError(
+                "Got incompatible mode for generation, should be one of greedy or sampling. "
+                "Ensure that beam search is de-activated by setting `num_beams=1` and `num_beam_groups=1`."
+            )
+
+        # `_sample` returns either a ModelOutput (ids under `.sequences`) or the ids tensor directly
+        if generation_config.return_dict_in_generate:
+            output_ids = outputs.sequences
+        else:
+            output_ids = outputs
+
+        # apply the pattern mask to the final ids
+        output_ids = self.apply_delay_pattern_mask(output_ids, model_kwargs["delay_pattern_mask"])
+
+        # revert the pattern delay mask by filtering the eos and bos token ids from the delay pattern mask
+        _, mask = self.build_delay_pattern_mask(
+            input_ids,
+            bos_token_id=generation_config.bos_token_id,
+            pad_token_id=generation_config.pad_token_id,
+            max_length=output_ids.shape[1],
+        )
+
+        # keep only the positions that are neither BOS nor PAD in the rebuilt mask
+        mask = (mask != generation_config._bos_token_tensor) & (mask != generation_config._pad_token_tensor)
+        # NOTE(review): `batch_size` was computed before the `num_return_sequences` expansion of the sampling
+        # branch - confirm this reshape is still correct when `num_return_sequences > 1`.
+        output_ids = output_ids[mask].reshape(batch_size, self.num_codebooks, -1)
+
+        if generation_config.return_dict_in_generate:
+            outputs.sequences = output_ids
+            return outputs
+        else:
+            return output_ids
+
+
+@add_start_docstrings(
+    "The composite Parler-TTS model with a text encoder, audio encoder and ParlerTTS decoder, "
+    "for music generation tasks with one or both of text and audio prompts.",
+    MUSICGEN_START_DOCSTRING,
+)
+class ParlerTTSForConditionalGeneration(PreTrainedModel):
+    config_class = ParlerTTSConfig
+    base_model_prefix = "encoder_decoder"
+    main_input_name = "input_ids"
+    supports_gradient_checkpointing = True
+    _supports_flash_attn_2 = True
+    _supports_sdpa = True
+    _supports_cache_class = True
+    _supports_static_cache = True
+
+    def __init__(
+        self,
+        config: Optional[ParlerTTSConfig] = None,
+        text_encoder: Optional[PreTrainedModel] = None,
+        audio_encoder: Optional[PreTrainedModel] = None,
+        decoder: Optional[ParlerTTSForCausalLM] = None,
+    ):
+        """Build the composite model from a configuration and/or already instantiated sub-models.
+
+        Args:
+            config: Composite configuration. May only be omitted when all three sub-models are given, in which
+                case it is derived from their individual configs.
+            text_encoder: Text encoder to use; built from `config.text_encoder` when `None`.
+            audio_encoder: Audio encoder to use; built from `config.audio_encoder` when `None`.
+            decoder: Parler-TTS decoder to use; built from `config.decoder` when `None`.
+
+        Raises:
+            ValueError: If neither a config nor all three sub-models are provided, if `config` is not an instance
+                of `config_class`, if `config.decoder.cross_attention_hidden_size` is set but differs from the
+                text encoder's `hidden_size`, if the text encoder exposes output embeddings (an LM head), or if
+                the decoder's `forward` does not accept `encoder_hidden_states`.
+        """
+        if config is None and (text_encoder is None or audio_encoder is None or decoder is None):
+            raise ValueError(
+                "Either a configuration has to be provided, or all three of text encoder, audio encoder and Parler-TTS decoder."
+            )
+        if config is None:
+            config = ParlerTTSConfig.from_sub_models_config(text_encoder.config, audio_encoder.config, decoder.config)
+        else:
+            if not isinstance(config, self.config_class):
+                raise ValueError(f"Config: {config} has to be of type {self.config_class}")
+
+        # an explicit cross-attention width must match what the text encoder produces
+        if config.decoder.cross_attention_hidden_size is not None:
+            if config.decoder.cross_attention_hidden_size != config.text_encoder.hidden_size:
+                raise ValueError(
+                    "If `cross_attention_hidden_size` is specified in the Parler-TTS decoder's configuration, it has to be equal"
+                    f" to the text encoder's `hidden_size`. Got {config.decoder.cross_attention_hidden_size} for"
+                    f" `config.decoder.cross_attention_hidden_size` and {config.text_encoder.hidden_size} for"
+                    " `config.text_encoder.hidden_size`."
+                )
+
+        # initialize with config
+        super().__init__(config)
+
+        # instantiate from the sub-configs any sub-model that was not passed in
+        if text_encoder is None:
+            from transformers.models.auto.modeling_auto import AutoModelForTextEncoding
+
+            text_encoder = AutoModelForTextEncoding.from_config(config.text_encoder)
+
+        if audio_encoder is None:
+            from transformers.models.auto.modeling_auto import AutoModel
+
+            audio_encoder = AutoModel.from_config(config.audio_encoder)
+
+        if decoder is None:
+            decoder = ParlerTTSForCausalLM._from_config(config.decoder)
+
+        self.text_encoder = text_encoder
+        self.audio_encoder = audio_encoder
+        self.decoder = decoder
+
+        # warn whenever a sub-model's own config differs from the shared one that is about to replace it
+        if self.text_encoder.config.to_dict() != self.config.text_encoder.to_dict():
+            logger.warning(
+                f"Config of the text_encoder: {self.text_encoder.__class__} is overwritten by shared text_encoder config:"
+                f" {self.config.text_encoder}"
+            )
+        if self.audio_encoder.config.to_dict() != self.config.audio_encoder.to_dict():
+            logger.warning(
+                f"Config of the audio_encoder: {self.audio_encoder.__class__} is overwritten by shared audio_encoder config:"
+                f" {self.config.audio_encoder}"
+            )
+        if self.decoder.config.to_dict() != self.config.decoder.to_dict():
+            logger.warning(
+                f"Config of the decoder: {self.decoder.__class__} is overwritten by shared decoder config:"
+                f" {self.config.decoder}"
+            )
+
+        # make sure that the individual model's config refers to the shared config
+        # so that the updates to the config will be synced
+        # (each sub-model's attention implementation is copied onto the shared sub-config first, so it is kept)
+        self.config.text_encoder._attn_implementation = self.text_encoder.config._attn_implementation
+        self.config.audio_encoder._attn_implementation = self.audio_encoder.config._attn_implementation
+        self.config.decoder._attn_implementation = self.decoder.config._attn_implementation
+        self.text_encoder.config = self.config.text_encoder
+        self.audio_encoder.config = self.config.audio_encoder
+        self.decoder.config = self.config.decoder
+
+        # text encoder outputs might need to be projected to different dimension for decoder
+        # (the projection is only created when the hidden sizes differ and no cross-attention size is configured)
+        if (
+            self.text_encoder.config.hidden_size != self.decoder.config.hidden_size
+            and self.decoder.config.cross_attention_hidden_size is None
+        ):
+            self.enc_to_dec_proj = nn.Linear(self.text_encoder.config.hidden_size, self.decoder.config.hidden_size)
+
+        # prompt embeddings
+        self.embed_prompts = nn.Embedding(config.vocab_size, self.decoder.config.hidden_size)
+
+        # positional embeddings are only created when the prompt goes through cross-attention
+        self.prompt_cross_attention = config.prompt_cross_attention
+        if config.prompt_cross_attention:
+            self.embed_positions = ParlerTTSSinusoidalPositionalEmbedding(
+                config.decoder.max_position_embeddings,
+                config.decoder.hidden_size,
+            )
+
+        if self.text_encoder.get_output_embeddings() is not None:
+            raise ValueError(
+                f"The encoder {self.text_encoder} should not have a LM Head. Please use a model without and LM Head"
+            )
+
+        # the decoder's `forward` must accept `encoder_hidden_states`
+        decoder_signature = set(inspect.signature(self.decoder.forward).parameters.keys())
+        if "encoder_hidden_states" not in decoder_signature:
+            raise ValueError(
+                "The selected decoder is not prepared for the encoder hidden states to be passed. Please see the "
+                "following discussion on GitHub: https://github.com/huggingface/transformers/issues/23350"
+            )
+
+        # record whether the audio encoder's `decode` accepts an `audio_scales` argument
+        audio_encoder_signature = set(inspect.signature(self.audio_encoder.decode).parameters.keys())
+        self.use_audio_scales = "audio_scales" in audio_encoder_signature
+
+        # flag set for "encodec", "dac_on_the_hub", and "dac" when DAC is not integrated into transformers
+        # NOTE(review): presumably these encoders expect 4-dimensional audio codes - confirm against the users of
+        # `use_4dim_audio_codes`
+        self.use_4dim_audio_codes = False
+        audio_type = audio_encoder.config.model_type
+        if audio_type in {"encodec", "dac_on_the_hub"} or (audio_type == "dac" and not is_dac_integrated_to_transformers):
+            self.use_4dim_audio_codes = True 
+ 
+        # Initialize projection and embedding layers and tie text encoder and decoder weights if set accordingly
+        self.post_init()
+
+    def _init_weights(self, module):
+        std = self.decoder.config.initializer_factor
+        if isinstance(module, (nn.Linear, nn.Conv1d)):
+            module.weight.data.normal_(mean=0.0, std=std)
+            if module.bias is not None:
+                module.bias.data.zero_()
+        elif isinstance(module, nn.Embedding):
+            module.weight.data.normal_(mean=0.0, std=std)
+            if module.padding_idx is not None:
+                module.weight.data[module.padding_idx].zero_()
+
+    def tie_weights(self):
+        # tie text encoder & decoder if needed
+        if self.config.tie_encoder_decoder:
+            # tie text encoder and decoder base model
+            decoder_base_model_prefix = self.decoder.base_model_prefix
+            self._tie_encoder_decoder_weights(
+                self.text_encoder, self.decoder._modules[decoder_base_model_prefix], self.decoder.base_model_prefix
+            )
+
+    def get_audio_encoder(self):
+        """Return the audio encoder sub-model held by this composite model."""
+        return self.audio_encoder
+
+    def get_text_encoder(self):
+        """Return the text encoder sub-model held by this composite model."""
+        return self.text_encoder
+
+    def get_encoder(self):
+        """Return the encoder used for generation, which for this model is the text encoder."""
+        # get the text encoder to compute the encoder hidden-states for generation
+        return self.get_text_encoder()
+
+    def get_decoder(self):
+        """Return the decoder sub-model held by this composite model."""
+        return self.decoder
+
+    def get_input_embeddings(self):
+        """Return the input embeddings of the text encoder."""
+        return self.text_encoder.get_input_embeddings()
+
+    def get_output_embeddings(self):
+        """Return the output embeddings of the decoder."""
+        return self.decoder.get_output_embeddings()
+
+    def set_output_embeddings(self, new_embeddings):
+        """Forward `new_embeddings` to the decoder's `set_output_embeddings` and return its result."""
+        return self.decoder.set_output_embeddings(new_embeddings)
+
+    @classmethod
+    def from_pretrained(cls, pretrained_model_name_or_path, *model_args, **kwargs):
+        r"""
+        Example:
+
+        ```python
+        >>> from parler_tts import ParlerTTSForConditionalGeneration
+
+        >>> model = ParlerTTSForConditionalGeneration.from_pretrained("parler-tts/parler-tts-mini-v1")
+        ```"""
+
+        # At the moment fast initialization is not supported for composite models
+        if kwargs.get("_fast_init", False):
+            logger.warning(
+                "Fast initialization is currently not supported for ParlerTTSForConditionalGeneration. "
+                "Falling back to slow initialization..."
+            )
+        kwargs["_fast_init"] = False
+
+        return super().from_pretrained(pretrained_model_name_or_path, *model_args, **kwargs)
+
+    @classmethod
+    def from_sub_models_pretrained(
+        cls,
+        text_encoder_pretrained_model_name_or_path: str = None,
+        audio_encoder_pretrained_model_name_or_path: str = None,
+        decoder_pretrained_model_name_or_path: str = None,
+        *model_args,
+        **kwargs,
+    ) -> PreTrainedModel:
+        r"""
+        Instantiate a text encoder, an audio encoder, and a Parler-TTS decoder from one, two or three base classes of the
+        library from pretrained model checkpoints.
+
+
+        The model is set in evaluation mode by default using `model.eval()` (Dropout modules are deactivated). To train
+        the model, you need to first set it back in training mode with `model.train()`.
+
+        Params:
+            text_encoder_pretrained_model_name_or_path (`str`, *optional*):
+                Information necessary to initiate the text encoder. Can be either:
+
+                    - A string, the *model id* of a pretrained model hosted inside a model repo on huggingface.co.
+                      Valid model ids can be located at the root-level, like `t5-base`, or namespaced under a user or
+                      organization name, like `google/flan-t5-base`.
+                    - A path to a *directory* containing model weights saved using
+                      [`~PreTrainedModel.save_pretrained`], e.g., `./my_model_directory/`.
+
+            audio_encoder_pretrained_model_name_or_path (`str`, *optional*):
+                Information necessary to initiate the audio encoder. Can be either:
+
+                    - A string, the *model id* of a pretrained model hosted inside a model repo on huggingface.co.
+                      Valid model ids can be located at the root-level, like `bert-base-uncased`, or namespaced under a
+                      user or organization name, like `facebook/encodec_24khz`.
+                    - A path to a *directory* containing model weights saved using
+                      [`~PreTrainedModel.save_pretrained`], e.g., `./my_model_directory/`.
+
+            decoder_pretrained_model_name_or_path (`str`, *optional*, defaults to `None`):
+                Information necessary to initiate the decoder. Can be either:
+
+                    - A string, the *model id* of a pretrained model hosted inside a model repo on huggingface.co.
+                      Valid model ids can be located at the root-level, like `gpt2`, or namespaced under a user or
+                      organization name, like `parler-tts/parler-tts-mini-v1`.
+                    - A path to a *directory* containing model weights saved using
+                      [`~PreTrainedModel.save_pretrained`], e.g., `./my_model_directory/`.
+
+            model_args (remaining positional arguments, *optional*):
+                All remaining positional arguments will be passed to the underlying model's `__init__` method.
+
+            kwargs (remaining dictionary of keyword arguments, *optional*):
+                Can be used to update the configuration object (after it being loaded) and initiate the model (e.g.,
+                `output_attentions=True`).
+
+                - To update the text encoder configuration, use the prefix *text_encoder_* for each configuration
+                  parameter.
+                - To update the audio encoder configuration, use the prefix *audio_encoder_* for each configuration
+                  parameter.
+                - To update the decoder configuration, use the prefix *decoder_* for each configuration parameter.
+                - To update the parent model configuration, do not use a prefix for each configuration parameter.
+                - An already instantiated sub-model can be passed as `text_encoder_model`, `audio_encoder_model` or
+                  `decoder_model`; the corresponding name or path is then not required.
+
+                Behaves differently depending on whether a `config` is provided or automatically loaded.
+
+        Returns:
+            An instance of `cls` wrapping the three sub-models, with a `ParlerTTSConfig` built from their configs
+            and the remaining un-prefixed `kwargs`.
+
+        Raises:
+            ValueError: If a sub-model is neither passed as `<prefix>_model` nor given a pretrained name or path.
+
+        Example:
+
+        ```python
+        >>> from parler_tts import ParlerTTSForConditionalGeneration
+
+        >>> # initialize a parler_tts model from a t5 text encoder, encodec audio encoder, and parler_tts decoder
+        >>> model = ParlerTTSForConditionalGeneration.from_sub_models_pretrained(
+        ...     text_encoder_pretrained_model_name_or_path="t5-base",
+        ...     audio_encoder_pretrained_model_name_or_path="facebook/encodec_24khz",
+        ...     decoder_pretrained_model_name_or_path="parler-tts/parler-tts-mini-v1",
+        ... )
+        >>> # saving model after fine-tuning
+        >>> model.save_pretrained("./parler_tts-ft")
+        >>> # load fine-tuned model
+        >>> model = ParlerTTSForConditionalGeneration.from_pretrained("./parler_tts-ft")
+        ```"""
+
+        # split the keyword arguments per sub-model; the routing prefix is stripped from each key
+        kwargs_text_encoder = {
+            argument[len("text_encoder_") :]: value
+            for argument, value in kwargs.items()
+            if argument.startswith("text_encoder_")
+        }
+
+        kwargs_audio_encoder = {
+            argument[len("audio_encoder_") :]: value
+            for argument, value in kwargs.items()
+            if argument.startswith("audio_encoder_")
+        }
+
+        kwargs_decoder = {
+            argument[len("decoder_") :]: value for argument, value in kwargs.items() if argument.startswith("decoder_")
+        }
+
+        # remove text encoder, audio encoder and decoder kwargs from kwargs
+        for key in kwargs_text_encoder.keys():
+            del kwargs["text_encoder_" + key]
+        for key in kwargs_audio_encoder.keys():
+            del kwargs["audio_encoder_" + key]
+        for key in kwargs_decoder.keys():
+            del kwargs["decoder_" + key]
+
+        # Load and initialize the encoder and decoder
+        # The distinction between encoder and decoder at the model level is made
+        # by the value of the flag `is_decoder` that we need to set correctly.
+        # A ready-made text encoder may have been passed as `text_encoder_model`.
+        text_encoder = kwargs_text_encoder.pop("model", None)
+        if text_encoder is None:
+            if text_encoder_pretrained_model_name_or_path is None:
+                raise ValueError(
+                    "If `text_encoder_model` is not defined as an argument, a `text_encoder_pretrained_model_name_or_path` has "
+                    "to be defined."
+                )
+
+            if "config" not in kwargs_text_encoder:
+                # load the config ourselves; kwargs it did not consume are kept for the model's `from_pretrained`
+                encoder_config, kwargs_text_encoder = AutoConfig.from_pretrained(
+                    text_encoder_pretrained_model_name_or_path, **kwargs_text_encoder, return_unused_kwargs=True
+                )
+
+                # force encoder behaviour on a checkpoint that was configured as a decoder
+                if encoder_config.is_decoder is True or encoder_config.add_cross_attention is True:
+                    logger.info(
+                        f"Initializing {text_encoder_pretrained_model_name_or_path} as a text_encoder model "
+                        "from a decoder model. Cross-attention and casual mask are disabled."
+                    )
+                    encoder_config.is_decoder = False
+                    encoder_config.add_cross_attention = False
+
+                kwargs_text_encoder["config"] = encoder_config
+
+            text_encoder = AutoModelForTextEncoding.from_pretrained(
+                text_encoder_pretrained_model_name_or_path, *model_args, **kwargs_text_encoder
+            )
+
+        # A ready-made audio encoder may have been passed as `audio_encoder_model`.
+        audio_encoder = kwargs_audio_encoder.pop("model", None)
+        if audio_encoder is None:
+            if audio_encoder_pretrained_model_name_or_path is None:
+                raise ValueError(
+                    "If `audio_encoder_model` is not defined as an argument, an `audio_encoder_pretrained_model_name_or_path` has "
+                    "to be defined."
+                )
+
+            if "config" not in kwargs_audio_encoder:
+                encoder_config, kwargs_audio_encoder = AutoConfig.from_pretrained(
+                    audio_encoder_pretrained_model_name_or_path, **kwargs_audio_encoder, return_unused_kwargs=True
+                )
+
+                # same encoder-mode enforcement as for the text encoder
+                if encoder_config.is_decoder is True or encoder_config.add_cross_attention is True:
+                    logger.info(
+                        f"Initializing {audio_encoder_pretrained_model_name_or_path} as an audio_encoder model "
+                        "from a decoder model. Cross-attention and casual mask are disabled."
+                    )
+                    encoder_config.is_decoder = False
+                    encoder_config.add_cross_attention = False
+
+                kwargs_audio_encoder["config"] = encoder_config
+
+            audio_encoder = AutoModel.from_pretrained(
+                audio_encoder_pretrained_model_name_or_path, *model_args, **kwargs_audio_encoder
+            )
+
+        # A ready-made decoder may have been passed as `decoder_model`.
+        decoder = kwargs_decoder.pop("model", None)
+        if decoder is None:
+            if decoder_pretrained_model_name_or_path is None:
+                raise ValueError(
+                    "If `decoder_model` is not defined as an argument, a `decoder_pretrained_model_name_or_path` has "
+                    "to be defined."
+                )
+
+            if "config" not in kwargs_decoder:
+                decoder_config, kwargs_decoder = ParlerTTSDecoderConfig.from_pretrained(
+                    decoder_pretrained_model_name_or_path, **kwargs_decoder, return_unused_kwargs=True
+                )
+
+                # when a composite config comes back, keep only its decoder part
+                if isinstance(decoder_config, ParlerTTSConfig):
+                    decoder_config = decoder_config.decoder
+
+                # force decoder behaviour with cross-attention enabled
+                if decoder_config.is_decoder is False or decoder_config.add_cross_attention is False:
+                    logger.info(
+                        f"Initializing {decoder_pretrained_model_name_or_path} as a decoder model. Cross attention"
+                        f" layers are added to {decoder_pretrained_model_name_or_path} and randomly initialized if"
+                        f" {decoder_pretrained_model_name_or_path}'s architecture allows for cross attention layers."
+                    )
+                    decoder_config.is_decoder = True
+                    decoder_config.add_cross_attention = True
+
+                kwargs_decoder["config"] = decoder_config
+
+            # a caller-supplied `decoder_config` is not modified, only reported
+            if kwargs_decoder["config"].is_decoder is False or kwargs_decoder["config"].add_cross_attention is False:
+                logger.warning(
+                    f"Decoder model {decoder_pretrained_model_name_or_path} is not initialized as a decoder. "
+                    f"In order to initialize {decoder_pretrained_model_name_or_path} as a decoder, "
+                    "make sure that the attributes `is_decoder` and `add_cross_attention` of `decoder_config` "
+                    "passed to `.from_sub_models_pretrained(...)` are set to `True` or do not pass a "
+                    "`decoder_config` to `.from_sub_models_pretrained(...)`"
+                )
+
+            # NOTE(review): unlike both encoders, `*model_args` is not forwarded to the decoder - confirm intent
+            decoder = ParlerTTSForCausalLM.from_pretrained(decoder_pretrained_model_name_or_path, **kwargs_decoder)
+
+        # instantiate config with corresponding kwargs
+        config = ParlerTTSConfig.from_sub_models_config(
+            text_encoder.config, audio_encoder.config, decoder.config, **kwargs
+        )
+        return cls(text_encoder=text_encoder, audio_encoder=audio_encoder, decoder=decoder, config=config)
+
+    @add_start_docstrings_to_model_forward(MUSICGEN_INPUTS_DOCSTRING)
+    @replace_return_docstrings(output_type=ParlerTTSSeq2SeqLMOutput, config_class=_CONFIG_FOR_DOC)
+    def forward(
+        self,
+        input_ids: Optional[torch.LongTensor] = None,
+        attention_mask: Optional[torch.BoolTensor] = None,
+        input_values: Optional[torch.FloatTensor] = None,
+        padding_mask: Optional[torch.BoolTensor] = None,
+        decoder_input_ids: Optional[torch.LongTensor] = None,
+        decoder_attention_mask: Optional[torch.BoolTensor] = None,
+        encoder_outputs: Optional[Tuple[torch.FloatTensor]] = None,
+        past_key_values: Optional[Union[EncoderDecoderCache, Tuple[torch.FloatTensor]]] = None,
+        inputs_embeds: Optional[torch.FloatTensor] = None,
+        decoder_inputs_embeds: Optional[torch.FloatTensor] = None,
+        prompt_input_ids: Optional[torch.FloatTensor] = None,
+        prompt_attention_mask: Optional[torch.LongTensor] = None,
+        prompt_hidden_states: Optional[torch.FloatTensor] = None,
+        decoder_position_ids: Optional[torch.LongTensor] = None,
+        labels: Optional[torch.LongTensor] = None,
+        use_cache: Optional[bool] = None,
+        output_attentions: Optional[bool] = None,
+        output_hidden_states: Optional[bool] = None,
+        return_dict: Optional[bool] = None,
+        cache_position: Optional[torch.LongTensor] = None,
+        negative_input_ids: Optional[torch.LongTensor] = None, # Only added to pass model_kwargs validation
+        negative_attention_mask: Optional[torch.BoolTensor] = None, # Only added to pass model_kwargs validation
+        loss_reduction: str = "mean",
+        **kwargs,
+    ) -> Union[Tuple, ParlerTTSSeq2SeqLMOutput]:
+        r"""
+        Returns:
+
+        Examples:
+        ```python
+        >>> from transformers import AutoProcessor, ParlerTTSForConditionalGeneration
+        >>> import torch
+
+        >>> processor = AutoProcessor.from_pretrained("parler-tts/parler-tts-mini-v1")
+        >>> model = ParlerTTSForConditionalGeneration.from_pretrained("parler-tts/parler-tts-mini-v1")
+
+        >>> inputs = processor(
+        ...     text=["80s pop track with bassy drums and synth", "90s rock song with loud guitars and heavy drums"],
+        ...     padding=True,
+        ...     return_tensors="pt",
+        ... )
+
+        >>> pad_token_id = model.generation_config.pad_token_id
+        >>> decoder_input_ids = (
+        ...     torch.ones((inputs.input_ids.shape[0] * model.decoder.num_codebooks, 1), dtype=torch.long)
+        ...     * pad_token_id
+        ... )
+
+        >>> logits = model(**inputs, decoder_input_ids=decoder_input_ids).logits
+        >>> logits.shape  # (bsz * num_codebooks, tgt_len, vocab_size)
+        torch.Size([8, 1, 2048])
+        ```"""
+        # Overview of the steps below:
+        #   1. split the prefixed extra kwargs between text encoder, audio encoder and decoder;
+        #   2. embed the prompt ids when no prompt hidden states were supplied;
+        #   3. run the text encoder unless `encoder_outputs` was supplied, then optionally project, mask and
+        #      extend its hidden states with the prompt states;
+        #   4. derive `decoder_input_ids` from `labels` or from the audio encoder codes when none were given;
+        #   5. run the decoder and package the outputs.
+        # `negative_input_ids` and `negative_attention_mask` are accepted but not read in this method.
+        return_dict = return_dict if return_dict is not None else self.config.use_return_dict
+
+        # Strip the routing prefix from each extra keyword argument. The slice (note the trailing `:`) keeps the
+        # whole remainder of the name; indexing without it would keep a single character only.
+        kwargs_text_encoder = {
+            argument[len("text_encoder_") :]: value
+            for argument, value in kwargs.items()
+            if argument.startswith("text_encoder_")
+        }
+
+        kwargs_audio_encoder = {
+            argument[len("audio_encoder_") :]: value
+            for argument, value in kwargs.items()
+            if argument.startswith("audio_encoder_")
+        }
+
+        kwargs_decoder = {
+            argument[len("decoder_") :]: value for argument, value in kwargs.items() if argument.startswith("decoder_")
+        }
+
+        # Explicit `prompt_hidden_states` take precedence over `prompt_input_ids`.
+        if prompt_hidden_states is None:
+            if prompt_input_ids is not None:
+                prompt_hidden_states = self.embed_prompts(prompt_input_ids)
+
+        if encoder_outputs is None:
+            encoder_outputs = self.text_encoder(
+                input_ids=input_ids,
+                attention_mask=attention_mask,
+                inputs_embeds=inputs_embeds,
+                output_attentions=output_attentions,
+                output_hidden_states=output_hidden_states,
+                return_dict=return_dict,
+                **kwargs_text_encoder,
+            )
+            encoder_hidden_states = encoder_outputs[0]
+
+            # optionally project encoder_hidden_states
+            if (
+                self.text_encoder.config.hidden_size != self.decoder.config.hidden_size
+                and self.decoder.config.cross_attention_hidden_size is None
+            ):
+                encoder_hidden_states = self.enc_to_dec_proj(encoder_hidden_states)
+
+            # zero out the hidden states at masked positions
+            if attention_mask is not None:
+                encoder_hidden_states = encoder_hidden_states * attention_mask[..., None]
+
+            if prompt_hidden_states is not None and self.prompt_cross_attention:
+                # add sinusoidal positional embedding
+                positions = self.embed_positions(prompt_hidden_states, 0)
+                prompt_hidden_states = prompt_hidden_states + positions.to(prompt_hidden_states.device)
+
+                # if only one of the two masks is given, create an all-ones mask for the other so that they
+                # can be concatenated below
+                if prompt_attention_mask is not None and attention_mask is None:
+                    attention_mask = torch.ones(
+                        encoder_hidden_states.shape[:2], device=self.device, dtype=prompt_attention_mask.dtype
+                    )
+                elif attention_mask is not None and prompt_attention_mask is None:
+                    prompt_attention_mask = torch.ones(
+                        prompt_hidden_states.shape[:2], device=self.device, dtype=attention_mask.dtype
+                    )
+
+                # concatenate text description states with prompt description states
+                encoder_hidden_states = torch.cat([encoder_hidden_states, prompt_hidden_states], dim=1)
+                if prompt_attention_mask is not None:
+                    attention_mask = torch.cat([attention_mask, prompt_attention_mask], dim=1)
+
+                # the prompt now lives in the encoder states, so it must not be handed to the decoder again
+                prompt_hidden_states = None
+                prompt_attention_mask = None
+
+            # NOTE(review): item assignment presumably requires `encoder_outputs` to be a `ModelOutput`
+            # (i.e. `return_dict=True`); a plain tuple would not support it - confirm.
+            encoder_outputs["last_hidden_state"] = encoder_hidden_states
+
+        elif isinstance(encoder_outputs, tuple):
+            encoder_outputs = BaseModelOutput(*encoder_outputs)
+
+        encoder_hidden_states = encoder_outputs.last_hidden_state
+
+        if (labels is not None) and (decoder_input_ids is None and decoder_inputs_embeds is None):
+            # training path: build the decoder inputs by shifting the labels
+            decoder_input_ids = shift_tokens_right(
+                labels, self.config.pad_token_id, self.config.decoder_start_token_id
+            ).transpose(1, 2)
+
+        elif decoder_input_ids is None and decoder_inputs_embeds is None:
+            # no decoder inputs at all: encode `input_values` with the audio encoder and use its codes
+            audio_encoder_outputs = self.audio_encoder(
+                input_values=input_values,
+                padding_mask=padding_mask,
+                **kwargs_audio_encoder,
+            )
+            audio_codes = audio_encoder_outputs.audio_codes
+            frames, bsz, codebooks, seq_len = audio_codes.shape
+            if frames != 1:
+                raise ValueError(
+                    f"Expected 1 frame in the audio code outputs, got {frames} frames. Ensure chunking is "
+                    "disabled by setting `chunk_length=None` in the audio encoder."
+                )
+
+            if self.config.decoder.audio_channels == 2 and audio_codes.shape[2] == self.decoder.num_codebooks // 2:
+                # mono input through encodec that we convert to stereo
+                audio_codes = audio_codes.repeat_interleave(2, dim=2)
+
+            # fold the codebook axis into the batch axis
+            decoder_input_ids = audio_codes[0, ...].reshape(bsz * self.decoder.num_codebooks, seq_len)
+
+        # Decode
+        decoder_outputs = self.decoder(
+            input_ids=decoder_input_ids,
+            attention_mask=decoder_attention_mask,
+            position_ids=decoder_position_ids,
+            encoder_hidden_states=encoder_hidden_states,
+            encoder_attention_mask=attention_mask,
+            prompt_hidden_states=prompt_hidden_states,
+            prompt_attention_mask=prompt_attention_mask,
+            inputs_embeds=decoder_inputs_embeds,
+            output_attentions=output_attentions,
+            output_hidden_states=output_hidden_states,
+            use_cache=use_cache,
+            past_key_values=past_key_values,
+            return_dict=return_dict,
+            labels=labels,
+            cache_position=cache_position,
+            loss_reduction=loss_reduction,
+            **kwargs_decoder,
+        )
+
+        if not return_dict:
+            return decoder_outputs + (encoder_hidden_states,)
+
+        return ParlerTTSSeq2SeqLMOutput(
+            loss=decoder_outputs.loss,
+            logits=decoder_outputs.logits,
+            past_key_values=decoder_outputs.past_key_values,
+            decoder_hidden_states=decoder_outputs.hidden_states,
+            decoder_attentions=decoder_outputs.attentions,
+            cross_attentions=decoder_outputs.cross_attentions,
+            encoder_last_hidden_state=encoder_outputs.last_hidden_state,
+            encoder_hidden_states=encoder_outputs.hidden_states,
+            encoder_attentions=encoder_outputs.attentions,
+            per_codebook_losses=decoder_outputs.per_codebook_losses,
+        )
+
+    def prepare_inputs_for_generation(
+        self,
+        decoder_input_ids,
+        past_key_values=None,
+        attention_mask=None,
+        head_mask=None,
+        decoder_attention_mask=None,
+        decoder_head_mask=None,
+        prompt_hidden_states=None,
+        prompt_attention_mask=None,
+        cross_attn_head_mask=None,
+        use_cache=None,
+        encoder_outputs=None,
+        decoder_delay_pattern_mask=None,
+        guidance_scale=None,
+        cache_position=None,
+        inputs_embeds=None,
+        **kwargs,
+    ):
+        """
+        Build the dictionary of model inputs for a single generation step.
+
+        The decoder ids are passed through the delay pattern mask, trimmed to the tokens that are not yet
+        covered by `past_key_values`, duplicated along the batch axis when classifier free guidance is active
+        (`guidance_scale > 1`), and paired with a matching `cache_position`. When a `prompt_attention_mask` is
+        given without a `decoder_attention_mask`, an all-ones decoder mask is created.
+
+        Returns:
+            `dict` with the keys `input_ids` (always `None`), `encoder_outputs`, `past_key_values`,
+            `decoder_input_ids`, `attention_mask`, `decoder_attention_mask`, `head_mask`, `decoder_head_mask`,
+            `cross_attn_head_mask`, `prompt_hidden_states`, `prompt_attention_mask`, `use_cache`,
+            `cache_position` and `inputs_embeds`.
+        """
+        # build the delay pattern mask on demand
+        if decoder_delay_pattern_mask is None:
+            decoder_input_ids, decoder_delay_pattern_mask = self.decoder.build_delay_pattern_mask(
+                decoder_input_ids,
+                bos_token_id=self.generation_config.bos_token_id,
+                pad_token_id=self.generation_config.pad_token_id,
+                max_length=self.generation_config.max_length,
+            )
+
+        # apply the delay pattern mask
+        decoder_input_ids = self.decoder.apply_delay_pattern_mask(decoder_input_ids, decoder_delay_pattern_mask)
+
+        past_length = 0
+        if past_key_values is not None:
+            if not isinstance(past_key_values, EncoderDecoderCache):
+                # legacy cache format
+                past_length = past_key_values[0][0].shape[2]
+                # we only want to use prompt signal in the 1st generation step
+                prompt_hidden_states = None
+            else:
+                if cache_position is not None:
+                    past_length = cache_position[0]
+                else:
+                    past_length = past_key_values.get_seq_length()
+                if past_key_values.get_seq_length() > 0:
+                    # we only want to use prompt signal in the 1st generation step
+                    prompt_hidden_states = None
+
+            # Some generation methods already pass only the last input ID; otherwise fall back to keeping
+            # only the final ID.
+            num_ids = decoder_input_ids.shape[1]
+            remove_prefix_length = past_length if num_ids > past_length else num_ids - 1
+            decoder_input_ids = decoder_input_ids[:, remove_prefix_length:]
+
+        if guidance_scale is not None and guidance_scale > 1:
+            # classifier free guidance: replicate the decoder args across the batch dim (they are split again
+            # before sampling)
+            decoder_input_ids = decoder_input_ids.repeat((2, 1))
+            if decoder_attention_mask is not None:
+                decoder_attention_mask = decoder_attention_mask.repeat((2, 1))
+            if prompt_hidden_states is not None:
+                prompt_hidden_states = prompt_hidden_states.repeat((2, 1, 1))
+
+        if cache_position is None:
+            cache_position = torch.arange(
+                past_length, past_length + decoder_input_ids.shape[1], device=decoder_input_ids.device
+            )
+        elif use_cache:
+            cur_len = decoder_input_ids.shape[1]
+            if prompt_hidden_states is not None and not self.prompt_cross_attention:
+                # 1st generation step: the prompt hidden states will be prepended
+                cur_len += prompt_hidden_states.shape[1]
+
+            cache_position = cache_position[-cur_len:]
+
+        if decoder_attention_mask is None and prompt_attention_mask is not None:
+            per_codebook_ids = decoder_input_ids.reshape(
+                -1, self.decoder.num_codebooks, decoder_input_ids.shape[-1]
+            )
+            bsz, _, seq_len = per_codebook_ids.shape
+
+            past_key_values_length = 0
+            if cache_position is not None:
+                past_key_values_length = cache_position[0]
+            elif past_key_values is not None:
+                past_key_values_length = past_key_values.get_seq_length()
+
+            logger.warning_once(
+                "`prompt_attention_mask` is specified but `attention_mask` is not. A full `attention_mask` will be created. Make sure this is the intended behaviour."
+            )
+            is_first_step = past_key_values is None or (
+                isinstance(past_key_values, EncoderDecoderCache) and past_key_values.get_seq_length() == 0
+            )
+            if is_first_step:
+                decoder_attention_mask = torch.ones(
+                    (bsz, seq_len), device=self.device, dtype=decoder_input_ids.dtype
+                )
+            else:
+                # `prompt_attention_mask` is known to be set in this branch. The mask is recreated from
+                # scratch so that the prompt attention mask can be prepended to it. Since we generate token
+                # per token, the generated length can be recomputed from the information we have.
+                generated_length = past_key_values_length - prompt_attention_mask.shape[1] + 1
+                decoder_attention_mask = torch.ones(
+                    (bsz, generated_length), device=self.device, dtype=prompt_attention_mask.dtype
+                )
+
+        model_inputs = {
+            "input_ids": None,  # encoder_outputs is defined. input_ids not needed
+            "encoder_outputs": encoder_outputs,
+            "past_key_values": past_key_values,
+            "decoder_input_ids": decoder_input_ids.contiguous(),
+            "attention_mask": attention_mask,
+            "decoder_attention_mask": decoder_attention_mask,
+            "head_mask": head_mask,
+            "decoder_head_mask": decoder_head_mask,
+            "cross_attn_head_mask": cross_attn_head_mask,
+            "prompt_hidden_states": prompt_hidden_states,
+            "prompt_attention_mask": prompt_attention_mask,
+            "use_cache": use_cache,
+            "cache_position": cache_position,
+            "inputs_embeds": inputs_embeds,
+        }
+        return model_inputs
+
+    def _prepare_decoder_input_ids_for_generation(
+        self,
+        batch_size: int,
+        model_input_name: str,
+        model_kwargs: Dict[str, torch.Tensor],
+        decoder_start_token_id: int = None,
+        bos_token_id: int = None,
+        device: torch.device = None,
+    ) -> Tuple[torch.LongTensor, Dict[str, torch.Tensor]]:
+        """
+        Build the initial `decoder_input_ids` for generation.
+
+        User-supplied ids are popped from `model_kwargs` (under `decoder_input_ids`, or under `input_ids` when
+        that is not the encoder's main input) and get a start-token column prepended if none of the rows
+        begins with the start token; `decoder_attention_mask` is extended accordingly. Without user ids, a
+        single start-token column with `batch_size * num_codebooks` rows is used. When the prompt is not fed
+        through cross attention, `model_kwargs["inputs_embeds"]` is set to the prompt hidden states followed
+        by the summed codebook embeddings of the decoder ids.
+
+        Returns:
+            The tuple `(decoder_input_ids, model_kwargs)`.
+        """
+        # 1. Pick up ids supplied by the caller, if any.
+        user_ids = None
+        if model_kwargs is not None and "decoder_input_ids" in model_kwargs:
+            user_ids = model_kwargs.pop("decoder_input_ids")
+        elif "input_ids" in model_kwargs and model_input_name != "input_ids":
+            user_ids = model_kwargs.pop("input_ids")
+
+        # 2. One start token per (batch item, codebook) row.
+        decoder_start_token_id = self._get_decoder_start_token_id(decoder_start_token_id, bos_token_id)
+        if device is None:
+            device = self.device
+        start_column = torch.ones((batch_size * self.decoder.num_codebooks, 1), dtype=torch.long, device=device)
+        start_column = start_column * decoder_start_token_id
+
+        if user_ids is None:
+            # no user input -> use decoder_start_token_id as decoder_input_ids
+            decoder_input_ids = start_column
+        elif (user_ids[..., 0] != decoder_start_token_id).all().item():
+            # user input that doesn't start with decoder_start_token_id -> prepend it, and grow the decoder
+            # attention mask (if provided) by one leading column of ones
+            decoder_input_ids = torch.cat([start_column, user_ids], dim=-1)
+            if "decoder_attention_mask" in model_kwargs:
+                old_mask = model_kwargs["decoder_attention_mask"]
+                leading_ones = torch.ones_like(old_mask)[:, :1]
+                model_kwargs["decoder_attention_mask"] = torch.cat((leading_ones, old_mask), dim=-1)
+        else:
+            decoder_input_ids = user_ids
+
+        if not self.prompt_cross_attention:
+            prompt_hidden_states = model_kwargs["prompt_hidden_states"]
+            num_codebooks = self.decoder.num_codebooks
+            codebook_ids = decoder_input_ids.reshape(-1, num_codebooks, decoder_input_ids.shape[-1])
+            # sum the per-codebook token embeddings
+            inputs_embeds = 0
+            for codebook in range(num_codebooks):
+                inputs_embeds = inputs_embeds + self.decoder.model.decoder.embed_tokens[codebook](
+                    codebook_ids[:, codebook]
+                )
+            # the prompt states come first along the sequence axis
+            model_kwargs["inputs_embeds"] = torch.cat([prompt_hidden_states, inputs_embeds], dim=1)
+
+        return decoder_input_ids, model_kwargs
+
+    def _prepare_text_encoder_kwargs_for_generation(
+        self,
+        inputs_tensor: torch.Tensor,
+        negative_prompt_already_exists: bool,
+        model_kwargs,
+        model_input_name: Optional[str],
+        generation_config: GenerationConfig,
+    ) -> Dict[str, Any]:
+        """
+        Run the text encoder once and store its result in `model_kwargs["encoder_outputs"]`.
+
+        Args:
+            inputs_tensor: main input of the text encoder, passed under `model_input_name`.
+            negative_prompt_already_exists: when `True`, no 'null' input is appended for classifier free
+                guidance.
+            model_kwargs: generation keyword arguments; updated in place and returned. An `attention_mask`
+                entry is optional.
+            model_input_name: keyword under which `inputs_tensor` is passed; defaults to the text encoder's
+                `main_input_name`.
+            generation_config: supplies `output_attentions`, `output_hidden_states` and `guidance_scale`.
+
+        Returns:
+            `model_kwargs`, with `encoder_outputs` set to a `BaseModelOutput` holding the (optionally
+            projected and masked) last hidden state and, under classifier free guidance, an `attention_mask`
+            extended with zeros for the 'null' input.
+        """
+        # 1. get text encoder
+        encoder = self.get_text_encoder()
+        # Compatibility with Accelerate big model inference: we need the encoder to outputs stuff on the same device
+        # as the inputs.
+        if hasattr(encoder, "_hf_hook"):
+            encoder._hf_hook.io_same_device = True
+
+        # 2. Prepare encoder args and encoder kwargs from model kwargs.
+        irrelevant_prefix = ["decoder_", "cross_attn", "prompt_", "use_cache", "labels"]
+        encoder_kwargs = {
+            argument: value
+            for argument, value in model_kwargs.items()
+            if not any(argument.startswith(p) for p in irrelevant_prefix)
+        }
+        encoder_signature = set(inspect.signature(encoder.forward).parameters)
+        encoder_accepts_wildcard = "kwargs" in encoder_signature or "model_kwargs" in encoder_signature
+        if not encoder_accepts_wildcard:
+            # drop everything the encoder's `forward` does not name explicitly
+            encoder_kwargs = {
+                argument: value for argument, value in encoder_kwargs.items() if argument in encoder_signature
+            }
+        encoder_kwargs["output_attentions"] = generation_config.output_attentions
+        encoder_kwargs["output_hidden_states"] = generation_config.output_hidden_states
+        guidance_scale = generation_config.guidance_scale
+
+        # 3. make sure that encoder returns `ModelOutput`
+        model_input_name = model_input_name if model_input_name is not None else self.text_encoder.main_input_name
+        encoder_kwargs["return_dict"] = True
+        encoder_kwargs[model_input_name] = inputs_tensor
+        last_hidden_state = encoder(**encoder_kwargs).last_hidden_state
+
+        # we optionally project last_hidden_state to avoid recomputing every time
+        if (
+            self.text_encoder.config.hidden_size != self.decoder.config.hidden_size
+            and self.decoder.config.cross_attention_hidden_size is None
+        ):
+            last_hidden_state = self.enc_to_dec_proj(last_hidden_state)
+
+        # The attention mask is optional: use `.get` so that a missing key is treated like `None` instead of
+        # raising a `KeyError`.
+        attention_mask = model_kwargs.get("attention_mask")
+
+        # for classifier free guidance we need to add a 'null' input to our encoder hidden states if a negative prompt is not already present
+        if guidance_scale is not None and guidance_scale > 1 and not negative_prompt_already_exists:
+            last_hidden_state = torch.concatenate([last_hidden_state, torch.zeros_like(last_hidden_state)], dim=0)
+            if attention_mask is not None:
+                attention_mask = torch.concatenate([attention_mask, torch.zeros_like(attention_mask)], dim=0)
+                model_kwargs["attention_mask"] = attention_mask
+
+        # zero out the hidden states at masked positions
+        if attention_mask is not None:
+            last_hidden_state = last_hidden_state * attention_mask[..., None]
+
+        model_kwargs["encoder_outputs"] = BaseModelOutput(last_hidden_state=last_hidden_state)
+
+        return model_kwargs
+
+    def _prepare_prompt_kwargs_for_generation(self, prompt_input_ids, model_kwargs, generation_config):
+        """
+        Embed the prompt ids and register the result in `model_kwargs`.
+
+        Without prompt cross attention the embeddings are stored under `prompt_hidden_states` and the prompt
+        attention mask is left untouched, since it is prepended to the decoder attention mask later on. With
+        prompt cross attention the embeddings receive positional embeddings, are appended to the encoder
+        hidden states (the attention masks are merged the same way), and the `prompt_*` entries are set to
+        `None`. `generation_config` is not read.
+
+        Returns:
+            The updated `model_kwargs`.
+        """
+        prompt_hidden_states = self.embed_prompts(prompt_input_ids)
+
+        if not self.prompt_cross_attention:
+            model_kwargs["prompt_hidden_states"] = prompt_hidden_states
+            return model_kwargs
+
+        # add sinusoidal positional embedding
+        positions = self.embed_positions(prompt_hidden_states, 0)
+        prompt_hidden_states = prompt_hidden_states + positions.to(prompt_hidden_states.device)
+
+        attention_mask = model_kwargs.get("attention_mask", None)
+        prompt_attention_mask = model_kwargs.get("prompt_attention_mask", None)
+        encoder_hidden_states = model_kwargs["encoder_outputs"].last_hidden_state
+
+        # when exactly one mask is missing, substitute an all-ones mask of the other one's dtype
+        if attention_mask is None and prompt_attention_mask is not None:
+            attention_mask = torch.ones(
+                encoder_hidden_states.shape[:2], device=self.device, dtype=prompt_attention_mask.dtype
+            )
+        elif prompt_attention_mask is None and attention_mask is not None:
+            prompt_attention_mask = torch.ones(
+                prompt_hidden_states.shape[:2], device=self.device, dtype=attention_mask.dtype
+            )
+
+        # text description states first, prompt states second
+        merged_hidden_states = torch.cat([encoder_hidden_states, prompt_hidden_states], dim=1)
+        if prompt_attention_mask is not None:
+            attention_mask = torch.cat([attention_mask, prompt_attention_mask], dim=1)
+
+        model_kwargs["encoder_outputs"].last_hidden_state = merged_hidden_states
+        model_kwargs["attention_mask"] = attention_mask
+
+        # everything prompt related has been folded into the encoder side, so clear the dedicated entries
+        model_kwargs["prompt_hidden_states"] = None
+        model_kwargs["prompt_attention_mask"] = None
+        return model_kwargs
+
+    def _prepare_audio_encoder_kwargs_for_generation(
+        self, input_values, model_kwargs, model_input_name: Optional[str] = None
+    ):
+        """
+        Encode `input_values` with the audio encoder and use the resulting codes as `decoder_input_ids`.
+
+        The encoder receives the entries of `model_kwargs` that are not decoder related and that its
+        `forward` signature accepts, plus the number of codebooks under whichever of `num_quantizers`,
+        `num_codebooks` or `n_quantizers` that signature exposes. The codes are reshaped so that the codebook
+        axis is folded into the batch axis. `audio_scales` is copied to `model_kwargs` when the encoder
+        returns it.
+
+        Raises:
+            ValueError: if the audio codes are 4-dimensional with more than one frame.
+
+        Returns:
+            The updated `model_kwargs`.
+        """
+        # 1. get audio encoder
+        encoder = self.get_audio_encoder()
+        # Compatibility with Accelerate big model inference: the encoder must output on the same device as
+        # its inputs.
+        if hasattr(encoder, "_hf_hook"):
+            encoder._hf_hook.io_same_device = True
+
+        # 2. Prepare encoder args and encoder kwargs from model kwargs.
+        irrelevant_prefix = ["decoder_", "cross_attn", "use_cache"]
+        encoder_signature = set(inspect.signature(encoder.forward).parameters)
+        encoder_accepts_wildcard = "kwargs" in encoder_signature or "model_kwargs" in encoder_signature
+        encoder_kwargs = {}
+        for argument, value in model_kwargs.items():
+            if any(argument.startswith(p) for p in irrelevant_prefix):
+                continue
+            if encoder_accepts_wildcard or argument in encoder_signature:
+                encoder_kwargs[argument] = value
+
+        # 3. make sure that encoder returns `ModelOutput`
+        if model_input_name is None:
+            model_input_name = self.audio_encoder.main_input_name
+        encoder_kwargs["return_dict"] = True
+
+        # pass the number of codebooks under the first matching parameter name
+        for quantizer_argument in ("num_quantizers", "num_codebooks", "n_quantizers"):
+            if quantizer_argument in encoder_signature:
+                encoder_kwargs[quantizer_argument] = self.config.decoder.num_codebooks
+                break
+
+        encoder_kwargs[model_input_name] = input_values
+        audio_encoder_outputs = encoder.encode(**encoder_kwargs)
+        audio_codes = audio_encoder_outputs.audio_codes
+        audio_scales = audio_encoder_outputs.get("audio_scales")
+
+        if audio_codes.ndim == 3:
+            bsz, codebooks, seq_len = audio_codes.shape
+        else:
+            frames, bsz, codebooks, seq_len = audio_codes.shape
+
+            if frames != 1:
+                raise ValueError(
+                    f"Expected 1 frame in the audio code outputs, got {frames} frames. Ensure chunking is "
+                    "disabled by setting `chunk_length=None` in the audio encoder."
+                )
+
+            # drop the singleton frame axis
+            audio_codes = audio_codes[0, ...]
+
+        model_kwargs["decoder_input_ids"] = audio_codes.reshape(bsz * self.decoder.num_codebooks, seq_len)
+        if audio_scales is not None:
+            model_kwargs["audio_scales"] = audio_scales
+
+        return model_kwargs
+
+    def prepare_decoder_input_ids_from_labels(self, labels: torch.Tensor):
+        """Shift `labels` right with the decoder's pad/bos token ids and swap the last two axes."""
+        decoder_config = self.config.decoder
+        shifted = shift_tokens_right(labels, decoder_config.pad_token_id, decoder_config.bos_token_id)
+        return shifted.transpose(1, 2)
+
+    def resize_token_embeddings(self, *args, **kwargs):
+        """Always raise: the embeddings have to be resized on the wrapped sub-models instead."""
+        message = (
+            "Resizing the embedding layers via the EncoderDecoderModel directly is not supported. Please use the"
+            " respective methods of the wrapped objects (model.encoder.resize_token_embeddings(...) or"
+            " model.decoder.resize_token_embeddings(...))"
+        )
+        raise NotImplementedError(message)
+
+    def _maybe_initialize_input_ids_for_generation(
+        self,
+        inputs: Optional[torch.Tensor] = None,
+        bos_token_id: Optional[int] = None,
+        model_kwargs: Optional[Dict[str, torch.Tensor]] = None,
+    ) -> torch.LongTensor:
+        """
+        Return `inputs` when given, otherwise create placeholder input ids.
+
+        With `encoder_outputs` in `model_kwargs`, the placeholder has the shape of the first encoder output
+        minus its last axis and is filled with -100. Otherwise it is a single column of `bos_token_id` whose
+        batch size is taken from the first tensor found in `model_kwargs` (1 if there is none).
+
+        Raises:
+            ValueError: if neither `inputs`, `encoder_outputs` nor `bos_token_id` is available.
+        """
+        if inputs is not None:
+            return inputs
+
+        encoder_outputs = model_kwargs.get("encoder_outputs")
+        if encoder_outputs is not None:
+            # dummy input_ids with value -100, as a sanity check ensuring that they won't be used for encoding
+            dummy_shape = encoder_outputs[0].size()[:-1]
+            return torch.full(dummy_shape, -100, dtype=torch.long, device=self.device)
+
+        if bos_token_id is None:
+            raise ValueError("`bos_token_id` has to be defined when no `input_ids` are provided.")
+
+        # If there is some tensor in `model_kwargs`, the batch size can be inferred from it. This is helpful
+        # with soft-prompting or in multimodal implementations built on top of decoder-only language models.
+        batch_size = next(
+            (value.shape[0] for value in model_kwargs.values() if isinstance(value, torch.Tensor)),
+            1,
+        )
+        bos_column = torch.ones((batch_size, 1), dtype=torch.long, device=self.device)
+        return bos_column * bos_token_id
+
+    def _get_decoder_start_token_id(
+        self, decoder_start_token_id: Union[int, List[int]] = None, bos_token_id: int = None
+    ) -> int:
+        """
+        Resolve the token that starts the decoder sequence.
+
+        Each argument falls back to the value of the same name in `self.generation_config` when it is `None`.
+        The start token id wins over the bos token id.
+
+        Raises:
+            ValueError: if both ids are still `None` after the fallback.
+        """
+        start_id = decoder_start_token_id
+        if start_id is None:
+            start_id = self.generation_config.decoder_start_token_id
+
+        bos_id = bos_token_id
+        if bos_id is None:
+            bos_id = self.generation_config.bos_token_id
+
+        if start_id is not None:
+            return start_id
+        if bos_id is not None:
+            return bos_id
+        raise ValueError(
+            "`decoder_start_token_id` or `bos_token_id` has to be defined for encoder-decoder generation."
+        )
+
+    def _get_cache(self, cache_implementation: str, max_batch_size: int, max_cache_len: int, model_kwargs) -> Cache:
+        """
+        Return the cache used by `generate`, which is kept on `self._cache` across calls.
+
+        The stored cache is reset and reused unless it is missing, of a different class, built for another
+        batch size, too short for `max_cache_len`, or (when a cross attention cache is required) sized for a
+        different encoder sequence length. In those cases a new cache is created and stored.
+
+        Returns the resulting cache object.
+        """
+        cache_cls: Cache = NEED_SETUP_CACHE_CLASSES_MAPPING[cache_implementation]
+        requires_cross_attention_cache = (
+            self.config.is_encoder_decoder or model_kwargs.get("encoder_outputs") is not None
+        )
+        has_cache = hasattr(self, "_cache")
+
+        if has_cache:
+            # with a cross attention cache, the properties to compare live on the self attention part
+            if requires_cross_attention_cache:
+                cache_to_check = self._cache.self_attention_cache
+            else:
+                cache_to_check = self._cache
+
+        if cache_implementation == "sliding_window":
+            max_cache_len = min(self.config.sliding_window, max_cache_len)
+
+        if not has_cache:
+            need_new_cache = True
+        else:
+            need_new_cache = (
+                (not isinstance(cache_to_check, cache_cls))
+                or cache_to_check.max_batch_size != max_batch_size
+                or cache_to_check.max_cache_len < max_cache_len
+            )
+            if requires_cross_attention_cache and not need_new_cache:
+                # the cross attention cache must match the current encoder sequence length exactly
+                need_new_cache = (
+                    self._cache.cross_attention_cache.max_cache_len != model_kwargs["encoder_outputs"][0].shape[1]
+                )
+
+        if not need_new_cache:
+            self._cache.reset()
+            return self._cache
+
+        if hasattr(self.config, "_pre_quantization_dtype"):
+            cache_dtype = self.config._pre_quantization_dtype
+        else:
+            cache_dtype = self.dtype
+        cache_kwargs = {
+            "config": self.config.decoder,
+            "max_batch_size": max_batch_size,
+            "max_cache_len": max_cache_len,
+            "device": self.device,
+            "dtype": cache_dtype,
+        }
+        self._cache = cache_cls(**cache_kwargs)
+
+        if requires_cross_attention_cache:
+            encoder_len = model_kwargs["encoder_outputs"][0].shape[1]
+            # the cross attention cache uses its own number of key/value heads
+            config_cross_attention_cache = copy.deepcopy(self.config.decoder)
+            config_cross_attention_cache.update(
+                {"num_key_value_heads": self.config.decoder.num_cross_attention_key_value_heads}
+            )
+            encoder_kwargs = {
+                **cache_kwargs,
+                "max_cache_len": encoder_len,
+                "config": config_cross_attention_cache,
+            }
+            self._cache = EncoderDecoderCache(self._cache, cache_cls(**encoder_kwargs))
+
+        return self._cache
+
+    def freeze_encoders(self, freeze_text_encoder=True):
+        """
+        Disable gradients for the audio encoder and, if `freeze_text_encoder` is set, for the text encoder.
+
+        Each frozen encoder also gets its `_requires_grad` attribute set to `False`.
+        """
+        # the text encoder, when requested, is handled before the audio encoder
+        encoders_to_freeze = [self.text_encoder] if freeze_text_encoder else []
+        encoders_to_freeze.append(self.audio_encoder)
+
+        for encoder in encoders_to_freeze:
+            for weight in encoder.parameters():
+                weight.requires_grad = False
+            encoder._requires_grad = False
+
+    @torch.no_grad()
+    def generate(
+        self,
+        inputs: Optional[torch.Tensor] = None,
+        generation_config: Optional[GenerationConfig] = None,
+        logits_processor: Optional[LogitsProcessorList] = None,
+        stopping_criteria: Optional[StoppingCriteriaList] = None,
+        synced_gpus: Optional[bool] = None,
+        streamer: Optional["BaseStreamer"] = None,
+        **kwargs,
+    ):
+        """
+
+        Generates audio codes autoregressively, then decodes them to audio values with the audio encoder.
+
+        <Tip warning={true}>
+
+        Most generation-controlling parameters are set in `generation_config` which, if not passed, will be set to the
+        model's default generation configuration. You can override any `generation_config` by passing the corresponding
+        parameters to generate(), e.g. `.generate(inputs, num_beams=4, do_sample=True)`.
+
+        For an overview of generation strategies and code examples, check out the [following
+        guide](./generation_strategies).
+
+        </Tip>
+
+        Parameters:
+            inputs (`torch.Tensor` of varying shape depending on the modality, *optional*):
+                The sequence used as a prompt for the generation or as model inputs to the encoder. If `None` the
+                method initializes it with `bos_token_id` and a batch size of 1. For decoder-only models `inputs`
+                should be in the format `input_ids`. For encoder-decoder models *inputs* can represent any of
+                `input_ids`, `input_values`, `input_features`, or `pixel_values`.
+            generation_config (`~generation.GenerationConfig`, *optional*):
+                The generation configuration to be used as base parametrization for the generation call. `**kwargs`
+                passed to generate matching the attributes of `generation_config` will override them. If
+                `generation_config` is not provided, the default will be used, which had the following loading
+                priority: 1) from the `generation_config.json` model file, if it exists; 2) from the model
+                configuration. Please note that unspecified parameters will inherit [`~generation.GenerationConfig`]'s
+                default values, whose documentation should be checked to parameterize generation.
+            logits_processor (`LogitsProcessorList`, *optional*):
+                Custom logits processors that complement the default logits processors built from arguments and
+                generation config. If a logit processor is passed that is already created with the arguments or a
+                generation config an error is thrown. This feature is intended for advanced users.
+            stopping_criteria (`StoppingCriteriaList`, *optional*):
+                Custom stopping criteria that complement the default stopping criteria built from arguments and a
+                generation config. If a stopping criteria is passed that is already created with the arguments or a
+                generation config an error is thrown. This feature is intended for advanced users.
+            synced_gpus (`bool`, *optional*, defaults to `False`):
+                Whether to continue running the while loop until max_length (needed for ZeRO stage 3)
+            streamer (`BaseStreamer`, *optional*):
+                Streamer object that will be used to stream the generated sequences. Generated tokens are passed
+                through `streamer.put(token_ids)` and the streamer is responsible for any further processing.
+            kwargs (`Dict[str, Any]`, *optional*):
+                Ad hoc parametrization of `generate_config` and/or additional model-specific kwargs that will be
+                forwarded to the `forward` function of the model. If the model is an encoder-decoder model, encoder
+                specific kwargs should not be prefixed and decoder specific kwargs should be prefixed with *decoder_*.
+
+        Return:
+            [`~utils.ModelOutput`] or `torch.Tensor`: the decoded audio values, or (if `return_dict_in_generate=True`) a
+            [`~utils.ModelOutput`] whose `sequences` holds them and whose `audios_length` holds the per-sample lengths.
+
+                If the model is *not* an encoder-decoder model (`model.config.is_encoder_decoder=False`), the possible
+                [`~utils.ModelOutput`] types are:
+
+                    - [`~generation.GenerateDecoderOnlyOutput`],
+                    - [`~generation.GenerateBeamDecoderOnlyOutput`]
+
+                If the model is an encoder-decoder model (`model.config.is_encoder_decoder=True`), the possible
+                [`~utils.ModelOutput`] types are:
+
+                    - [`~generation.GenerateEncoderDecoderOutput`],
+                    - [`~generation.GenerateBeamEncoderDecoderOutput`]
+        """
+        # 1. Handle `generation_config` and kwargs that might update it, and validate the resulting objects
+        if generation_config is None:
+            generation_config = self.generation_config
+
+        generation_config = copy.deepcopy(generation_config)
+        model_kwargs = generation_config.update(**kwargs)  # All unused kwargs must be model kwargs
+        generation_config.validate()
+        self._validate_model_kwargs(model_kwargs.copy())
+
+        if model_kwargs.get("encoder_outputs") is not None and type(model_kwargs["encoder_outputs"]) == tuple:
+            # wrap the unconditional outputs as a BaseModelOutput for compatibility with the rest of generate (exact-type check: tuple subclasses are left as-is)
+            model_kwargs["encoder_outputs"] = BaseModelOutput(last_hidden_state=model_kwargs["encoder_outputs"][0])
+
+        # 2. Set generation parameters if not already defined
+        requires_attention_mask = "encoder_outputs" not in model_kwargs
+        kwargs_has_attention_mask = model_kwargs.get("attention_mask", None) is not None
+
+        # 3. Define model inputs
+        inputs_tensor, model_input_name, model_kwargs = self._prepare_model_inputs(
+            inputs, generation_config.bos_token_id, model_kwargs
+        )
+        batch_size = inputs_tensor.shape[0]
+        self._prepare_special_tokens(generation_config, kwargs_has_attention_mask, device=inputs_tensor.device)
+
+        logits_processor = logits_processor if logits_processor is not None else LogitsProcessorList([ParlerTTSLogitsProcessor(generation_config.eos_token_id, self.decoder.num_codebooks, batch_size, inputs_tensor.device)])
+        stopping_criteria = stopping_criteria if stopping_criteria is not None else StoppingCriteriaList()
+
+        # 4. Define other model kwargs
+        model_kwargs["use_cache"] = generation_config.use_cache
+        model_kwargs["guidance_scale"] = generation_config.guidance_scale
+
+        # If negative_input_ids are provided, guidance should be enabled and "encoder_outputs" should not be there already (NOTE: enforced with `assert`, so skipped under `python -O`)
+        if model_kwargs.get("negative_input_ids") is not None:
+            assert "encoder_outputs" not in model_kwargs, "encoder_outputs should not be provided when negative_input_ids are provided since we will need to use the negative prompt in the encoder"
+            assert model_kwargs.get("attention_mask", None) is None and requires_attention_mask, "attention_mask must not be provided when negative_input_ids are provided since we will need to use the negative prompt in the encoder"
+            assert generation_config.guidance_scale is not None and generation_config.guidance_scale > 1, "Guidance scale must be greater than 1 for negative prompting"
+            assert inputs_tensor.shape[0] == model_kwargs["negative_input_ids"].shape[0], "Batch size of inputs and negative_input_ids must be the same"
+            # We must append the negative input ids to the input_tensor. Since their lengths are different, we must pad them both to the same length using generation_config._pad_token_tensor
+            negative_input_ids = model_kwargs.pop("negative_input_ids")
+            max_length = max(inputs_tensor.shape[1], negative_input_ids.shape[1])
+            inputs_tensor = torch.cat([inputs_tensor, torch.ones((batch_size, max_length - inputs_tensor.shape[1]), dtype=inputs_tensor.dtype, device=inputs_tensor.device) * generation_config._pad_token_tensor], dim=1)
+            negative_input_ids = torch.cat([negative_input_ids, torch.ones((batch_size, max_length - negative_input_ids.shape[1]), dtype=negative_input_ids.dtype, device=negative_input_ids.device) * generation_config._pad_token_tensor], dim=1)
+            inputs_tensor = torch.concatenate([inputs_tensor, negative_input_ids], dim=0)
+            negative_prompt_already_exists = True
+        else:
+            negative_prompt_already_exists = False
+
+        if model_kwargs.get("attention_mask", None) is None and requires_attention_mask:
+            model_kwargs["attention_mask"] = _old_prepare_attention_mask_for_generation(
+                inputs_tensor, generation_config._pad_token_tensor, generation_config._eos_token_tensor
+            )
+
+        if "encoder_outputs" not in model_kwargs:
+            # encoder_outputs are created and added to `model_kwargs`
+            model_kwargs = self._prepare_text_encoder_kwargs_for_generation(
+                inputs_tensor, negative_prompt_already_exists, model_kwargs, model_input_name, generation_config
+            )
+
+        if "prompt_hidden_states" not in model_kwargs and "prompt_input_ids" in model_kwargs:
+            # `prompt_hidden_states` are created and added to `model_kwargs`
+            model_kwargs = self._prepare_prompt_kwargs_for_generation(
+                model_kwargs["prompt_input_ids"],
+                model_kwargs, generation_config,
+            )
+
+        if "decoder_input_ids" not in model_kwargs and "input_values" in model_kwargs:
+            model_kwargs = self._prepare_audio_encoder_kwargs_for_generation(
+                model_kwargs["input_values"],
+                model_kwargs,
+            )
+
+        # 5. Prepare `input_ids` which will be used for auto-regressive generation
+        input_ids, model_kwargs = self._prepare_decoder_input_ids_for_generation(
+            batch_size=batch_size,
+            model_input_name=model_input_name,
+            model_kwargs=model_kwargs,
+            decoder_start_token_id=generation_config._decoder_start_token_tensor,
+            bos_token_id=generation_config._bos_token_tensor,
+            device=inputs_tensor.device,
+        )
+
+        # 6. Prepare `max_length` depending on other stopping criteria.
+        input_ids_length = input_ids.shape[-1]
+        has_default_max_length = kwargs.get("max_length") is None and generation_config.max_length is not None
+        has_default_min_length = kwargs.get("min_length") is None and generation_config.min_length is not None
+        generation_config = self._prepare_generated_length(
+            generation_config=generation_config,
+            has_default_max_length=has_default_max_length,
+            has_default_min_length=has_default_min_length,
+            model_input_name=model_input_name,
+            inputs_tensor=inputs_tensor,
+            input_ids_length=input_ids_length,
+        )
+
+        if generation_config.cache_implementation is not None and model_kwargs.get("past_key_values") is not None:
+            raise ValueError(
+                "Passing both `cache_implementation` (used to initialize certain caches) and `past_key_values` (a "
+                "Cache object) is unsupported. Please use only one of the two."
+            )
+        elif generation_config.cache_implementation is not None:
+            if generation_config.cache_implementation in NEED_SETUP_CACHE_CLASSES_MAPPING:
+                if generation_config.cache_implementation == "static" and not self._supports_static_cache:
+                    raise ValueError(
+                        "This model does not support `cache_implementation='static'`. Please check the following "
+                        "issue: https://github.com/huggingface/transformers/issues/28981"
+                    )
+                if not self.prompt_cross_attention:
+                    # when we prepend prompt_hidden_state to inputs_embeds, max_cache_len needs to be actualised
+                    # generation_config.max_length has already been increased by input_ids_length which is
+                    # already counted in input_embeds_seq_length so we remove it
+                    input_embeds_seq_length = model_kwargs["inputs_embeds"].shape[1]
+                    max_cache_len = generation_config.max_length + input_embeds_seq_length - input_ids_length
+                else:
+                    max_cache_len = self.generation_config.max_length  # NOTE(review): reads the model-level config rather than the per-call `generation_config` copy — confirm intended
+                model_kwargs["past_key_values"] = self._get_cache(
+                    generation_config.cache_implementation,
+                    getattr(generation_config, "num_beams", 1) * batch_size,
+                    max_cache_len,
+                    model_kwargs,
+                )
+            elif generation_config.cache_implementation == "quantized":
+                raise ValueError(
+                    "This model does not support the quantized cache. If you want your model to support quantized "
+                    "cache, please open an issue on the Parler-TTS repository https://github.com/huggingface/parler-tts"
+                )
+        # Use DynamicCache() instance by default. This will avoid back and forth from legacy format that
+        # keeps copying the cache thus using much more memory
+        elif generation_config.cache_implementation is None and self._supports_default_dynamic_cache():
+            past = model_kwargs.get("past_key_values", None)
+            requires_cross_attention_cache = (
+                self.config.is_encoder_decoder or model_kwargs.get("encoder_outputs") is not None
+            )
+            if past is None:
+                model_kwargs["past_key_values"] = (
+                    DynamicCache()
+                    if not requires_cross_attention_cache
+                    else EncoderDecoderCache(DynamicCache(), DynamicCache())
+                )
+            elif isinstance(past, tuple):
+                model_kwargs["past_key_values"] = (
+                    DynamicCache.from_legacy_cache(past)
+                    if not requires_cross_attention_cache
+                    else EncoderDecoderCache.from_legacy_cache(past)
+                )
+
+        # build the delay pattern mask for offsetting each codebook prediction by 1 (this behaviour is specific to Parler-TTS)
+        delayed_input_ids, decoder_delay_pattern_mask = self.decoder.build_delay_pattern_mask(
+            input_ids,
+            bos_token_id=generation_config._bos_token_tensor,
+            pad_token_id=generation_config._pad_token_tensor,
+            max_length=generation_config.max_length,
+        )
+        # stash the delay mask so that we don't have to recompute in each forward pass
+        model_kwargs["decoder_delay_pattern_mask"] = decoder_delay_pattern_mask
+
+        # input_ids are ready to be placed on the streamer (if used)
+        if streamer is not None:
+            streamer.put(delayed_input_ids.cpu())
+
+        # 7. determine generation mode
+        generation_mode = generation_config.get_generation_mode()
+
+        # 7.5 prepare batched CFG externally (to enable coexistence with the unbatched CFG)
+        if generation_config.guidance_scale is not None and generation_config.guidance_scale > 1:
+            cfg_logits_processor_item = ClassifierFreeGuidanceLogitsProcessor(generation_config.guidance_scale)
+            generation_config.guidance_scale = None  # presumably so `_get_logits_processor` does not add its own CFG processor — confirm
+        else:
+            cfg_logits_processor_item = None
+
+        # 8. prepare distribution pre_processing samplers
+        logits_processor = self._get_logits_processor(
+            generation_config=generation_config,
+            input_ids_seq_length=input_ids_length,
+            encoder_input_ids=inputs_tensor,
+            prefix_allowed_tokens_fn=None,
+            logits_processor=logits_processor,
+            device=delayed_input_ids.device,
+        )
+
+        # 8.5 now prepend the cfg_logits_processor_item to the list of processors if it exists
+        if cfg_logits_processor_item is not None:
+            # Imitate the behaviour of transformers.generation.utils._merge_criteria_processor_list
+            for logits_processor_item in logits_processor:
+                if type(logits_processor_item) is type(cfg_logits_processor_item):
+                    raise ValueError(
+                        f"A custom logits processor of type {type(cfg_logits_processor_item)} with values {cfg_logits_processor_item} has been passed to"
+                        f" `.generate()`, but it has already been created with the values {logits_processor_item}. {logits_processor_item} has been"
+                        " created by passing the corresponding arguments to generate or by the model's config default"
+                        f" values. If you just want to change the default values of logits processor consider passing"
+                        f" them as arguments to `.generate()` instead of using a custom logits processor."
+                    )
+            logits_processor.insert(0, cfg_logits_processor_item)
+
+        # 9. prepare stopping criteria
+        stopping_criteria = self._get_stopping_criteria(
+            generation_config=generation_config, stopping_criteria=stopping_criteria
+        )
+
+        if generation_mode in (GenerationMode.SAMPLE, GenerationMode.GREEDY_SEARCH):
+            # expand input_ids with `num_return_sequences` additional sequences per batch
+            delayed_input_ids, model_kwargs = self._expand_inputs_for_generation(
+                input_ids=delayed_input_ids,
+                expand_size=generation_config.num_return_sequences,
+                is_encoder_decoder=self.config.is_encoder_decoder,
+                **model_kwargs,
+            )
+
+            # 10. run sample
+            outputs = self._sample(
+                delayed_input_ids,
+                logits_processor=logits_processor,
+                stopping_criteria=stopping_criteria,
+                generation_config=generation_config,
+                synced_gpus=synced_gpus,
+                streamer=streamer,
+                **model_kwargs,
+            )
+
+        else:
+            raise ValueError(
+                "Got incompatible mode for generation, should be one of greedy or sampling. "
+                "Ensure that beam search is de-activated by setting `num_beams=1` and `num_beam_groups=1`."
+            )
+
+        if generation_config.return_dict_in_generate:
+            output_ids = outputs.sequences
+        else:
+            output_ids = outputs
+
+        # Apply the pattern mask to the final ids
+        output_ids = self.decoder.apply_delay_pattern_mask(output_ids, model_kwargs["decoder_delay_pattern_mask"])
+
+        # Revert the pattern delay mask by filtering the bos and pad token ids from the delay pattern mask
+        _, mask = self.decoder.build_delay_pattern_mask(
+            input_ids,
+            bos_token_id=generation_config.bos_token_id,
+            pad_token_id=generation_config.pad_token_id,
+            max_length=output_ids.shape[1],
+        )
+
+        mask = (mask != generation_config.bos_token_id) & (mask != generation_config.pad_token_id)
+        output_ids = output_ids[mask].reshape(batch_size, self.decoder.num_codebooks, -1)
+
+        # append the frame dimension back to the audio codes
+        output_ids = output_ids[None, ...]
+
+        audio_decode_kwargs = {}
+        if self.use_audio_scales:
+            audio_scales = model_kwargs.get("audio_scales")
+            if audio_scales is None:
+                audio_scales = [None] * batch_size
+            audio_decode_kwargs["audio_scales"] = audio_scales
+
+        
+        if not self.use_4dim_audio_codes:
+            # remove chunk dim
+            output_ids = output_ids.squeeze(0)
+            
+            
+        decode_sequentially = (
+            generation_config.bos_token_id in output_ids
+            or generation_config.pad_token_id in output_ids
+            or generation_config.eos_token_id in output_ids
+        )
+        if not decode_sequentially:  # no special token left in the codes: decode the whole batch in one call
+            output_values = self.audio_encoder.decode(
+                audio_codes=output_ids,
+                **audio_decode_kwargs,
+            ).audio_values.squeeze(1)
+            output_lengths = [audio.shape[0] for audio in output_values]
+        else:
+            output_values = []
+            for sample_id in range(batch_size):
+                sample = output_ids[:, sample_id] if self.use_4dim_audio_codes else output_ids[sample_id]
+                sample_mask = (sample >= self.audio_encoder.config.codebook_size)  # True where a code is not below `codebook_size`
+                sample_mask = (sample_mask.sum(dim=(0, 1)) == 0) if self.use_4dim_audio_codes else (sample_mask.sum(dim=0) == 0)
+                single_audio_decode_kwargs = {}
+                if self.use_audio_scales:
+                    single_audio_decode_kwargs["audio_scales"] = [audio_decode_kwargs["audio_scales"][sample_id]]
+                if sample_mask.sum() > 0:
+                    sample = sample[:, :, sample_mask] if self.use_4dim_audio_codes else sample[:, sample_mask]
+                    sample = self.audio_encoder.decode(audio_codes=sample[None, ...], **single_audio_decode_kwargs).audio_values
+                    sample = sample if sample.ndim == 3 else sample.unsqueeze(0)
+                    output_values.append(sample.transpose(0, 2))
+                else:
+                    output_values.append(torch.zeros((1, 1, 1)).to(self.device))  # no position kept by the mask: single-zero placeholder
+            output_lengths = [audio.shape[0] for audio in output_values]
+            output_values = (
+                torch.nn.utils.rnn.pad_sequence(output_values, batch_first=True, padding_value=0)
+                .squeeze(-1)
+                .squeeze(-1)
+            )
+        if generation_config.return_dict_in_generate:
+            outputs["audios_length"] = output_lengths
+            outputs.sequences = output_values
+            return outputs
+        else:
+            return output_values
+
+    def _get_initial_cache_position(self, input_ids, model_kwargs):
+        """Store the pre-fill `cache_position` in `model_kwargs`, built from `inputs_embeds` if present, else `input_ids`."""
+        # `torch.compile`-friendly `torch.arange` from a shape -- the cumsum over ones is equivalent to `torch.arange`
+        reference = model_kwargs["inputs_embeds"][0, :, 0] if "inputs_embeds" in model_kwargs else input_ids[0, :]
+        positions = torch.ones_like(reference, dtype=torch.int64).cumsum(0) - 1
+
+        # when a cache is supplied, drop the leading positions it already covers
+        past_key_values = model_kwargs.get("past_key_values")
+        if past_key_values is not None:
+            if not isinstance(past_key_values, Cache):
+                # non-`Cache` input: the past length is read from dim 2 of its first nested entry
+                seen_tokens = past_key_values[0][0].shape[2]
+            elif hasattr(past_key_values, "get_seq_length") and past_key_values.get_seq_length() is not None:
+                seen_tokens = past_key_values.get_seq_length()
+            else:
+                seen_tokens = 0
+
+            # TODO(joao): this is not torch.compile-friendly, find a work-around. If the cache is not empty,
+            # end-to-end compilation will yield bad results because `cache_position` will be incorrect.
+            if not is_torchdynamo_compiling():
+                positions = positions[seen_tokens:]
+
+        model_kwargs["cache_position"] = positions
+        return model_kwargs
\ No newline at end of file
diff --git a/capspeech/ar/parler_tts/streamer.py b/capspeech/ar/parler_tts/streamer.py
new file mode 100644
index 0000000000000000000000000000000000000000..e5143c89796cfc1a4d34d31572aae6876d92890f
--- /dev/null
+++ b/capspeech/ar/parler_tts/streamer.py
@@ -0,0 +1,147 @@
+
+from .modeling_parler_tts import ParlerTTSForConditionalGeneration
+from transformers.generation.streamers import BaseStreamer
+from typing import Optional
+import torch
+import numpy as np
+import math
+from queue import Queue
+
+
+class ParlerTTSStreamer(BaseStreamer):
+    def __init__(
+        self,
+        model: ParlerTTSForConditionalGeneration,
+        device: Optional[str] = None,
+        play_steps: Optional[int] = 10,
+        stride: Optional[int] = None,
+        timeout: Optional[float] = None,
+    ):
+        """
+        Streamer that stores playback-ready audio in a queue, to be used by a downstream application as an iterator. This is
+        useful for applications that benefit from accessing the generated audio in a non-blocking way (e.g. in an interactive
+        Gradio demo).
+        Parameters:
+            model (`ParlerTTSForConditionalGeneration`):
+                The Parler-TTS model used to generate the audio waveform.
+            device (`str`, *optional*):
+                The torch device on which to run the computation. If `None`, will default to the device of the model.
+            play_steps (`int`, *optional*, defaults to 10):
+                The number of generation steps with which to return the generated audio array. Using fewer steps will
+                mean the first chunk is ready faster, but will require more codec decoding steps overall. This value
+                should be tuned to your device and latency requirements.
+            stride (`int`, *optional*):
+                Number of trailing audio samples withheld from each intermediate chunk; they are emitted with the next
+                chunk, after the cached tokens are decoded again, which softens the boundary between chunks. If `None`,
+                defaults to `hop_length * (play_steps - num_codebooks) // 6`, floored at 0.
+            timeout (`float`, *optional*):
+                The timeout for the audio queue. If `None`, the queue will block indefinitely. Useful to handle exceptions
+                in `.generate()`, when it is called in a separate thread.
+        """
+        self.decoder = model.decoder
+        self.audio_encoder = model.audio_encoder
+        self.generation_config = model.generation_config
+        self.device = device if device is not None else model.device
+        self.use_audio_scales = model.use_audio_scales
+        self.use_4dim_audio_codes = model.use_4dim_audio_codes
+        self.audio_kwargs = {"audio_scales": [None]} if self.use_audio_scales else {}
+
+        # variables used in the streaming process
+        self.play_steps = play_steps
+        if stride is not None:
+            self.stride = stride
+        else:
+            hop_length = math.floor(self.audio_encoder.config.sampling_rate / self.audio_encoder.config.frame_rate)
+            # floor at 0: the formula goes negative when play_steps < num_codebooks, which would corrupt slicing in `put`
+            self.stride = max(0, hop_length * (play_steps - self.decoder.num_codebooks) // 6)
+        self.token_cache = None
+        self.to_yield = 0
+
+        # variables used in the thread process
+        self.audio_queue = Queue()
+        self.stop_signal = None
+        self.timeout = timeout
+
+    def apply_delay_pattern_mask(self, input_ids):
+        """Undo the codebook delay pattern on `input_ids`, decode the codes and return the first waveform as a float32 numpy array."""
+        # build the delay pattern mask for offsetting each codebook prediction by 1 (this behaviour is specific to Parler)
+        _, delay_pattern_mask = self.decoder.build_delay_pattern_mask(
+            input_ids[:, :1],
+            bos_token_id=self.generation_config.bos_token_id,
+            pad_token_id=self.generation_config.decoder_start_token_id,
+            max_length=input_ids.shape[-1],
+        )
+        # apply the pattern mask to the input ids
+        input_ids = self.decoder.apply_delay_pattern_mask(input_ids, delay_pattern_mask)
+
+        # revert the pattern delay mask by filtering the bos and pad token ids
+        mask = (delay_pattern_mask != self.generation_config.bos_token_id) & (delay_pattern_mask != self.generation_config.pad_token_id)
+        input_ids = input_ids[mask].reshape(1, self.decoder.num_codebooks, -1)
+
+        if self.use_4dim_audio_codes:
+            # append the frame dimension back to the audio codes
+            input_ids = input_ids[None, ...]
+
+        # send the input_ids to the correct device
+        input_ids = input_ids.to(self.audio_encoder.device)
+
+        decode_sequentially = (
+            self.generation_config.bos_token_id in input_ids
+            or self.generation_config.pad_token_id in input_ids
+            or self.generation_config.eos_token_id in input_ids
+        )
+        if decode_sequentially:
+            # special tokens remain: keep only the positions where every code is below `codebook_size`
+            sample = input_ids[:, 0] if self.use_4dim_audio_codes else input_ids[0]
+            sample_mask = ((sample >= self.audio_encoder.config.codebook_size).sum(dim=(0, 1)) == 0) if self.use_4dim_audio_codes else ((sample >= self.audio_encoder.config.codebook_size).sum(dim=0) == 0)
+            sample = sample[:, :, sample_mask] if self.use_4dim_audio_codes else sample[:, sample_mask]
+            audio_codes = sample[None, ...]
+        else:
+            audio_codes = input_ids
+        sample = self.audio_encoder.decode(audio_codes=audio_codes, **self.audio_kwargs).audio_values
+        output_values = sample if sample.ndim == 3 else sample.unsqueeze(0)
+
+        return output_values[0, 0].cpu().float().numpy()
+
+    def put(self, value):
+        """Cache the new tokens and, whenever the cache length is a multiple of `play_steps`, queue the newly decoded audio."""
+        batch_size = value.shape[0] // self.decoder.num_codebooks
+        if batch_size > 1:
+            raise ValueError("ParlerTTSStreamer only supports batch size 1")
+
+        if self.token_cache is None:
+            self.token_cache = value
+        else:
+            self.token_cache = torch.concatenate([self.token_cache, value[:, None]], dim=-1)
+
+        if self.token_cache.shape[-1] % self.play_steps == 0:
+            audio_values = self.apply_delay_pattern_mask(self.token_cache)
+            # explicit end index, never behind `to_yield`: `[: -stride]` is empty for stride 0 and would drop the audio
+            chunk_end = max(len(audio_values) - self.stride, self.to_yield)
+            self.on_finalized_audio(audio_values[self.to_yield : chunk_end])
+            self.to_yield = chunk_end
+
+    def end(self):
+        """Flushes any remaining cache and appends the stop symbol."""
+        if self.token_cache is not None:
+            audio_values = self.apply_delay_pattern_mask(self.token_cache)
+        else:
+            audio_values = np.zeros(self.to_yield)
+
+        self.on_finalized_audio(audio_values[self.to_yield :], stream_end=True)
+
+    def on_finalized_audio(self, audio: np.ndarray, stream_end: bool = False):
+        """Put the new audio in the queue. If the stream is ending, also put a stop signal in the queue."""
+        self.audio_queue.put(audio, timeout=self.timeout)
+        if stream_end:
+            self.audio_queue.put(self.stop_signal, timeout=self.timeout)
+
+    def __iter__(self):
+        return self
+
+    def __next__(self):
+        """Return the next queued audio chunk; raises `StopIteration` on the stop signal and `queue.Empty` on timeout."""
+        value = self.audio_queue.get(timeout=self.timeout)
+        if not isinstance(value, np.ndarray) and value == self.stop_signal:
+            raise StopIteration()
+        return value
\ No newline at end of file
diff --git a/capspeech/ar/pretrain.sh b/capspeech/ar/pretrain.sh
new file mode 100644
index 0000000000000000000000000000000000000000..3236e55bb875ffce8b2e68f67934a5f0fc8135fc
--- /dev/null
+++ b/capspeech/ar/pretrain.sh
@@ -0,0 +1,68 @@
+# Log in to Hugging Face first (e.g. `huggingface-cli login`) and fill in every empty variable below.
+
+MLS_WAV_DIR='' # downloaded mls wav path
+LIBRITTSRMIX_WAV_DIR='' # downloaded librittsrmix wav path
+GIGASPEECH_WAV_DIR='' # downloaded gigaspeech wav path
+COMMONVOICE_WAV_DIR='' # downloaded commonvoice wav path
+EMILIA_WAV_DIR='' # downloaded emilia wav path
+OUTPUT_DIR="./output_pretraining/" # output dir, to save checkpoints
+TEMPORARY_SAVE_TO_DISK="./audio_code_pretraining/" # dac codec saved dir
+SAVE_TO_DISK="./dataset_pretraining/" # huggingface metadata saved dir
+WANDB_KEY='' # your wandb key for logging
+
+# Debug settings: CUDA_LAUNCH_BLOCKING=1 forces synchronous kernel launches (slower); drop for full-speed runs.
+export CUDA_LAUNCH_BLOCKING=1
+export TORCH_USE_CUDA_DSA=1
+
+accelerate launch ./training/run_parler_tts_training.py \
+    --model_name_or_path "parler-tts/parler-tts-mini-v1" \
+    --feature_extractor_name "parler-tts/dac_44khZ_8kbps" \
+    --description_tokenizer_name "google/flan-t5-large" \
+    --prompt_tokenizer_name "google/flan-t5-large" \
+    --report_to "wandb" \
+    --wandb_key "${WANDB_KEY}" \
+    --overwrite_output_dir true \
+    --train_dataset_name "OpenSound/CapSpeech" \
+    --train_split_name "train_PT" \
+    --eval_dataset_name "OpenSound/CapSpeech" \
+    --eval_split_name "validation_PT" \
+    --mls_dir "${MLS_WAV_DIR}" \
+    --librittsrmix_dir "${LIBRITTSRMIX_WAV_DIR}" \
+    --gigaspeech_dir "${GIGASPEECH_WAV_DIR}" \
+    --commonvoice_dir "${COMMONVOICE_WAV_DIR}" \
+    --emilia_dir "${EMILIA_WAV_DIR}" \
+    --max_eval_samples 96 \
+    --target_audio_column_name "audio_path" \
+    --description_column_name "caption" \
+    --source_column_name "source" \
+    --prompt_column_name "text" \
+    --max_duration_in_seconds 20 \
+    --min_duration_in_seconds 3 \
+    --max_text_length 600 \
+    --preprocessing_num_workers 32 \
+    --do_train true \
+    --num_train_epochs 10 \
+    --gradient_accumulation_steps 6 \
+    --gradient_checkpointing false \
+    --per_device_train_batch_size 4 \
+    --learning_rate 0.001 \
+    --adam_beta1 0.9 \
+    --adam_beta2 0.99 \
+    --weight_decay 0.01 \
+    --lr_scheduler_type "constant_with_warmup" \
+    --warmup_steps 5000 \
+    --logging_steps 200 \
+    --freeze_text_encoder false \
+    --per_device_eval_batch_size 4 \
+    --audio_encoder_per_device_batch_size 24 \
+    --dtype "float16" \
+    --seed 456 \
+    --output_dir "${OUTPUT_DIR}" \
+    --temporary_save_to_disk "${TEMPORARY_SAVE_TO_DISK}" \
+    --save_to_disk "${SAVE_TO_DISK}" \
+    --dataloader_num_workers 32 \
+    --do_eval \
+    --evaluation_strategy steps \
+    --eval_steps 5000 \
+    --save_steps 5000 \
+    --group_by_length true
diff --git a/capspeech/ar/training/__init__.py b/capspeech/ar/training/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391
diff --git a/capspeech/ar/training/arguments.py b/capspeech/ar/training/arguments.py
new file mode 100644
index 0000000000000000000000000000000000000000..31bc8750b5b8d13b455f8e98fe843e8cd9431b3d
--- /dev/null
+++ b/capspeech/ar/training/arguments.py
@@ -0,0 +1,403 @@
+from dataclasses import dataclass, field
+from typing import Optional, List
+
+from transformers import Seq2SeqTrainingArguments
+
+
+@dataclass
+class ModelArguments:
+    """
+    Arguments selecting the model/config/tokenizers to fine-tune from, plus generation, evaluation-model and attention settings.
+    """
+
+    model_name_or_path: str = field(
+        metadata={"help": "Path to pretrained model or model identifier from huggingface.co/models"}
+    )
+    config_name: Optional[str] = field(
+        default=None, metadata={"help": "Pretrained config name or path if not the same as model_name"}
+    )
+    feature_extractor_name: Optional[str] = field(
+        default=None, metadata={"help": "Pretrained feature extractor name or path if not the same as model_name"}
+    )
+    description_tokenizer_name: Optional[str] = field(
+        default=None, metadata={"help": "Pretrained description tokenizer name or path if not the same as model_name"}
+    )
+    prompt_tokenizer_name: Optional[str] = field(
+        default=None,
+        metadata={"help": "Pretrained prompt tokenizer name or path if not the same as description_tokenizer_name"},
+    )
+    cache_dir: Optional[str] = field(
+        default=None,
+        metadata={"help": "Where to store the pretrained models downloaded from huggingface.co"},
+    )
+    use_fast_tokenizer: bool = field(
+        default=True,
+        metadata={"help": "Whether to use one of the fast tokenizer (backed by the tokenizers library) or not."},
+    )
+    model_revision: str = field(
+        default="main",
+        metadata={"help": "The specific model version to use (can be a branch name, tag name or commit id)."},
+    )
+    pad_token_id: Optional[int] = field(
+        default=None,
+        metadata={"help": "If specified, change the model pad token id."},
+    )
+    decoder_start_token_id: Optional[int] = field(
+        default=None,
+        metadata={"help": "If specified, change the model decoder start token id."},
+    )
+    freeze_text_encoder: bool = field(
+        default=False,
+        metadata={"help": "Whether to freeze the text encoder."},
+    )
+    do_sample: bool = field(
+        default=True,
+        metadata={"help": "Whether to do sampling or greedy decoding."},
+    )
+    temperature: float = field(
+        default=1.0,
+        metadata={"help": "Temperature if sampling."},
+    )
+    max_length: int = field(
+        default=2580,
+        metadata={"help": "Generation max length."},
+    )
+    bandwidth: float = field(
+        default=6,
+        metadata={"help": "Audio encoder bandwidth."},
+    )
+    asr_model_name_or_path: str = field(
+        default="distil-whisper/distil-large-v2",
+        metadata={
+            "help": "Used to compute WER during evaluation. Path to pretrained model or model identifier from huggingface.co/models"
+        },
+    )
+    clap_model_name_or_path: str = field(
+        default="laion/larger_clap_music_and_speech",
+        metadata={
+            "help": "Used to compute audio similarity during evaluation. Path to pretrained model or model identifier from huggingface.co/models"
+        },
+    )
+    attn_implementation: str = field(
+        default="eager",
+        metadata={"help": "Attention implementation used. One of `eager`, `sdpa`, `flash_attention_2`"},
+    )
+    cross_attention_implementation_strategy: Optional[str] = field(
+        default=None,
+        metadata={
+            "help": "If not specified, the cross-attention implementation will be the same as `_attn_implementation`. If `always_eager`, it will always be the eager implementation. If `always_sdpa`, it will always be the sdpa implementation."
+        },
+    )
+    prompt_padding_side: Optional[str] = field(
+        default="left",
+        metadata={
+            "help": "Prompt tokenizer padding side. Defaults to `left`. If the prompt is pre-pended to the codebooks hidden states, it should be padded on the left."
+        },
+    )
+
+
+@dataclass
+class DataTrainingArguments:
+    """
+    Dataset, preprocessing, logging and on-disk caching options for training and evaluation.
+
+    `HfArgumentParser` can turn this class into argparse arguments, so each field
+    below can be set from the command line; the `help` metadata documents
+    what every field controls.
+    """
+
+    train_dataset_name: Optional[str] = field(
+        default=None,
+        metadata={
+            "help": "The name of the training dataset to use (via the datasets library). Load and combine "
+            "multiple datasets by separating dataset ids by a '+' symbol. For example, to load and combine "
+            "librispeech and common voice, set `train_dataset_name='librispeech_asr+common_voice'`."
+        },
+    )
+    train_dataset_config_name: Optional[str] = field(
+        default=None,
+        metadata={
+            "help": "The configuration name of the training dataset to use (via the datasets library). Load and combine "
+            "multiple datasets by separating dataset configs by a '+' symbol."
+        },
+    )
+    train_split_name: str = field(
+        default="train",
+        metadata={
+            "help": ("The name of the training data set split to use (via the datasets library). Defaults to 'train'")
+        },
+    )
+    train_dataset_samples: Optional[str] = field(
+        default=None,
+        metadata={
+            "help": "Number of samples in the training data. Load and combine "
+            "multiple datasets by separating dataset samples by a '+' symbol."
+        },
+    )
+    train_metadata_dataset_name: Optional[str] = field(
+        default=None,
+        metadata={
+            "help": "The name of the metadata training dataset to use (via the datasets library). Load and combine "
+            "multiple datasets by separating dataset ids by a '+' symbol. For example, to load and combine "
+            "librispeech and common voice, set `train_metadata_dataset_name='librispeech_asr+common_voice'`."
+        },
+    )
+    eval_dataset_name: Optional[str] = field(
+        default=None,
+        metadata={
+            "help": "The name of the evaluation dataset to use (via the datasets library). Defaults to the training dataset name if unspecified."
+        },
+    )
+    eval_dataset_config_name: Optional[str] = field(
+        default=None,
+        metadata={
+            "help": "The configuration name of the evaluation dataset to use (via the datasets library). Defaults to the training dataset config name if unspecified"
+        },
+    )
+    eval_split_name: str = field(
+        default="test",
+        metadata={
+            "help": "The name of the evaluation data set split to use (via the datasets library). Defaults to 'test'"
+        },
+    )
+    eval_metadata_dataset_name: Optional[str] = field(
+        default=None,
+        metadata={
+            "help": "The name of the metadata evaluation dataset to use (via the datasets library). Load and combine "
+            "multiple datasets by separating dataset ids by a '+' symbol. For example, to load and combine "
+            "librispeech and common voice, set `eval_metadata_dataset_name='librispeech_asr+common_voice'`."
+        },
+    )
+    target_audio_column_name: str = field(
+        default="audio",
+        metadata={"help": "The name of the dataset column containing the target audio data. Defaults to 'audio'"},
+    )
+    description_column_name: Optional[str] = field(
+        default=None,
+        metadata={"help": "The name of the dataset column containing the description text data. Defaults to 'None'."},
+    )
+    prompt_column_name: Optional[str] = field(
+        default=None,
+        metadata={"help": "The name of the dataset column containing the prompt text data. Defaults to 'None'."},
+    )
+    overwrite_cache: bool = field(
+        default=False, metadata={"help": "Overwrite the cached preprocessed datasets or not."}
+    )
+    preprocessing_num_workers: Optional[int] = field(
+        default=None,
+        metadata={"help": "The number of processes to use for the preprocessing."},
+    )
+    max_train_samples: Optional[int] = field(
+        default=None,
+        metadata={
+            "help": (
+                "For debugging purposes or quicker training, truncate the number of training examples to this "
+                "value if set."
+            )
+        },
+    )
+    max_eval_samples: Optional[int] = field(
+        default=None,
+        metadata={
+            "help": (
+                "For debugging purposes or quicker training, truncate the number of validation examples to this "
+                "value if set."
+            )
+        },
+    )
+    max_duration_in_seconds: float = field(
+        default=35.0,
+        metadata={
+            "help": (
+                "Filter audio files that are longer than `max_duration_in_seconds` seconds to `max_duration_in_seconds`. "
+                "Also, used to set maximum audio length if `pad_to_max_length=True`."
+            )
+        },
+    )
+    min_duration_in_seconds: float = field(
+        default=0.0, metadata={"help": "Filter audio files that are shorter than `min_duration_in_seconds` seconds"}
+    )
+    max_text_length: int = field(
+        default=500, metadata={"help": "If set, max description lengths in number of characters."}
+    )
+    max_prompt_token_length: Optional[int] = field(
+        default=None,
+        metadata={
+            "help": (
+                "If set, filter samples with prompts that are longer than `max_prompt_token_length` tokens. "
+                "Also, used to set maximum prompt token length if `pad_to_max_length=True`."
+            )
+        },
+    )
+    max_description_token_length: Optional[int] = field(
+        default=None,
+        metadata={
+            "help": (
+                "If set, filter samples with descriptions that are longer than `max_description_token_length` tokens. "
+                "Also, used to set maximum description token length if `pad_to_max_length=True`."
+            )
+        },
+    )
+    pad_to_max_length: bool = field(
+        default=False,
+        metadata={
+            "help": (
+                "If `True`, pad audio, prompt and description to a maximum length set with respectively "
+                "`max_duration_in_seconds`, `max_prompt_token_length`, `max_description_token_length`."
+            )
+        },
+    )
+    preprocessing_only: bool = field(
+        default=False,
+        metadata={
+            "help": (
+                "Whether to only do data preprocessing and skip training. This is especially useful when data"
+                " preprocessing errors out in distributed training due to timeout. In this case, one should run the"
+                " preprocessing in a non-distributed setup with `preprocessing_only=True` so that the cached datasets"
+                " can consequently be loaded in distributed training."
+                " In this training script, `save_to_disk` must be set to the path in which the dataset should be saved. "
+            )
+        },
+    )
+    token: Optional[str] = field(
+        default=None,
+        metadata={
+            "help": (
+                "The token to use as HTTP bearer authorization for remote files. If not specified, will use the token "
+                "generated when running `huggingface-cli login` (stored in `~/.huggingface`)."
+            )
+        },
+    )
+    use_auth_token: bool = field(
+        default=None,
+        metadata={
+            "help": "The `use_auth_token` argument is deprecated and will be removed in v4.34. Please use `token` instead."
+        },
+    )
+    trust_remote_code: bool = field(
+        default=False,
+        metadata={
+            "help": (
+                "Whether or not to allow for custom models defined on the Hub in their own modeling files. This option "
+                "should only be set to `True` for repositories you trust and in which you have read the code, as it will "
+                "execute code present on the Hub on your local machine."
+            )
+        },
+    )
+    add_audio_samples_to_wandb: bool = field(
+        default=False,
+        metadata={"help": "If set and if `wandb` in args.report_to, will add generated audio samples to wandb logs."},
+    )
+    id_column_name: Optional[str] = field(default=None, metadata={"help": "id column name."})
+    wandb_project: str = field(
+        default="parler-speech",
+        metadata={"help": "The name of the wandb project."},
+    )
+    wandb_run_name: Optional[str] = field(
+        default=None,
+        metadata={
+            "help": "If specified, the name of the run. If not specified, wandb will give a random name to this run."
+        },
+    )
+    save_to_disk: Optional[str] = field(
+        default=None,
+        metadata={
+            "help": "If set, will save the dataset to this path if this is an empty folder. If not empty, will load the datasets from it."
+        },
+    )
+    temporary_save_to_disk: Optional[str] = field(default=None, metadata={"help": "Temporarily save audio labels here."})
+    save_codec_steps: Optional[int] = field(
+        default=500,
+        metadata={"help": "Temporarily save the audio labels every `save_codec_steps` steps."},
+    )
+    pad_to_multiple_of: Optional[int] = field(
+        default=2,
+        metadata={"help": ("Pad to multiple of for tokenizers.")},
+    )
+    mls_dir: Optional[str] = field(
+        default=None,
+        metadata={"help": "mls audio dir"},
+    )
+    librittsrmix_dir: Optional[str] = field(
+        default=None,
+        metadata={"help": "librittsrmix audio dir"},
+    )
+    gigaspeech_dir: Optional[str] = field(
+        default=None,
+        metadata={"help": "gigaspeech audio dir"},
+    )
+    commonvoice_dir: Optional[str] = field(
+        default=None,
+        metadata={"help": "commonvoice audio dir"},
+    )
+    emilia_dir: Optional[str] = field(
+        default=None,
+        metadata={"help": "emilia audio dir"},
+    )
+    source_column_name: str = field(
+        default="source",
+        metadata={"help": "The name of the source column."},
+    )
+    wandb_key: Optional[str] = field(
+        default=None,
+        metadata={"help": "wandb key used for logging"},
+    )
+
+
+@dataclass
+class ParlerTTSTrainingArguments(Seq2SeqTrainingArguments):
+    """`Seq2SeqTrainingArguments` extended with the extra training and evaluation options declared below."""
+
+    dtype: Optional[str] = field(
+        default="float32",
+        metadata={
+            "help": (
+                "The data type (dtype) in which to run training. One of `float32` (full-precision), "
+                "`float16` or `bfloat16` (both half-precision)."
+            )
+        },
+    )
+    audio_encoder_per_device_batch_size: int = field(
+        default=8,
+        metadata={"help": ("Specify the batch size of the audio encoding pre-processing steps.")},
+    )
+    eval_dataloader_num_workers: Optional[int] = field(
+        default=0,
+        metadata={
+            "help": "Number of subprocesses to use for evaluation data loading (PyTorch only). 0 means that the data will be loaded in the main process."
+        },
+    )
+    compute_clap_similarity_metric: bool = field(
+        default=True,
+        metadata={
+            "help": (
+                "Whether or not to compute the clap similarity metric between the description and the generation during evaluation."
+            )
+        },
+    )
+    compute_noise_level_metric: bool = field(
+        default=True,
+        metadata={"help": ("Whether or not to compute the squim si-sdr measure of the generations.")},
+    )
+    noise_level_to_compute_clean_wer: float = field(
+        default=25,
+        metadata={
+            "help": (
+                "if `compute_noise_level_metric=True`, will compute a 'clean' WER on samples with generated noise higher than `noise_level_to_compute_clean_wer`. "
+                "This is a proxy measure to compute WER on clean audios, provided that the model learn to generate clean audios."
+            )
+        },
+    )
+    eval_generation_steps: Optional[int] = field(
+        default=None,
+        metadata={
+            "help": (
+                "Number of update steps between two generation evaluation. Will default to the same "
+                "value as `eval_steps` if not set. Should be an integer and a multiple of `eval_steps`."
+            )
+        },
+    )
+    codebook_weights: Optional[List[float]] = field(
+        default=None,
+        metadata={"help": "Weights applied to each codebook."},
+    )
diff --git a/capspeech/ar/training/arguments_captts.py b/capspeech/ar/training/arguments_captts.py
new file mode 100644
index 0000000000000000000000000000000000000000..0f9996cb3169e770d8cf73c17e0dfd78a79c5ee7
--- /dev/null
+++ b/capspeech/ar/training/arguments_captts.py
@@ -0,0 +1,391 @@
+from dataclasses import dataclass, field
+from typing import Optional, List
+
+from transformers import Seq2SeqTrainingArguments
+
+
+@dataclass
+class ModelArguments:
+    """
+    Arguments selecting the model/config/tokenizers to fine-tune from, plus generation, evaluation-model and attention settings.
+    """
+
+    model_name_or_path: str = field(
+        metadata={"help": "Path to pretrained model or model identifier from huggingface.co/models"}
+    )
+    config_name: Optional[str] = field(
+        default=None, metadata={"help": "Pretrained config name or path if not the same as model_name"}
+    )
+    feature_extractor_name: Optional[str] = field(
+        default=None, metadata={"help": "Pretrained feature extractor name or path if not the same as model_name"}
+    )
+    description_tokenizer_name: Optional[str] = field(
+        default=None, metadata={"help": "Pretrained description tokenizer name or path if not the same as model_name"}
+    )
+    prompt_tokenizer_name: Optional[str] = field(
+        default=None,
+        metadata={"help": "Pretrained prompt tokenizer name or path if not the same as description_tokenizer_name"},
+    )
+    cache_dir: Optional[str] = field(
+        default=None,
+        metadata={"help": "Where to store the pretrained models downloaded from huggingface.co"},
+    )
+    use_fast_tokenizer: bool = field(
+        default=True,
+        metadata={"help": "Whether to use one of the fast tokenizer (backed by the tokenizers library) or not."},
+    )
+    model_revision: str = field(
+        default="main",
+        metadata={"help": "The specific model version to use (can be a branch name, tag name or commit id)."},
+    )
+    pad_token_id: Optional[int] = field(
+        default=None,
+        metadata={"help": "If specified, change the model pad token id."},
+    )
+    decoder_start_token_id: Optional[int] = field(
+        default=None,
+        metadata={"help": "If specified, change the model decoder start token id."},
+    )
+    freeze_text_encoder: bool = field(
+        default=False,
+        metadata={"help": "Whether to freeze the text encoder."},
+    )
+    do_sample: bool = field(
+        default=True,
+        metadata={"help": "Whether to do sampling or greedy decoding."},
+    )
+    temperature: float = field(
+        default=1.0,
+        metadata={"help": "Temperature if sampling."},
+    )
+    max_length: int = field(
+        default=2580,
+        metadata={"help": "Generation max length."},
+    )
+    bandwidth: float = field(
+        default=6,
+        metadata={"help": "Audio encoder bandwidth."},
+    )
+    asr_model_name_or_path: str = field(
+        default="distil-whisper/distil-large-v2",
+        metadata={
+            "help": "Used to compute WER during evaluation. Path to pretrained model or model identifier from huggingface.co/models"
+        },
+    )
+    clap_model_name_or_path: str = field(
+        default="laion/larger_clap_music_and_speech",
+        metadata={
+            "help": "Used to compute audio similarity during evaluation. Path to pretrained model or model identifier from huggingface.co/models"
+        },
+    )
+    attn_implementation: str = field(
+        default="eager",
+        metadata={"help": "Attention implementation used. One of `eager`, `sdpa`, `flash_attention_2`"},
+    )
+    cross_attention_implementation_strategy: Optional[str] = field(
+        default=None,
+        metadata={
+            "help": "If not specified, the cross-attention implementation will be the same as `_attn_implementation`. If `always_eager`, it will always be the eager implementation. If `always_sdpa`, it will always be the sdpa implementation."
+        },
+    )
+    prompt_padding_side: Optional[str] = field(
+        default="left",
+        metadata={
+            "help": "Prompt tokenizer padding side. Defaults to `left`. If the prompt is pre-pended to the codebooks hidden states, it should be padded on the left."
+        },
+    )
+
+
+@dataclass
+class DataTrainingArguments:
+    """
+    Dataset, preprocessing, logging and on-disk caching options for training and evaluation.
+
+    `HfArgumentParser` can turn this class into argparse arguments, so each field
+    below can be set from the command line; the `help` metadata documents
+    what every field controls.
+    """
+
+    train_dataset_name: Optional[str] = field(
+        default=None,
+        metadata={
+            "help": "The name of the training dataset to use (via the datasets library). Load and combine "
+            "multiple datasets by separating dataset ids by a '+' symbol. For example, to load and combine "
+            "librispeech and common voice, set `train_dataset_name='librispeech_asr+common_voice'`."
+        },
+    )
+    train_dataset_config_name: Optional[str] = field(
+        default=None,
+        metadata={
+            "help": "The configuration name of the training dataset to use (via the datasets library). Load and combine "
+            "multiple datasets by separating dataset configs by a '+' symbol."
+        },
+    )
+    train_split_name: str = field(
+        default="train",
+        metadata={
+            "help": ("The name of the training data set split to use (via the datasets library). Defaults to 'train'")
+        },
+    )
+    train_dataset_samples: Optional[str] = field(
+        default=None,
+        metadata={
+            "help": "Number of samples in the training data. Load and combine "
+            "multiple datasets by separating dataset samples by a '+' symbol."
+        },
+    )
+    train_metadata_dataset_name: Optional[str] = field(
+        default=None,
+        metadata={
+            "help": "The name of the metadata training dataset to use (via the datasets library). Load and combine "
+            "multiple datasets by separating dataset ids by a '+' symbol. For example, to load and combine "
+            "librispeech and common voice, set `train_metadata_dataset_name='librispeech_asr+common_voice'`."
+        },
+    )
+    eval_dataset_name: Optional[str] = field(
+        default=None,
+        metadata={
+            "help": "The name of the evaluation dataset to use (via the datasets library). Defaults to the training dataset name if unspecified."
+        },
+    )
+    eval_dataset_config_name: Optional[str] = field(
+        default=None,
+        metadata={
+            "help": "The configuration name of the evaluation dataset to use (via the datasets library). Defaults to the training dataset config name if unspecified"
+        },
+    )
+    eval_split_name: str = field(
+        default="test",
+        metadata={
+            "help": "The name of the evaluation data set split to use (via the datasets library). Defaults to 'test'"
+        },
+    )
+    eval_metadata_dataset_name: Optional[str] = field(
+        default=None,
+        metadata={
+            "help": "The name of the metadata evaluation dataset to use (via the datasets library). Load and combine "
+            "multiple datasets by separating dataset ids by a '+' symbol. For example, to load and combine "
+            "librispeech and common voice, set `eval_metadata_dataset_name='librispeech_asr+common_voice'`."
+        },
+    )
+    target_audio_column_name: str = field(
+        default="audio",
+        metadata={"help": "The name of the dataset column containing the target audio data. Defaults to 'audio'"},
+    )
+    description_column_name: Optional[str] = field(
+        default=None,
+        metadata={"help": "The name of the dataset column containing the description text data. Defaults to 'None'."},
+    )
+    prompt_column_name: Optional[str] = field(
+        default=None,
+        metadata={"help": "The name of the dataset column containing the prompt text data. Defaults to 'None'."},
+    )
+    overwrite_cache: bool = field(
+        default=False, metadata={"help": "Overwrite the cached preprocessed datasets or not."}
+    )
+    preprocessing_num_workers: Optional[int] = field(
+        default=None,
+        metadata={"help": "The number of processes to use for the preprocessing."},
+    )
+    max_train_samples: Optional[int] = field(
+        default=None,
+        metadata={
+            "help": (
+                "For debugging purposes or quicker training, truncate the number of training examples to this "
+                "value if set."
+            )
+        },
+    )
+    max_eval_samples: Optional[int] = field(
+        default=None,
+        metadata={
+            "help": (
+                "For debugging purposes or quicker training, truncate the number of validation examples to this "
+                "value if set."
+            )
+        },
+    )
+    max_duration_in_seconds: float = field(
+        default=35.0,
+        metadata={
+            "help": (
+                "Filter audio files that are longer than `max_duration_in_seconds` seconds to `max_duration_in_seconds`. "
+                "Also, used to set maximum audio length if `pad_to_max_length=True`."
+            )
+        },
+    )
+    min_duration_in_seconds: float = field(
+        default=0.0, metadata={"help": "Filter audio files that are shorter than `min_duration_in_seconds` seconds"}
+    )
+    max_text_length: int = field(
+        default=500, metadata={"help": "If set, max description lengths in number of characters."}
+    )
+    max_prompt_token_length: Optional[int] = field(
+        default=None,
+        metadata={
+            "help": (
+                "If set, filter samples with prompts that are longer than `max_prompt_token_length` tokens. "
+                "Also, used to set maximum prompt token length if `pad_to_max_length=True`."
+            )
+        },
+    )
+    max_description_token_length: Optional[int] = field(
+        default=None,
+        metadata={
+            "help": (
+                "If set, filter samples with descriptions that are longer than `max_description_token_length` tokens. "
+                "Also, used to set maximum description token length if `pad_to_max_length=True`."
+            )
+        },
+    )
+    pad_to_max_length: bool = field(
+        default=False,
+        metadata={
+            "help": (
+                "If `True`, pad audio, prompt and description to a maximum length set with respectively "
+                "`max_duration_in_seconds`, `max_prompt_token_length`, `max_description_token_length`."
+            )
+        },
+    )
+    preprocessing_only: bool = field(
+        default=False,
+        metadata={
+            "help": (
+                "Whether to only do data preprocessing and skip training. This is especially useful when data"
+                " preprocessing errors out in distributed training due to timeout. In this case, one should run the"
+                " preprocessing in a non-distributed setup with `preprocessing_only=True` so that the cached datasets"
+                " can consequently be loaded in distributed training."
+                " In this training script, `save_to_disk` must be set to the path in which the dataset should be saved. "
+            )
+        },
+    )
+    token: Optional[str] = field(
+        default=None,
+        metadata={
+            "help": (
+                "The token to use as HTTP bearer authorization for remote files. If not specified, will use the token "
+                "generated when running `huggingface-cli login` (stored in `~/.huggingface`)."
+            )
+        },
+    )
+    use_auth_token: bool = field(
+        default=None,
+        metadata={
+            "help": "The `use_auth_token` argument is deprecated and will be removed in v4.34. Please use `token` instead."
+        },
+    )
+    trust_remote_code: bool = field(
+        default=False,
+        metadata={
+            "help": (
+                "Whether or not to allow for custom models defined on the Hub in their own modeling files. This option "
+                "should only be set to `True` for repositories you trust and in which you have read the code, as it will "
+                "execute code present on the Hub on your local machine."
+            )
+        },
+    )
+    add_audio_samples_to_wandb: bool = field(
+        default=False,
+        metadata={"help": "If set and if `wandb` in args.report_to, will add generated audio samples to wandb logs."},
+    )
+    id_column_name: Optional[str] = field(default=None, metadata={"help": "id column name."})
+    wandb_project: str = field(
+        default="parler-speech",
+        metadata={"help": "The name of the wandb project."},
+    )
+    wandb_run_name: Optional[str] = field(
+        default=None,
+        metadata={
+            "help": "If specified, the name of the run. If not specified, wandb will give a random name to this run."
+        },
+    )
+    save_to_disk: Optional[str] = field(
+        default=None,
+        metadata={
+            "help": "If set, will save the dataset to this path if this is an empty folder. If not empty, will load the datasets from it."
+        },
+    )
+    temporary_save_to_disk: Optional[str] = field(default=None, metadata={"help": "Temporarily save audio labels here."})
+    save_codec_steps: Optional[int] = field(
+        default=500,
+        metadata={"help": "Temporarily save the audio labels every `save_codec_steps` steps."},
+    )
+    pad_to_multiple_of: Optional[int] = field(
+        default=2,
+        metadata={"help": ("Pad to multiple of for tokenizers.")},
+    )
+    librittsr_dir: Optional[str] = field(
+        default=None,
+        metadata={"help": "librittsr audio dir"},
+    )
+    other_dir: Optional[str] = field(
+        default=None,
+        metadata={"help": "other audio dir"},
+    )
+    source_column_name: str = field(
+        default="source",
+        metadata={"help": "The name of the source column."},
+    )
+    wandb_key: Optional[str] = field(
+        default=None,
+        metadata={"help": "wandb key used for logging"},
+    )
+
+
+@dataclass
+class ParlerTTSTrainingArguments(Seq2SeqTrainingArguments):
+    """
+    Training arguments that extend `Seq2SeqTrainingArguments` with options for the training dtype,
+    the audio-encoding batch size, evaluation data loading, generation metrics and codebook weights.
+    """
+
+    # --- precision and pre-processing ---
+    dtype: Optional[str] = field(
+        default="float32",
+        metadata={
+            "help": (
+                "The data type (dtype) in which to run training. One of `float32` (full-precision), "
+                "`float16` or `bfloat16` (both half-precision)."
+            )
+        },
+    )
+    audio_encoder_per_device_batch_size: int = field(
+        default=8,
+        metadata={"help": ("Specify the batch size of the audio encoding pre-processing steps.")},
+    )
+    # --- evaluation ---
+    eval_dataloader_num_workers: Optional[int] = field(
+        default=0,
+        metadata={
+            "help": (
+                "Number of subprocesses to use for evaluation data loading (PyTorch only). 0 means that the data will be loaded in the main process."
+            )
+        },
+    )
+    compute_clap_similarity_metric: bool = field(
+        default=True,
+        metadata={
+            "help": (
+                "Whether or not to compute the clap similarity metric between the description and the generation during evaluation."
+            )
+        },
+    )
+    compute_noise_level_metric: bool = field(
+        default=True,
+        metadata={"help": ("Whether or not to compute the squim si-sdr measure of the generations.")},
+    )
+    noise_level_to_compute_clean_wer: float = field(
+        default=25,
+        metadata={
+            "help": (
+                # Adjacent literals are concatenated verbatim, so each one ends with its own separator.
+                "if `compute_noise_level_metric=True`, will compute a 'clean' WER on samples with generated noise higher than `noise_level_to_compute_clean_wer`. "
+                "This is a proxy measure to compute WER on clean audios, provided that the model learns to generate clean audios."
+            )
+        },
+    )
+    eval_generation_steps: Optional[int] = field(
+        default=None,
+        metadata={
+            "help": (
+                "Number of update steps between two generation evaluations. Will default to the same "
+                "value as `eval_steps` if not set. Should be an integer and a multiple of `eval_steps`."
+            )
+        },
+    )
+    # --- loss ---
+    codebook_weights: Optional[List[float]] = field(
+        default=None,
+        metadata={"help": "Weights applied to each codebook."},
+    )
diff --git a/capspeech/ar/training/arguments_capttsse.py b/capspeech/ar/training/arguments_capttsse.py
new file mode 100644
index 0000000000000000000000000000000000000000..9e08cdd0f102ab87dbc0ec8b753065eb46e68b91
--- /dev/null
+++ b/capspeech/ar/training/arguments_capttsse.py
@@ -0,0 +1,387 @@
+from dataclasses import dataclass, field
+from typing import Optional, List
+
+from transformers import Seq2SeqTrainingArguments
+
+
+@dataclass
+class ModelArguments:
+    """
+    Arguments pertaining to which model/config/tokenizer we are going to fine-tune from.
+
+    `model_name_or_path` is the only field without a default. Fields whose default is `None` are
+    annotated as `Optional[...]` so the annotation matches the value they actually hold when unset.
+    """
+
+    # --- checkpoints, configs and tokenizers ---
+    model_name_or_path: str = field(
+        metadata={"help": "Path to pretrained model or model identifier from huggingface.co/models"}
+    )
+    config_name: Optional[str] = field(
+        default=None, metadata={"help": "Pretrained config name or path if not the same as model_name"}
+    )
+    feature_extractor_name: Optional[str] = field(
+        default=None, metadata={"help": "Pretrained feature extractor name or path if not the same as model_name"}
+    )
+    description_tokenizer_name: Optional[str] = field(
+        default=None, metadata={"help": "Pretrained description tokenizer name or path if not the same as model_name"}
+    )
+    prompt_tokenizer_name: Optional[str] = field(
+        default=None,
+        metadata={"help": "Pretrained prompt tokenizer name or path if not the same as description_tokenizer_name"},
+    )
+    cache_dir: Optional[str] = field(
+        default=None,
+        metadata={"help": "Where to store the pretrained models downloaded from huggingface.co"},
+    )
+    use_fast_tokenizer: bool = field(
+        default=True,
+        metadata={"help": "Whether to use a fast tokenizer (backed by the tokenizers library) or not."},
+    )
+    model_revision: str = field(
+        default="main",
+        metadata={"help": "The specific model version to use (can be a branch name, tag name or commit id)."},
+    )
+    # --- model overrides ---
+    pad_token_id: Optional[int] = field(
+        default=None,
+        metadata={"help": "If specified, change the model pad token id."},
+    )
+    decoder_start_token_id: Optional[int] = field(
+        default=None,
+        metadata={"help": "If specified, change the model decoder start token id."},
+    )
+    freeze_text_encoder: bool = field(
+        default=False,
+        metadata={"help": "Whether to freeze the text encoder."},
+    )
+    # --- generation ---
+    do_sample: bool = field(
+        default=True,
+        metadata={"help": "Whether to do sampling or greedy decoding."},
+    )
+    temperature: float = field(
+        default=1.0,
+        metadata={"help": "Temperature if sampling."},
+    )
+    max_length: int = field(
+        default=2580,
+        metadata={"help": "Generation max length."},
+    )
+    bandwidth: float = field(
+        default=6,
+        metadata={"help": "Audio encoder bandwidth."},
+    )
+    # --- evaluation models ---
+    asr_model_name_or_path: str = field(
+        default="distil-whisper/distil-large-v2",
+        metadata={
+            "help": "Used to compute WER during evaluation. Path to pretrained model or model identifier from huggingface.co/models"
+        },
+    )
+    clap_model_name_or_path: str = field(
+        default="laion/larger_clap_music_and_speech",
+        metadata={
+            "help": "Used to compute audio similarity during evaluation. Path to pretrained model or model identifier from huggingface.co/models"
+        },
+    )
+    # --- attention and padding ---
+    attn_implementation: str = field(
+        default="eager",
+        metadata={"help": "Attention implementation used. One of `eager`, `sdpa`, `flash_attention_2`"},
+    )
+    cross_attention_implementation_strategy: Optional[str] = field(
+        default=None,
+        metadata={
+            "help": "If not specified, the cross-attention implementation will be the same as `_attn_implementation`. If `always_eager`, it will always be the eager implementation. If `always_sdpa`, it will always be the sdpa implementation."
+        },
+    )
+    prompt_padding_side: Optional[str] = field(
+        default="left",
+        metadata={
+            "help": "Prompt tokenizer padding side. Defaults to `left`. If the prompt is pre-pended to the codebooks hidden states, it should be padded on the left."
+        },
+    )
+
+
+@dataclass
+class DataTrainingArguments:
+    """
+    Arguments pertaining to what data we are going to input our model for training and eval.
+
+    Using `HfArgumentParser` we can turn this class
+    into argparse arguments to be able to specify them on
+    the command line.
+
+    Every field has a default. Fields whose default is `None` are annotated as `Optional[...]`.
+    """
+
+    # --- training data ---
+    train_dataset_name: Optional[str] = field(
+        default=None,
+        metadata={
+            "help": "The name of the training dataset to use (via the datasets library). Load and combine "
+            "multiple datasets by separating dataset ids by a '+' symbol. For example, to load and combine "
+            "librispeech and common voice, set `train_dataset_name='librispeech_asr+common_voice'`."
+        },
+    )
+    train_dataset_config_name: Optional[str] = field(
+        default=None,
+        metadata={
+            "help": "The configuration name of the training dataset to use (via the datasets library). Load and combine "
+            "multiple datasets by separating dataset configs by a '+' symbol."
+        },
+    )
+    train_split_name: str = field(
+        default="train",
+        metadata={
+            "help": ("The name of the training data set split to use (via the datasets library). Defaults to 'train'")
+        },
+    )
+    train_dataset_samples: Optional[str] = field(
+        default=None,
+        metadata={
+            "help": "Number of samples in the training data. Load and combine "
+            "multiple datasets by separating dataset samples by a '+' symbol."
+        },
+    )
+    train_metadata_dataset_name: Optional[str] = field(
+        default=None,
+        metadata={
+            "help": "The name of the metadata training dataset to use (via the datasets library). Load and combine "
+            "multiple datasets by separating dataset ids by a '+' symbol. For example, to load and combine "
+            "librispeech and common voice, set `train_metadata_dataset_name='librispeech_asr+common_voice'`."
+        },
+    )
+    # --- evaluation data ---
+    eval_dataset_name: Optional[str] = field(
+        default=None,
+        metadata={
+            "help": "The name of the evaluation dataset to use (via the datasets library). Defaults to the training dataset name if unspecified."
+        },
+    )
+    eval_dataset_config_name: Optional[str] = field(
+        default=None,
+        metadata={
+            "help": "The configuration name of the evaluation dataset to use (via the datasets library). Defaults to the training dataset config name if unspecified"
+        },
+    )
+    eval_split_name: str = field(
+        default="test",
+        metadata={
+            "help": "The name of the evaluation data set split to use (via the datasets library). Defaults to 'test'"
+        },
+    )
+    eval_metadata_dataset_name: Optional[str] = field(
+        default=None,
+        metadata={
+            "help": "The name of the metadata evaluation dataset to use (via the datasets library). Load and combine "
+            "multiple datasets by separating dataset ids by a '+' symbol. For example, to load and combine "
+            "librispeech and common voice, set `eval_metadata_dataset_name='librispeech_asr+common_voice'`."
+        },
+    )
+    # --- column names ---
+    target_audio_column_name: str = field(
+        default="audio",
+        metadata={"help": "The name of the dataset column containing the target audio data. Defaults to 'audio'"},
+    )
+    description_column_name: Optional[str] = field(
+        default=None,
+        metadata={"help": "The name of the dataset column containing the description text data. Defaults to 'None'."},
+    )
+    prompt_column_name: Optional[str] = field(
+        default=None,
+        metadata={"help": "The name of the dataset column containing the prompt text data. Defaults to 'None'."},
+    )
+    # --- pre-processing and filtering ---
+    overwrite_cache: bool = field(
+        default=False, metadata={"help": "Overwrite the cached preprocessed datasets or not."}
+    )
+    preprocessing_num_workers: Optional[int] = field(
+        default=None,
+        metadata={"help": "The number of processes to use for the preprocessing."},
+    )
+    max_train_samples: Optional[int] = field(
+        default=None,
+        metadata={
+            "help": (
+                "For debugging purposes or quicker training, truncate the number of training examples to this "
+                "value if set."
+            )
+        },
+    )
+    max_eval_samples: Optional[int] = field(
+        default=None,
+        metadata={
+            "help": (
+                "For debugging purposes or quicker training, truncate the number of validation examples to this "
+                "value if set."
+            )
+        },
+    )
+    max_duration_in_seconds: float = field(
+        default=35.0,
+        metadata={
+            "help": (
+                # Adjacent literals are concatenated verbatim, so each one ends with its own separator.
+                "Filter audio files that are longer than `max_duration_in_seconds` seconds. "
+                "Also, used to set maximum audio length if `pad_to_max_length=True`."
+            )
+        },
+    )
+    min_duration_in_seconds: float = field(
+        default=0.0, metadata={"help": "Filter audio files that are shorter than `min_duration_in_seconds` seconds"}
+    )
+    max_text_length: int = field(
+        default=500, metadata={"help": "If set, max description lengths in number of characters."}
+    )
+    max_prompt_token_length: Optional[int] = field(
+        default=None,
+        metadata={
+            "help": (
+                "If set, filter samples with prompts that are longer than `max_prompt_token_length` tokens. "
+                "Also, used to set maximum prompt token length if `pad_to_max_length=True`."
+            )
+        },
+    )
+    max_description_token_length: Optional[int] = field(
+        default=None,
+        metadata={
+            "help": (
+                "If set, filter samples with descriptions that are longer than `max_description_token_length` tokens. "
+                "Also, used to set maximum description token length if `pad_to_max_length=True`."
+            )
+        },
+    )
+    pad_to_max_length: bool = field(
+        default=False,
+        metadata={
+            "help": (
+                "If `True`, pad audio, prompt and description to a maximum length set with respectively "
+                "`max_duration_in_seconds`, `max_prompt_token_length`, `max_description_token_length`."
+            )
+        },
+    )
+    preprocessing_only: bool = field(
+        default=False,
+        metadata={
+            "help": (
+                "Whether to only do data preprocessing and skip training. This is especially useful when data"
+                " preprocessing errors out in distributed training due to timeout. In this case, one should run the"
+                " preprocessing in a non-distributed setup with `preprocessing_only=True` so that the cached datasets"
+                " can consequently be loaded in distributed training."
+                " In this training script, `save_to_disk` must be set to the path in which the dataset should be saved. "
+            )
+        },
+    )
+    # --- hub access ---
+    token: Optional[str] = field(
+        default=None,
+        metadata={
+            "help": (
+                "The token to use as HTTP bearer authorization for remote files. If not specified, will use the token "
+                "generated when running `huggingface-cli login` (stored in `~/.huggingface`)."
+            )
+        },
+    )
+    use_auth_token: Optional[bool] = field(
+        default=None,
+        metadata={
+            "help": "The `use_auth_token` argument is deprecated and will be removed in v4.34. Please use `token` instead."
+        },
+    )
+    trust_remote_code: bool = field(
+        default=False,
+        metadata={
+            "help": (
+                "Whether or not to allow for custom models defined on the Hub in their own modeling files. This option "
+                "should only be set to `True` for repositories you trust and in which you have read the code, as it will "
+                "execute code present on the Hub on your local machine."
+            )
+        },
+    )
+    # --- logging ---
+    add_audio_samples_to_wandb: bool = field(
+        default=False,
+        metadata={"help": "If set and if `wandb` in args.report_to, will add generated audio samples to wandb logs."},
+    )
+    id_column_name: Optional[str] = field(default=None, metadata={"help": "id column name."})
+    wandb_project: str = field(
+        default="parler-speech",
+        metadata={"help": "The name of the wandb project."},
+    )
+    wandb_run_name: Optional[str] = field(
+        default=None,
+        metadata={
+            "help": "If specified, the name of the run. If not specified, wandb will give a random name to this run."
+        },
+    )
+    # --- on-disk caching ---
+    save_to_disk: Optional[str] = field(
+        default=None,
+        metadata={
+            "help": "If set, will save the dataset to this path if this is an empty folder. If not empty, will load the datasets from it."
+        },
+    )
+    temporary_save_to_disk: Optional[str] = field(
+        default=None, metadata={"help": "Temporarily save audio labels here."}
+    )
+    save_codec_steps: Optional[int] = field(
+        default=500,
+        metadata={"help": "Temporarily save the audio labels every `save_codec_steps`."},
+    )
+    pad_to_multiple_of: Optional[int] = field(
+        default=2,
+        metadata={"help": ("Pad to multiple of for tokenizers.")},
+    )
+    # --- audio roots and source routing ---
+    librittsrmix_dir: Optional[str] = field(
+        default=None,
+        metadata={"help": "librittsrmix audio dir"},
+    )
+    source_column_name: str = field(
+        default="source",
+        metadata={"help": "The name of the source column."},
+    )
+    wandb_key: Optional[str] = field(
+        default=None,
+        metadata={"help": "wandb key name"},
+    )
+
+
+@dataclass
+class ParlerTTSTrainingArguments(Seq2SeqTrainingArguments):
+    """
+    Training arguments that extend `Seq2SeqTrainingArguments` with options for the training dtype,
+    the audio-encoding batch size, evaluation data loading, generation metrics and codebook weights.
+    """
+
+    # --- precision and pre-processing ---
+    dtype: Optional[str] = field(
+        default="float32",
+        metadata={
+            "help": (
+                "The data type (dtype) in which to run training. One of `float32` (full-precision), "
+                "`float16` or `bfloat16` (both half-precision)."
+            )
+        },
+    )
+    audio_encoder_per_device_batch_size: int = field(
+        default=8,
+        metadata={"help": ("Specify the batch size of the audio encoding pre-processing steps.")},
+    )
+    # --- evaluation ---
+    eval_dataloader_num_workers: Optional[int] = field(
+        default=0,
+        metadata={
+            "help": (
+                "Number of subprocesses to use for evaluation data loading (PyTorch only). 0 means that the data will be loaded in the main process."
+            )
+        },
+    )
+    compute_clap_similarity_metric: bool = field(
+        default=True,
+        metadata={
+            "help": (
+                "Whether or not to compute the clap similarity metric between the description and the generation during evaluation."
+            )
+        },
+    )
+    compute_noise_level_metric: bool = field(
+        default=True,
+        metadata={"help": ("Whether or not to compute the squim si-sdr measure of the generations.")},
+    )
+    noise_level_to_compute_clean_wer: float = field(
+        default=25,
+        metadata={
+            "help": (
+                # Adjacent literals are concatenated verbatim, so each one ends with its own separator.
+                "if `compute_noise_level_metric=True`, will compute a 'clean' WER on samples with generated noise higher than `noise_level_to_compute_clean_wer`. "
+                "This is a proxy measure to compute WER on clean audios, provided that the model learns to generate clean audios."
+            )
+        },
+    )
+    eval_generation_steps: Optional[int] = field(
+        default=None,
+        metadata={
+            "help": (
+                "Number of update steps between two generation evaluations. Will default to the same "
+                "value as `eval_steps` if not set. Should be an integer and a multiple of `eval_steps`."
+            )
+        },
+    )
+    # --- loss ---
+    codebook_weights: Optional[List[float]] = field(
+        default=None,
+        metadata={"help": "Weights applied to each codebook."},
+    )
diff --git a/capspeech/ar/training/data.py b/capspeech/ar/training/data.py
new file mode 100644
index 0000000000000000000000000000000000000000..c2ad6069d917de4a9fd216edc46569c506d9ed85
--- /dev/null
+++ b/capspeech/ar/training/data.py
@@ -0,0 +1,277 @@
+import logging
+from dataclasses import dataclass
+from typing import Dict, List, Optional, Set, Union
+import os
+import datasets
+import numpy as np
+import torch
+from accelerate import Accelerator
+from datasets import Dataset, IterableDataset, concatenate_datasets, interleave_datasets, load_dataset
+from tqdm import tqdm
+from transformers import AutoFeatureExtractor, AutoTokenizer
+import torchaudio
+import torchaudio.transforms as T
+
+@dataclass
+class DataCollatorEncodecWithPadding:
+    """
+    Data collator that loads audio files from disk and will dynamically pad them to the longest
+    sequence in the batch or to `max_length` if `max_length` is set and `padding=max_length`.
+
+    Each feature must provide a path under `audio_column_name` and a `"source"` entry. The source
+    selects which of the `*_dir` root directories the path is joined to. Loaded audio is resampled
+    to the feature extractor's sampling rate and averaged down to a single channel.
+    """
+
+    feature_extractor: AutoFeatureExtractor
+    audio_column_name: str
+    mls_dir: Optional[str] = None
+    librittsrmix_dir: Optional[str] = None
+    gigaspeech_dir: Optional[str] = None
+    commonvoice_dir: Optional[str] = None
+    emilia_dir: Optional[str] = None
+    feature_extractor_input_name: Optional[str] = "input_values"
+    max_length: Optional[int] = None
+    padding: Optional[str] = "longest"
+
+    def __call__(self, features: List[Dict[str, Union[List[int], torch.Tensor]]]) -> Dict[str, torch.Tensor]:
+        """
+        Load, resample and pad the audio referenced by `features`.
+
+        Returns the feature extractor's output with an extra `len_audio` entry holding each
+        waveform's length in samples, measured before truncation to `max_length`.
+
+        Raises:
+            ValueError: if a feature's `"source"` is not one of the known sources.
+        """
+        sampling_rate = self.feature_extractor.sampling_rate
+        # Root directory for every supported value of the "source" entry.
+        source_dirs = {
+            "libritts-r": self.librittsrmix_dir,
+            "mls": self.mls_dir,
+            "gigaspeech": self.gigaspeech_dir,
+            "commonvoice": self.commonvoice_dir,
+            "emilia": self.emilia_dir,
+        }
+        # One resampler per original sampling rate, reused across the files of this batch.
+        resamplers = {}
+
+        # load audio
+        audios = []
+        for f in features:
+            path = f[self.audio_column_name]
+            source = f["source"]
+            if source not in source_dirs:
+                raise ValueError(source)
+            path = os.path.join(source_dirs[source], path)
+
+            if not os.path.exists(path):
+                # Best effort: a missing file is reported and skipped, so the returned batch can
+                # hold fewer rows than `features`.
+                print(f"Read error: {path}")
+                continue
+
+            waveform, sr = torchaudio.load(path)
+            if sr != sampling_rate:
+                if sr not in resamplers:
+                    resamplers[sr] = T.Resample(orig_freq=sr, new_freq=sampling_rate)
+                waveform = resamplers[sr](waveform)
+            if waveform.shape[0] > 1:
+                waveform = waveform.mean(dim=0, keepdim=True)
+            # Drop only the channel axis: a bare `squeeze()` would also collapse a one-sample
+            # clip to a 0-dim tensor, on which `len()` fails.
+            audios.append(waveform.squeeze(0))
+
+        len_audio = [len(audio) for audio in audios]
+        if self.max_length is not None:
+            audios = [audio[: min(l, self.max_length)] for audio, l in zip(audios, len_audio)]
+
+        # every waveform was resampled above, so the feature extractor's own sampling rate is
+        # passed along with the audio.
+        batch = self.feature_extractor(
+            [np.asarray(a, dtype=np.float32) for a in audios], sampling_rate=sampling_rate, return_tensors="pt", padding=self.padding, max_length=self.max_length
+        )
+        batch["len_audio"] = torch.tensor(len_audio).unsqueeze(1)
+        return batch
+
+
+@dataclass
+class DataCollatorParlerTTSWithPadding:
+    """
+    Collate features into one padded batch of labels, description ids and prompt ids.
+    Args:
+        prompt_tokenizer (:class:`~transformers.AutoTokenizer`)
+            Tokenizer whose `pad` method batches each feature's `prompt_input_ids`.
+        description_tokenizer (:class:`~transformers.AutoTokenizer`)
+            Tokenizer whose `pad` method batches each feature's `input_ids`.
+        padding (:obj:`bool` or :obj:`str`, `optional`, defaults to :obj:`"longest"`):
+            Padding strategy forwarded unchanged to both tokenizers' `pad` calls. When it equals
+            :obj:`"max_length"` and `audio_max_length` is set, the labels are additionally padded
+            up to `audio_max_length`.
+        pad_to_multiple_of (:obj:`int`, `optional`):
+            Forwarded to both tokenizers' `pad` calls.
+        prompt_max_length (:obj:`int`, `optional`):
+            `max_length` given to the prompt tokenizer's `pad` call.
+        description_max_length (:obj:`int`, `optional`):
+            `max_length` given to the description tokenizer's `pad` call.
+        audio_max_length (:obj:`int`, `optional`):
+            Minimum size of the labels' second axis when `padding == "max_length"`.
+    """
+
+    prompt_tokenizer: AutoTokenizer
+    description_tokenizer: AutoTokenizer
+    padding: Union[bool, str] = "longest"
+    pad_to_multiple_of: Optional[int] = None
+    prompt_max_length: Optional[int] = None
+    description_max_length: Optional[int] = None
+    audio_max_length: Optional[int] = None
+
+    def __call__(self, features: List[Dict[str, Union[List[int], torch.Tensor]]]) -> Dict[str, torch.Tensor]:
+        """Return a dict with `labels`, the padded description encoding and the padded prompt ids."""
+        # Labels are padded with -100, while token ids go through each tokenizer's own `pad`.
+        per_example_labels = []
+        for item in features:
+            # swap the first two axes so that padding runs along the leading axis of each example
+            per_example_labels.append(torch.tensor(item["labels"]).transpose(0, 1))
+        padded_labels = torch.nn.utils.rnn.pad_sequence(
+            per_example_labels, batch_first=True, padding_value=-100
+        )
+        if self.padding == "max_length" and self.audio_max_length is not None:
+            # never negative: labels already longer than `audio_max_length` are left untouched
+            extra_steps = max(self.audio_max_length - padded_labels.shape[1], 0)
+            padded_labels = torch.nn.functional.pad(padded_labels, pad=(0, 0, 0, extra_steps), value=-100)
+
+        description_batch = self.description_tokenizer.pad(
+            [{"input_ids": item["input_ids"]} for item in features],
+            return_tensors="pt",
+            padding=self.padding,
+            pad_to_multiple_of=self.pad_to_multiple_of,
+            max_length=self.description_max_length,
+        )
+
+        collated = {"labels": padded_labels, **description_batch}
+
+        prompt_batch = self.prompt_tokenizer.pad(
+            [{"input_ids": item["prompt_input_ids"]} for item in features],
+            return_tensors="pt",
+            padding=self.padding,
+            pad_to_multiple_of=self.pad_to_multiple_of,
+            max_length=self.prompt_max_length,
+        )
+
+        collated["prompt_input_ids"] = prompt_batch["input_ids"]
+        # the prompt mask is only forwarded when the tokenizer produced one
+        if "attention_mask" in prompt_batch:
+            collated["prompt_attention_mask"] = prompt_batch["attention_mask"]
+
+        return collated
+
+
+def convert_dataset_str_to_list(
+    dataset_names,
+    splits=None,
+    dataset_samples=None,
+    default_split="train",
+):
+    """
+    Normalise dataset names, splits and sample counts into one list of per-dataset dicts.
+
+    Each of `dataset_names`, `splits` and `dataset_samples` may be a '+'-separated string or an
+    already-split sequence. Every argument is normalised on its own, so a string can be mixed
+    with a list.
+
+    Args:
+        dataset_names: dataset ids, e.g. `"a+b"` or `["a", "b"]`.
+        splits: one split per dataset, or `None` to use `default_split` for every dataset.
+        dataset_samples: one sample count per dataset (converted to `float`), or `None`.
+        default_split: split assigned to every dataset when `splits` is `None`.
+
+    Returns:
+        A list of `{"name", "split", "samples"}` dicts, one per dataset, in input order.
+        `"samples"` is `None` when `dataset_samples` is `None`.
+
+    Raises:
+        ValueError: if `splits` or `dataset_samples` does not have one entry per dataset.
+    """
+    # Split each '+'-separated string independently of the type of the other arguments.
+    if isinstance(dataset_names, str):
+        dataset_names = dataset_names.split("+")
+    if isinstance(splits, str):
+        splits = splits.split("+")
+    if isinstance(dataset_samples, str):
+        dataset_samples = dataset_samples.split("+")
+
+    if splits is not None and len(splits) != len(dataset_names):
+        raise ValueError(
+            f"Ensure one split is passed for each dataset, got {len(dataset_names)} datasets and {len(splits)} splits."
+        )
+
+    if dataset_samples is not None:
+        if len(dataset_samples) != len(dataset_names):
+            raise ValueError(
+                f"Ensure one sample is passed for each dataset, got {len(dataset_names)} datasets and "
+                f"{len(dataset_samples)} samples."
+            )
+        dataset_samples = [float(ds_sample) for ds_sample in dataset_samples]
+    else:
+        dataset_samples = [None] * len(dataset_names)
+
+    if splits is None:
+        splits = [default_split] * len(dataset_names)
+
+    return [
+        {"name": ds_name, "split": ds_split, "samples": ds_samples}
+        for ds_name, ds_split, ds_samples in zip(dataset_names, splits, dataset_samples)
+    ]
+
+
+def load_multiple_datasets(
+    accelerator: Accelerator,
+    dataset_names: Union[List, str],
+    splits: Optional[Union[List, str]] = None,
+    label_column_names: Optional[List] = None,
+    stopping_strategy: Optional[str] = "first_exhausted",
+    dataset_samples: Optional[Union[List, np.array]] = None,
+    streaming: Optional[bool] = False,
+    seed: Optional[int] = None,
+    id_column_name: Optional[str] = None,
+    columns_to_keep: Optional[Set[str]] = None,
+    prompt_column_name: Optional[str] = None,
+    sampling_rate: Optional[int] = None,
+    audio_column_name: Optional[str] = None,
+    logger: Optional[logging.Logger] = None,
+    librittsrmix_dir: Optional[Union[List, str]] = None,
+    mls_dir: Optional[Union[List, str]] = None,
+    gigaspeech_dir: Optional[Union[List, str]] = None,
+    commonvoice_dir: Optional[Union[List, str]] = None,
+    emilia_dir: Optional[Union[List, str]] = None,
+    **kwargs,
+) -> Union[Dataset, IterableDataset]:
+    """
+    Load one or more datasets, drop examples whose audio file is missing, and combine them.
+
+    Each dataset is loaded with `load_dataset` (extra `kwargs` are forwarded), optionally reduced
+    to `columns_to_keep`, and filtered so that only examples with a known `"source"` and an
+    existing file at `<source dir>/<example["audio_path"]>` remain.
+
+    Args:
+        accelerator: provides `local_main_process_first()`, used around loading and filtering.
+        dataset_names: dataset ids, as a list or a '+'-separated string.
+        splits: one split per dataset, as a list or a '+'-separated string.
+        stopping_strategy: forwarded to `interleave_datasets` (streaming only).
+        dataset_samples: per-dataset sample counts, normalised into interleaving probabilities
+            (streaming only).
+        streaming: whether to load the datasets in streaming mode.
+        seed: forwarded to `interleave_datasets` (streaming only).
+        columns_to_keep: if set, every other column is removed.
+        librittsrmix_dir, mls_dir, gigaspeech_dir, commonvoice_dir, emilia_dir: root directory
+            for the sources "libritts-r", "mls", "gigaspeech", "commonvoice" and "emilia".
+        label_column_names, id_column_name, prompt_column_name, sampling_rate,
+        audio_column_name, logger: accepted but not used by this function.
+
+    Returns:
+        The single filtered dataset when only one was requested. Otherwise the datasets are
+        interleaved when `streaming` is set, or concatenated when it is not.
+    """
+    # Pass by keyword: the helper's positional order is (names, splits, dataset_samples,
+    # default_split), so label columns must not be slotted in between.
+    dataset_names_dict = convert_dataset_str_to_list(
+        dataset_names, splits=splits, dataset_samples=dataset_samples
+    )
+
+    if dataset_samples is not None:
+        dataset_samples = [ds_dict["samples"] for ds_dict in dataset_names_dict]
+        probabilities = np.array(dataset_samples) / np.sum(dataset_samples)
+    else:
+        probabilities = None
+
+    # Root directory for every supported value of an example's "source" entry.
+    source_dirs = {
+        "libritts-r": librittsrmix_dir,
+        "mls": mls_dir,
+        "gigaspeech": gigaspeech_dir,
+        "commonvoice": commonvoice_dir,
+        "emilia": emilia_dir,
+    }
+
+    def resolve_path(example):
+        """Return True when the example's source is known and its audio file exists on disk."""
+        path = example["audio_path"]
+        source = example["source"]
+        if source not in source_dirs:
+            return False  # unknown source
+        return os.path.exists(os.path.join(source_dirs[source], path))
+
+    # `num_proc` is only passed to `filter` for non-streaming datasets: the streaming
+    # `IterableDataset.filter` does not take it.
+    filter_kwargs = {} if streaming else {"num_proc": 16}
+
+    all_datasets = []
+    # iterate over the datasets we want to interleave
+    for dataset_dict in tqdm(dataset_names_dict, desc="Combining datasets..."):
+        with accelerator.local_main_process_first():
+            dataset = load_dataset(
+                dataset_dict["name"],
+                split=dataset_dict["split"],
+                streaming=streaming,
+                **kwargs,
+            )
+            dataset_features = dataset.features.keys()
+
+            if columns_to_keep is not None:
+                dataset = dataset.remove_columns(set(dataset_features - columns_to_keep))
+
+            dataset = dataset.filter(resolve_path, **filter_kwargs)
+
+        all_datasets.append(dataset)
+
+    if len(all_datasets) == 1:
+        # we have a single dataset so just return it as is
+        return all_datasets[0]
+
+    if streaming:
+        interleaved_dataset = interleave_datasets(
+            all_datasets,
+            stopping_strategy=stopping_strategy,
+            probabilities=probabilities,
+            seed=seed,
+        )
+    else:
+        with accelerator.local_main_process_first():
+            interleaved_dataset = concatenate_datasets(all_datasets)
+
+    return interleaved_dataset
diff --git a/capspeech/ar/training/data_captts.py b/capspeech/ar/training/data_captts.py
new file mode 100644
index 0000000000000000000000000000000000000000..227ab2198c8679a155854c5436e7c778f619792b
--- /dev/null
+++ b/capspeech/ar/training/data_captts.py
@@ -0,0 +1,255 @@
+import logging
+from dataclasses import dataclass
+from typing import Dict, List, Optional, Set, Union
+import os
+import datasets
+import numpy as np
+import torch
+from accelerate import Accelerator
+from datasets import Dataset, IterableDataset, concatenate_datasets, interleave_datasets, load_dataset
+from tqdm import tqdm
+from transformers import AutoFeatureExtractor, AutoTokenizer
+import torchaudio
+import torchaudio.transforms as T
+
+@dataclass
+class DataCollatorEncodecWithPadding:
+    """
+    Load the audio files referenced by a batch and pad them with the feature extractor.
+
+    Each feature must carry a path under `audio_column_name` and a `source` key. Paths whose source is
+    `"libritts-r"` are joined onto `librittsr_dir`; every other source is joined onto `other_dir`.
+    Audio is resampled to the feature extractor's sampling rate and down-mixed to mono.
+    Padding goes to the longest sequence in the batch, or to `max_length` if `padding="max_length"`.
+    """
+
+    feature_extractor: AutoFeatureExtractor
+    audio_column_name: str
+    librittsr_dir: Optional[str] = None
+    other_dir: Optional[str] = None
+    feature_extractor_input_name: Optional[str] = "input_values"
+    max_length: Optional[int] = None
+    padding: Optional[str] = "longest"
+
+    def __call__(self, features: List[Dict[str, Union[List[int], torch.Tensor]]]) -> Dict[str, torch.Tensor]:
+        """Return the padded extractor output plus `len_audio`, the pre-truncation sample counts, shape (n, 1)."""
+        sampling_rate = self.feature_extractor.sampling_rate
+        audios = []
+        for f in features:
+            path = f[self.audio_column_name]
+            # "libritts-r" has its own root directory; every other source shares `other_dir`
+            root = self.librittsr_dir if f["source"] == "libritts-r" else self.other_dir
+            path = os.path.join(root, path)
+
+            if not os.path.exists(path):
+                # NOTE(review): a skipped file leaves the batch with fewer items than `features` - confirm callers cope
+                print(f"Read error: {path}")
+                continue
+
+            waveform, sr = torchaudio.load(path)
+            if sr != sampling_rate:
+                waveform = T.Resample(orig_freq=sr, new_freq=sampling_rate)(waveform)
+            if waveform.shape[0] > 1:
+                waveform = waveform.mean(dim=0, keepdim=True)
+            # drop only the channel axis: a bare squeeze() turns a 1-sample clip into a 0-d tensor
+            audios.append(waveform.squeeze(0))
+
+        len_audio = [len(audio) for audio in audios]
+        if self.max_length is not None:
+            audios = [audio[: self.max_length] for audio in audios]
+
+        # the waveforms were resampled above, so they all match the extractor's own sampling rate
+        batch = self.feature_extractor(
+            [np.asarray(a, dtype=np.float32) for a in audios], sampling_rate=sampling_rate, return_tensors="pt", padding=self.padding, max_length=self.max_length
+        )
+        batch["len_audio"] = torch.tensor(len_audio).unsqueeze(1)
+        return batch
+
+
+@dataclass
+class DataCollatorParlerTTSWithPadding:
+    """
+    Collate tokenized descriptions, tokenized prompts and audio-code labels into one padded batch.
+
+    Args:
+        prompt_tokenizer (:class:`~transformers.AutoTokenizer`):
+            Tokenizer whose `pad` method is applied to each feature's `prompt_input_ids`.
+        description_tokenizer (:class:`~transformers.AutoTokenizer`):
+            Tokenizer whose `pad` method is applied to each feature's `input_ids`.
+        padding (:obj:`bool` or :obj:`str`, `optional`, defaults to :obj:`"longest"`):
+            Padding strategy forwarded to both tokenizers. When it equals :obj:`"max_length"` and
+            `audio_max_length` is set, the labels are also padded up to `audio_max_length`.
+        pad_to_multiple_of (:obj:`int`, `optional`):
+            Forwarded unchanged to both tokenizers' `pad` calls.
+        prompt_max_length (:obj:`int`, `optional`):
+            `max_length` given to the prompt tokenizer.
+        description_max_length (:obj:`int`, `optional`):
+            `max_length` given to the description tokenizer.
+        audio_max_length (:obj:`int`, `optional`):
+            Target label length along dimension 1; only used when `padding == "max_length"`.
+    """
+
+    prompt_tokenizer: AutoTokenizer
+    description_tokenizer: AutoTokenizer
+    padding: Union[bool, str] = "longest"
+    pad_to_multiple_of: Optional[int] = None
+    prompt_max_length: Optional[int] = None
+    description_max_length: Optional[int] = None
+    audio_max_length: Optional[int] = None
+
+    def __call__(self, features: List[Dict[str, Union[List[int], torch.Tensor]]]) -> Dict[str, torch.Tensor]:
+        """
+        Build one training batch from per-example features.
+
+        Returns a dict holding `labels`, every key produced by the description tokenizer's `pad`,
+        `prompt_input_ids` and, if the prompt tokenizer emits one, `prompt_attention_mask`.
+        """
+        # labels and token ids have different lengths, so each gets its own padding scheme;
+        # after padding with -100 the labels are (bsz, seq_len, num_codebooks)
+        label_seqs = [torch.tensor(item["labels"]).transpose(0, 1) for item in features]
+        labels = torch.nn.utils.rnn.pad_sequence(label_seqs, batch_first=True, padding_value=-100)
+        if self.padding == "max_length" and self.audio_max_length is not None:
+            missing = max(self.audio_max_length - labels.shape[1], 0)
+            labels = torch.nn.functional.pad(labels, pad=(0, 0, 0, missing), value=-100)
+
+        # arguments shared by both tokenizer `pad` calls
+        shared_pad_kwargs = {
+            "return_tensors": "pt",
+            "padding": self.padding,
+            "pad_to_multiple_of": self.pad_to_multiple_of,
+        }
+
+        descriptions = self.description_tokenizer.pad(
+            [{"input_ids": item["input_ids"]} for item in features],
+            max_length=self.description_max_length,
+            **shared_pad_kwargs,
+        )
+        batch = {"labels": labels, **descriptions}
+
+        prompts = self.prompt_tokenizer.pad(
+            [{"input_ids": item["prompt_input_ids"]} for item in features],
+            max_length=self.prompt_max_length,
+            **shared_pad_kwargs,
+        )
+        batch["prompt_input_ids"] = prompts["input_ids"]
+        if "attention_mask" in prompts:
+            batch["prompt_attention_mask"] = prompts["attention_mask"]
+
+        return batch
+
+
+def convert_dataset_str_to_list(
+    dataset_names,
+    splits=None,
+    dataset_samples=None,
+    default_split="train",
+):
+    """
+    Expand dataset specifications into a list of `{"name", "split", "samples"}` dicts, one per dataset.
+
+    `dataset_names`, `splits` and `dataset_samples` may each be a list or a "+"-separated string
+    (e.g. `"ds_a+ds_b"`). Missing splits fall back to `default_split`; samples are cast to `float`.
+    Raises `ValueError` if `splits` or `dataset_samples` does not have one entry per dataset.
+    """
+    # a bare string is a "+"-joined list; splitting each argument on its own keeps mixed inputs valid
+    dataset_names, splits, dataset_samples = (
+        arg.split("+") if isinstance(arg, str) else arg for arg in (dataset_names, splits, dataset_samples)
+    )
+    num_datasets = len(dataset_names)
+
+    if splits is None:
+        splits = [default_split] * num_datasets
+    elif len(splits) != num_datasets:
+        raise ValueError(
+            f"Ensure one split is passed for each dataset, got {num_datasets} datasets and {len(splits)} splits."
+        )
+
+    if dataset_samples is None:
+        dataset_samples = [None] * num_datasets
+    elif len(dataset_samples) != num_datasets:
+        raise ValueError(
+            f"Ensure one sample is passed for each dataset, got {num_datasets} datasets and "
+            f"{len(dataset_samples)} samples."
+        )
+    else:
+        dataset_samples = [float(ds_sample) for ds_sample in dataset_samples]
+
+    rows = zip(dataset_names, splits, dataset_samples)
+    return [{"name": name, "split": split, "samples": samples} for name, split, samples in rows]
+
+
+def load_multiple_datasets(
+    accelerator: Accelerator,
+    dataset_names: Union[List, str],
+    splits: Optional[Union[List, str]] = None,
+    label_column_names: Optional[List] = None,
+    stopping_strategy: Optional[str] = "first_exhausted",
+    dataset_samples: Optional[Union[List, np.array]] = None,
+    streaming: Optional[bool] = False,
+    seed: Optional[int] = None,
+    id_column_name: Optional[str] = None,
+    columns_to_keep: Optional[Set[str]] = None,
+    prompt_column_name: Optional[str] = None,
+    sampling_rate: Optional[int] = None,
+    audio_column_name: Optional[str] = None,
+    logger: Optional[logging.Logger] = None,
+    librittsr_dir: Optional[Union[List, str]] = None,
+    other_dir: Optional[Union[List, str]] = None,
+    **kwargs,
+) -> Union[Dataset, IterableDataset]:
+    """
+    Load every requested dataset, drop rows whose audio file is missing on disk, and combine them.
+
+    A row's `audio_path` is resolved against `librittsr_dir` when its `source` is `"libritts-r"` and
+    against `other_dir` otherwise. A single dataset is returned as is; several are interleaved with
+    probabilities proportional to `dataset_samples` when `streaming`, otherwise concatenated.
+    `columns_to_keep`, if given, names the only columns retained; extra `kwargs` go to `load_dataset`.
+    NOTE(review): `label_column_names`, `id_column_name`, `prompt_column_name`, `sampling_rate`,
+    `audio_column_name` and `logger` are accepted but unused in this function.
+    """
+    # pass by keyword: positionally, `label_column_names` landed in the helper's `dataset_samples` slot
+    dataset_names_dict = convert_dataset_str_to_list(dataset_names, splits=splits, dataset_samples=dataset_samples)
+
+    probabilities = None
+    if dataset_samples is not None:
+        dataset_samples = [ds_dict["samples"] for ds_dict in dataset_names_dict]
+        probabilities = np.array(dataset_samples) / np.sum(dataset_samples)
+
+    def resolve_path(example):
+        # keep a row only if its audio file exists under the root directory for its source
+        root = librittsr_dir if example["source"] == "libritts-r" else other_dir
+        return os.path.exists(os.path.join(root, example["audio_path"]))
+
+    # `IterableDataset.filter` has no `num_proc` argument, so only pass it for map-style datasets
+    filter_kwargs = {} if streaming else {"num_proc": 16}
+
+    all_datasets = []
+    # iterate over the datasets we want to interleave
+    for dataset_dict in tqdm(dataset_names_dict, desc="Combining datasets..."):
+        with accelerator.local_main_process_first():
+            dataset = load_dataset(
+                dataset_dict["name"],
+                split=dataset_dict["split"],
+                streaming=streaming,
+                **kwargs,
+            )
+            if columns_to_keep is not None:
+                dataset = dataset.remove_columns(set(dataset.features.keys() - columns_to_keep))
+            dataset = dataset.filter(resolve_path, **filter_kwargs)
+
+        all_datasets.append(dataset)
+
+    if len(all_datasets) == 1:
+        # we have a single dataset so just return it as is
+        return all_datasets[0]
+
+    if streaming:
+        return interleave_datasets(
+            all_datasets,
+            stopping_strategy=stopping_strategy,
+            probabilities=probabilities,
+            seed=seed,
+        )
+
+    with accelerator.local_main_process_first():
+        return concatenate_datasets(all_datasets)
diff --git a/capspeech/ar/training/data_capttsse.py b/capspeech/ar/training/data_capttsse.py
new file mode 100644
index 0000000000000000000000000000000000000000..be690df76b58d5b4b8fc6289cf6f95031ab84296
--- /dev/null
+++ b/capspeech/ar/training/data_capttsse.py
@@ -0,0 +1,253 @@
+import logging
+from dataclasses import dataclass
+from typing import Dict, List, Optional, Set, Union
+import os
+import datasets
+import numpy as np
+import torch
+from accelerate import Accelerator
+from datasets import Dataset, IterableDataset, concatenate_datasets, interleave_datasets, load_dataset
+from tqdm import tqdm
+from transformers import AutoFeatureExtractor, AutoTokenizer
+import torchaudio
+import torchaudio.transforms as T
+
+@dataclass
+class DataCollatorEncodecWithPadding:
+    """
+    Load the audio files referenced by a batch and pad them with the feature extractor.
+
+    Each feature must carry a path under `audio_column_name` and a `source` key. Paths whose source is
+    `"libritts-r"` are joined onto `librittsrmix_dir`; any other source raises `ValueError`.
+    Audio is resampled to the feature extractor's sampling rate and down-mixed to mono.
+    Padding goes to the longest sequence in the batch, or to `max_length` if `padding="max_length"`.
+    """
+
+    feature_extractor: AutoFeatureExtractor
+    audio_column_name: str
+    librittsrmix_dir: Optional[str] = None
+    feature_extractor_input_name: Optional[str] = "input_values"
+    max_length: Optional[int] = None
+    padding: Optional[str] = "longest"
+
+    def __call__(self, features: List[Dict[str, Union[List[int], torch.Tensor]]]) -> Dict[str, torch.Tensor]:
+        """Return the padded extractor output plus `len_audio`, the pre-truncation sample counts, shape (n, 1)."""
+        sampling_rate = self.feature_extractor.sampling_rate
+        audios = []
+        for f in features:
+            path = f[self.audio_column_name]
+            source = f["source"]
+            if source != "libritts-r":
+                raise ValueError(source)
+            path = os.path.join(self.librittsrmix_dir, path)
+            if not os.path.exists(path):
+                # NOTE(review): a skipped file leaves the batch with fewer items than `features` - confirm callers cope
+                print(f"Read error: {path}")
+                continue
+
+            waveform, sr = torchaudio.load(path)
+            if sr != sampling_rate:
+                waveform = T.Resample(orig_freq=sr, new_freq=sampling_rate)(waveform)
+            if waveform.shape[0] > 1:
+                waveform = waveform.mean(dim=0, keepdim=True)
+            # drop only the channel axis: a bare squeeze() turns a 1-sample clip into a 0-d tensor
+            audios.append(waveform.squeeze(0))
+
+        len_audio = [len(audio) for audio in audios]
+        if self.max_length is not None:
+            audios = [audio[: self.max_length] for audio in audios]
+
+        # the waveforms were resampled above, so they all match the extractor's own sampling rate
+        batch = self.feature_extractor(
+            [np.asarray(a, dtype=np.float32) for a in audios], sampling_rate=sampling_rate, return_tensors="pt", padding=self.padding, max_length=self.max_length
+        )
+        batch["len_audio"] = torch.tensor(len_audio).unsqueeze(1)
+        return batch
+
+
+@dataclass
+class DataCollatorParlerTTSWithPadding:
+    """
+    Collate tokenized descriptions, tokenized prompts and audio-code labels into one padded batch.
+
+    Args:
+        prompt_tokenizer (:class:`~transformers.AutoTokenizer`):
+            Tokenizer whose `pad` method is applied to each feature's `prompt_input_ids`.
+        description_tokenizer (:class:`~transformers.AutoTokenizer`):
+            Tokenizer whose `pad` method is applied to each feature's `input_ids`.
+        padding (:obj:`bool` or :obj:`str`, `optional`, defaults to :obj:`"longest"`):
+            Padding strategy forwarded to both tokenizers. When it equals :obj:`"max_length"` and
+            `audio_max_length` is set, the labels are also padded up to `audio_max_length`.
+        pad_to_multiple_of (:obj:`int`, `optional`):
+            Forwarded unchanged to both tokenizers' `pad` calls.
+        prompt_max_length (:obj:`int`, `optional`):
+            `max_length` given to the prompt tokenizer.
+        description_max_length (:obj:`int`, `optional`):
+            `max_length` given to the description tokenizer.
+        audio_max_length (:obj:`int`, `optional`):
+            Target label length along dimension 1; only used when `padding == "max_length"`.
+    """
+
+    prompt_tokenizer: AutoTokenizer
+    description_tokenizer: AutoTokenizer
+    padding: Union[bool, str] = "longest"
+    pad_to_multiple_of: Optional[int] = None
+    prompt_max_length: Optional[int] = None
+    description_max_length: Optional[int] = None
+    audio_max_length: Optional[int] = None
+
+    def __call__(self, features: List[Dict[str, Union[List[int], torch.Tensor]]]) -> Dict[str, torch.Tensor]:
+        """
+        Build one training batch from per-example features.
+
+        Returns a dict holding `labels`, every key produced by the description tokenizer's `pad`,
+        `prompt_input_ids` and, if the prompt tokenizer emits one, `prompt_attention_mask`.
+        """
+        # labels and token ids have different lengths, so each gets its own padding scheme;
+        # after padding with -100 the labels are (bsz, seq_len, num_codebooks)
+        label_seqs = [torch.tensor(item["labels"]).transpose(0, 1) for item in features]
+        labels = torch.nn.utils.rnn.pad_sequence(label_seqs, batch_first=True, padding_value=-100)
+        if self.padding == "max_length" and self.audio_max_length is not None:
+            missing = max(self.audio_max_length - labels.shape[1], 0)
+            labels = torch.nn.functional.pad(labels, pad=(0, 0, 0, missing), value=-100)
+
+        # arguments shared by both tokenizer `pad` calls
+        shared_pad_kwargs = {
+            "return_tensors": "pt",
+            "padding": self.padding,
+            "pad_to_multiple_of": self.pad_to_multiple_of,
+        }
+
+        descriptions = self.description_tokenizer.pad(
+            [{"input_ids": item["input_ids"]} for item in features],
+            max_length=self.description_max_length,
+            **shared_pad_kwargs,
+        )
+        batch = {"labels": labels, **descriptions}
+
+        prompts = self.prompt_tokenizer.pad(
+            [{"input_ids": item["prompt_input_ids"]} for item in features],
+            max_length=self.prompt_max_length,
+            **shared_pad_kwargs,
+        )
+        batch["prompt_input_ids"] = prompts["input_ids"]
+        if "attention_mask" in prompts:
+            batch["prompt_attention_mask"] = prompts["attention_mask"]
+
+        return batch
+
+
+def convert_dataset_str_to_list(
+    dataset_names,
+    splits=None,
+    dataset_samples=None,
+    default_split="train",
+):
+    """
+    Expand dataset specifications into a list of `{"name", "split", "samples"}` dicts, one per dataset.
+
+    `dataset_names`, `splits` and `dataset_samples` may each be a list or a "+"-separated string
+    (e.g. `"ds_a+ds_b"`). Missing splits fall back to `default_split`; samples are cast to `float`.
+    Raises `ValueError` if `splits` or `dataset_samples` does not have one entry per dataset.
+    """
+    # a bare string is a "+"-joined list; splitting each argument on its own keeps mixed inputs valid
+    dataset_names, splits, dataset_samples = (
+        arg.split("+") if isinstance(arg, str) else arg for arg in (dataset_names, splits, dataset_samples)
+    )
+    num_datasets = len(dataset_names)
+
+    if splits is None:
+        splits = [default_split] * num_datasets
+    elif len(splits) != num_datasets:
+        raise ValueError(
+            f"Ensure one split is passed for each dataset, got {num_datasets} datasets and {len(splits)} splits."
+        )
+
+    if dataset_samples is None:
+        dataset_samples = [None] * num_datasets
+    elif len(dataset_samples) != num_datasets:
+        raise ValueError(
+            f"Ensure one sample is passed for each dataset, got {num_datasets} datasets and "
+            f"{len(dataset_samples)} samples."
+        )
+    else:
+        dataset_samples = [float(ds_sample) for ds_sample in dataset_samples]
+
+    rows = zip(dataset_names, splits, dataset_samples)
+    return [{"name": name, "split": split, "samples": samples} for name, split, samples in rows]
+
+
+def load_multiple_datasets(
+    accelerator: Accelerator,
+    dataset_names: Union[List, str],
+    splits: Optional[Union[List, str]] = None,
+    label_column_names: Optional[List] = None,
+    stopping_strategy: Optional[str] = "first_exhausted",
+    dataset_samples: Optional[Union[List, np.array]] = None,
+    streaming: Optional[bool] = False,
+    seed: Optional[int] = None,
+    id_column_name: Optional[str] = None,
+    columns_to_keep: Optional[Set[str]] = None,
+    prompt_column_name: Optional[str] = None,
+    sampling_rate: Optional[int] = None,
+    audio_column_name: Optional[str] = None,
+    logger: Optional[logging.Logger] = None,
+    librittsrmix_dir: Optional[Union[List, str]] = None,
+    **kwargs,
+) -> Union[Dataset, IterableDataset]:
+    """
+    Load every requested dataset, keep only rows whose audio file exists on disk, and combine them.
+
+    Only rows with `source == "libritts-r"` are kept; their `audio_path` is resolved against
+    `librittsrmix_dir`. A single dataset is returned as is; several are interleaved with
+    probabilities proportional to `dataset_samples` when `streaming`, otherwise concatenated.
+    `columns_to_keep`, if given, names the only columns retained; extra `kwargs` go to `load_dataset`.
+    NOTE(review): `label_column_names`, `id_column_name`, `prompt_column_name`, `sampling_rate`,
+    `audio_column_name` and `logger` are accepted but unused in this function.
+    """
+    # pass by keyword: positionally, `label_column_names` landed in the helper's `dataset_samples` slot
+    dataset_names_dict = convert_dataset_str_to_list(dataset_names, splits=splits, dataset_samples=dataset_samples)
+
+    probabilities = None
+    if dataset_samples is not None:
+        dataset_samples = [ds_dict["samples"] for ds_dict in dataset_names_dict]
+        probabilities = np.array(dataset_samples) / np.sum(dataset_samples)
+
+    def resolve_path(example):
+        if example["source"] != "libritts-r":
+            return False  # unknown source
+        return os.path.exists(os.path.join(librittsrmix_dir, example["audio_path"]))
+
+    # `IterableDataset.filter` has no `num_proc` argument, so only pass it for map-style datasets
+    filter_kwargs = {} if streaming else {"num_proc": 16}
+
+    all_datasets = []
+    # iterate over the datasets we want to interleave
+    for dataset_dict in tqdm(dataset_names_dict, desc="Combining datasets..."):
+        with accelerator.local_main_process_first():
+            dataset = load_dataset(
+                dataset_dict["name"],
+                split=dataset_dict["split"],
+                streaming=streaming,
+                **kwargs,
+            )
+            if columns_to_keep is not None:
+                dataset = dataset.remove_columns(set(dataset.features.keys() - columns_to_keep))
+            dataset = dataset.filter(resolve_path, **filter_kwargs)
+
+        all_datasets.append(dataset)
+
+    if len(all_datasets) == 1:
+        # we have a single dataset so just return it as is
+        return all_datasets[0]
+
+    if streaming:
+        return interleave_datasets(
+            all_datasets,
+            stopping_strategy=stopping_strategy,
+            probabilities=probabilities,
+            seed=seed,
+        )
+
+    with accelerator.local_main_process_first():
+        return concatenate_datasets(all_datasets)
diff --git a/capspeech/ar/training/finetune_captts.py b/capspeech/ar/training/finetune_captts.py
new file mode 100644
index 0000000000000000000000000000000000000000..2ecefb881c977c56a8cc6d3355e1f082e3dac78e
--- /dev/null
+++ b/capspeech/ar/training/finetune_captts.py
@@ -0,0 +1,1270 @@
+#!/usr/bin/env python
+# coding=utf-8
+# Copyright 2024 The HuggingFace Inc. team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+""" Train Parler-TTS using 🤗 Accelerate"""
+
+import logging
+import os
+import re
+import sys
+import time
+import math
+import contextlib
+from multiprocess import set_start_method
+from datetime import timedelta
+import inspect
+from tqdm import tqdm
+from pathlib import Path
+import wandb
+
+import torch
+from torch.utils.data import DataLoader
+
+import datasets
+from datasets import DatasetDict, Dataset, IterableDataset, concatenate_datasets
+
+from huggingface_hub import HfApi
+
+import transformers
+from transformers import AutoFeatureExtractor, AutoTokenizer, HfArgumentParser
+from transformers.trainer_pt_utils import LengthGroupedSampler
+from transformers.optimization import get_scheduler
+from transformers.utils import send_example_telemetry
+
+
+from accelerate import Accelerator, skip_first_batches
+from accelerate.utils import set_seed, AutocastKwargs, InitProcessGroupKwargs, TorchDynamoPlugin, DistributedDataParallelKwargs
+from accelerate.utils.memory import release_memory
+
+from parler_tts import (
+    ParlerTTSConfig,
+    ParlerTTSForConditionalGeneration,
+    build_delay_pattern_mask,
+)
+
+from training.utils import (
+    get_last_checkpoint,
+    rotate_checkpoints,
+    log_pred,
+    log_metric,
+    load_all_codec_checkpoints,
+    save_codec_checkpoint,
+    get_last_codec_checkpoint_step,
+)
+from training.arguments_captts import ModelArguments, DataTrainingArguments, ParlerTTSTrainingArguments
+from training.data_captts import load_multiple_datasets, DataCollatorParlerTTSWithPadding, DataCollatorEncodecWithPadding
+from training.eval import clap_similarity, wer, si_sdr
+
+logger = logging.getLogger(__name__)
+
+
+def main():
+    # See all possible arguments in src/transformers/training_args.py
+    # or by passing the --help flag to this script.
+    # We now keep distinct sets of args, for a cleaner separation of concerns.
+
+    parser = HfArgumentParser((ModelArguments, DataTrainingArguments, ParlerTTSTrainingArguments))
+    if len(sys.argv) == 2 and sys.argv[1].endswith(".json"):
+        # If we pass only one argument to the script and it's the path to a json file,
+        # let's parse it to get our arguments.
+        model_args, data_args, training_args = parser.parse_json_file(json_file=os.path.abspath(sys.argv[1]))
+    else:
+        model_args, data_args, training_args = parser.parse_args_into_dataclasses()
+
+    # Sending telemetry. Tracking the example usage helps us better allocate resources to maintain them. The
+    # information sent is the one passed as arguments along with your Python/PyTorch versions.
+    send_example_telemetry("run_parler_tts", model_args, data_args)
+    
+    if data_args.wandb_key is not None:
+        wandb.login(key=data_args.wandb_key)
+
+    if training_args.dtype == "float16":
+        mixed_precision = "fp16"
+        torch_dtype = torch.float16
+    elif training_args.dtype == "bfloat16":
+        mixed_precision = "bf16"
+        torch_dtype = torch.bfloat16
+    else:
+        mixed_precision = "no"
+        torch_dtype = torch.float32
+
+    if data_args.pad_to_max_length and (
+        data_args.max_duration_in_seconds is None
+        or data_args.max_prompt_token_length is None
+        or data_args.max_description_token_length is None
+    ):
+        raise ValueError(
+            "`pad_to_max_length` is `True` but one of the following parameters has not been set: `max_duration_in_seconds`, `max_prompt_token_length`, `max_description_token_length`"
+        )
+
+    padding = "max_length" if data_args.pad_to_max_length else "longest"
+
+    ####### A. Preparation
+    kwargs_handlers = [InitProcessGroupKwargs(timeout=timedelta(minutes=120)), DistributedDataParallelKwargs(find_unused_parameters=False)]
+
+    accelerator = Accelerator(
+        gradient_accumulation_steps=training_args.gradient_accumulation_steps,
+        mixed_precision=mixed_precision,
+        log_with=training_args.report_to,
+        project_dir=training_args.output_dir,
+        kwargs_handlers=kwargs_handlers,
+    )
+
+    accelerator.init_trackers(
+        project_name=data_args.wandb_project,
+        config={
+            "learning_rate": training_args.learning_rate,
+            "model_name_or_path": model_args.model_name_or_path,
+            "num_train_epochs": training_args.num_train_epochs,
+            "gradient_accumulation_steps": training_args.gradient_accumulation_steps,
+            "per_device_train_batch_size": training_args.per_device_train_batch_size,
+            "global_batch_size": training_args.per_device_train_batch_size * accelerator.num_processes,
+            "mixed_precision": mixed_precision,
+            "lr_scheduler_type": training_args.lr_scheduler_type,
+            "warmup_steps": training_args.warmup_steps,
+            "freeze_text_encoder": model_args.freeze_text_encoder,
+            "max_duration_in_seconds": data_args.max_duration_in_seconds,
+            "weight_decay": training_args.weight_decay,
+            "adam_beta1": training_args.adam_beta1,
+            "adam_beta2": training_args.adam_beta2,
+            "temperature": model_args.temperature,
+        },
+        init_kwargs={"wandb": {"name": data_args.wandb_run_name}} if data_args.wandb_run_name else {},
+    )
+
+    # Detecting last checkpoint and eventually continue from last checkpoint
+    last_checkpoint = None
+    if os.path.isdir(training_args.output_dir) and training_args.do_train and not training_args.overwrite_output_dir:
+        last_checkpoint = get_last_checkpoint(training_args.output_dir)
+        if last_checkpoint is None and len(os.listdir(training_args.output_dir)) > 0:
+            raise ValueError(
+                f"Output directory ({training_args.output_dir}) already exists and is not empty. "
+                "Use --overwrite_output_dir to overcome."
+            )
+        elif last_checkpoint is not None and training_args.resume_from_checkpoint is None:
+            logger.info(
+                f"Checkpoint detected, resuming training at {last_checkpoint}. To avoid this behavior, change "
+                "the `--output_dir` or add `--overwrite_output_dir` to train from scratch."
+            )
+
+    # Setup logging
+    logging.basicConfig(
+        format="%(asctime)s - %(levelname)s - %(name)s - %(message)s",
+        datefmt="%m/%d/%Y %H:%M:%S",
+        handlers=[logging.StreamHandler(sys.stdout)],
+    )
+    logger.setLevel(logging.INFO if accelerator.is_main_process else logging.WARN)
+
+    # Log a small summary on each proces
+    logger.warning(
+        f"Process rank: {training_args.local_rank}, device: {training_args.device}, n_gpu: {training_args.n_gpu}, "
+        f"distributed training: {training_args.parallel_mode.value == 'distributed'}, 16-bits training: {training_args.fp16}"
+    )
+
+    # Set the verbosity to info of the Transformers logger (on main process only)
+    if accelerator.is_local_main_process:
+        datasets.utils.logging.set_verbosity_warning()
+        transformers.utils.logging.set_verbosity_info()
+    else:
+        datasets.utils.logging.set_verbosity_error()
+        transformers.utils.logging.set_verbosity_error()
+
+    logger.info("Training/evaluation parameters %s", training_args)
+
+    # Set seed before initializing model.
+    set_seed(training_args.seed)
+    num_workers = data_args.preprocessing_num_workers
+
+    # 1. First, lett's instantiate the feature extractor, tokenizers and model
+    # Note for distributed training, the .from_pretrained methods guarantee that only
+    # one local process can concurrently download model & vocab.
+
+    # load feature extractor
+    feature_extractor = AutoFeatureExtractor.from_pretrained(
+        model_args.feature_extractor_name or model_args.model_name_or_path,
+        cache_dir=model_args.cache_dir,
+        token=data_args.token,
+        trust_remote_code=data_args.trust_remote_code,
+    ) 
+    sampling_rate = feature_extractor.sampling_rate
+
+    # load prompt tokenizer
+    prompt_tokenizer = AutoTokenizer.from_pretrained(
+        model_args.prompt_tokenizer_name or model_args.description_tokenizer_name or model_args.model_name_or_path,
+        cache_dir=model_args.cache_dir,
+        token=data_args.token,
+        trust_remote_code=data_args.trust_remote_code,
+        use_fast=model_args.use_fast_tokenizer,
+        padding_side=model_args.prompt_padding_side,
+    )
+
+    # load description tokenizer
+    description_tokenizer = AutoTokenizer.from_pretrained(
+        model_args.description_tokenizer_name or model_args.model_name_or_path,
+        cache_dir=model_args.cache_dir,
+        token=data_args.token,
+        trust_remote_code=data_args.trust_remote_code,
+        use_fast=model_args.use_fast_tokenizer,
+    )
+
+    if model_args.use_fast_tokenizer:
+        logger.warning(
+            "Disabling fast tokenizer warning: https://github.com/huggingface/transformers/blob/main/src/transformers/tokenization_utils_base.py#L3231-L3235"
+        )
+        prompt_tokenizer.deprecation_warnings["Asking-to-pad-a-fast-tokenizer"] = True
+        description_tokenizer.deprecation_warnings["Asking-to-pad-a-fast-tokenizer"] = True
+
+    # 2. Now, let's load the dataset
+
+    if data_args.save_to_disk is not None:
+        os.makedirs(data_args.save_to_disk, exist_ok=True)
+
+    # assume that the dataset has been saved to `save_to_disk` if the latter is not empty
+    dataset_was_precomputed = len(os.listdir(data_args.save_to_disk)) > 0
+    if dataset_was_precomputed:
+        with accelerator.local_main_process_first():
+            vectorized_datasets = datasets.load_from_disk(data_args.save_to_disk)
+    else:
+        raw_datasets = DatasetDict()
+
+        columns_to_keep = {
+            "target_audio_column_name": data_args.target_audio_column_name,
+            "prompt_column_name": data_args.prompt_column_name,
+            "source": data_args.source_column_name,
+        }
+        if data_args.description_column_name is not None:
+            columns_to_keep["description_column_name"] = data_args.description_column_name
+
+        if training_args.do_train:
+            raw_datasets["train"] = load_multiple_datasets(
+                accelerator,
+                data_args.train_dataset_name,
+                splits=data_args.train_split_name,
+                dataset_samples=data_args.train_dataset_samples,
+                seed=training_args.seed,
+                cache_dir=model_args.cache_dir,
+                num_proc=data_args.preprocessing_num_workers,
+                id_column_name=data_args.id_column_name,
+                columns_to_keep=columns_to_keep.values(),
+                prompt_column_name=data_args.prompt_column_name,
+                audio_column_name=data_args.target_audio_column_name,
+                sampling_rate=sampling_rate,
+                logger=logger,
+                librittsr_dir=data_args.librittsr_dir,
+                other_dir=data_args.other_dir,
+                # streaming=data_args.streaming, TODO(SG): optionally enable streaming mode
+            )
+
+            for key in columns_to_keep:
+                if columns_to_keep[key] not in raw_datasets["train"].column_names:
+                    raise ValueError(
+                        f"--{key} '{columns_to_keep[key]}' not found in dataset '{data_args.train_dataset_name}'."
+                        f" Make sure to set `--{key}` to the correct audio column - one of"
+                        f" {', '.join(raw_datasets['train'].column_names)}."
+                    )
+
+            if data_args.max_train_samples is not None:
+                raw_datasets["train"] = raw_datasets["train"].select(range(data_args.max_train_samples))
+
+        if training_args.do_eval:
+            raw_datasets["eval"] = load_multiple_datasets(
+                accelerator,
+                data_args.eval_dataset_name if data_args.eval_dataset_name else data_args.train_dataset_name,
+                splits=data_args.eval_split_name,
+                cache_dir=model_args.cache_dir,
+                num_proc=data_args.preprocessing_num_workers,
+                id_column_name=data_args.id_column_name,
+                columns_to_keep=columns_to_keep.values(),
+                prompt_column_name=data_args.prompt_column_name,
+                audio_column_name=data_args.target_audio_column_name,
+                sampling_rate=sampling_rate,
+                logger=logger,
+                librittsr_dir=data_args.librittsr_dir,
+                other_dir=data_args.other_dir,
+                # streaming=data_args.streaming, TODO(SG): optionally enable streaming mode
+            )
+
+            if data_args.max_eval_samples is not None:
+                with accelerator.local_main_process_first():
+                    raw_datasets["eval"] = (
+                        raw_datasets["eval"].shuffle(seed=training_args.seed).select(range(data_args.max_eval_samples))
+                    )
+
+    # 3. Next, let's load the config.
+    config = ParlerTTSConfig.from_pretrained(
+        model_args.model_name_or_path,
+        cache_dir=model_args.cache_dir,
+        token=data_args.token,
+        trust_remote_code=data_args.trust_remote_code,
+    )
+ 
+    if training_args.codebook_weights is not None and len(training_args.codebook_weights) != config.decoder.num_codebooks:
+        raise ValueError(f"`codebook_weights` has length {len(training_args.codebook_weights)} when it should be of length {config.decoder.num_codebooks}.")
+
+    # update pad token id and decoder_start_token_id
+    config.decoder.update(
+        {
+            "cross_attention_implementation_strategy": model_args.cross_attention_implementation_strategy
+            if model_args.cross_attention_implementation_strategy is not None
+            else None,
+            "codebook_weights": training_args.codebook_weights if training_args.codebook_weights is not None else config.decoder.codebook_weights
+        }
+    )
+    config.update(
+        {
+            "pad_token_id": model_args.pad_token_id if model_args.pad_token_id is not None else config.pad_token_id,
+            "decoder_start_token_id": model_args.decoder_start_token_id
+            if model_args.decoder_start_token_id is not None
+            else config.decoder_start_token_id,
+        }
+    )
+
+    with open("events.txt", "r") as f:
+        events = [line.strip() for line in f]
+    events = ["<"+event.lower().replace(" ", "_")+">" for event in events]
+    events.append("<B_start>")
+    events.append("<B_end>")
+    events.append("<I_start>")
+    events.append("<I_end>")
+
+    special_tokens = {"additional_special_tokens": events}
+    prompt_tokenizer.add_special_tokens(special_tokens)
+    description_tokenizer.add_special_tokens(special_tokens)
+    padded_vocab_size = ((len(prompt_tokenizer) + 127) // 128) * 128 
+    config.vocab_size = padded_vocab_size 
+
+    # create model
+    model = ParlerTTSForConditionalGeneration.from_pretrained(
+        model_args.model_name_or_path,
+        ignore_mismatched_sizes=True,
+        cache_dir=model_args.cache_dir,
+        config=config,
+        token=data_args.token,
+        trust_remote_code=data_args.trust_remote_code,
+        attn_implementation={"decoder": model_args.attn_implementation, "text_encoder": "eager"},
+    )
+    model.text_encoder.resize_token_embeddings(padded_vocab_size)
+
+    # enable gradient checkpointing if necessary
+    if training_args.gradient_checkpointing:
+        model.gradient_checkpointing_enable()
+
+    # 4. Now we preprocess the datasets including loading the audio, resampling and normalization
+    # Thankfully, `datasets` takes care of automatically loading and resampling the audio,
+    # so that we just need to set the correct target sampling rate and normalize the input
+    # via the `feature_extractor`
+
+    # derive max & min input length for sample rate & max duration
+    sampling_rate = feature_extractor.sampling_rate
+    max_target_length = int(data_args.max_duration_in_seconds * sampling_rate)
+    min_target_length = int(data_args.min_duration_in_seconds * sampling_rate)
+    target_audio_column_name = data_args.target_audio_column_name
+    description_column_name = data_args.description_column_name
+    prompt_column_name = data_args.prompt_column_name
+    feature_extractor_input_name = feature_extractor.model_input_names[0]
+    audio_encoder_pad_token_id = config.decoder.pad_token_id
+    audio_encoder_eos_token_id = config.decoder.eos_token_id
+    audio_encoder_bos_token_id = model.generation_config.decoder_start_token_id
+    max_length = model.generation_config.max_length
+    num_codebooks = model.decoder.config.num_codebooks
+    bandwidth = model_args.bandwidth
+    attn_implementation = model_args.attn_implementation
+
+    # Freeze Encoders
+    model.freeze_encoders(model_args.freeze_text_encoder)
+
+    # Test all gather - used for warmup and avoiding timeout
+    logger.debug(str(accelerator.process_index), main_process_only=False, in_order=True)
+    test_tensor = torch.tensor([accelerator.process_index], device=accelerator.device)
+    gathered_tensor = accelerator.gather(test_tensor)
+    print("gathered_tensor", gathered_tensor)
+    accelerator.wait_for_everyone()
+
+    if not dataset_was_precomputed:
+        # Filter on text length
+        if description_column_name is not None and data_args.max_text_length is not None:
+            with accelerator.local_main_process_first():
+                # keep only descriptions whose raw text is shorter than max_text_length
+                raw_datasets = raw_datasets.filter(
+                    lambda x: len(x) < data_args.max_text_length,
+                    num_proc=num_workers,
+                    input_columns=[description_column_name],
+                )
+
+        # Preprocessing the dataset.
+        # We need to tokenize the texts.
+        def pass_through_processors(description, prompt):
+            batch = {}
+
+            batch["input_ids"] = description_tokenizer(description.strip())["input_ids"]
+            batch["prompt_input_ids"] = prompt_tokenizer(prompt.strip())["input_ids"]
+
+            return batch
+
+        with accelerator.local_main_process_first():
+            # this is a trick to avoid rewriting the entire audio column, which takes ages
+            vectorized_datasets = raw_datasets.map(
+                pass_through_processors,
+                remove_columns=next(iter(raw_datasets.values())).column_names,
+                input_columns=[description_column_name, prompt_column_name],
+                num_proc=num_workers,
+                desc="preprocess datasets",
+            )
+
+        # We use Accelerate to perform distributed inference
+        # T5 doesn't support fp16
+        autocast_kwargs = AutocastKwargs(enabled=(mixed_precision != "fp16"))
+
+        # Now we encode the audio labels with encodec.
+        ####### B. Encode audio
+
+        logger.info("*** Encode target audio with encodec ***")
+
+        # no need to prepare audio_decoder because used for inference without mixed precision
+        # see: https://huggingface.co/docs/accelerate/main/en/package_reference/accelerator#accelerate.Accelerator.prepare
+        if training_args.torch_compile:
+            audio_decoder = accelerator.prepare_model(model.audio_encoder, evaluation_mode=True)
+        else:
+            audio_decoder = model.audio_encoder
+
+        encoder_data_collator = DataCollatorEncodecWithPadding(
+            feature_extractor,
+            audio_column_name=target_audio_column_name,
+            librittsr_dir=data_args.librittsr_dir,
+            other_dir=data_args.other_dir,
+            feature_extractor_input_name=feature_extractor_input_name,
+            max_length=max_target_length,
+            padding=padding,
+        )
+        encoder_signature = set(inspect.signature(audio_decoder.forward).parameters)
+
+        def apply_audio_decoder(batch):
+            len_audio = batch.pop("len_audio")
+            audio_decoder.to(batch["input_values"].device).eval()
+            if bandwidth is not None:
+                batch["bandwidth"] = bandwidth
+            elif "num_quantizers" in encoder_signature:
+                batch["num_quantizers"] = num_codebooks
+            elif "num_codebooks" in encoder_signature:
+                batch["num_codebooks"] = num_codebooks
+            elif "n_quantizers" in encoder_signature:
+                batch["n_quantizers"] = num_codebooks
+
+            with torch.no_grad():
+                labels = audio_decoder.encode(**batch)["audio_codes"]
+            output = {}
+            output["len_audio"] = len_audio
+            # (1, bsz, codebooks, seq_len) -> (bsz, seq_len, codebooks)
+            output["labels"] = labels.squeeze(0).transpose(1, 2)
+
+            # if `pad_to_max_length`, the maximum corresponding audio length of the current batch is max_duration*sampling_rate
+            max_length = len_audio.max() if padding != "max_length" else max_target_length
+            output["ratio"] = torch.ones_like(len_audio) * labels.shape[-1] / max_length
+            return output
+
+        # (1, codebooks, seq_len) where seq_len=1
+        bos_labels = torch.ones((1, num_codebooks, 1)) * audio_encoder_bos_token_id
+
+        def postprocess_dataset(labels):
+            # (1, codebooks, seq_len)
+            labels = torch.tensor(labels).unsqueeze(0)
+            # add bos
+            labels = torch.cat([bos_labels, labels], dim=-1)
+
+            labels, delay_pattern_mask = build_delay_pattern_mask(
+                labels,
+                bos_token_id=audio_encoder_bos_token_id,
+                pad_token_id=audio_encoder_eos_token_id,
+                max_length=labels.shape[-1] + num_codebooks,
+                num_codebooks=num_codebooks,
+            )
+
+            # the first ids of the delay pattern mask are precisely labels, we use the rest of the labels mask
+            # to take care of EOS
+            # we want labels to look like this:
+            #  - [B, a, b, E, E, E, E]
+            #  - [B, B, c, d, E, E, E]
+            #  - [B, B, B, e, f, E, E]
+            #  - [B, B, B, B, g, h, E]
+            labels = torch.where(delay_pattern_mask == -1, audio_encoder_eos_token_id, delay_pattern_mask)
+
+            # the first timestamp is associated to a row full of BOS, let's get rid of it
+            # we also remove the last timestampts (full of PAD)
+            output = {"labels": labels[:, 1:]}
+            return output
+
+        for split in vectorized_datasets:
+            data_loader = DataLoader(
+                raw_datasets[split],
+                batch_size=training_args.audio_encoder_per_device_batch_size,
+                collate_fn=encoder_data_collator,
+                num_workers=training_args.dataloader_num_workers,
+                pin_memory=True,
+            )
+            data_loader = accelerator.prepare(data_loader)
+            total_inference_steps = len(data_loader)
+
+            start_step = get_last_codec_checkpoint_step(os.path.join(data_args.temporary_save_to_disk, split))
+            accelerator.wait_for_everyone()
+            if start_step > 0:
+                logger.info(f"Resuming {split} from step {start_step}")
+                # efficiently skip the first n batches
+                start_step += 1
+                data_loader = skip_first_batches(data_loader, start_step)
+
+            all_generated_labels = []
+            all_lens = []
+            if start_step < total_inference_steps:
+                for i, batch in enumerate(tqdm(data_loader, disable=not accelerator.is_local_main_process)):
+                    cur_step = start_step + i
+                    generate_labels = apply_audio_decoder(batch)
+                    generate_labels = accelerator.pad_across_processes(generate_labels, dim=1, pad_index=0)
+                    generate_labels = accelerator.gather_for_metrics(generate_labels)
+
+                    if accelerator.is_main_process:
+                        lab = generate_labels["labels"].cpu().transpose(1, 2).to(torch.int16)
+                        rat = generate_labels["ratio"].cpu().squeeze(1)
+                        lens = generate_labels["len_audio"].cpu().squeeze(1)
+                        lab = [l[:, : int(ratio * length)] for (l, ratio, length) in zip(lab, rat, lens)]
+
+                        all_generated_labels.extend(lab)
+                        all_lens.extend(lens)
+
+                        if ((cur_step + 1) % data_args.save_codec_steps == 0) or (
+                            cur_step == total_inference_steps - 1
+                        ):
+                            tmp_labels = Dataset.from_dict({"labels": all_generated_labels, "target_length": all_lens})
+                            tmp_labels = tmp_labels.map(
+                                postprocess_dataset,
+                                num_proc=data_args.preprocessing_num_workers,  # this one is resource consuming if many processor.
+                                input_columns=["labels"],
+                                desc="Postprocessing labeling",
+                            )
+                            save_codec_checkpoint(
+                                os.path.join(data_args.temporary_save_to_disk, split), tmp_labels, cur_step
+                            )
+                            all_generated_labels = []
+                            all_lens = []
+
+                accelerator.wait_for_everyone()
+
+            if accelerator.is_main_process and len(all_generated_labels) > 0:
+                tmp_labels = Dataset.from_dict({"labels": all_generated_labels, "target_length": all_lens})
+                tmp_labels = tmp_labels.map(
+                    postprocess_dataset,
+                    num_proc=data_args.preprocessing_num_workers,  # this one is resource consuming if many processor.
+                    input_columns=["labels"],
+                    desc="Postprocessing labeling",
+                )
+                save_codec_checkpoint(os.path.join(data_args.temporary_save_to_disk, split), tmp_labels, cur_step)
+                all_generated_labels = []
+                all_lens = []
+            accelerator.wait_for_everyone()
+
+            del all_generated_labels
+            accelerator.wait_for_everyone()
+
+            with accelerator.local_main_process_first():
+                tmp_labels = load_all_codec_checkpoints(os.path.join(data_args.temporary_save_to_disk, split)).select(
+                    range(len(vectorized_datasets[split]))
+                )
+                logger.info(f"Concatenating {split}: {tmp_labels} with {vectorized_datasets[split]}")
+                vectorized_datasets[split] = concatenate_datasets([vectorized_datasets[split], tmp_labels], axis=1)
+
+        accelerator.free_memory()
+        del generate_labels, all_lens
+
+        with accelerator.local_main_process_first():
+            # NOTE: filtering is done at the end because in the `datasets` library, caching audio files is done after most operations
+            # caching audio files is time and disk-space consuming, so we want to avoid it at all costs, especially for large (>1Kh) audio datasets.
+            # That's also why we avoid to concat the processed datasets (vectorized_datasets) with the audio column present in raw_datasets.
+
+            def is_audio_in_length_range(length):
+                return length > min_target_length and length < max_target_length
+
+            # keep only samples whose target length lies strictly between min_target_length and max_target_length
+            vectorized_datasets = vectorized_datasets.filter(
+                is_audio_in_length_range,
+                num_proc=num_workers,
+                input_columns=["target_length"],
+            )
+
+        if description_column_name is not None and data_args.max_description_token_length is not None:
+            with accelerator.local_main_process_first():
+                # keep only descriptions with fewer than max_description_token_length tokens
+                vectorized_datasets = vectorized_datasets.filter(
+                    lambda x: len(x) < data_args.max_description_token_length,
+                    num_proc=num_workers,
+                    input_columns=["input_ids"],
+                )
+
+        if data_args.max_prompt_token_length is not None:
+            with accelerator.local_main_process_first():
+                # keep only prompts with fewer than max_prompt_token_length tokens
+                vectorized_datasets = vectorized_datasets.filter(
+                    lambda x: len(x) < data_args.max_prompt_token_length,
+                    num_proc=num_workers,
+                    input_columns=["prompt_input_ids"],
+                )
+
+    if data_args.save_to_disk is not None and not dataset_was_precomputed:
+        if accelerator.is_main_process:
+            vectorized_datasets.save_to_disk(
+                data_args.save_to_disk,
+                num_proc=min(data_args.preprocessing_num_workers, len(vectorized_datasets["eval"]) - 1),
+            )
+        accelerator.wait_for_everyone()
+        logger.info(f"Dataset saved at {data_args.save_to_disk}")
+
+    audio_max_length = None
+    if padding == "max_length":
+        audio_max_length = max(vectorized_datasets["train"]["target_length"])
+        with accelerator.local_main_process_first():
+            max_sample = vectorized_datasets["train"].filter(
+                lambda x: x == audio_max_length,
+                num_proc=num_workers,
+                input_columns=["target_length"],
+            )
+        audio_max_length = max([len(l[0]) for l in max_sample["labels"]])
+
+    if description_column_name is not None and data_args.max_description_token_length is not None:
+        with accelerator.local_main_process_first():
+            # keep only descriptions with fewer than max_description_token_length tokens
+            vectorized_datasets = vectorized_datasets.filter(
+                lambda x: len(x) < data_args.max_description_token_length,
+                num_proc=num_workers,
+                input_columns=["input_ids"],
+            )
+
+    if data_args.max_prompt_token_length is not None:
+        with accelerator.local_main_process_first():
+            # keep only prompts with fewer than max_prompt_token_length tokens
+            vectorized_datasets = vectorized_datasets.filter(
+                lambda x: len(x) < data_args.max_prompt_token_length,
+                num_proc=num_workers,
+                input_columns=["prompt_input_ids"],
+            )
+
+    if training_args.group_by_length:
+        # apply a simple heuristic to take into account audio and text lengths
+        def add_target_lengths(target_length, prompt, description):
+            return {"target_length": target_length + len(prompt) + len(description)}
+
+        with accelerator.local_main_process_first():
+            vectorized_datasets = vectorized_datasets.map(
+                add_target_lengths,
+                num_proc=num_workers,
+                input_columns=["target_length", "prompt_input_ids", "input_ids"],
+            )
+
+    # for large datasets it is advised to run the preprocessing on a
+    # single machine first with ``args.preprocessing_only`` since there will most likely
+    # be a timeout when running the script in distributed mode.
+    # In a second step ``args.preprocessing_only`` can then be set to `False` to load the
+    # cached dataset
+    if data_args.preprocessing_only and data_args.save_to_disk is None:
+        raise ValueError(
+            "`preprocessing_only=True` but `save_to_disk` is not set. The latter should indicates where to save the dataset locally."
+        )
+    elif data_args.preprocessing_only:
+        logger.info(f"Data preprocessing finished. Files save at {data_args.save_to_disk}")
+        return
+
+    # 6. Next, we can prepare the training.
+
+    # Let's use CLAP similarity and WER as our evaluation metrics.
+    def compute_metrics(
+        audios,
+        descriptions,
+        prompts,
+        device="cpu",
+        compute_clap_similarity_metric=False,
+        compute_noise_level_metric=False,
+        noise_level_to_compute_clean_wer=None,
+    ):
+        results = {}
+        input_ids = descriptions
+        texts = description_tokenizer.batch_decode(input_ids, skip_special_tokens=True)
+        prompts = prompt_tokenizer.batch_decode(prompts, skip_special_tokens=True)
+        audios = [a.float().cpu().numpy() for a in audios]
+
+        if compute_clap_similarity_metric:
+            clap_score = clap_similarity(
+                model_args.clap_model_name_or_path, texts, audios, device, input_sampling_rate=sampling_rate
+            )
+            results["clap"] = clap_score
+
+        si_sdr_measures = None
+        if compute_noise_level_metric:
+            si_sdr_measures = si_sdr(audios, device, input_sampling_rate=sampling_rate)
+
+        word_error, transcriptions, clean_word_error, noisy_word_error, percent_clean_samples = wer(
+            model_args.asr_model_name_or_path,
+            prompts,
+            audios,
+            device,
+            training_args.per_device_eval_batch_size,
+            sampling_rate,
+            noise_level_to_compute_clean_wer,
+            si_sdr_measures,
+        )
+        results["wer"] = word_error
+        if clean_word_error is not None:
+            results["clean_wer"] = clean_word_error
+            results["noisy_word_error"] = noisy_word_error
+            results["percent_clean_samples"] = percent_clean_samples
+
+        return results, texts, prompts, audios, transcriptions, si_sdr_measures
+
+    # Define Training Schedule
+    # Store some constants
+    per_device_train_batch_size = int(training_args.per_device_train_batch_size)
+    train_batch_size = per_device_train_batch_size * accelerator.num_processes
+    gradient_accumulation_steps = int(training_args.gradient_accumulation_steps)
+    per_device_eval_batch_size = int(training_args.per_device_eval_batch_size)
+
+    if training_args.max_steps < 0:
+        num_epochs = int(training_args.num_train_epochs)
+        steps_per_epoch = len(vectorized_datasets["train"]) // (train_batch_size * gradient_accumulation_steps)
+        total_train_steps = steps_per_epoch * num_epochs
+    elif training_args.max_steps > 0:
+        logger.info("max_steps is given, it will override any value given in num_train_epochs")
+        total_train_steps = int(training_args.max_steps)
+        # Setting a very large number of epochs so we go as many times as necessary over the iterator.
+        num_epochs = sys.maxsize
+        steps_per_epoch = total_train_steps
+
+    if training_args.eval_steps is None:
+        logger.info(f"eval_steps is not set, evaluating at the end of each epoch")
+        eval_steps = steps_per_epoch
+    else:
+        eval_steps = training_args.eval_steps
+        
+    if training_args.eval_generation_steps is None:
+        eval_generation_steps = eval_steps
+    else:
+        eval_generation_steps = training_args.eval_generation_steps
+
+    # T5 doesn't support fp16
+    autocast_kwargs = AutocastKwargs(enabled=(mixed_precision != "fp16"))
+
+    # Define optimizer, LR scheduler, collator
+    optimizer = torch.optim.AdamW(
+        params=model.parameters(),
+        lr=training_args.learning_rate,
+        betas=(training_args.adam_beta1, training_args.adam_beta2),
+        eps=training_args.adam_epsilon,
+        weight_decay=training_args.weight_decay,
+    )
+
+    # LR scheduler gets stepped by `num_processes` each time -> account for this in warmup / total steps
+    lr_scheduler = get_scheduler(
+        name=training_args.lr_scheduler_type,
+        optimizer=optimizer,
+        num_warmup_steps=training_args.get_warmup_steps(total_train_steps) * accelerator.num_processes,
+        num_training_steps=total_train_steps * accelerator.num_processes,
+    )
+
+    # Instantiate custom data collator
+    data_collator = DataCollatorParlerTTSWithPadding(
+        prompt_tokenizer=prompt_tokenizer,
+        description_tokenizer=description_tokenizer,
+        pad_to_multiple_of=data_args.pad_to_multiple_of,
+        padding=padding,
+        prompt_max_length=data_args.max_prompt_token_length,
+        description_max_length=data_args.max_description_token_length,
+        audio_max_length=audio_max_length,
+    )
+
+    # Prepare everything with accelerate
+    model, optimizer, lr_scheduler = accelerator.prepare(model, optimizer, lr_scheduler)
+
+    num_examples = total_train_steps * train_batch_size * gradient_accumulation_steps
+    logger.info("***** Running training *****")
+    logger.info(f"  Num examples = {num_examples}")
+    logger.info("  Instantaneous batch size per device =" f" {per_device_train_batch_size}")
+    logger.info("  Gradient accumulation steps =" f" {gradient_accumulation_steps}")
+    logger.info(
+        f"  Total train batch size (w. parallel & distributed) = {train_batch_size * gradient_accumulation_steps}"
+    )
+    logger.info(f"  Total optimization steps = {total_train_steps}")
+
+    # ======================== Training ================================
+    train_time = 0
+    train_start = time.time()
+    steps_trained_progress_bar = tqdm(
+        range(total_train_steps), desc="Train steps ... ", position=0, disable=not accelerator.is_local_main_process
+    )
+    continue_training = True
+    epochs_trained = 0
+    cur_step = 0
+
+    checkpoint = None
+    if training_args.resume_from_checkpoint is not None:
+        checkpoint = training_args.resume_from_checkpoint
+    elif last_checkpoint is not None:
+        checkpoint = last_checkpoint
+
+    if accelerator.is_main_process:
+        if training_args.push_to_hub:
+            api = HfApi(token=training_args.hub_token)
+
+            # Create repo (repo_name from args or inferred)
+            repo_name = training_args.hub_model_id
+            if repo_name is None:
+                repo_name = Path(training_args.output_dir).absolute().name
+            repo_id = api.create_repo(repo_name, exist_ok=True).repo_id
+
+            with open(os.path.join(training_args.output_dir, ".gitignore"), "w+") as gitignore:
+                if "wandb" not in gitignore:
+                    gitignore.write("wandb\n")
+        elif training_args.output_dir is not None:
+            os.makedirs(training_args.output_dir, exist_ok=True)
+    accelerator.wait_for_everyone()
+
+    # Now save everything to be able to create a single processor later
+    # make sure all processes wait until data is saved
+    # only the main process saves them
+    if accelerator.is_main_process:
+        # save feature extractor, tokenizer and config
+        if (
+            model_args.prompt_tokenizer_name is None
+            and model_args.description_tokenizer_name
+            or (model_args.prompt_tokenizer_name == model_args.description_tokenizer_name)
+        ):
+            prompt_tokenizer.save_pretrained(training_args.output_dir)
+        else:
+            logger.warning(
+                f"Prompt tokenizer ('{model_args.prompt_tokenizer_name}') and description tokenizer ('{model_args.description_tokenizer_name}') are not the same. Saving only the prompt tokenizer."
+            )
+            prompt_tokenizer.save_pretrained(training_args.output_dir)
+
+        feature_extractor.save_pretrained(training_args.output_dir)
+        config.save_pretrained(training_args.output_dir)
+    accelerator.wait_for_everyone()
+
+    if checkpoint is not None:
+        accelerator.load_state(checkpoint)
+        # Find num steps and epoch from saved state string pattern
+        pattern = r"checkpoint-(\d+)-epoch-(\d+)"
+        match = re.search(pattern, checkpoint)
+        cur_step = int(match.group(1))
+        epochs_trained = int(match.group(2))
+
+        logger.info("  Continuing training from checkpoint, will skip to saved global_step")
+        logger.info(f"  Continuing training from epoch {epochs_trained}")
+        logger.info(f"  Continuing training from global step {cur_step}")
+
+        steps_trained_progress_bar.update(cur_step)
+
+        for epoch in range(0, epochs_trained):
+            with accelerator.local_main_process_first():
+                vectorized_datasets["train"] = vectorized_datasets["train"].shuffle(training_args.seed)
+
+        if training_args.max_steps < 0:
+            # we know exactly the number of steps per epoch, so can skip through the required number of batches
+            resume_step = (cur_step - epochs_trained * steps_per_epoch) * gradient_accumulation_steps
+        else:
+            # Currently we don't know how many steps we've taken in the current epoch
+            # So we just shuffle the dataset one extra time and start from a fresh epoch
+            # This is "good enough" for our purposes but not fully correct
+            resume_step = None
+            with accelerator.local_main_process_first():
+                vectorized_datasets["train"] = vectorized_datasets["train"].shuffle(training_args.seed)
+    else:
+        resume_step = None
+
+    gen_kwargs = {
+        "do_sample": model_args.do_sample,
+        "temperature": model_args.temperature,
+        "max_length": model_args.max_length,
+        # Because of the delayed pattern mask, generation might stop earlier because of unexpected behaviour
+        # on the first tokens of the codebooks that are delayed.
+        # This fix the issue.
+        "min_new_tokens": num_codebooks + 1,
+    }
+
+    # Define gradient update step fn
+    def train_step(
+        batch,
+        accelerator,
+        autocast_kwargs,
+        num_items_in_batch,
+        gradient_accumulation_steps,
+    ):
+        if mixed_precision == "fp16":
+            # fp16 doesn't work with T5-like models
+            with accelerator.autocast(autocast_handler=autocast_kwargs):
+                if training_args.parallel_mode.value != "distributed":
+                    encoder_outputs = model.text_encoder(
+                        input_ids=batch.get("input_ids"), attention_mask=batch.get("attention_mask", None)
+                    )
+                else:
+                    encoder_outputs = model.module.text_encoder(
+                        input_ids=batch.get("input_ids"), attention_mask=batch.get("attention_mask", None)
+                    )
+                # we optionnally project last_hidden_state to avoid recomputing every time
+                encoder_hidden_states = encoder_outputs.last_hidden_state
+                if (
+                    config.text_encoder.hidden_size != config.decoder.hidden_size
+                    and config.decoder.cross_attention_hidden_size is None
+                ):
+                    encoder_hidden_states = (
+                        model.enc_to_dec_proj(encoder_hidden_states)
+                        if training_args.parallel_mode.value != "distributed"
+                        else model.module.enc_to_dec_proj(encoder_hidden_states)
+                    )
+
+                if batch.get("attention_mask", None) is not None:
+                    encoder_hidden_states = encoder_hidden_states * batch.get("attention_mask", None)[..., None]
+
+                encoder_outputs.last_hidden_state = encoder_hidden_states
+                batch["encoder_outputs"] = encoder_outputs
+
+        outputs = model(**batch, loss_reduction="sum")
+        # CE (data) loss
+        ce_loss = (outputs.loss * gradient_accumulation_steps * accelerator.num_processes) / num_items_in_batch
+
+        metrics = {"loss": ce_loss}
+        
+        # per CE loss
+        per_codebook_losses = outputs.per_codebook_losses
+        metrics.update({f"codebook_{i}_loss": ((l  * gradient_accumulation_steps * accelerator.num_processes) / num_items_in_batch) for (i,l) in enumerate(per_codebook_losses)})
+        return ce_loss, metrics
+
+    # Define eval fn
+    def eval_step(
+        batch,
+        accelerator,
+        autocast_kwargs,
+    ):
+        eval_model = model if not training_args.torch_compile else model._orig_mod
+
+        if mixed_precision == "fp16":
+            # fp16 doesn't work with T5-like models
+            with accelerator.autocast(autocast_handler=autocast_kwargs):
+                if training_args.parallel_mode.value != "distributed":
+                    encoder_outputs = model.text_encoder(
+                        input_ids=batch.get("input_ids"), attention_mask=batch.get("attention_mask", None)
+                    )
+                else:
+                    encoder_outputs = model.module.text_encoder(
+                        input_ids=batch.get("input_ids"), attention_mask=batch.get("attention_mask", None)
+                    )
+                # we optionnally project last_hidden_state to avoid recomputing every time
+                encoder_hidden_states = encoder_outputs.last_hidden_state
+                if (
+                    config.text_encoder.hidden_size != config.decoder.hidden_size
+                    and config.decoder.cross_attention_hidden_size is None
+                ):
+                    encoder_hidden_states = (
+                        model.enc_to_dec_proj(encoder_hidden_states)
+                        if training_args.parallel_mode.value != "distributed"
+                        else model.module.enc_to_dec_proj(encoder_hidden_states)
+                    )
+
+                if batch.get("attention_mask", None) is not None:
+                    encoder_hidden_states = encoder_hidden_states * batch.get("attention_mask", None)[..., None]
+
+                encoder_outputs.last_hidden_state = encoder_hidden_states
+                batch["encoder_outputs"] = encoder_outputs
+
+        with torch.no_grad():
+            outputs = eval_model(**batch)
+        # CE (data) loss
+        ce_loss = outputs.loss
+        metrics = {"loss": ce_loss}
+        
+        # per CE loss
+        per_codebook_losses = outputs.per_codebook_losses
+        metrics.update({f"codebook_{i}_loss": l for (i,l) in enumerate(per_codebook_losses)})
+        return metrics
+
+    def generate_step(batch, accelerator):
+        batch.pop("decoder_attention_mask", None)
+        eval_model = accelerator.unwrap_model(model, keep_fp32_wrapper=True)
+        if training_args.torch_compile:
+            # if the model is compiled, we use the original model bc compile is not compatible with .generate
+            eval_model = model._orig_mod
+
+        # since we've might have loaded the weights in fp32, we have to autocast to ensure FA2 weights are in half-precision.
+        # with accelerator.autocast(autocast_handler=AutocastKwargs(enabled=(attn_implementation=="flash_attention_2"))):
+        output_audios = eval_model.generate(**batch, **gen_kwargs)
+        output_audios = accelerator.pad_across_processes(output_audios, dim=1, pad_index=0)
+        return output_audios
+
+    model.train()
+
+    total_batched_samples = resume_step if resume_step is not None else 0
+    for epoch in range(epochs_trained, num_epochs):
+        with accelerator.local_main_process_first():
+            vectorized_datasets["train"] = vectorized_datasets["train"].shuffle(training_args.seed)
+        sampler = None
+        if training_args.group_by_length:
+            sampler = LengthGroupedSampler(train_batch_size, lengths=vectorized_datasets["train"]["target_length"])
+        train_dataloader = DataLoader(
+            vectorized_datasets["train"],
+            collate_fn=data_collator,
+            batch_size=per_device_train_batch_size,
+            sampler=sampler,
+            shuffle=not training_args.group_by_length,
+            num_workers=training_args.dataloader_num_workers,
+            pin_memory=training_args.dataloader_pin_memory,
+        )
+        train_dataloader = accelerator.prepare(train_dataloader)
+        if hasattr(train_dataloader, "dataset") and isinstance(train_dataloader.dataset, IterableDataset):
+            train_dataloader.dataset.set_epoch(epoch)
+
+        if resume_step is not None:
+            # Skip the first N batches in the dataloader when resuming from a checkpoint
+            logger.info(f"  Skip first {resume_step} batches")
+            train_dataloader = accelerator.skip_first_batches(train_dataloader, resume_step)
+            resume_step = None
+            accelerator.wait_for_everyone()
+
+        # We chunkify the epoch iterator into gradient accumulation steps `n` batches
+        train_iterator = iter(train_dataloader)
+        num_steps_in_epoch = len(train_dataloader)
+        remainder = num_steps_in_epoch % gradient_accumulation_steps
+        remainder = remainder if remainder != 0 else gradient_accumulation_steps
+        total_updates = math.ceil(num_steps_in_epoch / gradient_accumulation_steps)
+        
+        update_step = -1
+        for _ in range(total_updates):
+            update_step += 1
+            
+            # preload the total batch per step
+            batch_samples = []
+            num_batches_in_step = gradient_accumulation_steps if update_step != (total_updates - 1) else remainder
+            for _ in range(num_batches_in_step):
+                batch_samples += [next(train_iterator)]
+                
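+            # get num items in batch - intended to count the labels that differ from BOS, EOS and -100
+            # NOTE(review): the mask below ORs the three `ne` tests, so it is True wherever a label differs from at least
+            # one of them (i.e. everywhere unless the three ids coincide) - confirm whether `&` was intended.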
+            num_items_in_batch = sum([(batch["labels"].ne(audio_encoder_bos_token_id) | batch["labels"].ne(-100) | batch["labels"].ne(audio_encoder_eos_token_id)).sum((0,1))[0] for batch in batch_samples])
+            num_items_in_batch = accelerator.gather(num_items_in_batch).sum().item()
+            
+            # losses = []
+            for i,batch in enumerate(batch_samples):
+                total_batched_samples += 1
+                ctx = model.no_sync if (i < len(batch_samples) - 1 and accelerator.num_processes > 1) else contextlib.nullcontext
+                
+                with ctx():
+                    loss, train_metric = train_step(batch, accelerator, autocast_kwargs, num_items_in_batch, gradient_accumulation_steps)
+                    accelerator.backward(loss)
+                    # losses.append(loss.detach())
+            
+            grad_norm = accelerator.clip_grad_norm_(model.parameters(), training_args.max_grad_norm)
+            optimizer.step()
+            lr_scheduler.step()
+            optimizer.zero_grad()
+
+            # The accelerator has performed an optimization step behind the scenes
+            steps_trained_progress_bar.update(1)
+            cur_step += 1
+
+            # losses = accelerator.gather(sum(losses)).sum().item() / (accelerator.num_processes * gradient_accumulation_steps)
+            
+            if cur_step % training_args.logging_steps == 0:
+                steps_trained_progress_bar.write(
+                    f"Step... ({cur_step} / {total_train_steps} | Loss:"
+                    f" {train_metric['loss']}, Learning Rate:"
+                    f" {lr_scheduler.get_last_lr()[0]})"
+                )
+                train_metric["grad_norm"] = grad_norm.detach().item() if isinstance(grad_norm, torch.Tensor) else grad_norm
+                log_metric(
+                    accelerator,
+                    metrics=train_metric,
+                    learning_rate=lr_scheduler.get_last_lr()[0],
+                    train_time=train_time + time.time() - train_start,
+                    step=cur_step,
+                    epoch=epoch,
+                    prefix="train",
+                )
+
+            # save checkpoint and weights after each save_steps and at the end of training
+            if (cur_step % training_args.save_steps == 0) or cur_step == total_train_steps:
+                intermediate_dir = os.path.join(training_args.output_dir, f"checkpoint-{cur_step}-epoch-{epoch}")
+                # safe_serialization=False to avoid shared tensors saving issue (TODO(YL): it's a temporary fix)
+                # https://github.com/huggingface/transformers/issues/27293#issuecomment-1872560074
+                accelerator.save_state(output_dir=intermediate_dir, safe_serialization=False)
+                accelerator.wait_for_everyone()
+                if accelerator.is_main_process:
+                    rotate_checkpoints(
+                        training_args.save_total_limit, output_dir=training_args.output_dir, logger=logger
+                    )
+
+                    if cur_step == total_train_steps:
+                        # un-wrap student model for save
+                        unwrapped_model = accelerator.unwrap_model(model)
+                        unwrapped_model.save_pretrained(training_args.output_dir)
+
+                    if training_args.push_to_hub:
+                        api.upload_folder(
+                            repo_id=repo_id,
+                            folder_path=training_args.output_dir,
+                            commit_message=f"Saving train state of step {cur_step}",
+                            run_as_future=True,
+                        )
+                accelerator.wait_for_everyone()
+
+            if training_args.do_eval and (cur_step % eval_steps == 0 or cur_step == total_train_steps):
+                train_time += time.time() - train_start
+                # ======================== Evaluating ==============================
+                model.eval()
+                eval_metrics = []
+                eval_preds = []
+                eval_descriptions = []
+                eval_prompts = []
+                eval_start = time.time()
+
+                # release training input batch
+                batch = release_memory(batch)
+
+                validation_dataloader = DataLoader(
+                    vectorized_datasets["eval"],
+                    collate_fn=data_collator,
+                    batch_size=per_device_eval_batch_size,
+                    drop_last=False,
+                    num_workers=training_args.eval_dataloader_num_workers,
+                    pin_memory=training_args.dataloader_pin_memory,
+                )
+                validation_dataloader = accelerator.prepare(validation_dataloader)
+
+                for batch in tqdm(
+                    validation_dataloader,
+                    desc=f"Evaluating - Inference ...",
+                    position=2,
+                    disable=not accelerator.is_local_main_process,
+                ):
+                    # Model forward
+                    eval_metric = eval_step(batch, accelerator, autocast_kwargs)
+                    eval_metric = accelerator.gather_for_metrics(eval_metric)
+                    eval_metric = {key: val.unsqueeze(0) if val.ndim == 0 else val for (key,val) in eval_metric.items()}
+                    eval_metrics.append(eval_metric)
+
+                if training_args.predict_with_generate and (cur_step % eval_generation_steps == 0 or cur_step == total_train_steps):
+                    validation_dataloader = DataLoader(
+                        vectorized_datasets["eval"],
+                        collate_fn=data_collator,
+                        batch_size=per_device_eval_batch_size,
+                        drop_last=False,
+                        num_workers=training_args.eval_dataloader_num_workers,
+                        pin_memory=training_args.dataloader_pin_memory,
+                    )
+                    validation_dataloader = accelerator.prepare(validation_dataloader)
+                    # generation
+                    for batch in tqdm(
+                        validation_dataloader,
+                        desc=f"Evaluating - Generation ...",
+                        position=2,
+                        disable=not accelerator.is_local_main_process,
+                    ):
+                        generated_audios = generate_step(batch, accelerator)
+                        # Gather all predictions and targets
+                        generated_audios, input_ids, prompts = accelerator.pad_across_processes(
+                            (generated_audios, batch["input_ids"], batch["prompt_input_ids"]), dim=1, pad_index=0
+                        )
+                        generated_audios, input_ids, prompts = accelerator.gather_for_metrics(
+                            (generated_audios, input_ids, prompts)
+                        )
+                        eval_preds.extend(generated_audios.to("cpu"))
+                        eval_descriptions.extend(input_ids.to("cpu"))
+                        eval_prompts.extend(prompts.to("cpu"))
+
+                eval_time = time.time() - eval_start
+                # normalize eval metrics
+                eval_metrics = {
+                    key: torch.mean(torch.cat([d[key] for d in eval_metrics])).to("cpu") for key in eval_metrics[0]
+                }
+
+                # compute metrics
+                metrics_desc = ""
+                if training_args.predict_with_generate and (cur_step % eval_generation_steps == 0 or cur_step == total_train_steps):
+                    if accelerator.is_local_main_process:
+                        (
+                            metric_values,
+                            pred_descriptions,
+                            pred_prompts,
+                            audios,
+                            transcriptions,
+                            si_sdr_measures,
+                        ) = compute_metrics(
+                            eval_preds,
+                            eval_descriptions,
+                            eval_prompts,
+                            accelerator.device,
+                            training_args.compute_clap_similarity_metric,
+                            training_args.compute_noise_level_metric,
+                            training_args.noise_level_to_compute_clean_wer,
+                        )
+                        eval_metrics.update(metric_values)
+                        metrics_desc = " ".join([f"Eval {key}: {value} |" for key, value in metric_values.items()])
+                        if "wandb" in training_args.report_to:
+                            log_pred(
+                                accelerator,
+                                pred_descriptions,
+                                pred_prompts,
+                                transcriptions,
+                                audios,
+                                si_sdr_measures,
+                                sampling_rate=sampling_rate,
+                                step=cur_step,
+                                prefix="eval",
+                            )
+                    accelerator.wait_for_everyone()
+
+                # Print metrics and update progress bar
+                if accelerator.is_local_main_process:
+                    steps_trained_progress_bar.write(
+                        f"Eval results for step ({cur_step} / {total_train_steps} | Eval Loss: {eval_metrics['loss']} |"
+                        f" {metrics_desc})"
+                    )
+
+                log_metric(
+                    accelerator,
+                    metrics=eval_metrics,
+                    train_time=eval_time,
+                    step=cur_step,
+                    epoch=epoch,
+                    prefix="eval",
+                )
+
+                # release eval batch and relax metrics
+                eval_metrics, eval_preds, eval_descriptions, eval_prompts, batch, eval_metric = release_memory(
+                    eval_metrics, eval_preds, eval_descriptions, eval_prompts, batch, eval_metric
+                )
+                if training_args.predict_with_generate and (cur_step % eval_generation_steps == 0 or cur_step == total_train_steps):
+                    generated_audios, input_ids, prompts = release_memory(generated_audios, input_ids, prompts)
+
+                # train mode
+                model.train()
+
+                # flush the train metrics
+                train_start = time.time()
+
+            # break condition
+            if cur_step == total_train_steps:
+                continue_training = False
+                break
+
+        if not continue_training:
+            break
+
+    accelerator.end_training()
+
+
+# Run the training entry point only when this file is executed as a script.
+if __name__ == "__main__":
+    main()
diff --git a/capspeech/ar/training/finetune_capttsse.py b/capspeech/ar/training/finetune_capttsse.py
new file mode 100644
index 0000000000000000000000000000000000000000..0502d889fede2eb7e4a5ef6cb5f04c701e1cf25c
--- /dev/null
+++ b/capspeech/ar/training/finetune_capttsse.py
@@ -0,0 +1,1267 @@
+#!/usr/bin/env python
+# coding=utf-8
+# Copyright 2024 The HuggingFace Inc. team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+""" Train Parler-TTS using 🤗 Accelerate"""
+
+import logging
+import os
+import re
+import sys
+import time
+import math
+import contextlib
+from multiprocess import set_start_method
+from datetime import timedelta
+import inspect
+from tqdm import tqdm
+from pathlib import Path
+import wandb
+
+import torch
+from torch.utils.data import DataLoader
+
+import datasets
+from datasets import DatasetDict, Dataset, IterableDataset, concatenate_datasets
+
+from huggingface_hub import HfApi
+
+import transformers
+from transformers import AutoFeatureExtractor, AutoTokenizer, HfArgumentParser
+from transformers.trainer_pt_utils import LengthGroupedSampler
+from transformers.optimization import get_scheduler
+from transformers.utils import send_example_telemetry
+
+
+from accelerate import Accelerator, skip_first_batches
+from accelerate.utils import set_seed, AutocastKwargs, InitProcessGroupKwargs, TorchDynamoPlugin, DistributedDataParallelKwargs
+from accelerate.utils.memory import release_memory
+
+from parler_tts import (
+    ParlerTTSConfig,
+    ParlerTTSForConditionalGeneration,
+    build_delay_pattern_mask,
+)
+
+from training.utils import (
+    get_last_checkpoint,
+    rotate_checkpoints,
+    log_pred,
+    log_metric,
+    load_all_codec_checkpoints,
+    save_codec_checkpoint,
+    get_last_codec_checkpoint_step,
+)
+from training.arguments_capttsse import ModelArguments, DataTrainingArguments, ParlerTTSTrainingArguments
+from training.data_capttsse import load_multiple_datasets, DataCollatorParlerTTSWithPadding, DataCollatorEncodecWithPadding
+from training.eval import clap_similarity, wer, si_sdr
+
+# Module-level logger named after this module; its level is set inside main().
+logger = logging.getLogger(__name__)
+
+
+def main():
+    # See all possible arguments in src/transformers/training_args.py
+    # or by passing the --help flag to this script.
+    # We now keep distinct sets of args, for a cleaner separation of concerns.
+
+    parser = HfArgumentParser((ModelArguments, DataTrainingArguments, ParlerTTSTrainingArguments))
+    if len(sys.argv) == 2 and sys.argv[1].endswith(".json"):
+        # If we pass only one argument to the script and it's the path to a json file,
+        # let's parse it to get our arguments.
+        model_args, data_args, training_args = parser.parse_json_file(json_file=os.path.abspath(sys.argv[1]))
+    else:
+        model_args, data_args, training_args = parser.parse_args_into_dataclasses()
+
+    # Sending telemetry. Tracking the example usage helps us better allocate resources to maintain them. The
+    # information sent is the one passed as arguments along with your Python/PyTorch versions.
+    send_example_telemetry("run_parler_tts", model_args, data_args)
+    
+    if data_args.wandb_key is not None:
+        wandb.login(key=data_args.wandb_key)
+
+    if training_args.dtype == "float16":
+        mixed_precision = "fp16"
+        torch_dtype = torch.float16
+    elif training_args.dtype == "bfloat16":
+        mixed_precision = "bf16"
+        torch_dtype = torch.bfloat16
+    else:
+        mixed_precision = "no"
+        torch_dtype = torch.float32
+
+    if data_args.pad_to_max_length and (
+        data_args.max_duration_in_seconds is None
+        or data_args.max_prompt_token_length is None
+        or data_args.max_description_token_length is None
+    ):
+        raise ValueError(
+            "`pad_to_max_length` is `True` but one of the following parameters has not been set: `max_duration_in_seconds`, `max_prompt_token_length`, `max_description_token_length`"
+        )
+
+    padding = "max_length" if data_args.pad_to_max_length else "longest"
+
+    ####### A. Preparation
+    kwargs_handlers = [InitProcessGroupKwargs(timeout=timedelta(minutes=120)), DistributedDataParallelKwargs(find_unused_parameters=False)]
+
+    accelerator = Accelerator(
+        gradient_accumulation_steps=training_args.gradient_accumulation_steps,
+        mixed_precision=mixed_precision,
+        log_with=training_args.report_to,
+        project_dir=training_args.output_dir,
+        kwargs_handlers=kwargs_handlers,
+    )
+
+    accelerator.init_trackers(
+        project_name=data_args.wandb_project,
+        config={
+            "learning_rate": training_args.learning_rate,
+            "model_name_or_path": model_args.model_name_or_path,
+            "num_train_epochs": training_args.num_train_epochs,
+            "gradient_accumulation_steps": training_args.gradient_accumulation_steps,
+            "per_device_train_batch_size": training_args.per_device_train_batch_size,
+            "global_batch_size": training_args.per_device_train_batch_size * accelerator.num_processes,
+            "mixed_precision": mixed_precision,
+            "lr_scheduler_type": training_args.lr_scheduler_type,
+            "warmup_steps": training_args.warmup_steps,
+            "freeze_text_encoder": model_args.freeze_text_encoder,
+            "max_duration_in_seconds": data_args.max_duration_in_seconds,
+            "weight_decay": training_args.weight_decay,
+            "adam_beta1": training_args.adam_beta1,
+            "adam_beta2": training_args.adam_beta2,
+            "temperature": model_args.temperature,
+        },
+        init_kwargs={"wandb": {"name": data_args.wandb_run_name}} if data_args.wandb_run_name else {},
+    )
+
+    # Detecting last checkpoint and eventually continue from last checkpoint
+    last_checkpoint = None
+    if os.path.isdir(training_args.output_dir) and training_args.do_train and not training_args.overwrite_output_dir:
+        last_checkpoint = get_last_checkpoint(training_args.output_dir)
+        if last_checkpoint is None and len(os.listdir(training_args.output_dir)) > 0:
+            raise ValueError(
+                f"Output directory ({training_args.output_dir}) already exists and is not empty. "
+                "Use --overwrite_output_dir to overcome."
+            )
+        elif last_checkpoint is not None and training_args.resume_from_checkpoint is None:
+            logger.info(
+                f"Checkpoint detected, resuming training at {last_checkpoint}. To avoid this behavior, change "
+                "the `--output_dir` or add `--overwrite_output_dir` to train from scratch."
+            )
+
+    # Setup logging
+    logging.basicConfig(
+        format="%(asctime)s - %(levelname)s - %(name)s - %(message)s",
+        datefmt="%m/%d/%Y %H:%M:%S",
+        handlers=[logging.StreamHandler(sys.stdout)],
+    )
+    logger.setLevel(logging.INFO if accelerator.is_main_process else logging.WARN)
+
+    # Log a small summary on each process
+    logger.warning(
+        f"Process rank: {training_args.local_rank}, device: {training_args.device}, n_gpu: {training_args.n_gpu}, "
+        f"distributed training: {training_args.parallel_mode.value == 'distributed'}, 16-bits training: {training_args.fp16}"
+    )
+
+    # Set the verbosity to info of the Transformers logger (on main process only)
+    if accelerator.is_local_main_process:
+        datasets.utils.logging.set_verbosity_warning()
+        transformers.utils.logging.set_verbosity_info()
+    else:
+        datasets.utils.logging.set_verbosity_error()
+        transformers.utils.logging.set_verbosity_error()
+
+    logger.info("Training/evaluation parameters %s", training_args)
+
+    # Set seed before initializing model.
+    set_seed(training_args.seed)
+    num_workers = data_args.preprocessing_num_workers
+
+    # 1. First, let's instantiate the feature extractor, tokenizers and model
+    # Note for distributed training, the .from_pretrained methods guarantee that only
+    # one local process can concurrently download model & vocab.
+
+    # load feature extractor
+    feature_extractor = AutoFeatureExtractor.from_pretrained(
+        model_args.feature_extractor_name or model_args.model_name_or_path,
+        cache_dir=model_args.cache_dir,
+        token=data_args.token,
+        trust_remote_code=data_args.trust_remote_code,
+    ) 
+    sampling_rate = feature_extractor.sampling_rate
+
+    # load prompt tokenizer
+    prompt_tokenizer = AutoTokenizer.from_pretrained(
+        model_args.prompt_tokenizer_name or model_args.description_tokenizer_name or model_args.model_name_or_path,
+        cache_dir=model_args.cache_dir,
+        token=data_args.token,
+        trust_remote_code=data_args.trust_remote_code,
+        use_fast=model_args.use_fast_tokenizer,
+        padding_side=model_args.prompt_padding_side,
+    )
+
+    # load description tokenizer
+    description_tokenizer = AutoTokenizer.from_pretrained(
+        model_args.description_tokenizer_name or model_args.model_name_or_path,
+        cache_dir=model_args.cache_dir,
+        token=data_args.token,
+        trust_remote_code=data_args.trust_remote_code,
+        use_fast=model_args.use_fast_tokenizer,
+    )
+
+    if model_args.use_fast_tokenizer:
+        logger.warning(
+            "Disabling fast tokenizer warning: https://github.com/huggingface/transformers/blob/main/src/transformers/tokenization_utils_base.py#L3231-L3235"
+        )
+        prompt_tokenizer.deprecation_warnings["Asking-to-pad-a-fast-tokenizer"] = True
+        description_tokenizer.deprecation_warnings["Asking-to-pad-a-fast-tokenizer"] = True
+
+    # 2. Now, let's load the dataset
+
+    if data_args.save_to_disk is not None:
+        os.makedirs(data_args.save_to_disk, exist_ok=True)
+
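+    # assume that the dataset has already been saved to `save_to_disk` if that directory is not empty
+    # NOTE(review): `save_to_disk` is only None-checked for the makedirs call above; `os.listdir(None)` appears to
+    # list the current working directory, so confirm that `save_to_disk` is always set.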
+    dataset_was_precomputed = len(os.listdir(data_args.save_to_disk)) > 0
+    if dataset_was_precomputed:
+        with accelerator.local_main_process_first():
+            vectorized_datasets = datasets.load_from_disk(data_args.save_to_disk)
+    else:
+        raw_datasets = DatasetDict()
+
+        columns_to_keep = {
+            "target_audio_column_name": data_args.target_audio_column_name,
+            "prompt_column_name": data_args.prompt_column_name,
+            "source": data_args.source_column_name,
+        }
+        if data_args.description_column_name is not None:
+            columns_to_keep["description_column_name"] = data_args.description_column_name
+
+        if training_args.do_train:
+            raw_datasets["train"] = load_multiple_datasets(
+                accelerator,
+                data_args.train_dataset_name,
+                splits=data_args.train_split_name,
+                dataset_samples=data_args.train_dataset_samples,
+                seed=training_args.seed,
+                cache_dir=model_args.cache_dir,
+                num_proc=data_args.preprocessing_num_workers,
+                id_column_name=data_args.id_column_name,
+                columns_to_keep=columns_to_keep.values(),
+                prompt_column_name=data_args.prompt_column_name,
+                audio_column_name=data_args.target_audio_column_name,
+                sampling_rate=sampling_rate,
+                logger=logger,
+                librittsrmix_dir=data_args.librittsrmix_dir,
+                # streaming=data_args.streaming, TODO(SG): optionally enable streaming mode
+            )
+
+            for key in columns_to_keep:
+                if columns_to_keep[key] not in raw_datasets["train"].column_names:
+                    raise ValueError(
+                        f"--{key} '{columns_to_keep[key]}' not found in dataset '{data_args.train_dataset_name}'."
+                        f" Make sure to set `--{key}` to the correct audio column - one of"
+                        f" {', '.join(raw_datasets['train'].column_names)}."
+                    )
+
+            if data_args.max_train_samples is not None:
+                raw_datasets["train"] = raw_datasets["train"].select(range(data_args.max_train_samples))
+
+        if training_args.do_eval:
+            raw_datasets["eval"] = load_multiple_datasets(
+                accelerator,
+                data_args.eval_dataset_name if data_args.eval_dataset_name else data_args.train_dataset_name,
+                splits=data_args.eval_split_name,
+                cache_dir=model_args.cache_dir,
+                num_proc=data_args.preprocessing_num_workers,
+                id_column_name=data_args.id_column_name,
+                columns_to_keep=columns_to_keep.values(),
+                prompt_column_name=data_args.prompt_column_name,
+                audio_column_name=data_args.target_audio_column_name,
+                sampling_rate=sampling_rate,
+                logger=logger,
+                librittsrmix_dir=data_args.librittsrmix_dir,
+                # streaming=data_args.streaming, TODO(SG): optionally enable streaming mode
+            )
+
+            if data_args.max_eval_samples is not None:
+                with accelerator.local_main_process_first():
+                    raw_datasets["eval"] = (
+                        raw_datasets["eval"].shuffle(seed=training_args.seed).select(range(data_args.max_eval_samples))
+                    )
+
+    # 3. Next, let's load the config.
+    config = ParlerTTSConfig.from_pretrained(
+        model_args.model_name_or_path,
+        cache_dir=model_args.cache_dir,
+        token=data_args.token,
+        trust_remote_code=data_args.trust_remote_code,
+    )
+ 
+    if training_args.codebook_weights is not None and len(training_args.codebook_weights) != config.decoder.num_codebooks:
+        raise ValueError(f"`codebook_weights` has length {len(training_args.codebook_weights)} when it should be of length {config.decoder.num_codebooks}.")
+
+    # update pad token id and decoder_start_token_id
+    config.decoder.update(
+        {
+            "cross_attention_implementation_strategy": model_args.cross_attention_implementation_strategy
+            if model_args.cross_attention_implementation_strategy is not None
+            else None,
+            "codebook_weights": training_args.codebook_weights if training_args.codebook_weights is not None else config.decoder.codebook_weights
+        }
+    )
+    config.update(
+        {
+            "pad_token_id": model_args.pad_token_id if model_args.pad_token_id is not None else config.pad_token_id,
+            "decoder_start_token_id": model_args.decoder_start_token_id
+            if model_args.decoder_start_token_id is not None
+            else config.decoder_start_token_id,
+        }
+    )
+
+    with open("events.txt", "r") as f:
+        events = [line.strip() for line in f]
+    events = ["<"+event.lower().replace(" ", "_")+">" for event in events]
+    events.append("<B_start>")
+    events.append("<B_end>")
+    events.append("<I_start>")
+    events.append("<I_end>")
+
+    special_tokens = {"additional_special_tokens": events}
+    prompt_tokenizer.add_special_tokens(special_tokens)
+    description_tokenizer.add_special_tokens(special_tokens)
+    padded_vocab_size = ((len(prompt_tokenizer) + 127) // 128) * 128 
+    config.vocab_size = padded_vocab_size 
+
+    # create model
+    model = ParlerTTSForConditionalGeneration.from_pretrained(
+        model_args.model_name_or_path,
+        ignore_mismatched_sizes=True,
+        cache_dir=model_args.cache_dir,
+        config=config,
+        token=data_args.token,
+        trust_remote_code=data_args.trust_remote_code,
+        attn_implementation={"decoder": model_args.attn_implementation, "text_encoder": "eager"},
+    )
+    model.text_encoder.resize_token_embeddings(padded_vocab_size)
+
+    # enable gradient checkpointing if necessary
+    if training_args.gradient_checkpointing:
+        model.gradient_checkpointing_enable()
+
+    # 4. Now we preprocess the datasets including loading the audio, resampling and normalization
+    # Thankfully, `datasets` takes care of automatically loading and resampling the audio,
+    # so that we just need to set the correct target sampling rate and normalize the input
+    # via the `feature_extractor`
+
+    # derive max & min input length for sample rate & max duration
+    sampling_rate = feature_extractor.sampling_rate
+    max_target_length = int(data_args.max_duration_in_seconds * sampling_rate)
+    min_target_length = int(data_args.min_duration_in_seconds * sampling_rate)
+    target_audio_column_name = data_args.target_audio_column_name
+    description_column_name = data_args.description_column_name
+    prompt_column_name = data_args.prompt_column_name
+    feature_extractor_input_name = feature_extractor.model_input_names[0]
+    audio_encoder_pad_token_id = config.decoder.pad_token_id
+    audio_encoder_eos_token_id = config.decoder.eos_token_id
+    audio_encoder_bos_token_id = model.generation_config.decoder_start_token_id
+    max_length = model.generation_config.max_length
+    num_codebooks = model.decoder.config.num_codebooks
+    bandwidth = model_args.bandwidth
+    attn_implementation = model_args.attn_implementation
+
+    # Freeze Encoders
+    model.freeze_encoders(model_args.freeze_text_encoder)
+
+    # Test all gather - used for warmup and avoiding timeout
+    logger.debug(str(accelerator.process_index), main_process_only=False, in_order=True)
+    test_tensor = torch.tensor([accelerator.process_index], device=accelerator.device)
+    gathered_tensor = accelerator.gather(test_tensor)
+    print("gathered_tensor", gathered_tensor)
+    accelerator.wait_for_everyone()
+
+    if not dataset_was_precomputed:
+        # Filter on text length
+        if description_column_name is not None and data_args.max_text_length is not None:
+            with accelerator.local_main_process_first():
+                # filter description that is shorter than max_text_length
+                raw_datasets = raw_datasets.filter(
+                    lambda x: len(x) < data_args.max_text_length,
+                    num_proc=num_workers,
+                    input_columns=[description_column_name],
+                )
+
+        # Preprocessing the dataset.
+        # We need to tokenize the texts.
+        def pass_through_processors(description, prompt):
+            batch = {}
+
+            batch["input_ids"] = description_tokenizer(description.strip())["input_ids"]
+            batch["prompt_input_ids"] = prompt_tokenizer(prompt.strip())["input_ids"]
+
+            return batch
+
+        with accelerator.local_main_process_first():
+            # this is a trick to avoid to rewrite the entire audio column which takes ages
+            vectorized_datasets = raw_datasets.map(
+                pass_through_processors,
+                remove_columns=next(iter(raw_datasets.values())).column_names,
+                input_columns=[description_column_name, prompt_column_name],
+                num_proc=num_workers,
+                desc="preprocess datasets",
+            )
+
+        # We use Accelerate to perform distributed inference
+        # T5 doesn't support fp16
+        autocast_kwargs = AutocastKwargs(enabled=(mixed_precision != "fp16"))
+
+        # Now we encode the audio labels with encodec.
+        ####### B. Encode audio
+
+        logger.info("*** Encode target audio with encodec ***")
+
+        # no need to prepare audio_decoder because used for inference without mixed precision
+        # see: https://huggingface.co/docs/accelerate/main/en/package_reference/accelerator#accelerate.Accelerator.prepare
+        if training_args.torch_compile:
+            audio_decoder = accelerator.prepare_model(model.audio_encoder, evaluation_mode=True)
+        else:
+            audio_decoder = model.audio_encoder
+
+        encoder_data_collator = DataCollatorEncodecWithPadding(
+            feature_extractor,
+            audio_column_name=target_audio_column_name,
+            librittsrmix_dir=data_args.librittsrmix_dir,
+            feature_extractor_input_name=feature_extractor_input_name,
+            max_length=max_target_length,
+            padding=padding,
+        )
+        encoder_signature = set(inspect.signature(audio_decoder.forward).parameters)
+
+        def apply_audio_decoder(batch):
+            len_audio = batch.pop("len_audio")
+            audio_decoder.to(batch["input_values"].device).eval()
+            if bandwidth is not None:
+                batch["bandwidth"] = bandwidth
+            elif "num_quantizers" in encoder_signature:
+                batch["num_quantizers"] = num_codebooks
+            elif "num_codebooks" in encoder_signature:
+                batch["num_codebooks"] = num_codebooks
+            elif "n_quantizers" in encoder_signature:
+                batch["n_quantizers"] = num_codebooks
+
+            with torch.no_grad():
+                labels = audio_decoder.encode(**batch)["audio_codes"]
+            output = {}
+            output["len_audio"] = len_audio
+            # (1, bsz, codebooks, seq_len) -> (bsz, seq_len, codebooks)
+            output["labels"] = labels.squeeze(0).transpose(1, 2)
+
+            # if `pad_to_max_length`, the maximum corresponding audio length of the current batch is max_duration*sampling_rate
+            max_length = len_audio.max() if padding != "max_length" else max_target_length
+            output["ratio"] = torch.ones_like(len_audio) * labels.shape[-1] / max_length
+            return output
+
+        # (1, codebooks, seq_len) where seq_len=1
+        bos_labels = torch.ones((1, num_codebooks, 1)) * audio_encoder_bos_token_id
+
+        def postprocess_dataset(labels):
+            """Prepend BOS to one example's codec labels and apply the delay pattern.
+
+            Args:
+                labels: codes of a single example; converted with `torch.tensor` and
+                    given a leading axis of size 1 (presumably yielding
+                    (1, codebooks, seq_len) — TODO confirm against the saved layout).
+
+            Returns:
+                dict whose "labels" entry is the delayed label tensor with index 0
+                along dim 1 removed.
+            """
+            # (1, codebooks, seq_len)
+            labels = torch.tensor(labels).unsqueeze(0)
+            # add bos
+            labels = torch.cat([bos_labels, labels], dim=-1)
+
+            # NOTE: the EOS id is deliberately passed as `pad_token_id`; presumably this makes
+            # the helper fill padded positions with EOS — verify against build_delay_pattern_mask.
+            labels, delay_pattern_mask = build_delay_pattern_mask(
+                labels,
+                bos_token_id=audio_encoder_bos_token_id,
+                pad_token_id=audio_encoder_eos_token_id,
+                max_length=labels.shape[-1] + num_codebooks,
+                num_codebooks=num_codebooks,
+            )
+
+            # the first ids of the delay pattern mask are precisely labels, we use the rest of the labels mask
+            # to take care of EOS
+            # we want labels to look like this:
+            #  - [B, a, b, E, E, E, E]
+            #  - [B, B, c, d, E, E, E]
+            #  - [B, B, B, e, f, E, E]
+            #  - [B, B, B, B, g, h, E]
+            # every position still marked -1 in the mask is replaced by EOS
+            labels = torch.where(delay_pattern_mask == -1, audio_encoder_eos_token_id, delay_pattern_mask)
+
+            # the first timestep is associated to a row full of BOS, let's get rid of it
+            # NOTE(review): an earlier comment also mentioned removing trailing PAD timesteps,
+            # but the slice below only drops index 0 along dim 1.
+            output = {"labels": labels[:, 1:]}
+            return output
+
+        for split in vectorized_datasets:
+            data_loader = DataLoader(
+                raw_datasets[split],
+                batch_size=training_args.audio_encoder_per_device_batch_size,
+                collate_fn=encoder_data_collator,
+                num_workers=training_args.dataloader_num_workers,
+                pin_memory=True,
+            )
+            data_loader = accelerator.prepare(data_loader)
+            total_inference_steps = len(data_loader)
+
+            start_step = get_last_codec_checkpoint_step(os.path.join(data_args.temporary_save_to_disk, split))
+            accelerator.wait_for_everyone()
+            if start_step > 0:
+                logger.info(f"Resuming {split} from step {start_step}")
+                # efficiently skip the first n batches
+                start_step += 1
+                data_loader = skip_first_batches(data_loader, start_step)
+
+            all_generated_labels = []
+            all_lens = []
+            if start_step < total_inference_steps:
+                for i, batch in enumerate(tqdm(data_loader, disable=not accelerator.is_local_main_process)):
+                    cur_step = start_step + i
+                    generate_labels = apply_audio_decoder(batch)
+                    generate_labels = accelerator.pad_across_processes(generate_labels, dim=1, pad_index=0)
+                    generate_labels = accelerator.gather_for_metrics(generate_labels)
+
+                    if accelerator.is_main_process:
+                        lab = generate_labels["labels"].cpu().transpose(1, 2).to(torch.int16)
+                        rat = generate_labels["ratio"].cpu().squeeze(1)
+                        lens = generate_labels["len_audio"].cpu().squeeze(1)
+                        lab = [l[:, : int(ratio * length)] for (l, ratio, length) in zip(lab, rat, lens)]
+
+                        all_generated_labels.extend(lab)
+                        all_lens.extend(lens)
+
+                        if ((cur_step + 1) % data_args.save_codec_steps == 0) or (
+                            cur_step == total_inference_steps - 1
+                        ):
+                            tmp_labels = Dataset.from_dict({"labels": all_generated_labels, "target_length": all_lens})
+                            tmp_labels = tmp_labels.map(
+                                postprocess_dataset,
+                                num_proc=data_args.preprocessing_num_workers,  # this one is resource consuming if many processor.
+                                input_columns=["labels"],
+                                desc="Postprocessing labeling",
+                            )
+                            save_codec_checkpoint(
+                                os.path.join(data_args.temporary_save_to_disk, split), tmp_labels, cur_step
+                            )
+                            all_generated_labels = []
+                            all_lens = []
+
+                accelerator.wait_for_everyone()
+
+            if accelerator.is_main_process and len(all_generated_labels) > 0:
+                tmp_labels = Dataset.from_dict({"labels": all_generated_labels, "target_length": all_lens})
+                tmp_labels = tmp_labels.map(
+                    postprocess_dataset,
+                    num_proc=data_args.preprocessing_num_workers,  # this one is resource consuming if many processor.
+                    input_columns=["labels"],
+                    desc="Postprocessing labeling",
+                )
+                save_codec_checkpoint(os.path.join(data_args.temporary_save_to_disk, split), tmp_labels, cur_step)
+                all_generated_labels = []
+                all_lens = []
+            accelerator.wait_for_everyone()
+
+            del all_generated_labels
+            accelerator.wait_for_everyone()
+
+            with accelerator.local_main_process_first():
+                tmp_labels = load_all_codec_checkpoints(os.path.join(data_args.temporary_save_to_disk, split)).select(
+                    range(len(vectorized_datasets[split]))
+                )
+                logger.info(f"Concatenating {split}: {tmp_labels} with {vectorized_datasets[split]}")
+                vectorized_datasets[split] = concatenate_datasets([vectorized_datasets[split], tmp_labels], axis=1)
+
+        accelerator.free_memory()
+        del generate_labels, all_lens
+
+        with accelerator.local_main_process_first():
+            # NOTE: filtering is done at the end because in the `datasets` library, caching audio files is done after most operations
+            # caching audio files is time and disk-space consuming, so we want to avoid it at all costs, especially for large (>1Kh) audio datasets.
+            # That's also why we avoid to concat the processed datasets (vectorized_datasets) with the audio column present in raw_datasets.
+
+            def is_audio_in_length_range(length):
+                return length > min_target_length and length < max_target_length
+
+            # keep only examples whose audio length lies strictly between min_target_length and max_target_length
+            vectorized_datasets = vectorized_datasets.filter(
+                is_audio_in_length_range,
+                num_proc=num_workers,
+                input_columns=["target_length"],
+            )
+
+        if description_column_name is not None and data_args.max_description_token_length is not None:
+            with accelerator.local_main_process_first():
+                # keep only examples whose tokenized description is shorter than max_description_token_length
+                vectorized_datasets = vectorized_datasets.filter(
+                    lambda x: len(x) < data_args.max_description_token_length,
+                    num_proc=num_workers,
+                    input_columns=["input_ids"],
+                )
+
+        if data_args.max_prompt_token_length is not None:
+            with accelerator.local_main_process_first():
+                # keep only examples whose tokenized prompt is shorter than max_prompt_token_length
+                vectorized_datasets = vectorized_datasets.filter(
+                    lambda x: len(x) < data_args.max_prompt_token_length,
+                    num_proc=num_workers,
+                    input_columns=["prompt_input_ids"],
+                )
+
+    if data_args.save_to_disk is not None and not dataset_was_precomputed:
+        if accelerator.is_main_process:
+            vectorized_datasets.save_to_disk(
+                data_args.save_to_disk,
+                num_proc=min(data_args.preprocessing_num_workers, len(vectorized_datasets["eval"]) - 1),
+            )
+        accelerator.wait_for_everyone()
+        logger.info(f"Dataset saved at {data_args.save_to_disk}")
+
+    audio_max_length = None
+    if padding == "max_length":
+        audio_max_length = max(vectorized_datasets["train"]["target_length"])
+        with accelerator.local_main_process_first():
+            max_sample = vectorized_datasets["train"].filter(
+                lambda x: x == audio_max_length,
+                num_proc=num_workers,
+                input_columns=["target_length"],
+            )
+        audio_max_length = max([len(l[0]) for l in max_sample["labels"]])
+
+    if description_column_name is not None and data_args.max_description_token_length is not None:
+        with accelerator.local_main_process_first():
+            # keep only examples whose tokenized description is shorter than max_description_token_length
+            vectorized_datasets = vectorized_datasets.filter(
+                lambda x: len(x) < data_args.max_description_token_length,
+                num_proc=num_workers,
+                input_columns=["input_ids"],
+            )
+
+    if data_args.max_prompt_token_length is not None:
+        with accelerator.local_main_process_first():
+            # keep only examples whose tokenized prompt is shorter than max_prompt_token_length
+            vectorized_datasets = vectorized_datasets.filter(
+                lambda x: len(x) < data_args.max_prompt_token_length,
+                num_proc=num_workers,
+                input_columns=["prompt_input_ids"],
+            )
+
+    if training_args.group_by_length:
+        # apply a simple heuristic to take into account audio and text lengths
+        def add_target_lengths(target_length, prompt, description):
+            return {"target_length": target_length + len(prompt) + len(description)}
+
+        with accelerator.local_main_process_first():
+            vectorized_datasets = vectorized_datasets.map(
+                add_target_lengths,
+                num_proc=num_workers,
+                input_columns=["target_length", "prompt_input_ids", "input_ids"],
+            )
+
+    # for large datasets it is advised to run the preprocessing on a
+    # single machine first with ``args.preprocessing_only`` since there will mostly likely
+    # be a timeout when running the script in distributed mode.
+    # In a second step ``args.preprocessing_only`` can then be set to `False` to load the
+    # cached dataset
+    if data_args.preprocessing_only and data_args.save_to_disk is None:
+        raise ValueError(
+            "`preprocessing_only=True` but `save_to_disk` is not set. The latter should indicates where to save the dataset locally."
+        )
+    elif data_args.preprocessing_only:
+        logger.info(f"Data preprocessing finished. Files save at {data_args.save_to_disk}")
+        return
+
+    # 6. Next, we can prepare the training.
+
+    # Let's use CLAP similarity and WER as our evaluation metrics.
+    def compute_metrics(
+        audios,
+        descriptions,
+        prompts,
+        device="cpu",
+        compute_clap_similarity_metric=False,
+        compute_noise_level_metric=False,
+        noise_level_to_compute_clean_wer=None,
+    ):
+        results = {}
+        input_ids = descriptions
+        texts = description_tokenizer.batch_decode(input_ids, skip_special_tokens=True)
+        prompts = prompt_tokenizer.batch_decode(prompts, skip_special_tokens=True)
+        audios = [a.float().cpu().numpy() for a in audios]
+
+        if compute_clap_similarity_metric:
+            clap_score = clap_similarity(
+                model_args.clap_model_name_or_path, texts, audios, device, input_sampling_rate=sampling_rate
+            )
+            results["clap"] = clap_score
+
+        si_sdr_measures = None
+        if compute_noise_level_metric:
+            si_sdr_measures = si_sdr(audios, device, input_sampling_rate=sampling_rate)
+
+        word_error, transcriptions, clean_word_error, noisy_word_error, percent_clean_samples = wer(
+            model_args.asr_model_name_or_path,
+            prompts,
+            audios,
+            device,
+            training_args.per_device_eval_batch_size,
+            sampling_rate,
+            noise_level_to_compute_clean_wer,
+            si_sdr_measures,
+        )
+        results["wer"] = word_error
+        if clean_word_error is not None:
+            results["clean_wer"] = clean_word_error
+            results["noisy_word_error"] = noisy_word_error
+            results["percent_clean_samples"] = percent_clean_samples
+
+        return results, texts, prompts, audios, transcriptions, si_sdr_measures
+
+    # Define Training Schedule
+    # Store some constants
+    per_device_train_batch_size = int(training_args.per_device_train_batch_size)
+    train_batch_size = per_device_train_batch_size * accelerator.num_processes
+    gradient_accumulation_steps = int(training_args.gradient_accumulation_steps)
+    per_device_eval_batch_size = int(training_args.per_device_eval_batch_size)
+
+    if training_args.max_steps < 0:
+        num_epochs = int(training_args.num_train_epochs)
+        steps_per_epoch = len(vectorized_datasets["train"]) // (train_batch_size * gradient_accumulation_steps)
+        total_train_steps = steps_per_epoch * num_epochs
+    elif training_args.max_steps > 0:
+        logger.info("max_steps is given, it will override any value given in num_train_epochs")
+        total_train_steps = int(training_args.max_steps)
+        # Setting a very large number of epochs so we go as many times as necessary over the iterator.
+        num_epochs = sys.maxsize
+        steps_per_epoch = total_train_steps
+
+    if training_args.eval_steps is None:
+        logger.info(f"eval_steps is not set, evaluating at the end of each epoch")
+        eval_steps = steps_per_epoch
+    else:
+        eval_steps = training_args.eval_steps
+        
+    if training_args.eval_generation_steps is None:
+        eval_generation_steps = eval_steps
+    else:
+        eval_generation_steps = training_args.eval_generation_steps
+
+    # T5 doesn't support fp16
+    autocast_kwargs = AutocastKwargs(enabled=(mixed_precision != "fp16"))
+
+    # Define optimizer, LR scheduler, collator
+    optimizer = torch.optim.AdamW(
+        params=model.parameters(),
+        lr=training_args.learning_rate,
+        betas=(training_args.adam_beta1, training_args.adam_beta2),
+        eps=training_args.adam_epsilon,
+        weight_decay=training_args.weight_decay,
+    )
+
+    # LR scheduler gets stepped by `num_processes` each time -> account for this in warmup / total steps
+    lr_scheduler = get_scheduler(
+        name=training_args.lr_scheduler_type,
+        optimizer=optimizer,
+        num_warmup_steps=training_args.get_warmup_steps(total_train_steps) * accelerator.num_processes,
+        num_training_steps=total_train_steps * accelerator.num_processes,
+    )
+
+    # Instantiate custom data collator
+    data_collator = DataCollatorParlerTTSWithPadding(
+        prompt_tokenizer=prompt_tokenizer,
+        description_tokenizer=description_tokenizer,
+        pad_to_multiple_of=data_args.pad_to_multiple_of,
+        padding=padding,
+        prompt_max_length=data_args.max_prompt_token_length,
+        description_max_length=data_args.max_description_token_length,
+        audio_max_length=audio_max_length,
+    )
+
+    # Prepare everything with accelerate
+    model, optimizer, lr_scheduler = accelerator.prepare(model, optimizer, lr_scheduler)
+
+    num_examples = total_train_steps * train_batch_size * gradient_accumulation_steps
+    logger.info("***** Running training *****")
+    logger.info(f"  Num examples = {num_examples}")
+    logger.info("  Instantaneous batch size per device =" f" {per_device_train_batch_size}")
+    logger.info("  Gradient accumulation steps =" f" {gradient_accumulation_steps}")
+    logger.info(
+        f"  Total train batch size (w. parallel & distributed) = {train_batch_size * gradient_accumulation_steps}"
+    )
+    logger.info(f"  Total optimization steps = {total_train_steps}")
+
+    # ======================== Training ================================
+    train_time = 0
+    train_start = time.time()
+    steps_trained_progress_bar = tqdm(
+        range(total_train_steps), desc="Train steps ... ", position=0, disable=not accelerator.is_local_main_process
+    )
+    continue_training = True
+    epochs_trained = 0
+    cur_step = 0
+
+    checkpoint = None
+    if training_args.resume_from_checkpoint is not None:
+        checkpoint = training_args.resume_from_checkpoint
+    elif last_checkpoint is not None:
+        checkpoint = last_checkpoint
+
+    if accelerator.is_main_process:
+        if training_args.push_to_hub:
+            api = HfApi(token=training_args.hub_token)
+
+            # Create repo (repo_name from args or inferred)
+            repo_name = training_args.hub_model_id
+            if repo_name is None:
+                repo_name = Path(training_args.output_dir).absolute().name
+            repo_id = api.create_repo(repo_name, exist_ok=True).repo_id
+
+            with open(os.path.join(training_args.output_dir, ".gitignore"), "w+") as gitignore:
+                if "wandb" not in gitignore:
+                    gitignore.write("wandb\n")
+        elif training_args.output_dir is not None:
+            os.makedirs(training_args.output_dir, exist_ok=True)
+    accelerator.wait_for_everyone()
+
+    # Now save everything to be able to create a single processor later
+    # make sure all processes wait until data is saved
+    # only the main process saves them
+    if accelerator.is_main_process:
+        # save feature extractor, tokenizer and config
+        if (
+            model_args.prompt_tokenizer_name is None
+            and model_args.description_tokenizer_name
+            or (model_args.prompt_tokenizer_name == model_args.description_tokenizer_name)
+        ):
+            prompt_tokenizer.save_pretrained(training_args.output_dir)
+        else:
+            logger.warning(
+                f"Prompt tokenizer ('{model_args.prompt_tokenizer_name}') and description tokenizer ('{model_args.description_tokenizer_name}') are not the same. Saving only the prompt tokenizer."
+            )
+            prompt_tokenizer.save_pretrained(training_args.output_dir)
+
+        feature_extractor.save_pretrained(training_args.output_dir)
+        config.save_pretrained(training_args.output_dir)
+    accelerator.wait_for_everyone()
+
+    if checkpoint is not None:
+        accelerator.load_state(checkpoint)
+        # Find num steps and epoch from saved state string pattern
+        pattern = r"checkpoint-(\d+)-epoch-(\d+)"
+        match = re.search(pattern, checkpoint)
+        cur_step = int(match.group(1))
+        epochs_trained = int(match.group(2))
+
+        logger.info("  Continuing training from checkpoint, will skip to saved global_step")
+        logger.info(f"  Continuing training from epoch {epochs_trained}")
+        logger.info(f"  Continuing training from global step {cur_step}")
+
+        steps_trained_progress_bar.update(cur_step)
+
+        for epoch in range(0, epochs_trained):
+            with accelerator.local_main_process_first():
+                vectorized_datasets["train"] = vectorized_datasets["train"].shuffle(training_args.seed)
+
+        if training_args.max_steps < 0:
+            # we know exactly the number of steps per epoch, so can skip through the required number of batches
+            resume_step = (cur_step - epochs_trained * steps_per_epoch) * gradient_accumulation_steps
+        else:
+            # Currently we don't know how many steps we've taken in the current epoch
+            # So we just shuffle the dataset one extra time and start from a fresh epoch
+            # This is "good enough" for our purposes but not fully correct
+            resume_step = None
+            with accelerator.local_main_process_first():
+                vectorized_datasets["train"] = vectorized_datasets["train"].shuffle(training_args.seed)
+    else:
+        resume_step = None
+
+    gen_kwargs = {
+        "do_sample": model_args.do_sample,
+        "temperature": model_args.temperature,
+        "max_length": model_args.max_length,
+        # Because of the delayed pattern mask, generation might stop earlier because of unexpected behaviour
+        # on the first tokens of the codebooks that are delayed.
+        # This fix the issue.
+        "min_new_tokens": num_codebooks + 1,
+    }
+
+    # Define gradient update step fn
+    def train_step(
+        batch,
+        accelerator,
+        autocast_kwargs,
+        num_items_in_batch,
+        gradient_accumulation_steps,
+    ):
+        """Run one forward pass on `batch` and return the rescaled training loss.
+
+        When `mixed_precision` is "fp16", the text encoder is run separately inside
+        `accelerator.autocast(autocast_handler=autocast_kwargs)`; its hidden states are
+        optionally projected and masked, then stored in `batch["encoder_outputs"]`
+        (so `batch` is mutated) before the full model is called.
+
+        Args:
+            batch: mapping of model inputs, unpacked into `model(**batch)`.
+            accelerator: provides `autocast` and `num_processes`.
+            autocast_kwargs: handler passed to `accelerator.autocast`.
+            num_items_in_batch: divisor applied to the sum-reduced losses.
+            gradient_accumulation_steps: multiplier applied to the losses.
+
+        Returns:
+            `(ce_loss, metrics)`; `metrics` holds "loss" plus one
+            "codebook_{i}_loss" per entry of `outputs.per_codebook_losses`, all
+            rescaled the same way as `ce_loss`.
+        """
+        if mixed_precision == "fp16":
+            # fp16 doesn't work with T5-like models
+            with accelerator.autocast(autocast_handler=autocast_kwargs):
+                # in distributed mode the submodules are reached through `model.module`
+                if training_args.parallel_mode.value != "distributed":
+                    encoder_outputs = model.text_encoder(
+                        input_ids=batch.get("input_ids"), attention_mask=batch.get("attention_mask", None)
+                    )
+                else:
+                    encoder_outputs = model.module.text_encoder(
+                        input_ids=batch.get("input_ids"), attention_mask=batch.get("attention_mask", None)
+                    )
+                # we optionally project last_hidden_state to avoid recomputing every time
+                encoder_hidden_states = encoder_outputs.last_hidden_state
+                if (
+                    config.text_encoder.hidden_size != config.decoder.hidden_size
+                    and config.decoder.cross_attention_hidden_size is None
+                ):
+                    encoder_hidden_states = (
+                        model.enc_to_dec_proj(encoder_hidden_states)
+                        if training_args.parallel_mode.value != "distributed"
+                        else model.module.enc_to_dec_proj(encoder_hidden_states)
+                    )
+
+                # zero out hidden states at positions where the attention mask is 0
+                if batch.get("attention_mask", None) is not None:
+                    encoder_hidden_states = encoder_hidden_states * batch.get("attention_mask", None)[..., None]
+
+                encoder_outputs.last_hidden_state = encoder_hidden_states
+                batch["encoder_outputs"] = encoder_outputs
+
+        # the loss is requested sum-reduced and normalised manually below
+        outputs = model(**batch, loss_reduction="sum")
+        # CE (data) loss
+        # NOTE(review): the factor gradient_accumulation_steps * num_processes presumably offsets
+        # averaging applied later across accumulation steps and processes — confirm against the caller.
+        ce_loss = (outputs.loss * gradient_accumulation_steps * accelerator.num_processes) / num_items_in_batch
+
+        metrics = {"loss": ce_loss}
+        
+        # per-codebook CE losses, rescaled like `ce_loss`
+        per_codebook_losses = outputs.per_codebook_losses
+        metrics.update({f"codebook_{i}_loss": ((l  * gradient_accumulation_steps * accelerator.num_processes) / num_items_in_batch) for (i,l) in enumerate(per_codebook_losses)})
+        return ce_loss, metrics
+
+    # Define eval fn
+    def eval_step(
+        batch,
+        accelerator,
+        autocast_kwargs,
+    ):
+        """Run one evaluation forward pass on `batch` and return its loss metrics.
+
+        When `mixed_precision` is "fp16", the text encoder is run separately inside
+        `accelerator.autocast(autocast_handler=autocast_kwargs)` and its (optionally
+        projected and masked) outputs are stored in `batch["encoder_outputs"]`, so
+        `batch` is mutated. The final forward pass runs under `torch.no_grad()`.
+
+        Args:
+            batch: mapping of model inputs, unpacked into `eval_model(**batch)`.
+            accelerator: provides `autocast`.
+            autocast_kwargs: handler passed to `accelerator.autocast`.
+
+        Returns:
+            dict with "loss" and one "codebook_{i}_loss" per entry of
+            `outputs.per_codebook_losses`.
+        """
+        # with torch.compile the wrapped module is unwrapped via `_orig_mod` for evaluation
+        eval_model = model if not training_args.torch_compile else model._orig_mod
+
+        if mixed_precision == "fp16":
+            # fp16 doesn't work with T5-like models
+            # NOTE(review): this encoder pass uses `model` (not `eval_model`) and is not inside
+            # `torch.no_grad()` — confirm whether the caller already disables gradients.
+            with accelerator.autocast(autocast_handler=autocast_kwargs):
+                # in distributed mode the submodules are reached through `model.module`
+                if training_args.parallel_mode.value != "distributed":
+                    encoder_outputs = model.text_encoder(
+                        input_ids=batch.get("input_ids"), attention_mask=batch.get("attention_mask", None)
+                    )
+                else:
+                    encoder_outputs = model.module.text_encoder(
+                        input_ids=batch.get("input_ids"), attention_mask=batch.get("attention_mask", None)
+                    )
+                # we optionally project last_hidden_state to avoid recomputing every time
+                encoder_hidden_states = encoder_outputs.last_hidden_state
+                if (
+                    config.text_encoder.hidden_size != config.decoder.hidden_size
+                    and config.decoder.cross_attention_hidden_size is None
+                ):
+                    encoder_hidden_states = (
+                        model.enc_to_dec_proj(encoder_hidden_states)
+                        if training_args.parallel_mode.value != "distributed"
+                        else model.module.enc_to_dec_proj(encoder_hidden_states)
+                    )
+
+                # zero out hidden states at positions where the attention mask is 0
+                if batch.get("attention_mask", None) is not None:
+                    encoder_hidden_states = encoder_hidden_states * batch.get("attention_mask", None)[..., None]
+
+                encoder_outputs.last_hidden_state = encoder_hidden_states
+                batch["encoder_outputs"] = encoder_outputs
+
+        with torch.no_grad():
+            outputs = eval_model(**batch)
+        # CE (data) loss
+        ce_loss = outputs.loss
+        metrics = {"loss": ce_loss}
+        
+        # per-codebook CE losses, reported as returned by the model
+        per_codebook_losses = outputs.per_codebook_losses
+        metrics.update({f"codebook_{i}_loss": l for (i,l) in enumerate(per_codebook_losses)})
+        return metrics
+
+    def generate_step(batch, accelerator):
+        batch.pop("decoder_attention_mask", None)
+        eval_model = accelerator.unwrap_model(model, keep_fp32_wrapper=True)
+        if training_args.torch_compile:
+            # if the model is compiled, we use the original model bc compile is not compatible with .generate
+            eval_model = model._orig_mod
+
+        # since we've might have loaded the weights in fp32, we have to autocast to ensure FA2 weights are in half-precision.
+        # with accelerator.autocast(autocast_handler=AutocastKwargs(enabled=(attn_implementation=="flash_attention_2"))):
+        output_audios = eval_model.generate(**batch, **gen_kwargs)
+        output_audios = accelerator.pad_across_processes(output_audios, dim=1, pad_index=0)
+        return output_audios
+
+    model.train()
+
+    total_batched_samples = resume_step if resume_step is not None else 0
+    for epoch in range(epochs_trained, num_epochs):
+        with accelerator.local_main_process_first():
+            vectorized_datasets["train"] = vectorized_datasets["train"].shuffle(training_args.seed)
+        sampler = None
+        if training_args.group_by_length:
+            sampler = LengthGroupedSampler(train_batch_size, lengths=vectorized_datasets["train"]["target_length"])
+        train_dataloader = DataLoader(
+            vectorized_datasets["train"],
+            collate_fn=data_collator,
+            batch_size=per_device_train_batch_size,
+            sampler=sampler,
+            shuffle=not training_args.group_by_length,
+            num_workers=training_args.dataloader_num_workers,
+            pin_memory=training_args.dataloader_pin_memory,
+        )
+        train_dataloader = accelerator.prepare(train_dataloader)
+        if hasattr(train_dataloader, "dataset") and isinstance(train_dataloader.dataset, IterableDataset):
+            train_dataloader.dataset.set_epoch(epoch)
+
+        if resume_step is not None:
+            # Skip the first N batches in the dataloader when resuming from a checkpoint
+            logger.info(f"  Skip first {resume_step} batches")
+            train_dataloader = accelerator.skip_first_batches(train_dataloader, resume_step)
+            resume_step = None
+            accelerator.wait_for_everyone()
+
+        # We chunkify the epoch iterator into gradient accumulation steps `n` batches
+        train_iterator = iter(train_dataloader)
+        num_steps_in_epoch = len(train_dataloader)
+        remainder = num_steps_in_epoch % gradient_accumulation_steps
+        remainder = remainder if remainder != 0 else gradient_accumulation_steps
+        total_updates = math.ceil(num_steps_in_epoch / gradient_accumulation_steps)
+        
+        update_step = -1
+        for _ in range(total_updates):
+            update_step += 1
+            
+            # preload the total batch per step
+            batch_samples = []
+            num_batches_in_step = gradient_accumulation_steps if update_step != (total_updates - 1) else remainder
+            for _ in range(num_batches_in_step):
+                batch_samples += [next(train_iterator)]
+                
+            # get num items in batch - if different than BOS and than -100
+            num_items_in_batch = sum([(batch["labels"].ne(audio_encoder_bos_token_id) | batch["labels"].ne(-100) | batch["labels"].ne(audio_encoder_eos_token_id)).sum((0,1))[0] for batch in batch_samples])
+            num_items_in_batch = accelerator.gather(num_items_in_batch).sum().item()
+            
+            # losses = []
+            for i,batch in enumerate(batch_samples):
+                total_batched_samples += 1
+                ctx = model.no_sync if (i < len(batch_samples) - 1 and accelerator.num_processes > 1) else contextlib.nullcontext
+                
+                with ctx():
+                    loss, train_metric = train_step(batch, accelerator, autocast_kwargs, num_items_in_batch, gradient_accumulation_steps)
+                    accelerator.backward(loss)
+                    # losses.append(loss.detach())
+            
+            grad_norm = accelerator.clip_grad_norm_(model.parameters(), training_args.max_grad_norm)
+            optimizer.step()
+            lr_scheduler.step()
+            optimizer.zero_grad()
+
+            # The accelerator has performed an optimization step behind the scenes
+            steps_trained_progress_bar.update(1)
+            cur_step += 1
+
+            # losses = accelerator.gather(sum(losses)).sum().item() / (accelerator.num_processes * gradient_accumulation_steps)
+            
+            if cur_step % training_args.logging_steps == 0:
+                steps_trained_progress_bar.write(
+                    f"Step... ({cur_step} / {total_train_steps} | Loss:"
+                    f" {train_metric['loss']}, Learning Rate:"
+                    f" {lr_scheduler.get_last_lr()[0]})"
+                )
+                train_metric["grad_norm"] = grad_norm.detach().item() if isinstance(grad_norm, torch.Tensor) else grad_norm
+                log_metric(
+                    accelerator,
+                    metrics=train_metric,
+                    learning_rate=lr_scheduler.get_last_lr()[0],
+                    train_time=train_time + time.time() - train_start,
+                    step=cur_step,
+                    epoch=epoch,
+                    prefix="train",
+                )
+
+            # save checkpoint and weights after each save_steps and at the end of training
+            if (cur_step % training_args.save_steps == 0) or cur_step == total_train_steps:
+                intermediate_dir = os.path.join(training_args.output_dir, f"checkpoint-{cur_step}-epoch-{epoch}")
+                # safe_serialization=False to avoid shared tensors saving issue (TODO(YL): it's a temporary fix)
+                # https://github.com/huggingface/transformers/issues/27293#issuecomment-1872560074
+                accelerator.save_state(output_dir=intermediate_dir, safe_serialization=False)
+                accelerator.wait_for_everyone()
+                if accelerator.is_main_process:
+                    rotate_checkpoints(
+                        training_args.save_total_limit, output_dir=training_args.output_dir, logger=logger
+                    )
+
+                    if cur_step == total_train_steps:
+                        # un-wrap student model for save
+                        unwrapped_model = accelerator.unwrap_model(model)
+                        unwrapped_model.save_pretrained(training_args.output_dir)
+
+                    if training_args.push_to_hub:
+                        api.upload_folder(
+                            repo_id=repo_id,
+                            folder_path=training_args.output_dir,
+                            commit_message=f"Saving train state of step {cur_step}",
+                            run_as_future=True,
+                        )
+                accelerator.wait_for_everyone()
+
+            if training_args.do_eval and (cur_step % eval_steps == 0 or cur_step == total_train_steps):
+                train_time += time.time() - train_start
+                # ======================== Evaluating ==============================
+                model.eval()
+                eval_metrics = []
+                eval_preds = []
+                eval_descriptions = []
+                eval_prompts = []
+                eval_start = time.time()
+
+                # release training input batch
+                batch = release_memory(batch)
+
+                validation_dataloader = DataLoader(
+                    vectorized_datasets["eval"],
+                    collate_fn=data_collator,
+                    batch_size=per_device_eval_batch_size,
+                    drop_last=False,
+                    num_workers=training_args.eval_dataloader_num_workers,
+                    pin_memory=training_args.dataloader_pin_memory,
+                )
+                validation_dataloader = accelerator.prepare(validation_dataloader)
+
+                for batch in tqdm(
+                    validation_dataloader,
+                    desc=f"Evaluating - Inference ...",
+                    position=2,
+                    disable=not accelerator.is_local_main_process,
+                ):
+                    # Model forward
+                    eval_metric = eval_step(batch, accelerator, autocast_kwargs)
+                    eval_metric = accelerator.gather_for_metrics(eval_metric)
+                    eval_metric = {key: val.unsqueeze(0) if val.ndim == 0 else val for (key,val) in eval_metric.items()}
+                    eval_metrics.append(eval_metric)
+
+                if training_args.predict_with_generate and (cur_step % eval_generation_steps == 0 or cur_step == total_train_steps):
+                    validation_dataloader = DataLoader(
+                        vectorized_datasets["eval"],
+                        collate_fn=data_collator,
+                        batch_size=per_device_eval_batch_size,
+                        drop_last=False,
+                        num_workers=training_args.eval_dataloader_num_workers,
+                        pin_memory=training_args.dataloader_pin_memory,
+                    )
+                    validation_dataloader = accelerator.prepare(validation_dataloader)
+                    # generation
+                    for batch in tqdm(
+                        validation_dataloader,
+                        desc=f"Evaluating - Generation ...",
+                        position=2,
+                        disable=not accelerator.is_local_main_process,
+                    ):
+                        generated_audios = generate_step(batch, accelerator)
+                        # Gather all predictions and targets
+                        generated_audios, input_ids, prompts = accelerator.pad_across_processes(
+                            (generated_audios, batch["input_ids"], batch["prompt_input_ids"]), dim=1, pad_index=0
+                        )
+                        generated_audios, input_ids, prompts = accelerator.gather_for_metrics(
+                            (generated_audios, input_ids, prompts)
+                        )
+                        eval_preds.extend(generated_audios.to("cpu"))
+                        eval_descriptions.extend(input_ids.to("cpu"))
+                        eval_prompts.extend(prompts.to("cpu"))
+
+                eval_time = time.time() - eval_start
+                # normalize eval metrics
+                eval_metrics = {
+                    key: torch.mean(torch.cat([d[key] for d in eval_metrics])).to("cpu") for key in eval_metrics[0]
+                }
+
+                # compute metrics
+                metrics_desc = ""
+                if training_args.predict_with_generate and (cur_step % eval_generation_steps == 0 or cur_step == total_train_steps):
+                    if accelerator.is_local_main_process:
+                        (
+                            metric_values,
+                            pred_descriptions,
+                            pred_prompts,
+                            audios,
+                            transcriptions,
+                            si_sdr_measures,
+                        ) = compute_metrics(
+                            eval_preds,
+                            eval_descriptions,
+                            eval_prompts,
+                            accelerator.device,
+                            training_args.compute_clap_similarity_metric,
+                            training_args.compute_noise_level_metric,
+                            training_args.noise_level_to_compute_clean_wer,
+                        )
+                        eval_metrics.update(metric_values)
+                        metrics_desc = " ".join([f"Eval {key}: {value} |" for key, value in metric_values.items()])
+                        if "wandb" in training_args.report_to:
+                            log_pred(
+                                accelerator,
+                                pred_descriptions,
+                                pred_prompts,
+                                transcriptions,
+                                audios,
+                                si_sdr_measures,
+                                sampling_rate=sampling_rate,
+                                step=cur_step,
+                                prefix="eval",
+                            )
+                    accelerator.wait_for_everyone()
+
+                # Print metrics and update progress bar
+                if accelerator.is_local_main_process:
+                    steps_trained_progress_bar.write(
+                        f"Eval results for step ({cur_step} / {total_train_steps} | Eval Loss: {eval_metrics['loss']} |"
+                        f" {metrics_desc})"
+                    )
+
+                log_metric(
+                    accelerator,
+                    metrics=eval_metrics,
+                    train_time=eval_time,
+                    step=cur_step,
+                    epoch=epoch,
+                    prefix="eval",
+                )
+
+                # release eval batch and relax metrics
+                eval_metrics, eval_preds, eval_descriptions, eval_prompts, batch, eval_metric = release_memory(
+                    eval_metrics, eval_preds, eval_descriptions, eval_prompts, batch, eval_metric
+                )
+                if training_args.predict_with_generate and (cur_step % eval_generation_steps == 0 or cur_step == total_train_steps):
+                    generated_audios, input_ids, prompts = release_memory(generated_audios, input_ids, prompts)
+
+                # train mode
+                model.train()
+
+                # flush the train metrics
+                train_start = time.time()
+
+            # break condition
+            if cur_step == total_train_steps:
+                continue_training = False
+                break
+
+        if not continue_training:
+            break
+
+    accelerator.end_training()
+
+
+# Run the training entry point only when this file is executed as a script (not on import).
+if __name__ == "__main__":
+    main()
diff --git a/capspeech/ar/training/run_parler_tts_training.py b/capspeech/ar/training/run_parler_tts_training.py
new file mode 100644
index 0000000000000000000000000000000000000000..36e3b6b286f03f4a803105e2d3db0afbf69c2f21
--- /dev/null
+++ b/capspeech/ar/training/run_parler_tts_training.py
@@ -0,0 +1,1279 @@
+#!/usr/bin/env python
+# coding=utf-8
+# Copyright 2024 The HuggingFace Inc. team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+""" Train Parler-TTS using 🤗 Accelerate"""
+
+import logging
+import os
+import re
+import sys
+import time
+import math
+import contextlib
+from multiprocess import set_start_method
+from datetime import timedelta
+import inspect
+from tqdm import tqdm
+from pathlib import Path
+import wandb
+
+import torch
+from torch.utils.data import DataLoader
+
+import datasets
+from datasets import DatasetDict, Dataset, IterableDataset, concatenate_datasets
+
+from huggingface_hub import HfApi
+
+import transformers
+from transformers import AutoFeatureExtractor, AutoTokenizer, HfArgumentParser
+from transformers.trainer_pt_utils import LengthGroupedSampler
+from transformers.optimization import get_scheduler
+from transformers.utils import send_example_telemetry
+
+
+from accelerate import Accelerator, skip_first_batches
+from accelerate.utils import set_seed, AutocastKwargs, InitProcessGroupKwargs, TorchDynamoPlugin, DistributedDataParallelKwargs
+from accelerate.utils.memory import release_memory
+
+from parler_tts import (
+    ParlerTTSConfig,
+    ParlerTTSForConditionalGeneration,
+    build_delay_pattern_mask,
+)
+
+from training.utils import (
+    get_last_checkpoint,
+    rotate_checkpoints,
+    log_pred,
+    log_metric,
+    load_all_codec_checkpoints,
+    save_codec_checkpoint,
+    get_last_codec_checkpoint_step,
+)
+from training.arguments import ModelArguments, DataTrainingArguments, ParlerTTSTrainingArguments
+from training.data import load_multiple_datasets, DataCollatorParlerTTSWithPadding, DataCollatorEncodecWithPadding
+from training.eval import clap_similarity, wer, si_sdr
+
+# Module-level logger; its level is set per process inside main().
+logger = logging.getLogger(__name__)
+
+
+def main():
+    # See all possible arguments in src/transformers/training_args.py
+    # or by passing the --help flag to this script.
+    # We now keep distinct sets of args, for a cleaner separation of concerns.
+
+    parser = HfArgumentParser((ModelArguments, DataTrainingArguments, ParlerTTSTrainingArguments))
+    if len(sys.argv) == 2 and sys.argv[1].endswith(".json"):
+        # If we pass only one argument to the script and it's the path to a json file,
+        # let's parse it to get our arguments.
+        model_args, data_args, training_args = parser.parse_json_file(json_file=os.path.abspath(sys.argv[1]))
+    else:
+        model_args, data_args, training_args = parser.parse_args_into_dataclasses()
+
+    # Sending telemetry. Tracking the example usage helps us better allocate resources to maintain them. The
+    # information sent is the one passed as arguments along with your Python/PyTorch versions.
+    send_example_telemetry("run_parler_tts", model_args, data_args)
+    
+    if data_args.wandb_key is not None:
+        wandb.login(key=data_args.wandb_key)
+
+    if training_args.dtype == "float16":
+        mixed_precision = "fp16"
+        torch_dtype = torch.float16
+    elif training_args.dtype == "bfloat16":
+        mixed_precision = "bf16"
+        torch_dtype = torch.bfloat16
+    else:
+        mixed_precision = "no"
+        torch_dtype = torch.float32
+
+    if data_args.pad_to_max_length and (
+        data_args.max_duration_in_seconds is None
+        or data_args.max_prompt_token_length is None
+        or data_args.max_description_token_length is None
+    ):
+        raise ValueError(
+            "`pad_to_max_length` is `True` but one of the following parameters has not been set: `max_duration_in_seconds`, `max_prompt_token_length`, `max_description_token_length`"
+        )
+
+    padding = "max_length" if data_args.pad_to_max_length else "longest"
+
+    ####### A. Preparation
+    kwargs_handlers = [InitProcessGroupKwargs(timeout=timedelta(minutes=120)), DistributedDataParallelKwargs(find_unused_parameters=False)]
+
+    accelerator = Accelerator(
+        gradient_accumulation_steps=training_args.gradient_accumulation_steps,
+        mixed_precision=mixed_precision,
+        log_with=training_args.report_to,
+        project_dir=training_args.output_dir,
+        kwargs_handlers=kwargs_handlers,
+    )
+
+    accelerator.init_trackers(
+        project_name=data_args.wandb_project,
+        config={
+            "learning_rate": training_args.learning_rate,
+            "model_name_or_path": model_args.model_name_or_path,
+            "num_train_epochs": training_args.num_train_epochs,
+            "gradient_accumulation_steps": training_args.gradient_accumulation_steps,
+            "per_device_train_batch_size": training_args.per_device_train_batch_size,
+            "global_batch_size": training_args.per_device_train_batch_size * accelerator.num_processes,
+            "mixed_precision": mixed_precision,
+            "lr_scheduler_type": training_args.lr_scheduler_type,
+            "warmup_steps": training_args.warmup_steps,
+            "freeze_text_encoder": model_args.freeze_text_encoder,
+            "max_duration_in_seconds": data_args.max_duration_in_seconds,
+            "weight_decay": training_args.weight_decay,
+            "adam_beta1": training_args.adam_beta1,
+            "adam_beta2": training_args.adam_beta2,
+            "temperature": model_args.temperature,
+        },
+        init_kwargs={"wandb": {"name": data_args.wandb_run_name}} if data_args.wandb_run_name else {},
+    )
+
+    # Detecting last checkpoint and eventually continue from last checkpoint
+    last_checkpoint = None
+    if os.path.isdir(training_args.output_dir) and training_args.do_train and not training_args.overwrite_output_dir:
+        last_checkpoint = get_last_checkpoint(training_args.output_dir)
+        if last_checkpoint is None and len(os.listdir(training_args.output_dir)) > 0:
+            raise ValueError(
+                f"Output directory ({training_args.output_dir}) already exists and is not empty. "
+                "Use --overwrite_output_dir to overcome."
+            )
+        elif last_checkpoint is not None and training_args.resume_from_checkpoint is None:
+            logger.info(
+                f"Checkpoint detected, resuming training at {last_checkpoint}. To avoid this behavior, change "
+                "the `--output_dir` or add `--overwrite_output_dir` to train from scratch."
+            )
+
+    # Setup logging
+    logging.basicConfig(
+        format="%(asctime)s - %(levelname)s - %(name)s - %(message)s",
+        datefmt="%m/%d/%Y %H:%M:%S",
+        handlers=[logging.StreamHandler(sys.stdout)],
+    )
+    logger.setLevel(logging.INFO if accelerator.is_main_process else logging.WARN)
+
+    # Log a small summary on each process
+    logger.warning(
+        f"Process rank: {training_args.local_rank}, device: {training_args.device}, n_gpu: {training_args.n_gpu}, "
+        f"distributed training: {training_args.parallel_mode.value == 'distributed'}, 16-bits training: {training_args.fp16}"
+    )
+
+    # Set the verbosity to info of the Transformers logger (on main process only)
+    if accelerator.is_local_main_process:
+        datasets.utils.logging.set_verbosity_warning()
+        transformers.utils.logging.set_verbosity_info()
+    else:
+        datasets.utils.logging.set_verbosity_error()
+        transformers.utils.logging.set_verbosity_error()
+
+    logger.info("Training/evaluation parameters %s", training_args)
+
+    # Set seed before initializing model.
+    set_seed(training_args.seed)
+    num_workers = data_args.preprocessing_num_workers
+
+    # 1. First, let's instantiate the feature extractor, tokenizers and model
+    # Note for distributed training, the .from_pretrained methods guarantee that only
+    # one local process can concurrently download model & vocab.
+
+    # load feature extractor
+    feature_extractor = AutoFeatureExtractor.from_pretrained(
+        model_args.feature_extractor_name or model_args.model_name_or_path,
+        cache_dir=model_args.cache_dir,
+        token=data_args.token,
+        trust_remote_code=data_args.trust_remote_code,
+    ) 
+    sampling_rate = feature_extractor.sampling_rate
+
+    # load prompt tokenizer
+    prompt_tokenizer = AutoTokenizer.from_pretrained(
+        model_args.prompt_tokenizer_name or model_args.description_tokenizer_name or model_args.model_name_or_path,
+        cache_dir=model_args.cache_dir,
+        token=data_args.token,
+        trust_remote_code=data_args.trust_remote_code,
+        use_fast=model_args.use_fast_tokenizer,
+        padding_side=model_args.prompt_padding_side,
+    )
+
+    # load description tokenizer
+    description_tokenizer = AutoTokenizer.from_pretrained(
+        model_args.description_tokenizer_name or model_args.model_name_or_path,
+        cache_dir=model_args.cache_dir,
+        token=data_args.token,
+        trust_remote_code=data_args.trust_remote_code,
+        use_fast=model_args.use_fast_tokenizer,
+    )
+
+    if model_args.use_fast_tokenizer:
+        logger.warning(
+            "Disabling fast tokenizer warning: https://github.com/huggingface/transformers/blob/main/src/transformers/tokenization_utils_base.py#L3231-L3235"
+        )
+        prompt_tokenizer.deprecation_warnings["Asking-to-pad-a-fast-tokenizer"] = True
+        description_tokenizer.deprecation_warnings["Asking-to-pad-a-fast-tokenizer"] = True
+
+    # 2. Now, let's load the dataset
+
+    if data_args.save_to_disk is not None:
+        os.makedirs(data_args.save_to_disk, exist_ok=True)
+
+    # assume that the dataset has been saved to `save_to_disk` if the latter is not empty
+    dataset_was_precomputed = len(os.listdir(data_args.save_to_disk)) > 0
+    if dataset_was_precomputed:
+        with accelerator.local_main_process_first():
+            vectorized_datasets = datasets.load_from_disk(data_args.save_to_disk)
+    else:
+        raw_datasets = DatasetDict()
+
+        columns_to_keep = {
+            "target_audio_column_name": data_args.target_audio_column_name,
+            "prompt_column_name": data_args.prompt_column_name,
+            "source": data_args.source_column_name,
+        }
+        if data_args.description_column_name is not None:
+            columns_to_keep["description_column_name"] = data_args.description_column_name
+
+        if training_args.do_train:
+            raw_datasets["train"] = load_multiple_datasets(
+                accelerator,
+                data_args.train_dataset_name,
+                splits=data_args.train_split_name,
+                dataset_samples=data_args.train_dataset_samples,
+                seed=training_args.seed,
+                cache_dir=model_args.cache_dir,
+                num_proc=data_args.preprocessing_num_workers,
+                id_column_name=data_args.id_column_name,
+                columns_to_keep=columns_to_keep.values(),
+                prompt_column_name=data_args.prompt_column_name,
+                audio_column_name=data_args.target_audio_column_name,
+                sampling_rate=sampling_rate,
+                logger=logger,
+                mls_dir=data_args.mls_dir,
+                librittsrmix_dir=data_args.librittsrmix_dir,
+                gigaspeech_dir=data_args.gigaspeech_dir,
+                commonvoice_dir=data_args.commonvoice_dir,
+                emilia_dir=data_args.emilia_dir,
+                # streaming=data_args.streaming, TODO(SG): optionally enable streaming mode
+            )
+
+            for key in columns_to_keep:
+                if columns_to_keep[key] not in raw_datasets["train"].column_names:
+                    raise ValueError(
+                        f"--{key} '{columns_to_keep[key]}' not found in dataset '{data_args.train_dataset_name}'."
+                        f" Make sure to set `--{key}` to the correct audio column - one of"
+                        f" {', '.join(raw_datasets['train'].column_names)}."
+                    )
+
+            if data_args.max_train_samples is not None:
+                raw_datasets["train"] = raw_datasets["train"].select(range(data_args.max_train_samples))
+
+        if training_args.do_eval:
+            raw_datasets["eval"] = load_multiple_datasets(
+                accelerator,
+                data_args.eval_dataset_name if data_args.eval_dataset_name else data_args.train_dataset_name,
+                splits=data_args.eval_split_name,
+                cache_dir=model_args.cache_dir,
+                num_proc=data_args.preprocessing_num_workers,
+                id_column_name=data_args.id_column_name,
+                columns_to_keep=columns_to_keep.values(),
+                prompt_column_name=data_args.prompt_column_name,
+                audio_column_name=data_args.target_audio_column_name,
+                sampling_rate=sampling_rate,
+                logger=logger,
+                mls_dir=data_args.mls_dir,
+                librittsrmix_dir=data_args.librittsrmix_dir,
+                gigaspeech_dir=data_args.gigaspeech_dir,
+                commonvoice_dir=data_args.commonvoice_dir,
+                emilia_dir=data_args.emilia_dir
+                # streaming=data_args.streaming, TODO(SG): optionally enable streaming mode
+            )
+
+            if data_args.max_eval_samples is not None:
+                with accelerator.local_main_process_first():
+                    raw_datasets["eval"] = (
+                        raw_datasets["eval"].shuffle(seed=training_args.seed).select(range(data_args.max_eval_samples))
+                    )
+
+    # 3. Next, let's load the config.
+    config = ParlerTTSConfig.from_pretrained(
+        model_args.model_name_or_path,
+        cache_dir=model_args.cache_dir,
+        token=data_args.token,
+        trust_remote_code=data_args.trust_remote_code,
+    )
+ 
+    if training_args.codebook_weights is not None and len(training_args.codebook_weights) != config.decoder.num_codebooks:
+        raise ValueError(f"`codebook_weights` has length {len(training_args.codebook_weights)} when it should be of length {config.decoder.num_codebooks}.")
+
+    # update pad token id and decoder_start_token_id
+    config.decoder.update(
+        {
+            "cross_attention_implementation_strategy": model_args.cross_attention_implementation_strategy
+            if model_args.cross_attention_implementation_strategy is not None
+            else None,
+            "codebook_weights": training_args.codebook_weights if training_args.codebook_weights is not None else config.decoder.codebook_weights
+        }
+    )
+    config.update(
+        {
+            "pad_token_id": model_args.pad_token_id if model_args.pad_token_id is not None else config.pad_token_id,
+            "decoder_start_token_id": model_args.decoder_start_token_id
+            if model_args.decoder_start_token_id is not None
+            else config.decoder_start_token_id,
+        }
+    )
+
+    with open("events.txt", "r") as f:
+        events = [line.strip() for line in f]
+    events = ["<"+event.lower().replace(" ", "_")+">" for event in events]
+    events.append("<B_start>")
+    events.append("<B_end>")
+    events.append("<I_start>")
+    events.append("<I_end>")
+
+    special_tokens = {"additional_special_tokens": events}
+    prompt_tokenizer.add_special_tokens(special_tokens)
+    description_tokenizer.add_special_tokens(special_tokens)
+    padded_vocab_size = ((len(prompt_tokenizer) + 127) // 128) * 128 
+    config.vocab_size = padded_vocab_size 
+
+    # create model
+    model = ParlerTTSForConditionalGeneration.from_pretrained(
+        model_args.model_name_or_path,
+        ignore_mismatched_sizes=True,
+        cache_dir=model_args.cache_dir,
+        config=config,
+        token=data_args.token,
+        trust_remote_code=data_args.trust_remote_code,
+        attn_implementation={"decoder": model_args.attn_implementation, "text_encoder": "eager"},
+    )
+    model.text_encoder.resize_token_embeddings(padded_vocab_size)
+
+    # enable gradient checkpointing if necessary
+    if training_args.gradient_checkpointing:
+        model.gradient_checkpointing_enable()
+
+    # 4. Now we preprocess the datasets including loading the audio, resampling and normalization
+    # Thankfully, `datasets` takes care of automatically loading and resampling the audio,
+    # so that we just need to set the correct target sampling rate and normalize the input
+    # via the `feature_extractor`
+
+    # derive max & min input length for sample rate & max duration
+    sampling_rate = feature_extractor.sampling_rate
+    max_target_length = int(data_args.max_duration_in_seconds * sampling_rate)
+    min_target_length = int(data_args.min_duration_in_seconds * sampling_rate)
+    target_audio_column_name = data_args.target_audio_column_name
+    description_column_name = data_args.description_column_name
+    prompt_column_name = data_args.prompt_column_name
+    feature_extractor_input_name = feature_extractor.model_input_names[0]
+    audio_encoder_pad_token_id = config.decoder.pad_token_id
+    audio_encoder_eos_token_id = config.decoder.eos_token_id
+    audio_encoder_bos_token_id = model.generation_config.decoder_start_token_id
+    max_length = model.generation_config.max_length
+    num_codebooks = model.decoder.config.num_codebooks
+    bandwidth = model_args.bandwidth
+    attn_implementation = model_args.attn_implementation
+
+    # Freeze Encoders
+    model.freeze_encoders(model_args.freeze_text_encoder)
+
+    # Test all-gather - used for warmup and to avoid a timeout
+    logger.debug(str(accelerator.process_index), main_process_only=False, in_order=True)
+    test_tensor = torch.tensor([accelerator.process_index], device=accelerator.device)
+    gathered_tensor = accelerator.gather(test_tensor)
+    print("gathered_tensor", gathered_tensor)
+    accelerator.wait_for_everyone()
+
+    if not dataset_was_precomputed:
+        # Filter on text length
+        if description_column_name is not None and data_args.max_text_length is not None:
+            with accelerator.local_main_process_first():
+                # filter description that is shorter than max_text_length
+                raw_datasets = raw_datasets.filter(
+                    lambda x: len(x) < data_args.max_text_length,
+                    num_proc=num_workers,
+                    input_columns=[description_column_name],
+                )
+
+        # Preprocessing the dataset.
+        # We need to tokenize the texts.
+        def pass_through_processors(description, prompt):
+            """Tokenize one example's description and prompt.
+
+            Both strings are stripped of surrounding whitespace before being passed to
+            `description_tokenizer` and `prompt_tokenizer` respectively.
+
+            Returns:
+                dict with "input_ids" (description token ids) and "prompt_input_ids" (prompt token ids).
+            """
+            description_ids = description_tokenizer(description.strip())["input_ids"]
+            prompt_ids = prompt_tokenizer(prompt.strip())["input_ids"]
+            return {"input_ids": description_ids, "prompt_input_ids": prompt_ids}
+
+        with accelerator.local_main_process_first():
+            # this is a trick to avoid to rewrite the entire audio column which takes ages
+            vectorized_datasets = raw_datasets.map(
+                pass_through_processors,
+                remove_columns=next(iter(raw_datasets.values())).column_names,
+                input_columns=[description_column_name, prompt_column_name],
+                num_proc=num_workers,
+                desc="preprocess datasets",
+            )
+
+        # We use Accelerate to perform distributed inference
+        # T5 doesn't support fp16
+        autocast_kwargs = AutocastKwargs(enabled=(mixed_precision != "fp16"))
+
+        # Now we encode the audio labels with encodec.
+        ####### B. Encode audio
+
+        logger.info("*** Encode target audio with encodec ***")
+
+        # no need to prepare audio_decoder because used for inference without mixed precision
+        # see: https://huggingface.co/docs/accelerate/main/en/package_reference/accelerator#accelerate.Accelerator.prepare
+        if training_args.torch_compile:
+            audio_decoder = accelerator.prepare_model(model.audio_encoder, evaluation_mode=True)
+        else:
+            audio_decoder = model.audio_encoder
+
+        encoder_data_collator = DataCollatorEncodecWithPadding(
+            feature_extractor,
+            audio_column_name=target_audio_column_name,
+            mls_dir=data_args.mls_dir,
+            librittsrmix_dir=data_args.librittsrmix_dir,
+            gigaspeech_dir=data_args.gigaspeech_dir,
+            commonvoice_dir=data_args.commonvoice_dir,
+            emilia_dir=data_args.emilia_dir,
+            feature_extractor_input_name=feature_extractor_input_name,
+            max_length=max_target_length,
+            padding=padding,
+        )
+        encoder_signature = set(inspect.signature(audio_decoder.forward).parameters)
+
+        def apply_audio_decoder(batch):
+            """Encode one collated audio batch into codec tokens with `audio_decoder`.
+
+            Args:
+                batch: mapping yielded by the data loader built on `encoder_data_collator`. It must
+                    contain "len_audio" and "input_values". It is mutated in place: "len_audio" is
+                    popped and a bandwidth / quantizer-count entry may be added before everything
+                    left is forwarded to `audio_decoder.encode`.
+
+            Returns:
+                dict with "len_audio" (as received), "labels" (the audio codes, squeezed and
+                transposed) and "ratio" (code length divided by the reference audio length).
+            """
+            len_audio = batch.pop("len_audio")
+            # move the codec to wherever the inputs live and switch it to eval mode
+            audio_decoder.to(batch["input_values"].device).eval()
+            # an explicit bandwidth takes priority; otherwise pass the number of codebooks under
+            # whichever keyword name appears in `encoder_signature`
+            # NOTE(review): `encoder_signature` is built from `audio_decoder.forward`, while the kwargs are
+            # passed to `audio_decoder.encode` - presumably both accept the same names; confirm.
+            if bandwidth is not None:
+                batch["bandwidth"] = bandwidth
+            elif "num_quantizers" in encoder_signature:
+                batch["num_quantizers"] = num_codebooks
+            elif "num_codebooks" in encoder_signature:
+                batch["num_codebooks"] = num_codebooks
+            elif "n_quantizers" in encoder_signature:
+                batch["n_quantizers"] = num_codebooks
+
+            # pure inference: no gradients are needed to extract the codes
+            with torch.no_grad():
+                labels = audio_decoder.encode(**batch)["audio_codes"]
+            output = {}
+            output["len_audio"] = len_audio
+            # (1, bsz, codebooks, seq_len) -> (bsz, seq_len, codebooks)
+            output["labels"] = labels.squeeze(0).transpose(1, 2)
+
+            # if `pad_to_max_length`, the maximum corresponding audio length of the current batch is max_duration*sampling_rate
+            # NOTE: this local `max_length` is the reference audio length for the ratio below
+            max_length = len_audio.max() if padding != "max_length" else max_target_length
+            # codes-per-audio-sample ratio, repeated for every item; the caller multiplies it by each
+            # item's audio length to trim padded codes
+            output["ratio"] = torch.ones_like(len_audio) * labels.shape[-1] / max_length
+            return output
+
+        # (1, codebooks, seq_len) where seq_len=1
+        bos_labels = torch.ones((1, num_codebooks, 1)) * audio_encoder_bos_token_id
+
+        def postprocess_dataset(labels):
+            """Prepend BOS to one example's codec labels and apply the delay pattern.
+
+            Args:
+                labels: codec labels of a single example, convertible with `torch.tensor`
+                    (assumed to be laid out as (codebooks, seq_len) - TODO confirm against the caller).
+
+            Returns:
+                dict with a single "labels" entry: the delayed labels with the first position
+                along the second axis dropped.
+            """
+            # (1, codebooks, seq_len)
+            labels = torch.tensor(labels).unsqueeze(0)
+            # add bos
+            labels = torch.cat([bos_labels, labels], dim=-1)
+
+            # NOTE: the EOS id is passed as `pad_token_id` here; the extra `num_codebooks` positions in
+            # `max_length` leave room for the per-codebook delay
+            labels, delay_pattern_mask = build_delay_pattern_mask(
+                labels,
+                bos_token_id=audio_encoder_bos_token_id,
+                pad_token_id=audio_encoder_eos_token_id,
+                max_length=labels.shape[-1] + num_codebooks,
+                num_codebooks=num_codebooks,
+            )
+
+            # the first ids of the delay pattern mask are precisely labels, we use the rest of the labels mask
+            # to take care of EOS
+            # we want labels to look like this:
+            #  - [B, a, b, E, E, E, E]
+            #  - [B, B, c, d, E, E, E]
+            #  - [B, B, B, e, f, E, E]
+            #  - [B, B, B, B, g, h, E]
+            # positions still marked -1 in the mask are filled with EOS
+            labels = torch.where(delay_pattern_mask == -1, audio_encoder_eos_token_id, delay_pattern_mask)
+
+            # the first timestamp is associated to a row full of BOS, let's get rid of it
+            # NOTE(review): the original comment also mentioned removing the last timestamps (full of PAD),
+            # but the slice below only drops the first position - confirm which is intended
+            output = {"labels": labels[:, 1:]}
+            return output
+
+        for split in vectorized_datasets:
+            data_loader = DataLoader(
+                raw_datasets[split],
+                batch_size=training_args.audio_encoder_per_device_batch_size,
+                collate_fn=encoder_data_collator,
+                num_workers=training_args.dataloader_num_workers,
+                pin_memory=True,
+            )
+            data_loader = accelerator.prepare(data_loader)
+            total_inference_steps = len(data_loader)
+
+            start_step = get_last_codec_checkpoint_step(os.path.join(data_args.temporary_save_to_disk, split))
+            accelerator.wait_for_everyone()
+            if start_step > 0:
+                logger.info(f"Resuming {split} from step {start_step}")
+                # efficiently skip the first n batches
+                start_step += 1
+                data_loader = skip_first_batches(data_loader, start_step)
+
+            all_generated_labels = []
+            all_lens = []
+            if start_step < total_inference_steps:
+                for i, batch in enumerate(tqdm(data_loader, disable=not accelerator.is_local_main_process)):
+                    cur_step = start_step + i
+                    generate_labels = apply_audio_decoder(batch)
+                    generate_labels = accelerator.pad_across_processes(generate_labels, dim=1, pad_index=0)
+                    generate_labels = accelerator.gather_for_metrics(generate_labels)
+
+                    if accelerator.is_main_process:
+                        lab = generate_labels["labels"].cpu().transpose(1, 2).to(torch.int16)
+                        rat = generate_labels["ratio"].cpu().squeeze(1)
+                        lens = generate_labels["len_audio"].cpu().squeeze(1)
+                        lab = [l[:, : int(ratio * length)] for (l, ratio, length) in zip(lab, rat, lens)]
+
+                        all_generated_labels.extend(lab)
+                        all_lens.extend(lens)
+
+                        if ((cur_step + 1) % data_args.save_codec_steps == 0) or (
+                            cur_step == total_inference_steps - 1
+                        ):
+                            tmp_labels = Dataset.from_dict({"labels": all_generated_labels, "target_length": all_lens})
+                            tmp_labels = tmp_labels.map(
+                                postprocess_dataset,
+                                num_proc=data_args.preprocessing_num_workers,  # this one is resource consuming if many processor.
+                                input_columns=["labels"],
+                                desc="Postprocessing labeling",
+                            )
+                            save_codec_checkpoint(
+                                os.path.join(data_args.temporary_save_to_disk, split), tmp_labels, cur_step
+                            )
+                            all_generated_labels = []
+                            all_lens = []
+
+                accelerator.wait_for_everyone()
+
+            if accelerator.is_main_process and len(all_generated_labels) > 0:
+                tmp_labels = Dataset.from_dict({"labels": all_generated_labels, "target_length": all_lens})
+                tmp_labels = tmp_labels.map(
+                    postprocess_dataset,
+                    num_proc=data_args.preprocessing_num_workers,  # this one is resource consuming if many processor.
+                    input_columns=["labels"],
+                    desc="Postprocessing labeling",
+                )
+                save_codec_checkpoint(os.path.join(data_args.temporary_save_to_disk, split), tmp_labels, cur_step)
+                all_generated_labels = []
+                all_lens = []
+            accelerator.wait_for_everyone()
+
+            del all_generated_labels
+            accelerator.wait_for_everyone()
+
+            with accelerator.local_main_process_first():
+                tmp_labels = load_all_codec_checkpoints(os.path.join(data_args.temporary_save_to_disk, split)).select(
+                    range(len(vectorized_datasets[split]))
+                )
+                logger.info(f"Concatenating {split}: {tmp_labels} with {vectorized_datasets[split]}")
+                vectorized_datasets[split] = concatenate_datasets([vectorized_datasets[split], tmp_labels], axis=1)
+
+        accelerator.free_memory()
+        del generate_labels, all_lens
+
+        with accelerator.local_main_process_first():
+            # NOTE: filtering is done at the end because in the `datasets` library, caching audio files is done after most operations
+            # caching audio files is time and disk-space consuming, so we want to avoid it at all costs, especially for large (>1Kh) audio datasets.
+            # That's also why we avoid to concat the processed datasets (vectorized_datasets) with the audio column present in raw_datasets.
+
+            def is_audio_in_length_range(length):
+                """Return True when `length` lies strictly between the min and max target lengths."""
+                return min_target_length < length < max_target_length
+
+            # keep only samples whose audio length lies strictly between min_target_length and max_target_length
+            vectorized_datasets = vectorized_datasets.filter(
+                is_audio_in_length_range,
+                num_proc=num_workers,
+                input_columns=["target_length"],
+            )
+
+        if description_column_name is not None and data_args.max_description_token_length is not None:
+            with accelerator.local_main_process_first():
+                # keep only examples whose tokenized description has fewer than max_description_token_length tokens
+                vectorized_datasets = vectorized_datasets.filter(
+                    lambda x: len(x) < data_args.max_description_token_length,
+                    num_proc=num_workers,
+                    input_columns=["input_ids"],
+                )
+
+        if data_args.max_prompt_token_length is not None:
+            with accelerator.local_main_process_first():
+                # keep only examples whose tokenized prompt has fewer than max_prompt_token_length tokens
+                vectorized_datasets = vectorized_datasets.filter(
+                    lambda x: len(x) < data_args.max_prompt_token_length,
+                    num_proc=num_workers,
+                    input_columns=["prompt_input_ids"],
+                )
+
+    if data_args.save_to_disk is not None and not dataset_was_precomputed:
+        if accelerator.is_main_process:
+            vectorized_datasets.save_to_disk(
+                data_args.save_to_disk,
+                num_proc=min(data_args.preprocessing_num_workers, len(vectorized_datasets["eval"]) - 1),
+            )
+        accelerator.wait_for_everyone()
+        logger.info(f"Dataset saved at {data_args.save_to_disk}")
+
+    audio_max_length = None
+    if padding == "max_length":
+        audio_max_length = max(vectorized_datasets["train"]["target_length"])
+        with accelerator.local_main_process_first():
+            max_sample = vectorized_datasets["train"].filter(
+                lambda x: x == audio_max_length,
+                num_proc=num_workers,
+                input_columns=["target_length"],
+            )
+        audio_max_length = max([len(l[0]) for l in max_sample["labels"]])
+
+    if description_column_name is not None and data_args.max_description_token_length is not None:
+        with accelerator.local_main_process_first():
+            # keep only examples whose tokenized description has fewer than max_description_token_length tokens
+            vectorized_datasets = vectorized_datasets.filter(
+                lambda x: len(x) < data_args.max_description_token_length,
+                num_proc=num_workers,
+                input_columns=["input_ids"],
+            )
+
+    if data_args.max_prompt_token_length is not None:
+        with accelerator.local_main_process_first():
+            # keep only examples whose tokenized prompt has fewer than max_prompt_token_length tokens
+            vectorized_datasets = vectorized_datasets.filter(
+                lambda x: len(x) < data_args.max_prompt_token_length,
+                num_proc=num_workers,
+                input_columns=["prompt_input_ids"],
+            )
+
+    if training_args.group_by_length:
+        # apply a simple heuristic to take into account audio and text lengths
+        def add_target_lengths(target_length, prompt, description):
+            """Fold the prompt and description token counts into the audio length.
+
+            Returns:
+                dict overriding "target_length" with the sum of the audio length and both text lengths.
+            """
+            combined_length = target_length + len(prompt) + len(description)
+            return {"target_length": combined_length}
+
+        with accelerator.local_main_process_first():
+            vectorized_datasets = vectorized_datasets.map(
+                add_target_lengths,
+                num_proc=num_workers,
+                input_columns=["target_length", "prompt_input_ids", "input_ids"],
+            )
+
+    # for large datasets it is advised to run the preprocessing on a
+    # single machine first with ``args.preprocessing_only`` since there will mostly likely
+    # be a timeout when running the script in distributed mode.
+    # In a second step ``args.preprocessing_only`` can then be set to `False` to load the
+    # cached dataset
+    if data_args.preprocessing_only and data_args.save_to_disk is None:
+        raise ValueError(
+            "`preprocessing_only=True` but `save_to_disk` is not set. The latter should indicates where to save the dataset locally."
+        )
+    elif data_args.preprocessing_only:
+        logger.info(f"Data preprocessing finished. Files save at {data_args.save_to_disk}")
+        return
+
+    # 6. Next, we can prepare the training.
+
+    # Let's use CLAP similarity and WER metrics as our evaluation metrics,
+    def compute_metrics(
+        audios,
+        descriptions,
+        prompts,
+        device="cpu",
+        compute_clap_similarity_metric=False,
+        compute_noise_level_metric=False,
+        noise_level_to_compute_clean_wer=None,
+    ):
+        """Score generated audio against its text conditioning.
+
+        Args:
+            audios: iterable of generated waveforms (tensors); converted to float numpy arrays on CPU.
+            descriptions: description token ids, decoded with `description_tokenizer`.
+            prompts: prompt token ids, decoded with `prompt_tokenizer`.
+            device: device forwarded to the metric helpers.
+            compute_clap_similarity_metric: when True, add a "clap" entry computed by `clap_similarity`.
+            compute_noise_level_metric: when True, compute SI-SDR measures and forward them to `wer`.
+            noise_level_to_compute_clean_wer: forwarded to `wer`.
+
+        Returns:
+            Tuple `(results, texts, prompts, audios, transcriptions, si_sdr_measures)`, where `results`
+            always holds "wer" and, when `wer` reports a clean word error, "clean_wer",
+            "noisy_word_error" and "percent_clean_samples" as well.
+        """
+        # decode the conditioning back to text and bring the waveforms to numpy
+        texts = description_tokenizer.batch_decode(descriptions, skip_special_tokens=True)
+        prompts = prompt_tokenizer.batch_decode(prompts, skip_special_tokens=True)
+        audios = [waveform.float().cpu().numpy() for waveform in audios]
+
+        results = {}
+        if compute_clap_similarity_metric:
+            results["clap"] = clap_similarity(
+                model_args.clap_model_name_or_path, texts, audios, device, input_sampling_rate=sampling_rate
+            )
+
+        if compute_noise_level_metric:
+            si_sdr_measures = si_sdr(audios, device, input_sampling_rate=sampling_rate)
+        else:
+            si_sdr_measures = None
+
+        wer_outputs = wer(
+            model_args.asr_model_name_or_path,
+            prompts,
+            audios,
+            device,
+            training_args.per_device_eval_batch_size,
+            sampling_rate,
+            noise_level_to_compute_clean_wer,
+            si_sdr_measures,
+        )
+        word_error, transcriptions, clean_word_error, noisy_word_error, percent_clean_samples = wer_outputs
+
+        results["wer"] = word_error
+        if clean_word_error is not None:
+            results.update(
+                {
+                    "clean_wer": clean_word_error,
+                    "noisy_word_error": noisy_word_error,
+                    "percent_clean_samples": percent_clean_samples,
+                }
+            )
+
+        return results, texts, prompts, audios, transcriptions, si_sdr_measures
+
+    # Define Training Schedule
+    # Store some constants
+    per_device_train_batch_size = int(training_args.per_device_train_batch_size)
+    train_batch_size = per_device_train_batch_size * accelerator.num_processes
+    gradient_accumulation_steps = int(training_args.gradient_accumulation_steps)
+    per_device_eval_batch_size = int(training_args.per_device_eval_batch_size)
+
+    if training_args.max_steps < 0:
+        num_epochs = int(training_args.num_train_epochs)
+        steps_per_epoch = len(vectorized_datasets["train"]) // (train_batch_size * gradient_accumulation_steps)
+        total_train_steps = steps_per_epoch * num_epochs
+    elif training_args.max_steps > 0:
+        logger.info("max_steps is given, it will override any value given in num_train_epochs")
+        total_train_steps = int(training_args.max_steps)
+        # Setting a very large number of epochs so we go as many times as necessary over the iterator.
+        num_epochs = sys.maxsize
+        steps_per_epoch = total_train_steps
+
+    if training_args.eval_steps is None:
+        logger.info(f"eval_steps is not set, evaluating at the end of each epoch")
+        eval_steps = steps_per_epoch
+    else:
+        eval_steps = training_args.eval_steps
+        
+    if training_args.eval_generation_steps is None:
+        eval_generation_steps = eval_steps
+    else:
+        eval_generation_steps = training_args.eval_generation_steps
+
+    # T5 doesn't support fp16
+    autocast_kwargs = AutocastKwargs(enabled=(mixed_precision != "fp16"))
+
+    # Define optimizer, LR scheduler, collator
+    optimizer = torch.optim.AdamW(
+        params=model.parameters(),
+        lr=training_args.learning_rate,
+        betas=(training_args.adam_beta1, training_args.adam_beta2),
+        eps=training_args.adam_epsilon,
+        weight_decay=training_args.weight_decay,
+    )
+
+    # LR scheduler gets stepped by `num_processes` each time -> account for this in warmup / total steps
+    lr_scheduler = get_scheduler(
+        name=training_args.lr_scheduler_type,
+        optimizer=optimizer,
+        num_warmup_steps=training_args.get_warmup_steps(total_train_steps) * accelerator.num_processes,
+        num_training_steps=total_train_steps * accelerator.num_processes,
+    )
+
+    # Instantiate custom data collator
+    data_collator = DataCollatorParlerTTSWithPadding(
+        prompt_tokenizer=prompt_tokenizer,
+        description_tokenizer=description_tokenizer,
+        pad_to_multiple_of=data_args.pad_to_multiple_of,
+        padding=padding,
+        prompt_max_length=data_args.max_prompt_token_length,
+        description_max_length=data_args.max_description_token_length,
+        audio_max_length=audio_max_length,
+    )
+
+    # Prepare everything with accelerate
+    model, optimizer, lr_scheduler = accelerator.prepare(model, optimizer, lr_scheduler)
+
+    num_examples = total_train_steps * train_batch_size * gradient_accumulation_steps
+    logger.info("***** Running training *****")
+    logger.info(f"  Num examples = {num_examples}")
+    logger.info("  Instantaneous batch size per device =" f" {per_device_train_batch_size}")
+    logger.info("  Gradient accumulation steps =" f" {gradient_accumulation_steps}")
+    logger.info(
+        f"  Total train batch size (w. parallel & distributed) = {train_batch_size * gradient_accumulation_steps}"
+    )
+    logger.info(f"  Total optimization steps = {total_train_steps}")
+
+    # ======================== Training ================================
+    train_time = 0
+    train_start = time.time()
+    steps_trained_progress_bar = tqdm(
+        range(total_train_steps), desc="Train steps ... ", position=0, disable=not accelerator.is_local_main_process
+    )
+    continue_training = True
+    epochs_trained = 0
+    cur_step = 0
+
+    checkpoint = None
+    if training_args.resume_from_checkpoint is not None:
+        checkpoint = training_args.resume_from_checkpoint
+    elif last_checkpoint is not None:
+        checkpoint = last_checkpoint
+
+    if accelerator.is_main_process:
+        if training_args.push_to_hub:
+            api = HfApi(token=training_args.hub_token)
+
+            # Create repo (repo_name from args or inferred)
+            repo_name = training_args.hub_model_id
+            if repo_name is None:
+                repo_name = Path(training_args.output_dir).absolute().name
+            repo_id = api.create_repo(repo_name, exist_ok=True).repo_id
+
+            with open(os.path.join(training_args.output_dir, ".gitignore"), "w+") as gitignore:
+                if "wandb" not in gitignore:
+                    gitignore.write("wandb\n")
+        elif training_args.output_dir is not None:
+            os.makedirs(training_args.output_dir, exist_ok=True)
+    accelerator.wait_for_everyone()
+
+    # Now save everything to be able to create a single processor later
+    # make sure all processes wait until data is saved
+    # only the main process saves them
+    if accelerator.is_main_process:
+        # save feature extractor, tokenizer and config
+        if (
+            model_args.prompt_tokenizer_name is None
+            and model_args.description_tokenizer_name
+            or (model_args.prompt_tokenizer_name == model_args.description_tokenizer_name)
+        ):
+            prompt_tokenizer.save_pretrained(training_args.output_dir)
+        else:
+            logger.warning(
+                f"Prompt tokenizer ('{model_args.prompt_tokenizer_name}') and description tokenizer ('{model_args.description_tokenizer_name}') are not the same. Saving only the prompt tokenizer."
+            )
+            prompt_tokenizer.save_pretrained(training_args.output_dir)
+
+        feature_extractor.save_pretrained(training_args.output_dir)
+        config.save_pretrained(training_args.output_dir)
+    accelerator.wait_for_everyone()
+
+    if checkpoint is not None:
+        accelerator.load_state(checkpoint)
+        # Find num steps and epoch from saved state string pattern
+        pattern = r"checkpoint-(\d+)-epoch-(\d+)"
+        match = re.search(pattern, checkpoint)
+        cur_step = int(match.group(1))
+        epochs_trained = int(match.group(2))
+
+        logger.info("  Continuing training from checkpoint, will skip to saved global_step")
+        logger.info(f"  Continuing training from epoch {epochs_trained}")
+        logger.info(f"  Continuing training from global step {cur_step}")
+
+        steps_trained_progress_bar.update(cur_step)
+
+        for epoch in range(0, epochs_trained):
+            with accelerator.local_main_process_first():
+                vectorized_datasets["train"] = vectorized_datasets["train"].shuffle(training_args.seed)
+
+        if training_args.max_steps < 0:
+            # we know exactly the number of steps per epoch, so can skip through the required number of batches
+            resume_step = (cur_step - epochs_trained * steps_per_epoch) * gradient_accumulation_steps
+        else:
+            # Currently we don't know how many steps we've taken in the current epoch
+            # So we just shuffle the dataset one extra time and start from a fresh epoch
+            # This is "good enough" for our purposes but not fully correct
+            resume_step = None
+            with accelerator.local_main_process_first():
+                vectorized_datasets["train"] = vectorized_datasets["train"].shuffle(training_args.seed)
+    else:
+        resume_step = None
+
+    gen_kwargs = {
+        "do_sample": model_args.do_sample,
+        "temperature": model_args.temperature,
+        "max_length": model_args.max_length,
+        # Because of the delayed pattern mask, generation might stop earlier because of unexpected behaviour
+        # on the first tokens of the codebooks that are delayed.
+        # This fix the issue.
+        "min_new_tokens": num_codebooks + 1,
+    }
+
+    # Define gradient update step fn
+    def train_step(
+        batch,
+        accelerator,
+        autocast_kwargs,
+        num_items_in_batch,
+        gradient_accumulation_steps,
+    ):
+        """Run one training forward pass and return the scaled loss with per-codebook metrics.
+
+        When `mixed_precision == "fp16"`, the text encoder is run separately inside
+        `accelerator.autocast(autocast_handler=autocast_kwargs)` and its (optionally projected and
+        masked) output is stored in `batch["encoder_outputs"]` before the main forward pass.
+
+        Args:
+            batch: mapping of model inputs; mutated in place when encoder outputs are precomputed.
+            accelerator: provides `autocast` and `num_processes`.
+            autocast_kwargs: handler forwarded to `accelerator.autocast`.
+            num_items_in_batch: divisor applied to the summed losses.
+            gradient_accumulation_steps: multiplier applied to the summed losses.
+
+        Returns:
+            Tuple `(ce_loss, metrics)`; `metrics` holds "loss" plus one "codebook_{i}_loss" entry
+            per element of `outputs.per_codebook_losses`.
+        """
+        if mixed_precision == "fp16":
+            # fp16 doesn't work with T5-like models
+            with accelerator.autocast(autocast_handler=autocast_kwargs):
+                # in distributed mode the submodules are reached through `model.module`
+                if training_args.parallel_mode.value != "distributed":
+                    encoder_outputs = model.text_encoder(
+                        input_ids=batch.get("input_ids"), attention_mask=batch.get("attention_mask", None)
+                    )
+                else:
+                    encoder_outputs = model.module.text_encoder(
+                        input_ids=batch.get("input_ids"), attention_mask=batch.get("attention_mask", None)
+                    )
+                # we optionally project last_hidden_state to avoid recomputing every time
+                encoder_hidden_states = encoder_outputs.last_hidden_state
+                if (
+                    config.text_encoder.hidden_size != config.decoder.hidden_size
+                    and config.decoder.cross_attention_hidden_size is None
+                ):
+                    encoder_hidden_states = (
+                        model.enc_to_dec_proj(encoder_hidden_states)
+                        if training_args.parallel_mode.value != "distributed"
+                        else model.module.enc_to_dec_proj(encoder_hidden_states)
+                    )
+
+                # zero out the hidden states at positions where the attention mask is 0
+                if batch.get("attention_mask", None) is not None:
+                    encoder_hidden_states = encoder_hidden_states * batch.get("attention_mask", None)[..., None]
+
+                encoder_outputs.last_hidden_state = encoder_hidden_states
+                batch["encoder_outputs"] = encoder_outputs
+
+        # the loss is requested as a sum so it can be normalized by `num_items_in_batch` below
+        outputs = model(**batch, loss_reduction="sum")
+        # CE (data) loss
+        # NOTE(review): the `gradient_accumulation_steps * num_processes` factor presumably offsets the
+        # averaging applied later across accumulation steps and processes - confirm against the training loop
+        ce_loss = (outputs.loss * gradient_accumulation_steps * accelerator.num_processes) / num_items_in_batch
+
+        metrics = {"loss": ce_loss}
+        
+        # per-codebook CE losses, scaled the same way as the total loss
+        per_codebook_losses = outputs.per_codebook_losses
+        metrics.update({f"codebook_{i}_loss": ((l  * gradient_accumulation_steps * accelerator.num_processes) / num_items_in_batch) for (i,l) in enumerate(per_codebook_losses)})
+        return ce_loss, metrics
+
+    # Define eval fn
+    def eval_step(
+        batch,
+        accelerator,
+        autocast_kwargs,
+    ):
+        """Run one evaluation forward pass without gradients and return the loss metrics.
+
+        When `mixed_precision == "fp16"`, the text encoder is run separately inside
+        `accelerator.autocast(autocast_handler=autocast_kwargs)` and its (optionally projected and
+        masked) output is stored in `batch["encoder_outputs"]` before the main forward pass.
+
+        Args:
+            batch: mapping of model inputs; mutated in place when encoder outputs are precomputed.
+            accelerator: provides `autocast`.
+            autocast_kwargs: handler forwarded to `accelerator.autocast`.
+
+        Returns:
+            dict with "loss" plus one "codebook_{i}_loss" entry per element of
+            `outputs.per_codebook_losses`.
+        """
+        # with torch.compile, evaluate through the original (uncompiled) module
+        eval_model = model if not training_args.torch_compile else model._orig_mod
+
+        if mixed_precision == "fp16":
+            # fp16 doesn't work with T5-like models
+            # NOTE(review): the encoder pre-pass below goes through `model`, not `eval_model` - confirm
+            # this is intended when `torch_compile` is enabled
+            with accelerator.autocast(autocast_handler=autocast_kwargs):
+                # in distributed mode the submodules are reached through `model.module`
+                if training_args.parallel_mode.value != "distributed":
+                    encoder_outputs = model.text_encoder(
+                        input_ids=batch.get("input_ids"), attention_mask=batch.get("attention_mask", None)
+                    )
+                else:
+                    encoder_outputs = model.module.text_encoder(
+                        input_ids=batch.get("input_ids"), attention_mask=batch.get("attention_mask", None)
+                    )
+                # we optionally project last_hidden_state to avoid recomputing every time
+                encoder_hidden_states = encoder_outputs.last_hidden_state
+                if (
+                    config.text_encoder.hidden_size != config.decoder.hidden_size
+                    and config.decoder.cross_attention_hidden_size is None
+                ):
+                    encoder_hidden_states = (
+                        model.enc_to_dec_proj(encoder_hidden_states)
+                        if training_args.parallel_mode.value != "distributed"
+                        else model.module.enc_to_dec_proj(encoder_hidden_states)
+                    )
+
+                # zero out the hidden states at positions where the attention mask is 0
+                if batch.get("attention_mask", None) is not None:
+                    encoder_hidden_states = encoder_hidden_states * batch.get("attention_mask", None)[..., None]
+
+                encoder_outputs.last_hidden_state = encoder_hidden_states
+                batch["encoder_outputs"] = encoder_outputs
+
+        with torch.no_grad():
+            outputs = eval_model(**batch)
+        # CE (data) loss
+        ce_loss = outputs.loss
+        metrics = {"loss": ce_loss}
+        
+        # per-codebook CE losses, reported unscaled
+        per_codebook_losses = outputs.per_codebook_losses
+        metrics.update({f"codebook_{i}_loss": l for (i,l) in enumerate(per_codebook_losses)})
+        return metrics
+
+    def generate_step(batch, accelerator):
+        """Generate audio for `batch` and pad the result so it can be gathered across processes.
+
+        Args:
+            batch: mapping of generation inputs; "decoder_attention_mask" is removed from it in place.
+            accelerator: used to unwrap the model and to pad the generated output across processes.
+
+        Returns:
+            The generated output, padded along dim 1 with 0.
+        """
+        batch.pop("decoder_attention_mask", None)
+        unwrapped_model = accelerator.unwrap_model(model, keep_fp32_wrapper=True)
+        # if the model is compiled, we use the original model bc compile is not compatible with .generate
+        generation_model = model._orig_mod if training_args.torch_compile else unwrapped_model
+
+        # since we might have loaded the weights in fp32, we have to autocast to ensure FA2 weights are in half-precision.
+        # with accelerator.autocast(autocast_handler=AutocastKwargs(enabled=(attn_implementation=="flash_attention_2"))):
+        generated_audios = generation_model.generate(**batch, **gen_kwargs)
+        return accelerator.pad_across_processes(generated_audios, dim=1, pad_index=0)
+
+    model.train()
+
+    total_batched_samples = resume_step if resume_step is not None else 0
+    for epoch in range(epochs_trained, num_epochs):
+        with accelerator.local_main_process_first():
+            vectorized_datasets["train"] = vectorized_datasets["train"].shuffle(training_args.seed)
+        sampler = None
+        if training_args.group_by_length:
+            sampler = LengthGroupedSampler(train_batch_size, lengths=vectorized_datasets["train"]["target_length"])
+        train_dataloader = DataLoader(
+            vectorized_datasets["train"],
+            collate_fn=data_collator,
+            batch_size=per_device_train_batch_size,
+            sampler=sampler,
+            shuffle=not training_args.group_by_length,
+            num_workers=training_args.dataloader_num_workers,
+            pin_memory=training_args.dataloader_pin_memory,
+        )
+        train_dataloader = accelerator.prepare(train_dataloader)
+        if hasattr(train_dataloader, "dataset") and isinstance(train_dataloader.dataset, IterableDataset):
+            train_dataloader.dataset.set_epoch(epoch)
+
+        if resume_step is not None:
+            # Skip the first N batches in the dataloader when resuming from a checkpoint
+            logger.info(f"  Skip first {resume_step} batches")
+            train_dataloader = accelerator.skip_first_batches(train_dataloader, resume_step)
+            resume_step = None
+            accelerator.wait_for_everyone()
+
+        # We chunkify the epoch iterator into gradient accumulation steps `n` batches
+        train_iterator = iter(train_dataloader)
+        num_steps_in_epoch = len(train_dataloader)
+        remainder = num_steps_in_epoch % gradient_accumulation_steps
+        remainder = remainder if remainder != 0 else gradient_accumulation_steps
+        total_updates = math.ceil(num_steps_in_epoch / gradient_accumulation_steps)
+        
+        update_step = -1
+        for _ in range(total_updates):
+            update_step += 1
+            
+            # preload the total batch per step
+            batch_samples = []
+            num_batches_in_step = gradient_accumulation_steps if update_step != (total_updates - 1) else remainder
+            for _ in range(num_batches_in_step):
+                batch_samples += [next(train_iterator)]
+                
+            # get num items in batch - if different than BOS and than -100
+            num_items_in_batch = sum([(batch["labels"].ne(audio_encoder_bos_token_id) | batch["labels"].ne(-100) | batch["labels"].ne(audio_encoder_eos_token_id)).sum((0,1))[0] for batch in batch_samples])
+            num_items_in_batch = accelerator.gather(num_items_in_batch).sum().item()
+            
+            # losses = []
+            for i,batch in enumerate(batch_samples):
+                total_batched_samples += 1
+                ctx = model.no_sync if (i < len(batch_samples) - 1 and accelerator.num_processes > 1) else contextlib.nullcontext
+                
+                with ctx():
+                    loss, train_metric = train_step(batch, accelerator, autocast_kwargs, num_items_in_batch, gradient_accumulation_steps)
+                    accelerator.backward(loss)
+                    # losses.append(loss.detach())
+            
+            grad_norm = accelerator.clip_grad_norm_(model.parameters(), training_args.max_grad_norm)
+            optimizer.step()
+            lr_scheduler.step()
+            optimizer.zero_grad()
+
+            # The accelerator has performed an optimization step behind the scenes
+            steps_trained_progress_bar.update(1)
+            cur_step += 1
+
+            # losses = accelerator.gather(sum(losses)).sum().item() / (accelerator.num_processes * gradient_accumulation_steps)
+            
+            if cur_step % training_args.logging_steps == 0:
+                steps_trained_progress_bar.write(
+                    f"Step... ({cur_step} / {total_train_steps} | Loss:"
+                    f" {train_metric['loss']}, Learning Rate:"
+                    f" {lr_scheduler.get_last_lr()[0]})"
+                )
+                train_metric["grad_norm"] = grad_norm.detach().item() if isinstance(grad_norm, torch.Tensor) else grad_norm
+                log_metric(
+                    accelerator,
+                    metrics=train_metric,
+                    learning_rate=lr_scheduler.get_last_lr()[0],
+                    train_time=train_time + time.time() - train_start,
+                    step=cur_step,
+                    epoch=epoch,
+                    prefix="train",
+                )
+
+            # save checkpoint and weights after each save_steps and at the end of training
+            if (cur_step % training_args.save_steps == 0) or cur_step == total_train_steps:
+                intermediate_dir = os.path.join(training_args.output_dir, f"checkpoint-{cur_step}-epoch-{epoch}")
+                # safe_serialization=False to avoid shared tensors saving issue (TODO(YL): it's a temporary fix)
+                # https://github.com/huggingface/transformers/issues/27293#issuecomment-1872560074
+                accelerator.save_state(output_dir=intermediate_dir, safe_serialization=False)
+                accelerator.wait_for_everyone()
+                if accelerator.is_main_process:
+                    rotate_checkpoints(
+                        training_args.save_total_limit, output_dir=training_args.output_dir, logger=logger
+                    )
+
+                    if cur_step == total_train_steps:
+                        # un-wrap student model for save
+                        unwrapped_model = accelerator.unwrap_model(model)
+                        unwrapped_model.save_pretrained(training_args.output_dir)
+
+                    if training_args.push_to_hub:
+                        api.upload_folder(
+                            repo_id=repo_id,
+                            folder_path=training_args.output_dir,
+                            commit_message=f"Saving train state of step {cur_step}",
+                            run_as_future=True,
+                        )
+                accelerator.wait_for_everyone()
+
+            if training_args.do_eval and (cur_step % eval_steps == 0 or cur_step == total_train_steps):
+                train_time += time.time() - train_start
+                # ======================== Evaluating ==============================
+                model.eval()
+                eval_metrics = []
+                eval_preds = []
+                eval_descriptions = []
+                eval_prompts = []
+                eval_start = time.time()
+
+                # release training input batch
+                batch = release_memory(batch)
+
+                validation_dataloader = DataLoader(
+                    vectorized_datasets["eval"],
+                    collate_fn=data_collator,
+                    batch_size=per_device_eval_batch_size,
+                    drop_last=False,
+                    num_workers=training_args.eval_dataloader_num_workers,
+                    pin_memory=training_args.dataloader_pin_memory,
+                )
+                validation_dataloader = accelerator.prepare(validation_dataloader)
+
+                for batch in tqdm(
+                    validation_dataloader,
+                    desc=f"Evaluating - Inference ...",
+                    position=2,
+                    disable=not accelerator.is_local_main_process,
+                ):
+                    # Model forward
+                    eval_metric = eval_step(batch, accelerator, autocast_kwargs)
+                    eval_metric = accelerator.gather_for_metrics(eval_metric)
+                    eval_metric = {key: val.unsqueeze(0) if val.ndim == 0 else val for (key,val) in eval_metric.items()}
+                    eval_metrics.append(eval_metric)
+
+                if training_args.predict_with_generate and (cur_step % eval_generation_steps == 0 or cur_step == total_train_steps):
+                    validation_dataloader = DataLoader(
+                        vectorized_datasets["eval"],
+                        collate_fn=data_collator,
+                        batch_size=per_device_eval_batch_size,
+                        drop_last=False,
+                        num_workers=training_args.eval_dataloader_num_workers,
+                        pin_memory=training_args.dataloader_pin_memory,
+                    )
+                    validation_dataloader = accelerator.prepare(validation_dataloader)
+                    # generation
+                    for batch in tqdm(
+                        validation_dataloader,
+                        desc=f"Evaluating - Generation ...",
+                        position=2,
+                        disable=not accelerator.is_local_main_process,
+                    ):
+                        generated_audios = generate_step(batch, accelerator)
+                        # Gather all predictions and targets
+                        generated_audios, input_ids, prompts = accelerator.pad_across_processes(
+                            (generated_audios, batch["input_ids"], batch["prompt_input_ids"]), dim=1, pad_index=0
+                        )
+                        generated_audios, input_ids, prompts = accelerator.gather_for_metrics(
+                            (generated_audios, input_ids, prompts)
+                        )
+                        eval_preds.extend(generated_audios.to("cpu"))
+                        eval_descriptions.extend(input_ids.to("cpu"))
+                        eval_prompts.extend(prompts.to("cpu"))
+
+                eval_time = time.time() - eval_start
+                # normalize eval metrics
+                eval_metrics = {
+                    key: torch.mean(torch.cat([d[key] for d in eval_metrics])).to("cpu") for key in eval_metrics[0]
+                }
+
+                # compute metrics
+                metrics_desc = ""
+                if training_args.predict_with_generate and (cur_step % eval_generation_steps == 0 or cur_step == total_train_steps):
+                    if accelerator.is_local_main_process:
+                        (
+                            metric_values,
+                            pred_descriptions,
+                            pred_prompts,
+                            audios,
+                            transcriptions,
+                            si_sdr_measures,
+                        ) = compute_metrics(
+                            eval_preds,
+                            eval_descriptions,
+                            eval_prompts,
+                            accelerator.device,
+                            training_args.compute_clap_similarity_metric,
+                            training_args.compute_noise_level_metric,
+                            training_args.noise_level_to_compute_clean_wer,
+                        )
+                        eval_metrics.update(metric_values)
+                        metrics_desc = " ".join([f"Eval {key}: {value} |" for key, value in metric_values.items()])
+                        if "wandb" in training_args.report_to:
+                            log_pred(
+                                accelerator,
+                                pred_descriptions,
+                                pred_prompts,
+                                transcriptions,
+                                audios,
+                                si_sdr_measures,
+                                sampling_rate=sampling_rate,
+                                step=cur_step,
+                                prefix="eval",
+                            )
+                    accelerator.wait_for_everyone()
+
+                # Print metrics and update progress bar
+                if accelerator.is_local_main_process:
+                    steps_trained_progress_bar.write(
+                        f"Eval results for step ({cur_step} / {total_train_steps} | Eval Loss: {eval_metrics['loss']} |"
+                        f" {metrics_desc})"
+                    )
+
+                log_metric(
+                    accelerator,
+                    metrics=eval_metrics,
+                    train_time=eval_time,
+                    step=cur_step,
+                    epoch=epoch,
+                    prefix="eval",
+                )
+
+                # release eval batch and relax metrics
+                eval_metrics, eval_preds, eval_descriptions, eval_prompts, batch, eval_metric = release_memory(
+                    eval_metrics, eval_preds, eval_descriptions, eval_prompts, batch, eval_metric
+                )
+                if training_args.predict_with_generate and (cur_step % eval_generation_steps == 0 or cur_step == total_train_steps):
+                    generated_audios, input_ids, prompts = release_memory(generated_audios, input_ids, prompts)
+
+                # train mode
+                model.train()
+
+                # flush the train metrics
+                train_start = time.time()
+
+            # break condition
+            if cur_step == total_train_steps:
+                continue_training = False
+                break
+
+        if not continue_training:
+            break
+
+    accelerator.end_training()
+
+
+# Script entry point: run the training routine only when this file is executed directly.
+if __name__ == "__main__":
+    main()
diff --git a/capspeech/ar/training/utils.py b/capspeech/ar/training/utils.py
new file mode 100644
index 0000000000000000000000000000000000000000..d1a0f03dce0da74625e80c21820261c2ac0b7d29
--- /dev/null
+++ b/capspeech/ar/training/utils.py
@@ -0,0 +1,203 @@
+import os
+import re
+import shutil
+from dataclasses import field
+from pathlib import Path
+from typing import Dict, List
+
+import torch
+from datasets import concatenate_datasets, load_from_disk
+from wandb import Audio
+from datasets import load_from_disk, concatenate_datasets
+
+
+def list_field(default=None, metadata=None):
+    return field(default_factory=lambda: default, metadata=metadata)
+
+
+# Matches training-state directory names of the form "checkpoint-<step>-epoch-<epoch>".
+_RE_CHECKPOINT = re.compile(r"^checkpoint-(\d+)-epoch-(\d+)$")
+# Name prefix used by save_codec_checkpoint when writing "<prefix>-<step>" entries.
+CHECKPOINT_CODEC_PREFIX = "checkpoint"
+# Matches codec checkpoint names of the form "checkpoint-<step>" (no epoch suffix).
+_RE_CODEC_CHECKPOINT = re.compile(r"^checkpoint-(\d+)$")
+
+
+def get_last_checkpoint(folder):
+    content = os.listdir(folder)
+    checkpoints = [
+        path
+        for path in content
+        if _RE_CHECKPOINT.search(path) is not None and os.path.isdir(os.path.join(folder, path))
+    ]
+    if len(checkpoints) == 0:
+        return
+    return os.path.join(folder, max(checkpoints, key=lambda x: int(_RE_CHECKPOINT.search(x).groups()[0])))
+
+
+def sorted_checkpoints(output_dir=None, checkpoint_prefix="checkpoint") -> List[str]:
+    """Helper function to sort saved checkpoints from oldest to newest."""
+    ordering_and_checkpoint_path = []
+
+    glob_checkpoints = [str(x) for x in Path(output_dir).glob(f"{checkpoint_prefix}-*") if os.path.isdir(x)]
+
+    for path in glob_checkpoints:
+        regex_match = re.match(f".*{checkpoint_prefix}-([0-9]+)", path)
+        if regex_match is not None and regex_match.groups() is not None:
+            ordering_and_checkpoint_path.append((int(regex_match.groups()[0]), path))
+
+    checkpoints_sorted = sorted(ordering_and_checkpoint_path)
+    checkpoints_sorted = [checkpoint[1] for checkpoint in checkpoints_sorted]
+    return checkpoints_sorted
+
+
+def rotate_checkpoints(save_total_limit=None, output_dir=None, checkpoint_prefix="checkpoint", logger=None) -> None:
+    """Helper function to delete old checkpoints."""
+    if save_total_limit is None or save_total_limit <= 0:
+        return
+    # Check if we should delete older checkpoint(s)
+    checkpoints_sorted = sorted_checkpoints(output_dir=output_dir, checkpoint_prefix=checkpoint_prefix)
+    if len(checkpoints_sorted) <= save_total_limit:
+        return
+
+    number_of_checkpoints_to_delete = max(0, len(checkpoints_sorted) - save_total_limit)
+    checkpoints_to_be_deleted = checkpoints_sorted[:number_of_checkpoints_to_delete]
+    for checkpoint in checkpoints_to_be_deleted:
+        logger.info(f"Deleting older checkpoint [{checkpoint}] due to args.save_total_limit")
+        shutil.rmtree(checkpoint, ignore_errors=True)
+
+
+def save_codec_checkpoint(output_dir, dataset, step):
+    checkpoint_path = f"{CHECKPOINT_CODEC_PREFIX}-{step}"
+    output_path = os.path.join(output_dir, checkpoint_path)
+    dataset.save_to_disk(output_path)
+
+
+def load_codec_checkpoint(checkpoint_path):
+    dataset = load_from_disk(checkpoint_path)
+    return dataset
+
+
+def sorted_codec_checkpoints(output_dir=None) -> List[str]:
+    """Helper function to sort saved checkpoints from oldest to newest."""
+    ordering_and_checkpoint_path = []
+
+    glob_checkpoints = [str(x) for x in Path(output_dir).glob(f"{CHECKPOINT_CODEC_PREFIX}-*")]
+
+    for path in glob_checkpoints:
+        regex_match = re.match(f".*{CHECKPOINT_CODEC_PREFIX}-([0-9]+)", path)
+        if regex_match is not None and regex_match.groups() is not None:
+            ordering_and_checkpoint_path.append((int(regex_match.groups()[0]), path))
+
+    checkpoints_sorted = sorted(ordering_and_checkpoint_path)
+    checkpoints_sorted = [checkpoint[1] for checkpoint in checkpoints_sorted]
+    return checkpoints_sorted
+
+
+def load_all_codec_checkpoints(output_dir=None) -> List[str]:
+    """Helper function to load and concat all checkpoints."""
+    checkpoints_sorted = sorted_codec_checkpoints(output_dir=output_dir)
+    datasets = [load_from_disk(checkpoint) for checkpoint in checkpoints_sorted]
+    datasets = concatenate_datasets(datasets, axis=0)
+    return datasets
+
+
+def get_last_codec_checkpoint_step(folder) -> int:
+    if not os.path.exists(folder) or not os.path.isdir(folder):
+        os.makedirs(folder, exist_ok=True)
+        return 0
+    content = os.listdir(folder)
+    checkpoints = [path for path in content if _RE_CODEC_CHECKPOINT.search(path) is not None]
+    if len(checkpoints) == 0:
+        return 0
+    last_checkpoint = os.path.join(
+        folder, max(checkpoints, key=lambda x: int(_RE_CODEC_CHECKPOINT.search(x).groups()[0]))
+    )
+    # Find num steps saved state string pattern
+    pattern = r"checkpoint-(\d+)"
+    match = re.search(pattern, last_checkpoint)
+    cur_step = int(match.group(1))
+    return cur_step
+
+
+def log_metric(
+    accelerator,
+    metrics: Dict,
+    train_time: float,
+    step: int,
+    epoch: int,
+    learning_rate: float = None,
+    prefix: str = "train",
+):
+    """Helper function to log all training/evaluation metrics with the correct prefixes and styling."""
+    log_metrics = {}
+    for k, v in metrics.items():
+        if "codebook" in k:
+            log_metrics[f"codebook_{prefix}/{k}"] = v
+        else:
+            log_metrics[f"{prefix}/{k}"] = v
+    log_metrics[f"{prefix}/time"] = train_time
+    log_metrics[f"{prefix}/epoch"] = epoch
+    if learning_rate is not None:
+        log_metrics[f"{prefix}/learning_rate"] = learning_rate
+    accelerator.log(log_metrics, step=step)
+
+
+def log_pred(
+    accelerator,
+    pred_descriptions: List[str],
+    pred_prompts: List[str],
+    transcriptions: List[str],
+    audios: List[torch.Tensor],
+    si_sdr_measures: List[float],
+    sampling_rate: int,
+    step: int,
+    prefix: str = "eval",
+    num_lines: int = 200000,
+):
+    """Helper function to log target/predicted transcriptions to weights and biases (wandb)."""
+    if accelerator.is_main_process:
+        wandb_tracker = accelerator.get_tracker("wandb")
+        # pretty name for current step: step 50000 -> step 50k
+        cur_step_pretty = f"{int(step // 1000)}k" if step > 1000 else step
+        prefix_pretty = prefix.replace("/", "-")
+
+        if si_sdr_measures is None:
+            # convert str data to a wandb compatible format
+            str_data = [
+                [pred_descriptions[i], pred_prompts[i], transcriptions[i]] for i in range(len(pred_descriptions))
+            ]
+            # log as a table with the appropriate headers
+            wandb_tracker.log_table(
+                table_name=f"predictions/{prefix_pretty}-step-{cur_step_pretty}",
+                columns=["Target descriptions", "Target prompts", "Predicted transcriptions"],
+                data=str_data[:num_lines],
+                step=step,
+                commit=False,
+            )
+        else:
+            # convert str data to a wandb compatible format
+            str_data = [
+                [pred_descriptions[i], pred_prompts[i], transcriptions[i], si_sdr_measures[i]]
+                for i in range(len(pred_descriptions))
+            ]
+            # log as a table with the appropriate headers
+            wandb_tracker.log_table(
+                table_name=f"predictions/{prefix_pretty}-step-{cur_step_pretty}",
+                columns=["Target descriptions", "Target prompts", "Predicted transcriptions", "Noise estimation"],
+                data=str_data[:num_lines],
+                step=step,
+                commit=False,
+            )
+
+        # wandb can only loads 100 audios per step
+        wandb_tracker.log(
+            {
+                "Speech samples": [
+                    Audio(
+                        audio,
+                        caption=f"{pred_prompts[i]} --- DESCRIPTION: {pred_descriptions[i]}",
+                        sample_rate=sampling_rate,
+                    )
+                    for (i, audio) in enumerate(audios[: min(len(audios), 100)])
+                ]
+            },
+            step=step,
+        )
diff --git a/capspeech/eval/README.md b/capspeech/eval/README.md
new file mode 100644
index 0000000000000000000000000000000000000000..812acf63ccaa30ab39e8c587a1a665c297284b1b
--- /dev/null
+++ b/capspeech/eval/README.md
@@ -0,0 +1,42 @@
+# CapSpeech Evaluation Tools
+
+## Getting Started
+Install dependencies:
+```bash
+conda create -n capeval python=3.9
+conda activate capeval
+pip install -r requirements.txt
+pip install git+https://github.com/sarulab-speech/UTMOSv2.git
+```
+
+For ASR, we need:
+```bash
+conda install ffmpeg
+```
+
+## Evaluate pitch, monotony, speed, age, gender
+RUN:
+```bash
+python base_eval.py
+```
+
+## Evaluate UTMOSv2
+RUN:
+```bash
+python mos_eval.py
+```
+
+## Evaluate ASR Results
+RUN:
+```bash
+python asr_eval.py
+```
+
+## Evaluate emotion, accent
+RUN:
+```bash
+cd src/example/
+python categorized_emotion.py
+python dialect_world_dialect.py
+```
+Please refer to [Vox-profile](https://github.com/tiantiaf0627/vox-profile-release.git) for more evaluation tools.
diff --git a/capspeech/eval/__init__.py b/capspeech/eval/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391
diff --git a/capspeech/eval/age_gender.py b/capspeech/eval/age_gender.py
new file mode 100644
index 0000000000000000000000000000000000000000..7d2aa9f843c81cbf6719ba3d6eae11581331399d
--- /dev/null
+++ b/capspeech/eval/age_gender.py
@@ -0,0 +1,35 @@
+import audeer
+import audonnx
+import numpy as np
+
+def age_gender_apply(waveform):
+    age_labels = ['child', 'teenager', 'young adult', 'middle-aged adult', 'elderly']
+    gender_labels = ['female', 'male']
+    url = 'https://zenodo.org/record/7761387/files/w2v2-L-robust-6-age-gender.25c844af-1.1.1.zip'
+    cache_root = audeer.mkdir('cache')
+    model_root = audeer.mkdir('model')
+    sampling_rate = 16000
+    archive_path = audeer.download_url(url, cache_root, verbose=True)
+    audeer.extract_archive(archive_path, model_root)
+    model = audonnx.load(model_root)
+
+    result = model(waveform, sampling_rate)
+    # Process age
+    age_label = result['logits_age'].squeeze() * 100.0
+    if age_label <= 12:
+        age_label = 'child'
+    elif age_label <= 19:
+        age_label = 'teenager'
+    elif age_label <= 39:
+        age_label = 'young adult'
+    elif age_label <= 64:
+        age_label = 'middle-aged adult'
+    else:
+        age_label = 'elderly'
+
+    # Process gender
+    gender_label = result['logits_gender'].squeeze()
+    gender_label = gender_label[:2]  # Remove child
+    gender_label = np.argmax(gender_label)
+
+    return age_label, gender_labels[gender_label]
diff --git a/capspeech/eval/asr_eval.py b/capspeech/eval/asr_eval.py
new file mode 100644
index 0000000000000000000000000000000000000000..7a7a793146841664298710012d4ee3bf818e7142
--- /dev/null
+++ b/capspeech/eval/asr_eval.py
@@ -0,0 +1,24 @@
+from jiwer import wer as calculate_wer
+from jiwer import cer as calculate_cer
+from whisper.normalizers import EnglishTextNormalizer
+import whisper
+import torch
+
+# Text normalizer shared by the ASR output and the reference text below.
+normalizer = EnglishTextNormalizer()
+# Use CUDA when torch reports it as available, otherwise fall back to CPU.
+device = "cuda" if torch.cuda.is_available() else "cpu"
+# The Whisper model is loaded once, at import time, and reused by `asr`.
+whisper_model = whisper.load_model("large-v3-turbo", device=device)
+
+def asr(wav_path):
+    result = whisper_model.transcribe(wav_path)
+    pred = result['text'].strip()
+    pred = normalizer(pred)
+    return pred
+
+# Example usage: compare the transcription of one audio file with its reference text.
+if __name__ == '__main__':
+    gt_text="Hey, how are you doing today? I like it."
+    # Placeholder: replace with the path of the audio file to evaluate.
+    wav_path="your-audio"
+    # Normalize the reference the same way `asr` normalizes the prediction.
+    gt_text = normalizer(gt_text.strip())
+    pred_asr = asr(wav_path)
+    # Word and character error rates, rounded to three decimals.
+    wer = round(calculate_wer(gt_text, pred_asr), 3)
+    cer = round(calculate_cer(gt_text, pred_asr), 3)
+    print(wer, cer)
\ No newline at end of file
diff --git a/capspeech/eval/base_eval.py b/capspeech/eval/base_eval.py
new file mode 100644
index 0000000000000000000000000000000000000000..440b430c66f8e5b60d2b8cf1ca332b715bc7c542
--- /dev/null
+++ b/capspeech/eval/base_eval.py
@@ -0,0 +1,32 @@
+from pitch import pitch_apply
+from speed import speed_apply
+from age_gender import age_gender_apply
+import librosa
+import json
+import bisect
+
+SPEAKER_RATE_BINS = ["very slowly", "slowly", "slightly slowly", "moderate speed", "slightly fast", "fast", "very fast"]
+UTTERANCE_LEVEL_STD = ["very monotone", "monotone", "slightly expressive and animated", "expressive and animated", "very expressive and animated"]
+SPEAKER_LEVEL_PITCH_BINS = ["very low-pitch", "low-pitch", "slightly low-pitch", "moderate pitch", "slightly high-pitch", "high-pitch", "very high-pitch"]
+with open("bin.json") as json_file:
+    text_bins_dict = json.load(json_file)
+
+audiopath = "YOUR_AUDIO_PATH"
+waveform, _ = librosa.load(audiopath, sr=16000)
+age, gender = age_gender_apply(waveform)
+pitch_mean, pitch_std = pitch_apply(waveform)
+if gender == "male":
+    index = bisect.bisect_right(text_bins_dict["pitch_bins_male"], pitch_mean) - 1
+    pitch = SPEAKER_LEVEL_PITCH_BINS[index]
+else:
+    index = bisect.bisect_right(text_bins_dict["pitch_bins_female"], pitch_mean) - 1
+    pitch = SPEAKER_LEVEL_PITCH_BINS[index]
+
+index = bisect.bisect_right(text_bins_dict["speech_monotony"], pitch_std) - 1
+monotony = UTTERANCE_LEVEL_STD[index]
+speech_duration = speed_apply(waveform)
+
+index = bisect.bisect_right(text_bins_dict["speaking_rate"], speech_duration) - 1
+speed = SPEAKER_RATE_BINS[index]
+
+print(pitch, monotony, speed, age, gender)
diff --git a/capspeech/eval/bin.json b/capspeech/eval/bin.json
new file mode 100644
index 0000000000000000000000000000000000000000..32117a62a96a80a79e0e77753e2836b7f1ac6f1e
--- /dev/null
+++ b/capspeech/eval/bin.json
@@ -0,0 +1,10 @@
+{
+    "speaking_rate": [0.0, 3.8258038258038254, 7.651607651607651, 11.477411477411476, 15.303215303215302, 19.129019129019127, 22.95482295482295, 26.78062678062678], 
+    "noise": [17.12751579284668, 25.4012325831822, 33.67494937351772, 41.94866616385323, 50.22238295418875, 58.49609974452427, 66.76981653485979, 75.04353332519531], 
+    "reverberation": [10, 35, 45, 55, 59, 60], 
+    "speech_monotony": [0.0, 20.37920924595424, 40.75841849190848, 70, 90, 142.6544647216797], 
+    "pitch_bins_male": [64.6531982421875, 81.66683959960938, 98.68048095703125, 115.69412231445312, 132.707763671875, 149.72140502929688, 166.73504638671875, 183.74868774414062], 
+    "pitch_bins_female": [120.17855072021484, 141.6242690945264, 163.06998746883795, 184.51570584314953, 205.96142421746106, 227.40714259177264, 248.8528609660842, 270.29857934039575], 
+    "si-sdr": [-17.804332733154297, -0.40644073486328125, 10, 20, 25, 28, 34.38934326171875], 
+    "pesq": [1, 1.7, 2.4, 3.1, 3.6, 4, 4.499948978424072]
+}
\ No newline at end of file
diff --git a/capspeech/eval/pitch.py b/capspeech/eval/pitch.py
new file mode 100644
index 0000000000000000000000000000000000000000..1162eed47683b23c8a300cb854a54ce0116349d4
--- /dev/null
+++ b/capspeech/eval/pitch.py
@@ -0,0 +1,30 @@
+import torch 
+import penn
+
+def pitch_apply(waveform):
+    hopsize = .01
+    fmin = 30.
+    fmax = 1000.
+    checkpoint = None
+    center = 'half-hop'
+    interp_unvoiced_at = .065
+    sampling_rate = 16000
+    penn_batch_size = 4096
+    waveform = torch.Tensor(waveform).unsqueeze(0)
+    pitch, periodicity = penn.from_audio(
+        waveform.float(),
+        sampling_rate,
+        hopsize=hopsize,
+        fmin=fmin,
+        fmax=fmax,
+        checkpoint=checkpoint,
+        batch_size=penn_batch_size,
+        center=center,
+        interp_unvoiced_at=interp_unvoiced_at,
+        gpu=None
+        )     
+    
+    pitch_mean = pitch.mean().cpu().numpy()
+    pitch_std = pitch.std().cpu().numpy()
+
+    return pitch_mean, pitch_std
\ No newline at end of file
diff --git a/capspeech/eval/requirements.txt b/capspeech/eval/requirements.txt
new file mode 100644
index 0000000000000000000000000000000000000000..d0008f33a297e41db6d43f3d7340e0a36bdcf86a
--- /dev/null
+++ b/capspeech/eval/requirements.txt
@@ -0,0 +1,16 @@
+datasets[audio]
+https://github.com/marianne-m/brouhaha-vad/archive/main.zip
+penn
+g2p
+demucs
+transformers
+bitsandbytes
+git+https://github.com/sarulab-speech/UTMOSv2.git
+-U openai-whisper
+jiwer
+numpy==1.26.4
+audeer
+audonnx
+laion_clap
+numpy==1.26.4
+onnxruntime
\ No newline at end of file
diff --git a/capspeech/eval/speed.py b/capspeech/eval/speed.py
new file mode 100644
index 0000000000000000000000000000000000000000..c4b31b5996d819775b436fcbd7353fc03fa196c9
--- /dev/null
+++ b/capspeech/eval/speed.py
@@ -0,0 +1,29 @@
+from pyannote.audio import Model
+from pathlib import Path
+from brouhaha.pipeline import RegressiveActivityDetectionPipeline
+import torch 
+from huggingface_hub import hf_hub_download
+import numpy as np
+
+def speed_apply(waveform):
+    ratio = 16000/270
+    sampling_rate = 16000
+    device = "cpu"
+    waveform = torch.Tensor(waveform).unsqueeze(0)
+    model = Model.from_pretrained(
+            Path(hf_hub_download(repo_id="ylacombe/brouhaha-best", filename="best.ckpt")),
+            strict=False,
+        )
+    model.to(device)
+
+    pipeline = RegressiveActivityDetectionPipeline(segmentation=model, batch_size=1)
+    pipeline.to(torch.device(device))
+
+    device = pipeline._models["segmentation"].device
+
+    res = pipeline({"sample_rate": sampling_rate,
+                    "waveform": waveform.to(device).float()})
+
+    speech_duration = sum(map(lambda x: x[0].duration, res["annotation"].itertracks()))     
+        
+    return speech_duration
\ No newline at end of file
diff --git a/capspeech/eval/src/__init__.py b/capspeech/eval/src/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391
diff --git a/capspeech/eval/src/example/__init__.py b/capspeech/eval/src/example/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391
diff --git a/capspeech/eval/src/example/categorized_emotion.py b/capspeech/eval/src/example/categorized_emotion.py
new file mode 100644
index 0000000000000000000000000000000000000000..1cf8b475f10d93da220c3c246f2d1ce86533fad2
--- /dev/null
+++ b/capspeech/eval/src/example/categorized_emotion.py
@@ -0,0 +1,92 @@
+import torch
+import logging
+import sys, os, pdb
+import torch.nn.functional as F
+
+from pathlib import Path
+
+sys.path.append(os.path.join(str(Path(os.path.realpath(__file__)).parents[1])))
+sys.path.append(os.path.join(str(Path(os.path.realpath(__file__)).parents[1]), 'model', 'emotion'))
+
+from wavlm_emotion import WavLMWrapper
+from whisper_emotion import WhisperWrapper
+
+
+# Console logging setup: timestamped messages at INFO level and above.
+import logging
+logging.basicConfig(
+    format='%(asctime)s %(levelname)-3s ==> %(message)s', 
+    level=logging.INFO, 
+    datefmt='%Y-%m-%d %H:%M:%S'
+)
+# Ask MKL / NumExpr / OpenMP for one thread each. NOTE(review): torch is already imported above; confirm these still take effect.
+os.environ["MKL_NUM_THREADS"] = "1" 
+os.environ["NUMEXPR_NUM_THREADS"] = "1" 
+os.environ["OMP_NUM_THREADS"] = "1" 
+
+
+if __name__ == '__main__':
+    # Smoke test: run the WavLM + Whisper emotion ensemble on one second of silence.
+    label_list = [
+        'Anger',
+        'Contempt',
+        'Disgust',
+        'Fear',
+        'Happiness',
+        'Neutral',
+        'Sadness',
+        'Surprise',
+        'Other'
+    ]
+
+    # Find device
+    device = torch.device("cuda") if torch.cuda.is_available() else "cpu"
+    if torch.cuda.is_available(): print('GPU available, use GPU')
+
+    # Define the model wrappers.
+    # Note that the ensemble yields better performance than either single model.
+    # model_path is the directory holding the checkpoint files loaded below.
+    model_path = "model"
+    wavlm_model = WavLMWrapper(
+        pretrain_model="wavlm_large",
+        finetune_method="finetune",
+        output_class_num=9,
+        freeze_params=True,
+        use_conv_output=True,
+        detailed_class_num=17
+    ).to(device)
+
+    whisper_model = WhisperWrapper(
+        pretrain_model="whisper_large",
+        finetune_method="lora",
+        lora_rank=16,
+        output_class_num=9,
+        freeze_params=True,
+        use_conv_output=True,
+        detailed_class_num=17
+    ).to(device)
+
+    # Every checkpoint is loaded tensors-only (weights_only=True) and remapped onto `device`.
+    whisper_model.load_state_dict(torch.load(os.path.join(model_path, "whisper_emotion.pt"), map_location=device, weights_only=True), strict=False)
+    whisper_model.load_state_dict(torch.load(os.path.join(model_path, "whisper_emotion_lora.pt"), map_location=device, weights_only=True), strict=False)
+    wavlm_model.load_state_dict(torch.load(os.path.join(model_path, "wavlm_emotion.pt"), map_location=device, weights_only=True), strict=False)
+
+    wavlm_model.eval()
+    whisper_model.eval()
+
+    # Audio must be 16k Hz
+    data = torch.zeros([1, 16000]).to(device)
+    with torch.no_grad():  # inference only, no autograd graph needed
+        whisper_logits, whisper_embedding, _, _, _, _ = whisper_model(data, return_feature=True)
+        wavlm_logits, wavlm_embedding, _, _, _, _ = wavlm_model(data, return_feature=True)
+
+    # Average the two models' logits, then softmax over the class dimension.
+    ensemble_logits = (whisper_logits + wavlm_logits) / 2
+    ensemble_prob   = F.softmax(ensemble_logits, dim=1)
+
+    print(ensemble_prob.shape)
+    print(whisper_embedding.shape)
+    print(wavlm_embedding.shape)
+    # The batch holds a single clip, so report the top label of row 0.
+    print(label_list[ensemble_prob[0].argmax().item()])
+
diff --git a/capspeech/eval/src/example/dialect_world_dialect.py b/capspeech/eval/src/example/dialect_world_dialect.py
new file mode 100644
index 0000000000000000000000000000000000000000..a5371ee5a6f42e6bc8acf3c75a263bbd9d1d4365
--- /dev/null
+++ b/capspeech/eval/src/example/dialect_world_dialect.py
@@ -0,0 +1,87 @@
+import torch
+import sys, os, pdb
+import argparse, logging
+import torch.nn.functional as F
+
+from pathlib import Path
+
+
+sys.path.append(os.path.join(str(Path(os.path.realpath(__file__)).parents[1])))
+sys.path.append(os.path.join(str(Path(os.path.realpath(__file__)).parents[1]), 'model', 'dialect'))
+
+from wavlm_dialect import WavLMWrapper
+from whisper_dialect import WhisperWrapper
+
+
+# Console logging setup: timestamped messages at INFO level and above.
+import logging
+logging.basicConfig(
+    format='%(asctime)s %(levelname)-3s ==> %(message)s', 
+    level=logging.INFO, 
+    datefmt='%Y-%m-%d %H:%M:%S'
+)
+# Ask MKL / NumExpr / OpenMP for one thread each. NOTE(review): torch is already imported above; confirm these still take effect.
+os.environ["MKL_NUM_THREADS"] = "1" 
+os.environ["NUMEXPR_NUM_THREADS"] = "1" 
+os.environ["OMP_NUM_THREADS"] = "1" 
+
+
+if __name__ == '__main__':
+    # Smoke test: run the WavLM + Whisper dialect ensemble on one second of silence.
+    label_list = [
+        'East Asia', 'English', 'Germanic', 'Irish',
+        'North America', 'Northern Irish', 'Oceania',
+        'Other', 'Romance', 'Scottish', 'Semitic', 'Slavic',
+        'South African', 'Southeast Asia', 'South Asia', 'Welsh'
+    ]
+
+    # Find device
+    device = torch.device("cuda") if torch.cuda.is_available() else "cpu"
+    if torch.cuda.is_available(): print('GPU available, use GPU')
+
+    # Define the model wrappers.
+    # Note that the ensemble yields better performance than either single model.
+    # Placeholder: point model_path at the directory holding the checkpoint files loaded below.
+    model_path = "YOUR_PATH"
+    wavlm_model = WavLMWrapper(
+        pretrain_model="wavlm_large",
+        finetune_method="lora",
+        lora_rank=16,
+        output_class_num=16,
+        freeze_params=False,
+        use_conv_output=True,
+        apply_gradient_reversal=False,
+        num_dataset=3
+    ).to(device)
+
+    whisper_model = WhisperWrapper(
+        pretrain_model="whisper_large",
+        finetune_method="lora",
+        lora_rank=16,
+        output_class_num=16,
+        freeze_params=False,
+        use_conv_output=True,
+        apply_gradient_reversal=False,
+        num_dataset=11
+    ).to(device)
+
+    # Every checkpoint is loaded tensors-only (weights_only=True) and remapped onto `device`.
+    wavlm_model.load_state_dict(torch.load(os.path.join(model_path, "wavlm_world_dialect.pt"), map_location=device, weights_only=True), strict=False)
+    wavlm_model.load_state_dict(torch.load(os.path.join(model_path, "wavlm_world_dialect_lora.pt"), map_location=device, weights_only=True), strict=False)
+    whisper_model.load_state_dict(torch.load(os.path.join(model_path, "whisper_world_dialect.pt"), map_location=device, weights_only=True), strict=False)
+    whisper_model.load_state_dict(torch.load(os.path.join(model_path, "whisper_world_dialect_lora.pt"), map_location=device, weights_only=True), strict=False)
+
+    wavlm_model.eval()
+    whisper_model.eval()
+
+    # Dummy input: 16000 samples of silence, i.e. one second at the 16 kHz the wrappers request.
+    data = torch.zeros([1, 16000]).to(device)
+    with torch.no_grad():  # inference only, no autograd graph needed
+        wavlm_logits, wavlm_embeddings = wavlm_model(data, return_feature=True)
+        whisper_logits, whisper_embeddings = whisper_model(data, return_feature=True)
+
+    ensemble_logits = (wavlm_logits + whisper_logits) / 2
+    ensemble_prob   = F.softmax(ensemble_logits, dim=1)
+    pred = label_list[ensemble_prob[0].argmax(-1).item()]  # single-clip batch: top label of row 0
+    print(pred)
+
diff --git a/capspeech/eval/src/model/__init__.py b/capspeech/eval/src/model/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391
diff --git a/capspeech/eval/src/model/adapter.py b/capspeech/eval/src/model/adapter.py
new file mode 100644
index 0000000000000000000000000000000000000000..86e8e9e5312d59d648fc917fd513fe047dd00e87
--- /dev/null
+++ b/capspeech/eval/src/model/adapter.py
@@ -0,0 +1,73 @@
+# --------------------------------------------------------
+# References:
+# https://github.com/jxhe/unify-parameter-efficient-tuning
+# --------------------------------------------------------
+
+import math
+import torch
+import torch.nn as nn
+
+
+class Adapter(nn.Module):
+    """Bottleneck adapter: down-projection, ReLU, dropout, up-projection and optional residual.
+
+    ``d_model`` and ``bottleneck`` fall back to ``config.d_model`` and
+    ``config.attn_bn`` when they are ``None``. ``adapter_scalar`` is either
+    ``"learnable_scalar"`` (a trainable one-element parameter) or a value that
+    ``float()`` accepts. ``adapter_layernorm_option`` of ``"in"`` / ``"out"``
+    applies a LayerNorm to the adapter input / to the scaled adapter output.
+    """
+
+    def __init__(
+        self,
+        config=None,
+        d_model=768,
+        bottleneck=None,
+        dropout=0.0,
+        init_option="lora",
+        adapter_scalar="1.0",
+        adapter_layernorm_option="none"
+    ):
+        super().__init__()
+        self.n_embd = d_model if d_model is not None else config.d_model
+        self.down_size = bottleneck if bottleneck is not None else config.attn_bn
+        self.adapter_layernorm_option = adapter_layernorm_option
+        self.dropout = dropout
+
+        # A single LayerNorm instance serves both the "in" and the "out" placement.
+        wants_norm = adapter_layernorm_option in ("in", "out")
+        self.adapter_layer_norm_before = nn.LayerNorm(self.n_embd) if wants_norm else None
+
+        if adapter_scalar == "learnable_scalar":
+            self.scale = nn.Parameter(torch.ones(1))
+        else:
+            self.scale = float(adapter_scalar)
+
+        self.down_proj = nn.Linear(self.n_embd, self.down_size)
+        self.non_linear_func = nn.ReLU()
+        self.up_proj = nn.Linear(self.down_size, self.n_embd)
+
+        if init_option == "bert":
+            raise NotImplementedError
+        if init_option == "lora":
+            # Random down-projection weight; zeroed up-projection weight and both biases.
+            with torch.no_grad():
+                nn.init.kaiming_uniform_(self.down_proj.weight, a=math.sqrt(5))
+                for tensor in (self.up_proj.weight, self.down_proj.bias, self.up_proj.bias):
+                    nn.init.zeros_(tensor)
+
+    def forward(self, x, add_residual=True, residual=None):
+        """Return the scaled adapter output for ``x``; add ``residual`` (default ``x``) if ``add_residual``."""
+        if residual is None:
+            residual = x
+        if self.adapter_layernorm_option == 'in':
+            x = self.adapter_layer_norm_before(x)
+
+        hidden = self.non_linear_func(self.down_proj(x))
+        hidden = nn.functional.dropout(hidden, p=self.dropout, training=self.training)
+        up = self.up_proj(hidden) * self.scale
+
+        if self.adapter_layernorm_option == 'out':
+            up = self.adapter_layer_norm_before(up)
+
+        return up + residual if add_residual else up
\ No newline at end of file
diff --git a/capspeech/eval/src/model/dialect/__init__.py b/capspeech/eval/src/model/dialect/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391
diff --git a/capspeech/eval/src/model/dialect/wavlm_dialect.py b/capspeech/eval/src/model/dialect/wavlm_dialect.py
new file mode 100644
index 0000000000000000000000000000000000000000..1d8a1d6a5c44b4e5791f9197cde1ad6dec50fba6
--- /dev/null
+++ b/capspeech/eval/src/model/dialect/wavlm_dialect.py
@@ -0,0 +1,300 @@
+import os
+import pdb
+import copy
+import torch
+import argparse
+import loralib as lora
+import transformers.models.wavlm.modeling_wavlm as wavlm
+from speechbrain.nnet.normalization import LayerNorm
+from speechbrain.lobes.models.huggingface_transformers.huggingface import make_padding_masks
+
+from torch import nn
+from torch.nn import functional as F
+from transformers import Wav2Vec2FeatureExtractor
+from transformers import WavLMModel
+
+import sys
+from pathlib import Path
+sys.path.append(os.path.join(str(Path(os.path.realpath(__file__)).parents[1])))
+from revgrad import RevGrad
+
+class WavLMEncoderLayer(nn.Module):
+    """WavLM encoder layer that applies its layer norms after each residual sum.
+
+    When ``config.finetune_method`` is "lora" or "combined", layers whose index
+    exceeds ``config.num_hidden_layers // 2`` use ``lora.Linear`` feed-forward projections.
+    """
+
+    def __init__(self, layer_idx, config, has_relative_position_bias: bool = True):
+        super().__init__()
+        self.attention = wavlm.WavLMAttention(
+            embed_dim=config.hidden_size,
+            num_heads=config.num_attention_heads,
+            dropout=config.attention_dropout,
+            num_buckets=config.num_buckets,
+            max_distance=config.max_bucket_distance,
+            has_relative_position_bias=has_relative_position_bias,
+        )
+        self.dropout = nn.Dropout(config.hidden_dropout)
+        self.layer_norm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps)
+        self.feed_forward = wavlm.WavLMFeedForward(config)
+        self.final_layer_norm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps)
+        self.config = config
+
+        in_upper_half = layer_idx > config.num_hidden_layers // 2
+        if in_upper_half and self.config.finetune_method in ("lora", "combined"):
+            width, inner, rank = config.hidden_size, config.intermediate_size, config.lora_rank
+            self.feed_forward.intermediate_dense = lora.Linear(width, inner, r=rank)
+            self.feed_forward.output_dense = lora.Linear(inner, width, r=rank)
+
+    def forward(self, hidden_states, attention_mask=None, position_bias=None, output_attentions=False, index=0):
+        """Return ``(hidden_states, position_bias)``, plus the attention weights if ``output_attentions``."""
+        attn_output, attn_weights, position_bias = self.attention(
+            hidden_states,
+            attention_mask=attention_mask,
+            position_bias=position_bias,
+            output_attentions=output_attentions,
+            index=index,
+        )
+        # Residual sums first, layer norms second (attention block, then feed-forward block).
+        hidden_states = self.layer_norm(hidden_states + self.dropout(attn_output))
+        hidden_states = self.final_layer_norm(hidden_states + self.feed_forward(hidden_states))
+        if output_attentions:
+            return (hidden_states, position_bias, attn_weights)
+        return (hidden_states, position_bias)
+
+
+class WavLMEncoderLayerStableLayerNorm(nn.Module):
+    """WavLM encoder layer that applies its layer norms before attention and before the feed-forward.
+
+    When ``config.finetune_method`` is "lora" or "combined", layers whose index
+    exceeds ``config.num_hidden_layers // 2`` use ``lora.Linear`` feed-forward projections.
+    """
+
+    def __init__(self, layer_idx, config, has_relative_position_bias: bool = True):
+        super().__init__()
+        self.attention = wavlm.WavLMAttention(
+            embed_dim=config.hidden_size,
+            num_heads=config.num_attention_heads,
+            dropout=config.attention_dropout,
+            num_buckets=config.num_buckets,
+            max_distance=config.max_bucket_distance,
+            has_relative_position_bias=has_relative_position_bias,
+        )
+        self.dropout = nn.Dropout(config.hidden_dropout)
+        self.layer_norm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps)
+        self.feed_forward = wavlm.WavLMFeedForward(config)
+        self.final_layer_norm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps)
+        self.config = config
+
+        in_upper_half = layer_idx > config.num_hidden_layers // 2
+        if in_upper_half and self.config.finetune_method in ("lora", "combined"):
+            width, inner, rank = config.hidden_size, config.intermediate_size, config.lora_rank
+            self.feed_forward.intermediate_dense = lora.Linear(width, inner, r=rank)
+            self.feed_forward.output_dense = lora.Linear(inner, width, r=rank)
+
+    def forward(self, hidden_states, attention_mask=None, position_bias=None, output_attentions=False):
+        """Return ``(hidden_states, position_bias)``, plus the attention weights if ``output_attentions``."""
+        normed = self.layer_norm(hidden_states)
+        attn_output, attn_weights, position_bias = self.attention(
+            normed,
+            attention_mask=attention_mask,
+            position_bias=position_bias,
+            output_attentions=output_attentions,
+        )
+        hidden_states = hidden_states + self.dropout(attn_output)
+        hidden_states = hidden_states + self.feed_forward(self.final_layer_norm(hidden_states))
+        outputs = (hidden_states, position_bias)
+        return outputs + (attn_weights,) if output_attentions else outputs
+
+   
+class WavLMWrapper(nn.Module):
+    """WavLM backbone, learned weighted sum over its hidden states, point-wise convs and a classifier head."""
+
+    def __init__(
+        self,
+        pretrain_model="wavlm_large",
+        hidden_dim=256,
+        finetune_method="lora",
+        lora_rank=16,
+        freeze_params=True,
+        output_class_num=4,
+        use_conv_output=True,
+        apply_gradient_reversal=False,
+        num_dataset=4
+    ):
+        super().__init__()
+        # 1. We Load the model first with weights
+        if pretrain_model == "wavlm":
+            self.backbone_model = WavLMModel.from_pretrained(
+                "microsoft/wavlm-base-plus",
+                output_hidden_states=True,
+            )
+        elif pretrain_model == "wavlm_large":
+            self.processor = Wav2Vec2FeatureExtractor.from_pretrained('microsoft/wavlm-large')
+            self.backbone_model = WavLMModel.from_pretrained(
+                "microsoft/wavlm-large",
+                output_hidden_states=True,
+            )
+        self.pretrain_model             = pretrain_model
+        self.finetune_method            = finetune_method
+        self.apply_gradient_reversal    = apply_gradient_reversal
+        self.use_conv_output            = use_conv_output
+
+        state_dict = self.backbone_model.state_dict()
+        # 2. Read the model config
+        self.model_config = self.backbone_model.config
+        self.model_config.finetune_method        = finetune_method
+        self.model_config.lora_rank              = lora_rank
+
+        # 3. Swap in the encoder layers defined in this file (LoRA-capable when configured)
+        if self.pretrain_model == "wavlm":
+            self.backbone_model.encoder.layers = nn.ModuleList(
+                [WavLMEncoderLayer(i, self.model_config, has_relative_position_bias=(i == 0)) for i in range(self.model_config.num_hidden_layers)]
+            )
+        elif self.pretrain_model == "wavlm_large":
+            self.backbone_model.encoder.layers = nn.ModuleList(
+                [WavLMEncoderLayerStableLayerNorm(i, self.model_config, has_relative_position_bias=(i == 0)) for i in range(self.model_config.num_hidden_layers)]
+            )
+        # 4. Load the weights back
+        msg = self.backbone_model.load_state_dict(state_dict, strict=False)
+
+        # 5. Freeze the weights
+        self.freeze_params = freeze_params
+        if self.freeze_params and self.finetune_method != "lora":
+            for _, p in self.backbone_model.named_parameters(): p.requires_grad = False
+        elif self.freeze_params and self.finetune_method == "lora":
+            # Train only parameters absent from the pretrained state dict (those added by the swapped-in layers).
+            for name, p in self.backbone_model.named_parameters():
+                p.requires_grad = name in msg.missing_keys
+        else:
+            for _, p in self.backbone_model.named_parameters(): p.requires_grad = True
+
+        # 6. Downstream models
+        self.model_seq = nn.Sequential(
+            nn.Conv1d(self.model_config.hidden_size, hidden_dim, 1, padding=0),
+            nn.ReLU(),
+            nn.Dropout(p=0.1),
+            nn.Conv1d(hidden_dim, hidden_dim, 1, padding=0),
+            nn.ReLU(),
+            nn.Dropout(p=0.1),
+            nn.Conv1d(hidden_dim, hidden_dim, 1, padding=0)
+        )
+
+        if self.use_conv_output:
+            num_layers = self.model_config.num_hidden_layers + 1  # transformer layers + input embeddings
+            self.weights = nn.Parameter(torch.ones(num_layers)/num_layers)
+        else:
+            num_layers = self.model_config.num_hidden_layers
+            self.weights = nn.Parameter(torch.zeros(num_layers))
+
+        if apply_gradient_reversal:
+            self.dataset_layer = nn.Sequential(
+                RevGrad(),
+                nn.Linear(hidden_dim, hidden_dim),
+                nn.ReLU(),
+                nn.Linear(hidden_dim, num_dataset),
+            )
+
+        self.out_layer = nn.Sequential(
+            nn.Linear(hidden_dim, hidden_dim),
+            nn.ReLU(),
+            nn.Linear(hidden_dim, output_class_num),
+        )
+
+    def forward(self, x, length=None, return_feature=False):
+        """Classify the batch ``x``; optional ``length`` (presumably per-item sample counts) limits masking and pooling.
+
+        Returns ``predicted``, followed by ``dataset_predicted`` when gradient reversal
+        is enabled and by the pooled ``features`` when ``return_feature`` is set.
+        """
+        # 1. feature extraction and projections
+        if self.pretrain_model == "wavlm_large":
+            with torch.no_grad():
+                if length is not None: attention_mask = make_padding_masks(x, wav_len=length/length.max()).to(x.device)
+                else: attention_mask = make_padding_masks(x, wav_len=torch.tensor([1]).to(x.device)).to(x.device)
+                signal = []
+                for wav in x:
+                    processed = self.processor(wav, sampling_rate=16_000, return_tensors="pt", padding=True)
+                    signal.append(processed["input_values"][0].to(x.device))
+                signal = torch.stack(signal)
+
+        # 2. get length and mask
+        if length is not None:
+            # Keep the frame counts on the input's device instead of assuming CUDA is available.
+            length = self.get_feat_extract_output_lengths(length.detach().cpu()).to(x.device)
+
+        if self.pretrain_model == "wavlm":
+            x = self.backbone_model(x, output_hidden_states=True).hidden_states
+        else:
+            x = self.backbone_model(
+                signal, attention_mask=attention_mask, output_hidden_states=True
+            ).hidden_states
+
+        # 4. stacked feature
+        if self.use_conv_output: stacked_feature = torch.stack(x, dim=0)
+        else: stacked_feature = torch.stack(x, dim=0)[1:]
+
+        # 5. Weighted sum
+        _, *origin_shape = stacked_feature.shape
+        # Flatten everything but the layer axis: [num_enc_layers, B, T, D] -> [num_enc_layers, B*T*D]
+        num_rows = self.backbone_model.config.num_hidden_layers + (1 if self.use_conv_output else 0)
+        stacked_feature = stacked_feature.view(num_rows, -1)
+        norm_weights = F.softmax(self.weights, dim=-1)
+
+        # Perform weighted average
+        weighted_feature = (norm_weights.unsqueeze(-1) * stacked_feature).sum(dim=0)
+        features = weighted_feature.view(*origin_shape)
+
+        # 6. Pass the weighted average to point-wise 1D Conv
+        # B x T x D
+        features = features.transpose(1, 2)
+        features = self.model_seq(features)
+        features = features.transpose(1, 2)
+
+        # 7. Pooling
+        if length is not None:
+            # Avoiding padded time steps
+            features = torch.stack([
+                torch.mean(features[snt_id, 0:length[snt_id], ...], dim=0)
+                for snt_id in range(features.shape[0])
+            ])
+        else:
+            features = torch.mean(features, dim=1)
+
+        # 8. Output predictions
+        # B x D
+        predicted = self.out_layer(features)
+        if self.apply_gradient_reversal:
+            dataset_predicted = self.dataset_layer(features)
+            if return_feature: return predicted, dataset_predicted, features
+            return predicted, dataset_predicted
+        if return_feature: return predicted, features
+        return predicted
+
+    # From huggingface
+    def get_feat_extract_output_lengths(self, input_length):
+        """
+        Computes the output length of the convolutional layers
+        """
+        def _conv_out_length(input_length, kernel_size, stride):
+            # 1D convolutional layer output length formula taken
+            # from https://pytorch.org/docs/stable/generated/torch.nn.Conv1d.html
+            return (input_length - kernel_size) // stride + 1
+
+        for kernel_size, stride in zip(self.backbone_model.config.conv_kernel, self.backbone_model.config.conv_stride):
+            input_length = _conv_out_length(input_length, kernel_size, stride)
+        return input_length
+
+def prepare_mask(length, shape, dtype):
+    """Return a bool mask of ``shape`` that is True up to each row's ``length`` (modified from huggingface)."""
+    marker = torch.zeros(shape, dtype=dtype)
+    row_ids = torch.arange(marker.shape[0])
+    # Flag the last valid position of every row ...
+    marker[row_ids, length.cpu() - 1] = 1
+    # ... then a right-to-left cumulative sum spreads that flag over all earlier positions.
+    spread = torch.cumsum(torch.flip(marker, [-1]), dim=-1)
+    mask = torch.flip(spread, [-1]).bool()
+    return mask
+    
+    
\ No newline at end of file
diff --git a/capspeech/eval/src/model/dialect/whisper_dialect.py b/capspeech/eval/src/model/dialect/whisper_dialect.py
new file mode 100644
index 0000000000000000000000000000000000000000..4ac9ef042629ea6a25b62afd21a082d6e060e9db
--- /dev/null
+++ b/capspeech/eval/src/model/dialect/whisper_dialect.py
@@ -0,0 +1,301 @@
+import os
+import pdb
+import copy
+import torch
+import argparse
+import numpy as np
+import loralib as lora
+import transformers.models.whisper.modeling_whisper as whisper
+
+from torch import nn
+from torch.nn import functional as F
+from transformers.activations import ACT2FN
+from transformers import WhisperModel, AutoFeatureExtractor
+
+import sys
+from pathlib import Path
+sys.path.append(os.path.join(str(Path(os.path.realpath(__file__)).parents[1])))
+from revgrad import RevGrad
+
+class WhisperEncoderLayer(nn.Module):
+    """Whisper encoder layer whose feed-forward projections become ``lora.Linear`` in the later layers.
+
+    The swap applies when ``layer_idx > config.encoder_layers // 2`` and
+    ``config.finetune_method`` is "lora" or "combined".
+    """
+
+    def __init__(self, config, layer_idx):
+        super().__init__()
+        self.embed_dim = config.d_model
+        self.self_attn = whisper.WhisperAttention(
+            embed_dim=self.embed_dim,
+            num_heads=config.encoder_attention_heads,
+            dropout=config.attention_dropout,
+        )
+        self.self_attn_layer_norm = nn.LayerNorm(self.embed_dim)
+        self.dropout = config.dropout
+        self.activation_fn = ACT2FN[config.activation_function]
+        self.activation_dropout = config.activation_dropout
+        self.fc1 = nn.Linear(self.embed_dim, config.encoder_ffn_dim)
+        self.fc2 = nn.Linear(config.encoder_ffn_dim, self.embed_dim)
+        self.final_layer_norm = nn.LayerNorm(self.embed_dim)
+        self.config = config
+
+        past_midpoint = layer_idx > config.encoder_layers // 2
+        if past_midpoint and self.config.finetune_method in ("lora", "combined"):
+            rank = config.lora_rank
+            self.fc1 = lora.Linear(self.embed_dim, config.encoder_ffn_dim, r=rank)
+            self.fc2 = lora.Linear(config.encoder_ffn_dim, self.embed_dim, r=rank)
+
+    def forward(
+        self,
+        hidden_states: torch.Tensor,
+        attention_mask: torch.Tensor,
+        layer_head_mask: torch.Tensor,
+        output_attentions: bool = False,
+    ):
+        """Run the self-attention block and the feed-forward block, each with a residual connection.
+
+        Args:
+            hidden_states: layer input tensor.
+            attention_mask: forwarded unchanged to the self-attention module.
+            layer_head_mask: forwarded unchanged to the self-attention module.
+            output_attentions: when true, the attention weights are appended to the result.
+
+        Returns:
+            ``(hidden_states,)`` or ``(hidden_states, attn_weights)``.
+        """
+        # Self-attention block: norm -> attention -> dropout -> residual add.
+        attn_output, attn_weights, _ = self.self_attn(
+            hidden_states=self.self_attn_layer_norm(hidden_states),
+            attention_mask=attention_mask,
+            layer_head_mask=layer_head_mask,
+            output_attentions=output_attentions,
+        )
+        attn_output = nn.functional.dropout(attn_output, p=self.dropout, training=self.training)
+        hidden_states = hidden_states + attn_output
+
+        # Feed-forward block: norm -> fc1 -> activation -> dropout -> fc2 -> dropout -> residual add.
+        ffn = self.activation_fn(self.fc1(self.final_layer_norm(hidden_states)))
+        ffn = nn.functional.dropout(ffn, p=self.activation_dropout, training=self.training)
+        ffn = nn.functional.dropout(self.fc2(ffn), p=self.dropout, training=self.training)
+        hidden_states = hidden_states + ffn
+
+        # fp16 only: if inf/nan appeared, clamp to just inside the representable range.
+        if hidden_states.dtype == torch.float16:
+            if torch.isinf(hidden_states).any() or torch.isnan(hidden_states).any():
+                limit = torch.finfo(hidden_states.dtype).max - 1000
+                hidden_states = torch.clamp(hidden_states, min=-limit, max=limit)
+
+        return (hidden_states, attn_weights) if output_attentions else (hidden_states,)
+   
+class WhisperWrapper(nn.Module):
+    def __init__(
+        self, 
+        pretrain_model="whisper_large",
+        output_class_num=4, 
+        hidden_dim=256, 
+        finetune_method="lora",
+        lora_rank=16,
+        freeze_params=True,
+        use_conv_output=True,
+        apply_gradient_reversal=False, 
+        num_dataset=4
+    ):
+        super(WhisperWrapper, self).__init__()
+        # 1. We Load the model first with weights
+        self.feature_extractor = AutoFeatureExtractor.from_pretrained("openai/whisper-tiny", chunk_length=15)
+        self.pretrain_model = pretrain_model
+        if self.pretrain_model == "whisper_tiny":
+            self.backbone_model = WhisperModel.from_pretrained(
+                "openai/whisper-tiny",
+                output_hidden_states=True,
+                ignore_mismatched_sizes=True,
+                max_source_positions=750,
+            )
+        elif self.pretrain_model == "whisper_base":
+            self.backbone_model = WhisperModel.from_pretrained(
+                "openai/whisper-base",
+                output_hidden_states=True,
+                ignore_mismatched_sizes=True,
+                max_source_positions=750,
+            )
+        elif self.pretrain_model == "whisper_small":
+            self.backbone_model = WhisperModel.from_pretrained(
+                "openai/whisper-small",
+                output_hidden_states=True,
+                max_source_positions=750,
+                ignore_mismatched_sizes=True
+            )
+        elif self.pretrain_model == "whisper_medium":
+            self.backbone_model = WhisperModel.from_pretrained(
+                "openai/whisper-medium",
+                output_hidden_states=True,
+                ignore_mismatched_sizes=True
+            )
+        elif self.pretrain_model == "whisper_large":
+            self.feature_extractor = AutoFeatureExtractor.from_pretrained("openai/whisper-large-v3", chunk_length=15)
+            self.backbone_model = WhisperModel.from_pretrained(
+                "openai/whisper-large-v3",
+                output_hidden_states=True,
+                ignore_mismatched_sizes=True,
+                max_source_positions=750,
+            )
+        self.embed_positions = copy.deepcopy(self.backbone_model.encoder.embed_positions.weight)
+        self.embed_positions.requires_grad = False
+
+        state_dict = self.backbone_model.state_dict()
+        # 2. Read the model config
+        self.model_config = self.backbone_model.config
+        self.model_config.finetune_method       = finetune_method
+        self.model_config.lora_rank             = lora_rank
+        self.finetune_method                    = finetune_method
+        self.apply_gradient_reversal            = apply_gradient_reversal
+        self.use_conv_output                    = use_conv_output
+        
+        if self.finetune_method == "lora":
+            # 3. Config encoder layers with adapter or embedding prompt
+            self.backbone_model.encoder.layers = nn.ModuleList(
+                [WhisperEncoderLayer(self.model_config, layer_idx) for layer_idx in range(self.model_config.encoder_layers)]
+            )
+            # 4. Load the weights back
+            msg = self.backbone_model.load_state_dict(state_dict, strict=False)
+        
+        # 2. Freeze the weights
+        self.freeze_params = freeze_params
+        if self.freeze_params and self.finetune_method != "lora":
+            for _, p in self.backbone_model.named_parameters(): p.requires_grad = False
+        elif self.freeze_params and self.finetune_method == "lora":
+            for name, p in self.backbone_model.named_parameters():
+                if name in msg.missing_keys: p.requires_grad = True
+                else: p.requires_grad = False
+        else:
+            for name, p in self.backbone_model.named_parameters(): 
+                if "decoder" not in name and "conv1" not in name and "conv2" not in name and "embed_positions" not in name: p.requires_grad = True
+                else: p.requires_grad = False
+        
+        # 6. Downstream models
+        self.model_seq = nn.Sequential(
+            nn.Conv1d(self.model_config.hidden_size, hidden_dim, 1, padding=0),
+            nn.ReLU(),
+            nn.Dropout(p=0.1),
+            nn.Conv1d(hidden_dim, hidden_dim, 1, padding=0),
+            nn.ReLU(),
+            nn.Dropout(p=0.1),
+            nn.Conv1d(hidden_dim, hidden_dim, 1, padding=0)
+        )
+
+        if use_conv_output:
+            num_layers = self.model_config.num_hidden_layers + 1  # transformer layers + input embeddings
+            self.weights = nn.Parameter(torch.ones(num_layers)/num_layers)
+        else:
+            num_layers = self.model_config.num_hidden_layers
+            self.weights = nn.Parameter(torch.zeros(num_layers))
+        
+        if apply_gradient_reversal:
+            self.dataset_layer = nn.Sequential(
+                RevGrad(),
+                nn.Linear(hidden_dim, hidden_dim),
+                nn.ReLU(),
+                nn.Linear(hidden_dim, num_dataset),
+            )
+        self.out_layer = nn.Sequential(
+            nn.Linear(hidden_dim, hidden_dim),
+            nn.ReLU(),
+            nn.Linear(hidden_dim, output_class_num),
+        )
+            
+        
+    def forward(self, x, length=None, return_feature=False):
+        """Classify a batch of 16 kHz waveforms with the Whisper encoder.
+
+        Args:
+            x: waveforms indexed as ``x[i]``; assumed to be a (batch, samples)
+                torch tensor -- TODO confirm against callers.
+            length: per-utterance sample counts, or ``None`` to process only
+                ``x[0]`` at its full length.
+            return_feature: also return the pooled utterance features.
+
+        Returns:
+            ``predicted``, followed by ``dataset_predicted`` when gradient
+            reversal is enabled, followed by ``features`` when
+            ``return_feature`` is set.
+        """
+        # 15 seconds of 16 kHz audio, handed to the feature extractor as max_length
+        max_audio_len = 15*16000
+        # Follow the backbone's device instead of assuming CUDA is available
+        device = next(self.backbone_model.parameters()).device
+
+        # 1. feature extraction and projections
+        if length is not None:
+            # Append to list for feature_extractor to work
+            waveforms = [x[idx].detach().cpu().numpy() for idx in range(len(length))]
+            length = length.detach().cpu()
+        else:
+            # No lengths given: only the first waveform is processed
+            waveforms = x[0].detach().cpu()
+            length = torch.tensor([len(x[0])])
+        features = self.feature_extractor(
+            waveforms,
+            return_tensors="pt",
+            sampling_rate=16000,
+            max_length=max_audio_len
+        )
+        features = features.input_features.to(device)
+
+        # 2. get length and mask
+        length = self._get_feat_extract_output_lengths(length)
+        # Replace positional embeddings (only the first 750 positions are kept)
+        self.backbone_model.encoder.embed_positions = self.backbone_model.encoder.embed_positions.from_pretrained(self.embed_positions[:750])
+
+        # 3. transformer encoding features
+        hidden_states = self.backbone_model.encoder(
+            features, output_hidden_states=True
+        ).hidden_states
+
+        # Only the last hidden state is used, so there is no need to stack them all
+        features = hidden_states[-1]
+
+        # 6. Pass the encoder output to point-wise 1D Conv
+        # B x T x D -> B x D x T for Conv1d, then back
+        features = features.transpose(1, 2)
+        features = self.model_seq(features)
+        features = features.transpose(1, 2)
+
+        # 7. Pooling: `length` is always set by this point, so average each
+        # utterance over its first `length[i]` frames (avoiding padded time steps)
+        features = torch.stack([
+            torch.mean(features[snt_id, 0:length[snt_id], ...], dim=0)
+            for snt_id in range(features.shape[0])
+        ])
+
+        # 8. Output predictions
+        # B x D
+        predicted = self.out_layer(features)
+        if self.apply_gradient_reversal:
+            dataset_predicted = self.dataset_layer(features)
+            if return_feature: return predicted, dataset_predicted, features
+            return predicted, dataset_predicted
+        if return_feature: return predicted, features
+        return predicted
+        
+    # From huggingface
+    def _get_feat_extract_output_lengths(self, input_lengths):
+        """Map raw sample counts to encoder frame counts.
+
+        Samples are first grouped by 160, then the count is halved (rounding up).
+        """
+        frame_count = input_lengths // 160
+        return (frame_count - 1) // 2 + 1
+
+def prepare_mask(length, shape, dtype):
+    """Return a bool mask of `shape`: row i is True up to index ``length[i] - 1`` (inclusive)."""
+    # Modified from huggingface
+    mask = torch.zeros(shape, dtype=dtype)
+    row_ids = torch.arange(mask.shape[0])
+    last_valid = length.cpu() - 1
+    # Flag the last valid slot of every row; the reversed cumulative sum
+    # then spreads that flag over all earlier positions.
+    mask[(row_ids, last_valid)] = 1
+    return mask.flip([-1]).cumsum(-1).flip([-1]).bool()
+    
\ No newline at end of file
diff --git a/capspeech/eval/src/model/emotion/__init__.py b/capspeech/eval/src/model/emotion/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391
diff --git a/capspeech/eval/src/model/emotion/wavlm_emotion.py b/capspeech/eval/src/model/emotion/wavlm_emotion.py
new file mode 100644
index 0000000000000000000000000000000000000000..b777d27fe1dfc2c5ca687b75e829418e76e6e06d
--- /dev/null
+++ b/capspeech/eval/src/model/emotion/wavlm_emotion.py
@@ -0,0 +1,315 @@
+import os
+import torch
+import loralib as lora
+import transformers.models.wavlm.modeling_wavlm as wavlm
+from speechbrain.lobes.models.huggingface_transformers.huggingface import make_padding_masks
+
+from torch import nn
+from torch.nn import functional as F
+from transformers import Wav2Vec2FeatureExtractor
+from transformers import WavLMModel
+
+import sys
+from pathlib import Path
+sys.path.append(os.path.join(str(Path(os.path.realpath(__file__)).parents[1])))
+
+class WavLMEncoderLayer(nn.Module):
+    """WavLM encoder layer that applies LayerNorm after each residual sum.
+
+    Layers with ``layer_idx > num_hidden_layers // 2`` get LoRA feed-forward
+    projections when ``config.finetune_method`` is "lora" or "combined".
+    """
+    def __init__(self, layer_idx, config, has_relative_position_bias: bool = True):
+        super().__init__()
+        self.attention = wavlm.WavLMAttention(
+            embed_dim=config.hidden_size,
+            num_heads=config.num_attention_heads,
+            dropout=config.attention_dropout,
+            num_buckets=config.num_buckets,
+            max_distance=config.max_bucket_distance,
+            has_relative_position_bias=has_relative_position_bias,
+        )
+        self.dropout = nn.Dropout(config.hidden_dropout)
+        self.layer_norm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps)
+        self.feed_forward = wavlm.WavLMFeedForward(config)
+        self.final_layer_norm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps)
+        self.config = config
+
+        if layer_idx > config.num_hidden_layers // 2 and config.finetune_method in ("lora", "combined"):
+            self.feed_forward.intermediate_dense    = lora.Linear(config.hidden_size, config.intermediate_size, r=config.lora_rank)
+            self.feed_forward.output_dense          = lora.Linear(config.intermediate_size, config.hidden_size, r=config.lora_rank)
+
+    def forward(self, hidden_states, attention_mask=None, position_bias=None, output_attentions=False, index=0):
+        """Return ``(hidden_states, position_bias)`` plus ``attn_weights`` if requested."""
+        attn_residual = hidden_states
+        hidden_states, attn_weights, position_bias = self.attention(
+            hidden_states,
+            attention_mask=attention_mask,
+            position_bias=position_bias,
+            output_attentions=output_attentions,
+            index=index,
+        )
+        hidden_states = attn_residual + self.dropout(hidden_states)
+
+        # No adapter module exists on this layer, so there is nothing extra to apply
+        # for finetune_method == "adapter" (calling one would raise AttributeError).
+        hidden_states = self.layer_norm(hidden_states)
+        hidden_states = hidden_states + self.feed_forward(hidden_states)
+        hidden_states = self.final_layer_norm(hidden_states)
+
+        if output_attentions:
+            return (hidden_states, position_bias, attn_weights)
+        return (hidden_states, position_bias)
+
+
+class WavLMEncoderLayerStableLayerNorm(nn.Module):
+    """WavLM encoder layer that applies LayerNorm before attention and feed-forward.
+
+    Layers with ``layer_idx > num_hidden_layers // 2`` get LoRA feed-forward
+    projections when ``config.finetune_method`` is "lora" or "combined".
+    """
+    def __init__(self, layer_idx, config, has_relative_position_bias: bool = True):
+        super().__init__()
+        self.attention = wavlm.WavLMAttention(
+            embed_dim=config.hidden_size,
+            num_heads=config.num_attention_heads,
+            dropout=config.attention_dropout,
+            num_buckets=config.num_buckets,
+            max_distance=config.max_bucket_distance,
+            has_relative_position_bias=has_relative_position_bias,
+        )
+        self.dropout = nn.Dropout(config.hidden_dropout)
+        self.layer_norm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps)
+        self.feed_forward = wavlm.WavLMFeedForward(config)
+        self.final_layer_norm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps)
+        self.config = config
+
+        if layer_idx > config.num_hidden_layers // 2 and self.config.finetune_method in ("lora", "combined"):
+            self.feed_forward.intermediate_dense    = lora.Linear(config.hidden_size, config.intermediate_size, r=config.lora_rank)
+            self.feed_forward.output_dense          = lora.Linear(config.intermediate_size, config.hidden_size, r=config.lora_rank)
+
+    def forward(self, hidden_states, attention_mask=None, position_bias=None, output_attentions=False):
+        """Return ``(hidden_states, position_bias)`` plus ``attn_weights`` if requested."""
+        normed = self.layer_norm(hidden_states)
+        attn_out, attn_weights, position_bias = self.attention(
+            normed,
+            attention_mask=attention_mask,
+            position_bias=position_bias,
+            output_attentions=output_attentions,
+        )
+        # Both residual sums use the un-normalised stream
+        hidden_states = hidden_states + self.dropout(attn_out)
+        hidden_states = hidden_states + self.feed_forward(self.final_layer_norm(hidden_states))
+
+        if output_attentions:
+            return (hidden_states, position_bias, attn_weights)
+        return (hidden_states, position_bias)
+
+   
+class WavLMWrapper(nn.Module):
+    def __init__(
+        self, 
+        pretrain_model="wavlm_large", 
+        hidden_dim=256,
+        finetune_method="lora",
+        lora_rank=16,
+        freeze_params=True,
+        output_class_num=4,
+        use_conv_output=True,
+        detailed_class_num=17
+    ):
+        """Load the WavLM backbone, swap in LoRA-capable layers and build the heads.
+
+        Args:
+            pretrain_model: "wavlm" (microsoft/wavlm-base-plus) or
+                "wavlm_large" (microsoft/wavlm-large).
+            hidden_dim: width of the point-wise conv stack and of every head.
+            finetune_method: stored on the backbone config; "lora" also
+                decides which backbone parameters stay trainable.
+            lora_rank: stored on the backbone config as ``lora_rank``.
+            freeze_params: freeze the backbone, except for parameters missing
+                from the pretrained state dict when ``finetune_method="lora"``.
+            output_class_num: output size of ``emotion_layer``.
+            use_conv_output: size the layer-mixing weights for
+                ``num_hidden_layers + 1`` hidden states instead of
+                ``num_hidden_layers``.
+            detailed_class_num: output size of ``detailed_out_layer``.
+
+        Raises:
+            ValueError: if ``pretrain_model`` is not one of the names above.
+        """
+        super(WavLMWrapper, self).__init__()
+        # Fail fast: an unknown name used to surface later as an AttributeError
+        if pretrain_model not in ("wavlm", "wavlm_large"):
+            raise ValueError(f"Unsupported pretrain_model: {pretrain_model!r}")
+        # 1. We Load the model first with weights
+        self.pretrain_model     = pretrain_model
+        self.finetune_method    = finetune_method
+        self.freeze_params      = freeze_params
+        self.use_conv_output    = use_conv_output
+        self.lora_rank          = lora_rank
+        if self.pretrain_model == "wavlm":
+            self.backbone_model = WavLMModel.from_pretrained("microsoft/wavlm-base-plus", output_hidden_states=True)
+        else:
+            self.processor = Wav2Vec2FeatureExtractor.from_pretrained('microsoft/wavlm-large')
+            self.backbone_model = WavLMModel.from_pretrained("microsoft/wavlm-large", output_hidden_states=True)
+        state_dict = self.backbone_model.state_dict()
+        # 2. Read the model config
+        self.model_config = self.backbone_model.config
+        self.model_config.finetune_method        = self.finetune_method
+        self.model_config.lora_rank              = self.lora_rank
+
+        # 3. Config encoder layers with adapter or embedding prompt
+        layer_cls = WavLMEncoderLayer if self.pretrain_model == "wavlm" else WavLMEncoderLayerStableLayerNorm
+        self.backbone_model.encoder.layers = nn.ModuleList(
+            [layer_cls(i, self.model_config, has_relative_position_bias=(i == 0)) for i in range(self.model_config.num_hidden_layers)]
+        )
+        # 4. Load the weights back
+        msg = self.backbone_model.load_state_dict(state_dict, strict=False)
+
+        # 5. Freeze the weights
+        new_keys = set(msg.missing_keys)
+        for name, p in self.backbone_model.named_parameters():
+            if not self.freeze_params: p.requires_grad = True
+            elif self.finetune_method == "lora": p.requires_grad = name in new_keys
+            else: p.requires_grad = False
+
+        # 6. Downstream models
+        self.model_seq = nn.Sequential(
+            nn.Conv1d(self.model_config.hidden_size, hidden_dim, 1, padding=0),
+            nn.ReLU(),
+            nn.Dropout(p=0.1),
+            nn.Conv1d(hidden_dim, hidden_dim, 1, padding=0),
+            nn.ReLU(),
+            nn.Dropout(p=0.1),
+            nn.Conv1d(hidden_dim, hidden_dim, 1, padding=0)
+        )
+
+        if self.use_conv_output:
+            num_layers = self.model_config.num_hidden_layers + 1  # transformer layers + input embeddings
+            self.weights = nn.Parameter(torch.ones(num_layers)/num_layers)
+        else:
+            num_layers = self.model_config.num_hidden_layers
+            self.weights = nn.Parameter(torch.zeros(num_layers))
+
+        # Classification heads
+        self.emotion_layer = nn.Sequential(
+            nn.Linear(hidden_dim, hidden_dim),
+            nn.ReLU(),
+            nn.Linear(hidden_dim, output_class_num),
+        )
+        self.detailed_out_layer = nn.Sequential(
+            nn.Linear(hidden_dim, hidden_dim),
+            nn.ReLU(),
+            nn.Linear(hidden_dim, detailed_class_num),
+        )
+
+        # Arousal / valence / dominance heads: one sigmoid-squashed output each
+        def _unit_interval_head():
+            return nn.Sequential(
+                nn.Linear(hidden_dim, hidden_dim),
+                nn.ReLU(),
+                nn.Linear(hidden_dim, 1),
+                nn.Sigmoid()
+            )
+        self.arousal_layer = _unit_interval_head()
+        self.valence_layer = _unit_interval_head()
+        self.dominance_layer = _unit_interval_head()
+        
+    def forward(self, x, length=None, return_feature=False):
+        """Predict emotion outputs from a batch of raw waveforms.
+
+        Args:
+            x: waveform tensor, assumed (batch, samples) at 16 kHz -- TODO confirm.
+            length: per-utterance sample counts, or ``None`` to treat all samples as valid.
+            return_feature: also return the pooled features right after ``predicted``.
+
+        Returns:
+            ``(predicted, detailed_predicted, arousal, valence, dominance)``.
+        """
+        # 1. feature extraction and projections
+        if self.pretrain_model == "wavlm_large":
+            with torch.no_grad():
+                if length is not None: attention_mask = make_padding_masks(x, wav_len=length/length.max()).to(x.device)
+                else: attention_mask = make_padding_masks(x, wav_len=torch.tensor([1]).to(x.device)).to(x.device)
+
+                # Run the processor one utterance at a time, then re-batch
+                signal = list()
+                for idx in range(len(x)):
+                    processed = self.processor(x[idx], sampling_rate=16_000, return_tensors="pt", padding=True)
+                    signal.append(processed["input_values"][0].to(x.device))
+                signal = torch.stack(signal)
+
+        # 2. get length and mask
+        if length is not None:
+            length = self.get_feat_extract_output_lengths(length.detach().cpu())
+            # Follow the input's device instead of hard-coding CUDA
+            length = length.to(x.device)
+
+        # 3. transformer encoding features
+        if self.pretrain_model == "wavlm":
+            x = self.backbone_model(x, output_hidden_states=True).hidden_states
+        else:
+            x = self.backbone_model(signal, attention_mask=attention_mask, output_hidden_states=True).hidden_states
+
+        # 4. stacked feature: [num_states, B, T, D]; the first hidden state
+        # is dropped unless use_conv_output is set
+        stacked_feature = torch.stack(x, dim=0)
+        if not self.use_conv_output: stacked_feature = stacked_feature[1:]
+
+        # 5. Weighted sum over hidden states with softmax-normalised weights
+        num_states, *origin_shape = stacked_feature.shape
+        norm_weights = F.softmax(self.weights, dim=-1)
+        weighted_feature = (norm_weights.unsqueeze(-1) * stacked_feature.view(num_states, -1)).sum(dim=0)
+        features = weighted_feature.view(*origin_shape)
+
+        # 6. Pass the weighted average to point-wise 1D Conv
+        # B x T x D -> B x D x T for Conv1d, then back
+        features = features.transpose(1, 2)
+        features = self.model_seq(features)
+        features = features.transpose(1, 2)
+
+        # 7. Pooling
+        if length is not None:
+            # Average over the valid frames only, avoiding padded time steps
+            features = torch.stack([
+                torch.mean(features[snt_id, 0:length[snt_id], ...], dim=0)
+                for snt_id in range(features.shape[0])
+            ])
+        else:
+            features = torch.mean(features, dim=1)
+
+        # Output predictions
+        # B x D
+        predicted           = self.emotion_layer(features)
+        detailed_predicted  = self.detailed_out_layer(features)
+        arousal             = self.arousal_layer(features)
+        valence             = self.valence_layer(features)
+        dominance           = self.dominance_layer(features)
+        if return_feature: return predicted, features, detailed_predicted, arousal, valence, dominance
+        return predicted, detailed_predicted, arousal, valence, dominance
+    
+    # From huggingface
+    def get_feat_extract_output_lengths(self, input_length):
+        """Return the frame count left after the backbone's convolutional stack.
+
+        Each (kernel, stride) pair from the backbone config is applied in turn
+        using the unpadded Conv1d length formula from
+        https://pytorch.org/docs/stable/generated/torch.nn.Conv1d.html
+        """
+        backbone_cfg = self.backbone_model.config
+        for kernel, stride in zip(backbone_cfg.conv_kernel, backbone_cfg.conv_stride):
+            input_length = (input_length - kernel) // stride + 1
+        return input_length
+
+def prepare_mask(length, shape, dtype):
+    """Return a bool mask of `shape`: row i is True up to index ``length[i] - 1`` (inclusive)."""
+    # Modified from huggingface
+    mask = torch.zeros(shape, dtype=dtype)
+    row_ids = torch.arange(mask.shape[0])
+    last_valid = length.cpu() - 1
+    # Flag the last valid slot of every row; the reversed cumulative sum
+    # then spreads that flag over all earlier positions.
+    mask[(row_ids, last_valid)] = 1
+    return mask.flip([-1]).cumsum(-1).flip([-1]).bool()
+    
diff --git a/capspeech/eval/src/model/emotion/wavlm_emotion_dim.py b/capspeech/eval/src/model/emotion/wavlm_emotion_dim.py
new file mode 100644
index 0000000000000000000000000000000000000000..7515e2bab6460b396990177fd8d3b8762a8e06d9
--- /dev/null
+++ b/capspeech/eval/src/model/emotion/wavlm_emotion_dim.py
@@ -0,0 +1,318 @@
+import os
+import pdb
+import torch
+import argparse
+import numpy as np
+import loralib as lora
+import transformers.models.wav2vec2.modeling_wav2vec2 as w2v2
+import transformers.models.wavlm.modeling_wavlm as wavlm
+from speechbrain.lobes.models.huggingface_transformers.huggingface import make_padding_masks
+
+from torch import nn
+from torch.nn import functional as F
+from transformers import Wav2Vec2FeatureExtractor
+from transformers import WavLMModel
+
+import sys
+from pathlib import Path
+sys.path.append(os.path.join(str(Path(os.path.realpath(__file__)).parents[1])))
+
+class WavLMEncoderLayer(nn.Module):
+    """WavLM encoder layer that applies LayerNorm after each residual sum.
+
+    Layers with ``layer_idx > num_hidden_layers // 2`` get LoRA feed-forward
+    projections when ``config.finetune_method`` is "lora" or "combined".
+    """
+    def __init__(self, layer_idx, config, has_relative_position_bias: bool = True):
+        super().__init__()
+        self.attention = wavlm.WavLMAttention(
+            embed_dim=config.hidden_size,
+            num_heads=config.num_attention_heads,
+            dropout=config.attention_dropout,
+            num_buckets=config.num_buckets,
+            max_distance=config.max_bucket_distance,
+            has_relative_position_bias=has_relative_position_bias,
+        )
+        self.dropout = nn.Dropout(config.hidden_dropout)
+        self.layer_norm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps)
+        self.feed_forward = wavlm.WavLMFeedForward(config)
+        self.final_layer_norm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps)
+        self.config = config
+
+        if layer_idx > config.num_hidden_layers // 2 and config.finetune_method in ("lora", "combined"):
+            self.feed_forward.intermediate_dense    = lora.Linear(config.hidden_size, config.intermediate_size, r=config.lora_rank)
+            self.feed_forward.output_dense          = lora.Linear(config.intermediate_size, config.hidden_size, r=config.lora_rank)
+
+    def forward(self, hidden_states, attention_mask=None, position_bias=None, output_attentions=False, index=0):
+        """Return ``(hidden_states, position_bias)`` plus ``attn_weights`` if requested."""
+        attn_residual = hidden_states
+        hidden_states, attn_weights, position_bias = self.attention(
+            hidden_states,
+            attention_mask=attention_mask,
+            position_bias=position_bias,
+            output_attentions=output_attentions,
+            index=index,
+        )
+        hidden_states = attn_residual + self.dropout(hidden_states)
+
+        # No adapter module exists on this layer, so there is nothing extra to apply
+        # for finetune_method == "adapter" (calling one would raise AttributeError).
+        hidden_states = self.layer_norm(hidden_states)
+        hidden_states = hidden_states + self.feed_forward(hidden_states)
+        hidden_states = self.final_layer_norm(hidden_states)
+
+        if output_attentions:
+            return (hidden_states, position_bias, attn_weights)
+        return (hidden_states, position_bias)
+
+
+class WavLMEncoderLayerStableLayerNorm(nn.Module):
+    """WavLM encoder layer that applies LayerNorm before attention and feed-forward.
+
+    Layers with ``layer_idx > num_hidden_layers // 2`` get LoRA feed-forward
+    projections when ``config.finetune_method`` is "lora" or "combined".
+    """
+    def __init__(self, layer_idx, config, has_relative_position_bias: bool = True):
+        super().__init__()
+        self.attention = wavlm.WavLMAttention(
+            embed_dim=config.hidden_size,
+            num_heads=config.num_attention_heads,
+            dropout=config.attention_dropout,
+            num_buckets=config.num_buckets,
+            max_distance=config.max_bucket_distance,
+            has_relative_position_bias=has_relative_position_bias,
+        )
+        self.dropout = nn.Dropout(config.hidden_dropout)
+        self.layer_norm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps)
+        self.feed_forward = wavlm.WavLMFeedForward(config)
+        self.final_layer_norm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps)
+        self.config = config
+
+        if layer_idx > config.num_hidden_layers // 2 and self.config.finetune_method in ("lora", "combined"):
+            self.feed_forward.intermediate_dense    = lora.Linear(config.hidden_size, config.intermediate_size, r=config.lora_rank)
+            self.feed_forward.output_dense          = lora.Linear(config.intermediate_size, config.hidden_size, r=config.lora_rank)
+
+    def forward(self, hidden_states, attention_mask=None, position_bias=None, output_attentions=False):
+        """Return ``(hidden_states, position_bias)`` plus ``attn_weights`` if requested."""
+        normed = self.layer_norm(hidden_states)
+        attn_out, attn_weights, position_bias = self.attention(
+            normed,
+            attention_mask=attention_mask,
+            position_bias=position_bias,
+            output_attentions=output_attentions,
+        )
+        # Both residual sums use the un-normalised stream
+        hidden_states = hidden_states + self.dropout(attn_out)
+        hidden_states = hidden_states + self.feed_forward(self.final_layer_norm(hidden_states))
+
+        if output_attentions:
+            return (hidden_states, position_bias, attn_weights)
+        return (hidden_states, position_bias)
+
+   
+class WavLMWrapper(nn.Module):
+    def __init__(
+        self, 
+        pretrain_model="wavlm_large", 
+        hidden_dim=256,
+        finetune_method="lora",
+        lora_rank=16,
+        freeze_params=True,
+        output_class_num=4,
+        use_conv_output=True,
+        detailed_class_num=17,
+        predict_gender=False
+    ):
+        """Load the WavLM backbone, swap in LoRA-capable layers and build the heads.
+
+        Args:
+            pretrain_model: "wavlm" (microsoft/wavlm-base-plus) or
+                "wavlm_large" (microsoft/wavlm-large).
+            hidden_dim: width of the point-wise conv stack and of every head.
+            finetune_method: stored on the backbone config; "lora" also
+                decides which backbone parameters stay trainable.
+            lora_rank: stored on the backbone config as ``lora_rank``.
+            freeze_params: freeze the backbone, except for parameters missing
+                from the pretrained state dict when ``finetune_method="lora"``.
+            output_class_num: accepted but not used by this constructor.
+            use_conv_output: size the layer-mixing weights for
+                ``num_hidden_layers + 1`` hidden states instead of ``num_hidden_layers``.
+            detailed_class_num: accepted but not used by this constructor.
+            predict_gender: add a two-output ``gender_layer`` head.
+
+        Raises:
+            ValueError: if ``pretrain_model`` is not one of the names above.
+        """
+        super(WavLMWrapper, self).__init__()
+        # Fail fast: an unknown name used to surface later as an AttributeError
+        if pretrain_model not in ("wavlm", "wavlm_large"):
+            raise ValueError(f"Unsupported pretrain_model: {pretrain_model!r}")
+        # 1. We Load the model first with weights
+        self.pretrain_model     = pretrain_model
+        self.finetune_method    = finetune_method
+        self.freeze_params      = freeze_params
+        self.use_conv_output    = use_conv_output
+        self.lora_rank          = lora_rank
+        self.predict_gender     = predict_gender
+        if self.pretrain_model == "wavlm":
+            self.backbone_model = WavLMModel.from_pretrained("microsoft/wavlm-base-plus", output_hidden_states=True)
+        else:
+            self.processor = Wav2Vec2FeatureExtractor.from_pretrained('microsoft/wavlm-large')
+            self.backbone_model = WavLMModel.from_pretrained("microsoft/wavlm-large", output_hidden_states=True)
+        state_dict = self.backbone_model.state_dict()
+        # 2. Read the model config
+        self.model_config = self.backbone_model.config
+        self.model_config.finetune_method        = self.finetune_method
+        self.model_config.lora_rank              = self.lora_rank
+
+        # 3. Config encoder layers with adapter or embedding prompt
+        layer_cls = WavLMEncoderLayer if self.pretrain_model == "wavlm" else WavLMEncoderLayerStableLayerNorm
+        self.backbone_model.encoder.layers = nn.ModuleList(
+            [layer_cls(i, self.model_config, has_relative_position_bias=(i == 0)) for i in range(self.model_config.num_hidden_layers)]
+        )
+        # 4. Load the weights back
+        msg = self.backbone_model.load_state_dict(state_dict, strict=False)
+
+        # 5. Freeze the weights
+        new_keys = set(msg.missing_keys)
+        for name, p in self.backbone_model.named_parameters():
+            if not self.freeze_params: p.requires_grad = True
+            elif self.finetune_method == "lora": p.requires_grad = name in new_keys
+            else: p.requires_grad = False
+
+        # 6. Downstream models
+        self.model_seq = nn.Sequential(
+            nn.Conv1d(self.model_config.hidden_size, hidden_dim, 1, padding=0),
+            nn.ReLU(),
+            nn.Dropout(p=0.1),
+            nn.Conv1d(hidden_dim, hidden_dim, 1, padding=0),
+            nn.ReLU(),
+            nn.Dropout(p=0.1),
+            nn.Conv1d(hidden_dim, hidden_dim, 1, padding=0)
+        )
+
+        if self.use_conv_output:
+            num_layers = self.model_config.num_hidden_layers + 1  # transformer layers + input embeddings
+            self.weights = nn.Parameter(torch.ones(num_layers)/num_layers)
+        else:
+            num_layers = self.model_config.num_hidden_layers
+            self.weights = nn.Parameter(torch.zeros(num_layers))
+
+        # Arousal / valence / dominance heads: one sigmoid-squashed output each
+        def _unit_interval_head():
+            return nn.Sequential(
+                nn.Linear(hidden_dim, hidden_dim),
+                nn.ReLU(),
+                nn.Linear(hidden_dim, 1),
+                nn.Sigmoid()
+            )
+        self.arousal_layer = _unit_interval_head()
+        self.valence_layer = _unit_interval_head()
+        self.dominance_layer = _unit_interval_head()
+
+        if self.predict_gender:
+            self.gender_layer = nn.Sequential(
+                nn.Linear(hidden_dim, hidden_dim),
+                nn.ReLU(),
+                nn.Linear(hidden_dim, 2)
+            )
+        
+    def forward(self, x, length=None, return_feature=False):
+        """Predict arousal, valence and dominance from a batch of raw waveforms.
+
+        Args:
+            x: waveform tensor, assumed (batch, samples) at 16 kHz -- TODO confirm.
+            length: per-utterance sample counts, or ``None`` to treat all samples as valid.
+            return_feature: accepted but ignored by this method.
+
+        Returns:
+            ``(arousal, valence, dominance)``, plus ``gender_outputs`` as a fourth
+            item when ``predict_gender`` is enabled.
+        """
+        # 1. feature extraction and projections
+        if self.pretrain_model == "wavlm_large":
+            with torch.no_grad():
+                if length is not None: attention_mask = make_padding_masks(x, wav_len=length/length.max()).to(x.device)
+                else: attention_mask = make_padding_masks(x, wav_len=torch.tensor([1]).to(x.device)).to(x.device)
+
+                # Run the processor one utterance at a time, then re-batch
+                signal = list()
+                for idx in range(len(x)):
+                    processed = self.processor(x[idx], sampling_rate=16_000, return_tensors="pt", padding=True)
+                    signal.append(processed["input_values"][0].to(x.device))
+                signal = torch.stack(signal)
+
+        # 2. get length and mask
+        if length is not None:
+            length = self.get_feat_extract_output_lengths(length.detach().cpu())
+            # Follow the input's device instead of hard-coding CUDA
+            length = length.to(x.device)
+
+        # 3. transformer encoding features
+        if self.pretrain_model == "wavlm":
+            x = self.backbone_model(x, output_hidden_states=True).hidden_states
+        else:
+            x = self.backbone_model(signal, attention_mask=attention_mask, output_hidden_states=True).hidden_states
+
+        # 4. stacked feature: [num_states, B, T, D]; the first hidden state
+        # is dropped unless use_conv_output is set
+        stacked_feature = torch.stack(x, dim=0)
+        if not self.use_conv_output: stacked_feature = stacked_feature[1:]
+
+        # 5. Weighted sum over hidden states with softmax-normalised weights
+        num_states, *origin_shape = stacked_feature.shape
+        norm_weights = F.softmax(self.weights, dim=-1)
+        weighted_feature = (norm_weights.unsqueeze(-1) * stacked_feature.view(num_states, -1)).sum(dim=0)
+        features = weighted_feature.view(*origin_shape)
+
+        # 6. Pass the weighted average to point-wise 1D Conv
+        # B x T x D -> B x D x T for Conv1d, then back
+        features = features.transpose(1, 2)
+        features = self.model_seq(features)
+        features = features.transpose(1, 2)
+
+        # 7. Pooling
+        if length is not None:
+            # Average over the valid frames only, avoiding padded time steps
+            features = torch.stack([
+                torch.mean(features[snt_id, 0:length[snt_id], ...], dim=0)
+                for snt_id in range(features.shape[0])
+            ])
+        else:
+            features = torch.mean(features, dim=1)
+
+        # 8. Output predictions
+        # B x D
+        arousal             = self.arousal_layer(features)
+        valence             = self.valence_layer(features)
+        dominance           = self.dominance_layer(features)
+
+        if self.predict_gender:
+            gender_outputs = self.gender_layer(features)
+            return arousal, valence, dominance, gender_outputs
+        return arousal, valence, dominance
+    
+    # From huggingface
+    def get_feat_extract_output_lengths(self, input_length):
+        """Return the frame count left after the backbone's convolutional stack.
+
+        Each (kernel, stride) pair from the backbone config is applied in turn
+        using the unpadded Conv1d length formula from
+        https://pytorch.org/docs/stable/generated/torch.nn.Conv1d.html
+        """
+        backbone_cfg = self.backbone_model.config
+        for kernel, stride in zip(backbone_cfg.conv_kernel, backbone_cfg.conv_stride):
+            input_length = (input_length - kernel) // stride + 1
+        return input_length
+
+def prepare_mask(length, shape, dtype):
+    """Return a bool mask of `shape`: row i is True up to index ``length[i] - 1`` (inclusive)."""
+    # Modified from huggingface
+    mask = torch.zeros(shape, dtype=dtype)
+    row_ids = torch.arange(mask.shape[0])
+    last_valid = length.cpu() - 1
+    # Flag the last valid slot of every row; the reversed cumulative sum
+    # then spreads that flag over all earlier positions.
+    mask[(row_ids, last_valid)] = 1
+    return mask.flip([-1]).cumsum(-1).flip([-1]).bool()
+    
diff --git a/capspeech/eval/src/model/emotion/whisper_emotion.py b/capspeech/eval/src/model/emotion/whisper_emotion.py
new file mode 100644
index 0000000000000000000000000000000000000000..cd5068d97670348b9c4cb15fadbe5c33c0fd6730
--- /dev/null
+++ b/capspeech/eval/src/model/emotion/whisper_emotion.py
@@ -0,0 +1,306 @@
+import os
+import copy
+import torch
+import loralib as lora
+import transformers.models.whisper.modeling_whisper as whisper
+
+from torch import nn
+from transformers.activations import ACT2FN
+from transformers import WhisperModel, AutoFeatureExtractor
+
+import sys
+from pathlib import Path
+sys.path.append(os.path.join(str(Path(os.path.realpath(__file__)).parents[1])))
+
+class WhisperEncoderLayer(nn.Module):
+    """
+    Whisper encoder layer (pre-norm self-attention + feed-forward) with optional LoRA.
+
+    For layers whose index is greater than ``config.encoder_layers // 2``, and
+    when ``config.finetune_method`` is ``"lora"`` or ``"combined"``, the two
+    feed-forward projections are ``loralib`` linear layers of rank
+    ``config.lora_rank``; otherwise they are plain ``nn.Linear`` layers.
+    """
+
+    def __init__(self, config, layer_idx):
+        super().__init__()
+        embed_dim, ffn_dim = config.d_model, config.encoder_ffn_dim
+        self.embed_dim = embed_dim
+        self.self_attn = whisper.WhisperAttention(
+            embed_dim=embed_dim,
+            num_heads=config.encoder_attention_heads,
+            dropout=config.attention_dropout,
+        )
+        self.self_attn_layer_norm = nn.LayerNorm(embed_dim)
+        self.dropout = config.dropout
+        self.activation_fn = ACT2FN[config.activation_function]
+        self.activation_dropout = config.activation_dropout
+        self.fc1 = nn.Linear(embed_dim, ffn_dim)
+        self.fc2 = nn.Linear(ffn_dim, embed_dim)
+        self.final_layer_norm = nn.LayerNorm(embed_dim)
+        self.config = config
+
+        # Only layers past the midpoint of the encoder stack get LoRA
+        # feed-forward projections; the plain ones created above are replaced.
+        past_midpoint = layer_idx > config.encoder_layers // 2
+        if past_midpoint and config.finetune_method in ("lora", "combined"):
+            rank = config.lora_rank
+            self.fc1 = lora.Linear(embed_dim, ffn_dim, r=rank)
+            self.fc2 = lora.Linear(ffn_dim, embed_dim, r=rank)
+
+    def forward(
+        self,
+        hidden_states: torch.Tensor,
+        attention_mask: torch.Tensor,
+        layer_head_mask: torch.Tensor,
+        output_attentions: bool = False,
+    ):
+        """
+        Run one encoder layer.
+
+        Args:
+            hidden_states (`torch.FloatTensor`): input to the layer (the upstream
+                docstring gives shape `(seq_len, batch, embed_dim)` -- TODO confirm).
+            attention_mask (`torch.FloatTensor`): attention mask, forwarded unchanged
+                to the attention module (upstream: `(batch, 1, tgt_len, src_len)` with
+                very large negative values on padding -- TODO confirm).
+            layer_head_mask (`torch.FloatTensor`): per-head mask, forwarded unchanged
+                to the attention module.
+            output_attentions (`bool`, *optional*):
+                Whether the attention weights are appended to the returned tuple.
+
+        Returns:
+            `(hidden_states,)`, or `(hidden_states, attn_weights)` when
+            `output_attentions` is true.
+        """
+        # Self-attention sub-block: layer norm, attention, dropout, residual.
+        # The attention module is expected to return three values; the third is unused.
+        attn_input = self.self_attn_layer_norm(hidden_states)
+        attn_output, attn_weights, _ = self.self_attn(
+            hidden_states=attn_input,
+            attention_mask=attention_mask,
+            layer_head_mask=layer_head_mask,
+            output_attentions=output_attentions,
+        )
+        attn_output = nn.functional.dropout(attn_output, p=self.dropout, training=self.training)
+        hidden_states = hidden_states + attn_output
+
+        # Feed-forward sub-block: layer norm, fc1, activation, fc2, residual.
+        ffn_output = self.final_layer_norm(hidden_states)
+        ffn_output = self.fc1(ffn_output)
+        ffn_output = self.activation_fn(ffn_output)
+        ffn_output = nn.functional.dropout(ffn_output, p=self.activation_dropout, training=self.training)
+        ffn_output = self.fc2(ffn_output)
+        ffn_output = nn.functional.dropout(ffn_output, p=self.dropout, training=self.training)
+        hidden_states = hidden_states + ffn_output
+
+        # In half precision, clamp to just below the dtype maximum once an
+        # inf/nan has shown up.
+        if hidden_states.dtype == torch.float16 and (
+            torch.isinf(hidden_states).any() or torch.isnan(hidden_states).any()
+        ):
+            limit = torch.finfo(hidden_states.dtype).max - 1000
+            hidden_states = torch.clamp(hidden_states, min=-limit, max=limit)
+
+        if output_attentions:
+            return (hidden_states, attn_weights)
+        return (hidden_states,)
+
+   
+class WhisperWrapper(nn.Module):
+    def __init__(
+        self,
+        pretrain_model="wavlm_large",
+        hidden_dim=256,
+        finetune_method="lora",
+        lora_rank=16,
+        freeze_params=True,
+        output_class_num=4,
+        use_conv_output=True,
+        detailed_class_num=17
+    ):
+        """
+        Build a Whisper-encoder emotion model with categorical and dimensional heads.
+
+        Args:
+            pretrain_model: One of "whisper_tiny", "whisper_base", "whisper_small",
+                "whisper_medium" or "whisper_large". The default value
+                "wavlm_large" is kept for signature compatibility but is not a
+                supported backbone, so a supported name must be passed.
+            hidden_dim: Channel width of the point-wise conv stack and of every head.
+            finetune_method: "lora" replaces the encoder layers with
+                ``WhisperEncoderLayer`` and reloads the pretrained weights.
+            lora_rank: Rank stored on the model config for the LoRA layers.
+            freeze_params: If true, the backbone is frozen (with "lora", only the
+                keys missing from the pretrained state dict stay trainable).
+                If false, everything except decoder, conv1, conv2 and
+                embed_positions parameters is trainable.
+            output_class_num: Number of outputs of ``emotion_layer``.
+            use_conv_output: Selects the size and initial value of ``self.weights``.
+            detailed_class_num: Number of outputs of ``detailed_out_layer``.
+
+        Raises:
+            ValueError: If ``pretrain_model`` is not a supported Whisper name.
+        """
+        super(WhisperWrapper, self).__init__()
+
+        # Supported backbones: name -> (checkpoint, pass max_source_positions=750?).
+        backbone_specs = {
+            "whisper_tiny": ("openai/whisper-tiny", True),
+            "whisper_base": ("openai/whisper-base", True),
+            "whisper_small": ("openai/whisper-small", True),
+            "whisper_medium": ("openai/whisper-medium", False),
+            "whisper_large": ("openai/whisper-large-v3", True),
+        }
+        # Fail early: without this check an unknown name left self.backbone_model
+        # unset and construction crashed later with an AttributeError.
+        if pretrain_model not in backbone_specs:
+            raise ValueError(
+                f"Unsupported pretrain_model {pretrain_model!r}; "
+                f"expected one of {sorted(backbone_specs)}"
+            )
+
+        # 1. We Load the model first with weights
+        self.pretrain_model     = pretrain_model
+        self.finetune_method    = finetune_method
+        self.freeze_params      = freeze_params
+        self.use_conv_output    = use_conv_output
+        self.lora_rank          = lora_rank
+
+        # whisper_large uses its own feature extractor; every other size uses whisper-tiny's.
+        if pretrain_model == "whisper_large":
+            extractor_name = "openai/whisper-large-v3"
+        else:
+            extractor_name = "openai/whisper-tiny"
+        self.feature_extractor = AutoFeatureExtractor.from_pretrained(extractor_name, chunk_length=15)
+
+        checkpoint, shrink_positions = backbone_specs[pretrain_model]
+        backbone_kwargs = {"output_hidden_states": True, "ignore_mismatched_sizes": True}
+        if shrink_positions:
+            backbone_kwargs["max_source_positions"] = 750
+        self.backbone_model = WhisperModel.from_pretrained(checkpoint, **backbone_kwargs)
+
+        # Frozen copy of the encoder positional table; forward() rebuilds the
+        # encoder's positional embedding from its first 750 rows.
+        self.embed_positions = copy.deepcopy(self.backbone_model.encoder.embed_positions.weight)
+        self.embed_positions.requires_grad = False
+
+        state_dict = self.backbone_model.state_dict()
+        # 2. Read the model config
+        self.model_config = self.backbone_model.config
+        self.model_config.finetune_method        = self.finetune_method
+        self.model_config.lora_rank              = self.lora_rank
+
+        load_result = None
+        if self.finetune_method == "lora":
+            # 3. Config encoder layers with adapter or embedding prompt
+            self.backbone_model.encoder.layers = nn.ModuleList(
+                [WhisperEncoderLayer(self.model_config, layer_idx) for layer_idx in range(self.model_config.encoder_layers)]
+            )
+            # 4. Load the weights back; keys that only exist in the new layers
+            # are reported as missing.
+            load_result = self.backbone_model.load_state_dict(state_dict, strict=False)
+
+        # 5. Freeze the weights
+        if self.freeze_params and self.finetune_method != "lora":
+            for _, p in self.backbone_model.named_parameters():
+                p.requires_grad = False
+        elif self.freeze_params and self.finetune_method == "lora":
+            # Only parameters that were not restored from the checkpoint stay trainable.
+            for name, p in self.backbone_model.named_parameters():
+                p.requires_grad = name in load_result.missing_keys
+        else:
+            frozen_markers = ("decoder", "conv1", "conv2", "embed_positions")
+            for name, p in self.backbone_model.named_parameters():
+                p.requires_grad = not any(marker in name for marker in frozen_markers)
+
+        # 6. Downstream models
+        self.model_seq = nn.Sequential(
+            nn.Conv1d(self.model_config.hidden_size, hidden_dim, 1, padding=0),
+            nn.ReLU(),
+            nn.Dropout(p=0.1),
+            nn.Conv1d(hidden_dim, hidden_dim, 1, padding=0),
+            nn.ReLU(),
+            nn.Dropout(p=0.1),
+            nn.Conv1d(hidden_dim, hidden_dim, 1, padding=0)
+        )
+
+        # Layer weights; forward() in this class does not read them.
+        if self.use_conv_output:
+            num_layers = self.model_config.num_hidden_layers + 1  # transformer layers + input embeddings
+            self.weights = nn.Parameter(torch.ones(num_layers)/num_layers)
+        else:
+            num_layers = self.model_config.num_hidden_layers
+            self.weights = nn.Parameter(torch.zeros(num_layers))
+
+        def make_head(out_features, bounded=False):
+            # Linear -> ReLU -> Linear, optionally followed by a sigmoid.
+            layers = [
+                nn.Linear(hidden_dim, hidden_dim),
+                nn.ReLU(),
+                nn.Linear(hidden_dim, out_features),
+            ]
+            if bounded:
+                layers.append(nn.Sigmoid())
+            return nn.Sequential(*layers)
+
+        # Heads are created in a fixed order so parameter registration and
+        # random initialisation order stay stable.
+        self.emotion_layer = make_head(output_class_num)
+        self.detailed_out_layer = make_head(detailed_class_num)
+        self.arousal_layer = make_head(1, bounded=True)
+        self.valence_layer = make_head(1, bounded=True)
+        self.dominance_layer = make_head(1, bounded=True)
+
+        
+        
+    def forward(self, x, length=None, return_feature=False):
+        """
+        Predict emotion outputs for a batch of waveforms.
+
+        Args:
+            x: Indexable batch of waveform tensors; presumably 16 kHz audio of
+                shape (batch, samples) -- TODO confirm against callers.
+            length: Optional tensor of per-item lengths (presumably in samples).
+                When omitted, only ``x[0]`` is processed and ``len(x[0])`` is
+                used as its length.
+            return_feature: If true, the pooled features are returned as well.
+
+        Returns:
+            ``(predicted, detailed_predicted, arousal, valence, dominance)``, or
+            ``(predicted, features, detailed_predicted, arousal, valence, dominance)``
+            when ``return_feature`` is true.
+        """
+        max_audio_len = 15*16000
+
+        # 1. feature extraction and projections
+        if length is not None:
+            # The feature extractor is given a list of numpy waveforms.
+            waveforms = [x[idx].detach().cpu().numpy() for idx in range(len(length))]
+            sample_lengths = length.detach().cpu()
+        else:
+            waveforms = x[0].detach().cpu()
+            sample_lengths = torch.tensor([len(x[0])])
+        features = self.feature_extractor(
+            waveforms,
+            return_tensors="pt",
+            sampling_rate=16000,
+            max_length=max_audio_len
+        )
+        # Follow the backbone's device instead of assuming the default CUDA
+        # device, so CPU and non-default-GPU placement also work.
+        device = next(self.backbone_model.parameters()).device
+        features = features.input_features.to(device)
+
+        # 2. get the number of encoder frames per item
+        frame_lengths = self._get_feat_extract_output_lengths(sample_lengths)
+        # Replace positional embeddings with the first 750 stored positions
+        self.backbone_model.encoder.embed_positions = self.backbone_model.encoder.embed_positions.from_pretrained(self.embed_positions[:750])
+
+        # 3. transformer encoding features
+        hidden_states = self.backbone_model.encoder(
+            features, output_hidden_states=True
+        ).hidden_states
+        # Only the last hidden state is used, so take it directly rather than
+        # stacking every layer's output first.
+        features = hidden_states[-1]
+
+        # 6. Pass the features to point-wise 1D Conv (applied on the transposed tensor)
+        features = features.transpose(1, 2)
+        features = self.model_seq(features)
+        features = features.transpose(1, 2)
+
+        # 7. Pooling: mean over the valid frames of each item, avoiding padded time steps
+        features = torch.stack([
+            torch.mean(features[snt_id, 0:frame_lengths[snt_id], ...], dim=0)
+            for snt_id in range(features.shape[0])
+        ])
+
+        # Output predictions
+        # B x D
+        predicted = self.emotion_layer(features)
+        detailed_predicted = self.detailed_out_layer(features)
+        arousal = self.arousal_layer(features)
+        valence = self.valence_layer(features)
+        dominance = self.dominance_layer(features)
+        if return_feature:
+            return predicted, features, detailed_predicted, arousal, valence, dominance
+        return predicted, detailed_predicted, arousal, valence, dominance
+        
+    # From huggingface
+    def _get_feat_extract_output_lengths(self, input_lengths):
+        """
+        Convert input lengths into the lengths seen at the encoder output.
+
+        Args:
+            input_lengths: An int or integer tensor of input lengths
+                (presumably audio samples at 16 kHz -- confirm with callers).
+
+        Returns:
+            ``input_lengths`` floor-divided by 160, then halved rounding up.
+        """
+        # NOTE(review): 160 looks like the feature extractor's hop size in
+        # samples -- confirm.
+        frame_counts = input_lengths // 160
+        # (n - 1) // 2 + 1 halves the frame count, rounding up.
+        return (frame_counts - 1) // 2 + 1
diff --git a/capspeech/eval/src/model/emotion/whisper_emotion_dim.py b/capspeech/eval/src/model/emotion/whisper_emotion_dim.py
new file mode 100644
index 0000000000000000000000000000000000000000..d0d9eaee7c0adf4bc4d974f85ef1239c9c2ad8a6
--- /dev/null
+++ b/capspeech/eval/src/model/emotion/whisper_emotion_dim.py
@@ -0,0 +1,316 @@
+import os
+import copy
+import torch
+import loralib as lora
+import transformers.models.whisper.modeling_whisper as whisper
+
+from torch import nn
+from transformers.activations import ACT2FN
+from transformers import WhisperModel, AutoFeatureExtractor
+
+import sys
+from pathlib import Path
+sys.path.append(os.path.join(str(Path(os.path.realpath(__file__)).parents[1])))
+
+class WhisperEncoderLayer(nn.Module):
+    """
+    Whisper encoder layer (pre-norm self-attention + feed-forward) with optional LoRA.
+
+    For layers whose index is greater than ``config.encoder_layers // 2``, and
+    when ``config.finetune_method`` is ``"lora"`` or ``"combined"``, the two
+    feed-forward projections are ``loralib`` linear layers of rank
+    ``config.lora_rank``; otherwise they are plain ``nn.Linear`` layers.
+    """
+
+    def __init__(self, config, layer_idx):
+        super().__init__()
+        embed_dim, ffn_dim = config.d_model, config.encoder_ffn_dim
+        self.embed_dim = embed_dim
+        self.self_attn = whisper.WhisperAttention(
+            embed_dim=embed_dim,
+            num_heads=config.encoder_attention_heads,
+            dropout=config.attention_dropout,
+        )
+        self.self_attn_layer_norm = nn.LayerNorm(embed_dim)
+        self.dropout = config.dropout
+        self.activation_fn = ACT2FN[config.activation_function]
+        self.activation_dropout = config.activation_dropout
+        self.fc1 = nn.Linear(embed_dim, ffn_dim)
+        self.fc2 = nn.Linear(ffn_dim, embed_dim)
+        self.final_layer_norm = nn.LayerNorm(embed_dim)
+        self.config = config
+
+        # Only layers past the midpoint of the encoder stack get LoRA
+        # feed-forward projections; the plain ones created above are replaced.
+        past_midpoint = layer_idx > config.encoder_layers // 2
+        if past_midpoint and config.finetune_method in ("lora", "combined"):
+            rank = config.lora_rank
+            self.fc1 = lora.Linear(embed_dim, ffn_dim, r=rank)
+            self.fc2 = lora.Linear(ffn_dim, embed_dim, r=rank)
+
+    def forward(
+        self,
+        hidden_states: torch.Tensor,
+        attention_mask: torch.Tensor,
+        layer_head_mask: torch.Tensor,
+        output_attentions: bool = False,
+    ):
+        """
+        Run one encoder layer.
+
+        Args:
+            hidden_states (`torch.FloatTensor`): input to the layer (the upstream
+                docstring gives shape `(seq_len, batch, embed_dim)` -- TODO confirm).
+            attention_mask (`torch.FloatTensor`): attention mask, forwarded unchanged
+                to the attention module (upstream: `(batch, 1, tgt_len, src_len)` with
+                very large negative values on padding -- TODO confirm).
+            layer_head_mask (`torch.FloatTensor`): per-head mask, forwarded unchanged
+                to the attention module.
+            output_attentions (`bool`, *optional*):
+                Whether the attention weights are appended to the returned tuple.
+
+        Returns:
+            `(hidden_states,)`, or `(hidden_states, attn_weights)` when
+            `output_attentions` is true.
+        """
+        # Self-attention sub-block: layer norm, attention, dropout, residual.
+        # The attention module is expected to return three values; the third is unused.
+        attn_input = self.self_attn_layer_norm(hidden_states)
+        attn_output, attn_weights, _ = self.self_attn(
+            hidden_states=attn_input,
+            attention_mask=attention_mask,
+            layer_head_mask=layer_head_mask,
+            output_attentions=output_attentions,
+        )
+        attn_output = nn.functional.dropout(attn_output, p=self.dropout, training=self.training)
+        hidden_states = hidden_states + attn_output
+
+        # Feed-forward sub-block: layer norm, fc1, activation, fc2, residual.
+        ffn_output = self.final_layer_norm(hidden_states)
+        ffn_output = self.fc1(ffn_output)
+        ffn_output = self.activation_fn(ffn_output)
+        ffn_output = nn.functional.dropout(ffn_output, p=self.activation_dropout, training=self.training)
+        ffn_output = self.fc2(ffn_output)
+        ffn_output = nn.functional.dropout(ffn_output, p=self.dropout, training=self.training)
+        hidden_states = hidden_states + ffn_output
+
+        # In half precision, clamp to just below the dtype maximum once an
+        # inf/nan has shown up.
+        if hidden_states.dtype == torch.float16 and (
+            torch.isinf(hidden_states).any() or torch.isnan(hidden_states).any()
+        ):
+            limit = torch.finfo(hidden_states.dtype).max - 1000
+            hidden_states = torch.clamp(hidden_states, min=-limit, max=limit)
+
+        if output_attentions:
+            return (hidden_states, attn_weights)
+        return (hidden_states,)
+
+   
+class WhisperWrapper(nn.Module):
+    def __init__(
+        self,
+        pretrain_model="wavlm_large",
+        hidden_dim=256,
+        finetune_method="lora",
+        lora_rank=16,
+        freeze_params=True,
+        output_class_num=4,
+        use_conv_output=True,
+        detailed_class_num=17,
+        predict_gender=False,
+    ):
+        """
+        Build a Whisper-encoder model with dimensional emotion heads.
+
+        Args:
+            pretrain_model: One of "whisper_tiny", "whisper_base", "whisper_small",
+                "whisper_medium" or "whisper_large". The default value
+                "wavlm_large" is kept for signature compatibility but is not a
+                supported backbone, so a supported name must be passed.
+            hidden_dim: Channel width of the point-wise conv stack and of every head.
+            finetune_method: "lora" replaces the encoder layers with
+                ``WhisperEncoderLayer`` and reloads the pretrained weights.
+            lora_rank: Rank stored on the model config for the LoRA layers.
+            freeze_params: If true, the backbone is frozen (with "lora", only the
+                keys missing from the pretrained state dict stay trainable).
+                If false, everything except decoder, conv1, conv2 and
+                embed_positions parameters is trainable.
+            output_class_num: Number of outputs of ``emotion_layer``.
+            use_conv_output: Selects the size and initial value of ``self.weights``.
+            detailed_class_num: Number of outputs of ``detailed_out_layer``.
+            predict_gender: If true, a two-output ``gender_layer`` head is added.
+
+        Raises:
+            ValueError: If ``pretrain_model`` is not a supported Whisper name.
+        """
+        super(WhisperWrapper, self).__init__()
+
+        # Supported backbones: name -> (checkpoint, pass max_source_positions=750?).
+        backbone_specs = {
+            "whisper_tiny": ("openai/whisper-tiny", True),
+            "whisper_base": ("openai/whisper-base", True),
+            "whisper_small": ("openai/whisper-small", True),
+            "whisper_medium": ("openai/whisper-medium", False),
+            "whisper_large": ("openai/whisper-large-v3", True),
+        }
+        # Fail early: without this check an unknown name left self.backbone_model
+        # unset and construction crashed later with an AttributeError.
+        if pretrain_model not in backbone_specs:
+            raise ValueError(
+                f"Unsupported pretrain_model {pretrain_model!r}; "
+                f"expected one of {sorted(backbone_specs)}"
+            )
+
+        # 1. We Load the model first with weights
+        self.pretrain_model     = pretrain_model
+        self.finetune_method    = finetune_method
+        self.freeze_params      = freeze_params
+        self.use_conv_output    = use_conv_output
+        self.lora_rank          = lora_rank
+        self.predict_gender     = predict_gender
+
+        # whisper_large uses its own feature extractor; every other size uses whisper-tiny's.
+        if pretrain_model == "whisper_large":
+            extractor_name = "openai/whisper-large-v3"
+        else:
+            extractor_name = "openai/whisper-tiny"
+        self.feature_extractor = AutoFeatureExtractor.from_pretrained(extractor_name, chunk_length=15)
+
+        checkpoint, shrink_positions = backbone_specs[pretrain_model]
+        backbone_kwargs = {"output_hidden_states": True, "ignore_mismatched_sizes": True}
+        if shrink_positions:
+            backbone_kwargs["max_source_positions"] = 750
+        self.backbone_model = WhisperModel.from_pretrained(checkpoint, **backbone_kwargs)
+
+        # Frozen copy of the encoder positional table; forward() rebuilds the
+        # encoder's positional embedding from its first 750 rows.
+        self.embed_positions = copy.deepcopy(self.backbone_model.encoder.embed_positions.weight)
+        self.embed_positions.requires_grad = False
+
+        state_dict = self.backbone_model.state_dict()
+        # 2. Read the model config
+        self.model_config = self.backbone_model.config
+        self.model_config.finetune_method        = self.finetune_method
+        self.model_config.lora_rank              = self.lora_rank
+
+        load_result = None
+        if self.finetune_method == "lora":
+            # 3. Config encoder layers with adapter or embedding prompt
+            self.backbone_model.encoder.layers = nn.ModuleList(
+                [WhisperEncoderLayer(self.model_config, layer_idx) for layer_idx in range(self.model_config.encoder_layers)]
+            )
+            # 4. Load the weights back; keys that only exist in the new layers
+            # are reported as missing.
+            load_result = self.backbone_model.load_state_dict(state_dict, strict=False)
+
+        # 5. Freeze the weights
+        if self.freeze_params and self.finetune_method != "lora":
+            for _, p in self.backbone_model.named_parameters():
+                p.requires_grad = False
+        elif self.freeze_params and self.finetune_method == "lora":
+            # Only parameters that were not restored from the checkpoint stay trainable.
+            for name, p in self.backbone_model.named_parameters():
+                p.requires_grad = name in load_result.missing_keys
+        else:
+            frozen_markers = ("decoder", "conv1", "conv2", "embed_positions")
+            for name, p in self.backbone_model.named_parameters():
+                p.requires_grad = not any(marker in name for marker in frozen_markers)
+
+        # 6. Downstream models
+        self.model_seq = nn.Sequential(
+            nn.Conv1d(self.model_config.hidden_size, hidden_dim, 1, padding=0),
+            nn.ReLU(),
+            nn.Dropout(p=0.1),
+            nn.Conv1d(hidden_dim, hidden_dim, 1, padding=0),
+            nn.ReLU(),
+            nn.Dropout(p=0.1),
+            nn.Conv1d(hidden_dim, hidden_dim, 1, padding=0)
+        )
+
+        # Layer weights; forward() in this class does not read them.
+        if self.use_conv_output:
+            num_layers = self.model_config.num_hidden_layers + 1  # transformer layers + input embeddings
+            self.weights = nn.Parameter(torch.ones(num_layers)/num_layers)
+        else:
+            num_layers = self.model_config.num_hidden_layers
+            self.weights = nn.Parameter(torch.zeros(num_layers))
+
+        def make_head(out_features, bounded=False):
+            # Linear -> ReLU -> Linear, optionally followed by a sigmoid.
+            layers = [
+                nn.Linear(hidden_dim, hidden_dim),
+                nn.ReLU(),
+                nn.Linear(hidden_dim, out_features),
+            ]
+            if bounded:
+                layers.append(nn.Sigmoid())
+            return nn.Sequential(*layers)
+
+        # Heads are created in a fixed order so parameter registration and
+        # random initialisation order stay stable.
+        self.emotion_layer = make_head(output_class_num)
+        self.detailed_out_layer = make_head(detailed_class_num)
+        self.arousal_layer = make_head(1, bounded=True)
+        self.valence_layer = make_head(1, bounded=True)
+        self.dominance_layer = make_head(1, bounded=True)
+
+        if self.predict_gender:
+            self.gender_layer = make_head(2)
+        
+    def forward(self, x, length=None):
+        """
+        Predict arousal, valence and dominance (and optionally gender) for a batch.
+
+        Args:
+            x: Indexable batch of waveform tensors; presumably 16 kHz audio of
+                shape (batch, samples) -- TODO confirm against callers.
+            length: Optional tensor of per-item lengths (presumably in samples).
+                When omitted, only ``x[0]`` is processed and ``len(x[0])`` is
+                used as its length.
+
+        Returns:
+            ``(arousal, valence, dominance)``, or
+            ``(arousal, valence, dominance, gender_outputs)`` when the model was
+            built with ``predict_gender=True``.
+        """
+        max_audio_len = 15*16000
+
+        # 1. feature extraction and projections
+        if length is not None:
+            # The feature extractor is given a list of numpy waveforms.
+            waveforms = [x[idx].detach().cpu().numpy() for idx in range(len(length))]
+            sample_lengths = length.detach().cpu()
+        else:
+            waveforms = x[0].detach().cpu()
+            sample_lengths = torch.tensor([len(x[0])])
+        features = self.feature_extractor(
+            waveforms,
+            return_tensors="pt",
+            sampling_rate=16000,
+            max_length=max_audio_len
+        )
+        # Follow the backbone's device instead of assuming the default CUDA
+        # device, so CPU and non-default-GPU placement also work.
+        device = next(self.backbone_model.parameters()).device
+        features = features.input_features.to(device)
+
+        # 2. get the number of encoder frames per item
+        frame_lengths = self._get_feat_extract_output_lengths(sample_lengths)
+        # Replace positional embeddings with the first 750 stored positions
+        self.backbone_model.encoder.embed_positions = self.backbone_model.encoder.embed_positions.from_pretrained(self.embed_positions[:750])
+
+        # 3. transformer encoding features
+        hidden_states = self.backbone_model.encoder(
+            features, output_hidden_states=True
+        ).hidden_states
+        # Only the last hidden state is used, so take it directly rather than
+        # stacking every layer's output first.
+        features = hidden_states[-1]
+
+        # 6. Pass the features to point-wise 1D Conv (applied on the transposed tensor)
+        features = features.transpose(1, 2)
+        features = self.model_seq(features)
+        features = features.transpose(1, 2)
+
+        # 7. Pooling: mean over the valid frames of each item, avoiding padded time steps
+        features = torch.stack([
+            torch.mean(features[snt_id, 0:frame_lengths[snt_id], ...], dim=0)
+            for snt_id in range(features.shape[0])
+        ])
+
+        # Output predictions
+        # B x D
+        arousal = self.arousal_layer(features)
+        valence = self.valence_layer(features)
+        dominance = self.dominance_layer(features)
+
+        if self.predict_gender:
+            gender_outputs = self.gender_layer(features)
+            return arousal, valence, dominance, gender_outputs
+
+        return arousal, valence, dominance
+        
+    # From huggingface
+    def _get_feat_extract_output_lengths(self, input_lengths):
+        """
+        Convert input lengths into the lengths seen at the encoder output.
+
+        Args:
+            input_lengths: An int or integer tensor of input lengths
+                (presumably audio samples at 16 kHz -- confirm with callers).
+
+        Returns:
+            ``input_lengths`` floor-divided by 160, then halved rounding up.
+        """
+        # NOTE(review): 160 looks like the feature extractor's hop size in
+        # samples -- confirm.
+        frame_counts = input_lengths // 160
+        # (n - 1) // 2 + 1 halves the frame count, rounding up.
+        return (frame_counts - 1) // 2 + 1
diff --git a/capspeech/eval/src/model/revgrad.py b/capspeech/eval/src/model/revgrad.py
new file mode 100644
index 0000000000000000000000000000000000000000..650cebd0717cc52ffcc5f9b45136ddb70b64bf7a
--- /dev/null
+++ b/capspeech/eval/src/model/revgrad.py
@@ -0,0 +1,18 @@
+from revgrad_func import revgrad
+from torch.nn import Module
+from torch import tensor
+
+class RevGrad(Module):
+    """
+    A gradient reversal layer.
+
+    This layer has no parameters, and simply reverses the gradient
+    in the backward pass.
+    """
+
+    def __init__(self, alpha=1., *args, **kwargs):
+        # Extra positional/keyword arguments are handed to ``Module.__init__``.
+        super().__init__(*args, **kwargs)
+
+        # Stored as a plain tensor attribute (neither a parameter nor a buffer).
+        alpha_tensor = tensor(alpha, requires_grad=False)
+        self._alpha = alpha_tensor
+
+    def forward(self, input_):
+        """Pass ``input_`` and the stored alpha to ``revgrad`` and return its result."""
+        reversed_output = revgrad(input_, self._alpha)
+        return reversed_output
\ No newline at end of file
diff --git a/capspeech/eval/src/model/revgrad_func.py b/capspeech/eval/src/model/revgrad_func.py
new file mode 100644
index 0000000000000000000000000000000000000000..2228c6b5058a1dbf03ab2aabf68490835bd7f758
--- /dev/null
+++ b/capspeech/eval/src/model/revgrad_func.py
@@ -0,0 +1,20 @@
+from torch.autograd import Function
+
+
+class RevGrad(Function):
+    """Identity in the forward pass; negated, ``alpha_``-scaled gradient in the backward pass."""
+
+    @staticmethod
+    def forward(ctx, input_, alpha_):
+        """
+        Return ``input_`` unchanged and remember ``alpha_`` for the backward pass.
+
+        Args:
+            ctx: Autograd context.
+            input_: Tensor passed through as-is.
+            alpha_: Tensor that scales the reversed gradient.
+        """
+        # Only alpha_ is read by backward(); the input is not saved so this
+        # node does not keep the activation alive.
+        ctx.save_for_backward(alpha_)
+        return input_
+
+    @staticmethod
+    def backward(ctx, grad_output):  # pragma: no cover
+        """
+        Return ``(-grad_output * alpha_, None)``.
+
+        The first entry is ``None`` when the input needs no gradient;
+        ``alpha_`` never receives one.
+        """
+        (alpha_,) = ctx.saved_tensors
+        grad_input = None
+        if ctx.needs_input_grad[0]:
+            grad_input = -grad_output * alpha_
+        return grad_input, None
+
+
+revgrad = RevGrad.apply
\ No newline at end of file
diff --git a/capspeech/eval/utmos_eval.py b/capspeech/eval/utmos_eval.py
new file mode 100644
index 0000000000000000000000000000000000000000..cd29de9d32a0b12e1fcc832d747084193bce216e
--- /dev/null
+++ b/capspeech/eval/utmos_eval.py
@@ -0,0 +1,4 @@
+import utmosv2
+# Create the UTMOSv2 model with pretrained weights.
+model = utmosv2.create_model(pretrained=True)
+# NOTE: "your-audio-path" is a placeholder -- replace it with the real input
+# path (presumably an audio file to score; check the utmosv2 docs).
+mos = model.predict(input_path="your-audio-path")
+print(mos)
\ No newline at end of file
diff --git a/capspeech/nar/README.md b/capspeech/nar/README.md
new file mode 100644
index 0000000000000000000000000000000000000000..0a3d988f0c4a24fdf1874d7a00eb602cdf221088
--- /dev/null
+++ b/capspeech/nar/README.md
@@ -0,0 +1,147 @@
+# CapSpeech-NAR
+
+
+## Preprocess Data
+
+You can run everything with `data/process.sh`, or go through the steps below one at a time.
+
+1. Prepare json files. Run:
+```bash
+SAVE_DIR='./capspeech' # to save processed data
+CACHE_DIR='./cache' # to save dataset cache
+MLS_WAV_DIR='' # downloaded mls wav path
+LIBRITTSRMIX_WAV_DIR='' # downloaded librittsrmix wav path
+GIGASPEECH_WAV_DIR='' # downloaded gigaspeech wav path
+COMMONVOICE_WAV_DIR='' # downloaded commonvoice wav path
+EMILIA_WAV_DIR='' # downloaded emilia wav path
+CPUS=30
+N_WORKERS=8
+BATCH_SIZE=64
+python preprocess.py \
+    --save_dir ${SAVE_DIR} \
+    --cache_dir ${CACHE_DIR} \
+    --libriRmix_wav_dir ${LIBRITTSRMIX_WAV_DIR} \
+    --mls_wav_dir ${MLS_WAV_DIR} \
+    --commonvoice_dir ${COMMONVOICE_WAV_DIR} \
+    --gigaspeech_dir ${GIGASPEECH_WAV_DIR} \
+    --emilia_dir ${EMILIA_WAV_DIR} \
+    --splits train val \
+    --audio_min_length 3.0 \
+    --audio_max_length 18.0 
+```
+Notes: `SAVE_DIR` is where the processed data is saved. `CACHE_DIR` is where the downloaded Hugging Face data is cached. `MLS_WAV_DIR` is the directory holding the downloaded MLS (English) audio; it should contain files such as `mls_english/test/audio/10226/10111/10226_10111_000001.flac`. `COMMONVOICE_WAV_DIR` is the directory holding the downloaded Common Voice (English) audio; it should contain files such as `commonvoice/common_voice_en_20233751.wav`. `GIGASPEECH_WAV_DIR` is the directory holding the downloaded GigaSpeech audio; it should contain files such as `gigaspeech/AUD0000000468_S0000654.wav`. `LIBRITTSRMIX_WAV_DIR` is the directory holding the downloaded LibriTTS-R Mix audio; it should contain files such as `LibriTTS_R/test-clean/1089/134686/1089_134686_000001_000001_01.wav`. `EMILIA_WAV_DIR` is the directory holding the downloaded Emilia audio; it should contain files such as `EN_B00020_S00165_W000096.mp3`.
+
+You will get a `jsons` folder with `.json` files like this:
+```
+[
+    {
+        "segment_id": "1089_134686_000001_000001_01",
+        "audio_path": "/data/capspeech-data/librittsr-mix/LibriTTS_R/test-clean/1089/134686/1089_134686_000001_000001_01.wav",
+        "text": "<train_whistling> he hoped there would be stew for dinner turnips and carrots and bruised potatoes and fat mutton pieces to be ladled <B_start> out in thick peppered flour fattened sauce stuff it into you his belly counselled him <B_end>",
+        "caption": "A middle-aged male's speech is characterized by a steady, slightly somber tone, with his voice carrying a moderately low pitch. His speech pace is moderate, neither too quick nor too slow, lending an air of calm and measured thoughtfulness to his delivery.",
+        "duration": 12.79125,
+        "source": "libritts-r"
+    },
+    ...
+]
+```
+
+2. Phonemize. Run:
+```bash
+SAVE_DIR='./capspeech'
+CPUS=30
+python phonemize.py \
+    --save_dir ${SAVE_DIR} \
+    --num_cpus ${CPUS}
+```
+
+You will get a `g2p` folder with `.txt` files.
+
+3. Caption with T5 embeddings. Run:
+```bash
+SAVE_DIR='./capspeech'
+python caption.py \
+    --save_dir ${SAVE_DIR}
+```
+
+You will get a `t5` folder with `.npz` files.
+
+4. Make manifests. Run:
+```bash
+SAVE_DIR='./capspeech'
+python filemaker.py \
+    --save_dir ${SAVE_DIR}
+```
+
+You will get a `manifest` folder with `.txt` files like this:
+```
+1995_1826_000016_000004_01	playing_accordion
+1995_1826_000016_000007_01	underwater_bubbling
+1995_1826_000016_000008_01	telephone
+1995_1826_000016_000009_01	eletric_blender_running
+1995_1826_000016_000010_01	harmonica
+```
+
+5. Make vocab. Run:
+```bash
+SAVE_DIR='./capspeech'
+python vocab.py \
+    --save_dir ${SAVE_DIR}
+```
+
+You will get a `vocab.txt` file.
+
+📝 **Note:** We provide the following scripts to process our data. Make sure to change the paths in them to your own.
+
+1. Preprocess pretraining data:
+```bash
+bash data_preprocessing/process_pretrain.sh
+```
+2. Preprocess CapTTS, EmoCapTTS and AccCapTTS data:
+```bash
+bash data_preprocessing/process_captts.sh
+```
+3. Preprocess CapTTS-SE data:
+```bash
+bash data_preprocessing/process_capttsse.sh
+```
+4. Preprocess AgentTTS data:
+```bash
+bash data_preprocessing/process_agenttts.sh
+```
+
+## Pretrain
+```bash
+accelerate launch train.py --config-name "./configs/pretrain.yaml"
+```
+
+## Finetune on CapTTS
+```bash
+accelerate launch finetune.py --config-name "./configs/finetune_captts.yaml" --pretrained-ckpt "YOUR_MODEL_PATH"
+```
+
+## Finetune on EmoCapTTS
+```bash
+accelerate launch finetune.py --config-name "./configs/finetune_emocaptts.yaml" --pretrained-ckpt "YOUR_MODEL_PATH"
+```
+
+## Finetune on AccCapTTS
+```bash
+accelerate launch finetune.py --config-name "./configs/finetune_acccaptts.yaml" --pretrained-ckpt "YOUR_MODEL_PATH"
+```
+
+## Finetune on CapTTS-SE
+```bash
+accelerate launch finetune.py --config-name "./configs/finetune_capttsse.yaml" --pretrained-ckpt "YOUR_MODEL_PATH"
+```
+
+## Finetune on AgentTTS
+```bash
+accelerate launch finetune.py --config-name "./configs/finetune_agenttts.yaml" --pretrained-ckpt "YOUR_MODEL_PATH"
+```
+
+## Train a duration predictor
+```bash
+python duration_predictor.py
+```
+
diff --git a/capspeech/nar/__init__.py b/capspeech/nar/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391
diff --git a/capspeech/nar/activations.py b/capspeech/nar/activations.py
new file mode 100644
index 0000000000000000000000000000000000000000..0e39817c4242ccd5f6b3a7f4d6cdb0bd82afdd9e
--- /dev/null
+++ b/capspeech/nar/activations.py
@@ -0,0 +1,123 @@
+import torch
+from torch import nn, sin, pow
+from torch.nn import Parameter
+
+
+class Snake(nn.Module):
+    """
+    Implementation of a sine-based periodic activation function
+    Shape:
+        - Input: (B, C, T)
+        - Output: (B, C, T), same shape as the input
+    Parameters:
+        - alpha - trainable parameter
+    References:
+        - This activation function is from this paper by Liu Ziyin, Tilman Hartwig, Masahito Ueda:
+        https://arxiv.org/abs/2006.08195
+    Examples:
+        >>> a1 = snake(256)
+        >>> x = torch.randn(256)
+        >>> x = a1(x)
+    """
+
+    def __init__(
+        self, in_features, alpha=1.0, alpha_trainable=True, alpha_logscale=False
+    ):
+        """
+        Initialization.
+        INPUT:
+            - in_features: shape of the input
+            - alpha: trainable parameter
+            alpha is initialized to 1 by default, higher values = higher-frequency.
+            alpha will be trained along with the rest of your model.
+        """
+        super(Snake, self).__init__()
+        self.in_features = in_features
+
+        # Initialize alpha
+        self.alpha_logscale = alpha_logscale
+        if self.alpha_logscale:  # Log scale alphas initialized to zeros
+            self.alpha = Parameter(torch.zeros(in_features) * alpha)
+        else:  # Linear scale alphas initialized to ones
+            self.alpha = Parameter(torch.ones(in_features) * alpha)
+
+        self.alpha.requires_grad = alpha_trainable
+
+        self.no_div_by_zero = 0.000000001
+
+    def forward(self, x):
+        """
+        Forward pass of the function.
+        Applies the function to the input elementwise.
+        Snake ∶= x + 1/a * sin^2 (xa)
+        """
+        alpha = self.alpha.unsqueeze(0).unsqueeze(-1)  # Line up with x to [B, C, T]
+        if self.alpha_logscale:
+            alpha = torch.exp(alpha)
+        x = x + (1.0 / (alpha + self.no_div_by_zero)) * pow(sin(x * alpha), 2)
+
+        return x
+
+
+class SnakeBeta(nn.Module):
+    """
+    A modified Snake function which uses separate parameters for the magnitude of the periodic components
+    Shape:
+        - Input: (B, C, T)
+        - Output: (B, C, T), same shape as the input
+    Parameters:
+        - alpha - trainable parameter that controls frequency
+        - beta - trainable parameter that controls magnitude
+    References:
+        - This activation function is a modified version based on this paper by Liu Ziyin, Tilman Hartwig, Masahito Ueda:
+        https://arxiv.org/abs/2006.08195
+    Examples:
+        >>> a1 = snakebeta(256)
+        >>> x = torch.randn(256)
+        >>> x = a1(x)
+    """
+
+    def __init__(
+        self, in_features, alpha=1.0, alpha_trainable=True, alpha_logscale=False
+    ):
+        """
+        Initialization.
+        INPUT:
+            - in_features: shape of the input
+            - alpha - trainable parameter that controls frequency
+            - beta - trainable parameter that controls magnitude
+            alpha is initialized to 1 by default, higher values = higher-frequency.
+            beta is initialized to 1 by default, higher values = higher-magnitude.
+            alpha will be trained along with the rest of your model.
+        """
+        super(SnakeBeta, self).__init__()
+        self.in_features = in_features
+
+        # Initialize alpha
+        self.alpha_logscale = alpha_logscale
+        if self.alpha_logscale:  # Log scale alphas initialized to zeros
+            self.alpha = Parameter(torch.zeros(in_features) * alpha)
+            self.beta = Parameter(torch.zeros(in_features) * alpha)
+        else:  # Linear scale alphas initialized to ones
+            self.alpha = Parameter(torch.ones(in_features) * alpha)
+            self.beta = Parameter(torch.ones(in_features) * alpha)
+
+        self.alpha.requires_grad = alpha_trainable
+        self.beta.requires_grad = alpha_trainable
+
+        self.no_div_by_zero = 0.000000001
+
+    def forward(self, x):
+        """
+        Forward pass of the function.
+        Applies the function to the input elementwise.
+        SnakeBeta ∶= x + 1/b * sin^2 (xa)
+        """
+        alpha = self.alpha.unsqueeze(0).unsqueeze(-1)  # Line up with x to [B, C, T]
+        beta = self.beta.unsqueeze(0).unsqueeze(-1)
+        if self.alpha_logscale:
+            alpha = torch.exp(alpha)
+            beta = torch.exp(beta)
+        x = x + (1.0 / (beta + self.no_div_by_zero)) * pow(sin(x * alpha), 2)
+
+        return x
\ No newline at end of file
diff --git a/capspeech/nar/alias_free_activation/__init__.py b/capspeech/nar/alias_free_activation/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391
diff --git a/capspeech/nar/alias_free_activation/cuda/__init__.py b/capspeech/nar/alias_free_activation/cuda/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391
diff --git a/capspeech/nar/alias_free_activation/cuda/activation1d.py b/capspeech/nar/alias_free_activation/cuda/activation1d.py
new file mode 100644
index 0000000000000000000000000000000000000000..d51a7003a0e10e2645bdb853ffd6d2048e00e0e8
--- /dev/null
+++ b/capspeech/nar/alias_free_activation/cuda/activation1d.py
@@ -0,0 +1,77 @@
+# Copyright (c) 2024 NVIDIA CORPORATION.
+#   Licensed under the MIT license.
+
+import torch
+import torch.nn as nn
+from capspeech.nar.alias_free_activation.torch.resample import UpSample1d, DownSample1d
+
+# load fused CUDA kernel: this enables importing anti_alias_activation_cuda
+from capspeech.nar.alias_free_activation.cuda import load
+
+# Executed at import time: load.load() returns the extension module whose
+# forward() is called by FusedAntiAliasActivation below.
+anti_alias_activation_cuda = load.load()
+
+
+class FusedAntiAliasActivation(torch.autograd.Function):
+    """
+    Assumes filter size 12, replication padding on upsampling/downsampling, and logscale alpha/beta parameters as inputs.
+    The hyperparameters are hard-coded in the kernel to maximize speed.
+    NOTE: The fused kenrel is incorrect for Activation1d with different hyperparameters.
+    """
+
+    @staticmethod
+    def forward(ctx, inputs, up_ftr, down_ftr, alpha, beta):
+        activation_results = anti_alias_activation_cuda.forward(
+            inputs, up_ftr, down_ftr, alpha, beta
+        )
+
+        return activation_results
+
+    @staticmethod
+    def backward(ctx, output_grads):
+        raise NotImplementedError
+        return output_grads, None, None
+
+
+class Activation1d(nn.Module):
+    def __init__(
+        self,
+        activation,
+        up_ratio: int = 2,
+        down_ratio: int = 2,
+        up_kernel_size: int = 12,
+        down_kernel_size: int = 12,
+        fused: bool = True,
+    ):
+        super().__init__()
+        self.up_ratio = up_ratio
+        self.down_ratio = down_ratio
+        self.act = activation
+        self.upsample = UpSample1d(up_ratio, up_kernel_size)
+        self.downsample = DownSample1d(down_ratio, down_kernel_size)
+
+        self.fused = fused  # Whether to use fused CUDA kernel or not
+
+    def forward(self, x):
+        if not self.fused:
+            x = self.upsample(x)
+            x = self.act(x)
+            x = self.downsample(x)
+            return x
+        else:
+            if self.act.__class__.__name__ == "Snake":
+                beta = self.act.alpha.data  # Snake uses same params for alpha and beta
+            else:
+                beta = (
+                    self.act.beta.data
+                )  # Snakebeta uses different params for alpha and beta
+            alpha = self.act.alpha.data
+            if (
+                not self.act.alpha_logscale
+            ):  # Exp baked into cuda kernel, cancel it out with a log
+                alpha = torch.log(alpha)
+                beta = torch.log(beta)
+
+            x = FusedAntiAliasActivation.apply(
+                x, self.upsample.filter, self.downsample.lowpass.filter, alpha, beta
+            )
+            return x
diff --git a/capspeech/nar/alias_free_activation/cuda/anti_alias_activation.cpp b/capspeech/nar/alias_free_activation/cuda/anti_alias_activation.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..c5651f77143bd678169eb11564a7cf7a7969a59e
--- /dev/null
+++ b/capspeech/nar/alias_free_activation/cuda/anti_alias_activation.cpp
@@ -0,0 +1,23 @@
+/* coding=utf-8
+ * Copyright (c) 2024, NVIDIA CORPORATION.  All rights reserved.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+ #include <torch/extension.h>
+
+// Forward declaration of the entry point implemented in
+// anti_alias_activation_cuda.cu, so the binding below can reference it.
+extern "C" torch::Tensor fwd_cuda(torch::Tensor const &input, torch::Tensor const &up_filter, torch::Tensor const &down_filter, torch::Tensor const &alpha, torch::Tensor const &beta);
+
+// Expose fwd_cuda to Python under the name "forward" on the extension module.
+PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) {
+    m.def("forward", &fwd_cuda, "Anti-Alias Activation forward (CUDA)");
+}
\ No newline at end of file
diff --git a/capspeech/nar/alias_free_activation/cuda/anti_alias_activation_cuda.cu b/capspeech/nar/alias_free_activation/cuda/anti_alias_activation_cuda.cu
new file mode 100644
index 0000000000000000000000000000000000000000..8c442334869fe72d639ec203fa4fac07f96a0ee1
--- /dev/null
+++ b/capspeech/nar/alias_free_activation/cuda/anti_alias_activation_cuda.cu
@@ -0,0 +1,246 @@
+/* coding=utf-8
+ * Copyright (c) 2024, NVIDIA CORPORATION.  All rights reserved.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include <ATen/ATen.h>
+#include <cuda.h>
+#include <cuda_runtime.h>
+#include <cuda_fp16.h>
+#include <cuda_profiler_api.h>
+#include <ATen/cuda/CUDAContext.h>
+#include <torch/extension.h>
+#include "type_shim.h"
+#include <assert.h>
+#include <cfloat>
+#include <limits>
+#include <stdint.h>
+#include <c10/macros/Macros.h>
+
+namespace
+{
+    // Hard-coded hyperparameters shared by the kernel and its launcher below.
+    // NOTE(review): the comment that used to sit here mentioned WARP_SIZE,
+    // WARP_BATCH and batches_per_warp and broke off mid-sentence; none of those
+    // names appear in this file, so it looks like a leftover -- confirm and drop.
+    // Step of the final write-back loop in the kernel.
+    constexpr int ELEMENTS_PER_LDG_STG = 1; //(WARP_ITERATIONS < 4) ? 1 : 4;
+    // Number of consecutive output samples handled by one thread.
+    constexpr int BUFFER_SIZE = 32;
+    // Number of taps read from both the upsampling and the downsampling filter.
+    constexpr int FILTER_SIZE = 12;
+    constexpr int HALF_FILTER_SIZE = 6;
+    constexpr int UPSAMPLE_REPLICATION_PAD = 5; // 5 on each side, matching torch impl
+    constexpr int DOWNSAMPLE_REPLICATION_PAD_LEFT = 5; // matching torch impl
+    constexpr int DOWNSAMPLE_REPLICATION_PAD_RIGHT = 6; // matching torch impl
+
+    // Fused upsample -> snake activation -> downsample forward kernel.
+    //
+    // Each thread handles BUFFER_SIZE consecutive samples of one
+    // (batch, channel) row and works entirely on per-thread local arrays:
+    //   1. load the samples (plus halo) into `elements`, doubled and placed at
+    //      even indices, with replication padding at the sequence ends;
+    //   2. convolve with the upsampling filter into `intermediates`;
+    //   3. apply x += 1 / (beta + eps) * sin^2(x * alpha) in place;
+    //   4. replicate-pad `intermediates` and convolve with the downsampling
+    //      filter at stride 2;
+    //   5. write the in-range results to `dst`.
+    //
+    // alpha and beta are indexed by channel (blockIdx.y) and passed through
+    // expf before use.
+    // NOTE(review): `acc_t`, `batch_size` and `channels` are not referenced in
+    // the body; the accumulators are declared as `input_t`.
+    template <typename input_t, typename output_t, typename acc_t>
+    __global__ void anti_alias_activation_forward(
+        output_t *dst,
+        const input_t *src,
+        const input_t *up_ftr,
+        const input_t *down_ftr,
+        const input_t *alpha,
+        const input_t *beta,
+        int batch_size,
+        int channels,
+        int seq_len)
+    {
+        // Up and downsample filters
+        input_t up_filter[FILTER_SIZE];
+        input_t down_filter[FILTER_SIZE];
+
+        // Load data from global memory including extra indices reserved for replication paddings
+        input_t elements[2 * FILTER_SIZE + 2 * BUFFER_SIZE + 2 * UPSAMPLE_REPLICATION_PAD] = {0};
+        input_t intermediates[2 * FILTER_SIZE + 2 * BUFFER_SIZE + DOWNSAMPLE_REPLICATION_PAD_LEFT + DOWNSAMPLE_REPLICATION_PAD_RIGHT] = {0};
+
+        // Output stores downsampled output before writing to dst
+        output_t output[BUFFER_SIZE];
+
+        // blockDim/threadIdx = (128, 1, 1)
+        // gridDim/blockIdx = (seq_blocks, channels, batches)
+        // block_offset: start of this block's chunk within the flattened tensor
+        // (row start for this batch/channel plus 128 * BUFFER_SIZE per sequence block).
+        int block_offset = (blockIdx.x * 128 * BUFFER_SIZE + seq_len * (blockIdx.y + gridDim.y * blockIdx.z));
+        int local_offset = threadIdx.x * BUFFER_SIZE;
+        // seq_offset: position of this thread's first sample within its row.
+        int seq_offset = blockIdx.x * 128 * BUFFER_SIZE + local_offset;
+
+        // intermediate have double the seq_len
+        int intermediate_local_offset = threadIdx.x * BUFFER_SIZE * 2;
+        int intermediate_seq_offset = blockIdx.x * 128 * BUFFER_SIZE * 2 + intermediate_local_offset;
+
+        // Get values needed for replication padding before moving pointer
+        // (despite its name, right_most_pntr points at the START of the row).
+        const input_t *right_most_pntr = src + (seq_len * (blockIdx.y + gridDim.y * blockIdx.z));
+        input_t seq_left_most_value = right_most_pntr[0];
+        input_t seq_right_most_value = right_most_pntr[seq_len - 1];
+
+        // Move src and dst pointers
+        src += block_offset + local_offset;
+        dst += block_offset + local_offset;
+
+        // Alpha and beta values for snake activations. Applies exp by default
+        alpha = alpha + blockIdx.y;
+        input_t alpha_val = expf(alpha[0]);
+        beta = beta + blockIdx.y;
+        input_t beta_val = expf(beta[0]);
+
+        // Copy both filters into the per-thread arrays.
+        #pragma unroll
+        for (int it = 0; it < FILTER_SIZE; it += 1)
+        {
+            up_filter[it] = up_ftr[it];
+            down_filter[it] = down_ftr[it];
+        }
+
+        // Apply replication padding for upsampling, matching torch impl
+        // Only even slots of `elements` are written; odd slots keep their zero
+        // initialisation. Every stored value is multiplied by 2.
+        #pragma unroll
+        for (int it = -HALF_FILTER_SIZE; it < BUFFER_SIZE + HALF_FILTER_SIZE; it += 1)
+        {
+            int element_index = seq_offset + it; // index for element
+            if ((element_index < 0) && (element_index >= -UPSAMPLE_REPLICATION_PAD))
+            {
+                elements[2 * (HALF_FILTER_SIZE + it)] = 2 * seq_left_most_value;
+            }
+            if ((element_index >= seq_len) && (element_index < seq_len + UPSAMPLE_REPLICATION_PAD))
+            {
+                elements[2 * (HALF_FILTER_SIZE + it)] = 2 * seq_right_most_value;
+            }
+            if ((element_index >= 0) && (element_index < seq_len))
+            {
+                elements[2 * (HALF_FILTER_SIZE + it)] = 2 * src[it];
+            }
+        }
+
+        // Apply upsampling strided convolution and write to intermediates. It reserves DOWNSAMPLE_REPLICATION_PAD_LEFT for replication padding of the downsampling conv later
+        #pragma unroll
+        for (int it = 0; it < (2 * BUFFER_SIZE + 2 * FILTER_SIZE); it += 1)
+        {
+            input_t acc = 0.0;
+            int element_index = intermediate_seq_offset + it; // index for intermediate
+            #pragma unroll
+            for (int f_idx = 0; f_idx < FILTER_SIZE; f_idx += 1)
+            {
+                if ((element_index + f_idx) >= 0)
+                {
+                    acc += up_filter[f_idx] * elements[it + f_idx];
+                }
+            }
+            intermediates[it + DOWNSAMPLE_REPLICATION_PAD_LEFT] = acc;
+        }
+
+        // Apply activation function. It reserves DOWNSAMPLE_REPLICATION_PAD_LEFT and DOWNSAMPLE_REPLICATION_PAD_RIGHT for replication padding of the downsampling conv later
+        double no_div_by_zero = 0.000000001;
+        #pragma unroll
+        for (int it = 0; it < 2 * BUFFER_SIZE + 2 * FILTER_SIZE; it += 1)
+        {
+            // x += 1 / (beta + eps) * sin(x * alpha)^2
+            intermediates[it + DOWNSAMPLE_REPLICATION_PAD_LEFT] += (1.0 / (beta_val + no_div_by_zero)) * sinf(intermediates[it + DOWNSAMPLE_REPLICATION_PAD_LEFT] * alpha_val) * sinf(intermediates[it + DOWNSAMPLE_REPLICATION_PAD_LEFT] * alpha_val);
+        }
+
+        // Apply replication padding before downsampling conv from intermediates
+        // Left pad: copy the first computed intermediate into the reserved slots.
+        #pragma unroll
+        for (int it = 0; it < DOWNSAMPLE_REPLICATION_PAD_LEFT; it += 1)
+        {
+            intermediates[it] = intermediates[DOWNSAMPLE_REPLICATION_PAD_LEFT];
+        }
+        // Right pad: copy the last computed intermediate into the reserved slots.
+        #pragma unroll
+        for (int it = DOWNSAMPLE_REPLICATION_PAD_LEFT + 2 * BUFFER_SIZE + 2 * FILTER_SIZE; it < DOWNSAMPLE_REPLICATION_PAD_LEFT + 2 * BUFFER_SIZE + 2 * FILTER_SIZE + DOWNSAMPLE_REPLICATION_PAD_RIGHT; it += 1)
+        {
+            intermediates[it] = intermediates[DOWNSAMPLE_REPLICATION_PAD_LEFT + 2 * BUFFER_SIZE + 2 * FILTER_SIZE - 1];
+        }
+
+        // Apply downsample strided convolution (assuming stride=2) from intermediates
+        #pragma unroll
+        for (int it = 0; it < BUFFER_SIZE; it += 1)
+        {
+            input_t acc = 0.0;
+            #pragma unroll
+            for (int f_idx = 0; f_idx < FILTER_SIZE; f_idx += 1)
+            {
+                // Add constant DOWNSAMPLE_REPLICATION_PAD_RIGHT to match torch implementation
+                acc += down_filter[f_idx] * intermediates[it * 2 + f_idx + DOWNSAMPLE_REPLICATION_PAD_RIGHT];
+            }
+            output[it] = acc;
+        }
+
+        // Write output to dst
+        // Positions at or beyond seq_len are skipped.
+        #pragma unroll
+        for (int it = 0;  it < BUFFER_SIZE;  it += ELEMENTS_PER_LDG_STG)
+        {
+            int element_index = seq_offset + it;
+            if (element_index < seq_len)
+            {
+                dst[it] = output[it];
+            }
+        }
+
+    }
+
+    // Host-side launcher for anti_alias_activation_forward.
+    // Builds a (sequence blocks, channels, batches) grid of 128-thread blocks
+    // and launches the kernel on the current CUDA stream. Does nothing when
+    // seq_len is 0.
+    template <typename input_t, typename output_t, typename acc_t>
+    void dispatch_anti_alias_activation_forward(
+        output_t *dst,
+        const input_t *src,
+        const input_t *up_ftr,
+        const input_t *down_ftr,
+        const input_t *alpha,
+        const input_t *beta,
+        int batch_size,
+        int channels,
+        int seq_len)
+    {
+        // Empty sequence: nothing to launch.
+        if (seq_len == 0)
+        {
+            return;
+        }
+
+        // Use 128 threads per block to maximize gpu utilization; each block
+        // covers 4096 sequence positions.
+        constexpr int threads_per_block = 128;
+        constexpr int seq_len_per_block = 4096;
+
+        // Ceiling division: number of blocks needed along the sequence axis.
+        const int blocks_per_seq_len = (seq_len + seq_len_per_block - 1) / seq_len_per_block;
+
+        const dim3 grid(blocks_per_seq_len, channels, batch_size);
+        const dim3 block(threads_per_block, 1, 1);
+
+        anti_alias_activation_forward<input_t, output_t, acc_t>
+            <<<grid, block, 0, at::cuda::getCurrentCUDAStream()>>>(dst, src, up_ftr, down_ftr, alpha, beta, batch_size, channels, seq_len);
+    }
+}
+
+// Python-facing entry point (bound as "forward" in anti_alias_activation.cpp).
+// Allocates an output tensor with the same shape and options as `input`,
+// dispatches on the input's scalar type (float / half / bfloat16) and returns
+// the activated tensor.
+// NOTE(review): raw data pointers are handed to the kernel, and the filter,
+// alpha and beta pointers are reinterpreted using the INPUT's scalar type.
+// This presumably requires all five tensors to be contiguous CUDA tensors of
+// the same dtype -- nothing here checks that; confirm against the callers.
+extern "C" torch::Tensor fwd_cuda(torch::Tensor const &input, torch::Tensor const &up_filter, torch::Tensor const &down_filter, torch::Tensor const &alpha, torch::Tensor const &beta)
+{
+    // Input is a 3d tensor with dimensions [batches, channels, seq_len]
+    const int batches = input.size(0);
+    const int channels = input.size(1);
+    const int seq_len = input.size(2);
+
+    // Output
+    // Same options as the input, but never tracked by autograd.
+    auto act_options = input.options().requires_grad(false);
+
+    torch::Tensor anti_alias_activation_results =
+        torch::empty({batches, channels, seq_len}, act_options);
+
+    // Type-erased pointers; they are cast to scalar_t inside the dispatch macro.
+    void *input_ptr = static_cast<void *>(input.data_ptr());
+    void *up_filter_ptr = static_cast<void *>(up_filter.data_ptr());
+    void *down_filter_ptr = static_cast<void *>(down_filter.data_ptr());
+    void *alpha_ptr = static_cast<void *>(alpha.data_ptr());
+    void *beta_ptr = static_cast<void *>(beta.data_ptr());
+    void *anti_alias_activation_results_ptr = static_cast<void *>(anti_alias_activation_results.data_ptr());
+
+    // scalar_t is defined by the macro for the matching case of input.scalar_type().
+    DISPATCH_FLOAT_HALF_AND_BFLOAT(
+        input.scalar_type(),
+        "dispatch anti alias activation_forward",
+        dispatch_anti_alias_activation_forward<scalar_t, scalar_t, float>(
+            reinterpret_cast<scalar_t *>(anti_alias_activation_results_ptr),
+            reinterpret_cast<const scalar_t *>(input_ptr),
+            reinterpret_cast<const scalar_t *>(up_filter_ptr),
+            reinterpret_cast<const scalar_t *>(down_filter_ptr),
+            reinterpret_cast<const scalar_t *>(alpha_ptr),
+            reinterpret_cast<const scalar_t *>(beta_ptr),
+            batches,
+            channels,
+            seq_len););
+    return anti_alias_activation_results;
+}
\ No newline at end of file
diff --git a/capspeech/nar/alias_free_activation/cuda/compat.h b/capspeech/nar/alias_free_activation/cuda/compat.h
new file mode 100644
index 0000000000000000000000000000000000000000..25818b2edf4cb0dc9130e62c7c4de8d16a01baa5
--- /dev/null
+++ b/capspeech/nar/alias_free_activation/cuda/compat.h
@@ -0,0 +1,29 @@
+/* coding=utf-8
+ * Copyright (c) 2020, NVIDIA CORPORATION.  All rights reserved.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/* This code is copied from NVIDIA apex:
+ *     https://github.com/NVIDIA/apex
+ *     with minor changes. */
+
+// Fall back to AT_CHECK when TORCH_CHECK has not been defined yet.
+#ifndef TORCH_CHECK
+#define TORCH_CHECK AT_CHECK
+#endif
+
+// DATA_PTR expands to `data_ptr` when VERSION_GE_1_3 is defined and to `data`
+// otherwise.
+#ifdef VERSION_GE_1_3
+#define DATA_PTR data_ptr
+#else
+#define DATA_PTR data
+#endif
diff --git a/capspeech/nar/alias_free_activation/cuda/load.py b/capspeech/nar/alias_free_activation/cuda/load.py
new file mode 100644
index 0000000000000000000000000000000000000000..ca5d01de398249e75e9e2298958764acb436edba
--- /dev/null
+++ b/capspeech/nar/alias_free_activation/cuda/load.py
@@ -0,0 +1,86 @@
+# Copyright (c) 2024 NVIDIA CORPORATION.
+#   Licensed under the MIT license.
+
+import os
+import pathlib
+import subprocess
+
+from torch.utils import cpp_extension
+
+"""
+Setting this param to a list has a problem of generating different compilation commands (with diferent order of architectures) and leading to recompilation of fused kernels. 
+Set it to empty stringo avoid recompilation and assign arch flags explicity in extra_cuda_cflags below
+"""
+os.environ["TORCH_CUDA_ARCH_LIST"] = ""
+
+
+def load():
+    # Check if cuda 11 is installed for compute capability 8.0
+    cc_flag = []
+    _, bare_metal_major, _ = _get_cuda_bare_metal_version(cpp_extension.CUDA_HOME)
+    if int(bare_metal_major) >= 11:
+        cc_flag.append("-gencode")
+        cc_flag.append("arch=compute_80,code=sm_80")
+
+    # Build path
+    srcpath = pathlib.Path(__file__).parent.absolute()
+    buildpath = srcpath / "build"
+    _create_build_dir(buildpath)
+
+    # Helper function to build the kernels.
+    def _cpp_extention_load_helper(name, sources, extra_cuda_flags):
+        return cpp_extension.load(
+            name=name,
+            sources=sources,
+            build_directory=buildpath,
+            extra_cflags=[
+                "-O3",
+            ],
+            extra_cuda_cflags=[
+                "-O3",
+                "-gencode",
+                "arch=compute_70,code=sm_70",
+                "--use_fast_math",
+            ]
+            + extra_cuda_flags
+            + cc_flag,
+            verbose=True,
+        )
+
+    extra_cuda_flags = [
+        "-U__CUDA_NO_HALF_OPERATORS__",
+        "-U__CUDA_NO_HALF_CONVERSIONS__",
+        "--expt-relaxed-constexpr",
+        "--expt-extended-lambda",
+    ]
+
+    sources = [
+        srcpath / "anti_alias_activation.cpp",
+        srcpath / "anti_alias_activation_cuda.cu",
+    ]
+    anti_alias_activation_cuda = _cpp_extention_load_helper(
+        "anti_alias_activation_cuda", sources, extra_cuda_flags
+    )
+
+    return anti_alias_activation_cuda
+
+
+def _get_cuda_bare_metal_version(cuda_dir):
+    raw_output = subprocess.check_output(
+        [cuda_dir + "/bin/nvcc", "-V"], universal_newlines=True
+    )
+    output = raw_output.split()
+    release_idx = output.index("release") + 1
+    release = output[release_idx].split(".")
+    bare_metal_major = release[0]
+    bare_metal_minor = release[1][0]
+
+    return raw_output, bare_metal_major, bare_metal_minor
+
+
+def _create_build_dir(buildpath):
+    try:
+        os.mkdir(buildpath)
+    except OSError:
+        if not os.path.isdir(buildpath):
+            print(f"Creation of the build directory {buildpath} failed")
diff --git a/capspeech/nar/alias_free_activation/cuda/type_shim.h b/capspeech/nar/alias_free_activation/cuda/type_shim.h
new file mode 100644
index 0000000000000000000000000000000000000000..5db7e8a397e982d4d30d16ab6060814b98b7ab83
--- /dev/null
+++ b/capspeech/nar/alias_free_activation/cuda/type_shim.h
@@ -0,0 +1,92 @@
+/* coding=utf-8
+ * Copyright (c) 2020, NVIDIA CORPORATION.  All rights reserved.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include <ATen/ATen.h>
+#include "compat.h"
+
+/*
+ * Dispatch on a runtime at::ScalarType value TYPE.
+ * For Float, Half and BFloat16 the statement(s) passed as __VA_ARGS__ are
+ * expanded inside a scope where `scalar_t` aliases float, at::Half or
+ * at::BFloat16 respectively. Any other TYPE falls to the default case, which
+ * calls AT_ERROR with "<NAME> not implemented for '<type>'".
+ */
+#define DISPATCH_FLOAT_HALF_AND_BFLOAT(TYPE, NAME, ...)                 \
+	switch (TYPE)                                                       \
+	{                                                                   \
+	case at::ScalarType::Float:                                         \
+	{                                                                   \
+		using scalar_t = float;                                         \
+		__VA_ARGS__;                                                    \
+		break;                                                          \
+	}                                                                   \
+	case at::ScalarType::Half:                                          \
+	{                                                                   \
+		using scalar_t = at::Half;                                      \
+		__VA_ARGS__;                                                    \
+		break;                                                          \
+	}                                                                   \
+	case at::ScalarType::BFloat16:                                      \
+	{                                                                   \
+		using scalar_t = at::BFloat16;                                  \
+		__VA_ARGS__;                                                    \
+		break;                                                          \
+	}                                                                   \
+	default:                                                            \
+		AT_ERROR(#NAME, " not implemented for '", toString(TYPE), "'"); \
+	}
+
+/*
+ * Dispatch on a pair of runtime at::ScalarType values (TYPEIN, TYPEOUT).
+ * __VA_ARGS__ is expanded with `scalar_t_in` and `scalar_t_out` aliased to
+ * concrete types:
+ *   - TYPEIN Float: scalar_t_in is float and scalar_t_out follows TYPEOUT
+ *     (float, at::Half or at::BFloat16); other TYPEOUT values hit AT_ERROR.
+ *   - TYPEIN Half / BFloat16: both aliases are that same type and TYPEOUT is
+ *     not inspected.
+ *   - Any other TYPEIN hits AT_ERROR with
+ *     "<NAME> not implemented for '<type>'".
+ */
+#define DISPATCH_FLOAT_HALF_AND_BFLOAT_INOUT_TYPES(TYPEIN, TYPEOUT, NAME, ...) \
+	switch (TYPEIN)                                                            \
+	{                                                                          \
+	case at::ScalarType::Float:                                                \
+	{                                                                          \
+		using scalar_t_in = float;                                             \
+		switch (TYPEOUT)                                                       \
+		{                                                                      \
+		case at::ScalarType::Float:                                            \
+		{                                                                      \
+			using scalar_t_out = float;                                        \
+			__VA_ARGS__;                                                       \
+			break;                                                             \
+		}                                                                      \
+		case at::ScalarType::Half:                                             \
+		{                                                                      \
+			using scalar_t_out = at::Half;                                     \
+			__VA_ARGS__;                                                       \
+			break;                                                             \
+		}                                                                      \
+		case at::ScalarType::BFloat16:                                         \
+		{                                                                      \
+			using scalar_t_out = at::BFloat16;                                 \
+			__VA_ARGS__;                                                       \
+			break;                                                             \
+		}                                                                      \
+		default:                                                               \
+			AT_ERROR(#NAME, " not implemented for '", toString(TYPEOUT), "'"); \
+		}                                                                      \
+		break;                                                                 \
+	}                                                                          \
+	case at::ScalarType::Half:                                                 \
+	{                                                                          \
+		using scalar_t_in = at::Half;                                          \
+		using scalar_t_out = at::Half;                                         \
+		__VA_ARGS__;                                                           \
+		break;                                                                 \
+	}                                                                          \
+	case at::ScalarType::BFloat16:                                             \
+	{                                                                          \
+		using scalar_t_in = at::BFloat16;                                      \
+		using scalar_t_out = at::BFloat16;                                     \
+		__VA_ARGS__;                                                           \
+		break;                                                                 \
+	}                                                                          \
+	default:                                                                   \
+		AT_ERROR(#NAME, " not implemented for '", toString(TYPEIN), "'");      \
+	}
diff --git a/capspeech/nar/alias_free_activation/torch/__init__.py b/capspeech/nar/alias_free_activation/torch/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..8f756ed83f87f9839e457b240f60469bc187707d
--- /dev/null
+++ b/capspeech/nar/alias_free_activation/torch/__init__.py
@@ -0,0 +1,6 @@
+# Adapted from https://github.com/junjun3518/alias-free-torch under the Apache License 2.0
+#   LICENSE is in incl_licenses directory.
+
+from .filter import *
+from .resample import *
+from .act import *
diff --git a/capspeech/nar/alias_free_activation/torch/act.py b/capspeech/nar/alias_free_activation/torch/act.py
new file mode 100644
index 0000000000000000000000000000000000000000..7001f3dd2877308c046362afee6cdf9dbb983b9e
--- /dev/null
+++ b/capspeech/nar/alias_free_activation/torch/act.py
@@ -0,0 +1,30 @@
+# Adapted from https://github.com/junjun3518/alias-free-torch under the Apache License 2.0
+#   LICENSE is in incl_licenses directory.
+
+import torch.nn as nn
+from capspeech.nar.alias_free_activation.torch.resample import UpSample1d, DownSample1d
+
+
+class Activation1d(nn.Module):
+    def __init__(
+        self,
+        activation,
+        up_ratio: int = 2,
+        down_ratio: int = 2,
+        up_kernel_size: int = 12,
+        down_kernel_size: int = 12,
+    ):
+        super().__init__()
+        self.up_ratio = up_ratio
+        self.down_ratio = down_ratio
+        self.act = activation
+        self.upsample = UpSample1d(up_ratio, up_kernel_size)
+        self.downsample = DownSample1d(down_ratio, down_kernel_size)
+
+    # x: [B,C,T]
+    def forward(self, x):
+        x = self.upsample(x)
+        x = self.act(x)
+        x = self.downsample(x)
+
+        return x
diff --git a/capspeech/nar/alias_free_activation/torch/filter.py b/capspeech/nar/alias_free_activation/torch/filter.py
new file mode 100644
index 0000000000000000000000000000000000000000..0fa35b0d5ddf8d6cb04cd9d47364ca033cebcd32
--- /dev/null
+++ b/capspeech/nar/alias_free_activation/torch/filter.py
@@ -0,0 +1,101 @@
+# Adapted from https://github.com/junjun3518/alias-free-torch under the Apache License 2.0
+#   LICENSE is in incl_licenses directory.
+
+import torch
+import torch.nn as nn
+import torch.nn.functional as F
+import math
+
+if "sinc" in dir(torch):
+    sinc = torch.sinc
+else:
+    # This code is adopted from adefossez's julius.core.sinc under the MIT License
+    # https://adefossez.github.io/julius/julius/core.html
+    #   LICENSE is in incl_licenses directory.
+    def sinc(x: torch.Tensor):
+        """
+        Implementation of sinc, i.e. sin(pi * x) / (pi * x)
+        __Warning__: Different to julius.sinc, the input is multiplied by `pi`!
+        """
+        return torch.where(
+            x == 0,
+            torch.tensor(1.0, device=x.device, dtype=x.dtype),
+            torch.sin(math.pi * x) / math.pi / x,
+        )
+
+
+# This code is adopted from adefossez's julius.lowpass.LowPassFilters under the MIT License
+# https://adefossez.github.io/julius/julius/lowpass.html
+#   LICENSE is in incl_licenses directory.
+def kaiser_sinc_filter1d(
+    cutoff, half_width, kernel_size
+):  # return filter [1,1,kernel_size]
+    even = kernel_size % 2 == 0
+    half_size = kernel_size // 2
+
+    # For kaiser window
+    delta_f = 4 * half_width
+    A = 2.285 * (half_size - 1) * math.pi * delta_f + 7.95
+    if A > 50.0:
+        beta = 0.1102 * (A - 8.7)
+    elif A >= 21.0:
+        beta = 0.5842 * (A - 21) ** 0.4 + 0.07886 * (A - 21.0)
+    else:
+        beta = 0.0
+    window = torch.kaiser_window(kernel_size, beta=beta, periodic=False)
+
+    # ratio = 0.5/cutoff -> 2 * cutoff = 1 / ratio
+    if even:
+        time = torch.arange(-half_size, half_size) + 0.5
+    else:
+        time = torch.arange(kernel_size) - half_size
+    if cutoff == 0:
+        filter_ = torch.zeros_like(time)
+    else:
+        filter_ = 2 * cutoff * window * sinc(2 * cutoff * time)
+        """
+        Normalize filter to have sum = 1, otherwise we will have a small leakage of the constant component in the input signal.
+        """
+        filter_ /= filter_.sum()
+        filter = filter_.view(1, 1, kernel_size)
+
+    return filter
+
+
+class LowPassFilter1d(nn.Module):
+    def __init__(
+        self,
+        cutoff=0.5,
+        half_width=0.6,
+        stride: int = 1,
+        padding: bool = True,
+        padding_mode: str = "replicate",
+        kernel_size: int = 12,
+    ):
+        """
+        kernel_size should be even number for stylegan3 setup, in this implementation, odd number is also possible.
+        """
+        super().__init__()
+        if cutoff < -0.0:
+            raise ValueError("Minimum cutoff must be larger than zero.")
+        if cutoff > 0.5:
+            raise ValueError("A cutoff above 0.5 does not make sense.")
+        self.kernel_size = kernel_size
+        self.even = kernel_size % 2 == 0
+        self.pad_left = kernel_size // 2 - int(self.even)
+        self.pad_right = kernel_size // 2
+        self.stride = stride
+        self.padding = padding
+        self.padding_mode = padding_mode
+        filter = kaiser_sinc_filter1d(cutoff, half_width, kernel_size)
+        self.register_buffer("filter", filter)
+
+    # Input [B, C, T]
+    def forward(self, x):
+        _, C, _ = x.shape
+
+        if self.padding:
+            x = F.pad(x, (self.pad_left, self.pad_right), mode=self.padding_mode)
+        out = F.conv1d(x, self.filter.expand(C, -1, -1), stride=self.stride, groups=C)
+
+        return out
diff --git a/capspeech/nar/alias_free_activation/torch/resample.py b/capspeech/nar/alias_free_activation/torch/resample.py
new file mode 100644
index 0000000000000000000000000000000000000000..df4fd791e4affce2dba3c1bd3e5c678be7e5d0e0
--- /dev/null
+++ b/capspeech/nar/alias_free_activation/torch/resample.py
@@ -0,0 +1,58 @@
+# Adapted from https://github.com/junjun3518/alias-free-torch under the Apache License 2.0
+#   LICENSE is in incl_licenses directory.
+
+import torch.nn as nn
+from torch.nn import functional as F
+from capspeech.nar.alias_free_activation.torch.filter import LowPassFilter1d
+from capspeech.nar.alias_free_activation.torch.filter import kaiser_sinc_filter1d
+
+
+class UpSample1d(nn.Module):
+    def __init__(self, ratio=2, kernel_size=None):
+        super().__init__()
+        self.ratio = ratio
+        self.kernel_size = (
+            int(6 * ratio // 2) * 2 if kernel_size is None else kernel_size
+        )
+        self.stride = ratio
+        self.pad = self.kernel_size // ratio - 1
+        self.pad_left = self.pad * self.stride + (self.kernel_size - self.stride) // 2
+        self.pad_right = (
+            self.pad * self.stride + (self.kernel_size - self.stride + 1) // 2
+        )
+        filter = kaiser_sinc_filter1d(
+            cutoff=0.5 / ratio, half_width=0.6 / ratio, kernel_size=self.kernel_size
+        )
+        self.register_buffer("filter", filter)
+
+    # x: [B, C, T]
+    def forward(self, x):
+        _, C, _ = x.shape
+
+        x = F.pad(x, (self.pad, self.pad), mode="replicate")
+        x = self.ratio * F.conv_transpose1d(
+            x, self.filter.expand(C, -1, -1), stride=self.stride, groups=C
+        )
+        x = x[..., self.pad_left : -self.pad_right]
+
+        return x
+
+
+class DownSample1d(nn.Module):
+    def __init__(self, ratio=2, kernel_size=None):
+        super().__init__()
+        self.ratio = ratio
+        self.kernel_size = (
+            int(6 * ratio // 2) * 2 if kernel_size is None else kernel_size
+        )
+        self.lowpass = LowPassFilter1d(
+            cutoff=0.5 / ratio,
+            half_width=0.6 / ratio,
+            stride=ratio,
+            kernel_size=self.kernel_size,
+        )
+
+    def forward(self, x):
+        xx = self.lowpass(x)
+
+        return xx
diff --git a/capspeech/nar/bigvgan.py b/capspeech/nar/bigvgan.py
new file mode 100644
index 0000000000000000000000000000000000000000..d35ea071a6e3a8fbb99fde5eb30810a4938d772f
--- /dev/null
+++ b/capspeech/nar/bigvgan.py
@@ -0,0 +1,601 @@
+# Copyright (c) 2024 NVIDIA CORPORATION.
+#   Licensed under the MIT license.
+
+# Adapted from https://github.com/jik876/hifi-gan under the MIT license.
+#   LICENSE is in incl_licenses directory.
+
+import os
+import json
+from pathlib import Path
+from typing import Optional, Union, Dict
+
+import torch
+import torch.nn as nn
+from torch.nn import Conv1d, ConvTranspose1d
+from torch.nn.utils import weight_norm, remove_weight_norm
+from capspeech.nar import activations
+from capspeech.nar.alias_free_activation.torch.act import Activation1d as TorchActivation1d
+
+from huggingface_hub import PyTorchModelHubMixin, hf_hub_download
+import shutil
+# Adapted from https://github.com/jik876/hifi-gan under the MIT license.
+#   LICENSE is in incl_licenses directory.
+
+import glob
+import os
+import matplotlib
+import torch
+from torch.nn.utils import weight_norm
+
+matplotlib.use("Agg")
+import matplotlib.pylab as plt
+from capspeech.nar.meldataset import MAX_WAV_VALUE
+from scipy.io.wavfile import write
+
+
+def plot_spectrogram(spectrogram):
+    fig, ax = plt.subplots(figsize=(10, 2))
+    im = ax.imshow(spectrogram, aspect="auto", origin="lower", interpolation="none")
+    plt.colorbar(im, ax=ax)
+
+    fig.canvas.draw()
+    plt.close()
+
+    return fig
+
+
+def plot_spectrogram_clipped(spectrogram, clip_max=2.0):
+    fig, ax = plt.subplots(figsize=(10, 2))
+    im = ax.imshow(
+        spectrogram,
+        aspect="auto",
+        origin="lower",
+        interpolation="none",
+        vmin=1e-6,
+        vmax=clip_max,
+    )
+    plt.colorbar(im, ax=ax)
+
+    fig.canvas.draw()
+    plt.close()
+
+    return fig
+
+
+def init_weights(m, mean=0.0, std=0.01):
+    classname = m.__class__.__name__
+    if classname.find("Conv") != -1:
+        m.weight.data.normal_(mean, std)
+
+
+def apply_weight_norm(m):
+    classname = m.__class__.__name__
+    if classname.find("Conv") != -1:
+        weight_norm(m)
+
+
+def get_padding(kernel_size, dilation=1):
+    return int((kernel_size * dilation - dilation) / 2)
+
+
+def load_checkpoint(filepath, device):
+    assert os.path.isfile(filepath)
+    print(f"Loading '{filepath}'")
+    checkpoint_dict = torch.load(filepath, map_location=device)
+    print("Complete.")
+    return checkpoint_dict
+
+
+def save_checkpoint(filepath, obj):
+    print(f"Saving checkpoint to {filepath}")
+    torch.save(obj, filepath)
+    print("Complete.")
+
+
+def scan_checkpoint(cp_dir, prefix, renamed_file=None):
+    # Fallback to original scanning logic first
+    pattern = os.path.join(cp_dir, prefix + "????????")
+    cp_list = glob.glob(pattern)
+
+    if len(cp_list) > 0:
+        last_checkpoint_path = sorted(cp_list)[-1]
+        print(f"[INFO] Resuming from checkpoint: '{last_checkpoint_path}'")
+        return last_checkpoint_path
+
+    # If no pattern-based checkpoints are found, check for renamed file
+    if renamed_file:
+        renamed_path = os.path.join(cp_dir, renamed_file)
+        if os.path.isfile(renamed_path):
+            print(f"[INFO] Resuming from renamed checkpoint: '{renamed_file}'")
+            return renamed_path
+
+    return None
+
+
+def save_audio(audio, path, sr):
+    # wav: torch with 1d shape
+    audio = audio * MAX_WAV_VALUE
+    audio = audio.cpu().numpy().astype("int16")
+    write(path, sr, audio)
+
+class AttrDict(dict):
+    def __init__(self, *args, **kwargs):
+        super(AttrDict, self).__init__(*args, **kwargs)
+        self.__dict__ = self
+
+
+def build_env(config, config_name, path):
+    t_path = os.path.join(path, config_name)
+    if config != t_path:
+        os.makedirs(path, exist_ok=True)
+        shutil.copyfile(config, os.path.join(path, config_name))
+
+def load_hparams_from_json(path) -> AttrDict:
+    with open(path) as f:
+        data = f.read()
+    return AttrDict(json.loads(data))
+
+
+class AMPBlock1(torch.nn.Module):
+    """
+    AMPBlock applies Snake / SnakeBeta activation functions with trainable parameters that control periodicity, defined for each layer.
+    AMPBlock1 has additional self.convs2 that contains additional Conv1d layers with a fixed dilation=1 followed by each layer in self.convs1
+
+    Args:
+        h (AttrDict): Hyperparameters.
+        channels (int): Number of convolution channels.
+        kernel_size (int): Size of the convolution kernel. Default is 3.
+        dilation (tuple): Dilation rates for the convolutions. Each dilation layer has two convolutions. Default is (1, 3, 5).
+        activation (str): Activation function type. Should be either 'snake' or 'snakebeta'. Default is None.
+    """
+
+    def __init__(
+        self,
+        h: AttrDict,
+        channels: int,
+        kernel_size: int = 3,
+        dilation: tuple = (1, 3, 5),
+        activation: str = None,
+    ):
+        super().__init__()
+        
+        self.h = h
+
+        self.convs1 = nn.ModuleList(
+            [
+                weight_norm(
+                    Conv1d(
+                        channels,
+                        channels,
+                        kernel_size,
+                        stride=1,
+                        dilation=d,
+                        padding=get_padding(kernel_size, d),
+                    )
+                )
+                for d in dilation
+            ]
+        )
+        self.convs1.apply(init_weights)
+
+        self.convs2 = nn.ModuleList(
+            [
+                weight_norm(
+                    Conv1d(
+                        channels,
+                        channels,
+                        kernel_size,
+                        stride=1,
+                        dilation=1,
+                        padding=get_padding(kernel_size, 1),
+                    )
+                )
+                for _ in range(len(dilation))
+            ]
+        )
+        self.convs2.apply(init_weights)
+
+        self.num_layers = len(self.convs1) + len(
+            self.convs2
+        )  # Total number of conv layers
+
+        # Select which Activation1d, lazy-load cuda version to ensure backward compatibility
+        if self.h.get("use_cuda_kernel", False):
+            from alias_free_activation.cuda.activation1d import (
+                Activation1d as CudaActivation1d,
+            )
+
+            Activation1d = CudaActivation1d
+        else:
+            Activation1d = TorchActivation1d
+
+        # Activation functions
+        if activation == "snake":
+            self.activations = nn.ModuleList(
+                [
+                    Activation1d(
+                        activation=activations.Snake(
+                            channels, alpha_logscale=h.snake_logscale
+                        )
+                    )
+                    for _ in range(self.num_layers)
+                ]
+            )
+        elif activation == "snakebeta":
+            self.activations = nn.ModuleList(
+                [
+                    Activation1d(
+                        activation=activations.SnakeBeta(
+                            channels, alpha_logscale=h.snake_logscale
+                        )
+                    )
+                    for _ in range(self.num_layers)
+                ]
+            )
+        else:
+            raise NotImplementedError(
+                "activation incorrectly specified. check the config file and look for 'activation'."
+            )
+
+    def forward(self, x):
+        acts1, acts2 = self.activations[::2], self.activations[1::2]
+        for c1, c2, a1, a2 in zip(self.convs1, self.convs2, acts1, acts2):
+            xt = a1(x)
+            xt = c1(xt)
+            xt = a2(xt)
+            xt = c2(xt)
+            x = xt + x
+
+        return x
+
+    def remove_weight_norm(self):
+        for l in self.convs1:
+            remove_weight_norm(l)
+        for l in self.convs2:
+            remove_weight_norm(l)
+
+
+class AMPBlock2(torch.nn.Module):
+    """
+    AMPBlock applies Snake / SnakeBeta activation functions with trainable parameters that control periodicity, defined for each layer.
+    Unlike AMPBlock1, AMPBlock2 does not contain extra Conv1d layers with fixed dilation=1
+
+    Args:
+        h (AttrDict): Hyperparameters.
+        channels (int): Number of convolution channels.
+        kernel_size (int): Size of the convolution kernel. Default is 3.
+        dilation (tuple): Dilation rates for the convolutions. Each dilation layer has two convolutions. Default is (1, 3, 5).
+        activation (str): Activation function type. Should be either 'snake' or 'snakebeta'. Default is None.
+    """
+
+    def __init__(
+        self,
+        h: AttrDict,
+        channels: int,
+        kernel_size: int = 3,
+        dilation: tuple = (1, 3, 5),
+        activation: str = None,
+    ):
+        super().__init__()
+        
+        self.h = h
+
+        self.convs = nn.ModuleList(
+            [
+                weight_norm(
+                    Conv1d(
+                        channels,
+                        channels,
+                        kernel_size,
+                        stride=1,
+                        dilation=d,
+                        padding=get_padding(kernel_size, d),
+                    )
+                )
+                for d in dilation
+            ]
+        )
+        self.convs.apply(init_weights)
+
+        self.num_layers = len(self.convs)  # Total number of conv layers
+
+        # Select which Activation1d, lazy-load cuda version to ensure backward compatibility
+        if self.h.get("use_cuda_kernel", False):
+            from alias_free_activation.cuda.activation1d import (
+                Activation1d as CudaActivation1d,
+            )
+
+            Activation1d = CudaActivation1d
+        else:
+            Activation1d = TorchActivation1d
+
+        # Activation functions
+        if activation == "snake":
+            self.activations = nn.ModuleList(
+                [
+                    Activation1d(
+                        activation=activations.Snake(
+                            channels, alpha_logscale=h.snake_logscale
+                        )
+                    )
+                    for _ in range(self.num_layers)
+                ]
+            )
+        elif activation == "snakebeta":
+            self.activations = nn.ModuleList(
+                [
+                    Activation1d(
+                        activation=activations.SnakeBeta(
+                            channels, alpha_logscale=h.snake_logscale
+                        )
+                    )
+                    for _ in range(self.num_layers)
+                ]
+            )
+        else:
+            raise NotImplementedError(
+                "activation incorrectly specified. check the config file and look for 'activation'."
+            )
+
+    def forward(self, x):
+        for c, a in zip(self.convs, self.activations):
+            xt = a(x)
+            xt = c(xt)
+            x = xt + x
+        return x
+
+    def remove_weight_norm(self):
+        for l in self.convs:
+            remove_weight_norm(l)
+
+
+class BigVGAN(
+    torch.nn.Module,
+    PyTorchModelHubMixin,
+    library_name="bigvgan",
+    repo_url="https://github.com/NVIDIA/BigVGAN",
+    docs_url="https://github.com/NVIDIA/BigVGAN/blob/main/README.md",
+    pipeline_tag="audio-to-audio",
+    license="mit",
+    tags=["neural-vocoder", "audio-generation", "arxiv:2206.04658"],
+):
+    """
+    BigVGAN is a neural vocoder model that applies anti-aliased periodic activation for residual blocks (resblocks).
+    New in BigVGAN-v2: it can optionally use optimized CUDA kernels for AMP (anti-aliased multi-periodicity) blocks.
+
+    Args:
+        h (AttrDict): Hyperparameters.
+        use_cuda_kernel (bool): If set to True, loads optimized CUDA kernels for AMP. This should be used for inference only, as training is not supported with CUDA kernels.
+
+    Note:
+        - The `use_cuda_kernel` parameter should be used for inference only, as training with CUDA kernels is not supported.
+        - Ensure that the activation function is correctly specified in the hyperparameters (h.activation).
+    """
+
+    def __init__(self, h: AttrDict, use_cuda_kernel: bool = False):
+        super().__init__()
+        self.h = h
+        self.h["use_cuda_kernel"] = use_cuda_kernel
+
+        # Select which Activation1d, lazy-load cuda version to ensure backward compatibility
+        if self.h.get("use_cuda_kernel", False):
+            from alias_free_activation.cuda.activation1d import (
+                Activation1d as CudaActivation1d,
+            )
+
+            Activation1d = CudaActivation1d
+        else:
+            Activation1d = TorchActivation1d
+
+        self.num_kernels = len(h.resblock_kernel_sizes)
+        self.num_upsamples = len(h.upsample_rates)
+
+        # Pre-conv
+        self.conv_pre = weight_norm(
+            Conv1d(h.num_mels, h.upsample_initial_channel, 7, 1, padding=3)
+        )
+
+        # Define which AMPBlock to use. BigVGAN uses AMPBlock1 as default
+        if h.resblock == "1":
+            resblock_class = AMPBlock1
+        elif h.resblock == "2":
+            resblock_class = AMPBlock2
+        else:
+            raise ValueError(
+                f"Incorrect resblock class specified in hyperparameters. Got {h.resblock}"
+            )
+
+        # Transposed conv-based upsamplers. does not apply anti-aliasing
+        self.ups = nn.ModuleList()
+        for i, (u, k) in enumerate(zip(h.upsample_rates, h.upsample_kernel_sizes)):
+            self.ups.append(
+                nn.ModuleList(
+                    [
+                        weight_norm(
+                            ConvTranspose1d(
+                                h.upsample_initial_channel // (2**i),
+                                h.upsample_initial_channel // (2 ** (i + 1)),
+                                k,
+                                u,
+                                padding=(k - u) // 2,
+                            )
+                        )
+                    ]
+                )
+            )
+
+        # Residual blocks using anti-aliased multi-periodicity composition modules (AMP)
+        self.resblocks = nn.ModuleList()
+        for i in range(len(self.ups)):
+            ch = h.upsample_initial_channel // (2 ** (i + 1))
+            for j, (k, d) in enumerate(
+                zip(h.resblock_kernel_sizes, h.resblock_dilation_sizes)
+            ):
+                self.resblocks.append(
+                    resblock_class(h, ch, k, d, activation=h.activation)
+                )
+
+        # Post-conv
+        activation_post = (
+            activations.Snake(ch, alpha_logscale=h.snake_logscale)
+            if h.activation == "snake"
+            else (
+                activations.SnakeBeta(ch, alpha_logscale=h.snake_logscale)
+                if h.activation == "snakebeta"
+                else None
+            )
+        )
+        if activation_post is None:
+            raise NotImplementedError(
+                "activation incorrectly specified. check the config file and look for 'activation'."
+            )
+
+        self.activation_post = Activation1d(activation=activation_post)
+
+        # Whether to use bias for the final conv_post. Default to True for backward compatibility
+        self.use_bias_at_final = h.get("use_bias_at_final", True)
+        self.conv_post = weight_norm(
+            Conv1d(ch, 1, 7, 1, padding=3, bias=self.use_bias_at_final)
+        )
+
+        # Weight initialization
+        for i in range(len(self.ups)):
+            self.ups[i].apply(init_weights)
+        self.conv_post.apply(init_weights)
+
+        # Final tanh activation. Defaults to True for backward compatibility
+        self.use_tanh_at_final = h.get("use_tanh_at_final", True)
+
+    def forward(self, x):
+        # Pre-conv
+        x = self.conv_pre(x)
+
+        for i in range(self.num_upsamples):
+            # Upsampling
+            for i_up in range(len(self.ups[i])):
+                x = self.ups[i][i_up](x)
+            # AMP blocks
+            xs = None
+            for j in range(self.num_kernels):
+                if xs is None:
+                    xs = self.resblocks[i * self.num_kernels + j](x)
+                else:
+                    xs += self.resblocks[i * self.num_kernels + j](x)
+            x = xs / self.num_kernels
+
+        # Post-conv
+        x = self.activation_post(x)
+        x = self.conv_post(x)
+        # Final tanh activation
+        if self.use_tanh_at_final:
+            x = torch.tanh(x)
+        else:
+            x = torch.clamp(x, min=-1.0, max=1.0)  # Bound the output to [-1, 1]
+
+        return x
+
+    def remove_weight_norm(self):
+        try:
+            print("Removing weight norm...")
+            for l in self.ups:
+                for l_i in l:
+                    remove_weight_norm(l_i)
+            for l in self.resblocks:
+                l.remove_weight_norm()
+            remove_weight_norm(self.conv_pre)
+            remove_weight_norm(self.conv_post)
+        except ValueError:
+            print("[INFO] Model already removed weight norm. Skipping!")
+            pass
+
+    # Additional methods for huggingface_hub support
+    def _save_pretrained(self, save_directory: Path) -> None:
+        """Save weights and config.json from a Pytorch model to a local directory."""
+
+        model_path = save_directory / "bigvgan_generator.pt"
+        torch.save({"generator": self.state_dict()}, model_path)
+
+        config_path = save_directory / "config.json"
+        with open(config_path, "w") as config_file:
+            json.dump(self.h, config_file, indent=4)
+
+    @classmethod
+    def _from_pretrained(
+        cls,
+        *,
+        model_id: str,
+        revision: str,
+        cache_dir: str,
+        force_download: bool,
+        proxies: Optional[Dict],
+        resume_download: bool,
+        local_files_only: bool,
+        token: Union[str, bool, None],
+        map_location: str = "cpu",  # Additional argument
+        strict: bool = False,  # Additional argument
+        use_cuda_kernel: bool = False,
+        **model_kwargs,
+    ):
+        """Load Pytorch pretrained weights and return the loaded model."""
+
+        # Download and load hyperparameters (h) used by BigVGAN
+        if os.path.isdir(model_id):
+            print("Loading config.json from local directory")
+            config_file = os.path.join(model_id, "config.json")
+        else:
+            config_file = hf_hub_download(
+                repo_id=model_id,
+                filename="config.json",
+                revision=revision,
+                cache_dir=cache_dir,
+                force_download=force_download,
+                proxies=proxies,
+                resume_download=resume_download,
+                token=token,
+                local_files_only=local_files_only,
+            )
+        h = load_hparams_from_json(config_file)
+
+        # instantiate BigVGAN using h
+        if use_cuda_kernel:
+            print(
+                f"[WARNING] You have specified use_cuda_kernel=True during BigVGAN.from_pretrained(). Only inference is supported (training is not implemented)!"
+            )
+            print(
+                f"[WARNING] You need nvcc and ninja installed in your system that matches your PyTorch build is using to build the kernel. If not, the model will fail to initialize or generate incorrect waveform!"
+            )
+            print(
+                f"[WARNING] For detail, see the official GitHub repository: https://github.com/NVIDIA/BigVGAN?tab=readme-ov-file#using-custom-cuda-kernel-for-synthesis"
+            )
+        model = cls(h, use_cuda_kernel=use_cuda_kernel)
+
+        # Download and load pretrained generator weight
+        if os.path.isdir(model_id):
+            print("Loading weights from local directory")
+            model_file = os.path.join(model_id, "bigvgan_generator.pt")
+        else:
+            print(f"Loading weights from {model_id}")
+            model_file = hf_hub_download(
+                repo_id=model_id,
+                filename="bigvgan_generator.pt",
+                revision=revision,
+                cache_dir=cache_dir,
+                force_download=force_download,
+                proxies=proxies,
+                resume_download=resume_download,
+                token=token,
+                local_files_only=local_files_only,
+            )
+
+        checkpoint_dict = torch.load(model_file, map_location=map_location)
+
+        try:
+            model.load_state_dict(checkpoint_dict["generator"])
+        except RuntimeError:
+            print(
+                f"[INFO] the pretrained checkpoint does not contain weight norm. Loading the checkpoint after removing weight norm!"
+            )
+            model.remove_weight_norm()
+            model.load_state_dict(checkpoint_dict["generator"])
+
+        return model
\ No newline at end of file
diff --git a/capspeech/nar/configs/__init__.py b/capspeech/nar/configs/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391
diff --git a/capspeech/nar/configs/finetune_acccaptts.yaml b/capspeech/nar/configs/finetune_acccaptts.yaml
new file mode 100644
index 0000000000000000000000000000000000000000..a7b9c263c5fb0ec1be4f21ebdd71350024765558
--- /dev/null
+++ b/capspeech/nar/configs/finetune_acccaptts.yaml
@@ -0,0 +1,71 @@
+model_name: finetune_acccaptts
+
+model:
+  dim: 1024
+  depth: 24
+  heads: 16
+  ff_mult: 2
+  text_dim: 512
+  # disable convnext in text embedding
+  conv_layers: 0
+  # phoneme vocab size
+  text_num_embeds: 200
+  mel_dim: 100
+  t5_dim: 1024
+  clap_dim: 512
+  # disable use_checkpoint on A100
+  use_checkpoint: false
+  qk_norm: true
+  skip: true
+
+mel:
+  target_sample_rate: 24000
+  n_mel_channels: 100
+  hop_length: 256
+
+opt:
+  learning_rate: 2.0e-05
+  beta1: 0.9
+  beta2: 0.999
+  weight_decay: 0.01
+  adam_epsilon: 1.0e-08
+  grad_clip: 1.0
+  batch_size: 64
+  accumulation_steps: 1
+  # mask_range: [0.7, 1.0]
+  drop_spk: 0.1
+  drop_text: 0.5
+
+  lr_scheduler:
+    warmup_steps: 1000
+    decay_steps: 100000
+    end_factor: 1.0e-02
+
+data:
+  trainset:
+    dataset_dir: "" # your processed path
+    clap_emb_dir: "./data/clap_embs/"
+    t5_folder_name: "t5"
+    phn_folder_name: "g2p"
+    manifest_name: "manifest"
+    json_name: "jsons"
+    dynamic_batching: true
+    text_pad_token: -1
+    audio_pad_token: 0.0
+    split: "train_SFT_AccCapTTS"
+    sr: 24000
+    norm_audio: false
+
+  valset:
+    dataset_dir: "" # your processed path
+    clap_emb_dir: "./data/clap_embs/"
+    t5_folder_name: "t5"
+    phn_folder_name: "g2p"
+    manifest_name: "manifest"
+    json_name: "jsons"
+    dynamic_batching: true
+    text_pad_token: -1
+    audio_pad_token: 0.0
+    split: "validation_SFT_AccCapTTS"
+    sr: 24000
+    norm_audio: false
diff --git a/capspeech/nar/configs/finetune_agenttts.yaml b/capspeech/nar/configs/finetune_agenttts.yaml
new file mode 100644
index 0000000000000000000000000000000000000000..cc0555d6d0125e0549244f7b1dea19ee56e2dc65
--- /dev/null
+++ b/capspeech/nar/configs/finetune_agenttts.yaml
@@ -0,0 +1,71 @@
+model_name: finetune_agenttts
+
+model:
+  dim: 1024
+  depth: 24
+  heads: 16
+  ff_mult: 2
+  text_dim: 512
+  # disable convnext in text embedding
+  conv_layers: 0
+  # phoneme vocab size
+  text_num_embeds: 200
+  mel_dim: 100
+  t5_dim: 1024
+  clap_dim: 512
+  # disable use_checkpoint on A100
+  use_checkpoint: false
+  qk_norm: true
+  skip: true
+
+mel:
+  target_sample_rate: 24000
+  n_mel_channels: 100
+  hop_length: 256
+
+opt:
+  learning_rate: 2.0e-05
+  beta1: 0.9
+  beta2: 0.999
+  weight_decay: 0.01
+  adam_epsilon: 1.0e-08
+  grad_clip: 1.0
+  batch_size: 64
+  accumulation_steps: 1
+  # mask_range: [0.7, 1.0]
+  drop_spk: 0.1
+  drop_text: 0.5
+
+  lr_scheduler:
+    warmup_steps: 250
+    decay_steps: 10000
+    end_factor: 1.0e-02
+
+data:
+  trainset:
+    dataset_dir: "" # your processed path
+    clap_emb_dir: "./data/clap_embs/"
+    t5_folder_name: "t5"
+    phn_folder_name: "g2p"
+    manifest_name: "manifest"
+    json_name: "jsons"
+    dynamic_batching: true
+    text_pad_token: -1
+    audio_pad_token: 0.0
+    split: "train_AgentDB"
+    sr: 24000
+    norm_audio: false
+
+  valset:
+    dataset_dir: "" # your processed path
+    clap_emb_dir: "./data/clap_embs/"
+    t5_folder_name: "t5"
+    phn_folder_name: "g2p"
+    manifest_name: "manifest"
+    json_name: "jsons"
+    dynamic_batching: true
+    text_pad_token: -1
+    audio_pad_token: 0.0
+    split: "test_AgentDB"
+    sr: 24000
+    norm_audio: false
diff --git a/capspeech/nar/configs/finetune_captts.yaml b/capspeech/nar/configs/finetune_captts.yaml
new file mode 100644
index 0000000000000000000000000000000000000000..c3e0529eb53bf46bb42f509e4186b08805e0c6be
--- /dev/null
+++ b/capspeech/nar/configs/finetune_captts.yaml
@@ -0,0 +1,71 @@
+model_name: finetune_captts
+
+model:
+  dim: 1024
+  depth: 24
+  heads: 16
+  ff_mult: 2
+  text_dim: 512
+  # disable convnext in text embedding
+  conv_layers: 0
+  # phoneme vocab size
+  text_num_embeds: 200
+  mel_dim: 100
+  t5_dim: 1024
+  clap_dim: 512
+  # disable use_checkpoint on A100
+  use_checkpoint: false
+  qk_norm: true
+  skip: true
+
+mel:
+  target_sample_rate: 24000
+  n_mel_channels: 100
+  hop_length: 256
+
+opt:
+  learning_rate: 2.0e-05
+  beta1: 0.9
+  beta2: 0.999
+  weight_decay: 0.01
+  adam_epsilon: 1.0e-08
+  grad_clip: 1.0
+  batch_size: 64
+  accumulation_steps: 1
+  # mask_range: [0.7, 1.0]
+  drop_spk: 0.1
+  drop_text: 0.5
+
+  lr_scheduler:
+    warmup_steps: 1000
+    decay_steps: 100000
+    end_factor: 1.0e-02
+
+data:
+  trainset:
+    dataset_dir: "" # your processed path
+    clap_emb_dir: "./data/clap_embs/"
+    t5_folder_name: "t5"
+    phn_folder_name: "g2p"
+    manifest_name: "manifest"
+    json_name: "jsons"
+    dynamic_batching: true
+    text_pad_token: -1
+    audio_pad_token: 0.0
+    split: "train_SFT_CapTTS"
+    sr: 24000
+    norm_audio: false
+
+  valset:
+    dataset_dir: "" # your processed path
+    clap_emb_dir: "./data/clap_embs/"
+    t5_folder_name: "t5"
+    phn_folder_name: "g2p"
+    manifest_name: "manifest"
+    json_name: "jsons"
+    dynamic_batching: true
+    text_pad_token: -1
+    audio_pad_token: 0.0
+    split: "validation_SFT_CapTTS"
+    sr: 24000
+    norm_audio: false
diff --git a/capspeech/nar/configs/finetune_capttsse.yaml b/capspeech/nar/configs/finetune_capttsse.yaml
new file mode 100644
index 0000000000000000000000000000000000000000..b5abdca043c5d5bc27233846c0ca7656b66030c0
--- /dev/null
+++ b/capspeech/nar/configs/finetune_capttsse.yaml
@@ -0,0 +1,71 @@
+model_name: finetune_capttssedb
+
+model:
+  dim: 1024
+  depth: 24
+  heads: 16
+  ff_mult: 2
+  text_dim: 512
+  # disable convnext in text embedding
+  conv_layers: 0
+  # phoneme vocab size
+  text_num_embeds: 200
+  mel_dim: 100
+  t5_dim: 1024
+  clap_dim: 512
+  # disable use_checkpoint on A100
+  use_checkpoint: false
+  qk_norm: true
+  skip: true
+
+mel:
+  target_sample_rate: 24000
+  n_mel_channels: 100
+  hop_length: 256
+
+opt:
+  learning_rate: 1.0e-05
+  beta1: 0.9
+  beta2: 0.999
+  weight_decay: 0.01
+  adam_epsilon: 1.0e-08
+  grad_clip: 1.0
+  batch_size: 64
+  accumulation_steps: 1
+  # mask_range: [0.7, 1.0]
+  drop_spk: 0.1
+  drop_text: 0.5
+
+  lr_scheduler:
+    warmup_steps: 50
+    decay_steps: 1000
+    end_factor: 1.0e-02
+
+data:
+  trainset:
+    dataset_dir: "" # your processed path
+    clap_emb_dir: "./data/clap_embs/"
+    t5_folder_name: "t5"
+    phn_folder_name: "g2p"
+    manifest_name: "manifest"
+    json_name: "jsons"
+    dynamic_batching: true
+    text_pad_token: -1
+    audio_pad_token: 0.0
+    split: "train_SEDB"
+    sr: 24000
+    norm_audio: false
+
+  valset:
+    dataset_dir: "" # your processed path
+    clap_emb_dir: "./data/clap_embs/"
+    t5_folder_name: "t5"
+    phn_folder_name: "g2p"
+    manifest_name: "manifest"
+    json_name: "jsons"
+    dynamic_batching: true
+    text_pad_token: -1
+    audio_pad_token: 0.0
+    split: "test_SEDB"
+    sr: 24000
+    norm_audio: false
diff --git a/capspeech/nar/configs/finetune_emocaptts.yaml b/capspeech/nar/configs/finetune_emocaptts.yaml
new file mode 100644
index 0000000000000000000000000000000000000000..451eb7c23a21136460ce96f41a5f8f1d3e4dc822
--- /dev/null
+++ b/capspeech/nar/configs/finetune_emocaptts.yaml
@@ -0,0 +1,71 @@
+model_name: finetune_emocaptts
+
+model:
+  dim: 1024
+  depth: 24
+  heads: 16
+  ff_mult: 2
+  text_dim: 512
+  # disable convnext in text embedding
+  conv_layers: 0
+  # phoneme vocab size
+  text_num_embeds: 200
+  mel_dim: 100
+  t5_dim: 1024
+  clap_dim: 512
+  # disable use_checkpoint on A100
+  use_checkpoint: false
+  qk_norm: true
+  skip: true
+
+mel:
+  target_sample_rate: 24000
+  n_mel_channels: 100
+  hop_length: 256
+
+opt:
+  learning_rate: 2.0e-05
+  beta1: 0.9
+  beta2: 0.999
+  weight_decay: 0.01
+  adam_epsilon: 1.0e-08
+  grad_clip: 1.0
+  batch_size: 64
+  accumulation_steps: 1
+  # mask_range: [0.7, 1.0]
+  drop_spk: 0.1
+  drop_text: 0.5
+
+  lr_scheduler:
+    warmup_steps: 1000
+    decay_steps: 100000
+    end_factor: 1.0e-02
+
+data:
+  trainset:
+    dataset_dir: "" # your processed path
+    clap_emb_dir: "./data/clap_embs/"
+    t5_folder_name: "t5"
+    phn_folder_name: "g2p"
+    manifest_name: "manifest"
+    json_name: "jsons"
+    dynamic_batching: true
+    text_pad_token: -1
+    audio_pad_token: 0.0
+    split: "train_SFT_EmoCapTTS"
+    sr: 24000
+    norm_audio: false
+
+  valset:
+    dataset_dir: "" # your processed path
+    clap_emb_dir: "./data/clap_embs/"
+    t5_folder_name: "t5"
+    phn_folder_name: "g2p"
+    manifest_name: "manifest"
+    json_name: "jsons"
+    dynamic_batching: true
+    text_pad_token: -1
+    audio_pad_token: 0.0
+    split: "validation_SFT_EmoCapTTS"
+    sr: 24000
+    norm_audio: false
diff --git a/capspeech/nar/configs/pretrain.yaml b/capspeech/nar/configs/pretrain.yaml
new file mode 100644
index 0000000000000000000000000000000000000000..f04f3cbefd865b48eefbcbe1547a3fff3323bfd0
--- /dev/null
+++ b/capspeech/nar/configs/pretrain.yaml
@@ -0,0 +1,71 @@
+model_name: pretrain
+
+model:
+  dim: 1024
+  depth: 24
+  heads: 16
+  ff_mult: 4  # NOTE(review): the finetune_*.yaml configs use ff_mult: 2 — confirm which value is intended
+  text_dim: 512
+  # disable convnext in text embedding
+  conv_layers: 0
+  # phoneme vocab size
+  text_num_embeds: 200
+  mel_dim: 100
+  t5_dim: 1024
+  clap_dim: 512
+  # disable use_checkpoint on A100
+  use_checkpoint: false
+  qk_norm: true
+  skip: true
+
+mel:
+  target_sample_rate: 24000
+  n_mel_channels: 100
+  hop_length: 256
+
+opt:
+  learning_rate: 2.0e-04
+  beta1: 0.9
+  beta2: 0.999
+  weight_decay: 0.01
+  adam_epsilon: 1.0e-08
+  grad_clip: 1.0
+  batch_size: 64
+  accumulation_steps: 1
+  # mask_range: [0.7, 1.0]
+  drop_spk: 0.1
+  drop_text: 0.5
+
+  lr_scheduler:
+    warmup_steps: 5000
+    decay_steps: 150000
+    end_factor: 1.0e-02
+
+data:
+  trainset:
+    dataset_dir: "" # your processed path
+    clap_emb_dir: "./data/clap_embs/"
+    t5_folder_name: "t5"
+    phn_folder_name: "g2p"
+    manifest_name: "manifest"
+    json_name: "jsons"
+    dynamic_batching: true
+    text_pad_token: -1
+    audio_pad_token: 0.0
+    split: "train_PT"
+    sr: 24000
+    norm_audio: false
+
+  valset:
+    dataset_dir: "" # your processed path
+    clap_emb_dir: "./data/clap_embs/"
+    t5_folder_name: "t5"
+    phn_folder_name: "g2p"
+    manifest_name: "manifest"
+    json_name: "jsons"
+    dynamic_batching: true
+    text_pad_token: -1
+    audio_pad_token: 0.0
+    split: "validation_PT"
+    sr: 24000
+    norm_audio: false
diff --git a/capspeech/nar/data/Emilia_ZH_EN_pinyin/__init__.py b/capspeech/nar/data/Emilia_ZH_EN_pinyin/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391
diff --git a/capspeech/nar/data/Emilia_ZH_EN_pinyin/vocab.txt b/capspeech/nar/data/Emilia_ZH_EN_pinyin/vocab.txt
new file mode 100644
index 0000000000000000000000000000000000000000..a30a90c12e1ab38b95c97770d5c5cd1d03c392e2
--- /dev/null
+++ b/capspeech/nar/data/Emilia_ZH_EN_pinyin/vocab.txt
@@ -0,0 +1,2545 @@
+ 
+!
+"
+#
+$
+%
+&
+'
+(
+)
+*
++
+,
+-
+.
+/
+0
+1
+2
+3
+4
+5
+6
+7
+8
+9
+:
+;
+=
+>
+?
+@
+A
+B
+C
+D
+E
+F
+G
+H
+I
+J
+K
+L
+M
+N
+O
+P
+Q
+R
+S
+T
+U
+V
+W
+X
+Y
+Z
+[
+\
+]
+_
+a
+a1
+ai1
+ai2
+ai3
+ai4
+an1
+an3
+an4
+ang1
+ang2
+ang4
+ao1
+ao2
+ao3
+ao4
+b
+ba
+ba1
+ba2
+ba3
+ba4
+bai1
+bai2
+bai3
+bai4
+ban1
+ban2
+ban3
+ban4
+bang1
+bang2
+bang3
+bang4
+bao1
+bao2
+bao3
+bao4
+bei
+bei1
+bei2
+bei3
+bei4
+ben1
+ben2
+ben3
+ben4
+beng
+beng1
+beng2
+beng3
+beng4
+bi1
+bi2
+bi3
+bi4
+bian1
+bian2
+bian3
+bian4
+biao1
+biao2
+biao3
+bie1
+bie2
+bie3
+bie4
+bin1
+bin4
+bing1
+bing2
+bing3
+bing4
+bo
+bo1
+bo2
+bo3
+bo4
+bu2
+bu3
+bu4
+c
+ca1
+cai1
+cai2
+cai3
+cai4
+can1
+can2
+can3
+can4
+cang1
+cang2
+cao1
+cao2
+cao3
+ce4
+cen1
+cen2
+ceng1
+ceng2
+ceng4
+cha1
+cha2
+cha3
+cha4
+chai1
+chai2
+chan1
+chan2
+chan3
+chan4
+chang1
+chang2
+chang3
+chang4
+chao1
+chao2
+chao3
+che1
+che2
+che3
+che4
+chen1
+chen2
+chen3
+chen4
+cheng1
+cheng2
+cheng3
+cheng4
+chi1
+chi2
+chi3
+chi4
+chong1
+chong2
+chong3
+chong4
+chou1
+chou2
+chou3
+chou4
+chu1
+chu2
+chu3
+chu4
+chua1
+chuai1
+chuai2
+chuai3
+chuai4
+chuan1
+chuan2
+chuan3
+chuan4
+chuang1
+chuang2
+chuang3
+chuang4
+chui1
+chui2
+chun1
+chun2
+chun3
+chuo1
+chuo4
+ci1
+ci2
+ci3
+ci4
+cong1
+cong2
+cou4
+cu1
+cu4
+cuan1
+cuan2
+cuan4
+cui1
+cui3
+cui4
+cun1
+cun2
+cun4
+cuo1
+cuo2
+cuo4
+d
+da
+da1
+da2
+da3
+da4
+dai1
+dai2
+dai3
+dai4
+dan1
+dan2
+dan3
+dan4
+dang1
+dang2
+dang3
+dang4
+dao1
+dao2
+dao3
+dao4
+de
+de1
+de2
+dei3
+den4
+deng1
+deng2
+deng3
+deng4
+di1
+di2
+di3
+di4
+dia3
+dian1
+dian2
+dian3
+dian4
+diao1
+diao3
+diao4
+die1
+die2
+die4
+ding1
+ding2
+ding3
+ding4
+diu1
+dong1
+dong3
+dong4
+dou1
+dou2
+dou3
+dou4
+du1
+du2
+du3
+du4
+duan1
+duan2
+duan3
+duan4
+dui1
+dui4
+dun1
+dun3
+dun4
+duo1
+duo2
+duo3
+duo4
+e
+e1
+e2
+e3
+e4
+ei2
+en1
+en4
+er
+er2
+er3
+er4
+f
+fa1
+fa2
+fa3
+fa4
+fan1
+fan2
+fan3
+fan4
+fang1
+fang2
+fang3
+fang4
+fei1
+fei2
+fei3
+fei4
+fen1
+fen2
+fen3
+fen4
+feng1
+feng2
+feng3
+feng4
+fo2
+fou2
+fou3
+fu1
+fu2
+fu3
+fu4
+g
+ga1
+ga2
+ga3
+ga4
+gai1
+gai2
+gai3
+gai4
+gan1
+gan2
+gan3
+gan4
+gang1
+gang2
+gang3
+gang4
+gao1
+gao2
+gao3
+gao4
+ge1
+ge2
+ge3
+ge4
+gei2
+gei3
+gen1
+gen2
+gen3
+gen4
+geng1
+geng3
+geng4
+gong1
+gong3
+gong4
+gou1
+gou2
+gou3
+gou4
+gu
+gu1
+gu2
+gu3
+gu4
+gua1
+gua2
+gua3
+gua4
+guai1
+guai2
+guai3
+guai4
+guan1
+guan2
+guan3
+guan4
+guang1
+guang2
+guang3
+guang4
+gui1
+gui2
+gui3
+gui4
+gun3
+gun4
+guo1
+guo2
+guo3
+guo4
+h
+ha1
+ha2
+ha3
+hai1
+hai2
+hai3
+hai4
+han1
+han2
+han3
+han4
+hang1
+hang2
+hang4
+hao1
+hao2
+hao3
+hao4
+he1
+he2
+he4
+hei1
+hen2
+hen3
+hen4
+heng1
+heng2
+heng4
+hong1
+hong2
+hong3
+hong4
+hou1
+hou2
+hou3
+hou4
+hu1
+hu2
+hu3
+hu4
+hua1
+hua2
+hua4
+huai2
+huai4
+huan1
+huan2
+huan3
+huan4
+huang1
+huang2
+huang3
+huang4
+hui1
+hui2
+hui3
+hui4
+hun1
+hun2
+hun4
+huo
+huo1
+huo2
+huo3
+huo4
+i
+j
+ji1
+ji2
+ji3
+ji4
+jia
+jia1
+jia2
+jia3
+jia4
+jian1
+jian2
+jian3
+jian4
+jiang1
+jiang2
+jiang3
+jiang4
+jiao1
+jiao2
+jiao3
+jiao4
+jie1
+jie2
+jie3
+jie4
+jin1
+jin2
+jin3
+jin4
+jing1
+jing2
+jing3
+jing4
+jiong3
+jiu1
+jiu2
+jiu3
+jiu4
+ju1
+ju2
+ju3
+ju4
+juan1
+juan2
+juan3
+juan4
+jue1
+jue2
+jue4
+jun1
+jun4
+k
+ka1
+ka2
+ka3
+kai1
+kai2
+kai3
+kai4
+kan1
+kan2
+kan3
+kan4
+kang1
+kang2
+kang4
+kao1
+kao2
+kao3
+kao4
+ke1
+ke2
+ke3
+ke4
+ken3
+keng1
+kong1
+kong3
+kong4
+kou1
+kou2
+kou3
+kou4
+ku1
+ku2
+ku3
+ku4
+kua1
+kua3
+kua4
+kuai3
+kuai4
+kuan1
+kuan2
+kuan3
+kuang1
+kuang2
+kuang4
+kui1
+kui2
+kui3
+kui4
+kun1
+kun3
+kun4
+kuo4
+l
+la
+la1
+la2
+la3
+la4
+lai2
+lai4
+lan2
+lan3
+lan4
+lang1
+lang2
+lang3
+lang4
+lao1
+lao2
+lao3
+lao4
+le
+le1
+le4
+lei
+lei1
+lei2
+lei3
+lei4
+leng1
+leng2
+leng3
+leng4
+li
+li1
+li2
+li3
+li4
+lia3
+lian2
+lian3
+lian4
+liang2
+liang3
+liang4
+liao1
+liao2
+liao3
+liao4
+lie1
+lie2
+lie3
+lie4
+lin1
+lin2
+lin3
+lin4
+ling2
+ling3
+ling4
+liu1
+liu2
+liu3
+liu4
+long1
+long2
+long3
+long4
+lou1
+lou2
+lou3
+lou4
+lu1
+lu2
+lu3
+lu4
+luan2
+luan3
+luan4
+lun1
+lun2
+lun4
+luo1
+luo2
+luo3
+luo4
+lv2
+lv3
+lv4
+lve3
+lve4
+m
+ma
+ma1
+ma2
+ma3
+ma4
+mai2
+mai3
+mai4
+man1
+man2
+man3
+man4
+mang2
+mang3
+mao1
+mao2
+mao3
+mao4
+me
+mei2
+mei3
+mei4
+men
+men1
+men2
+men4
+meng
+meng1
+meng2
+meng3
+meng4
+mi1
+mi2
+mi3
+mi4
+mian2
+mian3
+mian4
+miao1
+miao2
+miao3
+miao4
+mie1
+mie4
+min2
+min3
+ming2
+ming3
+ming4
+miu4
+mo1
+mo2
+mo3
+mo4
+mou1
+mou2
+mou3
+mu2
+mu3
+mu4
+n
+n2
+na1
+na2
+na3
+na4
+nai2
+nai3
+nai4
+nan1
+nan2
+nan3
+nan4
+nang1
+nang2
+nang3
+nao1
+nao2
+nao3
+nao4
+ne
+ne2
+ne4
+nei3
+nei4
+nen4
+neng2
+ni1
+ni2
+ni3
+ni4
+nian1
+nian2
+nian3
+nian4
+niang2
+niang4
+niao2
+niao3
+niao4
+nie1
+nie4
+nin2
+ning2
+ning3
+ning4
+niu1
+niu2
+niu3
+niu4
+nong2
+nong4
+nou4
+nu2
+nu3
+nu4
+nuan3
+nuo2
+nuo4
+nv2
+nv3
+nve4
+o
+o1
+o2
+ou1
+ou2
+ou3
+ou4
+p
+pa1
+pa2
+pa4
+pai1
+pai2
+pai3
+pai4
+pan1
+pan2
+pan4
+pang1
+pang2
+pang4
+pao1
+pao2
+pao3
+pao4
+pei1
+pei2
+pei4
+pen1
+pen2
+pen4
+peng1
+peng2
+peng3
+peng4
+pi1
+pi2
+pi3
+pi4
+pian1
+pian2
+pian4
+piao1
+piao2
+piao3
+piao4
+pie1
+pie2
+pie3
+pin1
+pin2
+pin3
+pin4
+ping1
+ping2
+po1
+po2
+po3
+po4
+pou1
+pu1
+pu2
+pu3
+pu4
+q
+qi1
+qi2
+qi3
+qi4
+qia1
+qia3
+qia4
+qian1
+qian2
+qian3
+qian4
+qiang1
+qiang2
+qiang3
+qiang4
+qiao1
+qiao2
+qiao3
+qiao4
+qie1
+qie2
+qie3
+qie4
+qin1
+qin2
+qin3
+qin4
+qing1
+qing2
+qing3
+qing4
+qiong1
+qiong2
+qiu1
+qiu2
+qiu3
+qu1
+qu2
+qu3
+qu4
+quan1
+quan2
+quan3
+quan4
+que1
+que2
+que4
+qun2
+r
+ran2
+ran3
+rang1
+rang2
+rang3
+rang4
+rao2
+rao3
+rao4
+re2
+re3
+re4
+ren2
+ren3
+ren4
+reng1
+reng2
+ri4
+rong1
+rong2
+rong3
+rou2
+rou4
+ru2
+ru3
+ru4
+ruan2
+ruan3
+rui3
+rui4
+run4
+ruo4
+s
+sa1
+sa2
+sa3
+sa4
+sai1
+sai4
+san1
+san2
+san3
+san4
+sang1
+sang3
+sang4
+sao1
+sao2
+sao3
+sao4
+se4
+sen1
+seng1
+sha1
+sha2
+sha3
+sha4
+shai1
+shai2
+shai3
+shai4
+shan1
+shan3
+shan4
+shang
+shang1
+shang3
+shang4
+shao1
+shao2
+shao3
+shao4
+she1
+she2
+she3
+she4
+shei2
+shen1
+shen2
+shen3
+shen4
+sheng1
+sheng2
+sheng3
+sheng4
+shi
+shi1
+shi2
+shi3
+shi4
+shou1
+shou2
+shou3
+shou4
+shu1
+shu2
+shu3
+shu4
+shua1
+shua2
+shua3
+shua4
+shuai1
+shuai3
+shuai4
+shuan1
+shuan4
+shuang1
+shuang3
+shui2
+shui3
+shui4
+shun3
+shun4
+shuo1
+shuo4
+si1
+si2
+si3
+si4
+song1
+song3
+song4
+sou1
+sou3
+sou4
+su1
+su2
+su4
+suan1
+suan4
+sui1
+sui2
+sui3
+sui4
+sun1
+sun3
+suo
+suo1
+suo2
+suo3
+t
+ta1
+ta2
+ta3
+ta4
+tai1
+tai2
+tai4
+tan1
+tan2
+tan3
+tan4
+tang1
+tang2
+tang3
+tang4
+tao1
+tao2
+tao3
+tao4
+te4
+teng2
+ti1
+ti2
+ti3
+ti4
+tian1
+tian2
+tian3
+tiao1
+tiao2
+tiao3
+tiao4
+tie1
+tie2
+tie3
+tie4
+ting1
+ting2
+ting3
+tong1
+tong2
+tong3
+tong4
+tou
+tou1
+tou2
+tou4
+tu1
+tu2
+tu3
+tu4
+tuan1
+tuan2
+tui1
+tui2
+tui3
+tui4
+tun1
+tun2
+tun4
+tuo1
+tuo2
+tuo3
+tuo4
+u
+v
+w
+wa
+wa1
+wa2
+wa3
+wa4
+wai1
+wai3
+wai4
+wan1
+wan2
+wan3
+wan4
+wang1
+wang2
+wang3
+wang4
+wei1
+wei2
+wei3
+wei4
+wen1
+wen2
+wen3
+wen4
+weng1
+weng4
+wo1
+wo2
+wo3
+wo4
+wu1
+wu2
+wu3
+wu4
+x
+xi1
+xi2
+xi3
+xi4
+xia1
+xia2
+xia4
+xian1
+xian2
+xian3
+xian4
+xiang1
+xiang2
+xiang3
+xiang4
+xiao1
+xiao2
+xiao3
+xiao4
+xie1
+xie2
+xie3
+xie4
+xin1
+xin2
+xin4
+xing1
+xing2
+xing3
+xing4
+xiong1
+xiong2
+xiu1
+xiu3
+xiu4
+xu
+xu1
+xu2
+xu3
+xu4
+xuan1
+xuan2
+xuan3
+xuan4
+xue1
+xue2
+xue3
+xue4
+xun1
+xun2
+xun4
+y
+ya
+ya1
+ya2
+ya3
+ya4
+yan1
+yan2
+yan3
+yan4
+yang1
+yang2
+yang3
+yang4
+yao1
+yao2
+yao3
+yao4
+ye1
+ye2
+ye3
+ye4
+yi
+yi1
+yi2
+yi3
+yi4
+yin1
+yin2
+yin3
+yin4
+ying1
+ying2
+ying3
+ying4
+yo1
+yong1
+yong2
+yong3
+yong4
+you1
+you2
+you3
+you4
+yu1
+yu2
+yu3
+yu4
+yuan1
+yuan2
+yuan3
+yuan4
+yue1
+yue4
+yun1
+yun2
+yun3
+yun4
+z
+za1
+za2
+za3
+zai1
+zai3
+zai4
+zan1
+zan2
+zan3
+zan4
+zang1
+zang4
+zao1
+zao2
+zao3
+zao4
+ze2
+ze4
+zei2
+zen3
+zeng1
+zeng4
+zha1
+zha2
+zha3
+zha4
+zhai1
+zhai2
+zhai3
+zhai4
+zhan1
+zhan2
+zhan3
+zhan4
+zhang1
+zhang2
+zhang3
+zhang4
+zhao1
+zhao2
+zhao3
+zhao4
+zhe
+zhe1
+zhe2
+zhe3
+zhe4
+zhen1
+zhen2
+zhen3
+zhen4
+zheng1
+zheng2
+zheng3
+zheng4
+zhi1
+zhi2
+zhi3
+zhi4
+zhong1
+zhong2
+zhong3
+zhong4
+zhou1
+zhou2
+zhou3
+zhou4
+zhu1
+zhu2
+zhu3
+zhu4
+zhua1
+zhua2
+zhua3
+zhuai1
+zhuai3
+zhuai4
+zhuan1
+zhuan2
+zhuan3
+zhuan4
+zhuang1
+zhuang4
+zhui1
+zhui4
+zhun1
+zhun2
+zhun3
+zhuo1
+zhuo2
+zi
+zi1
+zi2
+zi3
+zi4
+zong1
+zong2
+zong3
+zong4
+zou1
+zou2
+zou3
+zou4
+zu1
+zu2
+zu3
+zuan1
+zuan3
+zuan4
+zui2
+zui3
+zui4
+zun1
+zuo
+zuo1
+zuo2
+zuo3
+zuo4
+{
+~
+¡
+¢
+£
+¥
+§
+¨
+©
+«
+®
+¯
+°
+±
+²
+³
+´
+µ
+·
+¹
+º
+»
+¼
+½
+¾
+¿
+À
+Á
+Â
+Ã
+Ä
+Å
+Æ
+Ç
+È
+É
+Ê
+Í
+Î
+Ñ
+Ó
+Ö
+×
+Ø
+Ú
+Ü
+Ý
+Þ
+ß
+à
+á
+â
+ã
+ä
+å
+æ
+ç
+è
+é
+ê
+ë
+ì
+í
+î
+ï
+ð
+ñ
+ò
+ó
+ô
+õ
+ö
+ø
+ù
+ú
+û
+ü
+ý
+Ā
+ā
+ă
+ą
+ć
+Č
+č
+Đ
+đ
+ē
+ė
+ę
+ě
+ĝ
+ğ
+ħ
+ī
+į
+İ
+ı
+Ł
+ł
+ń
+ņ
+ň
+ŋ
+Ō
+ō
+ő
+œ
+ř
+Ś
+ś
+Ş
+ş
+Š
+š
+Ť
+ť
+ũ
+ū
+ź
+Ż
+ż
+Ž
+ž
+ơ
+ư
+ǎ
+ǐ
+ǒ
+ǔ
+ǚ
+ș
+ț
+ɑ
+ɔ
+ɕ
+ə
+ɛ
+ɜ
+ɡ
+ɣ
+ɪ
+ɫ
+ɴ
+ɹ
+ɾ
+ʃ
+ʊ
+ʌ
+ʒ
+ʔ
+ʰ
+ʷ
+ʻ
+ʾ
+ʿ
+ˈ
+ː
+˙
+˜
+ˢ
+́
+̅
+Α
+Β
+Δ
+Ε
+Θ
+Κ
+Λ
+Μ
+Ξ
+Π
+Σ
+Τ
+Φ
+Χ
+Ψ
+Ω
+ά
+έ
+ή
+ί
+α
+β
+γ
+δ
+ε
+ζ
+η
+θ
+ι
+κ
+λ
+μ
+ν
+ξ
+ο
+π
+ρ
+ς
+σ
+τ
+υ
+φ
+χ
+ψ
+ω
+ϊ
+ό
+ύ
+ώ
+ϕ
+ϵ
+Ё
+А
+Б
+В
+Г
+Д
+Е
+Ж
+З
+И
+Й
+К
+Л
+М
+Н
+О
+П
+Р
+С
+Т
+У
+Ф
+Х
+Ц
+Ч
+Ш
+Щ
+Ы
+Ь
+Э
+Ю
+Я
+а
+б
+в
+г
+д
+е
+ж
+з
+и
+й
+к
+л
+м
+н
+о
+п
+р
+с
+т
+у
+ф
+х
+ц
+ч
+ш
+щ
+ъ
+ы
+ь
+э
+ю
+я
+ё
+і
+ְ
+ִ
+ֵ
+ֶ
+ַ
+ָ
+ֹ
+ּ
+־
+ׁ
+א
+ב
+ג
+ד
+ה
+ו
+ז
+ח
+ט
+י
+כ
+ל
+ם
+מ
+ן
+נ
+ס
+ע
+פ
+ק
+ר
+ש
+ת
+أ
+ب
+ة
+ت
+ج
+ح
+د
+ر
+ز
+س
+ص
+ط
+ع
+ق
+ك
+ل
+م
+ن
+ه
+و
+ي
+َ
+ُ
+ِ
+ْ
+ก
+ข
+ง
+จ
+ต
+ท
+น
+ป
+ย
+ร
+ว
+ส
+ห
+อ
+ฮ
+ั
+า
+ี
+ึ
+โ
+ใ
+ไ
+่
+้
+์
+ḍ
+Ḥ
+ḥ
+ṁ
+ṃ
+ṅ
+ṇ
+Ṛ
+ṛ
+Ṣ
+ṣ
+Ṭ
+ṭ
+ạ
+ả
+Ấ
+ấ
+ầ
+ậ
+ắ
+ằ
+ẻ
+ẽ
+ế
+ề
+ể
+ễ
+ệ
+ị
+ọ
+ỏ
+ố
+ồ
+ộ
+ớ
+ờ
+ở
+ụ
+ủ
+ứ
+ữ
+ἀ
+ἁ
+Ἀ
+ἐ
+ἔ
+ἰ
+ἱ
+ὀ
+ὁ
+ὐ
+ὲ
+ὸ
+ᾶ
+᾽
+ῆ
+ῇ
+ῶ
+‎
+‑
+‒
+–
+—
+―
+‖
+†
+‡
+•
+…
+‧
+‬
+′
+″
+⁄
+⁡
+⁰
+⁴
+⁵
+⁶
+⁷
+⁸
+⁹
+₁
+₂
+₃
+€
+₱
+₹
+₽
+℃
+ℏ
+ℓ
+№
+ℝ
+™
+⅓
+⅔
+⅛
+→
+∂
+∈
+∑
+−
+∗
+√
+∞
+∫
+≈
+≠
+≡
+≤
+≥
+⋅
+⋯
+█
+♪
+⟨
+⟩
+、
+。
+《
+》
+「
+」
+【
+】
+あ
+う
+え
+お
+か
+が
+き
+ぎ
+く
+ぐ
+け
+げ
+こ
+ご
+さ
+し
+じ
+す
+ず
+せ
+ぜ
+そ
+ぞ
+た
+だ
+ち
+っ
+つ
+で
+と
+ど
+な
+に
+ね
+の
+は
+ば
+ひ
+ぶ
+へ
+べ
+ま
+み
+む
+め
+も
+ゃ
+や
+ゆ
+ょ
+よ
+ら
+り
+る
+れ
+ろ
+わ
+を
+ん
+ァ
+ア
+ィ
+イ
+ウ
+ェ
+エ
+オ
+カ
+ガ
+キ
+ク
+ケ
+ゲ
+コ
+ゴ
+サ
+ザ
+シ
+ジ
+ス
+ズ
+セ
+ゾ
+タ
+ダ
+チ
+ッ
+ツ
+テ
+デ
+ト
+ド
+ナ
+ニ
+ネ
+ノ
+バ
+パ
+ビ
+ピ
+フ
+プ
+ヘ
+ベ
+ペ
+ホ
+ボ
+ポ
+マ
+ミ
+ム
+メ
+モ
+ャ
+ヤ
+ュ
+ユ
+ョ
+ヨ
+ラ
+リ
+ル
+レ
+ロ
+ワ
+ン
+・
+ー
+ㄋ
+ㄍ
+ㄎ
+ㄏ
+ㄓ
+ㄕ
+ㄚ
+ㄜ
+ㄟ
+ㄤ
+ㄥ
+ㄧ
+ㄱ
+ㄴ
+ㄷ
+ㄹ
+ㅁ
+ㅂ
+ㅅ
+ㅈ
+ㅍ
+ㅎ
+ㅏ
+ㅓ
+ㅗ
+ㅜ
+ㅡ
+ㅣ
+㗎
+가
+각
+간
+갈
+감
+갑
+갓
+갔
+강
+같
+개
+거
+건
+걸
+겁
+것
+겉
+게
+겠
+겨
+결
+겼
+경
+계
+고
+곤
+골
+곱
+공
+과
+관
+광
+교
+구
+국
+굴
+귀
+귄
+그
+근
+글
+금
+기
+긴
+길
+까
+깍
+깔
+깜
+깨
+께
+꼬
+꼭
+꽃
+꾸
+꿔
+끔
+끗
+끝
+끼
+나
+난
+날
+남
+납
+내
+냐
+냥
+너
+넘
+넣
+네
+녁
+년
+녕
+노
+녹
+놀
+누
+눈
+느
+는
+늘
+니
+님
+닙
+다
+닥
+단
+달
+닭
+당
+대
+더
+덕
+던
+덥
+데
+도
+독
+동
+돼
+됐
+되
+된
+될
+두
+둑
+둥
+드
+들
+등
+디
+따
+딱
+딸
+땅
+때
+떤
+떨
+떻
+또
+똑
+뚱
+뛰
+뜻
+띠
+라
+락
+란
+람
+랍
+랑
+래
+랜
+러
+런
+럼
+렇
+레
+려
+력
+렵
+렸
+로
+록
+롬
+루
+르
+른
+를
+름
+릉
+리
+릴
+림
+마
+막
+만
+많
+말
+맑
+맙
+맛
+매
+머
+먹
+멍
+메
+면
+명
+몇
+모
+목
+몸
+못
+무
+문
+물
+뭐
+뭘
+미
+민
+밌
+밑
+바
+박
+밖
+반
+받
+발
+밤
+밥
+방
+배
+백
+밸
+뱀
+버
+번
+벌
+벚
+베
+벼
+벽
+별
+병
+보
+복
+본
+볼
+봐
+봤
+부
+분
+불
+비
+빔
+빛
+빠
+빨
+뼈
+뽀
+뿅
+쁘
+사
+산
+살
+삼
+샀
+상
+새
+색
+생
+서
+선
+설
+섭
+섰
+성
+세
+셔
+션
+셨
+소
+속
+손
+송
+수
+숙
+순
+술
+숫
+숭
+숲
+쉬
+쉽
+스
+슨
+습
+슷
+시
+식
+신
+실
+싫
+심
+십
+싶
+싸
+써
+쓰
+쓴
+씌
+씨
+씩
+씬
+아
+악
+안
+않
+알
+야
+약
+얀
+양
+얘
+어
+언
+얼
+엄
+업
+없
+었
+엉
+에
+여
+역
+연
+염
+엽
+영
+옆
+예
+옛
+오
+온
+올
+옷
+옹
+와
+왔
+왜
+요
+욕
+용
+우
+운
+울
+웃
+워
+원
+월
+웠
+위
+윙
+유
+육
+윤
+으
+은
+을
+음
+응
+의
+이
+익
+인
+일
+읽
+임
+입
+있
+자
+작
+잔
+잖
+잘
+잡
+잤
+장
+재
+저
+전
+점
+정
+제
+져
+졌
+조
+족
+좀
+종
+좋
+죠
+주
+준
+줄
+중
+줘
+즈
+즐
+즘
+지
+진
+집
+짜
+짝
+쩌
+쪼
+쪽
+쫌
+쭈
+쯔
+찌
+찍
+차
+착
+찾
+책
+처
+천
+철
+체
+쳐
+쳤
+초
+촌
+추
+출
+춤
+춥
+춰
+치
+친
+칠
+침
+칩
+칼
+커
+켓
+코
+콩
+쿠
+퀴
+크
+큰
+큽
+키
+킨
+타
+태
+터
+턴
+털
+테
+토
+통
+투
+트
+특
+튼
+틀
+티
+팀
+파
+팔
+패
+페
+펜
+펭
+평
+포
+폭
+표
+품
+풍
+프
+플
+피
+필
+하
+학
+한
+할
+함
+합
+항
+해
+햇
+했
+행
+허
+험
+형
+혜
+호
+혼
+홀
+화
+회
+획
+후
+휴
+흐
+흔
+희
+히
+힘
+ﷺ
+ﷻ
+！
+，
+？
+�
+𠮶
diff --git a/capspeech/nar/data/__init__.py b/capspeech/nar/data/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391
diff --git a/capspeech/nar/data/librispeech_pc_test_clean_cross_sentence.lst b/capspeech/nar/data/librispeech_pc_test_clean_cross_sentence.lst
new file mode 100644
index 0000000000000000000000000000000000000000..6969a66f3986832ceaefc546bfe5fc0bc043d1b1
--- /dev/null
+++ b/capspeech/nar/data/librispeech_pc_test_clean_cross_sentence.lst
@@ -0,0 +1,1127 @@
+4992-41806-0009	4.355	exclaimed Bill Harmon to his wife as they went through the lighted hall.	4992-23283-0000	6.645	But the more forgetfulness had then prevailed, the more powerful was the force of remembrance when she awoke.
+4992-23283-0001	2.71	Miss Milner's health is not good"!	4992-23283-0003	4.645	So there is to me"! added Sandford, with a sarcastic sneer.
+4992-23283-0015	3.675	Is she not afraid that I will thwart her inclinations"?	4992-23283-0004	8.06	And yet you must own her behaviour has warranted them has it not been in this particular incoherent and unaccountable"?
+4992-23283-0015	3.675	Is she not afraid that I will thwart her inclinations"?	4992-23283-0007	4.045	To ask any more questions of you, I believe, would be unfair.
+4992-41797-0012	2.705	She is wild to know how to do things.	4992-23283-0008	4.91	He seemed to wait for her reply; but as she made none, he proceeded-
+4992-41797-0016	3.3	They couldn't run nor move; they're just pasteboard".	4992-23283-0009	8.395	Oh! my Lord," cried Miss Woodley, with a most forcible accent, " You are the last person on earth she would pardon me for entrusting".
+4992-41797-0005	3.845	Done? He ain't done a thing he'd oughter sence he was born.	4992-23283-0010	5	But in such a case, Miss Milner's election of a husband shall not direct mine.
+4992-41797-0012	2.705	She is wild to know how to do things.	4992-23283-0011	4.225	If she does not know how to estimate her own value, I do.
+4992-41806-0004	3.7	Burn, fire, burn! Flicker, flicker, flame!	4992-23283-0013	6.63	My Lord, Miss Milner's taste is not a depraved one; it is but too refined".
+4992-41797-0012	2.705	She is wild to know how to do things.	4992-23283-0014	4.535	What can you mean by that, Miss Woodley? You talk mysteriously.
+4992-41797-0012	2.705	She is wild to know how to do things.	4992-23283-0016	4.495	Again he searched his own thoughts; nor ineffectually as before.
+4992-23283-0007	4.045	To ask any more questions of you, I believe, would be unfair.	4992-23283-0018	6.575	To relieve her from both, he laid his hand with force upon his heart, and said, "Do you believe me"?
+4992-23283-0016	4.495	Again he searched his own thoughts; nor ineffectually as before.	4992-23283-0019	6.585	I will make no unjust use of what I know," he replied with firmness. "I believe you, my Lord".
+672-122797-0005	3.26	Oh, that made him so angry!	672-122797-0000	4.07	Out in the woods stood a nice little Fir Tree.
+672-122797-0029	3.05	How it will shine this evening"!	672-122797-0003	4.76	But this was what the Tree could not bear to hear.
+672-122797-0000	4.07	Out in the woods stood a nice little Fir Tree.	672-122797-0007	6.42	In autumn the wood cutters always came and felled some of the largest trees.
+672-122797-0000	4.07	Out in the woods stood a nice little Fir Tree.	672-122797-0012	7.765	I would fain know if I am destined for so glorious a career," cried the Tree, rejoicing.
+672-122797-0029	3.05	How it will shine this evening"!	672-122797-0013	8.705	I am now tall, and my branches spread like the others that were carried off last year! Oh!
+672-122797-0032	4	cried the young ladies, and they quickly put out the fire.	672-122797-0015	4.455	Were I in the warm room with all the splendor and magnificence!
+672-122797-0044	3.74	And he leaned against the wall lost in reverie.	672-122797-0016	9.215	Yes; then something better, something still grander, will surely follow, or wherefore should they thus ornament me?
+672-122797-0041	3.88	In the morning the servant and the housemaid came in.	672-122797-0017	4.82	Something better, something still grander must follow - but what?
+672-122797-0000	4.07	Out in the woods stood a nice little Fir Tree.	672-122797-0018	4.93	Rejoice in our presence"! said the Air and the Sunlight.
+672-122797-0047	3.325	How kind man is, after all!	672-122797-0019	4.11	Rejoice in thy own fresh youth"!
+672-122797-0053	2.955	They were so extremely curious.	672-122797-0020	8.825	But the Tree did not rejoice at all; he grew and grew, and was green both winter and summer.
+672-122797-0032	4	cried the young ladies, and they quickly put out the fire.	672-122797-0021	4.15	and towards Christmas he was one of the first that was cut down.
+672-122797-0032	4	cried the young ladies, and they quickly put out the fire.	672-122797-0023	9.695063	He well knew that he should never see his dear old comrades, the little bushes and flowers around him, anymore; perhaps not even the birds!
+672-122797-0059	3.52	Only that one," answered the Tree.	672-122797-0024	4.13	The departure was not at all agreeable.
+672-122797-0000	4.07	Out in the woods stood a nice little Fir Tree.	672-122797-0027	4.79	The servants, as well as the young ladies, decorated it.
+672-122797-0015	4.455	Were I in the warm room with all the splendor and magnificence!	672-122797-0030	4.575	Perhaps the other trees from the forest will come to look at me!
+672-122797-0015	4.455	Were I in the warm room with all the splendor and magnificence!	672-122797-0032	4	cried the young ladies, and they quickly put out the fire.
+672-122797-0015	4.455	Were I in the warm room with all the splendor and magnificence!	672-122797-0034	5.11	A story"! cried the children, drawing a little fat man towards the Tree.
+672-122797-0011	2.54	And then? What happens then"?	672-122797-0036	5.365	Humpy Dumpy fell downstairs, and yet he married the princess!
+672-122797-0044	3.74	And he leaned against the wall lost in reverie.	672-122797-0038	8.8	thought the Fir Tree, and believed it all, because the man who told the story was so good looking. "Well, well!
+672-122797-0043	3.78	What's the meaning of this"? thought the Tree.	672-122797-0039	4.025	I won't tremble tomorrow"! thought the Fir Tree.
+672-122797-0000	4.07	Out in the woods stood a nice little Fir Tree.	672-122797-0040	5.125	And the whole night the Tree stood still and in deep thought.
+672-122797-0059	3.52	Only that one," answered the Tree.	672-122797-0046	4.715	Tis now winter out of doors"! thought the Tree.
+672-122797-0054	4.25	I know no such place," said the Tree.	672-122797-0048	6.555	If it only were not so dark here, and so terribly lonely!
+672-122797-0041	3.88	In the morning the servant and the housemaid came in.	672-122797-0050	4.855	They snuffed about the Fir Tree, and rustled among the branches.
+672-122797-0054	4.25	I know no such place," said the Tree.	672-122797-0051	4.665	I am by no means old," said the Fir Tree.
+672-122797-0011	2.54	And then? What happens then"?	672-122797-0052	4.285	There's many a one considerably older than I am".
+672-122797-0031	3.98	It blazed up famously. "Help! Help"!	672-122797-0054	4.25	I know no such place," said the Tree.
+672-122797-0032	4	cried the young ladies, and they quickly put out the fire.	672-122797-0055	8.23	And then he told all about his youth; and the little Mice had never heard the like before; and they listened and said,
+672-122797-0000	4.07	Out in the woods stood a nice little Fir Tree.	672-122797-0056	5.225	said the Fir Tree, thinking over what he had himself related.
+672-122797-0065	3.03	Now that too is over.	672-122797-0057	6.56	Yes, in reality those were happy times".
+672-122797-0000	4.07	Out in the woods stood a nice little Fir Tree.	672-122797-0058	4.47	Who is Humpy Dumpy"? asked the Mice.
+672-122797-0005	3.26	Oh, that made him so angry!	672-122797-0061	7.59	Don't you know one about bacon and tallow candles? Can't you tell any larder stories"?
+672-122797-0021	4.15	and towards Christmas he was one of the first that was cut down.	672-122797-0066	4.815	Why, one morning there came a quantity of people and set to work in the loft.
+672-122797-0010	3.815	Rejoice in thy growth"! said the Sunbeams.	672-122797-0068	4.02	but it was not the Fir Tree that they meant.
+672-122797-0028	2.61	This evening"! they all said.	672-122797-0069	5.01	It was in a corner that he lay, among weeds and nettles.
+672-122797-0032	4	cried the young ladies, and they quickly put out the fire.	672-122797-0070	6.27	The golden star of tinsel was still on the top of the Tree, and glittered in the sunshine.
+672-122797-0021	4.15	and towards Christmas he was one of the first that was cut down.	672-122797-0071	8.875	In the court yard some of the merry children were playing who had danced at Christmas round the Fir Tree, and were so glad at the sight of him.
+672-122797-0000	4.07	Out in the woods stood a nice little Fir Tree.	672-122797-0072	7.94	And the gardener's boy chopped the Tree into small pieces; there was a whole heap lying there.
+672-122797-0053	2.955	They were so extremely curious.	672-122797-0073	8.205	The wood flamed up splendidly under the large brewing copper, and it sighed so deeply!
+672-122797-0062	2.675	No," said the Tree.	672-122797-0074	8.73	However, that was over now - the Tree gone, the story at an end.
+2961-961-0005	3.775	Some poems of Solon were recited by the boys.	2961-960-0001	8.250063	The influence with the Timaeus has exercised upon posterity is due partly to a misunderstanding.
+2961-961-0005	3.775	Some poems of Solon were recited by the boys.	2961-960-0004	8.22	There is no danger of the modern commentators on the Timaeus falling into the absurdities of the Neo Platonists.
+2961-961-0005	3.775	Some poems of Solon were recited by the boys.	2961-960-0007	7.64	But they have nothing to do with the interpretation of Plato, and in spirit they are opposed to him.
+2961-961-0005	3.775	Some poems of Solon were recited by the boys.	2961-960-0012	6.89	Many, if not all the elements of the Pre Socratic philosophy are included in the Timaeus.
+2961-961-0005	3.775	Some poems of Solon were recited by the boys.	2961-960-0014	8.775	The ideas also remain, but they have become types in nature, forms of men, animals, birds, fishes.
+2961-961-0005	3.775	Some poems of Solon were recited by the boys.	2961-960-0015	7.83	The style and plan of the Timaeus differ greatly from that of any other of the Platonic dialogues.
+2961-961-0005	3.775	Some poems of Solon were recited by the boys.	2961-960-0016	7.76	But Plato has not the same mastery over his instrument which he exhibits in the Phaedrus or Symposium.
+2961-961-0005	3.775	Some poems of Solon were recited by the boys.	2961-960-0017	7.87	Nothing can exceed the beauty or art of the introduction, in which he is using words after his accustomed manner.
+2961-961-0005	3.775	Some poems of Solon were recited by the boys.	2961-960-0018	8.38	But in the rest of the work the power of language seems to fail him, and the dramatic form is wholly given up.
+2961-961-0005	3.775	Some poems of Solon were recited by the boys.	2961-960-0020	9.88	And hence we find the same sort of clumsiness in the Timaeus of Plato which characterizes the philosophical poem of Lucretius.
+2961-961-0005	3.775	Some poems of Solon were recited by the boys.	2961-960-0022	7.425	Plato had not the command of his materials which would have enabled him to produce a perfect work of art.
+8224-274384-0003	3.87	or hath he given us any gift?	8224-274381-0011	6.48	His conduct and presence of mind in this emergence appeared conspicuous.
+1221-135766-0013	3.645	Pearl was a born outcast of the infantile world.	1221-135767-0005	5.865	It was the scarlet letter in another form: the scarlet letter endowed with life!
+1221-135766-0015	2.63	If spoken to, she would not speak again.	1221-135767-0010	8.2	She screamed and shouted, too, with a terrific volume of sound, which, doubtless, caused the hearts of the fugitives to quake within them.
+1221-135767-0008	3.095	Come, therefore, and let us fling mud at them"!	1221-135767-0014	7.07	Yea, his honourable worship is within. But he hath a godly minister or two with him, and likewise a leech.
+1221-135767-0020	3.345	In truth, she seemed absolutely hidden behind it.	1221-135767-0024	5.85	Pearl, seeing the rose bushes, began to cry for a red rose, and would not be pacified.
+7176-88083-0008	3.28	In despair he hurled himself downward too soon.	7176-92135-0001	7.56	In short he becomes a "prominent figure in London Society" - and, if he is not careful, somebody will say so.
+7176-92135-0007	3.275	Anyhow it's jolly exciting, and I can do the dialogue all right.	7176-92135-0005	5.47	But suppose you said, "I'm fond of writing; my people always say my letters home are good enough for Punch.
+7176-92135-0027	2.835	Lady Larkspur starts suddenly and turns towards him.	7176-92135-0006	7.795	I've got a little idea for a play about a man and a woman and another woman, and - but perhaps I'd better keep the plot a secret for the moment.
+7176-88083-0009	4.045	The great hawk followed hurriedly, to retrieve his prey from the ground.	7176-92135-0008	4.43	Lend me your ear for ten minutes, and you shall learn just what stagecraft is".
+7176-92135-0004	2.425	Frankly I cannot always say.	7176-92135-0009	4.38	And I should begin with a short homily on Soliloquy.
+7176-88083-0016	3.92	Straightway the hawk glided from his perch and darted after him.	7176-92135-0015	6.755	And so on, till you get to the end, when Ophelia might say, "Ah, yes," or something non committal of that sort.
+7176-88083-0006	4.295	It might have seemed that a trout of this size was a fairly substantial meal.	7176-92135-0016	7.545	This would be an easy way of doing it, but it would not be the best way, for the reason that it is too easy to call attention to itself.
+7176-88083-0016	3.92	Straightway the hawk glided from his perch and darted after him.	7176-92135-0017	7.17	In the old badly made play it was frequently necessary for one of the characters to take the audience into his confidence.
+7176-88083-0016	3.92	Straightway the hawk glided from his perch and darted after him.	7176-92135-0018	8.94	In the modern well constructed play he simply rings up an imaginary confederate and tells him what he is going to do. Could anything be more natural?
+7176-88083-0008	3.28	In despair he hurled himself downward too soon.	7176-92135-0020	7.165	Double nine two three, Elsinore.... Double- nine, yes.... Hallo, is that you, Horatio? Hamlet speaking.
+7176-88083-0016	3.92	Straightway the hawk glided from his perch and darted after him.	7176-92135-0022	8.23	To be or not to be, that is the question; whether 'tis nobler in the mind to suffer the slings and arrows - What? No, Hamlet speaking.
+7176-92135-0002	3.415	But even the unsuccessful dramatist has his moments.	7176-92135-0023	6.215	You gave me double- five, I want double- nine.... Hallo, is that you, Horatio? Hamlet speaking.
+7176-92135-0026	2.95	Enter Hamlet with his favourite boar hound.	7176-92135-0024	4.1	To be or not to be, that is the question; whether 'tis nobler
+7176-88083-0016	3.92	Straightway the hawk glided from his perch and darted after him.	7176-92135-0042	7.27	In novels the hero has often "pushed his meals away untasted," but no stage hero would do anything so unnatural as this.
+7176-92135-0007	3.275	Anyhow it's jolly exciting, and I can do the dialogue all right.	7176-92135-0044	5.175	But it is the cigarette which chiefly has brought the modern drama to its present state of perfection.
+4077-13754-0001	3.77	But a word further concerning the expedition in general.	4077-13751-0001	8.745	Its origin was small - a germ, an insignificant seed, hardly to be thought of as likely to arouse opposition.
+4077-13754-0001	3.77	But a word further concerning the expedition in general.	4077-13751-0002	9.75	Instead of but six regularly affiliated members, and at most two score of adherents, the organization numbers today many hundred thousand souls.
+4077-13751-0013	4.315	Their sufferings have never yet been fitly chronicled by human scribe.	4077-13751-0010	6.72	To the fervent Latter day Saint, a temple is not simply a church building, a house for religious assembly.
+4077-13751-0019	2.92	Who began the quarrel? Was it the "Mormons"?	4077-13751-0013	4.315	Their sufferings have never yet been fitly chronicled by human scribe.
+4077-13754-0001	3.77	But a word further concerning the expedition in general.	4077-13751-0017	5.095	Oh, what a record to read; what a picture to gaze upon; how awful the fact!
+6930-81414-0019	3.38	Voltaire picked up something from the ground and looked at it.	6930-76324-0002	5.56	The poor little things"! cried Cynthia. "Think of them having been turned to the wall all these years!
+6930-76324-0009	3.405	Do you suppose the miniature was a copy of the same thing"?	6930-76324-0004	6.15	But Joyce had not been listening. All at once she put down her candle on the table and faced her companion.
+6930-76324-0009	3.405	Do you suppose the miniature was a copy of the same thing"?	6930-76324-0005	5.035	The twin brother did something she didn't like, and she turned his picture to the wall.
+6930-76324-0001	3.2	They were certainly no nearer the solution of their problem.	6930-76324-0006	4.455	Hers happened to be in the same frame too, but she evidently didn't care about that.
+6930-76324-0006	4.455	Hers happened to be in the same frame too, but she evidently didn't care about that.	6930-76324-0008	5.185	I thought we were 'stumped' again when I first saw that picture, but it's been of some use, after all.
+6930-76324-0026	3.085	Isn't he the greatest for getting into odd corners"!	6930-76324-0011	9.24	They worry me terribly. And, besides, I'd like to see what this lovely furniture looks like without such quantities of dust all over it". "Good scheme, CYN"!
+6930-76324-0006	4.455	Hers happened to be in the same frame too, but she evidently didn't care about that.	6930-76324-0012	4.655	We'll come in here this afternoon with old clothes on, and have a regular house cleaning!
+6930-76324-0010	2.69	What in the world is that"? queried Joyce.	6930-76324-0013	4.305	It can't hurt anything, I'm sure, for we won't disturb things at all.
+6930-76324-0007	2.82	Now what have you to say, Cynthia Sprague"?	6930-76324-0014	4.72	This thought, however, did not enter the heads of the enthusiastic pair.
+6930-76324-0019	2.575	Now let's dust the furniture and pictures".	6930-76324-0016	9.205	The lure proved too much for him, and he came sporting after it, as friskily as a young kitten, much to Cynthia's delight when she caught sight of him.
+6930-81414-0018	2.93	I remember saying. "Have we been together"?	6930-76324-0017	5.41	Oh, let him come along"! she urged. "I do love to see him about that old house.
+6930-76324-0025	4.12	Why, it's Goliath as usual"! they both cried, peering in.	6930-76324-0020	6.315	Yet, little as it was, it had already made a vast difference in the aspect of the room.
+6930-76324-0007	2.82	Now what have you to say, Cynthia Sprague"?	6930-76324-0021	7.355	Surface dust at least had been removed, and the fine old furniture gave a hint of its real elegance and polish.
+6930-76324-0013	4.305	It can't hurt anything, I'm sure, for we won't disturb things at all.	6930-76324-0023	4.85	And my pocket money is getting low again, and you haven't any left, as usual.
+6930-76324-0026	3.085	Isn't he the greatest for getting into odd corners"!	6930-76324-0024	4.05	They say illumination by candle light is the prettiest in the world.
+6930-81414-0012	4.43	said another voice, which I recognized as Voltaire's. "Kaffar?	6930-76324-0025	4.12	Why, it's Goliath as usual"! they both cried, peering in.
+6930-81414-0019	3.38	Voltaire picked up something from the ground and looked at it.	6930-76324-0027	8.27	Forgetting all their weariness, they seized their candles and scurried through the house, finding an occasional paper tucked away in some odd corner.
+6930-81414-0018	2.93	I remember saying. "Have we been together"?	6930-76324-0028	9.875	Well, I'm convinced that the Boarded up House mystery happened not earlier than april sixteenth, eighteen sixty one, and probably not much later.
+6930-76324-0007	2.82	Now what have you to say, Cynthia Sprague"?	6930-81414-0004	9.56	The story of its evil influence came back to me, and in my bewildered condition I wondered whether there was not some truth in what had been said.
+6930-75918-0000	3.505	Concord returned to its place amidst the tents.	6930-81414-0006	6.8	What then? A human hand, large and shapely, appeared distinctly on the surface of the pond.
+6930-75918-0011	3.195	I am convinced of what I say," said the count.	6930-81414-0007	4.365	Nothing more, not even the wrist to which it might be attached.
+6930-75918-0013	2.94	In those very terms; I even added more.	6930-81414-0008	6.055	It did not beckon, or indeed move at all; it was as still as the hand of death.
+6930-81414-0010	3.835	A sound of voices. A flash of light.	6930-81414-0011	4.7	A feeling of freedom, and I was awake! Where?
+6930-76324-0025	4.12	Why, it's Goliath as usual"! they both cried, peering in.	6930-81414-0012	4.43	said another voice, which I recognized as Voltaire's. "Kaffar?
+6930-81414-0007	4.365	Nothing more, not even the wrist to which it might be attached.	6930-81414-0013	7.325	I had scarcely known what I had been saying or doing up to this time, but as he spoke I looked at my hand.
+6930-75918-0007	3.315	You will be frank with me"? "I always am".	6930-81414-0014	7.41	In the light of the moon I saw a knife red with blood, and my hand, too, was also discoloured.
+6930-81414-0025	2.53	My position was too terrible.	6930-81414-0020	5	I say you do know what this means, and you must tell us".
+6930-81414-0027	3.85	For some time after that I remembered nothing distinctly.	6930-81414-0022	4.34	I had again been acting under the influence of this man's power.
+6930-81414-0021	3.225	A terrible thought flashed into my mind.	6930-81414-0023	4.885	Perchance, too, Kaffar's death might serve him in good stead.
+6930-75918-0010	3.035	I can perceive love clearly enough".	6930-81414-0024	5.05	My tongue refused to articulate; my power of speech left me.
+1221-135766-0015	2.63	If spoken to, she would not speak again.	1221-135766-0002	4.825	Yet these thoughts affected Hester Prynne less with hope than apprehension.
+1221-135766-0015	2.63	If spoken to, she would not speak again.	1221-135766-0004	7.44	This outward mutability indicated, and did not more than fairly express, the various properties of her inner life.
+1221-135766-0013	3.645	Pearl was a born outcast of the infantile world.	1221-135766-0007	8.795	Hester Prynne, nevertheless, the loving mother of this one child, ran little risk of erring on the side of undue severity.
+1221-135767-0020	3.345	In truth, she seemed absolutely hidden behind it.	1221-135766-0014	4.75	Pearl saw, and gazed intently, but never sought to make acquaintance.
+7021-79740-0012	3.26	said she, pointing to the playthings; "see!	7021-79730-0005	8.01	So you will be a good girl, I know, and not make any trouble, but will stay at home contentedly - won't you?
+8463-294828-0021	2.735	A route slightly less direct, that's all.	8463-294825-0001	7.805	This reality begins to explain the dark power and otherworldly fascination of Twenty Thousand Leagues Under the Seas.
+8463-287645-0014	3.02	of starting. I didn't know the way to come.	8463-294825-0003	9.935	Nemo builds a fabulous futuristic submarine, the Nautilus, then conducts an underwater campaign of vengeance against his imperialist oppressor.
+8463-287645-0001	3.545	It is hardly necessary to say more of them here.	8463-294825-0005	7.7	Other subtleties occur inside each episode, the textures sparkling with wit, information, and insight.
+8463-287645-0001	3.545	It is hardly necessary to say more of them here.	8463-294825-0010	4.580063	And in this last action he falls into the classic sin of Pride.
+8463-287645-0009	3.71	I never knew of but one man who could ever please him.	8463-294825-0012	5.965063	The Nautilus nearly perishes in the Antarctic and Nemo sinks into a growing depression.
+1580-141083-0021	3.715	There is no opening except the one pane," said our learned guide.	1580-141083-0000	8.94	I will endeavour, in my statement, to avoid such terms as would serve to limit the events to any particular place, or give a clue as to the people concerned.
+1580-141084-0034	4.49	Well, well, don't trouble to answer. Listen, and see that I do you no injustice.	1580-141083-0002	6.135	My friend's temper had not improved since he had been deprived of the congenial surroundings of Baker Street.
+1580-141083-0023	3.33	One could hardly hope for any upon so dry a day.	1580-141083-0003	6.55	Without his scrapbooks, his chemicals, and his homely untidiness, he was an uncomfortable man.
+1580-141084-0003	4.1	No names, please"! said Holmes, as we knocked at Gilchrist's door.	1580-141083-0004	4.515	I had to read it over carefully, as the text must be absolutely correct.
+1580-141084-0045	3.625	Suddenly he heard him at the very door. There was no possible escape.	1580-141083-0007	4.565	The moment I looked at my table, I was aware that someone had rummaged among my papers.
+1580-141083-0011	2.825	A broken tip of lead was lying there also.	1580-141083-0008	4.305	The proof was in three long slips. I had left them all together.
+1580-141083-0030	3.48	mister Soames was somewhat overwhelmed by this flood of information.	1580-141083-0009	7.04	The alternative was that someone passing had observed the key in the door, had known that I was out, and had entered to look at the papers.
+1580-141083-0030	3.48	mister Soames was somewhat overwhelmed by this flood of information.	1580-141083-0010	5.32	I gave him a little brandy and left him collapsed in a chair, while I made a most careful examination of the room.
+1580-141083-0050	3.085	I really don't think he knew much about it, mister Holmes.	1580-141083-0012	7.065	Not only this, but on the table I found a small ball of black dough or clay, with specks of something which looks like sawdust in it.
+1580-141083-0019	2.705	Above were three students, one on each story.	1580-141083-0013	4.32	Above all things, I desire to settle the matter quietly and discreetly".
+1580-141083-0048	2.785	How came you to leave the key in the door"?	1580-141083-0015	4.985	Did anyone know that these proofs would be there"? "No one save the printer".
+1580-141084-0021	4.01	On the palm were three little pyramids of black, doughy clay.	1580-141083-0016	4.255	I was in such a hurry to come to you". "You left your door open"?
+1580-141083-0036	3.98	Holmes held it out on his open palm in the glare of the electric light.	1580-141083-0020	5.135	Then he approached it, and, standing on tiptoe with his neck craned, he looked into the room.
+1580-141084-0050	2.78	If mister Soames saw them, the game was up.	1580-141083-0024	4.48	You left him in a chair, you say. Which chair"? "By the window there".
+1580-141084-0037	2.965	When I approached your room, I examined the window.	1580-141083-0026	4.775	As a matter of fact, he could not," said Soames, "for I entered by the side door".
+1580-141083-0030	3.48	mister Soames was somewhat overwhelmed by this flood of information.	1580-141083-0027	5.225	How long would it take him to do that, using every possible contraction? A quarter of an hour, not less.
+1580-141084-0050	2.78	If mister Soames saw them, the game was up.	1580-141083-0031	6.25	Holmes held out a small chip with the letters NN and a space of clear wood after them. "You see"?
+1580-141084-0036	2.475	The Indian I also thought nothing of.	1580-141083-0032	4.135	Watson, I have always done you an injustice. There are others.
+1580-141084-0045	3.625	Suddenly he heard him at the very door. There was no possible escape.	1580-141083-0033	7.45	I was hoping that if the paper on which he wrote was thin, some trace of it might come through upon this polished surface. No, I see nothing.
+1580-141083-0025	3.905	The man entered and took the papers, sheet by sheet, from the central table.	1580-141083-0034	6.99	As Holmes drew the curtain I was aware, from some little rigidity and alertness of his attitude, that he was prepared for an emergency.
+1580-141084-0050	2.78	If mister Soames saw them, the game was up.	1580-141083-0035	4.98	Holmes turned away, and stooped suddenly to the floor. "Hello! What's this"?
+1580-141083-0030	3.48	mister Soames was somewhat overwhelmed by this flood of information.	1580-141083-0037	5.73	What could he do? He caught up everything which would betray him, and he rushed into your bedroom to conceal himself".
+1580-141083-0036	3.98	Holmes held it out on his open palm in the glare of the electric light.	1580-141083-0038	7.535	I understand you to say that there are three students who use this stair, and are in the habit of passing your door"? "Yes, there are".
+1580-141083-0024	4.48	You left him in a chair, you say. Which chair"? "By the window there".	1580-141083-0042	5.865	My scholar has been left very poor, but he is hard working and industrious. He will do well.
+1580-141084-0014	3.97	Why, Bannister, the servant. What's his game in the matter"?	1580-141083-0044	5.505	I dare not go so far as that. But, of the three, he is perhaps the least unlikely".
+1580-141083-0025	3.905	The man entered and took the papers, sheet by sheet, from the central table.	1580-141083-0045	4.36	He was still suffering from this sudden disturbance of the quiet routine of his life.
+1580-141083-0052	3.45	Oh, I would not venture to say, sir.	1580-141083-0053	4.015	You haven't seen any of them"? "No, sir".
+4992-41797-0003	2.835	mister Popham laid down his brush.	4992-41797-0000	5.485	Yes, dead these four years, an' a good job for her, too.
+4992-41797-0003	2.835	mister Popham laid down his brush.	4992-41797-0002	5.625	Grandfather was Alexander Carey, L L. D., - Doctor of Laws, that is".
+4992-23283-0016	4.495	Again he searched his own thoughts; nor ineffectually as before.	4992-41797-0004	7.315	I swan to man"! he ejaculated. "If you don't work hard you can't keep up with the times! Doctor of Laws!
+4992-23283-0015	3.675	Is she not afraid that I will thwart her inclinations"?	4992-41797-0006	4.55	He keeps the thou shalt not commandments first rate, Hen Lord does!
+4992-23283-0015	3.675	Is she not afraid that I will thwart her inclinations"?	4992-41797-0007	6.905	He give up his position and shut the family up in that tomb of a house so 't he could study his books.
+4992-41797-0012	2.705	She is wild to know how to do things.	4992-41797-0008	8.965	mister Popham exaggerated nothing, but on the contrary left much unsaid in his narrative of the family at the House of Lords.
+4992-41797-0003	2.835	mister Popham laid down his brush.	4992-41797-0010	6.82	Always irritable, cold, indifferent, he had grown rapidly more so as years went on.
+4992-41797-0016	3.3	They couldn't run nor move; they're just pasteboard".	4992-41797-0011	5.445	Whatever appealed to her sense of beauty was straightway transferred to paper or canvas.
+4992-41806-0009	4.355	exclaimed Bill Harmon to his wife as they went through the lighted hall.	4992-41797-0013	9.8	She makes effort after effort, trembling with eagerness, and when she fails to reproduce what she sees, she works herself into a frenzy of grief and disappointment".
+4992-41806-0009	4.355	exclaimed Bill Harmon to his wife as they went through the lighted hall.	4992-41797-0014	7.215	When she could not make a rabbit or a bird look "real" on paper, she searched in her father's books for pictures of its bones.
+4992-41806-0009	4.355	exclaimed Bill Harmon to his wife as they went through the lighted hall.	4992-41797-0015	8.65	Cyril, there must be some better way of doing; I just draw the outline of an animal and then I put hairs or feathers on it. They have no bodies.
+4992-23283-0011	4.225	If she does not know how to estimate her own value, I do.	4992-41797-0017	8.69	He wouldn't search, so don't worry," replied Cyril quietly, and the two looked at each other and knew that it was so.
+4992-23283-0016	4.495	Again he searched his own thoughts; nor ineffectually as before.	4992-41797-0018	9.155	There, in the cedar hollow, then, lived Olive Lord, an angry, resentful, little creature weighed down by a fierce sense of injury.
+4992-23283-0001	2.71	Miss Milner's health is not good"!	4992-41797-0019	4.755	Olive's mournful black eyes met Nancy's sparkling brown ones.
+4992-41797-0012	2.705	She is wild to know how to do things.	4992-41797-0020	7.49	Nancy's curly chestnut crop shone in the sun, and Olive's thick black plaits looked blacker by contrast.
+4992-23283-0007	4.045	To ask any more questions of you, I believe, would be unfair.	4992-41797-0021	8.23	She's wonderful! More wonderful than anybody we've ever seen anywhere, and she draws better than the teacher in Charlestown!
+4992-23283-0001	2.71	Miss Milner's health is not good"!	4992-41797-0022	6.45	She's older than I am, but so tiny and sad and shy that she seems like a child.
+2830-3980-0001	3.945	They said to the Galatians: "You have no right to think highly of Paul.	2830-3979-0000	6.12	We want you to help us publish some leading work of Luther's for the general American market. Will you do it"?
+2830-3980-0020	3.46	This is no sinful pride. It is holy pride.	2830-3979-0002	4.315	Let us begin with that: his Commentary on Galatians..".
+2830-3980-0046	2.84	Was it not enough to say, "from God the Father"?	2830-3979-0003	8.085	The undertaking, which seemed so attractive when viewed as a literary task, proved a most difficult one, and at times became oppressive.
+2830-3980-0012	3.42	The most they could claim is that they were sent by others.	2830-3979-0006	4.55	A word should now be said about the origin of Luther's Commentary on Galatians.
+2830-3980-0013	4.145	He mentions the apostles first because they were appointed directly by God.	2830-3979-0008	9.44	In other words, these three men took down the lectures which Luther addressed to his students in the course of Galatians, and Roerer prepared the manuscript for the printer.
+2830-3980-0013	4.145	He mentions the apostles first because they were appointed directly by God.	2830-3979-0009	8.35	It presents like no other of Luther's writings the central thought of Christianity, the justification of the sinner for the sake of Christ's merits alone.
+2830-3980-0020	3.46	This is no sinful pride. It is holy pride.	2830-3979-0011	9.45	The Lord who has given us power to teach and to hear, let Him also give us the power to serve and to do". LUKE two
+2094-142345-0025	3.595	Cold, is it, my darling? Bless your sweet face"!	2094-142345-0001	8.03	But the windows are patched with wooden panes, and the door, I think, is like the gate it is never opened.
+2094-142345-0025	3.595	Cold, is it, my darling? Bless your sweet face"!	2094-142345-0005	9.09	Several clothes horses, a pillion, a spinning wheel, and an old box wide open and stuffed full of coloured rags.
+2094-142345-0060	2.71	Oh, I've no doubt it's in capital order.	2094-142345-0021	5.335	That's the way with you that's the road you'd all like to go, headlongs to ruin.
+2094-142345-0018	3.155	Who taught you to scrub a floor, I should like to know?	2094-142345-0034	7.99	And there's linen in the house as I could well spare you, for I've got lots o' sheeting and table clothing, and towelling, as isn't made up.
+2094-142345-0026	2.825	She's going to put the ironing things away".	2094-142345-0036	6.915	Nay, dear aunt, you never heard me say that all people are called to forsake their work and their families.
+2094-142345-0020	2.435	That's what you'd like to be doing, is it?	2094-142345-0039	6.28	I've strong assurance that no evil will happen to you and my uncle and the children from anything I've done.
+2094-142345-0020	2.435	That's what you'd like to be doing, is it?	2094-142345-0043	7.35	By this time the two gentlemen had reached the palings and had got down from their horses: it was plain they meant to come in.
+2094-142345-0032	3.24	I often heard her talk of you in the same sort of way.	2094-142345-0048	6.39	said Captain Donnithorne, seating himself where he could see along the short passage to the open dairy door.
+2094-142345-0004	2.64	And what through the left hand window?	2094-142345-0049	6.125	No, sir, he isn't; he's gone to Rosseter to see mister West, the factor, about the wool.
+2094-142345-0018	3.155	Who taught you to scrub a floor, I should like to know?	2094-142345-0051	5.31	No, thank you; I'll just look at the whelps and leave a message about them with your shepherd.
+2094-142345-0060	2.71	Oh, I've no doubt it's in capital order.	2094-142345-0052	6.53	I must come another day and see your husband; I want to have a consultation with him about horses.
+1995-1837-0009	3.76	The lagoon had been level with the dykes a week ago; and now?	1995-1836-0001	6	At last the Cotton Combine was to all appearances an assured fact and he was slated for the Senate.
+1995-1837-0015	4.485	The squares of cotton, sharp edged, heavy, were just about to burst to bolls!	1995-1836-0003	7.965	She was not herself a notably intelligent woman; she greatly admired intelligence or whatever looked to her like intelligence in others.
+1995-1837-0015	4.485	The squares of cotton, sharp edged, heavy, were just about to burst to bolls!	1995-1836-0006	7.715	She was therefore most agreeably surprised to hear mister Cresswell express himself so cordially as approving of Negro education.
+1995-1837-0005	2.635	She was so strange and human a creature.	1995-1836-0008	6.985	I believe in the training of people to their highest capacity". The Englishman here heartily seconded him.
+1995-1837-0000	3.865	He knew the Silver Fleece - his and Zora's - must be ruined.	1995-1836-0009	6.71	But," Cresswell added significantly, "capacity differs enormously between races".
+1995-1826-0004	3.035	Might learn something useful down there".	1995-1836-0011	4.705	Positively heroic," added Cresswell, avoiding his sister's eyes.
+1995-1837-0022	3.415	Up in the sick room Zora lay on the little white bed.	1995-1836-0014	9.045	Fortunately," said mister Vanderpool, "Northerners and Southerners are arriving at a better mutual understanding on most of these matters".
+237-126133-0021	4.365	she asked impulsively, "I didn't believe you could persuade her, father".	237-126133-0003	6.56	Somehow, of all the days when the home feeling was the strongest, this day it seemed as if she could bear it no longer.
+237-126133-0025	3.755	At last he came out of them, and wiped his face vigorously.	237-126133-0005	6.51	Oh, she's always at the piano," said Van. "She must be there now, somewhere," and then somebody laughed.
+237-126133-0016	4.25	Oh no, Jasper; I must go by my very own self".	237-126133-0006	6.15	At this, the bundle opened suddenly, and - out popped Phronsie!
+237-126133-0021	4.365	she asked impulsively, "I didn't believe you could persuade her, father".	237-126133-0007	8.68	But Polly couldn't speak; and if Jasper hadn't caught her just in time, she would have tumbled over backward from the stool, Phronsie and all!
+237-126133-0025	3.755	At last he came out of them, and wiped his face vigorously.	237-126133-0010	6.24	Oh, you are the dearest and best mister King I ever saw! but how did you make mammy let her come"?
+237-126133-0009	3.97	Now you'll stay," cried Van; "say, Polly, won't you".	237-126133-0011	6.71	Isn't he splendid"! cried Jasper in intense pride, swelling up. "Father knew how to do it".
+237-126133-0018	4.095	Don't mind it, Polly," whispered Jasper; "twasn't her fault".	237-126133-0012	4.45	There, there," he said soothingly, patting her brown, fuzzy head.
+237-126133-0016	4.25	Oh no, Jasper; I must go by my very own self".	237-126133-0013	6.815	I know," gasped Polly, controlling her sobs; "I won't - only - I can't thank you"!
+237-126133-0025	3.755	At last he came out of them, and wiped his face vigorously.	237-126133-0014	6.79	asked Phronsie in intense interest slipping down out of Polly's arms, and crowding up close to Jasper's side.
+237-126133-0025	3.755	At last he came out of them, and wiped his face vigorously.	237-126133-0015	9.34	Yes, all alone by himself," asserted Jasper, vehemently, and winking furiously to the others to stop their laughing; "he did now, truly, Phronsie".
+237-126133-0009	3.97	Now you'll stay," cried Van; "say, Polly, won't you".	237-126133-0016	4.25	Oh no, Jasper; I must go by my very own self".
+237-126133-0021	4.365	she asked impulsively, "I didn't believe you could persuade her, father".	237-126133-0017	6.21	There Jap, you've caught it," laughed Percy; while the others screamed at the sight of Jasper's face.
+237-126133-0008	3.865	asked Phronsie, with her little face close to Polly's own.	237-126133-0018	4.095	Don't mind it, Polly," whispered Jasper; "twasn't her fault".
+237-126133-0025	3.755	At last he came out of them, and wiped his face vigorously.	237-126133-0019	7.12	Dear me"! ejaculated the old gentleman, in the utmost amazement; "and such a time as I've had to get her here too"!
+237-126133-0025	3.755	At last he came out of them, and wiped his face vigorously.	237-126133-0021	4.365	she asked impulsively, "I didn't believe you could persuade her, father".
+237-126133-0021	4.365	she asked impulsively, "I didn't believe you could persuade her, father".	237-126133-0022	5.04	I didn't have any fears, if I worked it rightly," said the old gentleman complacently.
+237-126133-0021	4.365	she asked impulsively, "I didn't believe you could persuade her, father".	237-126133-0023	6.675	he cried in high dudgeon; just as if he owned the whole of the Peppers, and could dispose of them all to suit his fancy!
+237-126133-0021	4.365	she asked impulsively, "I didn't believe you could persuade her, father".	237-126133-0024	9.665	And the old gentleman was so delighted with his success, that he had to burst out into a series of short, happy bits of laughter, that occupied quite a space of time.
+4507-16021-0040	3.925	One thinks one hears hydras talking.	4507-16021-0003	4.895	She has a son, theft, and a daughter, hunger.
+4507-16021-0012	2.735	Why should one halt on the way?	4507-16021-0005	4.21	We have never understood this sort of objections.
+4507-16021-0015	3.86	Since when has malady banished medicine?	4507-16021-0011	5.615	Why should one not explore everything, and study everything?
+4507-16021-0000	2.59	Chapter one Origin.	4507-16021-0014	6.115	Now, when has horror ever excluded study?
+4507-16021-0007	2.63	Slang makes one shudder"!	4507-16021-0024	5.14	Algebra, medicine, botany, have each their slang.
+4507-16021-0041	2.975	It is unintelligible in the dark.	4507-16021-0025	9.215	To meet the needs of this conflict, wretchedness has invented a language of combat, which is slang.
+4507-16021-0050	3.895	And you belong to that small class who are happy!	4507-16021-0033	5.545	Do we really know the mountain well when we are not acquainted with the cavern?
+4507-16021-0058	3.11	The flame is the enemy of the wing.	4507-16021-0035	7.535	True history being a mixture of all things, the true historian mingles in everything.
+4507-16021-0028	3.265	Even dialect, let that pass!	4507-16021-0036	5.435	Facts form one of these, and ideas the other.
+4507-16021-0015	3.86	Since when has malady banished medicine?	4507-16021-0037	5.35	There it clothes itself in word masks, in metaphor rags.
+4507-16021-0050	3.895	And you belong to that small class who are happy!	4507-16021-0045	4.89	It is so made, that everywhere we feel the sense of punishment.
+4507-16021-0012	2.735	Why should one halt on the way?	4507-16021-0046	4.59	Each day has its own great grief or its little care.
+4507-16021-0050	3.895	And you belong to that small class who are happy!	4507-16021-0048	5.215	This without reckoning in the pains of the heart. And so it goes on.
+4507-16021-0050	3.895	And you belong to that small class who are happy!	4507-16021-0049	5.91	There is hardly one day out of a hundred which is wholly joyous and sunny.
+4507-16021-0019	2.93	It is the language of wretchedness.	4507-16021-0051	6.17	In this world, evidently the vestibule of another, there are no fortunate.
+4507-16021-0007	2.63	Slang makes one shudder"!	4507-16021-0052	6.275	The real human division is this: the luminous and the shady.
+4507-16021-0005	4.21	We have never understood this sort of objections.	4507-16021-0053	8.095	To diminish the number of the shady, to augment the number of the luminous,-that is the object.
+4507-16021-0029	3.87	To this we reply in one word, only.	4507-16021-0054	4.315	That is why we cry: Education! science!
+4507-16021-0041	2.975	It is unintelligible in the dark.	4507-16021-0055	7.225	To teach reading, means to light the fire; every syllable spelled out sparkles.
+4507-16021-0040	3.925	One thinks one hears hydras talking.	4507-16021-0056	6.345	However, he who says light does not, necessarily, say joy.
+4507-16021-0038	3.885	In this guise it becomes horrible.	4507-16021-0057	4.61	People suffer in the light; excess burns.
+4507-16021-0015	3.86	Since when has malady banished medicine?	4507-16021-0059	6.205	To burn without ceasing to fly, therein lies the marvel of genius.
+8555-284447-0020	4.09	The goat's warlike spirit was roused by this successful attack.	8555-284447-0000	9.605	Then he rushed down stairs into the courtyard, shouting loudly for his soldiers and threatening to patch everybody in his dominions if the sailorman was not recaptured.
+8555-284447-0003	4.415	But Captain Bill made no such attempt, knowing it would be useless.	8555-284447-0001	8.61	Hold him fast, my men, and as soon as I've had my coffee and oatmeal I'll take him to the Room of the Great Knife and patch him".
+8555-284447-0022	3.56	I had a notion it was you, mate, as saved me from the knife.	8555-284447-0002	8.025	I wouldn't mind a cup of coffee myself," said Captain Bill. "I've had considerable exercise this morning and I'm all ready for breakfast".
+8555-284447-0009	3.275	Mornin', girls; hope ye feel as well as ye look".	8555-284447-0003	4.415	But Captain Bill made no such attempt, knowing it would be useless.
+8555-284447-0020	4.09	The goat's warlike spirit was roused by this successful attack.	8555-284447-0004	5.485	As soon as they entered the Room of the Great Knife the Boolooroo gave a yell of disappointment.
+8555-284447-0020	4.09	The goat's warlike spirit was roused by this successful attack.	8555-284447-0005	6.83	The Room of the Great Knife was high and big, and around it ran rows of benches for the spectators to sit upon.
+8555-284449-0005	2.555	When he finished she said cheerfully:	8555-284447-0007	6.365	Therefore her Majesty paid no attention to anyone and no one paid any attention to her.
+8555-284447-0020	4.09	The goat's warlike spirit was roused by this successful attack.	8555-284447-0008	8.39	Rich jewels of blue stones glittered upon their persons and the royal ladies were fully as gorgeous as they were haughty and overbearing.
+8555-284447-0020	4.09	The goat's warlike spirit was roused by this successful attack.	8555-284447-0013	9.04	Why, you said to fetch the first living creature we met, and that was this billygoat," replied the Captain, panting hard as he held fast to one of the goat's horns.
+8555-284447-0020	4.09	The goat's warlike spirit was roused by this successful attack.	8555-284447-0014	8.47	The idea of patching Captain Bill to a goat was vastly amusing to him, and the more he thought of it the more he roared with laughter.
+8555-284447-0020	4.09	The goat's warlike spirit was roused by this successful attack.	8555-284447-0018	5.46	At once the goat gave a leap, escaped from the soldiers and with bowed head rushed upon the Boolooroo.
+8555-284447-0003	4.415	But Captain Bill made no such attempt, knowing it would be useless.	8555-284447-0020	4.09	The goat's warlike spirit was roused by this successful attack.
+8555-284447-0022	3.56	I had a notion it was you, mate, as saved me from the knife.	8555-284447-0023	7.155	I couldn't shiver much, bein' bound so tight, but when I'm loose I mean to have jus' one good shiver to relieve my feelin's".
+8555-292519-0013	4.185	That was but rustling of dripping plants in the dark.	8555-284447-0024	4.635	Come and get the Boolooroo," she said, going toward the benches.
+8230-279154-0003	3.195	And what sort of evidence is logically possible?	8230-279154-0000	8.805	The analysis of knowledge will occupy us until the end of the thirteenth lecture, and is the most difficult part of our whole enterprise.
+8230-279154-0032	3.88	It is this that is of interest to theory of knowledge.	8230-279154-0005	7.72	All that I am doing is to use its logical tenability as a help in the analysis of what occurs when we remember.
+8230-279154-0003	3.195	And what sort of evidence is logically possible?	8230-279154-0006	7.51	The behaviourist, who attempts to make psychology a record of behaviour, has to trust his memory in making the record.
+8230-279154-0008	3.62	But I do not think such an inference is warranted.	8230-279154-0011	6.25	Some images, like some sensations, feel very familiar, while others feel strange.
+8230-279154-0003	3.195	And what sort of evidence is logically possible?	8230-279154-0014	7.94	I come now to the other characteristic which memory images must have in order to account for our knowledge of the past.
+8230-279154-0003	3.195	And what sort of evidence is logically possible?	8230-279154-0015	8.05	They must have some characteristic which makes us regard them as referring to more or less remote portions of the past.
+8230-279154-0003	3.195	And what sort of evidence is logically possible?	8230-279154-0017	7.93	There may be a specific feeling which could be called the feeling of "pastness," especially where immediate memory is concerned.
+8230-279154-0003	3.195	And what sort of evidence is logically possible?	8230-279154-0020	7.835	If we had retained the "subject" or "act" in knowledge, the whole problem of memory would have been comparatively simple.
+8230-279154-0003	3.195	And what sort of evidence is logically possible?	8230-279154-0021	6.56	Remembering has to be a present occurrence in some way resembling, or related to, what is remembered.
+8230-279154-0012	3.64	Familiarity is a feeling capable of degrees.	8230-279154-0022	6.44	Some points may be taken as fixed, and such as any theory of memory must arrive at.
+8230-279154-0032	3.88	It is this that is of interest to theory of knowledge.	8230-279154-0023	6.265	In this case, as in most others, what may be taken as certain in advance is rather vague.
+8230-279154-0008	3.62	But I do not think such an inference is warranted.	8230-279154-0024	6.34	The first of our vague but indubitable data is that there is knowledge of the past.
+8230-279154-0032	3.88	It is this that is of interest to theory of knowledge.	8230-279154-0026	9.3	This distinction is vital to the understanding of memory. But it is not so easy to carry out in practice as it is to draw in theory.
+8230-279154-0003	3.195	And what sort of evidence is logically possible?	8230-279154-0029	8.54	The fact that a man can recite a poem does not show that he remembers any previous occasion on which he has recited or read it.
+8230-279154-0008	3.62	But I do not think such an inference is warranted.	8230-279154-0030	7.28	Semon's two books, mentioned in an earlier lecture, do not touch knowledge memory at all closely.
+8230-279154-0012	3.64	Familiarity is a feeling capable of degrees.	8230-279154-0035	7.555	Thus no knowledge as to the past is to be derived from the feeling of familiarity alone.
+8230-279154-0003	3.195	And what sort of evidence is logically possible?	8230-279154-0039	4.59	This knowledge is memory in one sense, though in another it is not.
+7021-85628-0000	3.02	But Anders cared nothing about that.	7021-79740-0001	5.995	Della had a young sister named Maria, and a cousin whose name was Jane.
+7021-85628-0019	3.255	With one jump Anders got out of his chair.	7021-79740-0002	9.225	Now Delia contrived to obtain a great influence and ascendency over the minds of the children by means of these dolls.
+7021-79740-0009	3.635	They were now playing with their dolls in the parlor.	7021-79740-0003	4.985	To give an idea of these conversations I will report one of them in full.
+7021-79740-0012	3.26	said she, pointing to the playthings; "see!	7021-79740-0004	6.465	You have come, Andella (Andella was the name of Jane's doll), to make Rosalie a visit.
+7021-85628-0019	3.255	With one jump Anders got out of his chair.	7021-79740-0006	5.965	I expect you have been a very good girl, Andella, since you were here last".
+7021-79740-0012	3.26	said she, pointing to the playthings; "see!	7021-79740-0007	6.99	Then, turning to Jane, she asked, in a somewhat altered tone, "Has she been a good girl, Jane"?
+7021-79740-0009	3.635	They were now playing with their dolls in the parlor.	7021-79740-0013	7.365	Put these playthings all away quick, and carefully, and we will not let them know any thing about your leaving them out".
+61-70970-0016	4.37	We will go out together to the bower; there is a way down to the court from my window.	61-70968-0000	4.905	He began a confused complaint against the wizard, who had vanished behind the curtain on the left.
+61-70968-0012	2.61	Cries of: "A Nottingham! A Nottingham"!	61-70968-0003	4.315	He was like unto my father, in a way, and yet was not my father.
+61-70970-0009	3.405	Tis late; and I go myself within a short space.	61-70968-0005	5.07	This was so sweet a lady, sir, and in some manner I do think she died.
+61-70968-0018	2.405	So I did push this fellow".	61-70968-0009	4.51	Like as not, young master, though I am an old man".
+61-70970-0033	3.42	Truly such a horse should be worth much in Nottingham Fair!	61-70968-0010	8.295	Forthwith all ran to the opening of the tent to see what might be amiss; but Master Will, who peeped out first, needed no more than one glance.
+61-70970-0016	4.37	We will go out together to the bower; there is a way down to the court from my window.	61-70968-0011	6.375	He gave way to the others very readily and retreated unperceived by the Squire and Mistress Fitzooth to the rear of the tent.
+61-70970-0019	3.78	At last all was quiet and black in the courtyard of Gamewell.	61-70968-0013	4.45	Before them fled the stroller and his three sons, capless and terrified.
+61-70968-0006	2.935	But then the picture was gone as quickly as it came".	61-70968-0014	7.485	What is the tumult and rioting"? cried out the Squire, authoritatively, and he blew twice on a silver whistle which hung at his belt.
+61-70968-0036	2.934938	George Montfichet will never forget this day.	61-70968-0015	5.375	Nay, we refused their request most politely, most noble," said the little stroller.
+61-70970-0007	4.485	He was in deep converse with the clerk, and entered the hall holding him by the arm.	61-70968-0017	5.11	I could not see my boy injured, excellence, for but doing his duty as one of Cumberland's sons.
+61-70970-0023	3.705	Be not so foolish, friend," said Fitzooth, crossly.	61-70968-0019	5.475	It is enough," said George Gamewell, sharply, and he turned upon the crowd.
+61-70968-0025	4.41	Come to me, men, here, here"! He raised his voice still louder.	61-70968-0020	5.105	Shame on you, citizens," cried he; "I blush for my fellows of Nottingham.
+61-70968-0048	3.02	And Henry might return to England at any moment.	61-70968-0022	4.67	Tis fine for you to talk, old man," answered the lean, sullen apprentice.
+61-70970-0033	3.42	Truly such a horse should be worth much in Nottingham Fair!	61-70968-0023	5.025	But I wrestled with this fellow and do know that he played unfairly in the second bout.
+61-70970-0032	3.135	enquired Robin, with his suspicions still upon him.	61-70968-0024	6.025	spoke the Squire, losing all patience; "and it was to you that I gave another purse in consolation!
+61-70970-0003	3.835	If, for a whim, you beggar yourself, I cannot stay you.	61-70968-0025	4.41	Come to me, men, here, here"! He raised his voice still louder.
+61-70970-0040	4.165	They regained their apartment, apparently without disturbing the household of Gamewell.	61-70968-0026	4.92	The strollers took their part in it with hearty zest now that they had some chance of beating off their foes.
+61-70970-0016	4.37	We will go out together to the bower; there is a way down to the court from my window.	61-70968-0027	6.87	Robin and the little tumbler between them tried to force the Squire to stand back, and very valiantly did these two comport themselves.
+61-70968-0046	3.55	Nottingham Castle was reached, and admittance was demanded.	61-70968-0030	5.685	Now, be silent, on your lives," he began; but the captured apprentice set up an instant shout.
+61-70968-0029	3.495	The Squire helped to thrust them all in and entered swiftly himself.	61-70968-0032	4.28	He felt for and found the wizard's black cloth. The Squire was quite out of breath.
+61-70970-0016	4.37	We will go out together to the bower; there is a way down to the court from my window.	61-70968-0033	5.685	Thrusting open the proper entrance of the tent, Robin suddenly rushed forth with his burden, with a great shout.
+61-70970-0016	4.37	We will go out together to the bower; there is a way down to the court from my window.	61-70968-0035	7.95	Taking advantage of this, the Squire's few men redoubled their efforts, and, encouraged by Robin's and the little stroller's cries, fought their way to him.
+61-70968-0036	2.934938	George Montfichet will never forget this day.	61-70968-0037	4.315	What is your name, lording"? asked the little stroller, presently.
+61-70970-0022	3.97	Robin entered the hut, dragging the unwilling esquire after him.	61-70968-0041	6.825	I like you, Will; you are the second Will that I have met and liked within two days; is there a sign in that"?
+61-70968-0003	4.315	He was like unto my father, in a way, and yet was not my father.	61-70968-0043	6.735	Friends," said Montfichet, faintly, to the wrestlers, "bear us escort so far as the Sheriff's house.
+61-70970-0013	4.35	There was no chance to alter his sleeping room to one nearer to Gamewell's chamber.	61-70968-0047	4.775	Master Monceux, the Sheriff of Nottingham, was mightily put about when told of the rioting.
+61-70968-0048	3.02	And Henry might return to England at any moment.	61-70968-0049	8.25	Have your will, child, if the boy also wills it," Montfichet answered, feeling too ill to oppose anything very strongly just then.
+61-70968-0042	2.785	Montfichet called out for Robin to give him an arm.	61-70968-0050	5.58	He made an effort to hide his condition from them all, and Robin felt his fingers tighten upon his arm.
+61-70970-0030	3.24	Save me, masters, but you startled me rarely"!	61-70968-0053	4.22	He is my esquire, excellency," returned Robin, with dignity.
+61-70970-0040	4.165	They regained their apartment, apparently without disturbing the household of Gamewell.	61-70968-0054	7.86	Mistress Fitzooth had been carried off by the Sheriff's daughter and her maids as soon as they had entered the house, so that Robin alone had the care of Montfichet.
+61-70968-0012	2.61	Cries of: "A Nottingham! A Nottingham"!	61-70968-0057	5.065	These escapades are not for old Gamewell, lad; his day has come to twilight.
+61-70968-0048	3.02	And Henry might return to England at any moment.	61-70968-0061	5.53	You are a worthy leech, Will," presently whispered Robin. "The wine has worked a marvel.
+8224-274384-0003	3.87	or hath he given us any gift?	8224-274384-0002	9.815	They informed the English parliament of this unexpected incident, and assured them that they had entered into no private treaty with the king.
+8224-274384-0003	3.87	or hath he given us any gift?	8224-274384-0005	8.745	Another preacher, after reproaching him to his face with his misgovernment, ordered this psalm to be sung:
+8224-274384-0003	3.87	or hath he given us any gift?	8224-274384-0006	6.81	The king stood up, and called for that psalm which begins with these words,
+8224-274384-0003	3.87	or hath he given us any gift?	8224-274384-0007	6.23	Have mercy, Lord, on me, I pray; For men would me devour".
+8224-274384-0003	3.87	or hath he given us any gift?	8224-274384-0009	4.805	The parliament and the Scots laid their proposals before the king.
+8224-274384-0003	3.87	or hath he given us any gift?	8224-274384-0013	5.44	His death, in this conjuncture, was a public misfortune.
+6829-68771-0035	4.39	Will you leave me alone in my own room, or must I go away to escape you"?	6829-68769-0001	9.315	It was a serious crime indeed, mister Watson told them, and Tom Gates bade fair to serve a lengthy term in state's prison as a consequence of his rash act.
+6829-68769-0046	2.57	You're foolish. Why should you do all this"?	6829-68769-0003	4.215	It was a deliberate theft from his employers to protect a girl he loved.
+6829-68769-0007	3.865	But under the circumstances I doubt if such an arrangement could be made".	6829-68769-0004	7.145	But they could not have proven a case against Lucy, if she was innocent, and all their threats of arresting her were probably mere bluff.
+6829-68769-0044	3.225	It has cost me twice sixty dollars in annoyance".	6829-68769-0005	6.72	He was soft hearted and impetuous," said Beth; "and, being in love, he didn't stop to count the cost".
+6829-68769-0022	4.115	We have heard something of your story," said Kenneth, "and are interested in it.	6829-68769-0006	7.195	If the prosecution were withdrawn and the case settled with the victim of the forged check, then the young man would be allowed his freedom.
+6829-68769-0022	4.115	We have heard something of your story," said Kenneth, "and are interested in it.	6829-68769-0009	4.22	They were received in the little office by a man named Markham, who was the jailer.
+6829-68769-0003	4.215	It was a deliberate theft from his employers to protect a girl he loved.	6829-68769-0011	4.685	I'm running for Representative on the Republican ticket," said Kenneth, quietly.
+6829-68769-0039	4.045	He looked up rather ungraciously, but motioned them to be seated.	6829-68769-0012	4.295	Oh, say! that's different," observed Markham, altering his demeanor.
+6829-68769-0003	4.215	It was a deliberate theft from his employers to protect a girl he loved.	6829-68769-0015	6.525	Sometimes I'm that yearning for a smoke I'm nearly crazy, an' I don't know which is worst, dying one way or another.
+6829-68769-0037	2.53	I've seen lots of that kind in my day.	6829-68769-0016	4.12	He unlocked the door, and called: "Here's visitors, Tom".
+6829-68771-0028	3.555	She even seemed mildly amused at the attention she attracted.	6829-68769-0020	5.125	Sit down, please," said Gates, in a cheerful and pleasant voice. "There's a bench here".
+6829-68769-0002	3.075	I can't see it in that light," said the old lawyer.	6829-68769-0021	7.895	A fresh, wholesome looking boy, was Tom Gates, with steady gray eyes, an intelligent forehead, but a sensitive, rather weak mouth.
+6829-68769-0009	4.22	They were received in the little office by a man named Markham, who was the jailer.	6829-68769-0022	4.115	We have heard something of your story," said Kenneth, "and are interested in it.
+6829-68771-0028	3.555	She even seemed mildly amused at the attention she attracted.	6829-68769-0023	4.89	I didn't stop to think whether it was foolish or not. I did it; and I'm glad I did".
+6829-68769-0007	3.865	But under the circumstances I doubt if such an arrangement could be made".	6829-68769-0025	5.735	Then Rogers wouldn't do anything but lead her around, and wait upon her, and the place went to rack and ruin".
+6829-68769-0051	3.545	There was a grim smile of amusement on his shrewd face.	6829-68769-0026	4.64	He spoke simply, but paced up and down the narrow cell in front of them.
+6829-68769-0012	4.295	Oh, say! that's different," observed Markham, altering his demeanor.	6829-68769-0030	4.91	I was bookkeeper, so it was easy to get a blank check and forge the signature.
+6829-68769-0037	2.53	I've seen lots of that kind in my day.	6829-68769-0031	5.555	As regards my robbing the company, I'll say that I saved them a heavy loss one day.
+6829-68769-0007	3.865	But under the circumstances I doubt if such an arrangement could be made".	6829-68769-0032	5.72	I discovered and put out a fire that would have destroyed the whole plant. But Marshall never even thanked me.
+6829-68769-0019	2.665	Sorry we haven't any reception room in the jail.	6829-68769-0033	4.02	It was better for him to think the girl unfeeling than to know the truth.
+6829-68769-0019	2.665	Sorry we haven't any reception room in the jail.	6829-68769-0034	6.055	I'm going to see mister Marshall," said Kenneth, "and discover what I can do to assist you". "Thank you, sir.
+6829-68771-0035	4.39	Will you leave me alone in my own room, or must I go away to escape you"?	6829-68769-0036	5.555	They left him then, for the jailer arrived to unlock the door, and escort them to the office.
+6829-68769-0017	3.545	Worse, Tom; worse 'n ever," replied the jailer, gloomily.	6829-68769-0039	4.045	He looked up rather ungraciously, but motioned them to be seated.
+6829-68771-0028	3.555	She even seemed mildly amused at the attention she attracted.	6829-68769-0040	4.77	Some girl has been here twice to interview my men and I have refused to admit her.
+6829-68769-0012	4.295	Oh, say! that's different," observed Markham, altering his demeanor.	6829-68769-0049	7.4	He detested the grasping disposition that would endeavor to take advantage of his evident desire to help young Gates.
+6829-68769-0010	3.14	We wish to talk with him," answered Kenneth. "Talk!	6829-68769-0052	4.6	He might have had that forged check for the face of it, if he'd been sharp.
+6829-68769-0051	3.545	There was a grim smile of amusement on his shrewd face.	6829-68769-0053	6.36	And to think we can save all that misery and despair by the payment of a hundred and fifty dollars!
+5142-33396-0015	4.31	As our boat flashed down the rollers into the water I made this song and sang it:	5142-36586-0003	5.055	But this subject will be more properly discussed when we treat of the different races of mankind.
+3570-5694-0019	3.755	But the general distinction is not on that account to be overlooked.	3570-5696-0002	7.51	Other circumstances permitting, that instinct disposes men to look with favor upon productive efficiency and on whatever is of human use.
+3570-5694-0012	3.205	There is a more or less elaborate system of rank and grades.	3570-5696-0004	4.7	The salient features of this development of domestic service have already been indicated.
+3570-5694-0012	3.205	There is a more or less elaborate system of rank and grades.	3570-5696-0006	4.16	As used in the speech of everyday life the word carries an undertone of deprecation.
+3570-5694-0019	3.755	But the general distinction is not on that account to be overlooked.	3570-5696-0007	9.5	The use of the word "waste" as a technical term, therefore, implies no deprecation of the motives or of the ends sought by the consumer under this canon of conspicuous waste.
+3570-5696-0006	4.16	As used in the speech of everyday life the word carries an undertone of deprecation.	3570-5696-0008	7.26	But it is, on other grounds, worth noting that the term "waste" in the language of everyday life implies deprecation of what is characterized as wasteful.
+3570-5694-0012	3.205	There is a more or less elaborate system of rank and grades.	3570-5696-0009	8.86	In strict accuracy nothing should be included under the head of conspicuous waste but such expenditure as is incurred on the ground of an invidious pecuniary comparison.
+3570-5694-0012	3.205	There is a more or less elaborate system of rank and grades.	3570-5696-0010	7.57	An article may be useful and wasteful both, and its utility to the consumer may be made up of use and waste in the most varying proportions.
+2830-3980-0042	3.02	The world brands this a pernicious doctrine.	2830-3980-0005	6.45	Do you suppose that God for the sake of a few Lutheran heretics would disown His entire Church?
+2830-3980-0071	3.96	We think that by some little work or merit we can dismiss sin.	2830-3980-0006	6.41	Against these boasting, false apostles, Paul boldly defends his apostolic authority and ministry.
+2830-3980-0046	2.84	Was it not enough to say, "from God the Father"?	2830-3980-0008	4.84	Paul takes pride in his ministry, not to his own praise but to the praise of God.
+2830-3980-0028	3.54	This should go far in shutting the mouths of the false apostles.	2830-3980-0010	6.525	Either He calls ministers through the agency of men, or He calls them directly as He called the prophets and apostles.
+2830-3980-0071	3.96	We think that by some little work or merit we can dismiss sin.	2830-3980-0011	5.525	Paul declares that the false apostles were called or sent neither by men, nor by man.
+2830-3980-0028	3.54	This should go far in shutting the mouths of the false apostles.	2830-3980-0013	4.145	He mentions the apostles first because they were appointed directly by God.
+2830-3980-0017	3.665	When I was a young man I thought Paul was making too much of his call.	2830-3980-0019	7.015	I knew nothing of the doctrine of faith because we were taught sophistry instead of certainty, and nobody understood spiritual boasting.
+2830-3980-0021	2.91	and God the Father, who raised him from the dead.	2830-3980-0023	6.16	These perverters of the righteousness of Christ resist the Father and the Son, and the works of them both.
+2830-3980-0020	3.46	This is no sinful pride. It is holy pride.	2830-3980-0025	8.795	By His resurrection Christ won the victory over law, sin, flesh, world, devil, death, hell, and every evil.
+2830-3980-0042	3.02	The world brands this a pernicious doctrine.	2830-3980-0029	9.075	Although the brethren with me are not apostles like myself, yet they are all of one mind with me, think, write, and teach as I do".
+2830-3980-0000	3.73	In every way they sought to undermine the authority of Saint Paul.	2830-3980-0030	5.25	They do not go where the enemies of the Gospel predominate. They go where the Christians are.
+2830-3980-0000	3.73	In every way they sought to undermine the authority of Saint Paul.	2830-3980-0031	8.485	Why do they not invade the Catholic provinces and preach their doctrine to godless princes, bishops, and doctors, as we have done by the help of God?
+2830-3980-0042	3.02	The world brands this a pernicious doctrine.	2830-3980-0032	7.22	We look for that reward which "eye hath not seen, nor ear heard, neither hath entered into the heart of man".
+2830-3980-0000	3.73	In every way they sought to undermine the authority of Saint Paul.	2830-3980-0036	5.765	Wherever the means of grace are found, there is the Holy Church, even though Antichrist reigns there.
+2830-3980-0058	2.69	Mohammed also speaks highly of Christ.	2830-3980-0037	6.42	So much for the title of the epistle. Now follows the greeting of the apostle. VERSE three.
+2830-3980-0042	3.02	The world brands this a pernicious doctrine.	2830-3980-0038	5.54	Grace be to you, and peace, from God the Father, and from our Lord Jesus Christ.
+2830-3980-0000	3.73	In every way they sought to undermine the authority of Saint Paul.	2830-3980-0039	5.195	The terms of grace and peace are common terms with Paul and are now pretty well understood.
+2830-3980-0064	2.88	How may we obtain remission of our sins?	2830-3980-0041	4.89	Grace involves the remission of sins, peace, and a happy conscience.
+2830-3980-0024	3.935	In this whole epistle Paul treats of the resurrection of Christ.	2830-3980-0047	7.865	To do so is to lose God altogether because God becomes intolerable when we seek to measure and to comprehend His infinite majesty.
+2830-3980-0071	3.96	We think that by some little work or merit we can dismiss sin.	2830-3980-0050	7.475	Did not Christ Himself say: "I am the way, and the truth, and the life: no man cometh unto the Father, but by me"?
+2830-3980-0001	3.945	They said to the Galatians: "You have no right to think highly of Paul.	2830-3980-0051	6.44	When you argue about the nature of God apart from the question of justification, you may be as profound as you like.
+2830-3980-0046	2.84	Was it not enough to say, "from God the Father"?	2830-3980-0052	4.88	We are to hear Christ, who has been appointed by the Father as our divine Teacher.
+2830-3980-0003	2.48	Paul came later and is beneath us.	2830-3980-0053	5.015	At the same time, Paul confirms our creed, "that Christ is very God".
+2830-3980-0071	3.96	We think that by some little work or merit we can dismiss sin.	2830-3980-0055	7.335	To bestow peace and grace lies in the province of God, who alone can create these blessings. The angels cannot.
+2830-3980-0060	2.675	He never loses sight of the purpose of his epistle.	2830-3980-0056	5.35	Otherwise Paul should have written: "Grace from God the Father, and peace from our Lord Jesus Christ".
+2830-3980-0040	2.62	The greeting of the Apostle is refreshing.	2830-3980-0057	8.07	The Arians took Christ for a noble and perfect creature, superior even to the angels, because by Him God created heaven and earth.
+2830-3979-0012	3.625	The Word of our God shall stand forever.	2830-3980-0061	7.12	Not gold, or silver, or paschal lambs, or an angel, but Himself. What for?
+2830-3980-0034	2.97	These means cannot be contaminated.	2830-3980-0062	5.44	Not for a crown, or a kingdom, or our goodness, but for our sins.
+2830-3980-0045	3.51	Men Should Not Speculate About the Nature of God	2830-3980-0063	5.415	Underscore these words, for they are full of comfort for sore consciences.
+2830-3980-0042	3.02	The world brands this a pernicious doctrine.	2830-3980-0065	6.515	Paul answers: "The man who is named Jesus Christ and the Son of God gave himself for our sins".
+2830-3980-0021	2.91	and God the Father, who raised him from the dead.	2830-3980-0066	6.085	Since Christ was given for our sins it stands to reason that they cannot be put away by our own efforts.
+2830-3980-0071	3.96	We think that by some little work or merit we can dismiss sin.	2830-3980-0067	8.13	This sentence also defines our sins as great, so great, in fact, that the whole world could not make amends for a single sin.
+2830-3980-0045	3.51	Men Should Not Speculate About the Nature of God	2830-3980-0068	5	The greatness of the ransom, Christ, the Son of God, indicates this.
+2830-3980-0040	2.62	The greeting of the Apostle is refreshing.	2830-3980-0069	5.555063	The vicious character of sin is brought out by the words "who gave himself for our sins".
+2830-3980-0042	3.02	The world brands this a pernicious doctrine.	2830-3980-0072	4.855	This passage, then, bears out the fact that all men are sold under sin.
+2830-3980-0060	2.675	He never loses sight of the purpose of his epistle.	2830-3980-0074	5.7	This attitude is universal and particularly developed in those who consider themselves better than others.
+2830-3980-0042	3.02	The world brands this a pernicious doctrine.	2830-3980-0075	5.79	But the real significance and comfort of the words "for our sins" is lost upon them.
+2830-3980-0046	2.84	Was it not enough to say, "from God the Father"?	2830-3980-0076	4.81	On the other hand, we are not to regard them as so terrible that we must despair.
+5105-28241-0014	2.995	Another circumstance was most remarkable.	5105-28233-0000	4.51	Length of service: Fourteen years, three months, and five days.
+5105-28240-0018	2.885	You will take me on board, count, will you not"?	5105-28233-0001	4.49	He seemed born to please without being conscious of the power he possessed.
+5105-28240-0018	2.885	You will take me on board, count, will you not"?	5105-28233-0002	8.285	It must be owned, and no one was more ready to confess it than himself, that his literary attainments were by no means of a high order.
+5105-28233-0001	4.49	He seemed born to please without being conscious of the power he possessed.	5105-28233-0004	4.735	Once, in action, he was leading a detachment of infantry through an intrenchment.
+5105-28241-0003	3.98	Steam up and canvas spread, the schooner started eastwards.	5105-28233-0006	5.505	No cathedral - not even Burgos itself - could vie with the church at Montmartre.
+2961-961-0005	3.775	Some poems of Solon were recited by the boys.	2961-961-0000	4.665	Socrates begins the Timaeus with a summary of the Republic.
+2961-961-0005	3.775	Some poems of Solon were recited by the boys.	2961-961-0001	9.185	And now he desires to see the ideal State set in motion; he would like to know how she behaved in some great struggle.
+2961-961-0005	3.775	Some poems of Solon were recited by the boys.	2961-961-0003	4.73	I will, if Timaeus approves'. 'I approve.
+2961-961-0005	3.775	Some poems of Solon were recited by the boys.	2961-961-0006	4.6	And what was the subject of the poem'? said the person who made the remark.
+2961-961-0005	3.775	Some poems of Solon were recited by the boys.	2961-961-0007	8.505	The subject was a very noble one; he described the most famous action in which the Athenian people were ever engaged.
+2961-961-0005	3.775	Some poems of Solon were recited by the boys.	2961-961-0008	7.155	But the memory of their exploits has passed away owing to the lapse of time and the extinction of the actors.
+2961-961-0005	3.775	Some poems of Solon were recited by the boys.	2961-961-0009	5.705	Tell us,' said the other, 'the whole story, and where Solon heard the story.
+2961-961-0005	3.775	Some poems of Solon were recited by the boys.	2961-961-0010	7.83	But in Egypt the traditions of our own and other lands are by us registered for ever in our temples.
+2961-961-0005	3.775	Some poems of Solon were recited by the boys.	2961-961-0011	7.815	The genealogies which you have recited to us out of your own annals, Solon, are a mere children's story.
+2961-961-0005	3.775	Some poems of Solon were recited by the boys.	2961-961-0013	5.12	Solon marvelled, and desired to be informed of the particulars.
+2961-961-0005	3.775	Some poems of Solon were recited by the boys.	2961-961-0014	9.565	Nine thousand years have elapsed since she founded yours, and eight thousand since she founded ours, as our annals record.
+2961-961-0005	3.775	Some poems of Solon were recited by the boys.	2961-961-0015	6.815	Many laws exist among us which are the counterpart of yours as they were in the olden time.
+2961-961-0005	3.775	Some poems of Solon were recited by the boys.	2961-961-0016	7.815	I will briefly describe them to you, and you shall read the account of them at your leisure in the sacred registers.
+2961-961-0005	3.775	Some poems of Solon were recited by the boys.	2961-961-0017	9.73	Observe again, what care the law took in the pursuit of wisdom, searching out the deep things of the world, and applying them to the use of man.
+2961-961-0005	3.775	Some poems of Solon were recited by the boys.	2961-961-0018	5.29	The most famous of them all was the overthrow of the island of Atlantis.
+2961-961-0005	3.775	Some poems of Solon were recited by the boys.	2961-961-0020	6.125	This is the explanation of the shallows which are found in that part of the Atlantic ocean.
+2961-961-0005	3.775	Some poems of Solon were recited by the boys.	2961-961-0021	4.94	But I would not speak at the time, because I wanted to refresh my memory.
+1995-1837-0015	4.485	The squares of cotton, sharp edged, heavy, were just about to burst to bolls!	1995-1837-0001	8.73	It was the first great sorrow of his life; it was not so much the loss of the cotton itself - but the fantasy, the hopes, the dreams built around it.
+1995-1837-0015	4.485	The squares of cotton, sharp edged, heavy, were just about to burst to bolls!	1995-1837-0003	7.36	The revelation of his love lighted and brightened slowly till it flamed like a sunrise over him and left him in burning wonder.
+1995-1826-0008	2.895	Some others, too; big cotton county".	1995-1837-0004	6.36	He panted to know if she, too, knew, or knew and cared not, or cared and knew not.
+1995-1837-0005	2.635	She was so strange and human a creature.	1995-1837-0007	8.8	Then of a sudden, at midday, the sun shot out, hot and still; no breath of air stirred; the sky was like blue steel; the earth steamed.
+1995-1837-0009	3.76	The lagoon had been level with the dykes a week ago; and now?	1995-1837-0012	8.245	He splashed and stamped along, farther and farther onward until he neared the rampart of the clearing, and put foot upon the tree bridge.
+1995-1826-0003	3.09	Better go," he had counselled, sententiously.	1995-1837-0016	7.19	For one long moment he paused, stupid, agape with utter amazement, then leaned dizzily against a tree.
+1995-1837-0013	3.195	Then he looked down. The lagoon was dry.	1995-1837-0019	5.38	He sat down weak, bewildered, and one thought was uppermost - Zora!
+1995-1836-0007	3.435	But you believe in some education"? asked Mary Taylor.	1995-1837-0024	5.385	For a while she lay in her chair, in happy, dreamy pleasure at sun and bird and tree.
+1995-1836-0007	3.435	But you believe in some education"? asked Mary Taylor.	1995-1837-0025	9.505062	She rose with a fleeting glance, gathered the shawl round her, then gliding forward, wavering, tremulous, slipped across the road and into the swamp.
+1995-1837-0021	3.09	The hope and dream of harvest was upon the land.	1995-1837-0026	8.095	She had been born within its borders; within its borders she had lived and grown, and within its borders she had met her love.
+1995-1826-0003	3.09	Better go," he had counselled, sententiously.	1995-1837-0027	6.705	On she hurried until, sweeping down to the lagoon and the island, lo! the cotton lay before her!
+1995-1826-0025	3.295	Some time you'll tell me, please, won't you"?	1995-1837-0029	5.58	He darted through the trees and paused, a tall man strongly but slimly made.
+5639-40744-0010	4.12	It is the only amends I ask of you for the wrong you have done me".	5639-40744-0002	8.91	Rodolfo and his companions, with their faces muffled in their cloaks, stared rudely and insolently at the mother, the daughter, and the servant maid.
+5639-40744-0010	4.12	It is the only amends I ask of you for the wrong you have done me".	5639-40744-0005	5.645	Finally, the one party went off exulting, and the other was left in desolation and woe.
+5639-40744-0010	4.12	It is the only amends I ask of you for the wrong you have done me".	5639-40744-0006	8.045	Rodolfo arrived at his own house without any impediment, and Leocadia's parents reached theirs heart broken and despairing.
+5639-40744-0010	4.12	It is the only amends I ask of you for the wrong you have done me".	5639-40744-0007	5.825	Meanwhile Rodolfo had Leocadia safe in his custody, and in his own apartment.
+5639-40744-0011	2.665	She found the door, but it was locked outside.	5639-40744-0010	4.12	It is the only amends I ask of you for the wrong you have done me".
+5639-40744-0011	2.665	She found the door, but it was locked outside.	5639-40744-0012	8.595	She succeeded in opening the window; and the moonlight shone in so brightly, that she could distinguish the colour of some damask hangings in the room.
+5639-40744-0010	4.12	It is the only amends I ask of you for the wrong you have done me".	5639-40744-0013	6.865	She saw that the bed was gilded, and so rich, that it seemed that of a prince rather than of a private gentleman.
+5639-40744-0010	4.12	It is the only amends I ask of you for the wrong you have done me".	5639-40744-0014	7.72	Among other things on which she cast her eyes was a small crucifix of solid silver, standing on a cabinet near the window.
+5639-40744-0011	2.665	She found the door, but it was locked outside.	5639-40744-0016	9.49	On the contrary, he resolved to tell them, that repenting of his violence, and moved by her tears, he had only carried her half way towards his house, and then let her go.
+5639-40744-0010	4.12	It is the only amends I ask of you for the wrong you have done me".	5639-40744-0017	5.88	Choking with emotion, Leocadi made a sign to her parents that she wished to be alone with them.
+5639-40744-0010	4.12	It is the only amends I ask of you for the wrong you have done me".	5639-40744-0020	9.82	Thus did this humane and right minded father comfort his unhappy daughter; and her mother embracing her again did all she could to soothe her feelings.
+5639-40744-0011	2.665	She found the door, but it was locked outside.	5639-40744-0024	8.845	One day, when the boy was sent by his grandfather with a message to a relation, he passed along a street in which there was a great concourse of horsemen.
+5639-40744-0011	2.665	She found the door, but it was locked outside.	5639-40744-0025	8.785	The bed she too well remembered was there; and, above all, the cabinet, on which had stood the image she had taken away, was still on the same spot.
+5639-40744-0010	4.12	It is the only amends I ask of you for the wrong you have done me".	5639-40744-0029	7.305	This truth which I have learned from her lips is confirmed by his face, in which we have both beheld that of our son".
+5639-40744-0010	4.12	It is the only amends I ask of you for the wrong you have done me".	5639-40744-0033	9.15	Her bearing was graceful and animated; she led her son by the hand, and before her walked two maids with wax lights and silver candlesticks.
+260-123440-0003	3.585	Oh! won't she be savage if I've kept her waiting"!	260-123440-0010	8.315	How cheerfully he seems to grin, How neatly spread his claws, And welcome little fishes in With gently smiling jaws"!
+260-123440-0003	3.585	Oh! won't she be savage if I've kept her waiting"!	260-123440-0011	4.87	No, I've made up my mind about it; if I'm Mabel, I'll stay down here!
+260-123288-0019	2.955	At noon the violence of the storm redoubles.	260-123440-0012	5.245	It'll be no use their putting their heads down and saying 'Come up again, dear!
+260-123286-0022	3.235	Two hours afterwards a terrible shock awoke me.	260-123440-0015	6.2	I wish I hadn't cried so much"! said Alice, as she swam about, trying to find her way out.
+260-123286-0024	3.04	There's a whale, a whale"! cried the Professor.	260-123440-0016	4.895	I shall be punished for it now, I suppose, by being drowned in my own tears!
+260-123288-0009	3.435	Those clouds seem as if they were going to crush the sea".	260-123440-0019	6.63	cried Alice again, for this time the Mouse was bristling all over, and she felt certain it must be really offended.
+260-123440-0018	3.64	I am very tired of swimming about here, O Mouse"!	260-123440-0020	4.995	We won't talk about her any more if you'd rather not". "We indeed"!
+2300-131720-0006	4.12	There seems no good reason for believing that it will change.	2300-131720-0000	5.08	The Paris plant, like that at the Crystal Palace, was a temporary exhibit.
+2300-131720-0014	3.75	mister Edison was a leader far ahead of the time.	2300-131720-0005	6.9	Why, if we erect a station at the falls, it is a great economy to get it up to the city.
+2300-131720-0041	3.75	We had meters in which there were two bottles of liquid.	2300-131720-0006	4.12	There seems no good reason for believing that it will change.
+2300-131720-0014	3.75	mister Edison was a leader far ahead of the time.	2300-131720-0008	9.125	Everything he has done has been aimed at the conservation of energy, the contraction of space, the intensification of culture.
+2300-131720-0041	3.75	We had meters in which there were two bottles of liquid.	2300-131720-0009	9.605	For some years it was not found feasible to operate motors on alternating current circuits, and that reason was often urged against it seriously.
+2300-131720-0006	4.12	There seems no good reason for believing that it will change.	2300-131720-0015	8.875	He obtained the desired speed and load with a friction brake; also regulator of speed; but waited for an indicator to verify it.
+2300-131720-0041	3.75	We had meters in which there were two bottles of liquid.	2300-131720-0024	4.77	But the plant ran, and it was the first three wire station in this country".
+2300-131720-0014	3.75	mister Edison was a leader far ahead of the time.	2300-131720-0027	8.62	Edison held that the electricity sold must be measured just like gas or water, and he proceeded to develop a meter.
+2300-131720-0014	3.75	mister Edison was a leader far ahead of the time.	2300-131720-0029	6.425	Hence the Edison electrolytic meter is no longer used, despite its excellent qualities.
+2300-131720-0006	4.12	There seems no good reason for believing that it will change.	2300-131720-0030	9.98	The principle employed in the Edison electrolytic meter is that which exemplifies the power of electricity to decompose a chemical substance.
+2300-131720-0041	3.75	We had meters in which there were two bottles of liquid.	2300-131720-0034	8.605	the others having been in operation too short a time to show definite results, although they also went quickly to a dividend basis.
+2300-131720-0014	3.75	mister Edison was a leader far ahead of the time.	2300-131720-0037	7.965	He weighed and reweighed the meter plates, and pursued every line of investigation imaginable, but all in vain.
+2300-131720-0041	3.75	We had meters in which there were two bottles of liquid.	2300-131720-0038	5.61	He felt he was up against it, and that perhaps another kind of a job would suit him better.
+2300-131720-0041	3.75	We had meters in which there were two bottles of liquid.	2300-131720-0040	5.455	We were more interested in the technical condition of the station than in the commercial part.
+908-157963-0002	2.755	why fades the lotus of the water?	908-31957-0002	4.79	I did not wrong myself so, but I placed A wrong on thee.
+908-157963-0018	4.255	And fearest thou because I vanish and am seen no more.	908-31957-0003	6.565	When called before, I told how hastily I dropped my flowers or brake off from a game.
+908-157963-0001	2.885	O life of this our spring!	908-31957-0005	4.49	Alas, I have grieved so I am hard to love.
+908-31957-0018	3.915	But thou art not such A lover, my Beloved!	908-31957-0006	5.89	Open thy heart wide, And fold within, the wet wings of thy dove.
+908-157963-0002	2.755	why fades the lotus of the water?	908-31957-0007	5.8	Could it mean To last, a love set pendulous between Sorrow and sorrow?
+908-157963-0029	3.63	Why a Tongue impressed with honey from every wind?	908-31957-0009	7.705	And, though I have grown serene And strong since then, I think that God has willed A still renewable fear...
+908-31957-0005	4.49	Alas, I have grieved so I am hard to love.	908-31957-0012	7.615	if he, to keep one oath, Must lose one joy, by his life's star foretold.
+908-157963-0002	2.755	why fades the lotus of the water?	908-31957-0013	6.18	Slow to world greetings, quick with its "O, list," When the angels speak.
+908-157963-0024	3.44	image of weakness, art thou but a Worm?	908-31957-0014	7.56	A ring of amethyst I could not wear here, plainer to my sight, Than that first kiss.
+908-31957-0005	4.49	Alas, I have grieved so I am hard to love.	908-31957-0016	6.48	Dearest, teach me so To pour out gratitude, as thou dost, good!
+908-31957-0018	3.915	But thou art not such A lover, my Beloved!	908-31957-0017	7.795	Mussulmans and Giaours Throw kerchiefs at a smile, and have no ruth For any weeping.
+908-157963-0002	2.755	why fades the lotus of the water?	908-31957-0019	9.54	thou canst wait Through sorrow and sickness, to bring souls to touch, And think it soon when others cry "Too late".
+908-157963-0013	4.315	And why it scatters its bright beauty thro the humid air.	908-31957-0020	5.895	I thank all who have loved me in their hearts, With thanks and love from mine.
+908-31957-0018	3.915	But thou art not such A lover, my Beloved!	908-31957-0023	8.515	I love thee freely, as men strive for Right; I love thee purely, as they turn from Praise.
+908-157963-0002	2.755	why fades the lotus of the water?	908-31957-0024	7.54	I love thee with the passion put to use In my old griefs, and with my childhood's faith.
+4992-41797-0016	3.3	They couldn't run nor move; they're just pasteboard".	4992-41806-0001	8.31	To night there was no need of extra heat, and there were great ceremonies to be observed in lighting the fires on the hearthstones.
+4992-41797-0016	3.3	They couldn't run nor move; they're just pasteboard".	4992-41806-0003	9.24	Kathleen waved the torch to and fro as she recited some beautiful lines written for some such purpose as that which called them together to night.
+4992-41797-0016	3.3	They couldn't run nor move; they're just pasteboard".	4992-41806-0009	4.355	exclaimed Bill Harmon to his wife as they went through the lighted hall.
+4992-23283-0007	4.045	To ask any more questions of you, I believe, would be unfair.	4992-41806-0011	7.84	Mother Carey poured coffee, Nancy chocolate, and the others helped serve the sandwiches and cake, doughnuts and tarts.
+4992-23283-0016	4.495	Again he searched his own thoughts; nor ineffectually as before.	4992-41806-0012	6.73	At that moment the gentleman entered, bearing a huge object concealed by a piece of green felt.
+4992-41797-0016	3.3	They couldn't run nor move; they're just pasteboard".	4992-41806-0013	6.02	Approaching the dining table, he carefully placed the article in the centre and removed the cloth.
+7021-85628-0004	2.805	Yes, why not"? thought Anders.	7021-85628-0002	6.455	He was such a big boy that he wore high boots and carried a jack knife.
+7021-85628-0006	3.58	I am going to the court ball," answered Anders.	7021-85628-0005	5.015	Seeing that I am so fine, I may as well go and visit the King".
+7021-85628-0025	2.775	But his mother hugged him close.	7021-85628-0008	7.125	For, like as not, they must have thought him a prince when they saw his fine cap.
+7021-79759-0001	2.48	That is comparatively nothing.	7021-85628-0009	8.54	At the farther end of the largest hall a table was set with golden cups and golden plates in long rows.
+7021-85628-0019	3.255	With one jump Anders got out of his chair.	7021-85628-0010	8.015	On huge silver platters were pyramids of tarts and cakes, and red wine sparkled in glittering decanters.
+7021-79740-0012	3.26	said she, pointing to the playthings; "see!	7021-85628-0011	8.995	The Princess sat down under a blue canopy with bouquets of roses; and she let Anders sit in a golden chair by her side.
+7021-79740-0009	3.635	They were now playing with their dolls in the parlor.	7021-85628-0012	5.33	But you must not eat with your cap on your head," she said, and was going to take it off.
+7021-85628-0026	2.74	No, my little son," she said.	7021-85628-0016	4.28	That is a very fine cap you have," he said.
+7021-79740-0012	3.26	said she, pointing to the playthings; "see!	7021-85628-0018	8.22	And it is made of mother's best yarn, and she knitted it herself, and everybody wants to get it away from me".
+7021-79740-0012	3.26	said she, pointing to the playthings; "see!	7021-85628-0020	6.45	He darted like an arrow through all the halls, down all the stairs, and across the yard.
+7021-79740-0009	3.635	They were now playing with their dolls in the parlor.	7021-85628-0021	5.365	He still held on to it with both hands as he rushed into his mother's cottage.
+7021-79740-0009	3.635	They were now playing with their dolls in the parlor.	7021-85628-0022	5.145	And all his brothers and sisters stood round and listened with their mouths open.
+7021-79740-0009	3.635	They were now playing with their dolls in the parlor.	7021-85628-0023	9.03	But when his big brother heard that he had refused to give his cap for a King's golden crown, he said that Anders was a stupid.
+7021-85628-0019	3.255	With one jump Anders got out of his chair.	7021-85628-0027	8.5	If you dressed in silk and gold from top to toe, you could not look any nicer than in your little red cap".
+1284-1181-0007	4.04	She poured into the dish a quantity from each of these bottles.	1284-134647-0000	8.53	The grateful applause of the clergy has consecrated the memory of a prince who indulged their passions and promoted their interest.
+4970-29095-0011	3.355	Does thee think thee could stand it six months?	4970-29095-0002	5.48	Well, mother," said the young student, looking up, with a shade of impatience.
+4970-29095-0006	4.47	Is thy father willing thee should go away to a school of the world's people"?	4970-29095-0004	9.61	I heard father tell cousin Abner that he was whipped so often for whistling when he was a boy that he was determined to have what compensation he could get now".
+4970-29095-0014	3.26	Where thee and thy family are known"?	4970-29095-0005	4.65	Thy ways greatly try me, Ruth, and all thy relations.
+4970-29093-0015	3.325	You can begin by carrying a rod, and putting down the figures.	4970-29095-0006	4.47	Is thy father willing thee should go away to a school of the world's people"?
+4970-29095-0014	3.26	Where thee and thy family are known"?	4970-29095-0009	5.6	Margaret Bolton almost lost for a moment her habitual placidity.
+4970-29095-0000	2.865	She was tired of other things.	4970-29095-0012	4.68	And, besides, suppose thee does learn medicine"?
+4970-29095-0014	3.26	Where thee and thy family are known"?	4970-29095-0016	6.945	Ruth sat quite still for a time, with face intent and flushed. It was out now.
+4970-29095-0011	3.355	Does thee think thee could stand it six months?	4970-29095-0022	4.765	Is thee going to the Yearly Meeting, Ruth"? asked one of the girls.
+4970-29093-0000	3.03	You'll never dig it out of the Astor Library".	4970-29095-0024	6.04	It has occupied mother a long time, to find at the shops the exact shade for her new bonnet.
+4970-29093-0017	2.865	I've been ready to go anywhere for six months.	4970-29095-0027	9.795	It's such a crush at the Yearly Meeting at Arch Street, and then there's the row of sleek looking young men who line the curbstone and stare at us as we come out.
+4970-29093-0008	3.58	He wanted to begin at the top of the ladder.	4970-29095-0030	4.67	Father, thee's unjust to Philip. He's going into business".
+4970-29095-0017	3.93	The sight seers returned in high spirits from the city.	4970-29095-0032	6.61	But Philip is honest, and he has talent enough, if he will stop scribbling, to make his way.
+4970-29093-0008	3.58	He wanted to begin at the top of the ladder.	4970-29095-0034	5.81	Why should I rust, and be stupid, and sit in inaction because I am a girl?
+4970-29095-0011	3.355	Does thee think thee could stand it six months?	4970-29095-0035	4.75	And if I had a fortune, would thee want me to lead a useless life"?
+4970-29093-0017	2.865	I've been ready to go anywhere for six months.	4970-29095-0036	5.25	Has thee consulted thy mother about a career, I suppose it is a career thee wants"?
+4970-29093-0000	3.03	You'll never dig it out of the Astor Library".	4970-29095-0037	6.885	But that wise and placid woman understood the sweet rebel a great deal better than Ruth understood herself.
+4970-29093-0000	3.03	You'll never dig it out of the Astor Library".	4970-29095-0038	8.74	Ruth was glad to hear that Philip had made a push into the world, and she was sure that his talent and courage would make a way for him.
+121-127105-0032	3.17	Yes, but that's just the beauty of her passion".	121-127105-0000	9.875	It was this observation that drew from Douglas not immediately, but later in the evening a reply that had the interesting consequence to which I call attention.
+121-127105-0018	2.77	cried the ladies whose departure had been fixed.	121-127105-0001	5.025	Someone else told a story not particularly effective, which I saw he was not following.
+121-127105-0032	3.17	Yes, but that's just the beauty of her passion".	121-127105-0002	7.495	cried one of the women. He took no notice of her; he looked at me, but as if, instead of me, he saw what he spoke of.
+121-127105-0036	4.15	But was that all her reward"? one of the ladies asked.	121-127105-0003	7.725	There was a unanimous groan at this, and much reproach; after which, in his preoccupied way, he explained.
+121-127105-0032	3.17	Yes, but that's just the beauty of her passion".	121-127105-0005	5.82	I could write to my man and enclose the key; he could send down the packet as he finds it".
+121-127105-0018	2.77	cried the ladies whose departure had been fixed.	121-127105-0006	4.725	The others resented postponement, but it was just his scruples that charmed me.
+121-127105-0036	4.15	But was that all her reward"? one of the ladies asked.	121-127105-0007	5.79	To this his answer was prompt. "Oh, thank God, no"! "And is the record yours?
+121-127105-0010	2.85	She sent me the pages in question before she died".	121-127105-0011	5.78	She was the most agreeable woman I've ever known in her position; she would have been worthy of any whatever.
+121-127105-0010	2.85	She sent me the pages in question before she died".	121-127105-0012	4.83	It wasn't simply that she said so, but that I knew she hadn't. I was sure; I could see.
+121-127105-0010	2.85	She sent me the pages in question before she died".	121-127105-0013	5.895	You'll easily judge why when you hear". "Because the thing had been such a scare"? He continued to fix me.
+121-127105-0036	4.15	But was that all her reward"? one of the ladies asked.	121-127105-0022	5.075	Well, if I don't know who she was in love with, I know who he was".
+121-127105-0018	2.77	cried the ladies whose departure had been fixed.	121-127105-0026	7.53	The first of these touches conveyed that the written statement took up the tale at a point after it had, in a manner, begun.
+121-127105-0018	2.77	cried the ladies whose departure had been fixed.	121-127105-0028	6.75	The awkward thing was that they had practically no other relations and that his own affairs took up all his time.
+121-127105-0015	2.96	He quitted the fire and dropped back into his chair.	121-127105-0029	7.31	There were plenty of people to help, but of course the young lady who should go down as governess would be in supreme authority.
+121-127105-0036	4.15	But was that all her reward"? one of the ladies asked.	121-127105-0034	7.41	It sounded dull it sounded strange; and all the more so because of his main condition". "Which was-"?
+121-127105-0008	2.76	He hung fire again. "A woman's.	121-127105-0036	4.15	But was that all her reward"? one of the ladies asked.
+260-123288-0012	3.545	That will be safest". "No, no! Never"!	260-123286-0000	7.04	Saturday, august fifteenth. - The sea unbroken all round. No land in sight.
+260-123286-0012	2.43	But there seemed no reason to fear.	260-123286-0002	9.985	All my danger and sufferings were needed to strike a spark of human feeling out of him; but now that I am well his nature has resumed its sway.
+260-123440-0005	3.105	And yesterday things went on just as usual.	260-123286-0003	7.37	You seem anxious, my uncle," I said, seeing him continually with his glass to his eye. "Anxious!
+260-123286-0024	3.04	There's a whale, a whale"! cried the Professor.	260-123286-0005	4.81	I am not complaining that the rate is slow, but that the sea is so wide".
+260-123288-0019	2.955	At noon the violence of the storm redoubles.	260-123286-0006	7.405	We are losing time, and the fact is, I have not come all this way to take a little sail upon a pond on a raft".
+260-123288-0019	2.955	At noon the violence of the storm redoubles.	260-123286-0007	4.55	He called this sea a pond, and our long voyage, taking a little sail!
+260-123286-0022	3.235	Two hours afterwards a terrible shock awoke me.	260-123286-0009	5.795	I take this as my answer, and I leave the Professor to bite his lips with impatience.
+260-123286-0001	3.07	The horizon seems extremely distant.	260-123286-0011	4.255	Nothing new. Weather unchanged. The wind freshens.
+260-123286-0024	3.04	There's a whale, a whale"! cried the Professor.	260-123286-0013	4.73	The shadow of the raft was clearly outlined upon the surface of the waves.
+260-123440-0018	3.64	I am very tired of swimming about here, O Mouse"!	260-123286-0015	5.21	It must be as wide as the Mediterranean or the Atlantic - and why not?
+260-123288-0019	2.955	At noon the violence of the storm redoubles.	260-123286-0016	7	These thoughts agitated me all day, and my imagination scarcely calmed down after several hours' sleep.
+260-123440-0006	2.715	I wonder if I've been changed in the night?	260-123286-0018	5.67	I saw at the Hamburg museum the skeleton of one of these creatures thirty feet in length.
+260-123288-0009	3.435	Those clouds seem as if they were going to crush the sea".	260-123286-0023	5.875	The raft was heaved up on a watery mountain and pitched down again, at a distance of twenty fathoms.
+260-123286-0024	3.04	There's a whale, a whale"! cried the Professor.	260-123286-0025	9.205	Flight was out of the question now. The reptiles rose; they wheeled around our little raft with a rapidity greater than that of express trains.
+260-123288-0020	2.9	Each of us is lashed to some part of the raft.	260-123286-0026	6.94	Two monsters only were creating all this commotion; and before my eyes are two reptiles of the primitive world.
+260-123286-0022	3.235	Two hours afterwards a terrible shock awoke me.	260-123286-0027	7.17	I can distinguish the eye of the ichthyosaurus glowing like a red hot coal, and as large as a man's head.
+260-123286-0024	3.04	There's a whale, a whale"! cried the Professor.	260-123286-0029	4.545	Those huge creatures attacked each other with the greatest animosity.
+260-123440-0005	3.105	And yesterday things went on just as usual.	260-123286-0030	7.53	Suddenly the ichthyosaurus and the plesiosaurus disappear below, leaving a whirlpool eddying in the water.
+260-123288-0022	3.705	They seem to be 'We are lost'; but I am not sure.	260-123286-0031	5.06	As for the ichthyosaurus - has he returned to his submarine cavern?
+3575-170457-0031	4	On august twenty seventh, eighteen thirty seven, she writes:	3575-170457-0000	8.23	And often has my mother said, While on her lap I laid my head, She feared for time I was not made, But for Eternity.
+3575-170457-0032	3.03	Come, come. I am getting really tired of your absence.	3575-170457-0003	7.595	Surely, it must be because we are in danger of loving each other too well - of losing sight of the Creator in idolatry of the creature.
+3575-170457-0049	2.715	This decision was communicated to the girls.	3575-170457-0005	7.34	She, a Tory and clergyman's daughter, was always in a minority of one in our house of violent Dissent and Radicalism.
+3575-170457-0052	3	She had another weight on her mind this Christmas.	3575-170457-0006	8.3	Her feeble health gave her her yielding manner, for she could never oppose any one without gathering up all her strength for the struggle.
+3575-170457-0031	4	On august twenty seventh, eighteen thirty seven, she writes:	3575-170457-0007	7.775	He spoke French perfectly, I have been told, when need was; but delighted usually in talking the broadest Yorkshire.
+3575-170457-0031	4	On august twenty seventh, eighteen thirty seven, she writes:	3575-170457-0010	4.79	I am not depreciating it when I say that in these times it is not rare.
+3575-170457-0049	2.715	This decision was communicated to the girls.	3575-170457-0011	7.015	But it is not with a view to distinction that you should cultivate this talent, if you consult your own happiness.
+3575-170457-0049	2.715	This decision was communicated to the girls.	3575-170457-0012	5.850062	You will say that a woman has no need of such a caution; there can be no peril in it for her.
+3575-170457-0031	4	On august twenty seventh, eighteen thirty seven, she writes:	3575-170457-0013	9.175	The more she is engaged in her proper duties, the less leisure will she have for it, even as an accomplishment and a recreation.
+3575-170457-0031	4	On august twenty seventh, eighteen thirty seven, she writes:	3575-170457-0014	6.68	To those duties you have not yet been called, and when you are you will be less eager for celebrity.
+3575-170457-0004	3.105	We used to dispute about politics and religion.	3575-170457-0019	6.155	I had not ventured to hope for such a reply; so considerate in its tone, so noble in its spirit.
+3575-170457-0056	3.370062	I doubt whether Branwell was maintaining himself at this time.	3575-170457-0020	8.645	I know the first letter I wrote to you was all senseless trash from beginning to end; but I am not altogether the idle dreaming being it would seem to denote.
+3575-170457-0032	3.03	Come, come. I am getting really tired of your absence.	3575-170457-0021	4.18	I thought it therefore my duty, when I left school, to become a governess.
+3575-170457-0004	3.105	We used to dispute about politics and religion.	3575-170457-0022	5.825	In the evenings, I confess, I do think, but I never trouble any one else with my thoughts.
+3575-170457-0049	2.715	This decision was communicated to the girls.	3575-170457-0023	9.095	I carefully avoid any appearance of preoccupation and eccentricity, which might lead those I live amongst to suspect the nature of my pursuits.
+3575-170457-0034	3.495	in this monotonous life of mine, that was a pleasant event.	3575-170457-0025	9.205	Again I thank you. This incident, I suppose, will be renewed no more; if I live to be an old woman, I shall remember it thirty years hence as a bright dream.
+3575-170457-0031	4	On august twenty seventh, eighteen thirty seven, she writes:	3575-170457-0027	4.58	I cannot deny myself the gratification of inserting Southey's reply:
+3575-170457-0049	2.715	This decision was communicated to the girls.	3575-170457-0029	6.055	Your letter has given me great pleasure, and I should not forgive myself if I did not tell you so.
+3575-170457-0049	2.715	This decision was communicated to the girls.	3575-170457-0030	8.945063	Of this second letter, also, she spoke, and told me that it contained an invitation for her to go and see the poet if ever she visited the Lakes.
+3575-170457-0021	4.18	I thought it therefore my duty, when I left school, to become a governess.	3575-170457-0033	8.5	Saturday after Saturday comes round, and I can have no hope of hearing your knock at the door, and then being told that 'Miss E. is come'. Oh, dear!
+3575-170457-0049	2.715	This decision was communicated to the girls.	3575-170457-0035	9.37	I wish it would recur again; but it will take two or three interviews before the stiffness - the estrangement of this long separation - will wear away".
+3575-170457-0034	3.495	in this monotonous life of mine, that was a pleasant event.	3575-170457-0040	6.905	Indeed, there were only one or two strangers who could be admitted among the sisters without producing the same result.
+3575-170457-0049	2.715	This decision was communicated to the girls.	3575-170457-0044	9.72	After this disappointment, I never dare reckon with certainty on the enjoyment of a pleasure again; it seems as if some fatality stood between you and me.
+3575-170457-0001	2.99	Why are we to be denied each other's society?	3575-170457-0045	6.52	I am not good enough for you, and you must be kept from the contamination of too intimate society.
+3575-170457-0049	2.715	This decision was communicated to the girls.	3575-170457-0047	6.525	Tabby had lived with them for ten or twelve years, and was, as Charlotte expressed it, "one of the family".
+3575-170457-0052	3	She had another weight on her mind this Christmas.	3575-170457-0048	5.555	He refused at first to listen to the careful advice; it was repugnant to his liberal nature.
+3575-170457-0049	2.715	This decision was communicated to the girls.	3575-170457-0050	6.405	Tabby had tended them in their childhood; they, and none other, should tend her in her infirmity and age.
+3575-170457-0056	3.370062	I doubt whether Branwell was maintaining himself at this time.	3575-170457-0051	4.915	At tea time, they were sad and silent, and the meal went away untouched by any of the three.
+3575-170457-0031	4	On august twenty seventh, eighteen thirty seven, she writes:	3575-170457-0054	8.005	Stung by anxiety for this little sister, she upbraided Miss W -- for her fancied indifference to Anne's state of health.
+4970-29093-0008	3.58	He wanted to begin at the top of the ladder.	4970-29093-0007	6.995	It is such a noble ambition, that it is a pity it has usually such a shallow foundation.
+4970-29095-0011	3.355	Does thee think thee could stand it six months?	4970-29093-0009	9.12	Philip therefore read diligently in the Astor library, planned literary works that should compel attention, and nursed his genius.
+4970-29093-0017	2.865	I've been ready to go anywhere for six months.	4970-29093-0012	8.71	But Philip did afford it, and he wrote, thanking his friends, and declining because he said the political scheme would fail, and ought to fail.
+4970-29093-0017	2.865	I've been ready to go anywhere for six months.	4970-29093-0013	8.01	And he went back to his books and to his waiting for an opening large enough for his dignified entrance into the literary world.
+4970-29095-0008	3.04	Mother, I'm going to study medicine"?	4970-29093-0014	4.275	Well, I'm going as an engineer. You can go as one".
+4970-29093-0017	2.865	I've been ready to go anywhere for six months.	4970-29093-0018	9.715	The two young men who were by this time full of the adventure, went down to the Wall street office of Henry's uncle and had a talk with that wily operator.
+4970-29093-0015	3.325	You can begin by carrying a rod, and putting down the figures.	4970-29093-0019	7.47	The night was spent in packing up and writing letters, for Philip would not take such an important step without informing his friends.
+4970-29093-0004	3.75	He was unable to decide exactly what it should be.	4970-29093-0020	5.58	Why, it's in Missouri somewhere, on the frontier I think. We'll get a map".
+4970-29093-0017	2.865	I've been ready to go anywhere for six months.	4970-29093-0022	6.22	He knew his uncle would be glad to hear that he had at last turned his thoughts to a practical matter.
+4970-29095-0011	3.355	Does thee think thee could stand it six months?	4970-29093-0023	8.07	He well knew the perils of the frontier, the savage state of society, the lurking Indians and the dangers of fever.
+1284-1181-0019	3.2	I now use them as ornamental statuary in my garden.	1284-1180-0000	8.12	He wore blue silk stockings, blue knee pants with gold buckles, a blue ruffled waist and a jacket of bright blue braided with gold.
+1284-1181-0007	4.04	She poured into the dish a quantity from each of these bottles.	1284-1180-0001	7.755	His hat had a peaked crown and a flat brim, and around the brim was a row of tiny golden bells that tinkled when he moved.
+1284-1181-0021	2.7	asked the voice, in scornful accents.	1284-1180-0002	7.68	Instead of shoes, the old man wore boots with turnover tops and his blue coat had wide cuffs of gold braid.
+1284-1180-0004	4.285	When they were outside, Unc simply latched the door and started up the path.	1284-1180-0003	4.835	For a long time he had wished to explore the beautiful Land of Oz in which they lived.
+1284-1180-0014	3.665	Ojo had never eaten such a fine meal in all his life.	1284-1180-0004	4.285	When they were outside, Unc simply latched the door and started up the path.
+1284-1181-0002	3.835	The head of the Patchwork Girl was the most curious part of her.	1284-1180-0005	6.55	No one would disturb their little house, even if anyone came so far into the thick forest while they were gone.
+1284-1180-0004	4.285	When they were outside, Unc simply latched the door and started up the path.	1284-1180-0006	6.865	At the foot of the mountain that separated the Country of the Munchkins from the Country of the Gillikins, the path divided.
+1284-1180-0004	4.285	When they were outside, Unc simply latched the door and started up the path.	1284-1180-0007	6.265	He knew it would take them to the house of the Crooked Magician, whom he had never seen but who was their nearest neighbor.
+1284-1180-0014	3.665	Ojo had never eaten such a fine meal in all his life.	1284-1180-0009	6.285	Then they started on again and two hours later came in sight of the house of doctor Pipt.
+1284-1181-0002	3.835	The head of the Patchwork Girl was the most curious part of her.	1284-1180-0010	8.635	Unc knocked at the door of the house and a chubby, pleasant faced woman, dressed all in blue, opened it and greeted the visitors with a smile.
+1284-1180-0014	3.665	Ojo had never eaten such a fine meal in all his life.	1284-1180-0011	4.275	I am, my dear, and all strangers are welcome to my home".
+1284-1180-0011	4.275	I am, my dear, and all strangers are welcome to my home".	1284-1180-0012	4.88	We have come from a far lonelier place than this". "A lonelier place!
+1284-1180-0022	2.885	I'm afraid I don't know much about the Land of Oz.	1284-1180-0015	5.835	We are traveling," replied Ojo, "and we stopped at your house just to rest and refresh ourselves.
+1284-1180-0004	4.285	When they were outside, Unc simply latched the door and started up the path.	1284-1180-0020	5.87	The first lot we tested on our Glass Cat, which not only began to live but has lived ever since.
+1284-1181-0002	3.835	The head of the Patchwork Girl was the most curious part of her.	1284-1180-0021	9.84	I think the next Glass Cat the Magician makes will have neither brains nor heart, for then it will not object to catching mice and may prove of some use to us".
+1284-1180-0022	2.885	I'm afraid I don't know much about the Land of Oz.	1284-1180-0023	5.61	You see, I've lived all my life with Unc Nunkie, the Silent One, and there was no one to tell me anything".
+1284-1181-0007	4.04	She poured into the dish a quantity from each of these bottles.	1284-1180-0024	5.26	That is one reason you are Ojo the Unlucky," said the woman, in a sympathetic tone.
+1284-1181-0007	4.04	She poured into the dish a quantity from each of these bottles.	1284-1180-0025	8.705	I think I must show you my Patchwork Girl," said Margolotte, laughing at the boy's astonishment, "for she is rather difficult to explain.
+1284-1180-0004	4.285	When they were outside, Unc simply latched the door and started up the path.	1284-1180-0026	8.29	But first I will tell you that for many years I have longed for a servant to help me with the housework and to cook the meals and wash the dishes.
+1284-1181-0007	4.04	She poured into the dish a quantity from each of these bottles.	1284-1180-0028	6.045	A bed quilt made of patches of different kinds and colors of cloth, all neatly sewed together.
+1284-1181-0002	3.835	The head of the Patchwork Girl was the most curious part of her.	1284-1180-0029	5.335	Sometimes it is called a 'crazy quilt,' because the patches and colors are so mixed up.
+1284-1180-0004	4.285	When they were outside, Unc simply latched the door and started up the path.	1284-1180-0031	4.825	At the Emerald City, where our Princess Ozma lives, green is the popular color.
+1284-1181-0002	3.835	The head of the Patchwork Girl was the most curious part of her.	1284-1180-0032	5.78	I will show you what a good job I did," and she went to a tall cupboard and threw open the doors.
+3570-5694-0022	4.295	The livery becomes obnoxious to nearly all who are required to wear it.	3570-5694-0001	5.675	The utility of consumption as an evidence of wealth is to be classed as a derivative growth.
+3570-5694-0022	4.295	The livery becomes obnoxious to nearly all who are required to wear it.	3570-5694-0004	5.33	In the nature of things, luxuries and the comforts of life belong to the leisure class.
+3570-5694-0022	4.295	The livery becomes obnoxious to nearly all who are required to wear it.	3570-5694-0005	8.405	Under the tabu, certain victuals, and more particularly certain beverages, are strictly reserved for the use of the superior class.
+3570-5694-0022	4.295	The livery becomes obnoxious to nearly all who are required to wear it.	3570-5694-0008	9.495	The consumption of luxuries, in the true sense, is a consumption directed to the comfort of the consumer himself, and is, therefore, a mark of the master.
+3570-5694-0012	3.205	There is a more or less elaborate system of rank and grades.	3570-5694-0013	5.61	This differentiation is furthered by the inheritance of wealth and the consequent inheritance of gentility.
+3570-5694-0022	4.295	The livery becomes obnoxious to nearly all who are required to wear it.	3570-5694-0015	8.435	So many of them, however, as make up the retainer and hangers on of the patron may be classed as vicarious consumer without qualification.
+3570-5694-0019	3.755	But the general distinction is not on that account to be overlooked.	3570-5694-0017	8.335	The wearing of uniforms or liveries implies a considerable degree of dependence, and may even be said to be a mark of servitude, real or ostensible.
+3570-5694-0022	4.295	The livery becomes obnoxious to nearly all who are required to wear it.	3570-5694-0018	7.815	The wearers of uniforms and liveries may be roughly divided into two classes the free and the servile, or the noble and the ignoble.
+3570-5694-0019	3.755	But the general distinction is not on that account to be overlooked.	3570-5694-0022	4.295	The livery becomes obnoxious to nearly all who are required to wear it.
+8463-287645-0010	4.325	He worked me very hard; he wanted to be beating me all the time".	8463-294828-0001	9.19	THREE SECONDS before the arrival of JB Hobson's letter, I no more dreamed of chasing the unicorn than of trying for the Northwest Passage.
+8463-287645-0014	3.02	of starting. I didn't know the way to come.	8463-294828-0002	6.19	Even so, I had just returned from an arduous journey, exhausted and badly needing a rest.
+8463-294828-0021	2.735	A route slightly less direct, that's all.	8463-294828-0003	9.34	I wanted nothing more than to see my country again, my friends, my modest quarters by the Botanical Gardens, my dearly beloved collections!
+8463-294828-0026	2.745	We have a commander who's game for anything"!	8463-294828-0006	7.32	From rubbing shoulders with scientists in our little universe by the Botanical Gardens, the boy had come to know a thing or two.
+8463-294828-0026	2.745	We have a commander who's game for anything"!	8463-294828-0009	4.17	Not once did he comment on the length or the hardships of a journey.
+8463-287645-0009	3.71	I never knew of but one man who could ever please him.	8463-294828-0010	8.34	Never did he object to buckling up his suitcase for any country whatever, China or the Congo, no matter how far off it was.
+8463-294828-0011	3.91	He went here, there, and everywhere in perfect contentment.	8463-294828-0012	4.905	Please forgive me for this underhanded way of admitting I had turned forty.
+8463-287645-0008	3.325	As usual nothing was done in the way of punishment".	8463-294828-0013	7.2	He was a fanatic on formality, and he only addressed me in the third person to the point where it got tiresome.
+8463-287645-0009	3.71	I never knew of but one man who could ever please him.	8463-294828-0014	5.725	There was good reason to stop and think, even for the world's most emotionless man.
+8463-294828-0005	2.44	Conseil was my manservant.	8463-294828-0015	4.88	Conseil"! I called a third time. Conseil appeared.
+8463-287645-0008	3.325	As usual nothing was done in the way of punishment".	8463-294828-0017	9.3	Pack as much into my trunk as you can, my traveling kit, my suits, shirts, and socks, don't bother counting, just squeeze it all in and hurry"!
+8463-294825-0008	3.98	But much of the novel's brooding power comes from Captain Nemo.	8463-294828-0019	4.53	Anyhow, we'll leave instructions to ship the whole menagerie to France".
+8463-287645-0001	3.545	It is hardly necessary to say more of them here.	8463-294828-0020	5.915	Yes, we are... certainly...," I replied evasively, "but after we make a detour".
+8463-287645-0008	3.325	As usual nothing was done in the way of punishment".	8463-294828-0023	4.745	You see, my friend, it's an issue of the monster, the notorious narwhale.
+8463-294828-0026	2.745	We have a commander who's game for anything"!	8463-294828-0027	5.98	I left instructions for shipping my containers of stuffed animals and dried plants to Paris, France.
+8463-294828-0034	3.505	We'll be quite comfortable here," I told Conseil.	8463-294828-0028	7.915	I opened a line of credit sufficient to cover the babirusa and, Conseil at my heels, I jumped into a carriage.
+8463-294828-0011	3.91	He went here, there, and everywhere in perfect contentment.	8463-294828-0029	5.285	Our baggage was immediately carried to the deck of the frigate. I rushed aboard.
+8463-294828-0026	2.745	We have a commander who's game for anything"!	8463-294828-0031	7.765	One of the sailors led me to the afterdeck, where I stood in the presence of a smart looking officer who extended his hand to me.
+8463-287645-0008	3.325	As usual nothing was done in the way of punishment".	8463-294828-0032	4.395	In person. Welcome aboard, professor. Your cabin is waiting for you".
+8463-294825-0008	3.98	But much of the novel's brooding power comes from Captain Nemo.	8463-294828-0033	6.365	I was well satisfied with my cabin, which was located in the stern and opened into the officers' mess.
+8463-294828-0009	4.17	Not once did he comment on the length or the hardships of a journey.	8463-294828-0036	6.985	The wharves of Brooklyn, and every part of New York bordering the East River, were crowded with curiosity seekers.
+7127-75947-0008	4.155	The arrow pierced his heart and wounded him mortally.	7127-75947-0001	6.64	Upon this Madame deigned to turn her eyes languishingly towards the comte, observing.
+7127-75947-0002	3.235	Do you think so"? she replied with indifference.	7127-75947-0003	5.98	Yes; the character which your royal highness assumed is in perfect harmony with your own".
+7127-75946-0025	3.96	The ballet began; the effect was more than beautiful.	7127-75947-0007	5.46	She then rose, humming the air to which she was presently going to dance.
+7127-75946-0005	2.67	What do you mean"? inquired Louis,	7127-75947-0008	4.155	The arrow pierced his heart and wounded him mortally.
+7127-75947-0002	3.235	Do you think so"? she replied with indifference.	7127-75947-0010	8.865	When she perceived the young man, she rose, like a woman surprised in the midst of ideas she was desirous of concealing from herself.
+7127-75946-0005	2.67	What do you mean"? inquired Louis,	7127-75947-0013	5.045	I remember now, and I congratulate myself. Do you love any one"?
+7127-75947-0018	4.04	I have been here this quarter of an hour," replied La Valliere.	7127-75947-0015	6.26	There cannot be a doubt he received you kindly, for, in fact, you returned without his permission".
+7127-75946-0010	3.6	Your majesty's plan, then, in this affair, is	7127-75947-0016	7.48	Oh! mademoiselle, why have I not a devoted sister, or a true friend, such as yourself"?
+7127-75947-0018	4.04	I have been here this quarter of an hour," replied La Valliere.	7127-75947-0024	7.33	Look yonder, do you not see the moon slowly rising, silvering the topmost branches of the chestnuts and the oaks.
+7127-75947-0018	4.04	I have been here this quarter of an hour," replied La Valliere.	7127-75947-0025	5.57	exquisite soft turf of the woods, the happiness which your friendship confers upon me!
+7127-75946-0025	3.96	The ballet began; the effect was more than beautiful.	7127-75947-0028	7.46	Quick, quick, then, among the high reed grass," said Montalais; "stoop, Athenais, you are so tall".
+7127-75946-0025	3.96	The ballet began; the effect was more than beautiful.	7127-75947-0029	5.285	The young girls had, indeed, made themselves small - indeed invisible.
+7127-75947-0019	3.875	Did not the dancing amuse you"? "No".	7127-75947-0032	4.745	Yes; but perhaps I frightened her". "In what way"?
+7127-75947-0018	4.04	I have been here this quarter of an hour," replied La Valliere.	7127-75947-0035	4.415	Good gracious! has the king any right to interfere in matters of that kind?
+7127-75947-0018	4.04	I have been here this quarter of an hour," replied La Valliere.	7127-75947-0037	8.824938	Oh! I am speaking seriously," replied Montalais, "and my opinion in this case is quite as good as the king's, I suppose; is it not, Louise"?
+121-121726-0011	4.035	HUSBAND The next thing to a wife.	121-123859-0004	9.505	So I return rebuked to my content, And gain by ill thrice more than I have spent.
+908-31957-0005	4.49	Alas, I have grieved so I am hard to love.	908-157963-0005	7.035	Like the doves voice, like transient day, like music in the air: Ah!
+908-157963-0009	4.06	Why should the mistress of the vales of Har, utter a sigh.	908-157963-0006	8.11	And gentle sleep the sleep of death, and gently hear the voice Of him that walketh in the garden in the evening time.
+908-157963-0029	3.63	Why a Tongue impressed with honey from every wind?	908-157963-0009	4.06	Why should the mistress of the vales of Har, utter a sigh.
+908-31957-0018	3.915	But thou art not such A lover, my Beloved!	908-157963-0010	6.28	She ceasd and smiled in tears, then sat down in her silver shrine.
+908-157963-0018	4.255	And fearest thou because I vanish and am seen no more.	908-157963-0013	4.315	And why it scatters its bright beauty thro the humid air.
+908-157963-0018	4.255	And fearest thou because I vanish and am seen no more.	908-157963-0014	4.52	Descend O little cloud and hover before the eyes of Thel.
+908-31957-0018	3.915	But thou art not such A lover, my Beloved!	908-157963-0016	5.105	I pass away, yet I complain, and no one hears my voice.
+908-157963-0013	4.315	And why it scatters its bright beauty thro the humid air.	908-157963-0017	4.95	The Cloud then shewd his golden head and his bright form emerged.
+908-157963-0003	3.08	Why fade these children of the spring?	908-157963-0018	4.255	And fearest thou because I vanish and am seen no more.
+908-157963-0024	3.44	image of weakness, art thou but a Worm?	908-157963-0020	9.8	Till we arise linked in a golden band and never part: But walk united bearing food to all our tender flowers.
+908-157963-0013	4.315	And why it scatters its bright beauty thro the humid air.	908-157963-0022	4.61	Come forth worm and the silent valley, to thy pensive queen.
+908-157963-0002	2.755	why fades the lotus of the water?	908-157963-0023	9.625	The helpless worm arose and sat upon the Lillys leaf, And the bright Cloud saild on, to find his partner in the vale.
+908-157963-0024	3.44	image of weakness, art thou but a Worm?	908-157963-0025	9.265	I see they lay helpless and naked: weeping And none to answer, none to cherish thee with mothers smiles.
+908-157963-0029	3.63	Why a Tongue impressed with honey from every wind?	908-157963-0026	8.1	And says; Thou mother of my children, I have loved thee And I have given thee a crown that none can take away.
+908-157963-0024	3.44	image of weakness, art thou but a Worm?	908-157963-0027	5.225	And lay me down in thy cold bed, and leave my shining lot.
+908-157963-0003	3.08	Why fade these children of the spring?	908-157963-0028	4.955	Or an Eye of gifts and graces showring fruits and coined gold!
+908-157963-0024	3.44	image of weakness, art thou but a Worm?	908-157963-0030	4.52	Why an Ear, a whirlpool fierce to draw creations in?
+4446-2271-0003	3.7	It's been on only two weeks, and I've been half a dozen times already.	4446-2271-0001	6.35	He had preconceived ideas about everything, and his idea about Americans was that they should be engineers or mechanics.
+4446-2275-0005	4.445	I felt it in my bones when I woke this morning that something splendid was going to turn up.	4446-2271-0008	5.495	Irene Burgoyne, one of her family, told me in confidence that there was a romance somewhere back in the beginning.
+4446-2271-0005	3.395	She saves her hand, too. She's at her best in the second act.	4446-2271-0009	7.82	Mainhall vouched for her constancy with a loftiness that made Alexander smile, even while a kind of rapid excitement was tingling through him.
+4446-2273-0009	4.015	It's not particularly rare," she said, "but some of it was my mother's.	4446-2271-0013	4.4	Do you know, I thought the dance a bit conscious to night, for the first time.
+4446-2273-0002	3.295	Lamb wouldn't care a great deal about many of them, I fancy".	4446-2271-0014	5.34	Westmere and I were back after the first act, and we thought she seemed quite uncertain of herself.
+4446-2273-0033	3.3	For a long time neither Hilda nor Bartley spoke.	4446-2271-0018	5.715	She considered a moment and then said "No, I think not, though I am glad you ask me.
+4446-2275-0045	2.635	We've tortured each other enough for tonight.	4446-2271-0020	7.55	Of course," he reflected, "she always had that combination of something homely and sensible, and something utterly wild and daft.
+4446-2271-0005	3.395	She saves her hand, too. She's at her best in the second act.	4446-2273-0000	8.995	Hilda was very nice to him, and he sat on the edge of his chair, flushed with his conversational efforts and moving his chin about nervously over his high collar.
+4446-2273-0002	3.295	Lamb wouldn't care a great deal about many of them, I fancy".	4446-2273-0001	4.66	They asked him to come to see them in Chelsea, and they spoke very tenderly of Hilda.
+4446-2273-0002	3.295	Lamb wouldn't care a great deal about many of them, I fancy".	4446-2273-0003	7.835	When Bartley arrived at Bedford Square on Sunday evening, Marie, the pretty little French girl, met him at the door and conducted him upstairs.
+4446-2275-0022	3.28	But why didn't you tell me when you were here in the summer"?	4446-2273-0004	5.435	I should never have asked you if Molly had been here, for I remember you don't like English cookery".
+4446-2273-0034	3.59	He felt a tremor run through the slender yellow figure in front of him.	4446-2273-0005	4.125	I haven't had a chance yet to tell you what a jolly little place I think this is.
+4446-2273-0002	3.295	Lamb wouldn't care a great deal about many of them, I fancy".	4446-2273-0008	7.715	I've managed to save something every year, and that with helping my three sisters now and then, and tiding poor Cousin Mike over bad seasons.
+4446-2271-0013	4.4	Do you know, I thought the dance a bit conscious to night, for the first time.	4446-2273-0009	4.015	It's not particularly rare," she said, "but some of it was my mother's.
+4446-2271-0000	3.495	Mainhall liked Alexander because he was an engineer.	4446-2273-0015	4.505	Don't I, though! I'm so sorry to hear it. How did her son turn out?
+4446-2271-0005	3.395	She saves her hand, too. She's at her best in the second act.	4446-2273-0016	9.645	Her hair is still like flax, and her blue eyes are just like a baby's, and she has the same three freckles on her little nose, and talks about going back to her bains de mer".
+4446-2275-0015	2.98	He pulled up a window as if the air were heavy.	4446-2273-0021	5.255	What she wanted from us was neither our flowers nor our francs, but just our youth.
+4446-2271-0013	4.4	Do you know, I thought the dance a bit conscious to night, for the first time.	4446-2273-0022	5.865	They were both remembering what the woman had said when she took the money: "God give you a happy love"!
+4446-2273-0012	2.98	Thank you. But I don't like it so well as this".	4446-2273-0023	6.1	The strange woman, and her passionate sentence that rang out so sharply, had frightened them both.
+4446-2271-0024	3.16	I shouldn't wonder if she could laugh about it with me now.	4446-2273-0024	4.825	Bartley started when Hilda rang the little bell beside her. "Dear me, why did you do that?
+4446-2271-0011	3.945	Sir Harry Towne, mister Bartley Alexander, the American engineer".	4446-2273-0025	4.83	It was very jolly," he murmured lazily, as Marie came in to take away the coffee.
+4446-2271-0013	4.4	Do you know, I thought the dance a bit conscious to night, for the first time.	4446-2273-0028	5.405	Nonsense. Of course I can't really sing, except the way my mother and grandmother did before me.
+4446-2273-0011	2.79	There is nothing else that looks so jolly".	4446-2273-0032	7.835	He stood a little behind her, and tried to steady himself as he said: "It's soft and misty. See how white the stars are".
+4446-2271-0012	3.78	I say, Sir Harry, the little girl's going famously to night, isn't she"?	4446-2273-0035	6.15	Bartley leaned over her shoulder, without touching her, and whispered in her ear: "You are giving me a chance"? "Yes.
+1188-133604-0013	3.02	It must, remember, be one or the other.	1188-133604-0001	9.04	They unite every quality; and sometimes you will find me referring to them as colorists, sometimes as chiaroscurists.
+1188-133604-0031	4.25	There's one, and there's another - the "Dudley" and the "Flint".	1188-133604-0005	8.56	It is the head of a parrot with a little flower in his beak from a picture of Carpaccio's, one of his series of the Life of Saint George.
+1188-133604-0031	4.25	There's one, and there's another - the "Dudley" and the "Flint".	1188-133604-0010	6.095	But in this vignette, copied from Turner, you have the two principles brought out perfectly.
+1188-133604-0040	3.23	The crampness and the poverty are all intended.	1188-133604-0014	4.39	Do not, therefore, think that the Gothic school is an easy one.
+1188-133604-0031	4.25	There's one, and there's another - the "Dudley" and the "Flint".	1188-133604-0017	4.615	That a style is restrained or severe does not mean that it is also erroneous.
+1188-133604-0014	4.39	Do not, therefore, think that the Gothic school is an easy one.	1188-133604-0022	9.63	You must look at him in the face - fight him - conquer him with what scathe you may: you need not think to keep out of the way of him.
+1188-133604-0031	4.25	There's one, and there's another - the "Dudley" and the "Flint".	1188-133604-0025	7.45	You know I have just been telling you how this school of materialism and clay involved itself at last in cloud and fire.
+1188-133604-0040	3.23	The crampness and the poverty are all intended.	1188-133604-0031	4.25	There's one, and there's another - the "Dudley" and the "Flint".
+1188-133604-0006	2.4	Then he comes to the beak of it.	1188-133604-0033	6.625	Every plant in the grass is set formally, grows perfectly, and may be realized completely.
+1188-133604-0014	4.39	Do not, therefore, think that the Gothic school is an easy one.	1188-133604-0036	7.97	In both these high mythical subjects the surrounding nature, though suffering, is still dignified and beautiful.
+1188-133604-0031	4.25	There's one, and there's another - the "Dudley" and the "Flint".	1188-133604-0038	5.365	But now here is a subject of which you will wonder at first why Turner drew it at all.
+1188-133604-0031	4.25	There's one, and there's another - the "Dudley" and the "Flint".	1188-133604-0039	6.625	It has no beauty whatsoever, no specialty of picturesqueness; and all its lines are cramped and poor.
+1188-133604-0031	4.25	There's one, and there's another - the "Dudley" and the "Flint".	1188-133604-0043	4.885	See that your lives be in nothing worse than a boy's climbing for his entangled kite.
+7729-102255-0000	3.285	The bogus Legislature numbered thirty six members.	7729-102255-0002	8.3	That summer's emigration, however, being mainly from the free States, greatly changed the relative strength of the two parties.
+7729-102255-0034	2.71	To their sorrow they were soon undeceived.	7729-102255-0005	5.18	This was a formidable array of advantages; slavery was playing with loaded dice.
+7729-102255-0013	2.675	It was, in fact, the best weapon of its day.	7729-102255-0010	8.54	Of the lynchings, the mobs, and the murders, it would be impossible, except in a very extended work, to note the frequent and atrocious details.
+7729-102255-0034	2.71	To their sorrow they were soon undeceived.	7729-102255-0012	4.075	Several hundred free State men promptly responded to the summons.
+7729-102255-0034	2.71	To their sorrow they were soon undeceived.	7729-102255-0014	5.295	The leaders of the conspiracy became distrustful of their power to crush the town.
+7729-102255-0001	3.45	This was at the March election, eighteen fifty five.	7729-102255-0021	7.93	But the affair was magnified as a crowning proof that the free State men were insurrectionists and outlaws.
+7729-102255-0001	3.45	This was at the March election, eighteen fifty five.	7729-102255-0023	5.5	Their distinctive characters, however, display one broad and unfailing difference.
+7729-102255-0001	3.45	This was at the March election, eighteen fifty five.	7729-102255-0025	5.485	Their assumed character changed with their changing opportunities or necessities.
+7729-102255-0001	3.45	This was at the March election, eighteen fifty five.	7729-102255-0028	9.6	Private persons who had leased the Free State Hotel vainly besought the various authorities to prevent the destruction of their property.
+7729-102255-0001	3.45	This was at the March election, eighteen fifty five.	7729-102255-0029	7.06	Ten days were consumed in these negotiations; but the spirit of vengeance refused to yield.
+7729-102255-0001	3.45	This was at the March election, eighteen fifty five.	7729-102255-0030	7.25	He summoned half a dozen citizens to join his posse, who followed, obeyed, and assisted him.
+7729-102255-0001	3.45	This was at the March election, eighteen fifty five.	7729-102255-0031	6.75	He continued his pretended search and, to give color to his errand, made two arrests.
+7729-102255-0001	3.45	This was at the March election, eighteen fifty five.	7729-102255-0033	6.775	As he had promised to protect the hotel, the reassured citizens began to laugh at their own fears.
+7729-102255-0034	2.71	To their sorrow they were soon undeceived.	7729-102255-0035	5.625	The military force, partly rabble, partly organized, had meanwhile moved into the town.
+7729-102255-0012	4.075	Several hundred free State men promptly responded to the summons.	7729-102255-0036	7.705	He planted a company before the hotel, and demanded a surrender of the arms belonging to the free- State military companies.
+7729-102255-0001	3.45	This was at the March election, eighteen fifty five.	7729-102255-0038	7.92	Atchison, who had been haranguing the mob, planted his two guns before the building and trained them upon it.
+7729-102255-0001	3.45	This was at the March election, eighteen fifty five.	7729-102255-0039	6.815	The inmates being removed, at the appointed hour a few cannon balls were fired through the stone walls.
+7729-102255-0001	3.45	This was at the March election, eighteen fifty five.	7729-102255-0045	6.805	Captain Martin said: 'I shall give you a pistol to help protect yourself if worse comes to worst!
+3570-5694-0022	4.295	The livery becomes obnoxious to nearly all who are required to wear it.	3570-5695-0000	4.83	In a general way, though not wholly nor consistently, these two groups coincide.
+3570-5694-0012	3.205	There is a more or less elaborate system of rank and grades.	3570-5695-0002	7.805	But as we descend the social scale, the point is presently reached where the duties of vicarious leisure and consumption devolve upon the wife alone.
+3570-5694-0012	3.205	There is a more or less elaborate system of rank and grades.	3570-5695-0003	5.355	In the communities of the Western culture, this point is at present found among the lower middle class.
+3570-5694-0019	3.755	But the general distinction is not on that account to be overlooked.	3570-5695-0006	7.47	Very much of squalor and discomfort will be endured before the last trinket or the last pretense of pecuniary decency is put away.
+3570-5694-0019	3.755	But the general distinction is not on that account to be overlooked.	3570-5695-0007	9.755	There is no class and no country that has yielded so abjectly before the pressure of physical want as to deny themselves all gratification of this higher or spiritual need.
+3570-5694-0019	3.755	But the general distinction is not on that account to be overlooked.	3570-5695-0008	6.845	The question is, which of the two methods will most effectively reach the persons whose convictions it is desired to affect.
+3570-5694-0019	3.755	But the general distinction is not on that account to be overlooked.	3570-5695-0009	5.025	Each will therefore serve about equally well during the earlier stages of social growth.
+3570-5694-0019	3.755	But the general distinction is not on that account to be overlooked.	3570-5695-0010	4.665	The modern organization of industry works in the same direction also by another line.
+3570-5696-0006	4.16	As used in the speech of everyday life the word carries an undertone of deprecation.	3570-5695-0011	8.26	It is evident, therefore, that the present trend of the development is in the direction of heightening the utility of conspicuous consumption as compared with leisure.
+3570-5696-0006	4.16	As used in the speech of everyday life the word carries an undertone of deprecation.	3570-5695-0013	4.64	Consumption becomes a larger element in the standard of living in the city than in the country.
+3570-5694-0012	3.205	There is a more or less elaborate system of rank and grades.	3570-5695-0015	7.95	The result is a great mobility of the labor employed in printing; perhaps greater than in any other equally well defined and considerable body of workmen.
+260-123440-0008	3.745	I'll try if I know all the things I used to know.	260-123288-0001	5.08	The weather - if we may use that term - will change before long.
+260-123288-0020	2.9	Each of us is lashed to some part of the raft.	260-123288-0002	7.25	The atmosphere is charged with vapours, pervaded with the electricity generated by the evaporation of saline waters.
+260-123288-0009	3.435	Those clouds seem as if they were going to crush the sea".	260-123288-0003	8.905	The electric light can scarcely penetrate through the dense curtain which has dropped over the theatre on which the battle of the elements is about to be waged.
+260-123286-0020	3.06	Tuesday, august eighteenth.	260-123288-0004	4.31	The air is heavy; the sea is calm.
+260-123440-0005	3.105	And yesterday things went on just as usual.	260-123288-0006	4.88	The atmosphere is evidently charged and surcharged with electricity.
+260-123440-0008	3.745	I'll try if I know all the things I used to know.	260-123288-0008	5.515	There's a heavy storm coming on," I cried, pointing towards the horizon.
+260-123440-0006	2.715	I wonder if I've been changed in the night?	260-123288-0011	8.98	But if we have now ceased to advance why do we yet leave that sail loose, which at the first shock of the tempest may capsize us in a moment?
+260-123288-0019	2.955	At noon the violence of the storm redoubles.	260-123288-0016	4.865	I refer to the thermometer; it indicates... (the figure is obliterated).
+260-123440-0006	2.715	I wonder if I've been changed in the night?	260-123288-0017	5.225	Is the atmospheric condition, having once reached this density, to become final?
+260-123440-0005	3.105	And yesterday things went on just as usual.	260-123288-0027	6.305	A suffocating smell of nitrogen fills the air, it enters the throat, it fills the lungs.
+8455-210777-0062	3.05	When do you intend that the John Bright shall start"?	8455-210777-0000	8.745	I remained there alone for many hours, but I must acknowledge that before I left the chambers I had gradually brought myself to look at the matter in another light.
+8455-210777-0026	3	And the death of which I dreamt could not, alas!	8455-210777-0002	6.24	On arriving at home at my own residence, I found that our salon was filled with a brilliant company.
+8455-210777-0026	3	And the death of which I dreamt could not, alas!	8455-210777-0005	5.685	We have our little struggles here as elsewhere, and all things cannot be done by rose water.
+8455-210777-0047	2.54	You propose to kidnap me," I said.	8455-210777-0006	4.52	We are quite satisfied now, Captain Battleax," said my wife.
+8455-210777-0049	4.11	Lieutenant Crosstrees is a very gallant officer.	8455-210777-0009	4.58	No doubt, in process of time the ladies will follow
+8455-210777-0025	3.63	What could I do now but just lay myself down and die?	8455-210777-0011	6.63	I did not mean," said Captain Battleax, "to touch upon public subjects at such a moment as this.
+8455-210777-0050	3.945	One of us always remains on board while the other is on shore.	8455-210777-0013	7.41	Jack had been standing in the far corner of the room talking to Eva, and was now reduced to silence by his praises.
+8455-210777-0066	2.76	They, of course, must all be altered".	8455-210777-0014	4.12	Sir Kennington Oval is a very fine player," said my wife.
+8455-210777-0014	4.12	Sir Kennington Oval is a very fine player," said my wife.	8455-210777-0015	8.615	I and my wife and son, and the two Craswellers, and three or four others, agreed to dine on board the ship on the next.
+8455-210777-0026	3	And the death of which I dreamt could not, alas!	8455-210777-0017	5.330063	My wife, on the spur of the moment, managed to give the gentlemen a very good dinner.
+8455-210777-0025	3.63	What could I do now but just lay myself down and die?	8455-210777-0018	5.925	This, she said, was true hospitality; and I am not sure that I did not agree with her.
+8455-210777-0062	3.05	When do you intend that the John Bright shall start"?	8455-210777-0019	8.105	Then there were three or four leading men of the community, with their wives, who were for the most part the fathers and mothers of the young ladies.
+8455-210777-0026	3	And the death of which I dreamt could not, alas!	8455-210777-0023	4.73	We sat with the officers some little time after dinner, and then went ashore.
+8455-210777-0043	3.145	But what is the delicate mission"? I asked.	8455-210777-0024	7.56	How much of evil, - of real accomplished evil, - had there not occurred to me during the last few days!
+8455-210777-0068	2.59	Your power is sufficient," I said.	8455-210777-0028	7.735	Jack would become Eva's happy husband, and would remain amidst the hurried duties of the eager world.
+8455-210777-0025	3.63	What could I do now but just lay myself down and die?	8455-210777-0031	7.67	You have received us with all that courtesy and hospitality for which your character in England stands so high.
+8455-210777-0026	3	And the death of which I dreamt could not, alas!	8455-210777-0033	7.51	But your power is so superior to any that I can advance, as to make us here feel that there is no disgrace in yielding to it.
+8455-210777-0050	3.945	One of us always remains on board while the other is on shore.	8455-210777-0034	7.7	Not a doubt but had your force been only double or treble our own, I should have found it my duty to struggle with you.
+8455-210777-0068	2.59	Your power is sufficient," I said.	8455-210777-0037	4.735	You have come to us threatening us with absolute destruction.
+8455-210777-0026	3	And the death of which I dreamt could not, alas!	8455-210777-0039	5.59	I can assure you he has not even allowed me to see the trigger since I have been on board.
+8455-210777-0025	3.63	What could I do now but just lay myself down and die?	8455-210777-0040	6.195	Then," said Sir Ferdinando, "there is nothing for it but that he must take you with him".
+8455-210777-0026	3	And the death of which I dreamt could not, alas!	8455-210777-0041	6.37	There came upon me a sudden shock when I heard these words, which exceeded anything which I had yet felt.
+8455-210777-0050	3.945	One of us always remains on board while the other is on shore.	8455-210777-0044	7.17	I was to be taken away and carried to England or elsewhere, - or drowned upon the voyage, it mattered not which.
+8455-210777-0062	3.05	When do you intend that the John Bright shall start"?	8455-210777-0046	9.33	You may be quite sure it's there," said Captain Battleax, "and that I can so use it as to half obliterate your town within two minutes of my return on board".
+8455-210777-0020	3.155	Oh yes," said Jack, "and I'm nowhere.	8455-210777-0049	4.11	Lieutenant Crosstrees is a very gallant officer.
+8455-210777-0048	3.43	What would become of your gun were I to kidnap you"?	8455-210777-0052	4.94	You will allow me to suggest," said he, "that that is a matter of opinion.
+8455-210777-0062	3.05	When do you intend that the John Bright shall start"?	8455-210777-0053	6.955	Were I to comply with your orders without expressing my own opinion, I should seem to have done so willingly hereafter.
+8455-210777-0025	3.63	What could I do now but just lay myself down and die?	8455-210777-0055	9.555	SIR, - I have it in command to inform your Excellency that you have been appointed Governor of the Crown colony which is called Britannula.
+8455-210777-0025	3.63	What could I do now but just lay myself down and die?	8455-210777-0056	5.545	The peculiar circumstances of the colony are within your Excellency's knowledge.
+8455-210777-0050	3.945	One of us always remains on board while the other is on shore.	8455-210777-0058	7.16	It is founded on the acknowledged weakness of those who survive that period of life at which men cease to work.
+8455-210777-0064	3.835	And I have no one ready to whom I can give up the archives of the Government".	8455-210777-0059	5.535	But it is surmised that you will find difficulties in the way of your entering at once upon your government.
+8455-210777-0062	3.05	When do you intend that the John Bright shall start"?	8455-210777-0060	7.075	The John Bright is armed with a weapon of great power, against which it is impossible that the people of Britannula should prevail.
+8455-210777-0064	3.835	And I have no one ready to whom I can give up the archives of the Government".	8455-210777-0069	8.915	If you will give us your promise to meet Captain Battleaxe here at this time tomorrow, we will stretch a point and delay the departure of the John Bright for twenty four hours".
+8455-210777-0026	3	And the death of which I dreamt could not, alas!	8455-210777-0070	5.945	And this plan was adopted, too, in order to extract from me a promise that I would depart in peace.
+6829-68769-0043	2.59	And he deserves a term in state's prison".	6829-68771-0002	8.94	The "weak kneed" contingency must be strengthened and fortified, and a couple of hundred votes in one way or another secured from the opposition.
+6829-68769-0016	4.12	He unlocked the door, and called: "Here's visitors, Tom".	6829-68771-0003	4.015	The Democratic Committee figured out a way to do this.
+6829-68769-0014	3.655	They followed the jailer along a succession of passages.	6829-68771-0004	8.44	Under ordinary conditions Reynolds was sure to be elected, but the Committee proposed to sacrifice him in order to elect Hopkins.
+6829-68769-0037	2.53	I've seen lots of that kind in my day.	6829-68771-0005	6.165	The only thing necessary was to "fix" Seth Reynolds, and this Hopkins arranged personally.
+6829-68769-0012	4.295	Oh, say! that's different," observed Markham, altering his demeanor.	6829-68771-0006	5.92	And this was why Kenneth and Beth discovered him conversing with the young woman in the buggy.
+6829-68769-0039	4.045	He looked up rather ungraciously, but motioned them to be seated.	6829-68771-0008	7.18	These women were flattered by the attention of the young lady and had promised to assist in electing mister Forbes.
+6829-68769-0051	3.545	There was a grim smile of amusement on his shrewd face.	6829-68771-0010	9.82	The Fairview band was engaged to discourse as much harmony as it could produce, and the resources of the great house were taxed to entertain the guests.
+6829-68769-0037	2.53	I've seen lots of that kind in my day.	6829-68771-0011	5.625	Tables were spread on the lawn and a dainty but substantial repast was to be served.
+6829-68769-0028	3.29	He is supposed to sign all the checks of the concern.	6829-68771-0014	4.77	We ought to have more attendants, Beth," said Louise, approaching her cousin.
+6829-68769-0033	4.02	It was better for him to think the girl unfeeling than to know the truth.	6829-68771-0015	4.525	Won't you run into the house and see if Martha can't spare one or two more maids"?
+6829-68769-0035	2.755	It won't be much, but I'm grateful to find a friend.	6829-68771-0016	6.99	She was very fond of the young ladies, whom she had known when Aunt Jane was the mistress here, and Beth was her especial favorite.
+6829-68771-0021	2.61	But it can't be," protested the girl.	6829-68771-0018	8.445	For a moment Beth stood staring, while the new maid regarded her with composure and a slight smile upon her beautiful face.
+6829-68771-0031	2.515	Her eyes wandered to the maid's hands.	6829-68771-0019	7.42	She was dressed in the regulation costume of the maids at Elmhurst, a plain black gown with white apron and cap.
+6829-68771-0022	3.8	I attend to the household mending, you know, and care for the linen.	6829-68771-0020	4.615	Then she gave a little laugh, and replied: "No, Miss Beth. I'm Elizabeth Parsons".
+6829-68769-0012	4.295	Oh, say! that's different," observed Markham, altering his demeanor.	6829-68771-0023	5.425	You speak like an educated person," said Beth, wonderingly. "Where is your home"?
+6829-68771-0035	4.39	Will you leave me alone in my own room, or must I go away to escape you"?	6829-68771-0024	6.245	For the first time the maid seemed a little confused, and her gaze wandered from the face of her visitor.
+6829-68769-0051	3.545	There was a grim smile of amusement on his shrewd face.	6829-68771-0025	7.83	She sat down in a rocking chair, and clasping her hands in her lap, rocked slowly back and forth. "I'm sorry," said Beth.
+6829-68769-0051	3.545	There was a grim smile of amusement on his shrewd face.	6829-68771-0027	5.32	They - they excite me, in some way, and I - I can't bear them. You must excuse me".
+6829-68771-0035	4.39	Will you leave me alone in my own room, or must I go away to escape you"?	6829-68771-0029	8.945	Beth was a beautiful girl - the handsomest of the three cousins, by far; yet Eliza surpassed her in natural charm, and seemed well aware of the fact.
+6829-68769-0003	4.215	It was a deliberate theft from his employers to protect a girl he loved.	6829-68771-0030	6.225	Her manner was neither independent nor assertive, but rather one of well bred composure and calm reliance.
+6829-68769-0002	3.075	I can't see it in that light," said the old lawyer.	6829-68771-0032	6.555	However her features and form might repress any evidence of nervousness, these hands told a different story.
+6829-68771-0034	2.475	I wish I knew myself," she cried, fiercely.	6829-68771-0033	5.45	She rose quickly to her feet, with an impetuous gesture that made her visitor catch her breath.
+6829-68769-0002	3.075	I can't see it in that light," said the old lawyer.	6829-68771-0035	4.39	Will you leave me alone in my own room, or must I go away to escape you"?
+6829-68769-0028	3.29	He is supposed to sign all the checks of the concern.	6829-68771-0036	5.2	Eliza closed the door behind her with a decided slam, and a key clicked in the lock.
+8463-287645-0008	3.325	As usual nothing was done in the way of punishment".	8463-287645-0000	4.73	This was what did the mischief so far as the "running away" was concerned.
+8463-294828-0008	2.65	And yet, what a fine, gallant lad!	8463-287645-0003	7.905	Of this party, Edward, a boy of seventeen, called forth much sympathy; he too was claimed by Hollan.
+8463-294828-0026	2.745	We have a commander who's game for anything"!	8463-287645-0006	7.71	The doctor who attended the injured creature in this case was simply told that she slipped and fell down stairs as she was coming down.
+8463-294828-0021	2.735	A route slightly less direct, that's all.	8463-287645-0010	4.325	He worked me very hard; he wanted to be beating me all the time".
+8463-287645-0008	3.325	As usual nothing was done in the way of punishment".	8463-287645-0011	6.38	She was a large, homely woman; they were common white people, with no reputation in the community".
+8463-294828-0011	3.91	He went here, there, and everywhere in perfect contentment.	8463-287645-0012	5.425	Substantially this was Jacob's unvarnished description of his master and mistress.
+8463-294828-0032	4.395	In person. Welcome aboard, professor. Your cabin is waiting for you".	8463-287645-0013	6.665	As to his age, and also the name of his master, Jacob's statement varied somewhat from the advertisement.
+3729-6852-0016	4.195	Madame Quinson, besides, can answer your enquiries.	3729-6852-0011	7.37	I had a name, I believe, in my young days, but I have forgotten it since I have been in service.
+3729-6852-0010	2.755	I never had any family.	3729-6852-0014	5.71	Here, go and get me change for a Louis". "I have it, sir".
+3729-6852-0025	3	Is there not a meridian everywhere"?	3729-6852-0016	4.195	Madame Quinson, besides, can answer your enquiries.
+3729-6852-0016	4.195	Madame Quinson, besides, can answer your enquiries.	3729-6852-0018	6.21	I sit down at a small table: a waiter comes immediately to enquire my wishes.
+3729-6852-0019	3.305	I tell him to give me some coffee, if it is good.	3729-6852-0022	8.315	I address him in Italian, and he answers very wittily, but his way of speaking makes me smile, and I tell him why.
+3729-6852-0019	3.305	I tell him to give me some coffee, if it is good.	3729-6852-0023	8.185	My remark pleases him, but I soon prove to him that it is not the right way to speak, however perfect may have been the language of that ancient writer.
+3729-6852-0019	3.305	I tell him to give me some coffee, if it is good.	3729-6852-0024	5.515	I see a crowd in one corner of the garden, everybody standing still and looking up.
+3729-6852-0016	4.195	Madame Quinson, besides, can answer your enquiries.	3729-6852-0026	4.69	Yes, but the meridian of the Palais Royal is the most exact".
+3729-6852-0019	3.305	I tell him to give me some coffee, if it is good.	3729-6852-0028	5.265	All these honest persons are waiting their turn to get their snuff boxes filled".
+3729-6852-0010	2.755	I never had any family.	3729-6852-0029	8.605	It is sold everywhere, but for the last three weeks nobody will use any snuff but that sold at the 'Civet Cat.
+3729-6852-0025	3	Is there not a meridian everywhere"?	3729-6852-0031	4.4	But how did she manage to render it so fashionable"?
+3729-6852-0019	3.305	I tell him to give me some coffee, if it is good.	3729-6852-0037	5.89	She introduced me to all her guests, and gave me some particulars respecting every one of them.
+3729-6852-0021	2.96	I thank him and take my leave.	3729-6852-0038	5.77	What, sir"! I said to him, "am I fortunate enough to see you?
+3729-6852-0019	3.305	I tell him to give me some coffee, if it is good.	3729-6852-0039	8.825	He himself recited the same passage in French, and politely pointed out the parts in which he thought that I had improved on the original.
+3729-6852-0019	3.305	I tell him to give me some coffee, if it is good.	3729-6852-0044	6.98	I will make you translate them into French, and you need not be afraid of my finding you insatiable".
+7176-92135-0026	2.95	Enter Hamlet with his favourite boar hound.	7176-88083-0000	5.695	All about him was a tumult of bright and broken color, scattered in broad splashes.
+7176-92135-0039	3.125	Tea, please, Matthews. Butler (impassively).	7176-88083-0002	7.51	His feet were red, his long narrow beak, with its saw toothed edges and sharp hooked tip, was bright red.
+7176-92135-0024	4.1	To be or not to be, that is the question; whether 'tis nobler	7176-88083-0003	7.6	But here he was at a terrible disadvantage as compared with the owls, hawks, and eagles. He had no rending claws.
+7176-88083-0016	3.92	Straightway the hawk glided from his perch and darted after him.	7176-88083-0004	7.5	But suddenly, straight and swift as a diving cormorant, he shot down into the torrent and disappeared beneath the surface.
+7176-88083-0016	3.92	Straightway the hawk glided from his perch and darted after him.	7176-88083-0005	4.7	Once fairly a wing, however, he wheeled and made back hurriedly for his perch.
+7176-92135-0008	4.43	Lend me your ear for ten minutes, and you shall learn just what stagecraft is".	7176-88083-0006	4.295	It might have seemed that a trout of this size was a fairly substantial meal.
+7176-92135-0008	4.43	Lend me your ear for ten minutes, and you shall learn just what stagecraft is".	7176-88083-0009	4.045	The great hawk followed hurriedly, to retrieve his prey from the ground.
+7176-88083-0008	3.28	In despair he hurled himself downward too soon.	7176-88083-0010	6.74	The cat growled softly, picked up the prize in her jaws and trotted into the bushes to devour it.
+7176-88083-0016	3.92	Straightway the hawk glided from his perch and darted after him.	7176-88083-0012	5.045	The hawk alighted on the dead branch, and sat upright, motionless, as if surprised.
+7176-88083-0009	4.045	The great hawk followed hurriedly, to retrieve his prey from the ground.	7176-88083-0014	4.67	The hawk sat upon the branch and watched his quarry swimming beneath the surface.
+7176-88083-0016	3.92	Straightway the hawk glided from his perch and darted after him.	7176-88083-0019	5.81	As he flew, his down reaching, clutching talons were not half a yard above the fugitive's head.
+7176-88083-0009	4.045	The great hawk followed hurriedly, to retrieve his prey from the ground.	7176-88083-0020	5.415	Where the waves for an instant sank, they came closer, - but not quite within grasping reach.
+7176-92135-0024	4.1	To be or not to be, that is the question; whether 'tis nobler	7176-88083-0022	9.485	The hawk, embittered by the loss of his first quarry, had become as dogged in pursuit as a weasel, not to be shaken off or evaded or deceived.
+7176-88083-0016	3.92	Straightway the hawk glided from his perch and darted after him.	7176-88083-0023	9.645	He had a lot of line out, and the place was none too free for a long cast; but he was impatient to drop his flies again on the spot where the big fish was feeding.
+7176-92135-0024	4.1	To be or not to be, that is the question; whether 'tis nobler	7176-88083-0024	8.195	The last drop fly, as luck would have it, caught just in the corner of the hawk's angrily open beak, hooking itself firmly.
+7176-88083-0006	4.295	It might have seemed that a trout of this size was a fairly substantial meal.	7176-88083-0025	7.38	At the sudden sharp sting of it, the great bird turned his head and noticed, for the first time, the fisherman standing on the bank.
+7176-88083-0009	4.045	The great hawk followed hurriedly, to retrieve his prey from the ground.	7176-88083-0026	5.53	The drag upon his beak and the light check upon his wings were inexplicable to him, and appalling.
+7127-75947-0008	4.155	The arrow pierced his heart and wounded him mortally.	7127-75946-0004	4.49	Certainly, sire; but I must have money to do that". "What!
+7127-75947-0035	4.415	Good gracious! has the king any right to interfere in matters of that kind?	7127-75946-0006	7.98	He has given them with too much grace not to have others still to give, if they are required, which is the case at the present moment.
+7127-75947-0017	2.665	What, already here"! they said to her.	7127-75946-0007	4.755	It is necessary, therefore, that he should comply". The king frowned.
+7127-75947-0030	2.76	She was here just now," said the count.	7127-75946-0008	4.46	Does your majesty then no longer believe the disloyal attempt"?
+7127-75946-0005	2.67	What do you mean"? inquired Louis,	7127-75946-0009	4.72	Not at all; you are, on the contrary, most agreeable to me".
+7127-75947-0011	3.62	Remain, I implore you: the evening is most lovely.	7127-75946-0012	9.87	The news circulated with the rapidity of lightning; during its progress it kindled every variety of coquetry, desire, and wild ambition.
+7127-75947-0002	3.235	Do you think so"? she replied with indifference.	7127-75946-0013	8.58	The king had completed his toilette by nine o'clock; he appeared in an open carriage decorated with branches of trees and flowers.
+7127-75947-0018	4.04	I have been here this quarter of an hour," replied La Valliere.	7127-75946-0015	7.515	Suddenly, for the purpose of restoring peace and order, Spring, accompanied by his whole court, made his appearance.
+7127-75947-0018	4.04	I have been here this quarter of an hour," replied La Valliere.	7127-75946-0018	9.14	There was something in his carriage which resembled the buoyant movements of an immortal, and he did not dance so much as seem to soar along.
+7127-75947-0018	4.04	I have been here this quarter of an hour," replied La Valliere.	7127-75946-0020	6.52	Far from it, sire; your majesty having given no directions about it, the musicians have retained it".
+7127-75947-0002	3.235	Do you think so"? she replied with indifference.	7127-75946-0024	5.09	Monsieur was the only one who did not understand anything about the matter.
+7127-75947-0018	4.04	I have been here this quarter of an hour," replied La Valliere.	7127-75946-0027	9.675	Disdainful of a success of which Madame showed no acknowledgement, he thought of nothing but boldly regaining the marked preference of the princess.
+7127-75946-0023	3.745	The king seemed only pleased with every one present.	7127-75946-0029	9.285	The king, who had from this moment become in reality the principal dancer in the quadrille, cast a look upon his vanquished rival.
+5105-28240-0018	2.885	You will take me on board, count, will you not"?	5105-28241-0000	6.455	Her sea going qualities were excellent, and would have amply sufficed for a circumnavigation of the globe.
+5105-28240-0016	4.17	To all these inquiries, the count responded in the affirmative.	5105-28241-0005	8.415	For a few miles she followed the line hitherto presumably occupied by the coast of Algeria; but no land appeared to the south.
+5105-28233-0001	4.49	He seemed born to please without being conscious of the power he possessed.	5105-28241-0006	7.55	The log and the compass, therefore, were able to be called upon to do the work of the sextant, which had become utterly useless.
+5105-28240-0002	4.01	exclaimed Servadac, keeping his eye unmoved at his telescope.	5105-28241-0008	8.54	The earth has undoubtedly entered upon a new orbit, but she is not incurring any probable risk of being precipitated onto the sun".
+5105-28240-0013	2.96	Nothing more than you know yourself".	5105-28241-0009	7.01	And what demonstration do you offer," asked Servadac eagerly, "that it will not happen"?
+5105-28240-0010	2.935	Captain Servadac hastened towards him.	5105-28241-0012	6.775	Is it not impossible," he murmured aloud, "that any city should disappear so completely?
+5105-28241-0014	2.995	Another circumstance was most remarkable.	5105-28241-0013	4.82	Would not the loftiest eminences of the city at least be visible?
+5105-28240-0018	2.885	You will take me on board, count, will you not"?	5105-28241-0016	6.285	You must see, lieutenant, I should think, that we are not so near the coast of Algeria as you imagined".
+5105-28240-0018	2.885	You will take me on board, count, will you not"?	5105-28241-0019	5.29	Nothing was to be done but to put about, and return in disappointment towards the north.
+7021-85628-0004	2.805	Yes, why not"? thought Anders.	7021-79759-0000	4.775	Nature of the Effect produced by Early Impressions.
+7021-79740-0009	3.635	They were now playing with their dolls in the parlor.	7021-79759-0002	5.25	They are chiefly formed from combinations of the impressions made in childhood.
+7021-79759-0001	2.48	That is comparatively nothing.	7021-79759-0003	4.62	Vast Importance and Influence of this mental Furnishing,
+1320-122617-0041	4.15	Uncas cast his skin, and stepped forth in his own beautiful proportions.	1320-122612-0001	9.52	The dews were suffered to exhale, and the sun had dispersed the mists, and was shedding a strong and clear light in the forest, when the travelers resumed their journey.
+1320-122612-0014	3.515	The examination, however, resulted in no discovery.	1320-122612-0002	7.46	After proceeding a few miles, the progress of Hawkeye, who led the advance, became more deliberate and watchful.
+1320-122617-0005	4.4	The bear shook his shaggy sides, and then a well known voice replied:	1320-122612-0003	9.865	He often stopped to examine the trees; nor did he cross a rivulet without attentively considering the quantity, the velocity, and the color of its waters.
+1320-122617-0005	4.4	The bear shook his shaggy sides, and then a well known voice replied:	1320-122612-0004	6.425	Distrusting his own judgment, his appeals to the opinion of Chingachgook were frequent and earnest.
+1320-122612-0009	3.88	It would have been more wonderful had he spoken without a bidding.	1320-122612-0005	5.915	Yet here are we, within a short range of the Scaroons, and not a sign of a trail have we crossed!
+1320-122617-0030	3.98	So choose for yourself to make a rush or tarry here".	1320-122612-0006	4.845	Let us retrace our steps, and examine as we go, with keener eyes.
+1320-122612-0014	3.515	The examination, however, resulted in no discovery.	1320-122612-0007	5.54	Chingachgook had caught the look, and motioning with his hand, he bade him speak.
+1320-122612-0009	3.88	It would have been more wonderful had he spoken without a bidding.	1320-122612-0008	7.875	The eyes of the whole party followed the unexpected movement, and read their success in the air of triumph that the youth assumed.
+1320-122612-0009	3.88	It would have been more wonderful had he spoken without a bidding.	1320-122612-0013	6.55	A circle of a few hundred feet in circumference was drawn, and each of the party took a segment for his portion.
+1320-122617-0041	4.15	Uncas cast his skin, and stepped forth in his own beautiful proportions.	1320-122612-0015	6.385	The whole party crowded to the spot where Uncas pointed out the impression of a moccasin in the moist alluvion.
+5142-33396-0028	3.755	On a bench in a far corner were a dozen people huddled together.	5142-33396-0001	5.02	What is your country, Olaf? Have you always been a thrall"? The thrall's eyes flashed.
+5142-33396-0010	3.455	In the stern I curved the tail up almost as high as the head.	5142-33396-0006	6.23	I made her for only twenty oars because I thought few men would follow me; for I was young, fifteen years old.
+5142-33396-0003	3.47	The rest of you, off a viking'! "He had three ships.	5142-33396-0007	4.975	At the prow I carved the head with open mouth and forked tongue thrust out.
+5142-33396-0050	2.885	May you drink heart's ease from it for many years.	5142-33396-0012	4.59	Then I will get me a farm and will winter in that land. Now who will follow me?
+5142-33396-0021	3.505	Up and down the water we went to get much wealth and much frolic.	5142-33396-0015	4.31	As our boat flashed down the rollers into the water I made this song and sang it:
+5142-33396-0014	3.245	Thirty men, one after another, raised their horns and said:	5142-33396-0019	4.985	Oh! it is better to live on the sea and let other men raise your crops and cook your meals.
+5142-33396-0036	4.26	So I will give out this law: that my men shall never leave you alone.	5142-33396-0022	4.77	What of the farm, Olaf'? "'Not yet,' I answered. 'Viking is better for summer.
+5142-33396-0047	2.535	My men pounded the table with their fists.	5142-33396-0024	5.345	I stood with my back to the wall; for I wanted no sword reaching out of the dark for me.
+5142-33396-0037	3.575	Hakon there shall be your constant companion, friend farmer.	5142-33396-0031	7.845	They set up a crane over the fire and hung the pot upon it, and we sat and watched it boil while we joked. At last the supper began.
+5142-33396-0010	3.455	In the stern I curved the tail up almost as high as the head.	5142-33396-0032	9.785	The farmer sat gloomily on the bench and would not eat, and you cannot wonder; for he saw us putting potfuls of his good beef and basket loads of bread into our big mouths.
+5142-33396-0050	2.885	May you drink heart's ease from it for many years.	5142-33396-0033	5.28	You would not eat with us. You cannot say no to half of my ale. I drink this to your health.
+5142-33396-0009	3.37	There, stand so'! I said, 'and glare and hiss at my foes.	5142-33396-0034	6.615	Then I drank half of the hornful and sent the rest across the fire to the farmer. He took it and smiled, saying:
+5142-36586-0000	3.65	It is manifest that man is now subject to much variability.	5142-33396-0036	4.26	So I will give out this law: that my men shall never leave you alone.
+5142-33396-0060	2.615	Take him out, Thorkel, and let him taste your sword.	5142-33396-0038	4.18	He shall not leave you day or night, whether you are working or playing or sleeping.
+5142-33396-0030	2.765	The thralls were bringing in a great pot of meat.	5142-33396-0042	6.095	So no tales got out to the neighbors. Besides, it was a lonely place, and by good luck no one came that way.
+5142-33396-0030	2.765	The thralls were bringing in a great pot of meat.	5142-33396-0044	4.855	I am stiff with long sitting,' he said. 'I itch for a fight'. "I turned to the farmer.
+5142-33396-0014	3.245	Thirty men, one after another, raised their horns and said:	5142-33396-0051	5.57	And with it I leave you a name, Sif the Friendly. I shall hope to drink with you sometime in Valhalla.
+5142-33396-0060	2.615	Take him out, Thorkel, and let him taste your sword.	5142-33396-0052	5.88	Here is a ring for Sif the Friendly'. "'And here is a bracelet'. "'A sword would not be ashamed to hang at your side.
+5142-33396-0049	3.305	Here, friend, take it,' and he thrust it into the farmer's hand.	5142-33396-0054	5.745	That is the best way to decide, for the spear will always point somewhere, and one thing is as good as another.
+5142-33396-0050	2.885	May you drink heart's ease from it for many years.	5142-33396-0059	5.47	Yes. And with all your fingers it took you a year to catch me'. "The king frowned more angrily.
+5142-33396-0025	3.32	Come, come'! I called, when no one obeyed. 'A fire!	5142-33396-0065	5.195	Soft heart'! he said gently to her; then to Thorkel, 'Well, let him go, Thorkel!
+5142-33396-0049	3.305	Here, friend, take it,' and he thrust it into the farmer's hand.	5142-33396-0067	5.565	But, young sharp tongue, now that we have caught you we will put you into a trap that you cannot get out of.
+5683-32866-0000	2.645	Miss Lake declined the carriage to night.	5683-32879-0000	8.92	It was not very much past eleven that morning when the pony carriage from Brandon drew up before the little garden wicket of Redman's Farm.
+5683-32879-0022	4.175	I like you still, Rachel; I'm sure I'll always like you.	5683-32879-0003	9.345	Women can hide their pain better than we men, and bear it better, too, except when shame drops fire into the dreadful chalice.
+5683-32866-0001	3.47	And he added something still less complimentary.	5683-32879-0005	6.11	This transient spring and lighting up are beautiful - a glamour beguiling our senses.
+5683-32865-0001	2.58	said Lord Chelford, addressing me.	5683-32879-0007	6.795	Rachel's pale and sharpened features and dilated eye struck her with a painful surprise.
+5683-32879-0008	2.95	You have been so ill, my poor Rachel.	5683-32879-0009	5.135	Ill and troubled, dear - troubled in mind, and miserably nervous.
+5683-32866-0006	4.215	Yes, so they said; but that would, I think, have been worse.	5683-32879-0010	7.75	Poor Rachel! her nature recoiled from deceit, and she told, at all events, as much of the truth as she dared.
+5683-32865-0014	2.615	He's not a man for country quarters!	5683-32879-0011	9.21	She spoke with a sudden energy, which partook of fear and passion, and flushed her thin cheek, and made her languid eyes flash.
+5683-32865-0015	4.145	I had a horrid dream about him last night.' That?	5683-32879-0012	4.38	Thank you, Rachel, my Cousin Rachel, my only friend.
+5683-32879-0001	3.66	Well, she was better, though she had had a bad night.	5683-32879-0014	8.405	Yes, something - everything,' said Rachel, hurriedly, looking frowningly at a flower which she was twirling in her fingers.
+5683-32866-0023	2.745	All the furniture belonged to other times.	5683-32879-0018	7.44	It is an antipathy - an antipathy I cannot get over, dear Dorcas; you may think it a madness, but don't blame me.
+5683-32866-0007	4.12	If a fellow's been a little bit wild, he's Beelzebub at once.	5683-32879-0019	6.35	I have very few to love me now, and I thought you might love me, as I have begun to love you.
+5683-32865-0014	2.615	He's not a man for country quarters!	5683-32879-0020	6.545	And she threw her arms round her cousin's neck, and brave Rachel at last burst into tears.
+5683-32865-0006	3.35	At dinner Lake was easy and amusing.	5683-32879-0021	4.09	Dorcas, in her strange way, was moved.
+5683-32865-0003	3.51	They are cousins, you know; we are all cousins.	5683-32879-0022	4.175	I like you still, Rachel; I'm sure I'll always like you.
+5683-32866-0001	3.47	And he added something still less complimentary.	5683-32879-0023	4.975	You resemble me, Rachel: you are fearless and inflexible and generous.
+1580-141084-0003	4.1	No names, please"! said Holmes, as we knocked at Gilchrist's door.	1580-141084-0000	4.615	It was the Indian, whose dark silhouette appeared suddenly upon his blind.
+1580-141084-0034	4.49	Well, well, don't trouble to answer. Listen, and see that I do you no injustice.	1580-141084-0002	5.905	This set of rooms is quite the oldest in the college, and it is not unusual for visitors to go over them.
+1580-141083-0041	3.575	Let us hear the suspicions. I will look after the proofs".	1580-141084-0003	4.1	No names, please"! said Holmes, as we knocked at Gilchrist's door.
+1580-141083-0050	3.085	I really don't think he knew much about it, mister Holmes.	1580-141084-0004	9.005	Of course, he did not realize that it was I who was knocking, but none the less his conduct was very uncourteous, and, indeed, under the circumstances rather suspicious".
+1580-141083-0046	3.53	But I have occasionally done the same thing at other times".	1580-141084-0008	6.795	I cannot allow the examination to be held if one of the papers has been tampered with. The situation must be faced".
+1580-141083-0021	3.715	There is no opening except the one pane," said our learned guide.	1580-141084-0009	4.685	It is possible that I may be in a position then to indicate some course of action.
+1580-141084-0037	2.965	When I approached your room, I examined the window.	1580-141084-0011	5	When we were out in the darkness of the quadrangle, we again looked up at the windows.
+1580-141084-0045	3.625	Suddenly he heard him at the very door. There was no possible escape.	1580-141084-0016	5.96	My friend did not appear to be depressed by his failure, but shrugged his shoulders in half humorous resignation.
+1580-141083-0016	4.255	I was in such a hurry to come to you". "You left your door open"?	1580-141084-0021	4.01	On the palm were three little pyramids of black, doughy clay.
+1580-141083-0030	3.48	mister Soames was somewhat overwhelmed by this flood of information.	1580-141084-0023	8.735	In a few hours the examination would commence, and he was still in the dilemma between making the facts public and allowing the culprit to compete for the valuable scholarship.
+1580-141083-0046	3.53	But I have occasionally done the same thing at other times".	1580-141084-0024	9.185	He could hardly stand still so great was his mental agitation, and he ran towards Holmes with two eager hands outstretched. "Thank heaven that you have come!
+1580-141083-0025	3.905	The man entered and took the papers, sheet by sheet, from the central table.	1580-141084-0026	6.995	If this matter is not to become public, we must give ourselves certain powers and resolve ourselves into a small private court martial.
+1580-141083-0046	3.53	But I have occasionally done the same thing at other times".	1580-141084-0029	8.075	His troubled blue eyes glanced at each of us, and finally rested with an expression of blank dismay upon Bannister in the farther corner.
+1580-141083-0050	3.085	I really don't think he knew much about it, mister Holmes.	1580-141084-0031	6.47	We want to know, mister Gilchrist, how you, an honourable man, ever came to commit such an action as that of yesterday"?
+1580-141083-0028	2.585	Then he tossed it down and seized the next.	1580-141084-0032	4.995	For a moment Gilchrist, with upraised hand, tried to control his writhing features.
+1580-141083-0040	3.75	One hardly likes to throw suspicion where there are no proofs".	1580-141084-0033	7	Come, come," said Holmes, kindly, "it is human to err, and at least no one can accuse you of being a callous criminal.
+1580-141083-0036	3.98	Holmes held it out on his open palm in the glare of the electric light.	1580-141084-0034	4.49	Well, well, don't trouble to answer. Listen, and see that I do you no injustice.
+1580-141084-0035	2.63	He could examine the papers in his own office.	1580-141084-0039	4.885	I entered, and I took you into my confidence as to the suggestions of the side table.
+1580-141084-0035	2.63	He could examine the papers in his own office.	1580-141084-0040	5.985	He returned carrying his jumping shoes, which are provided, as you are aware, with several sharp spikes.
+1580-141084-0045	3.625	Suddenly he heard him at the very door. There was no possible escape.	1580-141084-0041	7.99	No harm would have been done had it not been that, as he passed your door, he perceived the key which had been left by the carelessness of your servant.
+1580-141083-0024	4.48	You left him in a chair, you say. Which chair"? "By the window there".	1580-141084-0042	5.06	A sudden impulse came over him to enter, and see if they were indeed the proofs.
+1580-141083-0030	3.48	mister Soames was somewhat overwhelmed by this flood of information.	1580-141084-0047	5.25	I have a letter here, mister Soames, which I wrote to you early this morning in the middle of a restless night.
+1580-141084-0045	3.625	Suddenly he heard him at the very door. There was no possible escape.	1580-141084-0048	9.265	It will be clear to you, from what I have said, that only you could have let this young man out, since you were left in the room, and must have locked the door when you went out.
+1580-141083-0024	4.48	You left him in a chair, you say. Which chair"? "By the window there".	1580-141084-0049	7.575	It was simple enough, sir, if you only had known, but, with all your cleverness, it was impossible that you could know.
+6930-76324-0010	2.69	What in the world is that"? queried Joyce.	6930-75918-0002	5.025	Congratulations were poured in upon the princess everywhere during her journey.
+6930-76324-0013	4.305	It can't hurt anything, I'm sure, for we won't disturb things at all.	6930-75918-0006	5.85	This has indeed been a harassing day," continued the young man, his eyes fixed upon his friend.
+6930-75918-0000	3.505	Concord returned to its place amidst the tents.	6930-75918-0008	4.785	Can you imagine why Buckingham has been so violent"? "I suspect".
+6930-76324-0019	2.575	Now let's dust the furniture and pictures".	6930-75918-0009	7.28	It is you who are mistaken, Raoul; I have read his distress in his eyes, in his every gesture and action the whole day".
+6930-75918-0000	3.505	Concord returned to its place amidst the tents.	6930-75918-0015	6.38	Thus it is that the honor of three is saved: our country's, our master's, and our own.
+6930-76324-0013	4.305	It can't hurt anything, I'm sure, for we won't disturb things at all.	6930-75918-0017	6.16	But in this friendly pressure Raoul could detect the nervous agitation of a great internal conflict.
+4077-13751-0019	2.92	Who began the quarrel? Was it the "Mormons"?	4077-13754-0000	4.78	The army found the people in poverty, and left them in comparative wealth.
+4077-13751-0013	4.315	Their sufferings have never yet been fitly chronicled by human scribe.	4077-13754-0003	5.68	Moreover, had the people been inclined to rebellion what greater opportunity could they have wished?
+4077-13754-0001	3.77	But a word further concerning the expedition in general.	4077-13754-0004	4.985	Already a North and a South were talked of - why not set up also a West?
+4077-13751-0013	4.315	Their sufferings have never yet been fitly chronicled by human scribe.	4077-13754-0009	7.65	At the inception of plural marriage among the Latter day Saints, there was no law, national or state, against its practise.
+1995-1837-0015	4.485	The squares of cotton, sharp edged, heavy, were just about to burst to bolls!	1995-1826-0000	9.485	In the debate between the senior societies her defence of the Fifteenth Amendment had been not only a notable bit of reasoning, but delivered with real enthusiasm.
+1995-1837-0015	4.485	The squares of cotton, sharp edged, heavy, were just about to burst to bolls!	1995-1826-0002	4.605	John Taylor, who had supported her through college, was interested in cotton.
+1995-1837-0000	3.865	He knew the Silver Fleece - his and Zora's - must be ruined.	1995-1826-0005	5.125	But, John, there's no society - just elementary work
+1995-1837-0013	3.195	Then he looked down. The lagoon was dry.	1995-1826-0009	7.57	You ought to know, John, if I teach Negroes I'll scarcely see much of people in my own class".
+1995-1837-0020	3.21	The years of the days of her dying were ten.	1995-1826-0011	8.94	Here she was teaching dirty children, and the smell of confused odors and bodily perspiration was to her at times unbearable.
+1995-1836-0007	3.435	But you believe in some education"? asked Mary Taylor.	1995-1826-0012	6.18	She wanted a glance of the new books and periodicals and talk of great philanthropies and reforms.
+1995-1837-0009	3.76	The lagoon had been level with the dykes a week ago; and now?	1995-1826-0013	8.77	So for the hundredth time she was thinking today, as she walked alone up the lane back of the barn, and then slowly down through the bottoms.
+1995-1826-0015	3.55	She had almost forgotten that it was here within touch and sight.	1995-1826-0016	5.9	The glimmering sea of delicate leaves whispered and murmured before her, stretching away to the Northward.
+1995-1837-0022	3.415	Up in the sick room Zora lay on the little white bed.	1995-1826-0017	6.145	There might be a bit of poetry here and there, but most of this place was such desperate prose.
+1995-1837-0015	4.485	The squares of cotton, sharp edged, heavy, were just about to burst to bolls!	1995-1826-0018	5.01	Her regard shifted to the green stalks and leaves again, and she started to move away.
+1995-1826-0004	3.035	Might learn something useful down there".	1995-1826-0019	5.25	Cotton is a wonderful thing, is it not, boys"? she said rather primly.
+1995-1837-0011	3.375	He started at the thought. He hurried forth sadly.	1995-1826-0020	6.12	Miss Taylor did not know much about cotton, but at least one more remark seemed called for.
+1995-1826-0003	3.09	Better go," he had counselled, sententiously.	1995-1826-0022	4.745	I suppose, though, it's too early for them". Then came the explosion.
+1995-1837-0002	2.79	Ah! the swamp, the cruel swamp!	1995-1826-0024	5.095	The Golden Fleece - it's the Silver Fleece"! He harkened.
+5683-32866-0001	3.47	And he added something still less complimentary.	5683-32865-0004	7.365	Whatever Lord Chelford said, Miss Brandon received it very graciously, and even with a momentary smile.
+5683-32865-0002	2.78	He had his hand upon Lake's shoulder.	5683-32865-0007	6.065	I'm glad you like it,' says Wylder, chuckling benignantly on it, over his shoulder.
+5683-32866-0001	3.47	And he added something still less complimentary.	5683-32865-0008	6.12	I believe I have a little taste that way; those are all real, you know, those jewels.
+5683-32866-0000	2.645	Miss Lake declined the carriage to night.	5683-32865-0009	9.89	And he placed it in that gentleman's fingers, who now took his turn at the lamp, and contemplated the little parallelogram with a gleam of sly amusement.
+5683-32866-0006	4.215	Yes, so they said; but that would, I think, have been worse.	5683-32865-0010	6.335	I was thinking it's very like the ace of hearts,' answered the captain softly, smiling on.
+5683-32865-0003	3.51	They are cousins, you know; we are all cousins.	5683-32865-0011	6.355	Whereupon Lake laughed quietly, still looking on the ace of hearts with his sly eyes.
+5683-32865-0015	4.145	I had a horrid dream about him last night.' That?	5683-32865-0013	7.095	Do you know?' 'Lake? Oh! I really can't tell; but he'll soon tire of country life.
+5683-32879-0012	4.38	Thank you, Rachel, my Cousin Rachel, my only friend.	5683-32865-0015	4.145	I had a horrid dream about him last night.' That?
+5683-32866-0006	4.215	Yes, so they said; but that would, I think, have been worse.	5683-32865-0017	5.455	All the time he was talking to me his angry little eyes were following Lake.
+1320-122617-0005	4.4	The bear shook his shaggy sides, and then a well known voice replied:	1320-122617-0000	7.835	Notwithstanding the high resolution of Hawkeye he fully comprehended all the difficulties and danger he was about to incur.
+1320-122617-0022	3.855	The Delawares are children of the tortoise, and they outstrip the deer".	1320-122617-0003	6.285	There was something in his air and manner that betrayed to the scout the utter confusion of the state of his mind.
+1320-122617-0008	4.185	The young man is in bondage, and much I fear his death is decreed.	1320-122617-0005	4.4	The bear shook his shaggy sides, and then a well known voice replied:
+1320-122612-0016	3.49	Run back, Uncas, and bring me the size of the singer's foot.	1320-122617-0006	5.655	Can these things be"? returned David, breathing more freely, as the truth began to dawn upon him.
+1320-122617-0005	4.4	The bear shook his shaggy sides, and then a well known voice replied:	1320-122617-0008	4.185	The young man is in bondage, and much I fear his death is decreed.
+1320-122612-0009	3.88	It would have been more wonderful had he spoken without a bidding.	1320-122617-0009	7.705	I greatly mourn that one so well disposed should die in his ignorance, and I have sought a goodly hymn-" "Can you lead me to him"?
+1320-122617-0005	4.4	The bear shook his shaggy sides, and then a well known voice replied:	1320-122617-0010	10	The task will not be difficult," returned David, hesitating; "though I greatly fear your presence would rather increase than mitigate his unhappy fortunes".
+1320-122617-0022	3.855	The Delawares are children of the tortoise, and they outstrip the deer".	1320-122617-0011	9.76	The lodge in which Uncas was confined was in the very center of the village, and in a situation, perhaps, more difficult than any other to approach, or leave, without observation.
+1320-122617-0041	4.15	Uncas cast his skin, and stepped forth in his own beautiful proportions.	1320-122617-0012	7.59	Four or five of the latter only lingered about the door of the prison of Uncas, wary but close observers of the manner of their captive.
+1320-122617-0022	3.855	The Delawares are children of the tortoise, and they outstrip the deer".	1320-122617-0014	4.9	They drew back a little from the entrance and motioned to the supposed conjurer to enter.
+1320-122617-0005	4.4	The bear shook his shaggy sides, and then a well known voice replied:	1320-122617-0015	5.125	But the bear, instead of obeying, maintained the seat it had taken, and growled:
+1320-122612-0016	3.49	Run back, Uncas, and bring me the size of the singer's foot.	1320-122617-0017	5.655	Then, as if satisfied of their safety, the scout left his position, and slowly entered the place.
+1320-122617-0008	4.185	The young man is in bondage, and much I fear his death is decreed.	1320-122617-0018	9.695	It was silent and gloomy, being tenanted solely by the captive, and lighted by the dying embers of a fire, which had been used for the purposed of cookery.
+1320-122617-0005	4.4	The bear shook his shaggy sides, and then a well known voice replied:	1320-122617-0019	8.23	Uncas occupied a distant corner, in a reclining attitude, being rigidly bound, both hands and feet, by strong and painful withes.
+1320-122617-0041	4.15	Uncas cast his skin, and stepped forth in his own beautiful proportions.	1320-122617-0020	8.895	The scout, who had left David at the door, to ascertain they were not observed, thought it prudent to preserve his disguise until assured of their privacy.
+1320-122617-0022	3.855	The Delawares are children of the tortoise, and they outstrip the deer".	1320-122617-0021	5.335	What shall we do with the Mingoes at the door? They count six, and this singer is as good as nothing".
+1320-122617-0022	3.855	The Delawares are children of the tortoise, and they outstrip the deer".	1320-122617-0023	7.815	Uncas, who had already approached the door, in readiness to lead the way, now recoiled, and placed himself, once more, in the bottom of the lodge.
+1320-122617-0022	3.855	The Delawares are children of the tortoise, and they outstrip the deer".	1320-122617-0024	7.555	But Hawkeye, who was too much occupied with his own thoughts to note the movement, continued speaking more to himself than to his companion.
+1320-122617-0022	3.855	The Delawares are children of the tortoise, and they outstrip the deer".	1320-122617-0025	6.36	So, Uncas, you had better take the lead, while I will put on the skin again, and trust to cunning for want of speed".
+1320-122617-0005	4.4	The bear shook his shaggy sides, and then a well known voice replied:	1320-122617-0026	5.225	Well, what can't be done by main courage, in war, must be done by circumvention.
+1320-122617-0022	3.855	The Delawares are children of the tortoise, and they outstrip the deer".	1320-122617-0027	5.689938	As soon as these dispositions were made, the scout turned to David, and gave him his parting instructions.
+1320-122617-0022	3.855	The Delawares are children of the tortoise, and they outstrip the deer".	1320-122617-0029	7.875	If you are not then knocked on the head, your being a non composser will protect you; and you'll then have a good reason to expect to die in your bed.
+1320-122617-0008	4.185	The young man is in bondage, and much I fear his death is decreed.	1320-122617-0031	6.285	Bravely and generously has he battled in my behalf, and this, and more, will I dare in his service".
+1320-122617-0022	3.855	The Delawares are children of the tortoise, and they outstrip the deer".	1320-122617-0034	9.485	Hold"! said David, perceiving that with this assurance they were about to leave him; "I am an unworthy and humble follower of one who taught not the damnable principle of revenge.
+1320-122617-0022	3.855	The Delawares are children of the tortoise, and they outstrip the deer".	1320-122617-0037	7.18	The Delaware dog"! he said, leaning forward, and peering through the dim light to catch the expression of the other's features; "is he afraid?
+1320-122617-0022	3.855	The Delawares are children of the tortoise, and they outstrip the deer".	1320-122617-0039	7.055	The Mohican started on his feet, and shook his shaggy covering, as though the animal he counterfeited was about to make some desperate effort.
+1320-122617-0041	4.15	Uncas cast his skin, and stepped forth in his own beautiful proportions.	1320-122617-0040	7.975	He had no occasion to delay, for at the next instant a burst of cries filled the outer air, and ran along the whole extent of the village.
+1320-122612-0016	3.49	Run back, Uncas, and bring me the size of the singer's foot.	1320-122617-0041	4.15	Uncas cast his skin, and stepped forth in his own beautiful proportions.
+121-127105-0036	4.15	But was that all her reward"? one of the ladies asked.	121-121726-0000	8.46	Also, a popular contrivance whereby love making may be suspended but not stopped during the picnic season.
+121-121726-0004	4.02	Heaven, a good place to be raised to.	121-121726-0001	5.925	Harangue The tiresome product of a tireless tongue.
+121-121726-0013	2.49	Tied to a woman.	121-121726-0002	4.41	angor, pain. Painful to hear.
+121-127105-0008	2.76	He hung fire again. "A woman's.	121-121726-0003	6.755	Hay fever, a heart trouble caused by falling in love with a grass widow.
+121-121726-0006	3.895	Heredity, the cause of all our faults.	121-121726-0004	4.02	Heaven, a good place to be raised to.
+121-127105-0008	2.76	He hung fire again. "A woman's.	121-121726-0007	6.73	Horse sense, a degree of wisdom that keeps one from betting on the races.
+121-121726-0014	3.165	Hypocrite, a horse dealer.	121-121726-0008	4.99	Hose Man's excuse for wetting the walk.
+121-121726-0006	3.895	Heredity, the cause of all our faults.	121-121726-0009	7.26	Hotel, a place where a guest often gives up good dollars for poor quarters.
+121-127105-0008	2.76	He hung fire again. "A woman's.	121-121726-0010	9.81	Housecleaning, a domestic upheaval that makes it easy for the government to enlist all the soldiers it needs.
+121-121726-0014	3.165	Hypocrite, a horse dealer.	121-121726-0011	4.035	Husband, the next thing to a wife.
+121-121726-0002	4.41	angor, pain. Painful to hear.	121-121726-0012	4.045	hussy, woman, and bond, tie.
+61-70970-0016	4.37	We will go out together to the bower; there is a way down to the court from my window.	61-70970-0000	6.075	Young Fitzooth had been commanded to his mother's chamber so soon as he had come out from his converse with the Squire.
+61-70970-0012	3.135	Yet he will teach you a few tricks when morning is come.	61-70970-0001	6.155	There befell an anxious interview, Mistress Fitzooth arguing for and against the Squire's project in a breath.
+61-70968-0045	3.475	Pray follow us, with mine and my lord Sheriff's men".	61-70970-0002	4.165	Most of all Robin thought of his father. What would he counsel?
+61-70968-0056	3.565	The wine did certainly bring back the color to the Squire's cheeks.	61-70970-0007	4.485	He was in deep converse with the clerk, and entered the hall holding him by the arm.
+61-70968-0039	3.805	And mine is Will Stuteley. Shall we be comrades"?	61-70970-0011	6.075	As any in England, I would say," said Gamewell, proudly. "That is, in his day.
+61-70968-0016	3.72	And then they became vexed, and would have snatched your purse from us.	61-70970-0013	4.35	There was no chance to alter his sleeping room to one nearer to Gamewell's chamber.
+61-70968-0046	3.55	Nottingham Castle was reached, and admittance was demanded.	61-70970-0015	8.415	Will," cried he, softly; and Stuteley, who had chosen his couch across the door of his young master's chamber, sprang up at once in answer.
+61-70968-0029	3.495	The Squire helped to thrust them all in and entered swiftly himself.	61-70970-0016	4.37	We will go out together to the bower; there is a way down to the court from my window.
+61-70968-0046	3.55	Nottingham Castle was reached, and admittance was demanded.	61-70970-0018	4.6	The hours passed wearily by, and movement could yet be heard about the hall.
+61-70970-0009	3.405	Tis late; and I go myself within a short space.	61-70970-0020	5.025	Will," whispered Robin, opening his door as he spoke, "are you ready"?
+61-70970-0013	4.35	There was no chance to alter his sleeping room to one nearer to Gamewell's chamber.	61-70970-0021	5.405	They then renewed their journey, and, under the better light, made a safe crossing of the stable roofs.
+61-70970-0016	4.37	We will go out together to the bower; there is a way down to the court from my window.	61-70970-0024	7.235	They moved thereafter cautiously about the hut, groping before and about them to find something to show that Warrenton had fulfilled his mission.
+61-70970-0016	4.37	We will go out together to the bower; there is a way down to the court from my window.	61-70970-0025	7.435	They were upon the verge of an open trap, in the far corner of the hut; and Stuteley had tripped over the edge of the reversed flap mouth of this pit.
+61-70970-0033	3.42	Truly such a horse should be worth much in Nottingham Fair!	61-70970-0026	5.475	Fitzooth's hand rested at last upon the top rung of a ladder, and slowly the truth came to him.
+61-70970-0032	3.135	enquired Robin, with his suspicions still upon him.	61-70970-0027	5.08	Robin carefully descended the ladder and found himself soon upon firm rocky ground.
+61-70970-0016	4.37	We will go out together to the bower; there is a way down to the court from my window.	61-70970-0028	6.55	Stuteley was by his side in a flash: and then they both began feeling about them to ascertain the shape and character of this vault.
+61-70968-0055	3.965	Robin was glad when, at length, they were left to their own devices.	61-70970-0029	4.03	From the blackness behind the light they heard a voice - Warrenton's!
+61-70970-0007	4.485	He was in deep converse with the clerk, and entered the hall holding him by the arm.	61-70970-0031	5.135	cried he, waving the lanthorn before him to make sure that these were no ghosts in front of him.
+61-70968-0039	3.805	And mine is Will Stuteley. Shall we be comrades"?	61-70970-0034	4.485	Nay, nay, lording," answered Warrenton, with a half laugh.
+61-70968-0006	2.935	But then the picture was gone as quickly as it came".	61-70970-0035	7.405	Warrenton spoke thus with significance, to show Robin that he was not to think Geoffrey's claims to the estate would be passed by.
+61-70970-0033	3.42	Truly such a horse should be worth much in Nottingham Fair!	61-70970-0036	6.785	Robin Fitzooth saw that his doubts of Warrenton had been unfair: and he became ashamed of himself for harboring them.
+61-70968-0052	2.65	But who is this fellow plucking at your sleeve?	61-70970-0037	5.98	His tones rang pleasantly on Warrenton's ears, and forthwith a good fellowship was heralded between them.
+61-70968-0046	3.55	Nottingham Castle was reached, and admittance was demanded.	61-70970-0039	6.665	He implores us to be discreet as the grave in this matter, for in sooth his life is in the hollow of our hands".
+61-70970-0016	4.37	We will go out together to the bower; there is a way down to the court from my window.	61-70970-0040	4.165	They regained their apartment, apparently without disturbing the household of Gamewell.
+5105-28240-0002	4.01	exclaimed Servadac, keeping his eye unmoved at his telescope.	5105-28240-0000	5.455	Fast as his legs could carry him, Servadac had made his way to the top of the cliff.
+5105-28240-0016	4.17	To all these inquiries, the count responded in the affirmative.	5105-28240-0002	4.01	exclaimed Servadac, keeping his eye unmoved at his telescope.
+5105-28240-0013	2.96	Nothing more than you know yourself".	5105-28240-0003	5.515	She is under sail; but she is Count Timascheff's yacht". He was right.
+5105-28240-0014	3.07	Are you certain that this is the Mediterranean"?	5105-28240-0004	6.015	If the count were on board, a strange fatality was bringing him to the presence of his rival.
+5105-28240-0014	3.07	Are you certain that this is the Mediterranean"?	5105-28240-0005	7.4	He reckoned, therefore, not only upon ascertaining the extent of the late catastrophe, but upon learning its cause.
+5105-28240-0014	3.07	Are you certain that this is the Mediterranean"?	5105-28240-0007	4.625	Servadac took it for granted that the Dobryna was endeavoring to put in.
+5105-28241-0014	2.995	Another circumstance was most remarkable.	5105-28240-0011	6.02	I left you on a continent, and here I have the honor of finding you on an island".
+5105-28240-0014	3.07	Are you certain that this is the Mediterranean"?	5105-28240-0015	8.525	For some moments he seemed perfectly stupefied; then, recovering himself, he began to overwhelm the count with a torrent of questions.
+5105-28240-0002	4.01	exclaimed Servadac, keeping his eye unmoved at his telescope.	5105-28240-0016	4.17	To all these inquiries, the count responded in the affirmative.
+5105-28241-0014	2.995	Another circumstance was most remarkable.	5105-28240-0017	5.665	Some mysterious force seemed to have brought about a convulsion of the elements.
+5105-28241-0003	3.98	Steam up and canvas spread, the schooner started eastwards.	5105-28240-0019	6.240062	My yacht is at your service, sir, even should you require to make a tour round the world".
+5105-28233-0001	4.49	He seemed born to please without being conscious of the power he possessed.	5105-28240-0022	4.725	It was on the last day of January that the repairs of the schooner were completed.
+5105-28241-0003	3.98	Steam up and canvas spread, the schooner started eastwards.	5105-28240-0024	8.2	Doubts now arose, and some discussion followed, whether or not it was desirable for Ben Zoof to accompany his master.
+1284-1181-0002	3.835	The head of the Patchwork Girl was the most curious part of her.	1284-1181-0003	4.505	The hair was of brown yarn and hung down on her neck in several neat braids.
+1284-1180-0027	3.27	Yet that task was not so easy as you may suppose.	1284-1181-0004	7.15	Gold is the most common metal in the Land of Oz and is used for many purposes because it is soft and pliable.
+1284-1180-0027	3.27	Yet that task was not so easy as you may suppose.	1284-1181-0007	4.04	She poured into the dish a quantity from each of these bottles.
+1284-1180-0027	3.27	Yet that task was not so easy as you may suppose.	1284-1181-0008	6.08	I think that will do," she continued, "for the other qualities are not needed in a servant".
+1284-1181-0002	3.835	The head of the Patchwork Girl was the most curious part of her.	1284-1181-0009	5.245	She ran to her husband's side at once and helped him lift the four kettles from the fire.
+1284-1181-0002	3.835	The head of the Patchwork Girl was the most curious part of her.	1284-1181-0010	6.435	Their contents had all boiled away, leaving in the bottom of each kettle a few grains of fine white powder.
+1284-1181-0002	3.835	The head of the Patchwork Girl was the most curious part of her.	1284-1181-0011	7.75	Very carefully the Magician removed this powder, placing it all together in a golden dish, where he mixed it with a golden spoon.
+1284-1180-0004	4.285	When they were outside, Unc simply latched the door and started up the path.	1284-1181-0012	8.51	No one saw him do this, for all were looking at the Powder of Life; but soon the woman remembered what she had been doing, and came back to the cupboard.
+1284-1181-0002	3.835	The head of the Patchwork Girl was the most curious part of her.	1284-1181-0014	7.92	He selected a small gold bottle with a pepper box top, so that the powder might be sprinkled on any object through the small holes.
+1284-1181-0007	4.04	She poured into the dish a quantity from each of these bottles.	1284-1181-0015	5.115	Most people talk too much, so it is a relief to find one who talks too little".
+1284-1181-0007	4.04	She poured into the dish a quantity from each of these bottles.	1284-1181-0016	9.515	I am not allowed to perform magic, except for my own amusement," he told his visitors, as he lighted a pipe with a crooked stem and began to smoke.
+1284-1181-0002	3.835	The head of the Patchwork Girl was the most curious part of her.	1284-1181-0020	6.73	Dear me; what a chatterbox you're getting to be, Unc," remarked the Magician, who was pleased with the compliment.
+4446-2271-0012	3.78	I say, Sir Harry, the little girl's going famously to night, isn't she"?	4446-2275-0000	6.34	The stop at Queenstown, the tedious passage up the Mersey, were things that he noted dimly through his growing impatience.
+4446-2273-0002	3.295	Lamb wouldn't care a great deal about many of them, I fancy".	4446-2275-0001	4.66	She blushed and smiled and fumbled his card in her confusion before she ran upstairs.
+4446-2271-0005	3.395	She saves her hand, too. She's at her best in the second act.	4446-2275-0002	7.675	Alexander paced up and down the hallway, buttoning and unbuttoning his overcoat, until she returned and took him up to Hilda's living room.
+4446-2271-0006	2.905	He's been wanting to marry Hilda these three years and more.	4446-2275-0005	4.445	I felt it in my bones when I woke this morning that something splendid was going to turn up.
+4446-2273-0005	4.125	I haven't had a chance yet to tell you what a jolly little place I think this is.	4446-2275-0007	8.975	She pushed him toward the big chair by the fire, and sat down on a stool at the opposite side of the hearth, her knees drawn up to her chin, laughing like a happy little girl.
+4446-2271-0003	3.7	It's been on only two weeks, and I've been half a dozen times already.	4446-2275-0008	4.13	When did you come, Bartley, and how did it happen? You haven't spoken a word".
+4446-2275-0035	4.075	Alexander rose and shook himself angrily. "Yes, I know I'm cowardly.	4446-2275-0012	6.025	She looked at his heavy shoulders and big, determined head, thrust forward like a catapult in leash.
+4446-2273-0036	3.12	Alexander unclenched the two hands at his sides.	4446-2275-0016	7.3	Hilda watched him from her corner, trembling and scarcely breathing, dark shadows growing about her eyes. "It...
+4446-2275-0015	2.98	He pulled up a window as if the air were heavy.	4446-2275-0019	4.93	The world is all there, just as it used to be, but I can't get at it any more.
+4446-2273-0033	3.3	For a long time neither Hilda nor Bartley spoke.	4446-2275-0021	5.05	Hilda's face quivered, but she whispered: "Yes, I think it must have been.
+4446-2273-0030	2.885	Alexander went over and opened the window for her.	4446-2275-0026	5.495	She closed her eyes and took a deep breath, as if to draw in again the fragrance of those days.
+4446-2275-0035	4.075	Alexander rose and shook himself angrily. "Yes, I know I'm cowardly.	4446-2275-0029	6.28	Please tell me one thing, Bartley. At least, tell me that you believe I thought I was making you happy".
+4446-2275-0010	3.735	Alexander leaned forward and warmed his hands before the blaze.	4446-2275-0033	7.06	What I mean is that I want you to promise never to see me again, no matter how often I come, no matter how hard I beg".
+4446-2271-0011	3.945	Sir Harry Towne, mister Bartley Alexander, the American engineer".	4446-2275-0035	4.075	Alexander rose and shook himself angrily. "Yes, I know I'm cowardly.
+4446-2273-0017	2.74	How jolly it was being young, Hilda!	4446-2275-0038	4.53	I will ask the least imaginable, but I must have something!
+4446-2271-0005	3.395	She saves her hand, too. She's at her best in the second act.	4446-2275-0040	6.965	The sight of you, Bartley, to see you living and happy and successful can I never make you understand what that means to me"?
+4446-2271-0005	3.395	She saves her hand, too. She's at her best in the second act.	4446-2275-0041	4.755	You see, loving some one as I love you makes the whole world different.
+4446-2275-0011	2.435	Bartley bent lower over the fire.	4446-2275-0042	5.4	And then you came back, not caring very much, but it made no difference".
+4446-2273-0033	3.3	For a long time neither Hilda nor Bartley spoke.	4446-2275-0043	5.88	Bartley bent over and took her in his arms, kissing her mouth and her wet, tired eyes.
+5142-33396-0015	4.31	As our boat flashed down the rollers into the water I made this song and sang it:	5142-36377-0001	5.39	In five minutes I was in a new world, and my melancholy room was full of the liveliest French company.
+5142-33396-0062	2.9	Now she put her hand on his arm and smiled and said:	5142-36377-0002	5.62	The sound of an imperative and uncompromising bell recalled me in due time to the regions of reality.
+5142-33396-0050	2.885	May you drink heart's ease from it for many years.	5142-36377-0004	5.485	She signed to me, with a ghostly solemnity, to take the vacant place on the left of her father.
+5142-33396-0023	3.48	It was so dark that I could see nothing but a few sparks on the hearth.	5142-36377-0005	7.085	The door opened again while I was still studying the two brothers, without, I honestly confess, being very favorably impressed by either of them.
+5142-33396-0049	3.305	Here, friend, take it,' and he thrust it into the farmer's hand.	5142-36377-0006	4.635	A new member of the family circle, who instantly attracted my attention, entered the room.
+5142-33396-0053	3.93	I took five great bracelets of gold from our treasure chest and gave them to him.	5142-36377-0007	6.18	A little cracked" - that in the popular phrase was my impression of the stranger who now made his appearance in the supper room.
+5142-36586-0000	3.65	It is manifest that man is now subject to much variability.	5142-36377-0010	4.294937	He is not well; he has come over the ocean for rest, and change of scene.
+5142-33396-0023	3.48	It was so dark that I could see nothing but a few sparks on the hearth.	5142-36377-0013	6.585	They pointedly drew back from John Jago as he approached the empty chair next to me and moved round to the opposite side of the table.
+5142-33396-0049	3.305	Here, friend, take it,' and he thrust it into the farmer's hand.	5142-36377-0015	4.34	Our first impressions of people are, in nine cases out of ten, the right impressions.
+5142-33396-0049	3.305	Here, friend, take it,' and he thrust it into the farmer's hand.	5142-36377-0017	4.685	The only cheerful conversation was the conversation across the table between Naomi and me.
+5142-33396-0002	3.67	Two hundred warriors feasted in his hall and followed him to battle.	5142-36377-0018	4.97	He looked up at Naomi doubtingly from his plate, and looked down again slowly with a frown.
+5142-33396-0011	3.52	There she sat on the rollers, as fair a ship as I ever saw.	5142-36377-0020	4.53	A more dreary and more disunited family party I never sat at the table with.
+5142-36586-0000	3.65	It is manifest that man is now subject to much variability.	5142-36377-0023	5.79	You were quite right to say 'No,'" Ambrose began. "Never smoke with John Jago. His cigars will poison you".
+5142-33396-0040	2.81	And these shall follow your thralls in the same way.	5142-36377-0024	5.78	Naomi shook her forefinger reproachfully at them, as if the two sturdy young farmers had been two children.
+8555-292519-0015	2.85	He had broken into her courtyard.	8555-292519-0005	9.575	While the old gold and the marble stays, Forever gleaming its soft strong blaze, Calm in the early evening glow.
+8555-292519-0013	4.185	That was but rustling of dripping plants in the dark.	8555-292519-0007	8.405	It is my heart hung in the sky; And no clouds ever float between The grave flowers and my heart on high.
+8555-292519-0015	2.85	He had broken into her courtyard.	8555-292519-0008	6.025	Over the track lined city street The young men, the grinning men, pass.
+8555-284449-0009	3.27	You are, mate," replied the sailor.	8555-292519-0010	5.77	Old dances are simplified of their yearning, bleached by Time.
+8555-292519-0015	2.85	He had broken into her courtyard.	8555-292519-0012	5.17	Through the black night rain, he sang to her window bars:
+8555-292519-0015	2.85	He had broken into her courtyard.	8555-292519-0013	4.185	That was but rustling of dripping plants in the dark.
+5683-32865-0001	2.58	said Lord Chelford, addressing me.	5683-32866-0002	5.125	But don't these very wise things sometimes turn out very foolishly?
+5683-32865-0001	2.58	said Lord Chelford, addressing me.	5683-32866-0004	9.225	By this time Lord Chelford and Wylder returned; and, disgusted rather with myself, I ruminated on my want of general ship.
+5683-32866-0014	3.97	Don't insult me, Stanley, by talking again as you did this morning.	5683-32866-0005	4.59	and he made a little dip of his cane towards Brandon Hall, over his shoulder.
+5683-32866-0008	3.3	Bracton's a very good fellow, I can assure you.	5683-32866-0006	4.215	Yes, so they said; but that would, I think, have been worse.
+5683-32879-0001	3.66	Well, she was better, though she had had a bad night.	5683-32866-0007	4.12	If a fellow's been a little bit wild, he's Beelzebub at once.
+5683-32866-0015	2.83	What I say is altogether on your own account.	5683-32866-0011	7.37	Their walk continued silent for the greater part, neither was quite satisfied with the other. But Rachel at last said
+5683-32866-0015	2.83	What I say is altogether on your own account.	5683-32866-0012	8.26	Now that's impossible, Radie; for I really don't think I once thought of him all this evening - except just while we were talking.
+5683-32866-0014	3.97	Don't insult me, Stanley, by talking again as you did this morning.	5683-32866-0013	9.93	There was a bright moonlight, broken by the shadows of overhanging boughs and withered leaves; and the mottled lights and shadows glided oddly across his pale features.
+5683-32866-0006	4.215	Yes, so they said; but that would, I think, have been worse.	5683-32866-0016	4.88	Mark my words, you'll find him too strong for you; aye, and too deep.
+5683-32865-0001	2.58	said Lord Chelford, addressing me.	5683-32866-0017	4.585	I am very uneasy about it, whatever it is. I can't help it.
+5683-32879-0001	3.66	Well, she was better, though she had had a bad night.	5683-32866-0018	5.455	To my mind there has always been something inexpressibly awful in family feuds.
+5683-32866-0001	3.47	And he added something still less complimentary.	5683-32866-0021	7.9	My bed was unexceptionably comfortable, but, in my then mood, I could have wished it a great deal more modern.
+5683-32866-0014	3.97	Don't insult me, Stanley, by talking again as you did this morning.	5683-32866-0024	9.855	I shan't trouble you about my train of thoughts or fancies; but I began to feel very like a gentleman in a ghost story, watching experimentally in a haunted chamber.
+5683-32866-0008	3.3	Bracton's a very good fellow, I can assure you.	5683-32866-0027	4.755	A cold, bright moon was shining with clear sharp lights and shadows.
+5683-32879-0012	4.38	Thank you, Rachel, my Cousin Rachel, my only friend.	5683-32866-0028	5.62	The sombre old trees, like gigantic hearse plumes, black and awful.
+5683-32866-0003	2.865	In the meantime I had formed a new idea of her.	5683-32866-0030	4.845	A little bit of plaster tumbled down the chimney, and startled me confoundedly.
+8555-284447-0020	4.09	The goat's warlike spirit was roused by this successful attack.	8555-284449-0001	8.63	Then they all marched out a little way into the fields and found that the Army of Pinkies had already formed and was advancing steadily toward them.
+8555-284447-0020	4.09	The goat's warlike spirit was roused by this successful attack.	8555-284449-0003	8.875	When the Blueskins saw Ghip Ghisizzle they raised another great shout, for he was the favorite of the soldiers and very popular with all the people.
+8555-284447-0003	4.415	But Captain Bill made no such attempt, knowing it would be useless.	8555-284449-0007	9.31	Now, then, let's enter the City and enjoy the grand feast that's being cooked. I'm nearly starved, myself, for this conquering kingdoms is hard work".
+8555-284447-0020	4.09	The goat's warlike spirit was roused by this successful attack.	8555-284449-0008	6.135	Then she gave Rosalie back her magic ring, thanking the kind Witch for all she had done for them.
+8555-284447-0020	4.09	The goat's warlike spirit was roused by this successful attack.	8555-284449-0012	9.87	I'll gladly do that," promised the new Boolooroo; "and I'll feed the honorable goat all the shavings and leather and tin cans he can eat, besides the grass.
+8555-284447-0003	4.415	But Captain Bill made no such attempt, knowing it would be useless.	8555-284449-0013	5.775	Scuse me," said Trot; "I neglected to tell you that you're not the Boolooroo any more.
+8555-292519-0013	4.185	That was but rustling of dripping plants in the dark.	8555-284449-0015	5.12	I'll not be wicked any more," sighed the old Boolooroo; "I'll reform.
+8555-284447-0022	3.56	I had a notion it was you, mate, as saved me from the knife.	8555-284449-0016	5.895	As a private citizen I shall be a model of deportment, because it would be dangerous to be otherwise".
+8555-284447-0020	4.09	The goat's warlike spirit was roused by this successful attack.	8555-284449-0018	7.03	So Ghip Ghisizzle ordered the Captain to take a file of soldiers and escort the raving beauties to their new home.
+8555-284447-0020	4.09	The goat's warlike spirit was roused by this successful attack.	8555-284449-0019	7.61	That evening Trot gave a grand ball in the palace, to which the most important of the Pinkies and the Blueskins were invited.
+8555-284447-0020	4.09	The goat's warlike spirit was roused by this successful attack.	8555-284449-0020	5.095	The combined bands of both the countries played the music and a fine supper was served.
diff --git a/capspeech/nar/data_preprocessing/__init__.py b/capspeech/nar/data_preprocessing/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391
diff --git a/capspeech/nar/data_preprocessing/caption.py b/capspeech/nar/data_preprocessing/caption.py
new file mode 100644
index 0000000000000000000000000000000000000000..3f965e4c044f03f02b9640095ade8fcf17cab8ff
--- /dev/null
+++ b/capspeech/nar/data_preprocessing/caption.py
@@ -0,0 +1,71 @@
+"""Pre-compute FLAN-T5 encoder embeddings for every caption in the dataset.
+
+Reads ``<save_dir>/jsons/*.json`` and writes one compressed ``.npz`` per
+entry to ``<save_dir>/t5/<segment_id>.npz``.  Entries whose output file
+already exists are skipped, so an interrupted run can simply be restarted.
+"""
+import argparse
+import logging
+import json
+import os
+import numpy as np
+import torch
+import tqdm
+import time
+from transformers import T5EncoderModel, AutoTokenizer
+import glob
+
+# Checkpoint shared by the tokenizer and the encoder.
+T5_MODEL_NAME = "google/flan-t5-large"
+
+
+def parse_args():
+    """Parse the command-line options of the caption-encoding script."""
+    parser = argparse.ArgumentParser(description="Encode the data captionings using t5 model")
+    # No usable default exists: every input and output path is derived from it.
+    parser.add_argument('--save_dir', type=str, required=True, help="root data dir: reads <save_dir>/jsons/*.json, writes embeddings to <save_dir>/t5")
+    parser.add_argument('--start', type=int, default=0, help='start index for parallel processing')
+    parser.add_argument('--end', type=int, default=10000000, help='end index for parallel processing')
+    return parser.parse_args()
+
+
+def main():
+    """Encode each caption with the T5 encoder and store the hidden states."""
+    formatter = (
+        "%(asctime)s [%(levelname)s] %(filename)s:%(lineno)d || %(message)s"
+    )
+    logging.basicConfig(format=formatter, level=logging.INFO)
+    args = parse_args()
+
+    # Prefer the GPU, but stay runnable on CPU-only machines.
+    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
+    tokenizer = AutoTokenizer.from_pretrained(T5_MODEL_NAME)
+    caption_encoder = T5EncoderModel.from_pretrained(T5_MODEL_NAME).to(device).eval()
+
+    t5_save_root = os.path.join(args.save_dir, "t5")
+    os.makedirs(t5_save_root, exist_ok=True)
+
+    logging.info("captioning...")
+    json_paths = glob.glob(os.path.join(args.save_dir, 'jsons', '*.json'))
+    for json_path in json_paths:
+        with open(json_path, 'r', encoding="utf-8") as json_file:
+            jsondata = json.load(json_file)
+
+        # --start/--end select a slice of *each* JSON file so that several
+        # workers can process disjoint ranges in parallel.
+        for entry in tqdm.tqdm(jsondata[args.start:args.end]):
+            save_fn = os.path.join(t5_save_root, entry['segment_id'] + ".npz")
+            if os.path.exists(save_fn):
+                continue  # already encoded by a previous run
+
+            with torch.no_grad():
+                batch_encoding = tokenizer(entry['caption'], return_tensors="pt")
+                input_ids = batch_encoding["input_ids"].to(device)
+                hidden = caption_encoder(input_ids=input_ids).last_hidden_state
+
+            # Saved under numpy's default key ``arr_0``.
+            np.savez_compressed(save_fn, hidden.cpu().numpy())
+
+
+if __name__ == "__main__":
+    main()
\ No newline at end of file
diff --git a/capspeech/nar/data_preprocessing/clap.py b/capspeech/nar/data_preprocessing/clap.py
new file mode 100644
index 0000000000000000000000000000000000000000..3660137eb945d99511a75390dd42a78304503105
--- /dev/null
+++ b/capspeech/nar/data_preprocessing/clap.py
@@ -0,0 +1,24 @@
+import torch
+import laion_clap
+import os
+from tqdm import tqdm
+import numpy as np
+
+# One event label per line; skip blank lines so no empty caption is embedded.
+with open("events.txt", "r", encoding="utf-8") as f:
+    events = [line.strip() for line in f if line.strip()]
+
+save_path = './clap_embs'
+# savez_compressed does not create missing directories; do it up front.
+os.makedirs(save_path, exist_ok=True)
+
+model = laion_clap.CLAP_Module(enable_fusion=False)
+model.load_ckpt("./630k-best.pt")
+
+with torch.no_grad():
+    for event in tqdm(events):
+        label = event.lower()
+        text_embed = model.get_text_embedding([label], use_tensor=True)
+        text_embed = text_embed.squeeze().cpu().numpy()
+        save_fn = os.path.join(save_path, label.replace(" ", "_") + ".npz")
+        np.savez_compressed(save_fn, text_embed)
diff --git a/capspeech/nar/data_preprocessing/clap_embs/acoustic_guitar.npz b/capspeech/nar/data_preprocessing/clap_embs/acoustic_guitar.npz
new file mode 100644
index 0000000000000000000000000000000000000000..9f9363cba151502b8658a44caa72fcbcc2bb6eb6
--- /dev/null
+++ b/capspeech/nar/data_preprocessing/clap_embs/acoustic_guitar.npz
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:4653c459d26af10c81c8224fbdfabac1e0a7f7ff3ce7ccdf543d09a4341864ea
+size 2151
diff --git a/capspeech/nar/data_preprocessing/clap_embs/air_conditioning_noise.npz b/capspeech/nar/data_preprocessing/clap_embs/air_conditioning_noise.npz
new file mode 100644
index 0000000000000000000000000000000000000000..957a418fea12639d76a64895c9830d0e5d48e6b6
--- /dev/null
+++ b/capspeech/nar/data_preprocessing/clap_embs/air_conditioning_noise.npz
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:3ab31670aec1a40daed659467db01449ef4cf7a5e84041f7c015c3c9d91141c2
+size 2148
diff --git a/capspeech/nar/data_preprocessing/clap_embs/air_horn.npz b/capspeech/nar/data_preprocessing/clap_embs/air_horn.npz
new file mode 100644
index 0000000000000000000000000000000000000000..33ba05ab4589010b0dd4fbbc2e3bbfdf03d90da0
--- /dev/null
+++ b/capspeech/nar/data_preprocessing/clap_embs/air_horn.npz
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:9530ae89abb36a0fdc3aa16756024fe0fafcabb52ca5b64ffd68aafd502fee9f
+size 2156
diff --git a/capspeech/nar/data_preprocessing/clap_embs/airplane.npz b/capspeech/nar/data_preprocessing/clap_embs/airplane.npz
new file mode 100644
index 0000000000000000000000000000000000000000..ebf85f0acc6ddd4009ce68927beb010c4350cb39
--- /dev/null
+++ b/capspeech/nar/data_preprocessing/clap_embs/airplane.npz
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:21f50e0f127ffa684a2143be08a7a4c9ca203b356aad77c3bd0f71582f4547e7
+size 2144
diff --git a/capspeech/nar/data_preprocessing/clap_embs/airplane_flyby.npz b/capspeech/nar/data_preprocessing/clap_embs/airplane_flyby.npz
new file mode 100644
index 0000000000000000000000000000000000000000..5c3f22966fa66c5d5bd0ad17d79c452f0d1e8bcd
--- /dev/null
+++ b/capspeech/nar/data_preprocessing/clap_embs/airplane_flyby.npz
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:ba7ff69443cc1123bd436cade27a063df989d0d34a7191600602705720ef095c
+size 2152
diff --git a/capspeech/nar/data_preprocessing/clap_embs/alarm_clock_ringing.npz b/capspeech/nar/data_preprocessing/clap_embs/alarm_clock_ringing.npz
new file mode 100644
index 0000000000000000000000000000000000000000..8aa7252657d82f02ecf598c44006ffe91fc0e162
--- /dev/null
+++ b/capspeech/nar/data_preprocessing/clap_embs/alarm_clock_ringing.npz
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:583745cfc35addcc0cb4f992e3092f6249c84699145193d3f052ade1d2680991
+size 2145
diff --git a/capspeech/nar/data_preprocessing/clap_embs/alligators,_crocodiles_hissing.npz b/capspeech/nar/data_preprocessing/clap_embs/alligators,_crocodiles_hissing.npz
new file mode 100644
index 0000000000000000000000000000000000000000..3f60c4b7c3dc6cff03145ca6c2a6c0c20d36a2e0
--- /dev/null
+++ b/capspeech/nar/data_preprocessing/clap_embs/alligators,_crocodiles_hissing.npz
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:29f0614d7d8974a599ace3eaf24494a19bd9333603566bda8f61743914809a83
+size 2156
diff --git a/capspeech/nar/data_preprocessing/clap_embs/ambulance_siren.npz b/capspeech/nar/data_preprocessing/clap_embs/ambulance_siren.npz
new file mode 100644
index 0000000000000000000000000000000000000000..8c285a963580f6e4767eda63f15d42bb7b35845c
--- /dev/null
+++ b/capspeech/nar/data_preprocessing/clap_embs/ambulance_siren.npz
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:31826f693b94181558ae04e4dfb7ac1013a57836ee6785594159b7cf9d221b4d
+size 2150
diff --git a/capspeech/nar/data_preprocessing/clap_embs/applause.npz b/capspeech/nar/data_preprocessing/clap_embs/applause.npz
new file mode 100644
index 0000000000000000000000000000000000000000..4eec214eb36954a6332ca7c47cef561e00d92d42
--- /dev/null
+++ b/capspeech/nar/data_preprocessing/clap_embs/applause.npz
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:dc4e4cca2885580fad4916597668ba832342e7140a686ca78c0645e5cbe95ec4
+size 2154
diff --git a/capspeech/nar/data_preprocessing/clap_embs/arc_welding.npz b/capspeech/nar/data_preprocessing/clap_embs/arc_welding.npz
new file mode 100644
index 0000000000000000000000000000000000000000..ee0f90fd54c7d7faf84a85450699839f2b91372a
--- /dev/null
+++ b/capspeech/nar/data_preprocessing/clap_embs/arc_welding.npz
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:4222718e175b2c415c08a79e0c1d8a1810c597651938abf969ea656c5beaf5d6
+size 2157
diff --git a/capspeech/nar/data_preprocessing/clap_embs/baby_babbling.npz b/capspeech/nar/data_preprocessing/clap_embs/baby_babbling.npz
new file mode 100644
index 0000000000000000000000000000000000000000..b3336c63b48d398f0b0aecef5c7f8e36e3f61d00
--- /dev/null
+++ b/capspeech/nar/data_preprocessing/clap_embs/baby_babbling.npz
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:ff017cefeea4f6dbee1275dfaad8fd3412a6c5ac0dfe584fe06517b90a604c66
+size 2139
diff --git a/capspeech/nar/data_preprocessing/clap_embs/baby_crying.npz b/capspeech/nar/data_preprocessing/clap_embs/baby_crying.npz
new file mode 100644
index 0000000000000000000000000000000000000000..eddc1ac970a2eb33a2b16466ab632468a7a47f21
--- /dev/null
+++ b/capspeech/nar/data_preprocessing/clap_embs/baby_crying.npz
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:f2c7f74518342e6a14673c9c8ac3de136d8084c4ba090ef3173b5ba5d5c84beb
+size 2155
diff --git a/capspeech/nar/data_preprocessing/clap_embs/baby_laughter.npz b/capspeech/nar/data_preprocessing/clap_embs/baby_laughter.npz
new file mode 100644
index 0000000000000000000000000000000000000000..cfd2d7548c850929d0c1bfcdee7ecae530551570
--- /dev/null
+++ b/capspeech/nar/data_preprocessing/clap_embs/baby_laughter.npz
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:451ef71203df0dde3d4a5cb36760c26c19d0b1057fb36f8ea8e97bfb02df0af9
+size 2144
diff --git a/capspeech/nar/data_preprocessing/clap_embs/baltimore_oriole_calling.npz b/capspeech/nar/data_preprocessing/clap_embs/baltimore_oriole_calling.npz
new file mode 100644
index 0000000000000000000000000000000000000000..d852d7b79d438f3dfbe63923ea84d19646c3c2d7
--- /dev/null
+++ b/capspeech/nar/data_preprocessing/clap_embs/baltimore_oriole_calling.npz
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:ace96d5696a134de14cf87cfa21c740c91fe5d42277906c8d3ddb7a1faff660c
+size 2159
diff --git a/capspeech/nar/data_preprocessing/clap_embs/bark.npz b/capspeech/nar/data_preprocessing/clap_embs/bark.npz
new file mode 100644
index 0000000000000000000000000000000000000000..3f228b1aae96a4f43209fc82ecb7b7c68635bc48
--- /dev/null
+++ b/capspeech/nar/data_preprocessing/clap_embs/bark.npz
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:0a393257c86776a4352f345fc25c66867d3bc7527423e301a6d15d4a72fdec5b
+size 2155
diff --git a/capspeech/nar/data_preprocessing/clap_embs/barn_swallow_calling.npz b/capspeech/nar/data_preprocessing/clap_embs/barn_swallow_calling.npz
new file mode 100644
index 0000000000000000000000000000000000000000..f8343b3c35dbe90c5d88f2d808c9cd8766231b26
--- /dev/null
+++ b/capspeech/nar/data_preprocessing/clap_embs/barn_swallow_calling.npz
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:326e149c0a84fd7315f3ff872c3e66f2a1b5534dd9ed0fc5ba4a41f08e9379f5
+size 2152
diff --git a/capspeech/nar/data_preprocessing/clap_embs/basketball_bounce.npz b/capspeech/nar/data_preprocessing/clap_embs/basketball_bounce.npz
new file mode 100644
index 0000000000000000000000000000000000000000..f57f8fd236a5feb79e21e5182f5a37d290689533
--- /dev/null
+++ b/capspeech/nar/data_preprocessing/clap_embs/basketball_bounce.npz
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:1a651fd4638001720995996edebcd036d8694e832fbb965ef615cd95a6477c47
+size 2154
diff --git a/capspeech/nar/data_preprocessing/clap_embs/bass_drum.npz b/capspeech/nar/data_preprocessing/clap_embs/bass_drum.npz
new file mode 100644
index 0000000000000000000000000000000000000000..2d43e1ca0336801d56b6c62f2e7c0a8207a67277
--- /dev/null
+++ b/capspeech/nar/data_preprocessing/clap_embs/bass_drum.npz
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:e8635a8d60b63ba62c603978fea249a5f9b104ff2b7bdc0191f145ec49c06349
+size 2148
diff --git a/capspeech/nar/data_preprocessing/clap_embs/bathroom_ventilation_fan_running.npz b/capspeech/nar/data_preprocessing/clap_embs/bathroom_ventilation_fan_running.npz
new file mode 100644
index 0000000000000000000000000000000000000000..46b196d3f9c3cefd5cdbafee4411df42990c9238
--- /dev/null
+++ b/capspeech/nar/data_preprocessing/clap_embs/bathroom_ventilation_fan_running.npz
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:c6261024ee812a95500dadc63d0250d012381664113d5306d3609db53042a6ed
+size 2152
diff --git a/capspeech/nar/data_preprocessing/clap_embs/beat_boxing.npz b/capspeech/nar/data_preprocessing/clap_embs/beat_boxing.npz
new file mode 100644
index 0000000000000000000000000000000000000000..268e1d22b267957a4ba09655b927687b15b306ef
--- /dev/null
+++ b/capspeech/nar/data_preprocessing/clap_embs/beat_boxing.npz
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:8fa07f6b305a2f7275f8beb5b7982abf51fd7df90d3a289e46448b7566744c04
+size 2143
diff --git a/capspeech/nar/data_preprocessing/clap_embs/bee,_wasp,_etc._buzzing.npz b/capspeech/nar/data_preprocessing/clap_embs/bee,_wasp,_etc._buzzing.npz
new file mode 100644
index 0000000000000000000000000000000000000000..52a54c34300ee53609ffd25cb38387c434586fbc
--- /dev/null
+++ b/capspeech/nar/data_preprocessing/clap_embs/bee,_wasp,_etc._buzzing.npz
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:e7256509be45557c8f729979b15944d7984f0a96a790618fd87a7bc10fd401a0
+size 2154
diff --git a/capspeech/nar/data_preprocessing/clap_embs/bird_chirping,_tweeting.npz b/capspeech/nar/data_preprocessing/clap_embs/bird_chirping,_tweeting.npz
new file mode 100644
index 0000000000000000000000000000000000000000..b62270a4f09a473d26108decfd39db9869a5f2a3
--- /dev/null
+++ b/capspeech/nar/data_preprocessing/clap_embs/bird_chirping,_tweeting.npz
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:d2bd3d857bf9fe3a77f1c2ed3574e5d836830b5a434acacf0eab48f8546285ec
+size 2157
diff --git a/capspeech/nar/data_preprocessing/clap_embs/bird_squawking.npz b/capspeech/nar/data_preprocessing/clap_embs/bird_squawking.npz
new file mode 100644
index 0000000000000000000000000000000000000000..775b51fd2558d9f3a18210c53daf4c5e4fcefe9c
--- /dev/null
+++ b/capspeech/nar/data_preprocessing/clap_embs/bird_squawking.npz
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:5a8aed2eaabd5c8a9b60a1da87b33724924952f539203e21d5f75101ac41fb63
+size 2160
diff --git a/capspeech/nar/data_preprocessing/clap_embs/bird_wings_flapping.npz b/capspeech/nar/data_preprocessing/clap_embs/bird_wings_flapping.npz
new file mode 100644
index 0000000000000000000000000000000000000000..dbd3331c5cf7f3b88e10cd8e2bd8f4786d524f61
--- /dev/null
+++ b/capspeech/nar/data_preprocessing/clap_embs/bird_wings_flapping.npz
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:f53479dc990cb3271ca2326d9d1a5478ca03c9ca141b75f4a8dc34c87f3c02c2
+size 2149
diff --git a/capspeech/nar/data_preprocessing/clap_embs/black_capped_chickadee_calling.npz b/capspeech/nar/data_preprocessing/clap_embs/black_capped_chickadee_calling.npz
new file mode 100644
index 0000000000000000000000000000000000000000..24a19fbcd78dd08ea6a8ea31891e425d61afe8f4
--- /dev/null
+++ b/capspeech/nar/data_preprocessing/clap_embs/black_capped_chickadee_calling.npz
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:211303b79109cb3fa7375e718ed6860b06179344e5cf296cbd0bb9dac648bde4
+size 2153
diff --git a/capspeech/nar/data_preprocessing/clap_embs/blowtorch_igniting.npz b/capspeech/nar/data_preprocessing/clap_embs/blowtorch_igniting.npz
new file mode 100644
index 0000000000000000000000000000000000000000..373a1c9a12473b936d0cc01e85e69bc574927331
--- /dev/null
+++ b/capspeech/nar/data_preprocessing/clap_embs/blowtorch_igniting.npz
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:277db9b2b16d64b007dbcf4ed25dc9ad6af7570fac59f10d14151fc5a05e006a
+size 2146
diff --git a/capspeech/nar/data_preprocessing/clap_embs/bouncing_on_trampoline.npz b/capspeech/nar/data_preprocessing/clap_embs/bouncing_on_trampoline.npz
new file mode 100644
index 0000000000000000000000000000000000000000..04cf7000bfd8077894257069cbcdb74edfe5fb4d
--- /dev/null
+++ b/capspeech/nar/data_preprocessing/clap_embs/bouncing_on_trampoline.npz
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:a85571a6660d1eb0101a7abd8a63a8dde4f173e14fe86a0aac65a8af6ad3ab63
+size 2156
diff --git a/capspeech/nar/data_preprocessing/clap_embs/bowling_impact.npz b/capspeech/nar/data_preprocessing/clap_embs/bowling_impact.npz
new file mode 100644
index 0000000000000000000000000000000000000000..e77b46ddd0dbe4b592535e117fb1b30b169a117e
--- /dev/null
+++ b/capspeech/nar/data_preprocessing/clap_embs/bowling_impact.npz
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:d76e2a6046ef987a594eaf28efc4799d208b012bb9c30abeca87809af2bc5551
+size 2159
diff --git a/capspeech/nar/data_preprocessing/clap_embs/breathing.npz b/capspeech/nar/data_preprocessing/clap_embs/breathing.npz
new file mode 100644
index 0000000000000000000000000000000000000000..d2ee6bd314d80e1860bb740ea0fd86ce981bd2f7
--- /dev/null
+++ b/capspeech/nar/data_preprocessing/clap_embs/breathing.npz
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:88b2ea82ebf6702e6159ec28876af9a754179f8598009d93e096f5f79d0a4ec6
+size 2153
diff --git a/capspeech/nar/data_preprocessing/clap_embs/brushing_teeth.npz b/capspeech/nar/data_preprocessing/clap_embs/brushing_teeth.npz
new file mode 100644
index 0000000000000000000000000000000000000000..690eba295cbf1d212a7210250cc9b1626d581b2e
--- /dev/null
+++ b/capspeech/nar/data_preprocessing/clap_embs/brushing_teeth.npz
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:4157a2a7721577d9ec12007ed24cd19c6fb0aa4f069906d55b78ffe47a951f5a
+size 2154
diff --git a/capspeech/nar/data_preprocessing/clap_embs/bull_bellowing.npz b/capspeech/nar/data_preprocessing/clap_embs/bull_bellowing.npz
new file mode 100644
index 0000000000000000000000000000000000000000..11eefa284af431d7c04e67303b4354cd67835347
--- /dev/null
+++ b/capspeech/nar/data_preprocessing/clap_embs/bull_bellowing.npz
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:4070b0722e1663e3f29cf97b6339832da5ec26dff5ed7d77e7d72db73381d59a
+size 2155
diff --git a/capspeech/nar/data_preprocessing/clap_embs/burping_or_eructation.npz b/capspeech/nar/data_preprocessing/clap_embs/burping_or_eructation.npz
new file mode 100644
index 0000000000000000000000000000000000000000..ce106e126774163b3a8120269cc70c6b87d34fc7
--- /dev/null
+++ b/capspeech/nar/data_preprocessing/clap_embs/burping_or_eructation.npz
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:5aa1d3c59d2fdbc3aeb0e370bbb5c1ce13886e17240ade5ee327cfa585cb52de
+size 2158
diff --git a/capspeech/nar/data_preprocessing/clap_embs/bus.npz b/capspeech/nar/data_preprocessing/clap_embs/bus.npz
new file mode 100644
index 0000000000000000000000000000000000000000..6797fce935e986fa9a906b00f03897d2a5ac957f
--- /dev/null
+++ b/capspeech/nar/data_preprocessing/clap_embs/bus.npz
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:12b82af7da0ff7b29c161540006fb7c578119a4526c08a0d6fdfe4902ed3130c
+size 2146
diff --git a/capspeech/nar/data_preprocessing/clap_embs/can_opening.npz b/capspeech/nar/data_preprocessing/clap_embs/can_opening.npz
new file mode 100644
index 0000000000000000000000000000000000000000..ae7e629c7efec882d8abb7cf13fd9b6ab0737302
--- /dev/null
+++ b/capspeech/nar/data_preprocessing/clap_embs/can_opening.npz
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:49d38bec3745ac7be5ea1b34ead78b4b9726aa220aa411498b938403f5174149
+size 2155
diff --git a/capspeech/nar/data_preprocessing/clap_embs/canary_calling.npz b/capspeech/nar/data_preprocessing/clap_embs/canary_calling.npz
new file mode 100644
index 0000000000000000000000000000000000000000..b0ef6709785cb514b36ab702fb26b8e0346fbf68
--- /dev/null
+++ b/capspeech/nar/data_preprocessing/clap_embs/canary_calling.npz
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:6bd4c7edcdba1c29a81f3d226b127c9ca8127d25656ce60d57734ed70b790749
+size 2154
diff --git a/capspeech/nar/data_preprocessing/clap_embs/cap_gun_shooting.npz b/capspeech/nar/data_preprocessing/clap_embs/cap_gun_shooting.npz
new file mode 100644
index 0000000000000000000000000000000000000000..bea709310d1bae1a726e8cdd18ffa76af613a6c7
--- /dev/null
+++ b/capspeech/nar/data_preprocessing/clap_embs/cap_gun_shooting.npz
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:fa8a75d2111c879ec39401e4ff25c9a3c57bc8faa349791974ccfb6c01064a12
+size 2146
diff --git a/capspeech/nar/data_preprocessing/clap_embs/car_engine_idling.npz b/capspeech/nar/data_preprocessing/clap_embs/car_engine_idling.npz
new file mode 100644
index 0000000000000000000000000000000000000000..e48b10e86799feddf074b39461a4f3832f23bbe6
--- /dev/null
+++ b/capspeech/nar/data_preprocessing/clap_embs/car_engine_idling.npz
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:6da9dfeacfbf92bd592b2ff1bb8b30f711bf7697372a9c04069e18a837be764f
+size 2149
diff --git a/capspeech/nar/data_preprocessing/clap_embs/car_engine_knocking.npz b/capspeech/nar/data_preprocessing/clap_embs/car_engine_knocking.npz
new file mode 100644
index 0000000000000000000000000000000000000000..e9f4570646db3e1dd7e0794b232a5f26a018cfba
--- /dev/null
+++ b/capspeech/nar/data_preprocessing/clap_embs/car_engine_knocking.npz
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:7da204b651ac906552af8e53ad9f9e88de6f9c9f612389bff5b2d5f19e7ddb02
+size 2155
diff --git a/capspeech/nar/data_preprocessing/clap_embs/car_engine_starting.npz b/capspeech/nar/data_preprocessing/clap_embs/car_engine_starting.npz
new file mode 100644
index 0000000000000000000000000000000000000000..f1f97a0f0225ec92ad44bc6c2173777a887bcb8a
--- /dev/null
+++ b/capspeech/nar/data_preprocessing/clap_embs/car_engine_starting.npz
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:3120c614dca65cdb4031e1beb1531868d2c1b1c744b427b2d46c408de190764f
+size 2149
diff --git a/capspeech/nar/data_preprocessing/clap_embs/car_horn.npz b/capspeech/nar/data_preprocessing/clap_embs/car_horn.npz
new file mode 100644
index 0000000000000000000000000000000000000000..ce399bb2e1f8699eeaf25755225fbb1e1d4cbf57
--- /dev/null
+++ b/capspeech/nar/data_preprocessing/clap_embs/car_horn.npz
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:357934c382b2f4dc038eb698ea13e2afbdac2cde959b07f58b3b53087b6d904c
+size 2158
diff --git a/capspeech/nar/data_preprocessing/clap_embs/car_passing_by.npz b/capspeech/nar/data_preprocessing/clap_embs/car_passing_by.npz
new file mode 100644
index 0000000000000000000000000000000000000000..6cb5634a2ec3e543d8eb2164fcbf65c69d45dd60
--- /dev/null
+++ b/capspeech/nar/data_preprocessing/clap_embs/car_passing_by.npz
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:15f220d929fc7929b0d88a4c8a6a4b78246dca352f7ecc30c59578a5cbb35ac8
+size 2146
diff --git a/capspeech/nar/data_preprocessing/clap_embs/cat.npz b/capspeech/nar/data_preprocessing/clap_embs/cat.npz
new file mode 100644
index 0000000000000000000000000000000000000000..b525eef5e8f50313fe67c7c4d331c2d956edb6cd
--- /dev/null
+++ b/capspeech/nar/data_preprocessing/clap_embs/cat.npz
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:30df1b33d51b96286e86c60800086b722b867e134940875a686efb1e81bd35c9
+size 2156
diff --git a/capspeech/nar/data_preprocessing/clap_embs/cat_caterwauling.npz b/capspeech/nar/data_preprocessing/clap_embs/cat_caterwauling.npz
new file mode 100644
index 0000000000000000000000000000000000000000..090decb168cfb336f7d6eb8bde318d691b3d1385
--- /dev/null
+++ b/capspeech/nar/data_preprocessing/clap_embs/cat_caterwauling.npz
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:57376c0ffa14d3062fa6f15860d9bf62afbd46211ea65c7e89e70c146cacf6b0
+size 2154
diff --git a/capspeech/nar/data_preprocessing/clap_embs/cat_growling.npz b/capspeech/nar/data_preprocessing/clap_embs/cat_growling.npz
new file mode 100644
index 0000000000000000000000000000000000000000..11eaeac9f14596b7a7400d4dac3f9320a4d177ab
--- /dev/null
+++ b/capspeech/nar/data_preprocessing/clap_embs/cat_growling.npz
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:e247387ed2053b24c6f58c2cfb65b3eacfeaf3e0cf47ae746795ea5c72c41b6d
+size 2149
diff --git a/capspeech/nar/data_preprocessing/clap_embs/cat_hissing.npz b/capspeech/nar/data_preprocessing/clap_embs/cat_hissing.npz
new file mode 100644
index 0000000000000000000000000000000000000000..63811e03ee413ce43220d30d4232ae1741afa562
--- /dev/null
+++ b/capspeech/nar/data_preprocessing/clap_embs/cat_hissing.npz
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:ee7c612549e7a0517e8f1f61d49dfb30e278e9060434106f144d11d031fb66f1
+size 2147
diff --git a/capspeech/nar/data_preprocessing/clap_embs/cat_meowing.npz b/capspeech/nar/data_preprocessing/clap_embs/cat_meowing.npz
new file mode 100644
index 0000000000000000000000000000000000000000..8820900086d06922a8ad9eb5089460a0be8b6e6d
--- /dev/null
+++ b/capspeech/nar/data_preprocessing/clap_embs/cat_meowing.npz
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:79116cbe255d00c9560fb38c9325c02389e4dbeca36da0614846ea20a6bb69d7
+size 2155
diff --git a/capspeech/nar/data_preprocessing/clap_embs/cat_purring.npz b/capspeech/nar/data_preprocessing/clap_embs/cat_purring.npz
new file mode 100644
index 0000000000000000000000000000000000000000..c19a9508f2f3adb49162893aba1acd77d944dff6
--- /dev/null
+++ b/capspeech/nar/data_preprocessing/clap_embs/cat_purring.npz
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:9b8d00222b3edbe9e21e1b5b01185e32150809be496d97f324f99700831f1261
+size 2152
diff --git a/capspeech/nar/data_preprocessing/clap_embs/cattle,_bovinae_cowbell.npz b/capspeech/nar/data_preprocessing/clap_embs/cattle,_bovinae_cowbell.npz
new file mode 100644
index 0000000000000000000000000000000000000000..32698091a720a6cf5571e9d4b4c2ba08c2e9544e
--- /dev/null
+++ b/capspeech/nar/data_preprocessing/clap_embs/cattle,_bovinae_cowbell.npz
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:6ca37348ebba3888243c9c367f0793cb68f150e7aa83f13913a5154aadc4c1d8
+size 2145
diff --git a/capspeech/nar/data_preprocessing/clap_embs/cattle_mooing.npz b/capspeech/nar/data_preprocessing/clap_embs/cattle_mooing.npz
new file mode 100644
index 0000000000000000000000000000000000000000..41a3ae8109c0edbc07636bba0fbaa92deb38251f
--- /dev/null
+++ b/capspeech/nar/data_preprocessing/clap_embs/cattle_mooing.npz
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:d3f2d62176a5a4c52b05008c29003eb0236d99792b04586cfc0ad35d14173095
+size 2147
diff --git a/capspeech/nar/data_preprocessing/clap_embs/cell_phone_buzzing.npz b/capspeech/nar/data_preprocessing/clap_embs/cell_phone_buzzing.npz
new file mode 100644
index 0000000000000000000000000000000000000000..a89b26b05fea26e527afc852dcef17e618fec4cb
--- /dev/null
+++ b/capspeech/nar/data_preprocessing/clap_embs/cell_phone_buzzing.npz
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:bcc6f41021b88ad89ac7858c2a6b61a8cb1d1c6b07c4bbdfce6208c7c5543cb3
+size 2151
diff --git a/capspeech/nar/data_preprocessing/clap_embs/cello.npz b/capspeech/nar/data_preprocessing/clap_embs/cello.npz
new file mode 100644
index 0000000000000000000000000000000000000000..12e518ba22f1130e380268b867701930db74ea06
--- /dev/null
+++ b/capspeech/nar/data_preprocessing/clap_embs/cello.npz
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:bf4a2b1a2dac1ea2939335ab71a2eb3f43e250042c28a03cbbe53ac529739550
+size 2152
diff --git a/capspeech/nar/data_preprocessing/clap_embs/chainsaw.npz b/capspeech/nar/data_preprocessing/clap_embs/chainsaw.npz
new file mode 100644
index 0000000000000000000000000000000000000000..44f3d4c2d6e7e9fe7eaa84f05e0a6c53fa4f88b5
--- /dev/null
+++ b/capspeech/nar/data_preprocessing/clap_embs/chainsaw.npz
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:ac81c2ef30672f4c3fee7525860334b9d2527414cf8ab185bcbf0b895c209fd0
+size 2159
diff --git a/capspeech/nar/data_preprocessing/clap_embs/chainsawing_trees.npz b/capspeech/nar/data_preprocessing/clap_embs/chainsawing_trees.npz
new file mode 100644
index 0000000000000000000000000000000000000000..4cbc313ed136afc64a72a87cdd872170cb1638ae
--- /dev/null
+++ b/capspeech/nar/data_preprocessing/clap_embs/chainsawing_trees.npz
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:90ea424791ea78187f77a7f4ba6aa1b5dc110d775154e7510f3b471bf5ff1e01
+size 2151
diff --git a/capspeech/nar/data_preprocessing/clap_embs/cheetah_chirrup.npz b/capspeech/nar/data_preprocessing/clap_embs/cheetah_chirrup.npz
new file mode 100644
index 0000000000000000000000000000000000000000..3140600a74ea663d442c0c608414f1b906eec95e
--- /dev/null
+++ b/capspeech/nar/data_preprocessing/clap_embs/cheetah_chirrup.npz
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:2dd1efe2e8ea10fb811a941dcc4862f0bad8b11351bd7b802bee41fd8371f56e
+size 2147
diff --git a/capspeech/nar/data_preprocessing/clap_embs/chicken_clucking.npz b/capspeech/nar/data_preprocessing/clap_embs/chicken_clucking.npz
new file mode 100644
index 0000000000000000000000000000000000000000..77069f946816dde971ffbed87fbc5c50ad1251e0
--- /dev/null
+++ b/capspeech/nar/data_preprocessing/clap_embs/chicken_clucking.npz
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:7233c7d1bd94caf7d428488f1be6d0bce83213c3f36ef06aba899f0afd65864d
+size 2158
diff --git a/capspeech/nar/data_preprocessing/clap_embs/chicken_crowing.npz b/capspeech/nar/data_preprocessing/clap_embs/chicken_crowing.npz
new file mode 100644
index 0000000000000000000000000000000000000000..741ed71d31a7f70eba1aaf87c4767b3f76919d3f
--- /dev/null
+++ b/capspeech/nar/data_preprocessing/clap_embs/chicken_crowing.npz
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:746246a602af40739612ff646371e846ee00e4f186bef5c4cdc740a65d3b7497
+size 2153
diff --git a/capspeech/nar/data_preprocessing/clap_embs/child_singing.npz b/capspeech/nar/data_preprocessing/clap_embs/child_singing.npz
new file mode 100644
index 0000000000000000000000000000000000000000..521f15dd2d18ad9dca0b08769ed7c5ae50a11a26
--- /dev/null
+++ b/capspeech/nar/data_preprocessing/clap_embs/child_singing.npz
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:4844caa55f78c858f05224d878ae8f0458a21359d7affd0ed0ba5a835e527970
+size 2145
diff --git a/capspeech/nar/data_preprocessing/clap_embs/children_shouting.npz b/capspeech/nar/data_preprocessing/clap_embs/children_shouting.npz
new file mode 100644
index 0000000000000000000000000000000000000000..fe8740cad57060428c3f279d0ab93e14934f3dfd
--- /dev/null
+++ b/capspeech/nar/data_preprocessing/clap_embs/children_shouting.npz
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:bb2daa682c5d1570309e23f96123f9836ae059f31e9fcb0466cbd139b09e5bcf
+size 2147
diff --git a/capspeech/nar/data_preprocessing/clap_embs/chime.npz b/capspeech/nar/data_preprocessing/clap_embs/chime.npz
new file mode 100644
index 0000000000000000000000000000000000000000..358023f9100a11b0faea56b2d63c049a68aa83fb
--- /dev/null
+++ b/capspeech/nar/data_preprocessing/clap_embs/chime.npz
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:5f2e913ae61c7809ddfac511f8c30de566c8a3a5d7318432f2af010c81e1bb53
+size 2153
diff --git a/capspeech/nar/data_preprocessing/clap_embs/chimpanzee_pant-hooting.npz b/capspeech/nar/data_preprocessing/clap_embs/chimpanzee_pant-hooting.npz
new file mode 100644
index 0000000000000000000000000000000000000000..a1e7362536886486649762d53b22148e7edc2886
--- /dev/null
+++ b/capspeech/nar/data_preprocessing/clap_embs/chimpanzee_pant-hooting.npz
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:f4cc3106b4b7b09a4b642735401c1da1cb115889bef2387907ff268498542a14
+size 2153
diff --git a/capspeech/nar/data_preprocessing/clap_embs/chinchilla_barking.npz b/capspeech/nar/data_preprocessing/clap_embs/chinchilla_barking.npz
new file mode 100644
index 0000000000000000000000000000000000000000..7a760dbb887e9d22c88254bb20fbad92891303f2
--- /dev/null
+++ b/capspeech/nar/data_preprocessing/clap_embs/chinchilla_barking.npz
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:97e889da32c7cb120b08a15a918eebe5e1d52e7101b7361c28191ddc4fc7432b
+size 2151
diff --git a/capspeech/nar/data_preprocessing/clap_embs/chipmunk_chirping.npz b/capspeech/nar/data_preprocessing/clap_embs/chipmunk_chirping.npz
new file mode 100644
index 0000000000000000000000000000000000000000..4052e9aae8308306d79e7f1f059cabae363b0e4a
--- /dev/null
+++ b/capspeech/nar/data_preprocessing/clap_embs/chipmunk_chirping.npz
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:2afafd75bea76c56a4fc6a333058a883267e4291753e59a6f4a448a872eff264
+size 2145
diff --git a/capspeech/nar/data_preprocessing/clap_embs/chirping_birds.npz b/capspeech/nar/data_preprocessing/clap_embs/chirping_birds.npz
new file mode 100644
index 0000000000000000000000000000000000000000..f8fe0789e4d02fca0cfc09573167cd64a668d4c7
--- /dev/null
+++ b/capspeech/nar/data_preprocessing/clap_embs/chirping_birds.npz
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:a7effe5ece41965df9eb1cf9bd97617329eb7c6c851f67f5356b802d52387b62
+size 2151
diff --git a/capspeech/nar/data_preprocessing/clap_embs/chopping_food.npz b/capspeech/nar/data_preprocessing/clap_embs/chopping_food.npz
new file mode 100644
index 0000000000000000000000000000000000000000..bbe18f363527204992c36c78dc78170d452a8ccf
--- /dev/null
+++ b/capspeech/nar/data_preprocessing/clap_embs/chopping_food.npz
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:45918b183d617c3b22a06de899a6f7f7344fa0f1bcc46906701f029ea7604a92
+size 2148
diff --git a/capspeech/nar/data_preprocessing/clap_embs/chopping_wood.npz b/capspeech/nar/data_preprocessing/clap_embs/chopping_wood.npz
new file mode 100644
index 0000000000000000000000000000000000000000..c0510e0646682055e130702edb8198ca8f70bc00
--- /dev/null
+++ b/capspeech/nar/data_preprocessing/clap_embs/chopping_wood.npz
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:1c9690b0ef0a930933e812c82306dd03bfe30b74ab8bb14506830374ff2949f4
+size 2148
diff --git a/capspeech/nar/data_preprocessing/clap_embs/church_bell_ringing.npz b/capspeech/nar/data_preprocessing/clap_embs/church_bell_ringing.npz
new file mode 100644
index 0000000000000000000000000000000000000000..c475a9c614f3d035b963e9a51fb0a0f22c6d0fb8
--- /dev/null
+++ b/capspeech/nar/data_preprocessing/clap_embs/church_bell_ringing.npz
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:9ece5af36ef09b3fe84aad9b0b60f7522d7786f43fe6340181e769e2aaea2f04
+size 2153
diff --git a/capspeech/nar/data_preprocessing/clap_embs/church_bells.npz b/capspeech/nar/data_preprocessing/clap_embs/church_bells.npz
new file mode 100644
index 0000000000000000000000000000000000000000..302062ced6e51d54889c2e101823eddadb52eb66
--- /dev/null
+++ b/capspeech/nar/data_preprocessing/clap_embs/church_bells.npz
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:b442d85a5acac532a1bdf8a5a7e4f874d5565360041f9322800394c5ea8555fc
+size 2149
diff --git a/capspeech/nar/data_preprocessing/clap_embs/civil_defense_siren.npz b/capspeech/nar/data_preprocessing/clap_embs/civil_defense_siren.npz
new file mode 100644
index 0000000000000000000000000000000000000000..f6eae655c43e34001b27553a2c938491764ed923
--- /dev/null
+++ b/capspeech/nar/data_preprocessing/clap_embs/civil_defense_siren.npz
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:ed6e5628097cd48b3c178bac5abea29c634ad92d449e7ba70744dc65246243a0
+size 2152
diff --git a/capspeech/nar/data_preprocessing/clap_embs/clapping.npz b/capspeech/nar/data_preprocessing/clap_embs/clapping.npz
new file mode 100644
index 0000000000000000000000000000000000000000..cfcff045b760c21728297edc73ab0c253c896fff
--- /dev/null
+++ b/capspeech/nar/data_preprocessing/clap_embs/clapping.npz
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:6cd129cb1fa94d881108228f22698bc273acbcd284eb24876093a4509bb6e41f
+size 2150
diff --git a/capspeech/nar/data_preprocessing/clap_embs/clarinet.npz b/capspeech/nar/data_preprocessing/clap_embs/clarinet.npz
new file mode 100644
index 0000000000000000000000000000000000000000..21e7090a9702910b704f8ad5cd53eb23e7308f41
--- /dev/null
+++ b/capspeech/nar/data_preprocessing/clap_embs/clarinet.npz
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:795e4b25d60cf29e00ec838f808ea5b9993dfef38c45d3c3b11ed8da7be8cf48
+size 2154
diff --git a/capspeech/nar/data_preprocessing/clap_embs/clock_alarm.npz b/capspeech/nar/data_preprocessing/clap_embs/clock_alarm.npz
new file mode 100644
index 0000000000000000000000000000000000000000..178da7793cd4281e6504c15ebddc2a0240cc64c6
--- /dev/null
+++ b/capspeech/nar/data_preprocessing/clap_embs/clock_alarm.npz
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:a8d0c392180a5419b683f03a22449f08f3bc376665ee9cceb170bf9ea0b1a154
+size 2142
diff --git a/capspeech/nar/data_preprocessing/clap_embs/clock_tick.npz b/capspeech/nar/data_preprocessing/clap_embs/clock_tick.npz
new file mode 100644
index 0000000000000000000000000000000000000000..951c678954a30430876bcc5adb6bf24aa92eba72
--- /dev/null
+++ b/capspeech/nar/data_preprocessing/clap_embs/clock_tick.npz
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:1b12e5ef81fa6b4a6a70b507cdef10f5b86ece271ea9205895fc8474994f9b6c
+size 2153
diff --git a/capspeech/nar/data_preprocessing/clap_embs/computer_keyboard.npz b/capspeech/nar/data_preprocessing/clap_embs/computer_keyboard.npz
new file mode 100644
index 0000000000000000000000000000000000000000..b20284054e56d57410366368a4377812e11b2d13
--- /dev/null
+++ b/capspeech/nar/data_preprocessing/clap_embs/computer_keyboard.npz
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:58cf85ee0212f346eff4460eaf0dcbd2e3ccaade5d811b1076089be55803b921
+size 2153
diff --git a/capspeech/nar/data_preprocessing/clap_embs/cough.npz b/capspeech/nar/data_preprocessing/clap_embs/cough.npz
new file mode 100644
index 0000000000000000000000000000000000000000..2f2892346d9e39205075f17cc8c64d28d3046d9a
--- /dev/null
+++ b/capspeech/nar/data_preprocessing/clap_embs/cough.npz
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:412f2a05f4123ec9c55a40bac91b2cd8acebbaf4eeb90bbf93bc93ea0bc1e2a7
+size 2156
diff --git a/capspeech/nar/data_preprocessing/clap_embs/coughing.npz b/capspeech/nar/data_preprocessing/clap_embs/coughing.npz
new file mode 100644
index 0000000000000000000000000000000000000000..6dbc994424fe72c20e12dc6a82bb4c2a6942eba5
--- /dev/null
+++ b/capspeech/nar/data_preprocessing/clap_embs/coughing.npz
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:9b568b96741601fc747d4f9a3e5df97b9c5c200af07ab6e1bcb468e5c95d187a
+size 2149
diff --git a/capspeech/nar/data_preprocessing/clap_embs/cow.npz b/capspeech/nar/data_preprocessing/clap_embs/cow.npz
new file mode 100644
index 0000000000000000000000000000000000000000..3737dbb4d4f858c004049b72342938da7731582c
--- /dev/null
+++ b/capspeech/nar/data_preprocessing/clap_embs/cow.npz
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:af44c9b6ba11f36507bfbb2da39a586602f4574ae2486b7643cdc7d2d3ad115f
+size 2151
diff --git a/capspeech/nar/data_preprocessing/clap_embs/cow_lowing.npz b/capspeech/nar/data_preprocessing/clap_embs/cow_lowing.npz
new file mode 100644
index 0000000000000000000000000000000000000000..8b5917dff735a1a834572bbe3aac04c003758d74
--- /dev/null
+++ b/capspeech/nar/data_preprocessing/clap_embs/cow_lowing.npz
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:604b78e2a144bdd7161cf7967044703ebe32780e40ea76bcf1d94f8010f9fe1f
+size 2154
diff --git a/capspeech/nar/data_preprocessing/clap_embs/cowbell.npz b/capspeech/nar/data_preprocessing/clap_embs/cowbell.npz
new file mode 100644
index 0000000000000000000000000000000000000000..e10a9bcc63b6f0ef46ba25885b825219e2b8748f
--- /dev/null
+++ b/capspeech/nar/data_preprocessing/clap_embs/cowbell.npz
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:0a8259a85750a998e9ee706db65bfa6c3cc86fcefc35746005fed26642660396
+size 2153
diff --git a/capspeech/nar/data_preprocessing/clap_embs/coyote_howling.npz b/capspeech/nar/data_preprocessing/clap_embs/coyote_howling.npz
new file mode 100644
index 0000000000000000000000000000000000000000..00b4ba6e31f1aa535cfd1ecc55480139e72cb01c
--- /dev/null
+++ b/capspeech/nar/data_preprocessing/clap_embs/coyote_howling.npz
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:c3679922b5ddb17021c23a93604fb563e0e5d2c6b1ed424ff090c0867fc0d25c
+size 2155
diff --git a/capspeech/nar/data_preprocessing/clap_embs/crackling_fire.npz b/capspeech/nar/data_preprocessing/clap_embs/crackling_fire.npz
new file mode 100644
index 0000000000000000000000000000000000000000..a78e08b6931ddcdd3c940669002d1ca28b4c460d
--- /dev/null
+++ b/capspeech/nar/data_preprocessing/clap_embs/crackling_fire.npz
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:f747f9087d6ec46ab0fe2376083b2b406776c0423eaa5f1a9b98dc9ce4fa9753
+size 2155
diff --git a/capspeech/nar/data_preprocessing/clap_embs/cricket_chirping.npz b/capspeech/nar/data_preprocessing/clap_embs/cricket_chirping.npz
new file mode 100644
index 0000000000000000000000000000000000000000..0cc357ab17d5c90b3899e095b8fd625beeeaedfe
--- /dev/null
+++ b/capspeech/nar/data_preprocessing/clap_embs/cricket_chirping.npz
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:d338a5c3d8b3f17bc2a3fa30c82af1155694574c6a046c589615a1f3c5916d1d
+size 2155
diff --git a/capspeech/nar/data_preprocessing/clap_embs/crickets.npz b/capspeech/nar/data_preprocessing/clap_embs/crickets.npz
new file mode 100644
index 0000000000000000000000000000000000000000..f1d8d1928ebd803950439f8bb9013a9bd135bf8e
--- /dev/null
+++ b/capspeech/nar/data_preprocessing/clap_embs/crickets.npz
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:b3cbd9d4edfd7304b9822309ef76ac04cf56e26c01d5ab26e19212510ec7e73e
+size 2155
diff --git a/capspeech/nar/data_preprocessing/clap_embs/crow.npz b/capspeech/nar/data_preprocessing/clap_embs/crow.npz
new file mode 100644
index 0000000000000000000000000000000000000000..ed8058739a22b1374b541426b8d1e180ff32868e
--- /dev/null
+++ b/capspeech/nar/data_preprocessing/clap_embs/crow.npz
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:b30f34be3701979144935b8d8c8128084e33dc4435650d6704a75494eaa54e47
+size 2150
diff --git a/capspeech/nar/data_preprocessing/clap_embs/crow_cawing.npz b/capspeech/nar/data_preprocessing/clap_embs/crow_cawing.npz
new file mode 100644
index 0000000000000000000000000000000000000000..8170cecff60a5d45b1e2c20741bf0ab6b25c856b
--- /dev/null
+++ b/capspeech/nar/data_preprocessing/clap_embs/crow_cawing.npz
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:345551f324ac8ecbf20b7c0f7f69a14898628eafe922df9a4b70e424ff8b51c1
+size 2156
diff --git a/capspeech/nar/data_preprocessing/clap_embs/crying_baby.npz b/capspeech/nar/data_preprocessing/clap_embs/crying_baby.npz
new file mode 100644
index 0000000000000000000000000000000000000000..77a851e907dec28ff8dade2df051f45f34656e64
--- /dev/null
+++ b/capspeech/nar/data_preprocessing/clap_embs/crying_baby.npz
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:98190aa9c280256a60fb417c8eb57d748e104216078dad99d56a7264bfb62fbb
+size 2149
diff --git a/capspeech/nar/data_preprocessing/clap_embs/cuckoo_bird_calling.npz b/capspeech/nar/data_preprocessing/clap_embs/cuckoo_bird_calling.npz
new file mode 100644
index 0000000000000000000000000000000000000000..788961345eee0fbcbe1dd9dba8e4eec3108abb53
--- /dev/null
+++ b/capspeech/nar/data_preprocessing/clap_embs/cuckoo_bird_calling.npz
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:5d6458f3f284b0482aa28d78e3beceaa39d84568522640067a68854db67e04d5
+size 2161
diff --git a/capspeech/nar/data_preprocessing/clap_embs/cupboard_opening_or_closing.npz b/capspeech/nar/data_preprocessing/clap_embs/cupboard_opening_or_closing.npz
new file mode 100644
index 0000000000000000000000000000000000000000..1c46311109552b0a8a0962b3fc3fc76fd1fd9a4b
--- /dev/null
+++ b/capspeech/nar/data_preprocessing/clap_embs/cupboard_opening_or_closing.npz
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:3860e0bba6db4c9de7339cbd10be0979792eba865651909ae82dfcb70a11cf8a
+size 2153
diff --git a/capspeech/nar/data_preprocessing/clap_embs/cutting_hair_with_electric_trimmers.npz b/capspeech/nar/data_preprocessing/clap_embs/cutting_hair_with_electric_trimmers.npz
new file mode 100644
index 0000000000000000000000000000000000000000..da99c926d836d774ff5b8fc19534444ab939f617
--- /dev/null
+++ b/capspeech/nar/data_preprocessing/clap_embs/cutting_hair_with_electric_trimmers.npz
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:cd2bcf3be39dcc3c31773aed71e3fee0cbd77a8322f9599a242062b34ea9d5c9
+size 2151
diff --git a/capspeech/nar/data_preprocessing/clap_embs/dinosaurs_bellowing.npz b/capspeech/nar/data_preprocessing/clap_embs/dinosaurs_bellowing.npz
new file mode 100644
index 0000000000000000000000000000000000000000..cf6903aecfde96d6737ab3775091e1992d3126a9
--- /dev/null
+++ b/capspeech/nar/data_preprocessing/clap_embs/dinosaurs_bellowing.npz
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:ac6c0b639e00b89ed3195fc57b3fcff6117958664eb80c5a7c4a213ec76b34b0
+size 2154
diff --git a/capspeech/nar/data_preprocessing/clap_embs/disc_scratching.npz b/capspeech/nar/data_preprocessing/clap_embs/disc_scratching.npz
new file mode 100644
index 0000000000000000000000000000000000000000..e6f494e83e8f3788ece3a78699b5e500bfaba55c
--- /dev/null
+++ b/capspeech/nar/data_preprocessing/clap_embs/disc_scratching.npz
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:adfbcc8df05577b9bd53de4ed3975c1ae8b93049887db6bc827f169a6509da92
+size 2150
diff --git a/capspeech/nar/data_preprocessing/clap_embs/dog.npz b/capspeech/nar/data_preprocessing/clap_embs/dog.npz
new file mode 100644
index 0000000000000000000000000000000000000000..3d19079a4bdf71395e9f33767250b30e8fa1305b
--- /dev/null
+++ b/capspeech/nar/data_preprocessing/clap_embs/dog.npz
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:fa5b595a2a721b524327b64a9ee33cd697326fca5a13bc8410d8a771539be5f0
+size 2155
diff --git a/capspeech/nar/data_preprocessing/clap_embs/dog_barking.npz b/capspeech/nar/data_preprocessing/clap_embs/dog_barking.npz
new file mode 100644
index 0000000000000000000000000000000000000000..99ff0c3f85f8584b75d9fd918f92b114f84d3842
--- /dev/null
+++ b/capspeech/nar/data_preprocessing/clap_embs/dog_barking.npz
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:c002f37788e15701ea9b02828b8aacaa332fd9fb48d8adeb8cb76323cc9a041e
+size 2152
diff --git a/capspeech/nar/data_preprocessing/clap_embs/dog_baying.npz b/capspeech/nar/data_preprocessing/clap_embs/dog_baying.npz
new file mode 100644
index 0000000000000000000000000000000000000000..ee9343e9b93bcad996b4dcc0d2c0a99489e1ea59
--- /dev/null
+++ b/capspeech/nar/data_preprocessing/clap_embs/dog_baying.npz
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:e1836a19d9eb2b467dd18618378648e4731beb136a20f45e3cb9d4c7a4f1b9be
+size 2151
diff --git a/capspeech/nar/data_preprocessing/clap_embs/dog_bow-wow.npz b/capspeech/nar/data_preprocessing/clap_embs/dog_bow-wow.npz
new file mode 100644
index 0000000000000000000000000000000000000000..8c65692c6854a558c8b5778f812bb99c0c35e282
--- /dev/null
+++ b/capspeech/nar/data_preprocessing/clap_embs/dog_bow-wow.npz
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:f03ac3f198bfad811fd03ec2bba521540525e251989ffd6d9ae2875ba15fde2f
+size 2156
diff --git a/capspeech/nar/data_preprocessing/clap_embs/dog_growling.npz b/capspeech/nar/data_preprocessing/clap_embs/dog_growling.npz
new file mode 100644
index 0000000000000000000000000000000000000000..526239b61561fafb6f823e0137bf18a6e5651a9b
--- /dev/null
+++ b/capspeech/nar/data_preprocessing/clap_embs/dog_growling.npz
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:91c97bb22e924794f4e87dc0bc70cc15b8834038776309cc431ac41242cc092e
+size 2150
diff --git a/capspeech/nar/data_preprocessing/clap_embs/dog_howling.npz b/capspeech/nar/data_preprocessing/clap_embs/dog_howling.npz
new file mode 100644
index 0000000000000000000000000000000000000000..580e7b3b74e7112260884416387450d9e6c8ccfe
--- /dev/null
+++ b/capspeech/nar/data_preprocessing/clap_embs/dog_howling.npz
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:f03509198fff105bf58bec0dabf94c9facceb7c9887eace56a40528671ee4bd5
+size 2147
diff --git a/capspeech/nar/data_preprocessing/clap_embs/dog_whimpering.npz b/capspeech/nar/data_preprocessing/clap_embs/dog_whimpering.npz
new file mode 100644
index 0000000000000000000000000000000000000000..9e7a19e2a79e8bce982e3a17f28b23dcb1fb467a
--- /dev/null
+++ b/capspeech/nar/data_preprocessing/clap_embs/dog_whimpering.npz
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:7e2a9318a472a5a39e3f9ad268f4e55b912f7d6d4de7f83632cdd99af12988ae
+size 2152
diff --git a/capspeech/nar/data_preprocessing/clap_embs/donkey,_ass_braying.npz b/capspeech/nar/data_preprocessing/clap_embs/donkey,_ass_braying.npz
new file mode 100644
index 0000000000000000000000000000000000000000..1c2d3f949a7c0b73cef4c8d2ef2f1e21d2c6a9c5
--- /dev/null
+++ b/capspeech/nar/data_preprocessing/clap_embs/donkey,_ass_braying.npz
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:c73e617735257ef3e631c30135c1130768b36ffd003db721811deada97b90be0
+size 2150
diff --git a/capspeech/nar/data_preprocessing/clap_embs/door_slamming.npz b/capspeech/nar/data_preprocessing/clap_embs/door_slamming.npz
new file mode 100644
index 0000000000000000000000000000000000000000..f7c3d7e5553a517c2151db5df11076616dc86990
--- /dev/null
+++ b/capspeech/nar/data_preprocessing/clap_embs/door_slamming.npz
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:a6fb20a75f9bb4bf43faf7688e6cfdef19ef20d8353b71a67682d4a0269d5156
+size 2147
diff --git a/capspeech/nar/data_preprocessing/clap_embs/door_wood_creaks.npz b/capspeech/nar/data_preprocessing/clap_embs/door_wood_creaks.npz
new file mode 100644
index 0000000000000000000000000000000000000000..09104f6c3676914498d196050e596b6832cea711
--- /dev/null
+++ b/capspeech/nar/data_preprocessing/clap_embs/door_wood_creaks.npz
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:afdb1f8e3b8b6fba39ecfe12227eafd6069e27827dee35b458f188d56045112e
+size 2146
diff --git a/capspeech/nar/data_preprocessing/clap_embs/door_wood_knock.npz b/capspeech/nar/data_preprocessing/clap_embs/door_wood_knock.npz
new file mode 100644
index 0000000000000000000000000000000000000000..d0885f449f0a4f9438028029306ab9232e53931f
--- /dev/null
+++ b/capspeech/nar/data_preprocessing/clap_embs/door_wood_knock.npz
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:e93fb885fdf2c6eef0db1c75ab19d9334366dbe8a10c8690a49c09bed2d62ea1
+size 2147
diff --git a/capspeech/nar/data_preprocessing/clap_embs/double_bass.npz b/capspeech/nar/data_preprocessing/clap_embs/double_bass.npz
new file mode 100644
index 0000000000000000000000000000000000000000..d7e1f18051b06560d210e2ab8a9e57ff8e469762
--- /dev/null
+++ b/capspeech/nar/data_preprocessing/clap_embs/double_bass.npz
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:772ab0ebe7d51ea8c65170442ef31cf5a2178436dbf20d9d173d13c70a07bfe4
+size 2149
diff --git a/capspeech/nar/data_preprocessing/clap_embs/drawer_open_or_close.npz b/capspeech/nar/data_preprocessing/clap_embs/drawer_open_or_close.npz
new file mode 100644
index 0000000000000000000000000000000000000000..06a90a3886b659b29d784b1d9121c5d88c5396b7
--- /dev/null
+++ b/capspeech/nar/data_preprocessing/clap_embs/drawer_open_or_close.npz
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:b9afb7dc20ca343c716f53272869250ff7394685e6c321863015df2b59ae310e
+size 2151
diff --git a/capspeech/nar/data_preprocessing/clap_embs/drinking_sipping.npz b/capspeech/nar/data_preprocessing/clap_embs/drinking_sipping.npz
new file mode 100644
index 0000000000000000000000000000000000000000..963a389d5a0767f96e2a17248640ad2b7b1ee92b
--- /dev/null
+++ b/capspeech/nar/data_preprocessing/clap_embs/drinking_sipping.npz
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:27bd4c2ebe340090d3db6b51fc64704e37dbddd1a23a9c8d9eaacf1dec211b4b
+size 2147
diff --git a/capspeech/nar/data_preprocessing/clap_embs/driving_buses.npz b/capspeech/nar/data_preprocessing/clap_embs/driving_buses.npz
new file mode 100644
index 0000000000000000000000000000000000000000..9407e8e98f562e871bf1147e772807ab3dbd2f5b
--- /dev/null
+++ b/capspeech/nar/data_preprocessing/clap_embs/driving_buses.npz
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:a1dac1c8b0292620ec9f7e3f425d27ab3419afe3699c479a6c7b93ebaf1055eb
+size 2158
diff --git a/capspeech/nar/data_preprocessing/clap_embs/driving_motorcycle.npz b/capspeech/nar/data_preprocessing/clap_embs/driving_motorcycle.npz
new file mode 100644
index 0000000000000000000000000000000000000000..608bcf233555b01669eebbf790b06763b0c95025
--- /dev/null
+++ b/capspeech/nar/data_preprocessing/clap_embs/driving_motorcycle.npz
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:52e7951334c8b446c1b468546ae54d6685dbb8ba3b8c17ce844ea1c3e6224aa8
+size 2154
diff --git a/capspeech/nar/data_preprocessing/clap_embs/driving_snowmobile.npz b/capspeech/nar/data_preprocessing/clap_embs/driving_snowmobile.npz
new file mode 100644
index 0000000000000000000000000000000000000000..49169a4a928440a4591fc2ffa0d57ce566cda76d
--- /dev/null
+++ b/capspeech/nar/data_preprocessing/clap_embs/driving_snowmobile.npz
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:c27abb86c6a6721343f38f64ce7e1852e458bb37566728e5699a2e0021089915
+size 2150
diff --git a/capspeech/nar/data_preprocessing/clap_embs/duck_quacking.npz b/capspeech/nar/data_preprocessing/clap_embs/duck_quacking.npz
new file mode 100644
index 0000000000000000000000000000000000000000..192e18a6156bc87e472fd3f3c2af8ba622fd5687
--- /dev/null
+++ b/capspeech/nar/data_preprocessing/clap_embs/duck_quacking.npz
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:5966341682c582f829495093d0b1648e9c6b2545bedbf6b42802f2d7539bbedf
+size 2151
diff --git a/capspeech/nar/data_preprocessing/clap_embs/eagle_screaming.npz b/capspeech/nar/data_preprocessing/clap_embs/eagle_screaming.npz
new file mode 100644
index 0000000000000000000000000000000000000000..0829f0361628f96115253f6d47f34e857230e9de
--- /dev/null
+++ b/capspeech/nar/data_preprocessing/clap_embs/eagle_screaming.npz
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:b29c24e904d66343df76d5a5f9a88e381c3c1f809fe9862e2d51bb99d00f28d5
+size 2157
diff --git a/capspeech/nar/data_preprocessing/clap_embs/eating_with_cutlery.npz b/capspeech/nar/data_preprocessing/clap_embs/eating_with_cutlery.npz
new file mode 100644
index 0000000000000000000000000000000000000000..f94175bad51b4dab9ac58f4004a66fb92448b457
--- /dev/null
+++ b/capspeech/nar/data_preprocessing/clap_embs/eating_with_cutlery.npz
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:f01eaf3e92d3d7a4a5970d93cf92141be28c3f2da3779fbd3886bc8d1822e521
+size 2158
diff --git a/capspeech/nar/data_preprocessing/clap_embs/electric_grinder_grinding.npz b/capspeech/nar/data_preprocessing/clap_embs/electric_grinder_grinding.npz
new file mode 100644
index 0000000000000000000000000000000000000000..4a9cf4486b113bdad37a481cd105cd64e6f47052
--- /dev/null
+++ b/capspeech/nar/data_preprocessing/clap_embs/electric_grinder_grinding.npz
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:6848f4a5a8da5c237bd94d278ca56007f9ed0b69fdef778169d0c2d3dd98d705
+size 2154
diff --git a/capspeech/nar/data_preprocessing/clap_embs/electric_piano.npz b/capspeech/nar/data_preprocessing/clap_embs/electric_piano.npz
new file mode 100644
index 0000000000000000000000000000000000000000..9a893bbbc2b644d5e7cd62812bd6b0c705a96977
--- /dev/null
+++ b/capspeech/nar/data_preprocessing/clap_embs/electric_piano.npz
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:524f197b711e809b39f1f63f7d2175edaf6753103040d9341b8954494304e34e
+size 2145
diff --git a/capspeech/nar/data_preprocessing/clap_embs/electric_shaver,_electric_razor_shaving.npz b/capspeech/nar/data_preprocessing/clap_embs/electric_shaver,_electric_razor_shaving.npz
new file mode 100644
index 0000000000000000000000000000000000000000..e683123e3061bb1ff590dfad77142648c669eec6
--- /dev/null
+++ b/capspeech/nar/data_preprocessing/clap_embs/electric_shaver,_electric_razor_shaving.npz
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:82e6393784802c031b8164129b5439077678e8b94df2d06273ad491c4da0f89c
+size 2148
diff --git a/capspeech/nar/data_preprocessing/clap_embs/elephant_trumpeting.npz b/capspeech/nar/data_preprocessing/clap_embs/elephant_trumpeting.npz
new file mode 100644
index 0000000000000000000000000000000000000000..3dac0919ddad8adf1c73c1ad389e8c731a1eb4aa
--- /dev/null
+++ b/capspeech/nar/data_preprocessing/clap_embs/elephant_trumpeting.npz
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:f5f81e4f4311adcff15de0d79d4ca43f17a3d43dd59ccd26b884cd2dcc79f628
+size 2149
diff --git a/capspeech/nar/data_preprocessing/clap_embs/eletric_blender_running.npz b/capspeech/nar/data_preprocessing/clap_embs/eletric_blender_running.npz
new file mode 100644
index 0000000000000000000000000000000000000000..6b83b953429a5ea7a4b6307554b8a78a9c2fdc60
--- /dev/null
+++ b/capspeech/nar/data_preprocessing/clap_embs/eletric_blender_running.npz
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:b4d42db36d3d96a467dca97302a4434d08156c733cfc9a4cdb531d0303939c9a
+size 2147
diff --git a/capspeech/nar/data_preprocessing/clap_embs/elk_bugling.npz b/capspeech/nar/data_preprocessing/clap_embs/elk_bugling.npz
new file mode 100644
index 0000000000000000000000000000000000000000..7055c8bb65d7494a7fd052585144752412b9d505
--- /dev/null
+++ b/capspeech/nar/data_preprocessing/clap_embs/elk_bugling.npz
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:6d66de5245ea766a5e68cecaae18a69641ece42ea4dd87d85eb7d6ce9ef16bb1
+size 2142
diff --git a/capspeech/nar/data_preprocessing/clap_embs/engine.npz b/capspeech/nar/data_preprocessing/clap_embs/engine.npz
new file mode 100644
index 0000000000000000000000000000000000000000..275604a7b21319b669c99656159fcb96e5081582
--- /dev/null
+++ b/capspeech/nar/data_preprocessing/clap_embs/engine.npz
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:c4ef31fdce7abdf9185076e6db3e02f9bd8191096137b6c75bd7cbee4f9a94f4
+size 2152
diff --git a/capspeech/nar/data_preprocessing/clap_embs/engine_accelerating,_revving,_vroom.npz b/capspeech/nar/data_preprocessing/clap_embs/engine_accelerating,_revving,_vroom.npz
new file mode 100644
index 0000000000000000000000000000000000000000..5e5ada03297ae997977bd3e7363ccc4eafd68adb
--- /dev/null
+++ b/capspeech/nar/data_preprocessing/clap_embs/engine_accelerating,_revving,_vroom.npz
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:e3565a606a3c7dbb6c92346550799e7ea93e6af0be4f0d53989c778d27c4254f
+size 2154
diff --git a/capspeech/nar/data_preprocessing/clap_embs/extending_ladders.npz b/capspeech/nar/data_preprocessing/clap_embs/extending_ladders.npz
new file mode 100644
index 0000000000000000000000000000000000000000..8a1c14b24882e27d3de37cd81b397d298fcb65d0
--- /dev/null
+++ b/capspeech/nar/data_preprocessing/clap_embs/extending_ladders.npz
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:05ce8f8558f26915bb8873f3a3b7824ed3992b69a495f97fbbe465bb1a088232
+size 2143
diff --git a/capspeech/nar/data_preprocessing/clap_embs/fart.npz b/capspeech/nar/data_preprocessing/clap_embs/fart.npz
new file mode 100644
index 0000000000000000000000000000000000000000..bf5e0e899963d07c4d13dde78066e6143ac07709
--- /dev/null
+++ b/capspeech/nar/data_preprocessing/clap_embs/fart.npz
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:feb3db26bdc802680f6777367d7ef5cd4924d2718db44e6d787839a2a527256c
+size 2158
diff --git a/capspeech/nar/data_preprocessing/clap_embs/female_singing.npz b/capspeech/nar/data_preprocessing/clap_embs/female_singing.npz
new file mode 100644
index 0000000000000000000000000000000000000000..732def18ded90f767c860c35aedc7f76d15f7698
--- /dev/null
+++ b/capspeech/nar/data_preprocessing/clap_embs/female_singing.npz
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:36dea3229614b5dfdf2f84b52b9bbcc19c388f2143f06c39f66c215bab80522c
+size 2140
diff --git a/capspeech/nar/data_preprocessing/clap_embs/ferret_dooking.npz b/capspeech/nar/data_preprocessing/clap_embs/ferret_dooking.npz
new file mode 100644
index 0000000000000000000000000000000000000000..b68d4ac20eacc71a73979c81b80c85ffcc1a5ddd
--- /dev/null
+++ b/capspeech/nar/data_preprocessing/clap_embs/ferret_dooking.npz
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:ebf3229c414682a1b5e5885e3b2c4e37957347e2ef7de50ffd3cd6f0d7cecffd
+size 2152
diff --git a/capspeech/nar/data_preprocessing/clap_embs/finger_snapping.npz b/capspeech/nar/data_preprocessing/clap_embs/finger_snapping.npz
new file mode 100644
index 0000000000000000000000000000000000000000..19dea876282fc1de744660c42815cb900ea11134
--- /dev/null
+++ b/capspeech/nar/data_preprocessing/clap_embs/finger_snapping.npz
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:2a5d4fc86f4c0764e7b53f2e9f4e206e911ec6c9f8b8ce4fe4703935e0c131f1
+size 2154
diff --git a/capspeech/nar/data_preprocessing/clap_embs/fire_crackling.npz b/capspeech/nar/data_preprocessing/clap_embs/fire_crackling.npz
new file mode 100644
index 0000000000000000000000000000000000000000..57866f727d4230926579dcb030a2f7d28133f278
--- /dev/null
+++ b/capspeech/nar/data_preprocessing/clap_embs/fire_crackling.npz
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:60e4a0d7c967dd129465fc606e5b5fc6e1e6e310b6c36bc533d7393d07c979fb
+size 2155
diff --git a/capspeech/nar/data_preprocessing/clap_embs/fire_truck_siren.npz b/capspeech/nar/data_preprocessing/clap_embs/fire_truck_siren.npz
new file mode 100644
index 0000000000000000000000000000000000000000..df60b9b3fbef3f540704fd0024d526022604f4ed
--- /dev/null
+++ b/capspeech/nar/data_preprocessing/clap_embs/fire_truck_siren.npz
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:4dc6b34144ca3eaff7c31472ceef32710ad01b12b8a53c2c02db1c22d69b391a
+size 2157
diff --git a/capspeech/nar/data_preprocessing/clap_embs/fireworks.npz b/capspeech/nar/data_preprocessing/clap_embs/fireworks.npz
new file mode 100644
index 0000000000000000000000000000000000000000..3021fdebb4949270d5c81c7789bac7dc13bc2a83
--- /dev/null
+++ b/capspeech/nar/data_preprocessing/clap_embs/fireworks.npz
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:5e4ca6c2c62532e06879c51912aff7acec03a4f5c44bc126df04451523b1e11a
+size 2151
diff --git a/capspeech/nar/data_preprocessing/clap_embs/fireworks_banging.npz b/capspeech/nar/data_preprocessing/clap_embs/fireworks_banging.npz
new file mode 100644
index 0000000000000000000000000000000000000000..152bca72d695d7ca90d996d5bc9492952feff0a1
--- /dev/null
+++ b/capspeech/nar/data_preprocessing/clap_embs/fireworks_banging.npz
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:c25653deefc7ad3daa3f1d66a63156b656423eebdb8e3f4d24bfb8037d6081ef
+size 2150
diff --git a/capspeech/nar/data_preprocessing/clap_embs/firing_cannon.npz b/capspeech/nar/data_preprocessing/clap_embs/firing_cannon.npz
new file mode 100644
index 0000000000000000000000000000000000000000..757d245d686100cd6efd4d05a9e168cfad2edf0b
--- /dev/null
+++ b/capspeech/nar/data_preprocessing/clap_embs/firing_cannon.npz
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:78de6356e9685ef4ac5a05ae67498760201e75bb6e8671570ba1ea3723e897f8
+size 2147
diff --git a/capspeech/nar/data_preprocessing/clap_embs/firing_muskets.npz b/capspeech/nar/data_preprocessing/clap_embs/firing_muskets.npz
new file mode 100644
index 0000000000000000000000000000000000000000..072371a13ddc34af398b28042f83595778035d2d
--- /dev/null
+++ b/capspeech/nar/data_preprocessing/clap_embs/firing_muskets.npz
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:b8ec4196aeba2b1721abd3693650a7e6e07e52e904149a7572a33ec8a77f0d52
+size 2149
diff --git a/capspeech/nar/data_preprocessing/clap_embs/flute.npz b/capspeech/nar/data_preprocessing/clap_embs/flute.npz
new file mode 100644
index 0000000000000000000000000000000000000000..21564a013f1af5e110f1794c84e8a9d8d5d9f9a1
--- /dev/null
+++ b/capspeech/nar/data_preprocessing/clap_embs/flute.npz
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:2bc72d81dce6d50a7684e6cf7220a9c892841270fb0e83cd039f3b6ac6eee21f
+size 2140
diff --git a/capspeech/nar/data_preprocessing/clap_embs/fly,_housefly_buzzing.npz b/capspeech/nar/data_preprocessing/clap_embs/fly,_housefly_buzzing.npz
new file mode 100644
index 0000000000000000000000000000000000000000..70f6b8889a4a71266f6eb2ab29035502d6b13346
--- /dev/null
+++ b/capspeech/nar/data_preprocessing/clap_embs/fly,_housefly_buzzing.npz
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:1e80288019426173641a608ea92b13d7cbf73fe64c9680ede60da8537ae4f65c
+size 2154
diff --git a/capspeech/nar/data_preprocessing/clap_embs/foghorn.npz b/capspeech/nar/data_preprocessing/clap_embs/foghorn.npz
new file mode 100644
index 0000000000000000000000000000000000000000..c8d0646d78c5c97ab9c03b5d9b5daf67a2a8b763
--- /dev/null
+++ b/capspeech/nar/data_preprocessing/clap_embs/foghorn.npz
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:cadfc5daac2b75f87ef0d2573d65c43ad33e8967e10cc0a43b793ad910f57bff
+size 2157
diff --git a/capspeech/nar/data_preprocessing/clap_embs/footsteps.npz b/capspeech/nar/data_preprocessing/clap_embs/footsteps.npz
new file mode 100644
index 0000000000000000000000000000000000000000..8c0324ca08ae74fb8e2205a2b2dd9684798aace9
--- /dev/null
+++ b/capspeech/nar/data_preprocessing/clap_embs/footsteps.npz
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:29c39a69fbc1dd632a98ed5bc798811f75fb8d5255dc9ccc58dbcf8dc22fbc47
+size 2153
diff --git a/capspeech/nar/data_preprocessing/clap_embs/footsteps_on_snow.npz b/capspeech/nar/data_preprocessing/clap_embs/footsteps_on_snow.npz
new file mode 100644
index 0000000000000000000000000000000000000000..c8df990e38ba3b7579bd242bad265eaee391a9dd
--- /dev/null
+++ b/capspeech/nar/data_preprocessing/clap_embs/footsteps_on_snow.npz
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:e2e616807b377a9e6485c7ed07462c50b733df77faae0a77888245e14d42e89a
+size 2153
diff --git a/capspeech/nar/data_preprocessing/clap_embs/forging_swords.npz b/capspeech/nar/data_preprocessing/clap_embs/forging_swords.npz
new file mode 100644
index 0000000000000000000000000000000000000000..4f87148d67ba4827f8f1c2d9aec222fba6c2147b
--- /dev/null
+++ b/capspeech/nar/data_preprocessing/clap_embs/forging_swords.npz
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:683faac9de89fc18353978414ed95ea4d0a46e282164a0770e172fd0ea21a403
+size 2147
diff --git a/capspeech/nar/data_preprocessing/clap_embs/fox_barking.npz b/capspeech/nar/data_preprocessing/clap_embs/fox_barking.npz
new file mode 100644
index 0000000000000000000000000000000000000000..eb49a81adf401bd643cfe5987313cd96b92224f7
--- /dev/null
+++ b/capspeech/nar/data_preprocessing/clap_embs/fox_barking.npz
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:49d1bd9107cc4e93c149ecb911243b4edd90db440051aeacc52b8bdc5f41a1f5
+size 2151
diff --git a/capspeech/nar/data_preprocessing/clap_embs/francolin_calling.npz b/capspeech/nar/data_preprocessing/clap_embs/francolin_calling.npz
new file mode 100644
index 0000000000000000000000000000000000000000..5712910a81ed74be5873a11c8c77667e9bdf7e6c
--- /dev/null
+++ b/capspeech/nar/data_preprocessing/clap_embs/francolin_calling.npz
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:5e9592e40ebdab216fc72ef79814766383e2be37792f6b553a8897e9a5053e75
+size 2147
diff --git a/capspeech/nar/data_preprocessing/clap_embs/frog.npz b/capspeech/nar/data_preprocessing/clap_embs/frog.npz
new file mode 100644
index 0000000000000000000000000000000000000000..6c47b27de3ea155a57512b66aa2872c7307e475b
--- /dev/null
+++ b/capspeech/nar/data_preprocessing/clap_embs/frog.npz
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:868c51013f8b491e38d70402c87be6c73da8338e058694b479a8ec19bb1b8f32
+size 2149
diff --git a/capspeech/nar/data_preprocessing/clap_embs/frog_croaking.npz b/capspeech/nar/data_preprocessing/clap_embs/frog_croaking.npz
new file mode 100644
index 0000000000000000000000000000000000000000..24d945f773010121615d4d1858134e2933e7397a
--- /dev/null
+++ b/capspeech/nar/data_preprocessing/clap_embs/frog_croaking.npz
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:082b89f5be4dbe38899335dbeb7e8096bdbd5d07684570a67e644c2d7e345189
+size 2148
diff --git a/capspeech/nar/data_preprocessing/clap_embs/gibbon_howling.npz b/capspeech/nar/data_preprocessing/clap_embs/gibbon_howling.npz
new file mode 100644
index 0000000000000000000000000000000000000000..f305f77aaab03064e64673ebeb394ed0b914641c
--- /dev/null
+++ b/capspeech/nar/data_preprocessing/clap_embs/gibbon_howling.npz
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:118a88542cb9fbdbad20ec453db3256b0ff390ed821e55571ea2466d92aa8d0a
+size 2160
diff --git a/capspeech/nar/data_preprocessing/clap_embs/glass_breaking.npz b/capspeech/nar/data_preprocessing/clap_embs/glass_breaking.npz
new file mode 100644
index 0000000000000000000000000000000000000000..f2357997de8fb583eb3bc03d805e0815cfb4aedb
--- /dev/null
+++ b/capspeech/nar/data_preprocessing/clap_embs/glass_breaking.npz
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:43cbfc9ba79195dc710217c49cc37ea1b02de7c8f1522b15cdd70b7a32fa37c8
+size 2146
diff --git a/capspeech/nar/data_preprocessing/clap_embs/glockenspiel.npz b/capspeech/nar/data_preprocessing/clap_embs/glockenspiel.npz
new file mode 100644
index 0000000000000000000000000000000000000000..c933345895d13b6cc5c31bd0315cc100a7bade91
--- /dev/null
+++ b/capspeech/nar/data_preprocessing/clap_embs/glockenspiel.npz
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:b909e1fe01948c04a5dbe9113c8ba8283f7db181fd0c850b0047742b08f0e529
+size 2145
diff --git a/capspeech/nar/data_preprocessing/clap_embs/goat_bleating.npz b/capspeech/nar/data_preprocessing/clap_embs/goat_bleating.npz
new file mode 100644
index 0000000000000000000000000000000000000000..27b6edf67c026a7205691cddecfbb5d21b4f6ae4
--- /dev/null
+++ b/capspeech/nar/data_preprocessing/clap_embs/goat_bleating.npz
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:7a9a32b4ef09659fe480ad9e94379afccf4f1fe2ac924772a9f6a369a5b27324
+size 2147
diff --git a/capspeech/nar/data_preprocessing/clap_embs/golf_driving.npz b/capspeech/nar/data_preprocessing/clap_embs/golf_driving.npz
new file mode 100644
index 0000000000000000000000000000000000000000..256a01271b4632f5363605d21942c969fe93caa0
--- /dev/null
+++ b/capspeech/nar/data_preprocessing/clap_embs/golf_driving.npz
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:8bdfec8c9852ecba962623f958cc0a469a9274ecf86bd2f6f4c4d3b49d5196d3
+size 2150
diff --git a/capspeech/nar/data_preprocessing/clap_embs/gong.npz b/capspeech/nar/data_preprocessing/clap_embs/gong.npz
new file mode 100644
index 0000000000000000000000000000000000000000..108246642abfdec3c9958d59aa70202eb4168a9c
--- /dev/null
+++ b/capspeech/nar/data_preprocessing/clap_embs/gong.npz
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:71089c126f36f23fd24412bc18dd27b1ca31ad48b698f6211de1d9475c30d057
+size 2152
diff --git a/capspeech/nar/data_preprocessing/clap_embs/goose_honking.npz b/capspeech/nar/data_preprocessing/clap_embs/goose_honking.npz
new file mode 100644
index 0000000000000000000000000000000000000000..28558c8faced5c6717306c5cd42f82a13f4363a5
--- /dev/null
+++ b/capspeech/nar/data_preprocessing/clap_embs/goose_honking.npz
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:2883569525ca3066c14b784201ff5784237c226849eb23cc0279bc3e06541eb4
+size 2156
diff --git a/capspeech/nar/data_preprocessing/clap_embs/gunshot_or_gunfire.npz b/capspeech/nar/data_preprocessing/clap_embs/gunshot_or_gunfire.npz
new file mode 100644
index 0000000000000000000000000000000000000000..72239f8a51f5642b3f26f8f291f240efd9b523f7
--- /dev/null
+++ b/capspeech/nar/data_preprocessing/clap_embs/gunshot_or_gunfire.npz
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:2ccde48e18e755f7c8dbf36e5520d3830727e3bcdc9d5e3d34169a404712ebdf
+size 2153
diff --git a/capspeech/nar/data_preprocessing/clap_embs/hail.npz b/capspeech/nar/data_preprocessing/clap_embs/hail.npz
new file mode 100644
index 0000000000000000000000000000000000000000..970ca6cc92909d23539b2eea0233fe41dea3d2b8
--- /dev/null
+++ b/capspeech/nar/data_preprocessing/clap_embs/hail.npz
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:933b6bc5d9a55f7bfb7712ff6f88ff04ffad5169a770c135de63aa88ae6671e2
+size 2144
diff --git a/capspeech/nar/data_preprocessing/clap_embs/hair_dryer_drying.npz b/capspeech/nar/data_preprocessing/clap_embs/hair_dryer_drying.npz
new file mode 100644
index 0000000000000000000000000000000000000000..2805a40c5d2332e58e6ba59ac6a2915e52b85ce9
--- /dev/null
+++ b/capspeech/nar/data_preprocessing/clap_embs/hair_dryer_drying.npz
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:06424100420806b160d88f23a0e2213fe7cbd0cf1657f1581e720f30e34cda24
+size 2155
diff --git a/capspeech/nar/data_preprocessing/clap_embs/hammering_nails.npz b/capspeech/nar/data_preprocessing/clap_embs/hammering_nails.npz
new file mode 100644
index 0000000000000000000000000000000000000000..6fdb16b4e997481eb5707b29653acbdd6b259099
--- /dev/null
+++ b/capspeech/nar/data_preprocessing/clap_embs/hammering_nails.npz
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:5c89766e645196bd2d3fb6e188b20ca8d20480e846642e332225bb2fbd89b118
+size 2145
diff --git a/capspeech/nar/data_preprocessing/clap_embs/hand_saw.npz b/capspeech/nar/data_preprocessing/clap_embs/hand_saw.npz
new file mode 100644
index 0000000000000000000000000000000000000000..92fa48b312719eabdfcd1503bb74ed4414405bd5
--- /dev/null
+++ b/capspeech/nar/data_preprocessing/clap_embs/hand_saw.npz
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:a933e9fa73514c82f64d5c9cc5941ecea8e2063fee0b6c5bc55fa496f2c2a8f6
+size 2146
diff --git a/capspeech/nar/data_preprocessing/clap_embs/harmonica.npz b/capspeech/nar/data_preprocessing/clap_embs/harmonica.npz
new file mode 100644
index 0000000000000000000000000000000000000000..7d528b72a6bc319c406a6da9c6077d911c8496b9
--- /dev/null
+++ b/capspeech/nar/data_preprocessing/clap_embs/harmonica.npz
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:d56ce0debe24e5eba3f526f85712434cf4a6a961a6f38be9d7559561e96696c3
+size 2151
diff --git a/capspeech/nar/data_preprocessing/clap_embs/heart_sounds,_heartbeat.npz b/capspeech/nar/data_preprocessing/clap_embs/heart_sounds,_heartbeat.npz
new file mode 100644
index 0000000000000000000000000000000000000000..5bf802bd584cb12fc2062405df1612d1108b07a7
--- /dev/null
+++ b/capspeech/nar/data_preprocessing/clap_embs/heart_sounds,_heartbeat.npz
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:97939a2114084d4497e0bb9644c4189b06cee7cbf737a4a1ceb687dd65f79ba1
+size 2151
diff --git a/capspeech/nar/data_preprocessing/clap_embs/hedge_trimmer_running.npz b/capspeech/nar/data_preprocessing/clap_embs/hedge_trimmer_running.npz
new file mode 100644
index 0000000000000000000000000000000000000000..4050d372ca7b01d13248ab33ba211dfb8aaaa884
--- /dev/null
+++ b/capspeech/nar/data_preprocessing/clap_embs/hedge_trimmer_running.npz
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:b42acb1edaa2c8eeecc86f36cee08993838912f80f429d7982059a18e984908a
+size 2148
diff --git a/capspeech/nar/data_preprocessing/clap_embs/helicopter.npz b/capspeech/nar/data_preprocessing/clap_embs/helicopter.npz
new file mode 100644
index 0000000000000000000000000000000000000000..2dda5a6adde0e59ea848f6980d9aacbc3a3410d9
--- /dev/null
+++ b/capspeech/nar/data_preprocessing/clap_embs/helicopter.npz
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:1ca0752920daae32bed92de7bdb3529cb1e46f20aa4347aa3af6d369de0018ab
+size 2146
diff --git a/capspeech/nar/data_preprocessing/clap_embs/hen.npz b/capspeech/nar/data_preprocessing/clap_embs/hen.npz
new file mode 100644
index 0000000000000000000000000000000000000000..54430d589e54da68aff2b856e812e712c963e5f2
--- /dev/null
+++ b/capspeech/nar/data_preprocessing/clap_embs/hen.npz
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:cfd1dcb74f9027205771512a97034b8e15cd021b5caaa381d61c14677628b543
+size 2151
diff --git a/capspeech/nar/data_preprocessing/clap_embs/hi-hat.npz b/capspeech/nar/data_preprocessing/clap_embs/hi-hat.npz
new file mode 100644
index 0000000000000000000000000000000000000000..33f6c72bd896e78d64ba15f7564b093deb7c69d5
--- /dev/null
+++ b/capspeech/nar/data_preprocessing/clap_embs/hi-hat.npz
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:f54c2fea2d8f2b027c2bc984dceab10f40cd5baf4b54dc9f91c05178b4388a36
+size 2155
diff --git a/capspeech/nar/data_preprocessing/clap_embs/horse_clip-clop.npz b/capspeech/nar/data_preprocessing/clap_embs/horse_clip-clop.npz
new file mode 100644
index 0000000000000000000000000000000000000000..d9cf71f7fcd6010656a5c8f17b72f958b89f542a
--- /dev/null
+++ b/capspeech/nar/data_preprocessing/clap_embs/horse_clip-clop.npz
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:a43235dc5b0fad6d37b92fe9a94c4dde983fe81385912172a00e72bd30e39c27
+size 2156
diff --git a/capspeech/nar/data_preprocessing/clap_embs/horse_neighing.npz b/capspeech/nar/data_preprocessing/clap_embs/horse_neighing.npz
new file mode 100644
index 0000000000000000000000000000000000000000..8eb7a31c6d8c93e8532a1858bd9b4c03805de605
--- /dev/null
+++ b/capspeech/nar/data_preprocessing/clap_embs/horse_neighing.npz
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:dab60aa984ce6db2ce78f2be66946d60b94450171bfda0c68e32eb6ea4dfebb5
+size 2155
diff --git a/capspeech/nar/data_preprocessing/clap_embs/ice_cracking.npz b/capspeech/nar/data_preprocessing/clap_embs/ice_cracking.npz
new file mode 100644
index 0000000000000000000000000000000000000000..aafebfe3ff1718e702ff5e00fbbdb796918ff7d5
--- /dev/null
+++ b/capspeech/nar/data_preprocessing/clap_embs/ice_cracking.npz
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:928ef52a0dd575ecfb5a194b8910926f0cb1ca52f9bc549c14a09a0f1ffb9815
+size 2159
diff --git a/capspeech/nar/data_preprocessing/clap_embs/ice_cream_truck,_ice_cream_van.npz b/capspeech/nar/data_preprocessing/clap_embs/ice_cream_truck,_ice_cream_van.npz
new file mode 100644
index 0000000000000000000000000000000000000000..b8e7845d353243196e191c086ff0c85058a39cfd
--- /dev/null
+++ b/capspeech/nar/data_preprocessing/clap_embs/ice_cream_truck,_ice_cream_van.npz
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:5e90f4891fe9837047c1a2d5f99b5dc09358e722268240c904ea885906a12ac7
+size 2147
diff --git a/capspeech/nar/data_preprocessing/clap_embs/insects.npz b/capspeech/nar/data_preprocessing/clap_embs/insects.npz
new file mode 100644
index 0000000000000000000000000000000000000000..ea12ee5533ed5686ddfd2db7d70f6367afa25fde
--- /dev/null
+++ b/capspeech/nar/data_preprocessing/clap_embs/insects.npz
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:62af4d2521ea5d0be5cf6b77a0a19cdef702b409949f8b0274122a354cb189ff
+size 2153
diff --git a/capspeech/nar/data_preprocessing/clap_embs/keyboard_typing.npz b/capspeech/nar/data_preprocessing/clap_embs/keyboard_typing.npz
new file mode 100644
index 0000000000000000000000000000000000000000..56908c39063d9e4391c5708abc9890fcac5e72e0
--- /dev/null
+++ b/capspeech/nar/data_preprocessing/clap_embs/keyboard_typing.npz
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:39e3fa875301eea7129fba5aa4a9604f3db43ce6b56a038501e94959964673a4
+size 2146
diff --git a/capspeech/nar/data_preprocessing/clap_embs/keys_jangling.npz b/capspeech/nar/data_preprocessing/clap_embs/keys_jangling.npz
new file mode 100644
index 0000000000000000000000000000000000000000..9a895c77c0923d4fb2e0dbfb876aca2ba56ddcdf
--- /dev/null
+++ b/capspeech/nar/data_preprocessing/clap_embs/keys_jangling.npz
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:311ec6d578a4a5f914072bddefbd84a66382a1f86cd4b034435d8d2e8bd96510
+size 2151
diff --git a/capspeech/nar/data_preprocessing/clap_embs/knock.npz b/capspeech/nar/data_preprocessing/clap_embs/knock.npz
new file mode 100644
index 0000000000000000000000000000000000000000..79fe4536bcd39243211ffd620c689e862f8885bb
--- /dev/null
+++ b/capspeech/nar/data_preprocessing/clap_embs/knock.npz
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:a74a13725b34816585832eb63886c0848544fcd60543cd37dc0e8d948bedba89
+size 2152
diff --git a/capspeech/nar/data_preprocessing/clap_embs/lathe_spinning.npz b/capspeech/nar/data_preprocessing/clap_embs/lathe_spinning.npz
new file mode 100644
index 0000000000000000000000000000000000000000..6af50d6190acdf374e76fb459427def585245042
--- /dev/null
+++ b/capspeech/nar/data_preprocessing/clap_embs/lathe_spinning.npz
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:2e30dcc4f7c25997f4be3a0722a86049088bd123ed2138717d8b546d000f4870
+size 2159
diff --git a/capspeech/nar/data_preprocessing/clap_embs/laughing.npz b/capspeech/nar/data_preprocessing/clap_embs/laughing.npz
new file mode 100644
index 0000000000000000000000000000000000000000..9ecebb12b704b05e1ae445f257a6a61bf4b3fa97
--- /dev/null
+++ b/capspeech/nar/data_preprocessing/clap_embs/laughing.npz
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:2642f2f8042bee2ceb628cd819167cd060e60be6ef39b3cbe86cd0f35547b89a
+size 2144
diff --git a/capspeech/nar/data_preprocessing/clap_embs/laughter.npz b/capspeech/nar/data_preprocessing/clap_embs/laughter.npz
new file mode 100644
index 0000000000000000000000000000000000000000..b68d2e5ae429e79bf9a2bd17afcf863101b7c7c7
--- /dev/null
+++ b/capspeech/nar/data_preprocessing/clap_embs/laughter.npz
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:0eff327732d49f7583ebac13e36c5d57685bcdbbb660292114087ce788f76236
+size 2152
diff --git a/capspeech/nar/data_preprocessing/clap_embs/lawn_mowing.npz b/capspeech/nar/data_preprocessing/clap_embs/lawn_mowing.npz
new file mode 100644
index 0000000000000000000000000000000000000000..820453c300a59309b4ac7159b8c8bbdce6768572
--- /dev/null
+++ b/capspeech/nar/data_preprocessing/clap_embs/lawn_mowing.npz
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:94744df312dce6e544444f7e4a46c9b44bf9c823f112f98910c9d60e3f17ab93
+size 2154
diff --git a/capspeech/nar/data_preprocessing/clap_embs/lighting_firecrackers.npz b/capspeech/nar/data_preprocessing/clap_embs/lighting_firecrackers.npz
new file mode 100644
index 0000000000000000000000000000000000000000..7370072867cb9f58597ecbbc66e74acc62a4223a
--- /dev/null
+++ b/capspeech/nar/data_preprocessing/clap_embs/lighting_firecrackers.npz
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:9b173ec36e3662dad053273e1206d81763cc8e3449b948d66fd02b3356aaa0d8
+size 2149
diff --git a/capspeech/nar/data_preprocessing/clap_embs/lions_growling.npz b/capspeech/nar/data_preprocessing/clap_embs/lions_growling.npz
new file mode 100644
index 0000000000000000000000000000000000000000..c705fa5fc7c8b88b87d23cf87629b767352bd50b
--- /dev/null
+++ b/capspeech/nar/data_preprocessing/clap_embs/lions_growling.npz
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:227ae250f7ae8002a68f444a88244c013e4212b8c4dc8453ce4fd6ce5cce9946
+size 2150
diff --git a/capspeech/nar/data_preprocessing/clap_embs/lions_roaring.npz b/capspeech/nar/data_preprocessing/clap_embs/lions_roaring.npz
new file mode 100644
index 0000000000000000000000000000000000000000..4b2eae3bfac235f265d4f232f2b189c0d55927c9
--- /dev/null
+++ b/capspeech/nar/data_preprocessing/clap_embs/lions_roaring.npz
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:31debb9f8edc4bf3213d95e020d6c669784e263b24483cc7abe7a01c67a98497
+size 2158
diff --git a/capspeech/nar/data_preprocessing/clap_embs/lip_smacking.npz b/capspeech/nar/data_preprocessing/clap_embs/lip_smacking.npz
new file mode 100644
index 0000000000000000000000000000000000000000..b856b278149dad8cfbc10246048cb3dfc27c6981
--- /dev/null
+++ b/capspeech/nar/data_preprocessing/clap_embs/lip_smacking.npz
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:daa05c40e4bde9cf3c92c1f349f1fe151635f114db10cb042409f383419e235e
+size 2148
diff --git a/capspeech/nar/data_preprocessing/clap_embs/machine_gun_shooting.npz b/capspeech/nar/data_preprocessing/clap_embs/machine_gun_shooting.npz
new file mode 100644
index 0000000000000000000000000000000000000000..6b95290bc91d8214612a6452a225187ec516f576
--- /dev/null
+++ b/capspeech/nar/data_preprocessing/clap_embs/machine_gun_shooting.npz
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:b500cf31021c8f61c9dc3493a537c8f3ee6025b5db5a21a109425fea5d3fa64b
+size 2152
diff --git a/capspeech/nar/data_preprocessing/clap_embs/magpie_calling.npz b/capspeech/nar/data_preprocessing/clap_embs/magpie_calling.npz
new file mode 100644
index 0000000000000000000000000000000000000000..02f2ecbc803ca079f54859822f818a9077723340
--- /dev/null
+++ b/capspeech/nar/data_preprocessing/clap_embs/magpie_calling.npz
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:1b89f4cdc2997e8abcdd5f640ce8cdbf24e7ba3ab356b8ce092f45fa5d40ab55
+size 2154
diff --git a/capspeech/nar/data_preprocessing/clap_embs/male_singing.npz b/capspeech/nar/data_preprocessing/clap_embs/male_singing.npz
new file mode 100644
index 0000000000000000000000000000000000000000..57a9382c0a83f58f2b31e9e10dae3a3a344da0b5
--- /dev/null
+++ b/capspeech/nar/data_preprocessing/clap_embs/male_singing.npz
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:b43ef9d9856fe0d51a6a20be667fe1bbadc69ef2386b698ccf7938cc4b875c81
+size 2156
diff --git a/capspeech/nar/data_preprocessing/clap_embs/meow.npz b/capspeech/nar/data_preprocessing/clap_embs/meow.npz
new file mode 100644
index 0000000000000000000000000000000000000000..7dc99e4756be7ef1b2a8748679f866624403e105
--- /dev/null
+++ b/capspeech/nar/data_preprocessing/clap_embs/meow.npz
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:533d3023f536f496e006bca819216689de3ac8af9ef3b2e250502f8375d9d20f
+size 2153
diff --git a/capspeech/nar/data_preprocessing/clap_embs/metronome.npz b/capspeech/nar/data_preprocessing/clap_embs/metronome.npz
new file mode 100644
index 0000000000000000000000000000000000000000..3cd2e7a14fc7bee470e09aca00478589c6d3f34e
--- /dev/null
+++ b/capspeech/nar/data_preprocessing/clap_embs/metronome.npz
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:88a65c7f43674c5932d0a92494022c4e068331b98e2e48df231f7fda267fa679
+size 2150
diff --git a/capspeech/nar/data_preprocessing/clap_embs/microwave_oven.npz b/capspeech/nar/data_preprocessing/clap_embs/microwave_oven.npz
new file mode 100644
index 0000000000000000000000000000000000000000..ab3b335d3bac017fe2dd3a2d4dd99f9c2e91a984
--- /dev/null
+++ b/capspeech/nar/data_preprocessing/clap_embs/microwave_oven.npz
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:429e47ed363a113db7afe922d4b5cc96c4044b4ace8f22d97ce27462e4a5b624
+size 2153
diff --git a/capspeech/nar/data_preprocessing/clap_embs/missile_launch.npz b/capspeech/nar/data_preprocessing/clap_embs/missile_launch.npz
new file mode 100644
index 0000000000000000000000000000000000000000..1be9883bd28295f9d1e1432be6d35b179dd3e32d
--- /dev/null
+++ b/capspeech/nar/data_preprocessing/clap_embs/missile_launch.npz
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:712b8d30b19c916b9aa2115c8a8f0fa0b869b5acc8891ac92a514a846a3e5525
+size 2145
diff --git a/capspeech/nar/data_preprocessing/clap_embs/mosquito_buzzing.npz b/capspeech/nar/data_preprocessing/clap_embs/mosquito_buzzing.npz
new file mode 100644
index 0000000000000000000000000000000000000000..de9bd5f3236ce3c766629304611675247a09dfa3
--- /dev/null
+++ b/capspeech/nar/data_preprocessing/clap_embs/mosquito_buzzing.npz
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:6bdc222d21022325ef46e10d22c017e1dcdb2c16952bcb10869f19ad3d2998f3
+size 2149
diff --git a/capspeech/nar/data_preprocessing/clap_embs/motorboat,_speedboat_acceleration.npz b/capspeech/nar/data_preprocessing/clap_embs/motorboat,_speedboat_acceleration.npz
new file mode 100644
index 0000000000000000000000000000000000000000..ca3605c2f3516c039888e5398edc678c17b41370
--- /dev/null
+++ b/capspeech/nar/data_preprocessing/clap_embs/motorboat,_speedboat_acceleration.npz
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:0a116b31009960ed20c6cb1642b3a685d9664086b687b945637637aa09a7aa73
+size 2145
diff --git a/capspeech/nar/data_preprocessing/clap_embs/mouse_click.npz b/capspeech/nar/data_preprocessing/clap_embs/mouse_click.npz
new file mode 100644
index 0000000000000000000000000000000000000000..7c506b297911a66e7a1d3d0d405833adf80aabab
--- /dev/null
+++ b/capspeech/nar/data_preprocessing/clap_embs/mouse_click.npz
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:1ad5a602566239ec4449e9220e5a62ad5137bfe4ea52df92d436c2e953c647fe
+size 2147
diff --git a/capspeech/nar/data_preprocessing/clap_embs/mouse_clicking.npz b/capspeech/nar/data_preprocessing/clap_embs/mouse_clicking.npz
new file mode 100644
index 0000000000000000000000000000000000000000..6cff060988fc624ea10e84254a3eb59ebb528ab9
--- /dev/null
+++ b/capspeech/nar/data_preprocessing/clap_embs/mouse_clicking.npz
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:a7740f62ef83bf92b6aa68f87009fc99e2469ac4c28ce21596571b06afcd391b
+size 2152
diff --git a/capspeech/nar/data_preprocessing/clap_embs/mouse_pattering.npz b/capspeech/nar/data_preprocessing/clap_embs/mouse_pattering.npz
new file mode 100644
index 0000000000000000000000000000000000000000..11f853e7a7bc825f2df9631c5beef6196bed93bb
--- /dev/null
+++ b/capspeech/nar/data_preprocessing/clap_embs/mouse_pattering.npz
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:342c368cde53d2fc2752b4f4cd0b1802acd38bbc959ab27f9e23caca0888305a
+size 2153
diff --git a/capspeech/nar/data_preprocessing/clap_embs/mouse_squeaking.npz b/capspeech/nar/data_preprocessing/clap_embs/mouse_squeaking.npz
new file mode 100644
index 0000000000000000000000000000000000000000..4d2641ac8874e334e5319d4ebfba8b6f5f9e2401
--- /dev/null
+++ b/capspeech/nar/data_preprocessing/clap_embs/mouse_squeaking.npz
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:eabbf1bcafeb1c2f7b40e8a2e5fc02b2b85ccf92c9210114b9f4a41165447cca
+size 2154
diff --git a/capspeech/nar/data_preprocessing/clap_embs/mynah_bird_singing.npz b/capspeech/nar/data_preprocessing/clap_embs/mynah_bird_singing.npz
new file mode 100644
index 0000000000000000000000000000000000000000..5321c726e5cd5758e363d26c9c33b1fdde82b8e6
--- /dev/null
+++ b/capspeech/nar/data_preprocessing/clap_embs/mynah_bird_singing.npz
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:4c2f0a38b774f3e700ed6f46c1cfcf04dc1fc34b776a26901aa0c76608ed7c15
+size 2158
diff --git a/capspeech/nar/data_preprocessing/clap_embs/none.npz b/capspeech/nar/data_preprocessing/clap_embs/none.npz
new file mode 100644
index 0000000000000000000000000000000000000000..95bcef7a5c8dc985cf74c1cbbf345d9f4634fcb8
--- /dev/null
+++ b/capspeech/nar/data_preprocessing/clap_embs/none.npz
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:c7a018d813c454d309b0839ff9668c8dd3d807e14c2803b76fa5de1c981afeb4
+size 2160
diff --git a/capspeech/nar/data_preprocessing/clap_embs/oboe.npz b/capspeech/nar/data_preprocessing/clap_embs/oboe.npz
new file mode 100644
index 0000000000000000000000000000000000000000..e06667eefce92d5437134bfbcaab1dd653638a60
--- /dev/null
+++ b/capspeech/nar/data_preprocessing/clap_embs/oboe.npz
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:8b48bcb878820832a5c2ca721294a2c9d2505d9f99077e32bdc93b36b0b06489
+size 2149
diff --git a/capspeech/nar/data_preprocessing/clap_embs/ocean_burbling.npz b/capspeech/nar/data_preprocessing/clap_embs/ocean_burbling.npz
new file mode 100644
index 0000000000000000000000000000000000000000..c3025b96860ad04441e4f13170fec41810c49035
--- /dev/null
+++ b/capspeech/nar/data_preprocessing/clap_embs/ocean_burbling.npz
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:efe22b87550db9c9c94518c13a4e3dcebffe4257281e1773d9867a676373c7cb
+size 2153
diff --git a/capspeech/nar/data_preprocessing/clap_embs/opening_or_closing_car_doors.npz b/capspeech/nar/data_preprocessing/clap_embs/opening_or_closing_car_doors.npz
new file mode 100644
index 0000000000000000000000000000000000000000..dceb256a404ec13f8906e1a707bf7152f4b7e0c1
--- /dev/null
+++ b/capspeech/nar/data_preprocessing/clap_embs/opening_or_closing_car_doors.npz
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:01659232fb9dd093fac0089a54355440f448b37b8fc0ba9786e0589ca7dd254a
+size 2149
diff --git a/capspeech/nar/data_preprocessing/clap_embs/opening_or_closing_car_electric_windows.npz b/capspeech/nar/data_preprocessing/clap_embs/opening_or_closing_car_electric_windows.npz
new file mode 100644
index 0000000000000000000000000000000000000000..4924d9237584c2ce09245a81ec3e037cf58b9a9e
--- /dev/null
+++ b/capspeech/nar/data_preprocessing/clap_embs/opening_or_closing_car_electric_windows.npz
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:9088de524eb4d3af0f88c2f703edaacde980a586cc38f2d5ffc596fd12eab324
+size 2151
diff --git a/capspeech/nar/data_preprocessing/clap_embs/opening_or_closing_drawers.npz b/capspeech/nar/data_preprocessing/clap_embs/opening_or_closing_drawers.npz
new file mode 100644
index 0000000000000000000000000000000000000000..cb7181e2279dadac9f5f37817b2bd5e0e7237d01
--- /dev/null
+++ b/capspeech/nar/data_preprocessing/clap_embs/opening_or_closing_drawers.npz
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:5a92141fcb3b3a71d06bec9af04537b78bbfba9d50a4bc85b3ac3f8f6046c873
+size 2155
diff --git a/capspeech/nar/data_preprocessing/clap_embs/orchestra.npz b/capspeech/nar/data_preprocessing/clap_embs/orchestra.npz
new file mode 100644
index 0000000000000000000000000000000000000000..136c806b8b7e262f2862bb456f2ce96bb839c388
--- /dev/null
+++ b/capspeech/nar/data_preprocessing/clap_embs/orchestra.npz
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:2c09ca035bd525531a755f8ef1c0dca53f1840976f135bf497ecf6f38ca63e92
+size 2137
diff --git a/capspeech/nar/data_preprocessing/clap_embs/otter_growling.npz b/capspeech/nar/data_preprocessing/clap_embs/otter_growling.npz
new file mode 100644
index 0000000000000000000000000000000000000000..fa5b9aeee8a89cfe71ba21e5f9519bca296e5b9b
--- /dev/null
+++ b/capspeech/nar/data_preprocessing/clap_embs/otter_growling.npz
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:ec98f156be939d097d61aa438f8a84c9d75fece7e523e8dc14db744daecfc330
+size 2151
diff --git a/capspeech/nar/data_preprocessing/clap_embs/owl_hooting.npz b/capspeech/nar/data_preprocessing/clap_embs/owl_hooting.npz
new file mode 100644
index 0000000000000000000000000000000000000000..b6244d5a8f79084ec311ef0579c06c3fd46f0741
--- /dev/null
+++ b/capspeech/nar/data_preprocessing/clap_embs/owl_hooting.npz
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:4671331f6def06217230c1a64ce3183ca56fcd2d9f27e997d249496ece594a6d
+size 2151
diff --git a/capspeech/nar/data_preprocessing/clap_embs/parrot_talking.npz b/capspeech/nar/data_preprocessing/clap_embs/parrot_talking.npz
new file mode 100644
index 0000000000000000000000000000000000000000..c6c8210a9e8c87e9b46badda84c4dca7c704a69d
--- /dev/null
+++ b/capspeech/nar/data_preprocessing/clap_embs/parrot_talking.npz
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:f8c97e2cd6be7af0cc0586864554e281dfef6c98d73e20464b734f4425aecd73
+size 2150
diff --git a/capspeech/nar/data_preprocessing/clap_embs/penguins_braying.npz b/capspeech/nar/data_preprocessing/clap_embs/penguins_braying.npz
new file mode 100644
index 0000000000000000000000000000000000000000..2c6e858b4dc92c6f818e3b36fd0f0cfb354e7c82
--- /dev/null
+++ b/capspeech/nar/data_preprocessing/clap_embs/penguins_braying.npz
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:90e6b3bf9b10d7583192365477a614ee761ddd88f8ba941c8c2fb54c3efb6a2a
+size 2147
diff --git a/capspeech/nar/data_preprocessing/clap_embs/people_babbling.npz b/capspeech/nar/data_preprocessing/clap_embs/people_babbling.npz
new file mode 100644
index 0000000000000000000000000000000000000000..04aa571f3ed843e9b7e657306350a3aa3fea55c4
--- /dev/null
+++ b/capspeech/nar/data_preprocessing/clap_embs/people_babbling.npz
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:311bcbf93c97975d2f230fa9e47aef3f39cf5ef602e01eab0b573f248dc6c2ad
+size 2158
diff --git a/capspeech/nar/data_preprocessing/clap_embs/people_battle_cry.npz b/capspeech/nar/data_preprocessing/clap_embs/people_battle_cry.npz
new file mode 100644
index 0000000000000000000000000000000000000000..17511ec9910339b67e3a85ac5810b6cfef5df394
--- /dev/null
+++ b/capspeech/nar/data_preprocessing/clap_embs/people_battle_cry.npz
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:6a77112b662db33b6080926daaf62970459b6ce7b06c19eb24a0859ad9ceb284
+size 2146
diff --git a/capspeech/nar/data_preprocessing/clap_embs/people_belly_laughing.npz b/capspeech/nar/data_preprocessing/clap_embs/people_belly_laughing.npz
new file mode 100644
index 0000000000000000000000000000000000000000..b9346aed40ccb78a13c49e3b7abb316aa2c3e89e
--- /dev/null
+++ b/capspeech/nar/data_preprocessing/clap_embs/people_belly_laughing.npz
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:38007fcefd2ef2ebcfdd0618f6662be2d6c9cf1b495e1478118ba434e6a1b8b5
+size 2149
diff --git a/capspeech/nar/data_preprocessing/clap_embs/people_booing.npz b/capspeech/nar/data_preprocessing/clap_embs/people_booing.npz
new file mode 100644
index 0000000000000000000000000000000000000000..d6a5ed5b5417a970cf3d27d27efa18d2614977a8
--- /dev/null
+++ b/capspeech/nar/data_preprocessing/clap_embs/people_booing.npz
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:b5a2b1cf97e670fc13fd1bc2ebce37ce523dbd4a8093a44b5e524a66e7723c03
+size 2151
diff --git a/capspeech/nar/data_preprocessing/clap_embs/people_burping.npz b/capspeech/nar/data_preprocessing/clap_embs/people_burping.npz
new file mode 100644
index 0000000000000000000000000000000000000000..ab7490330830ced1e944132d44e3dd1b4a60579c
--- /dev/null
+++ b/capspeech/nar/data_preprocessing/clap_embs/people_burping.npz
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:d57d92a32007fe11fde0f3f3cd47584e66ae22050729bfb93839070e4d57f71f
+size 2158
diff --git a/capspeech/nar/data_preprocessing/clap_embs/people_cheering.npz b/capspeech/nar/data_preprocessing/clap_embs/people_cheering.npz
new file mode 100644
index 0000000000000000000000000000000000000000..4d87455eadd38abfe5a658c0c89ec78536050bfe
--- /dev/null
+++ b/capspeech/nar/data_preprocessing/clap_embs/people_cheering.npz
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:a33ca855cb46d859eff4f2ef0def67338dd421095c2e6fdf6b0bfd3fdd6b75d8
+size 2145
diff --git a/capspeech/nar/data_preprocessing/clap_embs/people_clapping.npz b/capspeech/nar/data_preprocessing/clap_embs/people_clapping.npz
new file mode 100644
index 0000000000000000000000000000000000000000..232ad93c447a0b431b936d93bad29bf76b6e9f0e
--- /dev/null
+++ b/capspeech/nar/data_preprocessing/clap_embs/people_clapping.npz
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:f3dd0f386eb69d01246a9290ecfbd382a58116be18d8caee9815a7fffe8172c3
+size 2147
diff --git a/capspeech/nar/data_preprocessing/clap_embs/people_coughing.npz b/capspeech/nar/data_preprocessing/clap_embs/people_coughing.npz
new file mode 100644
index 0000000000000000000000000000000000000000..bcf4dde6f40141f6814d4cbee0a5f0293643d00a
--- /dev/null
+++ b/capspeech/nar/data_preprocessing/clap_embs/people_coughing.npz
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:7bc5980658122c09313f89f59dc21c9cd9b84a982493ce771de75e445f52c8f2
+size 2149
diff --git a/capspeech/nar/data_preprocessing/clap_embs/people_crowd.npz b/capspeech/nar/data_preprocessing/clap_embs/people_crowd.npz
new file mode 100644
index 0000000000000000000000000000000000000000..50c73eaea50cb958f0af7c719fe2e803d39d7716
--- /dev/null
+++ b/capspeech/nar/data_preprocessing/clap_embs/people_crowd.npz
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:0cf69b868032176935f4d7dcce97baf203200f1abd415d2583764447c9ec648c
+size 2149
diff --git a/capspeech/nar/data_preprocessing/clap_embs/people_eating.npz b/capspeech/nar/data_preprocessing/clap_embs/people_eating.npz
new file mode 100644
index 0000000000000000000000000000000000000000..3aca0b093cdc185d3081fe0a1deebec4d3f2d5d9
--- /dev/null
+++ b/capspeech/nar/data_preprocessing/clap_embs/people_eating.npz
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:5ebe165da7bbfa1c853080cbf6419e589dd99d045893b160b08affe2a615d2ae
+size 2155
diff --git a/capspeech/nar/data_preprocessing/clap_embs/people_eating_apple.npz b/capspeech/nar/data_preprocessing/clap_embs/people_eating_apple.npz
new file mode 100644
index 0000000000000000000000000000000000000000..1236c4e39fb9653b60babf87e4794f933665afa0
--- /dev/null
+++ b/capspeech/nar/data_preprocessing/clap_embs/people_eating_apple.npz
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:4b56ef0f9f4718f546673493ccf41b66ce927391c04d129526d2dec8898b654c
+size 2154
diff --git a/capspeech/nar/data_preprocessing/clap_embs/people_eating_crisps.npz b/capspeech/nar/data_preprocessing/clap_embs/people_eating_crisps.npz
new file mode 100644
index 0000000000000000000000000000000000000000..9e861d48c810dde7580534f6ba3d0f5bfb7afd09
--- /dev/null
+++ b/capspeech/nar/data_preprocessing/clap_embs/people_eating_crisps.npz
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:023788b0c75a534f995ef3af06c0c84b43cd761705e6747cf018a638146033e0
+size 2149
diff --git a/capspeech/nar/data_preprocessing/clap_embs/people_eating_noodle.npz b/capspeech/nar/data_preprocessing/clap_embs/people_eating_noodle.npz
new file mode 100644
index 0000000000000000000000000000000000000000..d0e60abec8df9bac0404558eccb9cb19f7ddb3c5
--- /dev/null
+++ b/capspeech/nar/data_preprocessing/clap_embs/people_eating_noodle.npz
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:e2ef2deb0ecd1adad3e310b274657683abca95b5670d864ec7a8dc936d743556
+size 2159
diff --git a/capspeech/nar/data_preprocessing/clap_embs/people_farting.npz b/capspeech/nar/data_preprocessing/clap_embs/people_farting.npz
new file mode 100644
index 0000000000000000000000000000000000000000..63c02a9105ed8cdbfccc88a84d351a6c4a0dc0f1
--- /dev/null
+++ b/capspeech/nar/data_preprocessing/clap_embs/people_farting.npz
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:bf06bb7b158d7826cd26d54e112178d7ee042049cb5193a17ca0c3d415a063ef
+size 2150
diff --git a/capspeech/nar/data_preprocessing/clap_embs/people_finger_snapping.npz b/capspeech/nar/data_preprocessing/clap_embs/people_finger_snapping.npz
new file mode 100644
index 0000000000000000000000000000000000000000..8331ca16785374a006b869b03af35c4e968df0d6
--- /dev/null
+++ b/capspeech/nar/data_preprocessing/clap_embs/people_finger_snapping.npz
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:f883365df303d3108eff7c9ea22d4647a536f7e94523914af9ec9ed2652bb850
+size 2145
diff --git a/capspeech/nar/data_preprocessing/clap_embs/people_gargling.npz b/capspeech/nar/data_preprocessing/clap_embs/people_gargling.npz
new file mode 100644
index 0000000000000000000000000000000000000000..dcc63f42c05902eb5f8ab2d6f5449f86227e0765
--- /dev/null
+++ b/capspeech/nar/data_preprocessing/clap_embs/people_gargling.npz
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:1daee679494c90b59fc6750bf5f3a5b7ae474def3d40fedd838d843740eb6fd3
+size 2154
diff --git a/capspeech/nar/data_preprocessing/clap_embs/people_giggling.npz b/capspeech/nar/data_preprocessing/clap_embs/people_giggling.npz
new file mode 100644
index 0000000000000000000000000000000000000000..5640a5ef31efc9f750607483874705f907833fa9
--- /dev/null
+++ b/capspeech/nar/data_preprocessing/clap_embs/people_giggling.npz
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:0d2acc7340c47bbf6a60603c76f4149c7a3c3425dd35221e98a760875e5c719c
+size 2141
diff --git a/capspeech/nar/data_preprocessing/clap_embs/people_hiccup.npz b/capspeech/nar/data_preprocessing/clap_embs/people_hiccup.npz
new file mode 100644
index 0000000000000000000000000000000000000000..21613eacc65bd67ee8bba35e08982cfa1bd02319
--- /dev/null
+++ b/capspeech/nar/data_preprocessing/clap_embs/people_hiccup.npz
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:577db0eee26ec9740431afad6c65f0c31cc3b8843961bca389f316eb33c24da2
+size 2156
diff --git a/capspeech/nar/data_preprocessing/clap_embs/people_humming.npz b/capspeech/nar/data_preprocessing/clap_embs/people_humming.npz
new file mode 100644
index 0000000000000000000000000000000000000000..18b2f49bb62c352380dd8075546e7f4983ae89d3
--- /dev/null
+++ b/capspeech/nar/data_preprocessing/clap_embs/people_humming.npz
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:aa5e6d6f5ae9730486385687ece9df77487b3897df56633d5def5bdff3ad5b7d
+size 2154
diff --git a/capspeech/nar/data_preprocessing/clap_embs/people_marching.npz b/capspeech/nar/data_preprocessing/clap_embs/people_marching.npz
new file mode 100644
index 0000000000000000000000000000000000000000..beb7e37c4d943326a95dd65b991a1f034fa88d6c
--- /dev/null
+++ b/capspeech/nar/data_preprocessing/clap_embs/people_marching.npz
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:f204e31f6cfe8bd696cbbbf99351c84c4e37e64afe3c47f1d3b818e68a4c27e7
+size 2144
diff --git a/capspeech/nar/data_preprocessing/clap_embs/people_nose_blowing.npz b/capspeech/nar/data_preprocessing/clap_embs/people_nose_blowing.npz
new file mode 100644
index 0000000000000000000000000000000000000000..56d92695861f63e432cae5ea1c8909bf0502df6c
--- /dev/null
+++ b/capspeech/nar/data_preprocessing/clap_embs/people_nose_blowing.npz
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:dd40e3ed31a19ee76e979c8de4dc121f345912285757c878b277dd6bc438e8fd
+size 2151
diff --git a/capspeech/nar/data_preprocessing/clap_embs/people_running.npz b/capspeech/nar/data_preprocessing/clap_embs/people_running.npz
new file mode 100644
index 0000000000000000000000000000000000000000..5fb03270fb2ecd2b401c9011b74c5bbabe310c1e
--- /dev/null
+++ b/capspeech/nar/data_preprocessing/clap_embs/people_running.npz
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:cfe20e385e3b2b32e73fe91a4c9c70588a529e9f381804e5e85586e358894cfb
+size 2154
diff --git a/capspeech/nar/data_preprocessing/clap_embs/people_screaming.npz b/capspeech/nar/data_preprocessing/clap_embs/people_screaming.npz
new file mode 100644
index 0000000000000000000000000000000000000000..aeaf89bc081a47dad0f62e5e5be7c1145611593c
--- /dev/null
+++ b/capspeech/nar/data_preprocessing/clap_embs/people_screaming.npz
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:37eb2df4f464b5943143729fdee9d9ba4164b1dc8f117d0d1f322f184723a2e3
+size 2157
diff --git a/capspeech/nar/data_preprocessing/clap_embs/people_shuffling.npz b/capspeech/nar/data_preprocessing/clap_embs/people_shuffling.npz
new file mode 100644
index 0000000000000000000000000000000000000000..12e6fb0eb6c9536661c1aee7321c1936d187e8b2
--- /dev/null
+++ b/capspeech/nar/data_preprocessing/clap_embs/people_shuffling.npz
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:17df11bef60fb41784605a4edd11dd100d014142826c6aaf294130b215eb235a
+size 2148
diff --git a/capspeech/nar/data_preprocessing/clap_embs/people_slapping.npz b/capspeech/nar/data_preprocessing/clap_embs/people_slapping.npz
new file mode 100644
index 0000000000000000000000000000000000000000..b42944aa82d0c09c5e2ad4b1a73d011fd83c4788
--- /dev/null
+++ b/capspeech/nar/data_preprocessing/clap_embs/people_slapping.npz
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:a142bd049f6b00eeac1704b1b698ffc35ee1037ba5e562e703319c47e7fe95d8
+size 2157
diff --git a/capspeech/nar/data_preprocessing/clap_embs/people_slurping.npz b/capspeech/nar/data_preprocessing/clap_embs/people_slurping.npz
new file mode 100644
index 0000000000000000000000000000000000000000..36e0e8c22863659a0dbf8f4c4f5ffed60f4c5229
--- /dev/null
+++ b/capspeech/nar/data_preprocessing/clap_embs/people_slurping.npz
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:c54dc1e98a28461b61f9466aebfb98721c5984c721caa193cdd05eaa1299aea8
+size 2157
diff --git a/capspeech/nar/data_preprocessing/clap_embs/people_sneezing.npz b/capspeech/nar/data_preprocessing/clap_embs/people_sneezing.npz
new file mode 100644
index 0000000000000000000000000000000000000000..aee631d665ba5dd06b40f81f566b2253ad332c2a
--- /dev/null
+++ b/capspeech/nar/data_preprocessing/clap_embs/people_sneezing.npz
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:ce329cb90017c54ecb027a726d9e0ce977769bff44193d37187e31348c8ea6a8
+size 2152
diff --git a/capspeech/nar/data_preprocessing/clap_embs/people_sniggering.npz b/capspeech/nar/data_preprocessing/clap_embs/people_sniggering.npz
new file mode 100644
index 0000000000000000000000000000000000000000..9db21665bc553dd0ae022e634d31759d3ebbb121
--- /dev/null
+++ b/capspeech/nar/data_preprocessing/clap_embs/people_sniggering.npz
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:50caa8f53f94ed60e87a3a23846a73a902b8476d80c4cedc549bd35ed1237f54
+size 2149
diff --git a/capspeech/nar/data_preprocessing/clap_embs/people_sobbing.npz b/capspeech/nar/data_preprocessing/clap_embs/people_sobbing.npz
new file mode 100644
index 0000000000000000000000000000000000000000..35f9382e3617729c184332d21dcb5e484f27e16b
--- /dev/null
+++ b/capspeech/nar/data_preprocessing/clap_embs/people_sobbing.npz
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:96f324793d26794a93495a4702db562bbfe618ede848fc4abffb9b540c3489e7
+size 2151
diff --git a/capspeech/nar/data_preprocessing/clap_embs/people_whispering.npz b/capspeech/nar/data_preprocessing/clap_embs/people_whispering.npz
new file mode 100644
index 0000000000000000000000000000000000000000..c3aaa79fa3a7e9d7cdce2a79471b4349aeb1f2cc
--- /dev/null
+++ b/capspeech/nar/data_preprocessing/clap_embs/people_whispering.npz
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:ee8c23fad5f3e3f010403da3d5cb089f15ec398ec31be8c861bc93b10d14097f
+size 2145
diff --git a/capspeech/nar/data_preprocessing/clap_embs/people_whistling.npz b/capspeech/nar/data_preprocessing/clap_embs/people_whistling.npz
new file mode 100644
index 0000000000000000000000000000000000000000..ba690fb961948c1f6ab5da69f761c5c8736bccf7
--- /dev/null
+++ b/capspeech/nar/data_preprocessing/clap_embs/people_whistling.npz
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:c9a5193695e0e17937befafbbf0dc6f714bfe0308c9f5e972b1c95b7bfd550a7
+size 2151
diff --git a/capspeech/nar/data_preprocessing/clap_embs/pheasant_crowing.npz b/capspeech/nar/data_preprocessing/clap_embs/pheasant_crowing.npz
new file mode 100644
index 0000000000000000000000000000000000000000..58a8309285f713107532b18c81978e3c78ffec76
--- /dev/null
+++ b/capspeech/nar/data_preprocessing/clap_embs/pheasant_crowing.npz
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:0df283545239906d60cf2192117408a7c26a4251ecba9dbfba885f2907f12590
+size 2159
diff --git a/capspeech/nar/data_preprocessing/clap_embs/pig.npz b/capspeech/nar/data_preprocessing/clap_embs/pig.npz
new file mode 100644
index 0000000000000000000000000000000000000000..988df0f83bf2eb0d2c024e8de8ff7cfa23fdc370
--- /dev/null
+++ b/capspeech/nar/data_preprocessing/clap_embs/pig.npz
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:0a823dd3f540ef390a378952d0f24544057d95bee1caf8ccaa2e6a411650d2f3
+size 2154
diff --git a/capspeech/nar/data_preprocessing/clap_embs/pig_oinking.npz b/capspeech/nar/data_preprocessing/clap_embs/pig_oinking.npz
new file mode 100644
index 0000000000000000000000000000000000000000..c0841491dd1b346ceed8a34b6d980c59043b03c6
--- /dev/null
+++ b/capspeech/nar/data_preprocessing/clap_embs/pig_oinking.npz
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:7639f3b33d17bea9be18b22ff53175cd86e193c5e1889f8a2029a8c3d4c20a7e
+size 2155
diff --git a/capspeech/nar/data_preprocessing/clap_embs/pigeon,_dove_cooing.npz b/capspeech/nar/data_preprocessing/clap_embs/pigeon,_dove_cooing.npz
new file mode 100644
index 0000000000000000000000000000000000000000..6154e3ff6fec06c6c52addf3bf1149cb73a24a37
--- /dev/null
+++ b/capspeech/nar/data_preprocessing/clap_embs/pigeon,_dove_cooing.npz
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:4b339bb7a420b115d3df65b7af398c2578b09fda25e2f61020486a0d8a57fa94
+size 2155
diff --git a/capspeech/nar/data_preprocessing/clap_embs/planing_timber.npz b/capspeech/nar/data_preprocessing/clap_embs/planing_timber.npz
new file mode 100644
index 0000000000000000000000000000000000000000..0f98a1bd4fcd7e9478e973479a72c31065be3d07
--- /dev/null
+++ b/capspeech/nar/data_preprocessing/clap_embs/planing_timber.npz
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:487b9a25b32355159053548a681227ca4cd13dc95845164e30b3cb1eeb030acc
+size 2162
diff --git a/capspeech/nar/data_preprocessing/clap_embs/plastic_bottle_crushing.npz b/capspeech/nar/data_preprocessing/clap_embs/plastic_bottle_crushing.npz
new file mode 100644
index 0000000000000000000000000000000000000000..7ce1103f0847f4b89e4a5ae999190e79989cd54c
--- /dev/null
+++ b/capspeech/nar/data_preprocessing/clap_embs/plastic_bottle_crushing.npz
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:883459dc8dd63f29d87c9f25c8e886aabc261b140e767d38f6d48034f54ee899
+size 2153
diff --git a/capspeech/nar/data_preprocessing/clap_embs/playing_accordion.npz b/capspeech/nar/data_preprocessing/clap_embs/playing_accordion.npz
new file mode 100644
index 0000000000000000000000000000000000000000..737f56bfd12b8e68a7192c98dba7e0fc41b36fba
--- /dev/null
+++ b/capspeech/nar/data_preprocessing/clap_embs/playing_accordion.npz
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:86516fac9cb13b9cac8108df0979bffe51b7ffa05929ae907fa1aa92c63c7616
+size 2152
diff --git a/capspeech/nar/data_preprocessing/clap_embs/playing_acoustic_guitar.npz b/capspeech/nar/data_preprocessing/clap_embs/playing_acoustic_guitar.npz
new file mode 100644
index 0000000000000000000000000000000000000000..9024c3e08d9639a00274500f5a52db19aa2a5f4b
--- /dev/null
+++ b/capspeech/nar/data_preprocessing/clap_embs/playing_acoustic_guitar.npz
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:115d91e9f3123112ae1383fc63eb1207c0874b5c22f20218cb37abbdabdd7a1f
+size 2147
diff --git a/capspeech/nar/data_preprocessing/clap_embs/playing_badminton.npz b/capspeech/nar/data_preprocessing/clap_embs/playing_badminton.npz
new file mode 100644
index 0000000000000000000000000000000000000000..e2118c8078cc29e798220235d1a39d251028cb13
--- /dev/null
+++ b/capspeech/nar/data_preprocessing/clap_embs/playing_badminton.npz
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:0217cc589d6579e3c8895b655c9c90ef855d32be9f13f24ba1f0279553bf1a7b
+size 2142
diff --git a/capspeech/nar/data_preprocessing/clap_embs/playing_bagpipes.npz b/capspeech/nar/data_preprocessing/clap_embs/playing_bagpipes.npz
new file mode 100644
index 0000000000000000000000000000000000000000..6940f47041d620af2074687e4954903cfc11132f
--- /dev/null
+++ b/capspeech/nar/data_preprocessing/clap_embs/playing_bagpipes.npz
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:9a88cc063e94937ed771b2ff8c2517690f68f492c4e6b3f8a5472652f9dc30e1
+size 2151
diff --git a/capspeech/nar/data_preprocessing/clap_embs/playing_banjo.npz b/capspeech/nar/data_preprocessing/clap_embs/playing_banjo.npz
new file mode 100644
index 0000000000000000000000000000000000000000..c130edb26f987f5a498e1416ff228968d98d6cda
--- /dev/null
+++ b/capspeech/nar/data_preprocessing/clap_embs/playing_banjo.npz
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:6a7030736b3ab2737d0562b33277c44b7a2c8230c865e3b409407920430d9fd1
+size 2153
diff --git a/capspeech/nar/data_preprocessing/clap_embs/playing_bass_drum.npz b/capspeech/nar/data_preprocessing/clap_embs/playing_bass_drum.npz
new file mode 100644
index 0000000000000000000000000000000000000000..a8652d84f22a12b410cc444ac99282139a03148c
--- /dev/null
+++ b/capspeech/nar/data_preprocessing/clap_embs/playing_bass_drum.npz
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:97ff6eaa97d38eb08fa2c2d048d6c42983c0055c966f82eeeff46a607c0f4d3d
+size 2152
diff --git a/capspeech/nar/data_preprocessing/clap_embs/playing_bass_guitar.npz b/capspeech/nar/data_preprocessing/clap_embs/playing_bass_guitar.npz
new file mode 100644
index 0000000000000000000000000000000000000000..325da97433f3a787ccde08017743f63dedd59aeb
--- /dev/null
+++ b/capspeech/nar/data_preprocessing/clap_embs/playing_bass_guitar.npz
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:212caa8a9e1d1b9ffbe8c5152c8d7059a4c75c009ec2589bf80dfb8e44fe771c
+size 2150
diff --git a/capspeech/nar/data_preprocessing/clap_embs/playing_bassoon.npz b/capspeech/nar/data_preprocessing/clap_embs/playing_bassoon.npz
new file mode 100644
index 0000000000000000000000000000000000000000..359bea96fec65011fc2f75b1d1f49120057682d9
--- /dev/null
+++ b/capspeech/nar/data_preprocessing/clap_embs/playing_bassoon.npz
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:19e0e173686967b2c2dc4c24041db9415b1626e25677872fac19657beaec0a2f
+size 2145
diff --git a/capspeech/nar/data_preprocessing/clap_embs/playing_bongo.npz b/capspeech/nar/data_preprocessing/clap_embs/playing_bongo.npz
new file mode 100644
index 0000000000000000000000000000000000000000..da13ce3b3b61693d883e6441cdac9b9c01410b77
--- /dev/null
+++ b/capspeech/nar/data_preprocessing/clap_embs/playing_bongo.npz
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:e94e67e166255869255ee8921dee23f1b85a42e57634226233e82f7e2f704ecb
+size 2153
diff --git a/capspeech/nar/data_preprocessing/clap_embs/playing_bugle.npz b/capspeech/nar/data_preprocessing/clap_embs/playing_bugle.npz
new file mode 100644
index 0000000000000000000000000000000000000000..92efce0aedb1ca7854f9fae2bc53a518e25e6882
--- /dev/null
+++ b/capspeech/nar/data_preprocessing/clap_embs/playing_bugle.npz
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:e5418fd6fd60a760a97148d93d7448bd09f020c29f97c8918d3c9ca546570273
+size 2141
diff --git a/capspeech/nar/data_preprocessing/clap_embs/playing_castanets.npz b/capspeech/nar/data_preprocessing/clap_embs/playing_castanets.npz
new file mode 100644
index 0000000000000000000000000000000000000000..412c45ce1df71127e53f5c328c7eede139d0c459
--- /dev/null
+++ b/capspeech/nar/data_preprocessing/clap_embs/playing_castanets.npz
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:95ac0825f5d80df9c73f41ce3d690b0af36f7870d5f42944885d3813c6274ca2
+size 2155
diff --git a/capspeech/nar/data_preprocessing/clap_embs/playing_cello.npz b/capspeech/nar/data_preprocessing/clap_embs/playing_cello.npz
new file mode 100644
index 0000000000000000000000000000000000000000..c5dc583b0365444e2ecb00dd10e82963505ddb22
--- /dev/null
+++ b/capspeech/nar/data_preprocessing/clap_embs/playing_cello.npz
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:c6b284d1f431bc06de1e453ac63f2d949f172c77610b6da1aad52a9a39a771f3
+size 2149
diff --git a/capspeech/nar/data_preprocessing/clap_embs/playing_clarinet.npz b/capspeech/nar/data_preprocessing/clap_embs/playing_clarinet.npz
new file mode 100644
index 0000000000000000000000000000000000000000..d6d0879fe9b71833665cad4581512da64424f9f8
--- /dev/null
+++ b/capspeech/nar/data_preprocessing/clap_embs/playing_clarinet.npz
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:0bd7cc02074e4c6e83e195b2dfac800de443f4173fa4057cf715e4c4d8c30f39
+size 2143
diff --git a/capspeech/nar/data_preprocessing/clap_embs/playing_congas.npz b/capspeech/nar/data_preprocessing/clap_embs/playing_congas.npz
new file mode 100644
index 0000000000000000000000000000000000000000..1503a49884c56c62ae80c42504e3eb3350dee197
--- /dev/null
+++ b/capspeech/nar/data_preprocessing/clap_embs/playing_congas.npz
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:d72e0b7fb2c77c651321436d4fd26b5df1c4b86b67c72d86eb576bd1e7ccb32f
+size 2150
diff --git a/capspeech/nar/data_preprocessing/clap_embs/playing_cornet.npz b/capspeech/nar/data_preprocessing/clap_embs/playing_cornet.npz
new file mode 100644
index 0000000000000000000000000000000000000000..a9b5a61bcf4bb27c834d8af9187a3169405c5111
--- /dev/null
+++ b/capspeech/nar/data_preprocessing/clap_embs/playing_cornet.npz
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:38a018a9a27cd935024b590ddfce172a73f46556ef8d8f9aa1152aabe39b2ee3
+size 2145
diff --git a/capspeech/nar/data_preprocessing/clap_embs/playing_cymbal.npz b/capspeech/nar/data_preprocessing/clap_embs/playing_cymbal.npz
new file mode 100644
index 0000000000000000000000000000000000000000..1b6c0332bb7872d5ab2a59e083c6ba21a1b64ee1
--- /dev/null
+++ b/capspeech/nar/data_preprocessing/clap_embs/playing_cymbal.npz
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:d24104aa61900caa7da8ead28c63d1aaa19f417865f2b2761d563e9b6a8184fa
+size 2148
diff --git a/capspeech/nar/data_preprocessing/clap_embs/playing_darts.npz b/capspeech/nar/data_preprocessing/clap_embs/playing_darts.npz
new file mode 100644
index 0000000000000000000000000000000000000000..de25fc7b51a59dcedc249a3a0bfaeaafbd813dcb
--- /dev/null
+++ b/capspeech/nar/data_preprocessing/clap_embs/playing_darts.npz
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:826e1a5a13aaceb69adf8b3959cdba953e2fe195fc4e41a03ce6a26e226d0cde
+size 2147
diff --git a/capspeech/nar/data_preprocessing/clap_embs/playing_didgeridoo.npz b/capspeech/nar/data_preprocessing/clap_embs/playing_didgeridoo.npz
new file mode 100644
index 0000000000000000000000000000000000000000..d59d829c3343883526f9a882bfaad9f52fa37424
--- /dev/null
+++ b/capspeech/nar/data_preprocessing/clap_embs/playing_didgeridoo.npz
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:6817bbccb8df46582a2d84d7f8275580c9519be9ac8950b46d992cc098549da6
+size 2146
diff --git a/capspeech/nar/data_preprocessing/clap_embs/playing_djembe.npz b/capspeech/nar/data_preprocessing/clap_embs/playing_djembe.npz
new file mode 100644
index 0000000000000000000000000000000000000000..05fc1e14daa190d7f4fea466cd59f0e6b5c5c2f6
--- /dev/null
+++ b/capspeech/nar/data_preprocessing/clap_embs/playing_djembe.npz
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:86c5510a780fcd3ba94d197dd09b52f2a1cfafcf43017e03419d1c96d5a6f34a
+size 2155
diff --git a/capspeech/nar/data_preprocessing/clap_embs/playing_double_bass.npz b/capspeech/nar/data_preprocessing/clap_embs/playing_double_bass.npz
new file mode 100644
index 0000000000000000000000000000000000000000..3cf99d8affa02f23b67899e2e0bc330051e9e61b
--- /dev/null
+++ b/capspeech/nar/data_preprocessing/clap_embs/playing_double_bass.npz
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:c4910d2219138232f1340e92c1421039aea6c138ffc05d3baa0e07bc0d23037b
+size 2148
diff --git a/capspeech/nar/data_preprocessing/clap_embs/playing_drum_kit.npz b/capspeech/nar/data_preprocessing/clap_embs/playing_drum_kit.npz
new file mode 100644
index 0000000000000000000000000000000000000000..248d084b009728ec6934a0f1f42a73f4b284568c
--- /dev/null
+++ b/capspeech/nar/data_preprocessing/clap_embs/playing_drum_kit.npz
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:bd03e19aba0a248a45f24617e0d80217700f01c75dbb18a12122227d3d28d2a8
+size 2152
diff --git a/capspeech/nar/data_preprocessing/clap_embs/playing_electric_guitar.npz b/capspeech/nar/data_preprocessing/clap_embs/playing_electric_guitar.npz
new file mode 100644
index 0000000000000000000000000000000000000000..96d1ac2c9cfd1545dbb321e92dbfd9aa5b5e1d7d
--- /dev/null
+++ b/capspeech/nar/data_preprocessing/clap_embs/playing_electric_guitar.npz
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:d32c88f6622eb4b6730e82058dc4789c7d5afbd00e9c7a760fbdf0fb404c356b
+size 2146
diff --git a/capspeech/nar/data_preprocessing/clap_embs/playing_electronic_organ.npz b/capspeech/nar/data_preprocessing/clap_embs/playing_electronic_organ.npz
new file mode 100644
index 0000000000000000000000000000000000000000..002f06cc5c5ca8c7c0442288e343a512546401bd
--- /dev/null
+++ b/capspeech/nar/data_preprocessing/clap_embs/playing_electronic_organ.npz
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:0d5899688af1fccb7441154a712d0eb04b4d84069662d602dd3ea0784c4e6b24
+size 2158
diff --git a/capspeech/nar/data_preprocessing/clap_embs/playing_erhu.npz b/capspeech/nar/data_preprocessing/clap_embs/playing_erhu.npz
new file mode 100644
index 0000000000000000000000000000000000000000..0fbccbd4672fdd95f734767bd5d2d62dd019e679
--- /dev/null
+++ b/capspeech/nar/data_preprocessing/clap_embs/playing_erhu.npz
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:f5199d9d173a95ddee154cc866b99667bf29731c4e6a91dec7a09d7e3fe24a92
+size 2150
diff --git a/capspeech/nar/data_preprocessing/clap_embs/playing_flute.npz b/capspeech/nar/data_preprocessing/clap_embs/playing_flute.npz
new file mode 100644
index 0000000000000000000000000000000000000000..8a6b1fb29c9fa364a8847e512a015569bdbc8cfc
--- /dev/null
+++ b/capspeech/nar/data_preprocessing/clap_embs/playing_flute.npz
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:893f00b30a34105c983cb06d586efded7109394e18c54335bc4fabd23e267950
+size 2145
diff --git a/capspeech/nar/data_preprocessing/clap_embs/playing_french_horn.npz b/capspeech/nar/data_preprocessing/clap_embs/playing_french_horn.npz
new file mode 100644
index 0000000000000000000000000000000000000000..65b64cad737ea6dfd2714625afe19dde0f9abfee
--- /dev/null
+++ b/capspeech/nar/data_preprocessing/clap_embs/playing_french_horn.npz
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:174ebaa0e34449455a98217ed16107de34e307a047c735c3b8758b74bcf29e2f
+size 2149
diff --git a/capspeech/nar/data_preprocessing/clap_embs/playing_glockenspiel.npz b/capspeech/nar/data_preprocessing/clap_embs/playing_glockenspiel.npz
new file mode 100644
index 0000000000000000000000000000000000000000..759d747352771d1d881b259888137200fe0d2355
--- /dev/null
+++ b/capspeech/nar/data_preprocessing/clap_embs/playing_glockenspiel.npz
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:6d1aa9a6bcb77b8839f24d816d5471b92465f1e7542d36cac4655615969b1492
+size 2153
diff --git a/capspeech/nar/data_preprocessing/clap_embs/playing_gong.npz b/capspeech/nar/data_preprocessing/clap_embs/playing_gong.npz
new file mode 100644
index 0000000000000000000000000000000000000000..e569f443ac8b91791b1da49bca807361e6c30306
--- /dev/null
+++ b/capspeech/nar/data_preprocessing/clap_embs/playing_gong.npz
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:922b1acb2d660cee613132efee33ca30e35e5ae8b896e00aa5f7342d5d033db5
+size 2145
diff --git a/capspeech/nar/data_preprocessing/clap_embs/playing_guiro.npz b/capspeech/nar/data_preprocessing/clap_embs/playing_guiro.npz
new file mode 100644
index 0000000000000000000000000000000000000000..044fc00faf9ff5d34673a78a5c29285e171192a0
--- /dev/null
+++ b/capspeech/nar/data_preprocessing/clap_embs/playing_guiro.npz
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:4abfc3c891ae72d3dc107c1fbfe732b5479a25454bda3f34d7ca15c4f35a6cbe
+size 2150
diff --git a/capspeech/nar/data_preprocessing/clap_embs/playing_hammond_organ.npz b/capspeech/nar/data_preprocessing/clap_embs/playing_hammond_organ.npz
new file mode 100644
index 0000000000000000000000000000000000000000..d6493f2775b3e7b66d4c6e8f649243073395a6e8
--- /dev/null
+++ b/capspeech/nar/data_preprocessing/clap_embs/playing_hammond_organ.npz
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:8e86045c8831da4afea437220b8ecf2038ef25d6d1e72ff2b6eb54be9e1c5b8c
+size 2150
diff --git a/capspeech/nar/data_preprocessing/clap_embs/playing_harmonica.npz b/capspeech/nar/data_preprocessing/clap_embs/playing_harmonica.npz
new file mode 100644
index 0000000000000000000000000000000000000000..f3faaea10d537c8ec32989c1add1a7fce358d9d0
--- /dev/null
+++ b/capspeech/nar/data_preprocessing/clap_embs/playing_harmonica.npz
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:2ea5fe5fcdd0ccab9cc01bb9393769ed962a926c87d5a5818e4d2531609208b2
+size 2155
diff --git a/capspeech/nar/data_preprocessing/clap_embs/playing_harp.npz b/capspeech/nar/data_preprocessing/clap_embs/playing_harp.npz
new file mode 100644
index 0000000000000000000000000000000000000000..0246c271ad7494aa7d285865900a6b60b6c1c8a2
--- /dev/null
+++ b/capspeech/nar/data_preprocessing/clap_embs/playing_harp.npz
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:34d6659f6d19f1151490dc2166ac80316f6e95cb815a8053a87b21d9e500142c
+size 2155
diff --git a/capspeech/nar/data_preprocessing/clap_embs/playing_harpsichord.npz b/capspeech/nar/data_preprocessing/clap_embs/playing_harpsichord.npz
new file mode 100644
index 0000000000000000000000000000000000000000..1cd80b3eff4a5853d4b50d24951316acd115117e
--- /dev/null
+++ b/capspeech/nar/data_preprocessing/clap_embs/playing_harpsichord.npz
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:b41228b2429aff3c0ed40deca5f0b21970fc691d93ee47bc588b0b71efc0dc57
+size 2158
diff --git a/capspeech/nar/data_preprocessing/clap_embs/playing_hockey.npz b/capspeech/nar/data_preprocessing/clap_embs/playing_hockey.npz
new file mode 100644
index 0000000000000000000000000000000000000000..2079f886ef5606c226819ec4bb67b426df9aaffa
--- /dev/null
+++ b/capspeech/nar/data_preprocessing/clap_embs/playing_hockey.npz
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:f311e193f55076899787129da8a183921d9b3a162a4f612a0a0bc1f42fb60175
+size 2157
diff --git a/capspeech/nar/data_preprocessing/clap_embs/playing_lacrosse.npz b/capspeech/nar/data_preprocessing/clap_embs/playing_lacrosse.npz
new file mode 100644
index 0000000000000000000000000000000000000000..2583bd0a84f67ada9e8661db9cbaedc7d2dd9153
--- /dev/null
+++ b/capspeech/nar/data_preprocessing/clap_embs/playing_lacrosse.npz
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:c720d74fe8cc019bdd9a3ca9c33c57ca45c31d42d524c90e48bc84e175047819
+size 2149
diff --git a/capspeech/nar/data_preprocessing/clap_embs/playing_mandolin.npz b/capspeech/nar/data_preprocessing/clap_embs/playing_mandolin.npz
new file mode 100644
index 0000000000000000000000000000000000000000..8dff8d530604c1a47021fe483e99576f013e93b9
--- /dev/null
+++ b/capspeech/nar/data_preprocessing/clap_embs/playing_mandolin.npz
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:4affdcf124ad336120e09043596c798b93a9a9b6cbbbd46894c2c60188897e09
+size 2148
diff --git a/capspeech/nar/data_preprocessing/clap_embs/playing_marimba,_xylophone.npz b/capspeech/nar/data_preprocessing/clap_embs/playing_marimba,_xylophone.npz
new file mode 100644
index 0000000000000000000000000000000000000000..6fc3e6b32733465ad61e7dc7ea7373df76265a48
--- /dev/null
+++ b/capspeech/nar/data_preprocessing/clap_embs/playing_marimba,_xylophone.npz
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:d4f04cd4bdf1003ebe4bf0e734d894e0953acd4bd7b2e731b2ca0cfa1983cc68
+size 2165
diff --git a/capspeech/nar/data_preprocessing/clap_embs/playing_oboe.npz b/capspeech/nar/data_preprocessing/clap_embs/playing_oboe.npz
new file mode 100644
index 0000000000000000000000000000000000000000..6b69bcd272bd89a8cadff48160f7f25b159adc34
--- /dev/null
+++ b/capspeech/nar/data_preprocessing/clap_embs/playing_oboe.npz
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:c64a70f026c2b9f4453cc546bcb3eab7808ba2ac6b886573f8a04df759c5a2cc
+size 2150
diff --git a/capspeech/nar/data_preprocessing/clap_embs/playing_piano.npz b/capspeech/nar/data_preprocessing/clap_embs/playing_piano.npz
new file mode 100644
index 0000000000000000000000000000000000000000..543deec6a6ba72b7890466a8e455a5210a5230ec
--- /dev/null
+++ b/capspeech/nar/data_preprocessing/clap_embs/playing_piano.npz
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:4d6f5dac03bd5f166553b4cab73dca9bb09d3c39f032b56683512272c8d8065e
+size 2155
diff --git a/capspeech/nar/data_preprocessing/clap_embs/playing_saxophone.npz b/capspeech/nar/data_preprocessing/clap_embs/playing_saxophone.npz
new file mode 100644
index 0000000000000000000000000000000000000000..baaa19a9888f7ac1a71050a2a647e8c613d84038
--- /dev/null
+++ b/capspeech/nar/data_preprocessing/clap_embs/playing_saxophone.npz
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:ebe1df8a3bafddef35c3924ab3d717c556a0ecfebeedfb28d20a7778a020c0ee
+size 2149
diff --git a/capspeech/nar/data_preprocessing/clap_embs/playing_shofar.npz b/capspeech/nar/data_preprocessing/clap_embs/playing_shofar.npz
new file mode 100644
index 0000000000000000000000000000000000000000..a223ee2bc22c50d60ca55061f1b27a44c8e0b3c7
--- /dev/null
+++ b/capspeech/nar/data_preprocessing/clap_embs/playing_shofar.npz
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:f64e04b04ee75415ca94aa4c8c0026706eee35e82fda88648703870edc7d45ad
+size 2148
diff --git a/capspeech/nar/data_preprocessing/clap_embs/playing_sitar.npz b/capspeech/nar/data_preprocessing/clap_embs/playing_sitar.npz
new file mode 100644
index 0000000000000000000000000000000000000000..8699f266d1e5ac82885fe0c7d4afe391f4396598
--- /dev/null
+++ b/capspeech/nar/data_preprocessing/clap_embs/playing_sitar.npz
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:978c9e14df9a5c0921466f624f0139cdf38b1ce9eb6c70ad0eab912e9b66d0f4
+size 2159
diff --git a/capspeech/nar/data_preprocessing/clap_embs/playing_snare_drum.npz b/capspeech/nar/data_preprocessing/clap_embs/playing_snare_drum.npz
new file mode 100644
index 0000000000000000000000000000000000000000..906f4d37e364c8f441a5b5d48e7fb03a1130c910
--- /dev/null
+++ b/capspeech/nar/data_preprocessing/clap_embs/playing_snare_drum.npz
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:b3976823ecfdfd32f6486747cacdec514d6d1a92a7e4dfb9b1d5d67ca988c942
+size 2150
diff --git a/capspeech/nar/data_preprocessing/clap_embs/playing_squash.npz b/capspeech/nar/data_preprocessing/clap_embs/playing_squash.npz
new file mode 100644
index 0000000000000000000000000000000000000000..83f3d337ba3e8a902d9387b22ad15200a0cfee79
--- /dev/null
+++ b/capspeech/nar/data_preprocessing/clap_embs/playing_squash.npz
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:8c8c9a17bbf1b4e12eff4b79d7bf7f80d08ffdf1ca30adfb4404c2bcc1c7f7bf
+size 2146
diff --git a/capspeech/nar/data_preprocessing/clap_embs/playing_steel_guitar,_slide_guitar.npz b/capspeech/nar/data_preprocessing/clap_embs/playing_steel_guitar,_slide_guitar.npz
new file mode 100644
index 0000000000000000000000000000000000000000..f86da3d9c37545a44878e85e3143fd08e3ea1bfc
--- /dev/null
+++ b/capspeech/nar/data_preprocessing/clap_embs/playing_steel_guitar,_slide_guitar.npz
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:29990955b21b03fcec8fbdf709030b0eb95898a23ae1c58281379781b0771f47
+size 2152
diff --git a/capspeech/nar/data_preprocessing/clap_embs/playing_steelpan.npz b/capspeech/nar/data_preprocessing/clap_embs/playing_steelpan.npz
new file mode 100644
index 0000000000000000000000000000000000000000..fcc4f4f632ef9b28a319a28808204382561da2b8
--- /dev/null
+++ b/capspeech/nar/data_preprocessing/clap_embs/playing_steelpan.npz
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:2d1eef8d96d4e4423c9ff02eafe345cd683e3412c2a3f4f8012678b3464b2a77
+size 2144
diff --git a/capspeech/nar/data_preprocessing/clap_embs/playing_synthesizer.npz b/capspeech/nar/data_preprocessing/clap_embs/playing_synthesizer.npz
new file mode 100644
index 0000000000000000000000000000000000000000..ce2c6168ff5a4b131fc57480097eac499fb04f98
--- /dev/null
+++ b/capspeech/nar/data_preprocessing/clap_embs/playing_synthesizer.npz
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:c5aa703cd3d051a64e7409d59678740ae2f350850413d1b96b5e459c364dfe9e
+size 2149
diff --git a/capspeech/nar/data_preprocessing/clap_embs/playing_tabla.npz b/capspeech/nar/data_preprocessing/clap_embs/playing_tabla.npz
new file mode 100644
index 0000000000000000000000000000000000000000..cdf70749c67fffde6450879215f5e7aca7b210c1
--- /dev/null
+++ b/capspeech/nar/data_preprocessing/clap_embs/playing_tabla.npz
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:7ef52458411a5537a369d983dcb96ed7aad788357739accd947f704e52a5a4c9
+size 2149
diff --git a/capspeech/nar/data_preprocessing/clap_embs/playing_table_tennis.npz b/capspeech/nar/data_preprocessing/clap_embs/playing_table_tennis.npz
new file mode 100644
index 0000000000000000000000000000000000000000..2363c44be948f54802aeccf6c8fff897f7792680
--- /dev/null
+++ b/capspeech/nar/data_preprocessing/clap_embs/playing_table_tennis.npz
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:954094d973b054e9c17300f8f6ba82330e5c8326f6200bae141d4dc252ac6c1a
+size 2154
diff --git a/capspeech/nar/data_preprocessing/clap_embs/playing_tambourine.npz b/capspeech/nar/data_preprocessing/clap_embs/playing_tambourine.npz
new file mode 100644
index 0000000000000000000000000000000000000000..c80def92b420f154779cbd9bbb8fc0b2bafbdf8f
--- /dev/null
+++ b/capspeech/nar/data_preprocessing/clap_embs/playing_tambourine.npz
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:1889e0d2cd8346b68d03826f57990933d10d6ae5cf3960fbfa3aa5f14dd8e46b
+size 2157
diff --git a/capspeech/nar/data_preprocessing/clap_embs/playing_tennis.npz b/capspeech/nar/data_preprocessing/clap_embs/playing_tennis.npz
new file mode 100644
index 0000000000000000000000000000000000000000..084b31bd9852ef4096172491b979202f025dff17
--- /dev/null
+++ b/capspeech/nar/data_preprocessing/clap_embs/playing_tennis.npz
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:5025855126b584763f152538a1ae072d92335b95a2cf9a373f14d0c0358223d4
+size 2151
diff --git a/capspeech/nar/data_preprocessing/clap_embs/playing_theremin.npz b/capspeech/nar/data_preprocessing/clap_embs/playing_theremin.npz
new file mode 100644
index 0000000000000000000000000000000000000000..23ba4e24c4171c8decb56b19c2b67672a8af5882
--- /dev/null
+++ b/capspeech/nar/data_preprocessing/clap_embs/playing_theremin.npz
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:b6cfea1b957d89331620de47736f5c381e186e849361019c94e982f9ecad9925
+size 2149
diff --git a/capspeech/nar/data_preprocessing/clap_embs/playing_timbales.npz b/capspeech/nar/data_preprocessing/clap_embs/playing_timbales.npz
new file mode 100644
index 0000000000000000000000000000000000000000..d15203f019cfe3f135ef6b77c2b4fb7132d14366
--- /dev/null
+++ b/capspeech/nar/data_preprocessing/clap_embs/playing_timbales.npz
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:1a23b59856fa0e4b57ad31deb0d94f52e6719aa5a35a9cd0bfd7318a96f80c86
+size 2153
diff --git a/capspeech/nar/data_preprocessing/clap_embs/playing_timpani.npz b/capspeech/nar/data_preprocessing/clap_embs/playing_timpani.npz
new file mode 100644
index 0000000000000000000000000000000000000000..4a8da5510e43e3f87294137f35545d63eccd9abe
--- /dev/null
+++ b/capspeech/nar/data_preprocessing/clap_embs/playing_timpani.npz
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:0599487f2c800728cdb41f3b30355afca339c21f85f84f922bb65f44f5b14637
+size 2150
diff --git a/capspeech/nar/data_preprocessing/clap_embs/playing_trombone.npz b/capspeech/nar/data_preprocessing/clap_embs/playing_trombone.npz
new file mode 100644
index 0000000000000000000000000000000000000000..f9fba2064acb0d496c33d5a2472ca1d53f0e28be
--- /dev/null
+++ b/capspeech/nar/data_preprocessing/clap_embs/playing_trombone.npz
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:15d7a2856e319452ad154a571c451eddbed2def8b7416c9a416b11bd52e7b90a
+size 2153
diff --git a/capspeech/nar/data_preprocessing/clap_embs/playing_trumpet.npz b/capspeech/nar/data_preprocessing/clap_embs/playing_trumpet.npz
new file mode 100644
index 0000000000000000000000000000000000000000..4def5233b40f99a2441c6c77a3a017a5dc441c95
--- /dev/null
+++ b/capspeech/nar/data_preprocessing/clap_embs/playing_trumpet.npz
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:f221483676e278b5ded57f738520906681f03accff43f3c7cdedf5660240d7db
+size 2159
diff --git a/capspeech/nar/data_preprocessing/clap_embs/playing_tuning_fork.npz b/capspeech/nar/data_preprocessing/clap_embs/playing_tuning_fork.npz
new file mode 100644
index 0000000000000000000000000000000000000000..78b99fa24122b9f43216f455a09610d31ae6b3fe
--- /dev/null
+++ b/capspeech/nar/data_preprocessing/clap_embs/playing_tuning_fork.npz
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:2b92e2b8af6f224ed8507bc403e474ecc09af289cdfc97dd3a6d9118130fdc5a
+size 2148
diff --git a/capspeech/nar/data_preprocessing/clap_embs/playing_tympani.npz b/capspeech/nar/data_preprocessing/clap_embs/playing_tympani.npz
new file mode 100644
index 0000000000000000000000000000000000000000..ca988e8933908e67868bb796217b857bcb5d3787
--- /dev/null
+++ b/capspeech/nar/data_preprocessing/clap_embs/playing_tympani.npz
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:b08cb53d57b1744a4cce1e40267924ae1758519012cd8321076c54213ed57509
+size 2148
diff --git a/capspeech/nar/data_preprocessing/clap_embs/playing_ukulele.npz b/capspeech/nar/data_preprocessing/clap_embs/playing_ukulele.npz
new file mode 100644
index 0000000000000000000000000000000000000000..8b8413e474912f60ce969347ae1374876f0e3a0c
--- /dev/null
+++ b/capspeech/nar/data_preprocessing/clap_embs/playing_ukulele.npz
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:a9a74820e293831f42a00271079bbf9671c31138d147ff4e55c1a1840244f076
+size 2148
diff --git a/capspeech/nar/data_preprocessing/clap_embs/playing_vibraphone.npz b/capspeech/nar/data_preprocessing/clap_embs/playing_vibraphone.npz
new file mode 100644
index 0000000000000000000000000000000000000000..864610d7ce5fd5ed5d77cc88240be825ca316889
--- /dev/null
+++ b/capspeech/nar/data_preprocessing/clap_embs/playing_vibraphone.npz
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:32fdf380e6843778538b7a13331e6427d6abcae446a0f51ca3c3761130e0c2a2
+size 2145
diff --git a/capspeech/nar/data_preprocessing/clap_embs/playing_violin,_fiddle.npz b/capspeech/nar/data_preprocessing/clap_embs/playing_violin,_fiddle.npz
new file mode 100644
index 0000000000000000000000000000000000000000..9686271c732cc8fb3b67782a7c4ee07db448c018
--- /dev/null
+++ b/capspeech/nar/data_preprocessing/clap_embs/playing_violin,_fiddle.npz
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:b450f73c93e86ca5dd113cba68350d1a0f144128f6c713e05d6611e9a13c9911
+size 2147
diff --git a/capspeech/nar/data_preprocessing/clap_embs/playing_volleyball.npz b/capspeech/nar/data_preprocessing/clap_embs/playing_volleyball.npz
new file mode 100644
index 0000000000000000000000000000000000000000..5368620ec057b9b5cfc6f328338225256206ac6a
--- /dev/null
+++ b/capspeech/nar/data_preprocessing/clap_embs/playing_volleyball.npz
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:51e8086d2e9dd9f7ee90b239133bea758614f3f5a837b619012cc17c80707591
+size 2151
diff --git a/capspeech/nar/data_preprocessing/clap_embs/playing_washboard.npz b/capspeech/nar/data_preprocessing/clap_embs/playing_washboard.npz
new file mode 100644
index 0000000000000000000000000000000000000000..5d24a6dbb2832402b0503f4ae90c4e79f465a986
--- /dev/null
+++ b/capspeech/nar/data_preprocessing/clap_embs/playing_washboard.npz
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:81adcf9744094b6d6972c5413131bc80189f8d8e9bcf74701115829ff3334a39
+size 2149
diff --git a/capspeech/nar/data_preprocessing/clap_embs/playing_zither.npz b/capspeech/nar/data_preprocessing/clap_embs/playing_zither.npz
new file mode 100644
index 0000000000000000000000000000000000000000..55a3f1111e46f1ac0a16851f0afc220f478a8676
--- /dev/null
+++ b/capspeech/nar/data_preprocessing/clap_embs/playing_zither.npz
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:e82649eca4b2adb741ce9c03e7c57846eb14537beb4097d1f010b8bf0c681715
+size 2156
diff --git a/capspeech/nar/data_preprocessing/clap_embs/police_car_(siren).npz b/capspeech/nar/data_preprocessing/clap_embs/police_car_(siren).npz
new file mode 100644
index 0000000000000000000000000000000000000000..e1f011c14c7e468406887db141672332b2e73292
--- /dev/null
+++ b/capspeech/nar/data_preprocessing/clap_embs/police_car_(siren).npz
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:a031af521fcd3a5c6540b4ecd41797ebc7e76cb9e983ecc6bcc1ab7997051719
+size 2162
diff --git a/capspeech/nar/data_preprocessing/clap_embs/police_radio_chatter.npz b/capspeech/nar/data_preprocessing/clap_embs/police_radio_chatter.npz
new file mode 100644
index 0000000000000000000000000000000000000000..c42dfec5676e154959b2bcf46a55c98b8f883e49
--- /dev/null
+++ b/capspeech/nar/data_preprocessing/clap_embs/police_radio_chatter.npz
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:af194f59b80df9e129f4828be32b6b347ee569159acd478fa870816826aece54
+size 2153
diff --git a/capspeech/nar/data_preprocessing/clap_embs/popping_popcorn.npz b/capspeech/nar/data_preprocessing/clap_embs/popping_popcorn.npz
new file mode 100644
index 0000000000000000000000000000000000000000..acb6395c8de74ada8476680d3f97e2cd17083782
--- /dev/null
+++ b/capspeech/nar/data_preprocessing/clap_embs/popping_popcorn.npz
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:b23c0f5ee97dd663c880a385eacf658c3464c56808f5bbf526a1be85da1a5a9e
+size 2147
diff --git a/capspeech/nar/data_preprocessing/clap_embs/pouring_water.npz b/capspeech/nar/data_preprocessing/clap_embs/pouring_water.npz
new file mode 100644
index 0000000000000000000000000000000000000000..f1c59022ffacd30e2918a08d3ef5ddf9847f3257
--- /dev/null
+++ b/capspeech/nar/data_preprocessing/clap_embs/pouring_water.npz
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:06f6523f4a604dcbe8b59f8e4e17564b4fdd52ab6bce92d60091e84025c18c6f
+size 2157
diff --git a/capspeech/nar/data_preprocessing/clap_embs/printer_printing.npz b/capspeech/nar/data_preprocessing/clap_embs/printer_printing.npz
new file mode 100644
index 0000000000000000000000000000000000000000..9d4722331b08c94c8978e5a699f0f937802c11cd
--- /dev/null
+++ b/capspeech/nar/data_preprocessing/clap_embs/printer_printing.npz
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:e961b7988153b955da340038ca9bc9afa41e607497e524d899aa5b4452364b90
+size 2153
diff --git a/capspeech/nar/data_preprocessing/clap_embs/pumping_water.npz b/capspeech/nar/data_preprocessing/clap_embs/pumping_water.npz
new file mode 100644
index 0000000000000000000000000000000000000000..c1e542e06971266d4523abf57d2ed6f4dbfb7c49
--- /dev/null
+++ b/capspeech/nar/data_preprocessing/clap_embs/pumping_water.npz
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:64bbf3945f3f0c89a71a8d1fb038f914f2def41361f98e20e3d78b8d3435b781
+size 2151
diff --git a/capspeech/nar/data_preprocessing/clap_embs/race_car,_auto_racing.npz b/capspeech/nar/data_preprocessing/clap_embs/race_car,_auto_racing.npz
new file mode 100644
index 0000000000000000000000000000000000000000..2a6d4faa135dfd72ab49bd12ec187d256ce24270
--- /dev/null
+++ b/capspeech/nar/data_preprocessing/clap_embs/race_car,_auto_racing.npz
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:2a13fe5487c7d962bf9cb245cbad03b60f654c87720a0a7db6a2a0a47b329634
+size 2149
diff --git a/capspeech/nar/data_preprocessing/clap_embs/railroad_car,_train_wagon.npz b/capspeech/nar/data_preprocessing/clap_embs/railroad_car,_train_wagon.npz
new file mode 100644
index 0000000000000000000000000000000000000000..a60166af5fdf9d4a2471b010b8d8f47acf150693
--- /dev/null
+++ b/capspeech/nar/data_preprocessing/clap_embs/railroad_car,_train_wagon.npz
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:723e95361cffa62f7b402345c758a9ccbc508fff0c619ece9ce8e6aec573975b
+size 2153
diff --git a/capspeech/nar/data_preprocessing/clap_embs/rain.npz b/capspeech/nar/data_preprocessing/clap_embs/rain.npz
new file mode 100644
index 0000000000000000000000000000000000000000..77edd19da94c7d237bda716a16d1f802b8996c77
--- /dev/null
+++ b/capspeech/nar/data_preprocessing/clap_embs/rain.npz
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:4f986bcdf7911a29552f8bd98083e1ffbb9e9b723bfb1dfa286b1b71bda3fcf2
+size 2151
diff --git a/capspeech/nar/data_preprocessing/clap_embs/raining.npz b/capspeech/nar/data_preprocessing/clap_embs/raining.npz
new file mode 100644
index 0000000000000000000000000000000000000000..4010c9748d96238d3d417c5e4eb3964caede3f69
--- /dev/null
+++ b/capspeech/nar/data_preprocessing/clap_embs/raining.npz
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:db68df764f27c43235d1e00d513105d86a43bbc099abaa4157a73947f353a354
+size 2148
diff --git a/capspeech/nar/data_preprocessing/clap_embs/rapping.npz b/capspeech/nar/data_preprocessing/clap_embs/rapping.npz
new file mode 100644
index 0000000000000000000000000000000000000000..6154d730d6029e49bf06e0c4d3c0a39aee105133
--- /dev/null
+++ b/capspeech/nar/data_preprocessing/clap_embs/rapping.npz
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:ed0ae01d51131bc3e10a1afcb6702f9e955c92653becd08940cc5a23d31c289b
+size 2149
diff --git a/capspeech/nar/data_preprocessing/clap_embs/reversing_beeps.npz b/capspeech/nar/data_preprocessing/clap_embs/reversing_beeps.npz
new file mode 100644
index 0000000000000000000000000000000000000000..901d8d2112e8e2eff9d094e6f8998018c9e25741
--- /dev/null
+++ b/capspeech/nar/data_preprocessing/clap_embs/reversing_beeps.npz
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:9bbd86c50d54ad7f1dfc61470406244924d284da2069b696aa4cf6752a52601b
+size 2148
diff --git a/capspeech/nar/data_preprocessing/clap_embs/ripping_paper.npz b/capspeech/nar/data_preprocessing/clap_embs/ripping_paper.npz
new file mode 100644
index 0000000000000000000000000000000000000000..e82776c0dfaf485901dc19c8e33f162eeed0bec3
--- /dev/null
+++ b/capspeech/nar/data_preprocessing/clap_embs/ripping_paper.npz
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:36aa232472a30fc7c709306a10bbe47d3e6b009f144daf6d2017d2d73b53140a
+size 2159
diff --git a/capspeech/nar/data_preprocessing/clap_embs/roller_coaster_running.npz b/capspeech/nar/data_preprocessing/clap_embs/roller_coaster_running.npz
new file mode 100644
index 0000000000000000000000000000000000000000..295cac96e822c071ef0f08551063c2d3b8eff8d9
--- /dev/null
+++ b/capspeech/nar/data_preprocessing/clap_embs/roller_coaster_running.npz
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:b8ed4314dba807946e99a165732d140ec358c57dedeea589c0e48b1cbb700c85
+size 2151
diff --git a/capspeech/nar/data_preprocessing/clap_embs/rooster.npz b/capspeech/nar/data_preprocessing/clap_embs/rooster.npz
new file mode 100644
index 0000000000000000000000000000000000000000..eefa50e6bbe585a84bda47c76e015ea1bda59326
--- /dev/null
+++ b/capspeech/nar/data_preprocessing/clap_embs/rooster.npz
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:627087243c0dd1a5f39b15b365d7437000dc5cf757f102367bbbabaeb032af65
+size 2155
diff --git a/capspeech/nar/data_preprocessing/clap_embs/rope_skipping.npz b/capspeech/nar/data_preprocessing/clap_embs/rope_skipping.npz
new file mode 100644
index 0000000000000000000000000000000000000000..2a40a60e52d034e46d1d8ee30977e8c54e1615f2
--- /dev/null
+++ b/capspeech/nar/data_preprocessing/clap_embs/rope_skipping.npz
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:5817ec84d02f97ca85a8107d23318195c691c14039e4956f7b15331a519405dd
+size 2146
diff --git a/capspeech/nar/data_preprocessing/clap_embs/rowboat,_canoe,_kayak_rowing.npz b/capspeech/nar/data_preprocessing/clap_embs/rowboat,_canoe,_kayak_rowing.npz
new file mode 100644
index 0000000000000000000000000000000000000000..c6e0ef6df60dd5568c6e6518b2e9290fdcb24684
--- /dev/null
+++ b/capspeech/nar/data_preprocessing/clap_embs/rowboat,_canoe,_kayak_rowing.npz
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:d4a60301fac349e73b870970c3836aa76391af9282dfe7f8da8025182100f68b
+size 2144
diff --git a/capspeech/nar/data_preprocessing/clap_embs/running_electric_fan.npz b/capspeech/nar/data_preprocessing/clap_embs/running_electric_fan.npz
new file mode 100644
index 0000000000000000000000000000000000000000..f74392e9087914f89e5f4eb6d2bffa59b68d37ad
--- /dev/null
+++ b/capspeech/nar/data_preprocessing/clap_embs/running_electric_fan.npz
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:fef6f6a78c5abb618f397bb4794a1c6d9875b80296c10fd5c03c4c6562ebeaa1
+size 2154
diff --git a/capspeech/nar/data_preprocessing/clap_embs/sailing.npz b/capspeech/nar/data_preprocessing/clap_embs/sailing.npz
new file mode 100644
index 0000000000000000000000000000000000000000..2cb64c27bbe4fc09a87dd0852b8c0242598e36c6
--- /dev/null
+++ b/capspeech/nar/data_preprocessing/clap_embs/sailing.npz
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:c526c563dd4fd4204693dc841be82fc9ae4bacef6c644f0a7c704cd8156b8848
+size 2153
diff --git a/capspeech/nar/data_preprocessing/clap_embs/saxophone.npz b/capspeech/nar/data_preprocessing/clap_embs/saxophone.npz
new file mode 100644
index 0000000000000000000000000000000000000000..1a0ffb449bcc3a48c9d590d56900e4b1037910fe
--- /dev/null
+++ b/capspeech/nar/data_preprocessing/clap_embs/saxophone.npz
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:3bd584d02d49efdd94166b0d7804e5a767ab869c286815484b01d59db9369c1d
+size 2148
diff --git a/capspeech/nar/data_preprocessing/clap_embs/scissors.npz b/capspeech/nar/data_preprocessing/clap_embs/scissors.npz
new file mode 100644
index 0000000000000000000000000000000000000000..c70eb86948e62a9755f901642233e27552e2b54d
--- /dev/null
+++ b/capspeech/nar/data_preprocessing/clap_embs/scissors.npz
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:c0ecaf3074368a6e0115b2a1645d7d93132d2cb2418078a7788c2b8403efcb77
+size 2154
diff --git a/capspeech/nar/data_preprocessing/clap_embs/scuba_diving.npz b/capspeech/nar/data_preprocessing/clap_embs/scuba_diving.npz
new file mode 100644
index 0000000000000000000000000000000000000000..d18650d9fdc81ff61ec038e9953998f6bfeceb50
--- /dev/null
+++ b/capspeech/nar/data_preprocessing/clap_embs/scuba_diving.npz
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:af7aaccd5f9cec8045131a68d54d62d3d909441e8d278bab934b356477491af5
+size 2156
diff --git a/capspeech/nar/data_preprocessing/clap_embs/sea_lion_barking.npz b/capspeech/nar/data_preprocessing/clap_embs/sea_lion_barking.npz
new file mode 100644
index 0000000000000000000000000000000000000000..4e0f74fcc791b1f78387af1c00a5fb7d22f339c3
--- /dev/null
+++ b/capspeech/nar/data_preprocessing/clap_embs/sea_lion_barking.npz
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:138777f23c16e8115c293046dd3034fcaa03e690980c64849999c36f849aa4ae
+size 2152
diff --git a/capspeech/nar/data_preprocessing/clap_embs/sea_waves.npz b/capspeech/nar/data_preprocessing/clap_embs/sea_waves.npz
new file mode 100644
index 0000000000000000000000000000000000000000..32be964640f6c15870b5f9d78f05b236aa6f34d9
--- /dev/null
+++ b/capspeech/nar/data_preprocessing/clap_embs/sea_waves.npz
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:41665042087aedf1c37ccf0d41d220c816c999a6390b38b8f29f551d4f96352b
+size 2150
diff --git a/capspeech/nar/data_preprocessing/clap_embs/sharpen_knife.npz b/capspeech/nar/data_preprocessing/clap_embs/sharpen_knife.npz
new file mode 100644
index 0000000000000000000000000000000000000000..4ee184d2d439d88797474e048f9d1fe42175f8c4
--- /dev/null
+++ b/capspeech/nar/data_preprocessing/clap_embs/sharpen_knife.npz
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:c86031646bd4ce7d298de21ee214c827e308598a96bb97e70e88a621d5413ede
+size 2156
diff --git a/capspeech/nar/data_preprocessing/clap_embs/shatter.npz b/capspeech/nar/data_preprocessing/clap_embs/shatter.npz
new file mode 100644
index 0000000000000000000000000000000000000000..de0a0f685b16144d9fe57668df84fe3139846bcb
--- /dev/null
+++ b/capspeech/nar/data_preprocessing/clap_embs/shatter.npz
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:43cf8485d670cb7daf03b89f6e42a27b4541024eb891b83669c7b0de1787a9e3
+size 2149
diff --git a/capspeech/nar/data_preprocessing/clap_embs/sheep.npz b/capspeech/nar/data_preprocessing/clap_embs/sheep.npz
new file mode 100644
index 0000000000000000000000000000000000000000..a30e367c2102b840372a9d15628fb4f0c97792fb
--- /dev/null
+++ b/capspeech/nar/data_preprocessing/clap_embs/sheep.npz
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:cc613a86d5889cc0052c5ebfdd8ee9bb5a41ab8d49ac10d072b02022e51101f3
+size 2155
diff --git a/capspeech/nar/data_preprocessing/clap_embs/sheep_bleating.npz b/capspeech/nar/data_preprocessing/clap_embs/sheep_bleating.npz
new file mode 100644
index 0000000000000000000000000000000000000000..76a079e38a8ee1acbbb7399844336eef5741d1d3
--- /dev/null
+++ b/capspeech/nar/data_preprocessing/clap_embs/sheep_bleating.npz
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:3605ade2e31fd3c1df46e237d3af84ddace0f0e4730d1aee29e88b9ee49bcd4f
+size 2150
diff --git a/capspeech/nar/data_preprocessing/clap_embs/singing_bowl.npz b/capspeech/nar/data_preprocessing/clap_embs/singing_bowl.npz
new file mode 100644
index 0000000000000000000000000000000000000000..f7cd5a08eecc9ca1d8b4b3907129cfef2cdcde5f
--- /dev/null
+++ b/capspeech/nar/data_preprocessing/clap_embs/singing_bowl.npz
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:a1805121491c1df80f88b22a8d6c83f52b28ee20b96da9d07c3346552cf0f915
+size 2157
diff --git a/capspeech/nar/data_preprocessing/clap_embs/singing_choir.npz b/capspeech/nar/data_preprocessing/clap_embs/singing_choir.npz
new file mode 100644
index 0000000000000000000000000000000000000000..ba1298b04ded33ae23c70e94d6735b21bf595196
--- /dev/null
+++ b/capspeech/nar/data_preprocessing/clap_embs/singing_choir.npz
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:0f23c1809f025c1b3510b228f75ef5a1d9e537960d5c0b55acee36539e2c2cff
+size 2151
diff --git a/capspeech/nar/data_preprocessing/clap_embs/siren.npz b/capspeech/nar/data_preprocessing/clap_embs/siren.npz
new file mode 100644
index 0000000000000000000000000000000000000000..e293c96c34d26dcee3b870ebd37d6c6217abfc10
--- /dev/null
+++ b/capspeech/nar/data_preprocessing/clap_embs/siren.npz
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:90cded8e05124607a750cdcad17ae19b375e5ac80deb7eaf54e418306e6fa09c
+size 2156
diff --git a/capspeech/nar/data_preprocessing/clap_embs/skateboarding.npz b/capspeech/nar/data_preprocessing/clap_embs/skateboarding.npz
new file mode 100644
index 0000000000000000000000000000000000000000..c9215c88be78047de39ca7573e375f65ba7d8650
--- /dev/null
+++ b/capspeech/nar/data_preprocessing/clap_embs/skateboarding.npz
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:7b14dfbcb6e92c3e8dcc3cf872c5f9541cdd939ac40e394e37f8a2012f0367b7
+size 2150
diff --git a/capspeech/nar/data_preprocessing/clap_embs/skidding.npz b/capspeech/nar/data_preprocessing/clap_embs/skidding.npz
new file mode 100644
index 0000000000000000000000000000000000000000..b82a194e692bbe4c72836fbac25e64193a49b892
--- /dev/null
+++ b/capspeech/nar/data_preprocessing/clap_embs/skidding.npz
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:e9c4900f693fd39cd67133b8f2976c582d0e04cdc2f4f25ffc413ecfc979c6ab
+size 2150
diff --git a/capspeech/nar/data_preprocessing/clap_embs/skiing.npz b/capspeech/nar/data_preprocessing/clap_embs/skiing.npz
new file mode 100644
index 0000000000000000000000000000000000000000..96903ee2bf83865a01524176c020c4b053917751
--- /dev/null
+++ b/capspeech/nar/data_preprocessing/clap_embs/skiing.npz
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:84d03932eaa5d6e169fce01b5b17a2cf8372d95586261d1fad63b9fa2a35e2af
+size 2153
diff --git a/capspeech/nar/data_preprocessing/clap_embs/sliding_door.npz b/capspeech/nar/data_preprocessing/clap_embs/sliding_door.npz
new file mode 100644
index 0000000000000000000000000000000000000000..a13ca5b5616c995445556da31306fd4bb3e44378
--- /dev/null
+++ b/capspeech/nar/data_preprocessing/clap_embs/sliding_door.npz
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:15e3279d4751f4c4ea7243a49e72195d4d57dd2584abf18ddb20f941c7eecc91
+size 2150
diff --git a/capspeech/nar/data_preprocessing/clap_embs/sloshing_water.npz b/capspeech/nar/data_preprocessing/clap_embs/sloshing_water.npz
new file mode 100644
index 0000000000000000000000000000000000000000..98e9df3b70da56bdc716d2a02fc6ca9f06aadadc
--- /dev/null
+++ b/capspeech/nar/data_preprocessing/clap_embs/sloshing_water.npz
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:d0c5bb69f91d3cfb7927ee92a556327a58aa9819ec626e5a5424bc331d224b0b
+size 2143
diff --git a/capspeech/nar/data_preprocessing/clap_embs/slot_machine.npz b/capspeech/nar/data_preprocessing/clap_embs/slot_machine.npz
new file mode 100644
index 0000000000000000000000000000000000000000..4c3ff865bc17f25aacd46b8466d9d7c80676f1f7
--- /dev/null
+++ b/capspeech/nar/data_preprocessing/clap_embs/slot_machine.npz
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:5116aca9076ebc223e444052a948bb1d39dad1f78486a4e72d1957be6308110c
+size 2155
diff --git a/capspeech/nar/data_preprocessing/clap_embs/smoke_detector_beeping.npz b/capspeech/nar/data_preprocessing/clap_embs/smoke_detector_beeping.npz
new file mode 100644
index 0000000000000000000000000000000000000000..14c1b5961d37c3ded4961c2b7f584552ada5777e
--- /dev/null
+++ b/capspeech/nar/data_preprocessing/clap_embs/smoke_detector_beeping.npz
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:36417f1f30c323b7dc84c6ad58101b3b804f4ccea2b5cbeb336cb79e7aadd40c
+size 2144
diff --git a/capspeech/nar/data_preprocessing/clap_embs/snake_hissing.npz b/capspeech/nar/data_preprocessing/clap_embs/snake_hissing.npz
new file mode 100644
index 0000000000000000000000000000000000000000..79b1a98c82f0e2d6987b742272a0937a19dbb764
--- /dev/null
+++ b/capspeech/nar/data_preprocessing/clap_embs/snake_hissing.npz
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:efc072ba73cd558b98ed1b6978f319b66e4ee0ad90de227bd41328b9095f0466
+size 2149
diff --git a/capspeech/nar/data_preprocessing/clap_embs/snake_rattling.npz b/capspeech/nar/data_preprocessing/clap_embs/snake_rattling.npz
new file mode 100644
index 0000000000000000000000000000000000000000..439267b61c41fcd908f32b3694c1cd51536c91ce
--- /dev/null
+++ b/capspeech/nar/data_preprocessing/clap_embs/snake_rattling.npz
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:1cd8e78462a2d7eba6d766ff8be5d7f7eec48d3b7993da382e1efefb48c3b38e
+size 2154
diff --git a/capspeech/nar/data_preprocessing/clap_embs/snare_drum.npz b/capspeech/nar/data_preprocessing/clap_embs/snare_drum.npz
new file mode 100644
index 0000000000000000000000000000000000000000..4af1d0f244965127aa408d1fb58d66ebdebd2e61
--- /dev/null
+++ b/capspeech/nar/data_preprocessing/clap_embs/snare_drum.npz
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:3328d962b2847b8558142b4f6f0b1a98d48f7a72b93d26dcab7014c3d950ec93
+size 2155
diff --git a/capspeech/nar/data_preprocessing/clap_embs/sneezing.npz b/capspeech/nar/data_preprocessing/clap_embs/sneezing.npz
new file mode 100644
index 0000000000000000000000000000000000000000..35833420ac03d4342981ba4e8f894b805b83d81c
--- /dev/null
+++ b/capspeech/nar/data_preprocessing/clap_embs/sneezing.npz
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:6ae8409fb88ec2b4b4ed3fbcd235f2647f5509f79cc4b5d7cf16b31a756deb50
+size 2153
diff --git a/capspeech/nar/data_preprocessing/clap_embs/snoring.npz b/capspeech/nar/data_preprocessing/clap_embs/snoring.npz
new file mode 100644
index 0000000000000000000000000000000000000000..e2ec6bb7b8dab55824b2034fda81efbf0453c63b
--- /dev/null
+++ b/capspeech/nar/data_preprocessing/clap_embs/snoring.npz
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:86a5a6aa68723fce6a5c03f9baaa1bb9d30203756dac47b64d835730f2dee24b
+size 2151
diff --git a/capspeech/nar/data_preprocessing/clap_embs/splashing_water.npz b/capspeech/nar/data_preprocessing/clap_embs/splashing_water.npz
new file mode 100644
index 0000000000000000000000000000000000000000..cda10c9e4b1b37c47d0ba55837195514e5cb336a
--- /dev/null
+++ b/capspeech/nar/data_preprocessing/clap_embs/splashing_water.npz
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:57c79602b60bb082833001c62d6be5d7998e2aa677b2aafc93688ebc1933e804
+size 2157
diff --git a/capspeech/nar/data_preprocessing/clap_embs/spraying_water.npz b/capspeech/nar/data_preprocessing/clap_embs/spraying_water.npz
new file mode 100644
index 0000000000000000000000000000000000000000..646573179083a81effa36f32b8e5f56984484c71
--- /dev/null
+++ b/capspeech/nar/data_preprocessing/clap_embs/spraying_water.npz
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:b890612dce46aec1caefc10ca52b480f0c68e70fa0eea788e53477c0d3f89b36
+size 2159
diff --git a/capspeech/nar/data_preprocessing/clap_embs/squeak.npz b/capspeech/nar/data_preprocessing/clap_embs/squeak.npz
new file mode 100644
index 0000000000000000000000000000000000000000..8306d7f1bd170bd2d412a3e1bb3d070b8c717e21
--- /dev/null
+++ b/capspeech/nar/data_preprocessing/clap_embs/squeak.npz
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:1bf6e61a0a55c951887235a821986eadcef4cc04547bf0dd902551cfae6afab5
+size 2155
diff --git a/capspeech/nar/data_preprocessing/clap_embs/squishing_water.npz b/capspeech/nar/data_preprocessing/clap_embs/squishing_water.npz
new file mode 100644
index 0000000000000000000000000000000000000000..17ce7ca7c9468bace968490164fba40851937def
--- /dev/null
+++ b/capspeech/nar/data_preprocessing/clap_embs/squishing_water.npz
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:d3880da1648c31bfb3d200ab45d65fb232e00c82078ae12b13360c7ad78834b7
+size 2152
diff --git a/capspeech/nar/data_preprocessing/clap_embs/stream_burbling.npz b/capspeech/nar/data_preprocessing/clap_embs/stream_burbling.npz
new file mode 100644
index 0000000000000000000000000000000000000000..c49f171746d64f66090f449ac0a91f536d66e732
--- /dev/null
+++ b/capspeech/nar/data_preprocessing/clap_embs/stream_burbling.npz
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:6135fdde515b75e02ae3cc4785a1ed54ea556f0f0c9abd28e25b27a2430ae1ff
+size 2151
diff --git a/capspeech/nar/data_preprocessing/clap_embs/strike_lighter.npz b/capspeech/nar/data_preprocessing/clap_embs/strike_lighter.npz
new file mode 100644
index 0000000000000000000000000000000000000000..a65a11d373051124691629dd1ac0c94f76acab83
--- /dev/null
+++ b/capspeech/nar/data_preprocessing/clap_embs/strike_lighter.npz
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:3f94fe14b17e8b9a6a44522d16c4c6b7ce1a672da655e5e085fdd176ae8cfd0a
+size 2149
diff --git a/capspeech/nar/data_preprocessing/clap_embs/striking_bowling.npz b/capspeech/nar/data_preprocessing/clap_embs/striking_bowling.npz
new file mode 100644
index 0000000000000000000000000000000000000000..51b1d4614c54719706e8cd5102dbf2fc9bcb48f6
--- /dev/null
+++ b/capspeech/nar/data_preprocessing/clap_embs/striking_bowling.npz
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:3b285457e5b71745fb604d17fe5a702670c347b677b7ebf0fac7e98afc4bf079
+size 2158
diff --git a/capspeech/nar/data_preprocessing/clap_embs/striking_pool.npz b/capspeech/nar/data_preprocessing/clap_embs/striking_pool.npz
new file mode 100644
index 0000000000000000000000000000000000000000..3eaa5c73db869c32f74898c93690877da31ea5a9
--- /dev/null
+++ b/capspeech/nar/data_preprocessing/clap_embs/striking_pool.npz
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:3390ded576e00300ac2a4d90ff49bceec94ce3d4177f364ebb841027a796420d
+size 2149
diff --git a/capspeech/nar/data_preprocessing/clap_embs/subway,_metro,_underground.npz b/capspeech/nar/data_preprocessing/clap_embs/subway,_metro,_underground.npz
new file mode 100644
index 0000000000000000000000000000000000000000..07a81b12bd53041a0a4792019b5d5a504e49d03c
--- /dev/null
+++ b/capspeech/nar/data_preprocessing/clap_embs/subway,_metro,_underground.npz
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:4187df02d01a467ff0026ed11d79d868ade26a1735bca2be861dbe1f4d5419f6
+size 2158
diff --git a/capspeech/nar/data_preprocessing/clap_embs/swimming.npz b/capspeech/nar/data_preprocessing/clap_embs/swimming.npz
new file mode 100644
index 0000000000000000000000000000000000000000..8fb0d8891ad4e15a1355c79212aa5ba3b7b4b9aa
--- /dev/null
+++ b/capspeech/nar/data_preprocessing/clap_embs/swimming.npz
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:9e13a41485589587245eaa06cbee70a1db78f15f5f26a971b5586a1441f9923f
+size 2156
diff --git a/capspeech/nar/data_preprocessing/clap_embs/tambourine.npz b/capspeech/nar/data_preprocessing/clap_embs/tambourine.npz
new file mode 100644
index 0000000000000000000000000000000000000000..ff760cce61586beccf24f18db0e35325f8783033
--- /dev/null
+++ b/capspeech/nar/data_preprocessing/clap_embs/tambourine.npz
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:41fd31f0e69d003c221325114509979a16adf3572f8e9ee1a1a4fdf696e34150
+size 2157
diff --git a/capspeech/nar/data_preprocessing/clap_embs/tap_dancing.npz b/capspeech/nar/data_preprocessing/clap_embs/tap_dancing.npz
new file mode 100644
index 0000000000000000000000000000000000000000..4bd6b8a2da759bece88c1c2be949d9470a0cd159
--- /dev/null
+++ b/capspeech/nar/data_preprocessing/clap_embs/tap_dancing.npz
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:09121a336a43e65cbfb7c5edd600ba8582544a22278329f167a403454552505c
+size 2150
diff --git a/capspeech/nar/data_preprocessing/clap_embs/tapping_guitar.npz b/capspeech/nar/data_preprocessing/clap_embs/tapping_guitar.npz
new file mode 100644
index 0000000000000000000000000000000000000000..1ef51a6273a5f33cc7c2b25b5f0a195bad9b0a49
--- /dev/null
+++ b/capspeech/nar/data_preprocessing/clap_embs/tapping_guitar.npz
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:9653883401ca56a8f5827661a02bc47ffa967ea2ef78a0aca465e2e58d81a32b
+size 2150
diff --git a/capspeech/nar/data_preprocessing/clap_embs/tearing.npz b/capspeech/nar/data_preprocessing/clap_embs/tearing.npz
new file mode 100644
index 0000000000000000000000000000000000000000..e521c154a19b38db56ca6cd2cdab92c18d7140e2
--- /dev/null
+++ b/capspeech/nar/data_preprocessing/clap_embs/tearing.npz
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:93b8127a861115bb37c20a5362f578ee940fef72da2fa523d84ef621b0505852
+size 2147
diff --git a/capspeech/nar/data_preprocessing/clap_embs/telephone.npz b/capspeech/nar/data_preprocessing/clap_embs/telephone.npz
new file mode 100644
index 0000000000000000000000000000000000000000..0e11aaadec299fb0f6d5060d8cf9e2206172cd2a
--- /dev/null
+++ b/capspeech/nar/data_preprocessing/clap_embs/telephone.npz
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:978a22c94ce7c57aad84a17abbab50c94f1b61f78f78810778a86314c5739d1a
+size 2149
diff --git a/capspeech/nar/data_preprocessing/clap_embs/telephone_bell_ringing.npz b/capspeech/nar/data_preprocessing/clap_embs/telephone_bell_ringing.npz
new file mode 100644
index 0000000000000000000000000000000000000000..58dc215674a5d78da1deb1585df88fb0a90c3805
--- /dev/null
+++ b/capspeech/nar/data_preprocessing/clap_embs/telephone_bell_ringing.npz
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:0cdbbf5c2a9f67fb6186e9a74a491025ee7006cf1803cc1f857d1d05fb1b0d73
+size 2150
diff --git a/capspeech/nar/data_preprocessing/clap_embs/thunder.npz b/capspeech/nar/data_preprocessing/clap_embs/thunder.npz
new file mode 100644
index 0000000000000000000000000000000000000000..8e2b610f7fe0dc41a679f916344cef97bbee8cff
--- /dev/null
+++ b/capspeech/nar/data_preprocessing/clap_embs/thunder.npz
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:c221ccce3d0f3b3eb037d358504bd304b47f980325eddb06209ae8c967ac5a5c
+size 2150
diff --git a/capspeech/nar/data_preprocessing/clap_embs/thunderstorm.npz b/capspeech/nar/data_preprocessing/clap_embs/thunderstorm.npz
new file mode 100644
index 0000000000000000000000000000000000000000..3390dc79af52eedd85b032760c8fc4c4d06de930
--- /dev/null
+++ b/capspeech/nar/data_preprocessing/clap_embs/thunderstorm.npz
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:5ed193cd09dd82ccf64059c690395eecbcd70f41e965a4a232ea87e927fffe68
+size 2151
diff --git a/capspeech/nar/data_preprocessing/clap_embs/toilet_flush.npz b/capspeech/nar/data_preprocessing/clap_embs/toilet_flush.npz
new file mode 100644
index 0000000000000000000000000000000000000000..1f2b52f1a3102d29ce788e6022a1126319bcab1a
--- /dev/null
+++ b/capspeech/nar/data_preprocessing/clap_embs/toilet_flush.npz
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:cf159299a1c10e59ac913cee56bfa290e34a46512fc6ebed9d783e0eabb52098
+size 2153
diff --git a/capspeech/nar/data_preprocessing/clap_embs/toilet_flushing.npz b/capspeech/nar/data_preprocessing/clap_embs/toilet_flushing.npz
new file mode 100644
index 0000000000000000000000000000000000000000..609d261e197cc4684101072d8f03df84d69a2563
--- /dev/null
+++ b/capspeech/nar/data_preprocessing/clap_embs/toilet_flushing.npz
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:da9cca53812b7482374c035a91d27a2eb071f64ba918462aadddd1d2e1fed4ae
+size 2153
diff --git a/capspeech/nar/data_preprocessing/clap_embs/tornado_roaring.npz b/capspeech/nar/data_preprocessing/clap_embs/tornado_roaring.npz
new file mode 100644
index 0000000000000000000000000000000000000000..5d0f2f5332fb8bf1e59797c496747792eb816484
--- /dev/null
+++ b/capspeech/nar/data_preprocessing/clap_embs/tornado_roaring.npz
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:75b5005fe18bb0308b929c8f00ee9661fb2dc0c0c8fcb163a1e253ef7943f1e2
+size 2152
diff --git a/capspeech/nar/data_preprocessing/clap_embs/tractor_digging.npz b/capspeech/nar/data_preprocessing/clap_embs/tractor_digging.npz
new file mode 100644
index 0000000000000000000000000000000000000000..eb616c80ac7b96d153ce0997957c38bc265e6374
--- /dev/null
+++ b/capspeech/nar/data_preprocessing/clap_embs/tractor_digging.npz
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:9d88aff6d3797ac2a440267f8bb49ce73fbba5a21d6823d6eb24e12e53f32cc9
+size 2151
diff --git a/capspeech/nar/data_preprocessing/clap_embs/train.npz b/capspeech/nar/data_preprocessing/clap_embs/train.npz
new file mode 100644
index 0000000000000000000000000000000000000000..8d7410d90f1ace9eb10e3eee11d1459357990523
--- /dev/null
+++ b/capspeech/nar/data_preprocessing/clap_embs/train.npz
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:9395921764e0f2e7d1e04c6bc9b14dc1b8198e727f45612182796d40cbc06402
+size 2154
diff --git a/capspeech/nar/data_preprocessing/clap_embs/train_horning.npz b/capspeech/nar/data_preprocessing/clap_embs/train_horning.npz
new file mode 100644
index 0000000000000000000000000000000000000000..ce948d0de864122ea5f2811c2acb30d9355a77e7
--- /dev/null
+++ b/capspeech/nar/data_preprocessing/clap_embs/train_horning.npz
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:0c78aeddf56ddc3b08e0eb529ec2bc50448e167236cd139f0aa3769bc6219e37
+size 2152
diff --git a/capspeech/nar/data_preprocessing/clap_embs/train_wheels_squealing.npz b/capspeech/nar/data_preprocessing/clap_embs/train_wheels_squealing.npz
new file mode 100644
index 0000000000000000000000000000000000000000..c918a963db97aa258599acad401ef16ee8b3806a
--- /dev/null
+++ b/capspeech/nar/data_preprocessing/clap_embs/train_wheels_squealing.npz
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:ad3aff43cabb8467c944006151d1db9c363d8bff4656a0485edba6328e9c7915
+size 2156
diff --git a/capspeech/nar/data_preprocessing/clap_embs/train_whistling.npz b/capspeech/nar/data_preprocessing/clap_embs/train_whistling.npz
new file mode 100644
index 0000000000000000000000000000000000000000..f576e25cc47508483cdb8b22ad69bbe8189c0f2f
--- /dev/null
+++ b/capspeech/nar/data_preprocessing/clap_embs/train_whistling.npz
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:c77ba2cd54420d2d06ae964dc48271af58c6019ce2715abc53f46a4d59776aae
+size 2150
diff --git a/capspeech/nar/data_preprocessing/clap_embs/trumpet.npz b/capspeech/nar/data_preprocessing/clap_embs/trumpet.npz
new file mode 100644
index 0000000000000000000000000000000000000000..93a2e38f2858f0f020155daaf30004e5d64cdf4a
--- /dev/null
+++ b/capspeech/nar/data_preprocessing/clap_embs/trumpet.npz
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:61197308a56c6d4702295ee96c2bd722a1bf909290747664eef7d230d1043f5b
+size 2145
diff --git a/capspeech/nar/data_preprocessing/clap_embs/turkey_gobbling.npz b/capspeech/nar/data_preprocessing/clap_embs/turkey_gobbling.npz
new file mode 100644
index 0000000000000000000000000000000000000000..30119fb9c8b5eaaa48af4bc8275f133d4557613d
--- /dev/null
+++ b/capspeech/nar/data_preprocessing/clap_embs/turkey_gobbling.npz
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:bb570cb313add17fb103e106a45e9a23a69ef014f8680b5dfdcb7284bebea69c
+size 2154
diff --git a/capspeech/nar/data_preprocessing/clap_embs/typing_on_computer_keyboard.npz b/capspeech/nar/data_preprocessing/clap_embs/typing_on_computer_keyboard.npz
new file mode 100644
index 0000000000000000000000000000000000000000..983da937cc979e5c1e3663b07a6dbcdd67046f1e
--- /dev/null
+++ b/capspeech/nar/data_preprocessing/clap_embs/typing_on_computer_keyboard.npz
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:25050c5103481e5a9a6de5ae1db637053d02a22eef7736563b77cc845f09c902
+size 2153
diff --git a/capspeech/nar/data_preprocessing/clap_embs/typing_on_typewriter.npz b/capspeech/nar/data_preprocessing/clap_embs/typing_on_typewriter.npz
new file mode 100644
index 0000000000000000000000000000000000000000..306d60752c24c31468d3c17a22c8b109a8932dfc
--- /dev/null
+++ b/capspeech/nar/data_preprocessing/clap_embs/typing_on_typewriter.npz
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:7a17e6b00fbf4fe97e52bba26b7a631e8eecf93391e11c05d1fa1ede39e2b25a
+size 2151
diff --git a/capspeech/nar/data_preprocessing/clap_embs/underwater_bubbling.npz b/capspeech/nar/data_preprocessing/clap_embs/underwater_bubbling.npz
new file mode 100644
index 0000000000000000000000000000000000000000..947f5cc9b46953a7dd2790a21bfe0a69be54afcb
--- /dev/null
+++ b/capspeech/nar/data_preprocessing/clap_embs/underwater_bubbling.npz
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:1daeff461008caab7f7e0a1b09a31a66425fceecd16a15ec723ab3ed4220ca1e
+size 2146
diff --git a/capspeech/nar/data_preprocessing/clap_embs/using_sewing_machines.npz b/capspeech/nar/data_preprocessing/clap_embs/using_sewing_machines.npz
new file mode 100644
index 0000000000000000000000000000000000000000..6019258c9492002fbe7a4a14028440597f7f0cbd
--- /dev/null
+++ b/capspeech/nar/data_preprocessing/clap_embs/using_sewing_machines.npz
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:c1be4460821657a6619aa60e55eb0b1f62c50c1d56437f961ed5e5b0b6d7fcf9
+size 2153
diff --git a/capspeech/nar/data_preprocessing/clap_embs/vacuum_cleaner.npz b/capspeech/nar/data_preprocessing/clap_embs/vacuum_cleaner.npz
new file mode 100644
index 0000000000000000000000000000000000000000..f20301a2e8dfc3e131db53bf47f46c7d135e57c9
--- /dev/null
+++ b/capspeech/nar/data_preprocessing/clap_embs/vacuum_cleaner.npz
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:7d12a02ea32a392caf3dc80b9b7fc9589998e4b0ac2c4c6b79b993860465dbe6
+size 2149
diff --git a/capspeech/nar/data_preprocessing/clap_embs/vacuum_cleaner_cleaning_floors.npz b/capspeech/nar/data_preprocessing/clap_embs/vacuum_cleaner_cleaning_floors.npz
new file mode 100644
index 0000000000000000000000000000000000000000..6bd1e20d75aefdecdae5cd6a5150735bae998bd4
--- /dev/null
+++ b/capspeech/nar/data_preprocessing/clap_embs/vacuum_cleaner_cleaning_floors.npz
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:a3724f1ff9e34aefd31dc34deb44b72ebd490e1a723d60ec226e9e5a758ad591
+size 2148
diff --git a/capspeech/nar/data_preprocessing/clap_embs/vehicle_horn,_car_horn,_honking.npz b/capspeech/nar/data_preprocessing/clap_embs/vehicle_horn,_car_horn,_honking.npz
new file mode 100644
index 0000000000000000000000000000000000000000..d30fb6a6ffb6762cfa08d1e3206efdc2e6054c1b
--- /dev/null
+++ b/capspeech/nar/data_preprocessing/clap_embs/vehicle_horn,_car_horn,_honking.npz
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:eeba31b03c9a559e7d19bc243987da3c46a7dec92cc28c9ef06eaafdc4eb8999
+size 2162
diff --git a/capspeech/nar/data_preprocessing/clap_embs/violin_or_fiddle.npz b/capspeech/nar/data_preprocessing/clap_embs/violin_or_fiddle.npz
new file mode 100644
index 0000000000000000000000000000000000000000..67eb0a4565631d3b938dd84310d5e50d7c6e66f8
--- /dev/null
+++ b/capspeech/nar/data_preprocessing/clap_embs/violin_or_fiddle.npz
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:c1afcc9ed50b55c5b1c56fd28abfc4b1896a6d2ad58548ac76d4101fdc7b8262
+size 2153
diff --git a/capspeech/nar/data_preprocessing/clap_embs/volcano_explosion.npz b/capspeech/nar/data_preprocessing/clap_embs/volcano_explosion.npz
new file mode 100644
index 0000000000000000000000000000000000000000..6319d239bf9ce66f5349d7ff99287aaef15c0b56
--- /dev/null
+++ b/capspeech/nar/data_preprocessing/clap_embs/volcano_explosion.npz
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:2e792241e9893b41c218350b5f6ba17d502df0663d5fd27394983c6d577ceb5f
+size 2152
diff --git a/capspeech/nar/data_preprocessing/clap_embs/warbler_chirping.npz b/capspeech/nar/data_preprocessing/clap_embs/warbler_chirping.npz
new file mode 100644
index 0000000000000000000000000000000000000000..ca972ae3f08047bd206de4a96cf16f44a924fc96
--- /dev/null
+++ b/capspeech/nar/data_preprocessing/clap_embs/warbler_chirping.npz
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:6649744cef8022002c5d155dfdb88f0ff1f2f68dd6c35b36e5f9e4b2ce609698
+size 2156
diff --git a/capspeech/nar/data_preprocessing/clap_embs/washing_machine.npz b/capspeech/nar/data_preprocessing/clap_embs/washing_machine.npz
new file mode 100644
index 0000000000000000000000000000000000000000..1a638cd659367ca292fc4c7d0974738a12da0bc6
--- /dev/null
+++ b/capspeech/nar/data_preprocessing/clap_embs/washing_machine.npz
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:9b6cbd02f3a3a9fa478c288f16b58219c746a530b748e90b14c5963259dfea96
+size 2149
diff --git a/capspeech/nar/data_preprocessing/clap_embs/water_drops.npz b/capspeech/nar/data_preprocessing/clap_embs/water_drops.npz
new file mode 100644
index 0000000000000000000000000000000000000000..804f2d260eac772fa75f9199e0c204078af4432d
--- /dev/null
+++ b/capspeech/nar/data_preprocessing/clap_embs/water_drops.npz
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:71c4999abb1c1ea93aeca2b820d904be6f91b3c3bd803f68494fd7fc1ef5c2aa
+size 2145
diff --git a/capspeech/nar/data_preprocessing/clap_embs/waterfall_burbling.npz b/capspeech/nar/data_preprocessing/clap_embs/waterfall_burbling.npz
new file mode 100644
index 0000000000000000000000000000000000000000..995829c5e2338ddbf3e3a46e7c336dc1b93edece
--- /dev/null
+++ b/capspeech/nar/data_preprocessing/clap_embs/waterfall_burbling.npz
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:01170535cef0b639abdaa2fe33b72341c0b7cb6db1fff76d999b3c80cc50254e
+size 2146
diff --git a/capspeech/nar/data_preprocessing/clap_embs/whale_calling.npz b/capspeech/nar/data_preprocessing/clap_embs/whale_calling.npz
new file mode 100644
index 0000000000000000000000000000000000000000..eac46c55b03e4230eec196969912bc02d78c2f41
--- /dev/null
+++ b/capspeech/nar/data_preprocessing/clap_embs/whale_calling.npz
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:41b60023e76785e351412ef14dccee1138f2c2a02a60ee001d3895959be070b9
+size 2158
diff --git a/capspeech/nar/data_preprocessing/clap_embs/wind.npz b/capspeech/nar/data_preprocessing/clap_embs/wind.npz
new file mode 100644
index 0000000000000000000000000000000000000000..ae0c69e8dd5db685928fe4c670bdeec3eca58fb5
--- /dev/null
+++ b/capspeech/nar/data_preprocessing/clap_embs/wind.npz
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:f93b7aee43159c19ac4d83cf02cfb497a39a52340ac0e8fb2820140422a9dc41
+size 2152
diff --git a/capspeech/nar/data_preprocessing/clap_embs/wind_chime.npz b/capspeech/nar/data_preprocessing/clap_embs/wind_chime.npz
new file mode 100644
index 0000000000000000000000000000000000000000..164d0573c166cef68f792fe9e1823ff34e474acf
--- /dev/null
+++ b/capspeech/nar/data_preprocessing/clap_embs/wind_chime.npz
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:4dc22de2548444ec09f0f625f132552df82783f6a9fbd166d32aceb447f49e8e
+size 2148
diff --git a/capspeech/nar/data_preprocessing/clap_embs/wind_noise.npz b/capspeech/nar/data_preprocessing/clap_embs/wind_noise.npz
new file mode 100644
index 0000000000000000000000000000000000000000..804b9a2880e1092e10155484b3963aa8715f092d
--- /dev/null
+++ b/capspeech/nar/data_preprocessing/clap_embs/wind_noise.npz
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:f30efcfad942d3896af57745a9d731004c725b09bb876981ba1b2d80ce04f7f3
+size 2153
diff --git a/capspeech/nar/data_preprocessing/clap_embs/wind_rustling_leaves.npz b/capspeech/nar/data_preprocessing/clap_embs/wind_rustling_leaves.npz
new file mode 100644
index 0000000000000000000000000000000000000000..b3d0e1e4454bdcf117fa99b6c7ac26831e6341f6
--- /dev/null
+++ b/capspeech/nar/data_preprocessing/clap_embs/wind_rustling_leaves.npz
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:d9ee2bd7c7a0ba3492dfecff404312ca1e4f243375177bccd7d56be08199b59e
+size 2139
diff --git a/capspeech/nar/data_preprocessing/clap_embs/wood_thrush_calling.npz b/capspeech/nar/data_preprocessing/clap_embs/wood_thrush_calling.npz
new file mode 100644
index 0000000000000000000000000000000000000000..1d4b3c9cbb7ccdfc075bb2406bc0480dd42b47f8
--- /dev/null
+++ b/capspeech/nar/data_preprocessing/clap_embs/wood_thrush_calling.npz
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:807f1efe3ed0cecbdaf11ca890c44c86e15f4824bd8777654493a682a5b62c52
+size 2150
diff --git a/capspeech/nar/data_preprocessing/clap_embs/woodpecker_pecking_tree.npz b/capspeech/nar/data_preprocessing/clap_embs/woodpecker_pecking_tree.npz
new file mode 100644
index 0000000000000000000000000000000000000000..5e67c9c5d0c2a8831f60e63534daf6171313bb24
--- /dev/null
+++ b/capspeech/nar/data_preprocessing/clap_embs/woodpecker_pecking_tree.npz
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:9f574829132e14b79cb1af432a761c03ee0a97a4964699cddda43c674b3f42e4
+size 2153
diff --git a/capspeech/nar/data_preprocessing/clap_embs/writing.npz b/capspeech/nar/data_preprocessing/clap_embs/writing.npz
new file mode 100644
index 0000000000000000000000000000000000000000..fb7ff62c571691fb5805162ea06e47d9b73738ec
--- /dev/null
+++ b/capspeech/nar/data_preprocessing/clap_embs/writing.npz
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:257a723021474bc36583a5172cadaf07b14e70b6d13b3ccfcabc33281308cdd8
+size 2145
diff --git a/capspeech/nar/data_preprocessing/clap_embs/writing_on_blackboard_with_chalk.npz b/capspeech/nar/data_preprocessing/clap_embs/writing_on_blackboard_with_chalk.npz
new file mode 100644
index 0000000000000000000000000000000000000000..fdcfebf9c455d9bc88a0cf17f488c33ada6fee8b
--- /dev/null
+++ b/capspeech/nar/data_preprocessing/clap_embs/writing_on_blackboard_with_chalk.npz
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:3aa60cebfcb09ba0c7a6d1301ab0916e6647a2ec1e4faeaa280c9b824a50c745
+size 2156
diff --git a/capspeech/nar/data_preprocessing/clap_embs/yodelling.npz b/capspeech/nar/data_preprocessing/clap_embs/yodelling.npz
new file mode 100644
index 0000000000000000000000000000000000000000..786a5bd376a017ead1124be57a5a6e3d943c1d5a
--- /dev/null
+++ b/capspeech/nar/data_preprocessing/clap_embs/yodelling.npz
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:abffa354a927e136f575bd88e4eec3f72c295db613783bd9ddb35cf5255fd87a
+size 2153
diff --git a/capspeech/nar/data_preprocessing/clap_embs/zebra_braying.npz b/capspeech/nar/data_preprocessing/clap_embs/zebra_braying.npz
new file mode 100644
index 0000000000000000000000000000000000000000..5dddc6c743901f448b755e736684d148ed0fc18a
--- /dev/null
+++ b/capspeech/nar/data_preprocessing/clap_embs/zebra_braying.npz
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:01b0c4e12f32267047fe65add4133380d0fb1bba89550a1bb045f7e6fce6ef2c
+size 2150
diff --git a/capspeech/nar/data_preprocessing/events.txt b/capspeech/nar/data_preprocessing/events.txt
new file mode 100644
index 0000000000000000000000000000000000000000..e185a9f1ecf87450b8a8938d20a4b39b5e4a278d
--- /dev/null
+++ b/capspeech/nar/data_preprocessing/events.txt
@@ -0,0 +1,395 @@
+people whispering
+Microwave oven
+extending ladders
+mosquito buzzing
+dog whimpering
+coyote howling
+hair dryer drying
+Writing
+rapping
+machine gun shooting
+dog bow-wow
+dog howling
+barn swallow calling
+baby babbling
+Fireworks
+church bell ringing
+car horn
+cat caterwauling
+subway, metro, underground
+waterfall burbling
+lions roaring
+toilet flushing
+skateboarding
+wind
+ripping paper
+vacuum cleaner cleaning floors
+mouse squeaking
+keyboard typing
+playing timpani
+playing harp
+sheep bleating
+eletric blender running
+people slapping
+playing ukulele
+frog
+car engine knocking
+cat purring
+chainsaw
+Violin or fiddle
+people hiccup
+playing acoustic guitar
+donkey, ass braying
+playing french horn
+playing squash
+gibbon howling
+playing harmonica
+playing shofar
+hedge trimmer running
+playing washboard
+running electric fan
+splashing water
+playing bassoon
+people slurping
+playing accordion
+playing oboe
+popping popcorn
+glass breaking
+alarm clock ringing
+mouse click
+Laughter
+magpie calling
+playing snare drum
+people finger snapping
+ferret dooking
+tornado roaring
+Hi-hat
+lawn mowing
+church bells
+cat growling
+cheetah chirrup
+heart sounds, heartbeat
+firing muskets
+vehicle horn, car horn, honking
+turkey gobbling
+ice cream truck, ice cream van
+underwater bubbling
+footsteps on snow
+water drops
+people sobbing
+basketball bounce
+Applause
+playing sitar
+playing gong
+train
+coughing
+people screaming
+Gunshot or gunfire
+chinchilla barking
+cat hissing
+horse clip-clop
+engine
+people battle cry
+typing on computer keyboard
+playing clarinet
+driving motorcycle
+male singing
+singing bowl
+skiing
+driving buses
+alligators, crocodiles hissing
+people eating apple
+door slamming
+Flute
+raining
+Electric piano
+sliding door
+washing machine
+opening or closing car electric windows
+baby crying
+people babbling
+snake hissing
+brushing teeth
+playing tambourine
+Acoustic guitar
+clock tick
+playing castanets
+thunder
+playing didgeridoo
+playing synthesizer
+mouse clicking
+lathe spinning
+spraying water
+hen
+stream burbling
+door wood creaks
+sailing
+dog
+car engine idling
+bowling impact
+driving snowmobile
+toilet flush
+bird squawking
+playing timbales
+playing drum kit
+owl hooting
+striking pool
+Oboe
+duck quacking
+people belly laughing
+lighting firecrackers
+roller coaster running
+blowtorch igniting
+wood thrush calling
+Glockenspiel
+frog croaking
+playing harpsichord
+train horning
+plastic bottle crushing
+playing tabla
+fire crackling
+dog barking
+thunderstorm
+playing banjo
+swimming
+volcano explosion
+playing table tennis
+sea lion barking
+rowboat, canoe, kayak rowing
+Meow
+pouring water
+playing tympani
+rooster
+siren
+parrot talking
+Finger snapping
+playing steel guitar, slide guitar
+Trumpet
+tractor digging
+people coughing
+cat meowing
+Snare drum
+playing erhu
+crow cawing
+playing djembe
+whale calling
+mynah bird singing
+playing tennis
+chopping food
+golf driving
+tapping guitar
+playing cello
+dog growling
+elephant trumpeting
+sea waves
+police radio chatter
+lions growling
+playing lacrosse
+children shouting
+missile launch
+baby laughter
+air conditioning noise
+playing saxophone
+typing on typewriter
+printer printing
+race car, auto racing
+Bus
+pigeon, dove cooing
+playing violin, fiddle
+Double bass
+striking bowling
+fireworks banging
+Harmonica
+playing glockenspiel
+reversing beeps
+playing piano
+breathing
+people marching
+electric shaver, electric razor shaving
+chimpanzee pant-hooting
+cricket chirping
+bird chirping, tweeting
+using sewing machines
+crickets
+cow lowing
+playing cymbal
+vacuum cleaner
+playing zither
+train whistling
+goat bleating
+eating with cutlery
+black capped chickadee calling
+ambulance siren
+playing hockey
+dog baying
+Burping or eructation
+cupboard opening or closing
+air horn
+crying baby
+people eating crisps
+sloshing water
+goose honking
+orchestra
+people giggling
+warbler chirping
+child singing
+dinosaurs bellowing
+motorboat, speedboat acceleration
+airplane
+chicken clucking
+woodpecker pecking tree
+Drawer open or close
+people eating
+drinking sipping
+singing choir
+playing bass guitar
+playing bass drum
+car passing by
+playing tuning fork
+Squeak
+pig oinking
+Computer keyboard
+yodelling
+playing trombone
+clapping
+people sneezing
+pheasant crowing
+writing on blackboard with chalk
+Tambourine
+opening or closing car doors
+sharpen knife
+people whistling
+fireworks
+playing bagpipes
+chainsawing trees
+squishing water
+people farting
+playing electric guitar
+people booing
+female singing
+ocean burbling
+cattle mooing
+footsteps
+Knock
+wind rustling leaves
+cattle, bovinae cowbell
+Clarinet
+police car (siren)
+Fart
+cat
+sheep
+chopping wood
+tap dancing
+playing mandolin
+wind chime
+can opening
+playing hammond organ
+zebra braying
+scuba diving
+chirping birds
+playing steelpan
+playing theremin
+Keys jangling
+beat boxing
+firing cannon
+bouncing on trampoline
+door wood knock
+bathroom ventilation fan running
+snake rattling
+bull bellowing
+electric grinder grinding
+penguins braying
+otter growling
+civil defense siren
+wind noise
+people humming
+clock alarm
+disc scratching
+fire truck siren
+telephone bell ringing
+people sniggering
+playing bongo
+cap gun shooting
+opening or closing drawers
+cow
+hammering nails
+ice cracking
+foghorn
+rain
+playing badminton
+eagle screaming
+playing double bass
+insects
+people running
+planing timber
+cutting hair with electric trimmers
+Cello
+people clapping
+smoke detector beeping
+mouse pattering
+bee, wasp, etc. buzzing
+canary calling
+people burping
+Shatter
+baltimore oriole calling
+cuckoo bird calling
+snoring
+strike lighter
+people cheering
+playing bugle
+playing congas
+playing vibraphone
+hail
+rope skipping
+playing trumpet
+pig
+hand saw
+people gargling
+Scissors
+metronome
+chipmunk chirping
+playing flute
+fox barking
+crackling fire
+playing volleyball
+skidding
+Bass drum
+crow
+elk bugling
+Telephone
+Bark
+chicken crowing
+people nose blowing
+car engine starting
+pumping water
+Saxophone
+fly, housefly buzzing
+Cough
+people eating noodle
+francolin calling
+arc welding
+horse neighing
+Tearing
+helicopter
+playing electronic organ
+Cowbell
+railroad car, train wagon
+cell phone buzzing
+playing cornet
+sneezing
+engine accelerating, revving, vroom
+bird wings flapping
+playing marimba, xylophone
+playing guiro
+people crowd
+train wheels squealing
+slot machine
+laughing
+lip smacking
+forging swords
+Chime
+playing darts
+people shuffling
+Gong
+airplane flyby
+None
\ No newline at end of file
diff --git a/capspeech/nar/data_preprocessing/filemaker.py b/capspeech/nar/data_preprocessing/filemaker.py
new file mode 100644
index 0000000000000000000000000000000000000000..cee62665422ef0b27cead35194b0750cdf789fe4
--- /dev/null
+++ b/capspeech/nar/data_preprocessing/filemaker.py
@@ -0,0 +1,40 @@
+# @ hwang258@jh.edu
+
+import os
+import argparse
+from tqdm import tqdm
+import json
+import glob
+
+def parse_args():
+    """Parse CLI options. --save_dir is required: every path is joined onto it, and None made os.path.join raise TypeError."""
+    parser = argparse.ArgumentParser(description="Create manifests for gigaspeech")
+    parser.add_argument('--save_dir', type=str, required=True, help="path to the manifest, phonemes, and encodec codes dirs")
+    return parser.parse_args()
+    
+if __name__ == "__main__":
+    # Write one "<segment_id>\t<tag>" manifest per split JSON, listing only segments that have both a g2p .txt and a t5 .npz file.
+    args = parse_args()
+    phn_save_root = os.path.join(args.save_dir, "g2p")
+    t5_save_root = os.path.join(args.save_dir, "t5")
+    manifest_root = os.path.join(args.save_dir, "manifest")
+    os.makedirs(manifest_root, exist_ok=True)
+
+    for json_path in glob.glob(os.path.join(args.save_dir, 'jsons', '*.json')):
+        # The preprocess_* scripts write these files as UTF-8 with ensure_ascii=False; decode them the same way.
+        with open(json_path, 'r', encoding='utf-8') as json_file:
+            jsondata = json.load(json_file)
+        savelines = []
+        for item in tqdm(jsondata):
+            segment_id = item['segment_id']
+            if not (os.path.exists(os.path.join(phn_save_root, segment_id+".txt")) and
+                    os.path.exists(os.path.join(t5_save_root, segment_id+".npz"))):
+                continue
+            tag = "none"
+            if item['source'] == 'libritts-r':
+                tag = item['text'].split(">", 1)[0].replace("<", "").strip()  # text before the first ">"
+            savelines.append(f"{segment_id}\t{tag}\n")
+        # basename/splitext rather than split('/'), so the manifest name does not depend on the path separator.
+        split_name = os.path.splitext(os.path.basename(json_path))[0]
+        with open(os.path.join(manifest_root, split_name+'.txt'), "w") as f:
+            f.write(''.join(savelines))
\ No newline at end of file
diff --git a/capspeech/nar/data_preprocessing/filemaker_no_se.py b/capspeech/nar/data_preprocessing/filemaker_no_se.py
new file mode 100644
index 0000000000000000000000000000000000000000..a45662dfcfa878ab4d0192622eec374117bff4e9
--- /dev/null
+++ b/capspeech/nar/data_preprocessing/filemaker_no_se.py
@@ -0,0 +1,37 @@
+# @ hwang258@jh.edu
+
+import os
+import argparse
+from tqdm import tqdm
+import json
+import glob
+
+def parse_args():
+    """Parse CLI options. --save_dir is required: every path is joined onto it, and None made os.path.join raise TypeError."""
+    parser = argparse.ArgumentParser(description="Create manifests for gigaspeech")
+    parser.add_argument('--save_dir', type=str, required=True, help="path to the manifest, phonemes, and encodec codes dirs")
+    return parser.parse_args()
+    
+if __name__ == "__main__":
+    # Write one "<segment_id>\tnone" manifest per split JSON under <save_dir>/jsons, listing only
+    # segments that already have both a g2p .txt and a t5 .npz file.
+    args = parse_args()
+    phn_save_root = os.path.join(args.save_dir, "g2p")
+    t5_save_root = os.path.join(args.save_dir, "t5")
+    manifest_root = os.path.join(args.save_dir, "manifest")
+    os.makedirs(manifest_root, exist_ok=True)
+
+    for json_path in glob.glob(os.path.join(args.save_dir, 'jsons', '*.json')):
+        # The preprocess_* scripts write these files as UTF-8 with ensure_ascii=False; decode them the same way.
+        with open(json_path, 'r', encoding='utf-8') as json_file:
+            jsondata = json.load(json_file)
+        savelines = []
+        for item in tqdm(jsondata):
+            segment_id = item['segment_id']
+            if (os.path.exists(os.path.join(phn_save_root, segment_id+".txt")) and
+                    os.path.exists(os.path.join(t5_save_root, segment_id+".npz"))):
+                savelines.append(f"{segment_id}\tnone\n")
+        # basename/splitext rather than split('/'), so the manifest name does not depend on the path separator.
+        split_name = os.path.splitext(os.path.basename(json_path))[0]
+        with open(os.path.join(manifest_root, split_name+'.txt'), "w") as f:
+            f.write(''.join(savelines))
\ No newline at end of file
diff --git a/capspeech/nar/data_preprocessing/phonemize.py b/capspeech/nar/data_preprocessing/phonemize.py
new file mode 100644
index 0000000000000000000000000000000000000000..f6e6688101056301516bbc9e9876876e17e2de85
--- /dev/null
+++ b/capspeech/nar/data_preprocessing/phonemize.py
@@ -0,0 +1,119 @@
+# @ hwang258@jh.edu
+
+import argparse
+import logging
+import json
+import glob
+import os
+import numpy as np
+import tqdm
+import time
+import multiprocessing
+from g2p_en import G2p
+import nltk
+nltk.download('averaged_perceptron_tagger_eng')
+
+def parse_args():  # parses the command-line options of the phonemization script
+    parser = argparse.ArgumentParser(description="Encode the gigaspeech phonemes using g2p model")
+    parser.add_argument('--save_dir', type=str, required=True, help="path to the manifest, phonemes, and encodec codes dirs")  # required: None made os.path.join raise TypeError
+    parser.add_argument('--num_cpus', type=int, default=10, help="number of chunks and worker processes")
+    return parser.parse_args()
+
+if __name__ == "__main__":
+    formatter = (
+        "%(asctime)s [%(levelname)s] %(filename)s:%(lineno)d || %(message)s"
+    )
+    logging.basicConfig(format=formatter, level=logging.INFO)
+    args = parse_args()
+
+    # Phoneme sequences are written to <save_dir>/g2p/<segment_id>.txt.
+    phn_save_root = os.path.join(args.save_dir, "g2p")
+    os.makedirs(phn_save_root, exist_ok=True)
+
+    # Tokens allowed in the output; a set keeps the per-token membership tests below cheap.
+    valid_symbols = {
+      'AA', 'AA0', 'AA1', 'AA2', 'AE', 'AE0', 'AE1', 'AE2', 'AH', 'AH0', 'AH1', 'AH2',
+      'AO', 'AO0', 'AO1', 'AO2', 'AW', 'AW0', 'AW1', 'AW2', 'AY', 'AY0', 'AY1', 'AY2',
+      'B', 'CH', 'D', 'DH', 'EH', 'EH0', 'EH1', 'EH2', 'ER', 'ER0', 'ER1', 'ER2', 'EY',
+      'EY0', 'EY1', 'EY2', 'F', 'G', 'HH', 'IH', 'IH0', 'IH1', 'IH2', 'IY', 'IY0', 'IY1',
+      'IY2', 'JH', 'K', 'L', 'M', 'N', 'NG', 'OW', 'OW0', 'OW1', 'OW2', 'OY', 'OY0',
+      'OY1', 'OY2', 'P', 'R', 'S', 'SH', 'T', 'TH', 'UH', 'UH0', 'UH1', 'UH2', 'UW',
+      'UW0', 'UW1', 'UW2', 'V', 'W', 'Y', 'Z', 'ZH', '<BLK>', ',', '.', '!', '?',
+      '<B_start>', '<B_end>', '<I_start>', '<I_end>'
+    }
+    # (start, end) marker pairs that can wrap a span inside a libritts-r transcript.
+    span_markers = (("<B_start>", "<B_end>"), ("<I_start>", "<I_end>"))
+
+    ### phonemization
+    text_tokenizer = G2p()
+
+    logging.info(f"phonemizing...")
+    json_paths = glob.glob(os.path.join(args.save_dir, 'jsons', '*.json'))
+    for json_path in json_paths:
+        # The preprocess_* scripts write these files as UTF-8 with ensure_ascii=False; decode them the same way.
+        with open(json_path, 'r', encoding='utf-8') as json_file:
+            jsondata = json.load(json_file)
+
+        # One (chunk index, chunk of records) task per worker process.
+        df_split = np.array_split(jsondata, args.num_cpus)
+        print(len(jsondata))
+        cmds = list(enumerate(df_split))
+
+        # NOTE(review): this function only exists when the file runs as a script, so Pool workers
+        # presumably need the "fork" start method to find it; confirm before using "spawn".
+        def process_one(indx, splitdata):
+            """Phonemize one chunk of records, writing one .txt file per segment.
+
+            Args:
+                indx: chunk index taken from `cmds`; not used in the body.
+                splitdata: chunk of records, each read via 'segment_id', 'text' and 'source'.
+            """
+            for record in tqdm.tqdm(splitdata):
+                save_fn = os.path.join(phn_save_root, record['segment_id']+".txt")
+                if os.path.exists(save_fn):
+                    continue  # output already exists, leave it untouched
+                text = record['text']
+                phn = None
+
+                if record['source'] == "libritts-r":
+                    text = text.split(">", 1)[1].strip() # remove the audio label
+                    for start_tok, end_tok in span_markers:
+                        if start_tok not in text:
+                            continue
+                        seg1 = text.split(start_tok)[0]
+                        seg2 = text.split(start_tok)[1].split(end_tok)[0]
+                        seg3 = text.split(end_tok)[1]
+                        # Phonemize the three segments separately and put the markers back between
+                        # them, separated by a blank wherever the neighbouring segment is non-empty.
+                        phn = text_tokenizer(seg1)
+                        if phn:
+                            phn.append(" ")
+                        phn += [start_tok, " "]
+                        inner = text_tokenizer(seg2)
+                        if inner:
+                            inner.append(" ")
+                        phn += inner + [end_tok]
+                        tail = text_tokenizer(seg3)
+                        if tail:
+                            phn.append(" ")
+                        phn += tail
+                        break  # only the first matching marker pair is handled; <B_...> is checked first
+
+                if phn is None:
+                    # Plain text, or libritts-r text without a span marker. Without this fallback a
+                    # marker-less libritts-r text would leave `phn` unassigned for this record.
+                    phn = text_tokenizer(text)
+
+                phn = [tok.replace(' ', '<BLK>') for tok in phn]
+                # Collect rejected tokens before filtering them out, otherwise this list is always empty.
+                wrong_phn = [tok for tok in phn if tok not in valid_symbols]
+                if len(wrong_phn) > 0:
+                    print(wrong_phn)
+                phn = [tok for tok in phn if tok in valid_symbols]
+                phn_seq = " ".join(phn)
+                with open(save_fn, "w") as f:
+                    f.write(phn_seq)
+
+        # Run the chunks in parallel, one task per chunk.
+        with multiprocessing.Pool(processes=args.num_cpus) as pool:
+            pool.starmap(process_one, cmds)
\ No newline at end of file
diff --git a/capspeech/nar/data_preprocessing/phonemize_no_se.py b/capspeech/nar/data_preprocessing/phonemize_no_se.py
new file mode 100644
index 0000000000000000000000000000000000000000..4bb13cce700c074b5496b24b7b92411ccffc2ee2
--- /dev/null
+++ b/capspeech/nar/data_preprocessing/phonemize_no_se.py
@@ -0,0 +1,80 @@
+# @ hwang258@jh.edu
+
+import argparse
+import logging
+import json
+import glob
+import os
+import numpy as np
+import tqdm
+import time
+import multiprocessing
+from g2p_en import G2p
+import nltk
+nltk.download('averaged_perceptron_tagger_eng')
+
+def parse_args():  # parses the command-line options of the phonemization script
+    parser = argparse.ArgumentParser(description="Encode the gigaspeech phonemes using g2p model")
+    parser.add_argument('--save_dir', type=str, required=True, help="path to the manifest, phonemes, and encodec codes dirs")  # required: None made os.path.join raise TypeError
+    parser.add_argument('--num_cpus', type=int, default=10, help="number of chunks and worker processes")
+    return parser.parse_args()
+
+if __name__ == "__main__":
+    formatter = (
+        "%(asctime)s [%(levelname)s] %(filename)s:%(lineno)d || %(message)s"
+    )
+    logging.basicConfig(format=formatter, level=logging.INFO)
+    args = parse_args()
+
+    # Phoneme sequences are written to <save_dir>/g2p/<segment_id>.txt.
+    phn_save_root = os.path.join(args.save_dir, "g2p")
+    os.makedirs(phn_save_root, exist_ok=True)
+
+    # Tokens allowed in the output; a set keeps the per-token membership tests below cheap.
+    valid_symbols = {
+      'AA', 'AA0', 'AA1', 'AA2', 'AE', 'AE0', 'AE1', 'AE2', 'AH', 'AH0', 'AH1', 'AH2',
+      'AO', 'AO0', 'AO1', 'AO2', 'AW', 'AW0', 'AW1', 'AW2', 'AY', 'AY0', 'AY1', 'AY2',
+      'B', 'CH', 'D', 'DH', 'EH', 'EH0', 'EH1', 'EH2', 'ER', 'ER0', 'ER1', 'ER2', 'EY',
+      'EY0', 'EY1', 'EY2', 'F', 'G', 'HH', 'IH', 'IH0', 'IH1', 'IH2', 'IY', 'IY0', 'IY1',
+      'IY2', 'JH', 'K', 'L', 'M', 'N', 'NG', 'OW', 'OW0', 'OW1', 'OW2', 'OY', 'OY0',
+      'OY1', 'OY2', 'P', 'R', 'S', 'SH', 'T', 'TH', 'UH', 'UH0', 'UH1', 'UH2', 'UW',
+      'UW0', 'UW1', 'UW2', 'V', 'W', 'Y', 'Z', 'ZH', '<BLK>', ',', '.', '!', '?',
+      '<B_start>', '<B_end>', '<I_start>', '<I_end>'
+    }
+
+    ### phonemization
+    text_tokenizer = G2p()
+
+    logging.info(f"phonemizing...")
+    json_paths = glob.glob(os.path.join(args.save_dir, 'jsons', '*.json'))
+    for json_path in json_paths:
+        # The preprocess_* scripts write these files as UTF-8 with ensure_ascii=False; decode them the same way.
+        with open(json_path, 'r', encoding='utf-8') as json_file:
+            jsondata = json.load(json_file)
+
+        # One (chunk index, chunk of records) task per worker process.
+        df_split = np.array_split(jsondata, args.num_cpus)
+        print(len(jsondata))
+        cmds = list(enumerate(df_split))
+
+        # NOTE(review): this function only exists when the file runs as a script, so Pool workers
+        # presumably need the "fork" start method to find it; confirm before using "spawn".
+        def process_one(indx, splitdata):
+            """Phonemize the 'text' of each record in the chunk; `indx` is not used in the body."""
+            for record in tqdm.tqdm(splitdata):
+                save_fn = os.path.join(phn_save_root, record['segment_id']+".txt")
+                if os.path.exists(save_fn):
+                    continue  # output already exists, leave it untouched
+                phn = text_tokenizer(record['text'])
+                phn = [tok.replace(' ', '<BLK>') for tok in phn]
+                # Collect rejected tokens before filtering them out, otherwise this list is always empty.
+                wrong_phn = [tok for tok in phn if tok not in valid_symbols]
+                if len(wrong_phn) > 0:
+                    print(wrong_phn)
+                phn = [tok for tok in phn if tok in valid_symbols]
+                phn_seq = " ".join(phn)
+                with open(save_fn, "w") as f:
+                    f.write(phn_seq)
+
+        with multiprocessing.Pool(processes=args.num_cpus) as pool:
+            pool.starmap(process_one, cmds)
\ No newline at end of file
diff --git a/capspeech/nar/data_preprocessing/preprocess_agenttts.py b/capspeech/nar/data_preprocessing/preprocess_agenttts.py
new file mode 100644
index 0000000000000000000000000000000000000000..69bf3c89a6be94dd9819dda124b22cffc6c957de
--- /dev/null
+++ b/capspeech/nar/data_preprocessing/preprocess_agenttts.py
@@ -0,0 +1,142 @@
+import json
+import argparse
+from pathlib import Path
+from typing import List, Dict, Set
+from tqdm import tqdm
+import soundfile as sf
+from datasets import load_dataset
+import logging
+import os
+
+def parse_args() -> argparse.Namespace:
+    """Define this script's command-line options and return the parsed namespace."""
+    cli = argparse.ArgumentParser(description="Prepare the CapSpeech dataset")
+    # Mandatory string options.
+    for flag, help_text in (('--hub', 'Huggingface repo'), ('--save_dir', 'Directory to save the JSON files'),
+                            ('--cache_dir', 'Cache directory for datasets'), ('--wav_dir', 'Directories containing WAV files')):
+        cli.add_argument(flag, type=str, required=True, help=help_text)
+    # Accepted duration window in seconds.
+    cli.add_argument('--audio_min_length', type=float, default=2.0, help='Minimum audio duration in seconds')
+    cli.add_argument('--audio_max_length', type=float, default=20.0, help='Maximum audio duration in seconds')
+    cli.add_argument('--splits', type=str, nargs='+', default=['train', 'val'], help='List of splits to process')
+    cli.add_argument('--debug', action='store_true', help='Enable debug mode with limited data processing')
+    return cli.parse_args()
+
+def setup_logging() -> None:
+    """Configure root logging: INFO level, timestamped format, a single stream handler."""
+    stream_handler = logging.StreamHandler()
+    logging.basicConfig(
+        level=logging.INFO,
+        format='%(asctime)s [%(levelname)s] %(message)s',
+        handlers=[stream_handler],
+    )
+
+def process_dataset_split(split, dataset_split, args) -> List[Dict]:
+    """
+    Process a single dataset split and extract relevant records.
+
+    A row is skipped when its audio_path is empty, its source has no entry in
+    the source-to-directory table, the resolved file does not exist, or its
+    speech_duration is outside [args.audio_min_length, args.audio_max_length].
+
+    Args:
+        split: The name of the split (e.g., 'train').
+        dataset_split: The dataset split object.
+        args: Parsed command-line arguments.
+
+    Returns:
+        A list of dictionaries containing the processed records.
+    """
+    logging.info(f"Processing split: {split}")
+    filelist: List[Dict] = []
+    total_duration: float = 0.0
+    # Debug mode looks at the first 500 rows at most, never more than the split holds.
+    num_samples: int = min(len(dataset_split), 500) if args.debug else len(dataset_split)
+    source_path = {
+        'capspeech-agentdb': args.wav_dir
+    }
+
+    for idx in tqdm(range(num_samples), desc=f"Processing {split}"):
+        data = dataset_split[idx]
+        rel_path: str = data.get("audio_path", "")
+        duration: float = data.get("speech_duration", 0.0)
+        source: str = data.get("source", "")
+
+        # Test the raw value: once joined onto a non-empty root it is never empty again.
+        if not rel_path:
+            logging.warning(f"Missing audio_path at index {idx} in split '{split}'. Skipping.")
+            continue
+        if source not in source_path:
+            logging.warning(f"Unknown source '{source}' at index {idx} in split '{split}'. Skipping.")
+            continue
+        audio_path = os.path.join(source_path[source], rel_path)
+        if not os.path.exists(audio_path):
+            logging.warning(f"WAV file does not exist: {audio_path}")
+            continue
+        if not (args.audio_min_length <= duration <= args.audio_max_length):
+            continue
+
+        filelist.append({
+            "segment_id": audio_path.split('/')[-2].replace(" ", "")+"_"+audio_path.split('/')[-1].split('.')[0],
+            "audio_path": audio_path,
+            "text": data.get('text', ''),
+            "caption": data.get('caption', ''),
+            "duration": duration,
+            "source": source
+        })
+        total_duration += duration
+
+    logging.info(f"Total duration for split '{split}': {total_duration / 3600:.2f} hrs.")
+    logging.info(f"Total records for split '{split}': {len(filelist)}")
+    return filelist
+
+
+def save_json(filelist: List[Dict], output_path: Path) -> None:
+    """
+    Dump the records to `output_path` as UTF-8 JSON; failures are logged, not raised.
+
+    Args:
+        filelist: Records to serialise.
+        output_path: Destination file.
+    """
+    try:
+        with output_path.open('w', encoding='utf-8') as sink:
+            json.dump(filelist, sink, ensure_ascii=False, indent=4)
+        logging.info(f"Saved {len(filelist)} records to '{output_path}'")
+    except Exception as err:
+        logging.error(f"Failed to save JSON to '{output_path}': {err}")
+
+
+def main() -> None:
+    """Load the hub dataset and write one <save_dir>/jsons/<split>.json file per requested split."""
+    args = parse_args()
+    setup_logging()
+
+    save_dir: Path = Path(args.save_dir)
+    jsons_dir: Path = save_dir / 'jsons'
+    jsons_dir.mkdir(parents=True, exist_ok=True)
+    logging.info(f"JSON files will be saved to '{jsons_dir}'")
+    logging.info("Loading dataset...")
+    try:
+        # --cache_dir is a required option, so pass it on instead of silently using the default cache.
+        ds = load_dataset(args.hub, cache_dir=args.cache_dir)
+    except Exception as e:
+        logging.error(f"Failed to load dataset: {e}")
+        return
+
+    splits_to_process = args.splits
+    available_splits = set(ds.keys())
+    selected_splits = [split for split in splits_to_process if split in available_splits]
+
+    missing_splits = set(splits_to_process) - available_splits
+    if missing_splits:
+        logging.warning(f"The following splits were not found in the dataset and will be skipped: {missing_splits}")
+
+    for split in selected_splits:
+        dataset_split = ds[split]
+        filelist = process_dataset_split(split, dataset_split, args)
+        save_json(filelist, jsons_dir / f"{split}.json")
+
+
+if __name__ == "__main__":
+    main()
diff --git a/capspeech/nar/data_preprocessing/preprocess_captts.py b/capspeech/nar/data_preprocessing/preprocess_captts.py
new file mode 100644
index 0000000000000000000000000000000000000000..c84e6fd1fb01037909c87a71e678042e4792585e
--- /dev/null
+++ b/capspeech/nar/data_preprocessing/preprocess_captts.py
@@ -0,0 +1,147 @@
+import json
+import argparse
+from pathlib import Path
+from typing import List, Dict, Set
+from tqdm import tqdm
+import soundfile as sf
+from datasets import load_dataset
+import logging
+import os
+
+def parse_args() -> argparse.Namespace:
+    """Define this script's command-line options and return the parsed namespace."""
+    cli = argparse.ArgumentParser(description="Prepare the CapSpeech dataset")
+    # Mandatory string options.
+    for flag, help_text in (('--hub', 'Huggingface repo'), ('--save_dir', 'Directory to save the JSON files'),
+                            ('--cache_dir', 'Cache directory for datasets'), ('--libriR_wav_dir', 'Directories containing WAV files'),
+                            ('--other_wav_dir', 'Directories containing WAV files')):
+        cli.add_argument(flag, type=str, required=True, help=help_text)
+    # Accepted duration window in seconds.
+    cli.add_argument('--audio_min_length', type=float, default=3.0, help='Minimum audio duration in seconds')
+    cli.add_argument('--audio_max_length', type=float, default=18.0, help='Maximum audio duration in seconds')
+    cli.add_argument('--splits', type=str, nargs='+', default=['train', 'val'], help='List of splits to process')
+    cli.add_argument('--debug', action='store_true', help='Enable debug mode with limited data processing')
+    return cli.parse_args()
+
+def setup_logging() -> None:
+    """Configure root logging: INFO level, timestamped format, a single stream handler."""
+    stream_handler = logging.StreamHandler()
+    logging.basicConfig(
+        level=logging.INFO,
+        format='%(asctime)s [%(levelname)s] %(message)s',
+        handlers=[stream_handler],
+    )
+
+def process_dataset_split(split, dataset_split, args) -> List[Dict]:
+    """
+    Process a single dataset split and extract relevant records.
+
+    A row is skipped when its audio_path is empty, its source has no entry in
+    the source-to-directory table, the resolved file does not exist, or its
+    speech_duration is outside [args.audio_min_length, args.audio_max_length].
+
+    Args:
+        split: The name of the split (e.g., 'train').
+        dataset_split: The dataset split object.
+        args: Parsed command-line arguments.
+
+    Returns:
+        A list of dictionaries containing the processed records.
+    """
+    logging.info(f"Processing split: {split}")
+    filelist: List[Dict] = []
+    total_duration: float = 0.0
+    # Debug mode looks at the first 500 rows at most, never more than the split holds.
+    num_samples: int = min(len(dataset_split), 500) if args.debug else len(dataset_split)
+    source_path = {
+        'libritts-r': args.libriR_wav_dir,
+        'voxceleb': args.other_wav_dir,
+        'expresso': args.other_wav_dir,
+        'ears': args.other_wav_dir,
+        'vctk': args.other_wav_dir,
+    }
+
+    for idx in tqdm(range(num_samples), desc=f"Processing {split}"):
+        data = dataset_split[idx]
+        rel_path: str = data.get("audio_path", "")
+        duration: float = data.get("speech_duration", 0.0)
+        source: str = data.get("source", "")
+
+        # Test the raw value: once joined onto a non-empty root it is never empty again.
+        if not rel_path:
+            logging.warning(f"Missing audio_path at index {idx} in split '{split}'. Skipping.")
+            continue
+        if source not in source_path:
+            logging.warning(f"Unknown source '{source}' at index {idx} in split '{split}'. Skipping.")
+            continue
+        audio_path = os.path.join(source_path[source], rel_path)
+        if not os.path.exists(audio_path):
+            logging.warning(f"WAV file does not exist: {audio_path}")
+            continue
+        if not (args.audio_min_length <= duration <= args.audio_max_length):
+            continue
+
+        filelist.append({
+            "segment_id": audio_path.split('/')[-1].split('.')[0],
+            "audio_path": audio_path,
+            "text": data.get('text', ''),
+            "caption": data.get('caption', ''),
+            "duration": duration,
+            "source": source
+        })
+        total_duration += duration
+
+    logging.info(f"Total duration for split '{split}': {total_duration / 3600:.2f} hrs.")
+    logging.info(f"Total records for split '{split}': {len(filelist)}")
+    return filelist
+
+
+def save_json(filelist: List[Dict], output_path: Path) -> None:
+    """
+    Dump the records to `output_path` as UTF-8 JSON; failures are logged, not raised.
+
+    Args:
+        filelist: Records to serialise.
+        output_path: Destination file.
+    """
+    try:
+        with output_path.open('w', encoding='utf-8') as sink:
+            json.dump(filelist, sink, ensure_ascii=False, indent=4)
+        logging.info(f"Saved {len(filelist)} records to '{output_path}'")
+    except Exception as err:
+        logging.error(f"Failed to save JSON to '{output_path}': {err}")
+
+
+def main() -> None:
+    """Load the hub dataset and write one <save_dir>/jsons/<split>.json file per requested split."""
+    args = parse_args()
+    setup_logging()
+
+    save_dir: Path = Path(args.save_dir)
+    jsons_dir: Path = save_dir / 'jsons'
+    jsons_dir.mkdir(parents=True, exist_ok=True)
+    logging.info(f"JSON files will be saved to '{jsons_dir}'")
+    logging.info("Loading dataset...")
+    try:
+        # --cache_dir is a required option, so pass it on instead of silently using the default cache.
+        ds = load_dataset(args.hub, cache_dir=args.cache_dir)
+    except Exception as e:
+        logging.error(f"Failed to load dataset: {e}")
+        return
+
+    splits_to_process = args.splits
+    available_splits = set(ds.keys())
+    selected_splits = [split for split in splits_to_process if split in available_splits]
+
+    missing_splits = set(splits_to_process) - available_splits
+    if missing_splits:
+        logging.warning(f"The following splits were not found in the dataset and will be skipped: {missing_splits}")
+
+    for split in selected_splits:
+        dataset_split = ds[split]
+        filelist = process_dataset_split(split, dataset_split, args)
+        save_json(filelist, jsons_dir / f"{split}.json")
+
+
+if __name__ == "__main__":
+    main()
diff --git a/capspeech/nar/data_preprocessing/preprocess_capttsse.py b/capspeech/nar/data_preprocessing/preprocess_capttsse.py
new file mode 100644
index 0000000000000000000000000000000000000000..20de2a11ae39bef86c40597fa48b056987d87286
--- /dev/null
+++ b/capspeech/nar/data_preprocessing/preprocess_capttsse.py
@@ -0,0 +1,141 @@
+import json
+import argparse
+from pathlib import Path
+from typing import List, Dict, Set
+from tqdm import tqdm
+import soundfile as sf
+from datasets import load_dataset
+import logging
+import os
+
+def parse_args() -> argparse.Namespace:
+    """Build the command-line parser for this script and parse ``sys.argv``."""
+    cli = argparse.ArgumentParser(description="Prepare the CapSpeech dataset")
+
+    # Mandatory string options, registered in declaration order.
+    required_options = (
+        ('--hub', 'Huggingface repo'),
+        ('--save_dir', 'Directory to save the JSON files'),
+        ('--cache_dir', 'Cache directory for datasets'),
+        ('--libriRmix_wav_dir', 'Directories containing WAV files'),
+    )
+    for flag, description in required_options:
+        cli.add_argument(flag, type=str, required=True, help=description)
+
+    # Duration window (seconds) a sample must fall into to be kept.
+    cli.add_argument('--audio_min_length', type=float, default=3.0, help='Minimum audio duration in seconds')
+    cli.add_argument('--audio_max_length', type=float, default=18.0, help='Maximum audio duration in seconds')
+
+    cli.add_argument('--splits', type=str, nargs='+', default=['train', 'val'], help='List of splits to process')
+    cli.add_argument('--debug', action='store_true', help='Enable debug mode with limited data processing')
+    return cli.parse_args()
+
+def setup_logging() -> None:
+    """Configure root logging: INFO level, timestamped lines, one stream handler."""
+    stream_handler = logging.StreamHandler()
+    line_format = '%(asctime)s [%(levelname)s] %(message)s'
+    logging.basicConfig(level=logging.INFO, format=line_format, handlers=[stream_handler])
+
+def process_dataset_split(split, dataset_split, args) -> List[Dict]:
+    """
+    Process a single dataset split and extract relevant records.
+
+    A sample is kept only when it has an ``audio_path``, its ``source`` maps
+    to a configured WAV root, the resolved file exists on disk, and its
+    ``speech_duration`` lies within
+    ``[args.audio_min_length, args.audio_max_length]``.
+
+    Args:
+        split: The name of the split (e.g., 'train').
+        dataset_split: The dataset split object (indexable, supports ``len``).
+        args: Parsed command-line arguments.
+
+    Returns:
+        A list of dictionaries containing the processed records.
+    """
+    logging.info(f"Processing split: {split}")
+    filelist: List[Dict] = []
+    total_duration: float = 0.0
+    # Debug mode inspects at most 500 samples; clamp to the split size so a
+    # small split is not probed at indices it does not have.
+    num_samples: int = len(dataset_split)
+    if args.debug:
+        num_samples = min(num_samples, 500)
+    source_path = {
+        'libritts-r': args.libriRmix_wav_dir,
+    }
+
+    for idx in tqdm(range(num_samples), desc=f"Processing {split}"):
+        try:
+            data = dataset_split[idx]
+        except IndexError:
+            logging.warning(f"Index {idx} out of range for split '{split}'. Skipping.")
+            continue
+
+        rel_path: str = data.get("audio_path", "")
+        # A missing or None duration counts as 0.0 seconds.
+        duration: float = data.get("speech_duration") or 0.0
+        source: str = data.get("source", "")
+
+        # Validate before joining: os.path.join(root, "") is never empty, so
+        # testing the joined path could not detect a missing audio_path.
+        if not rel_path:
+            logging.warning(f"Missing audio_path at index {idx} in split '{split}'. Skipping.")
+            continue
+
+        # Skip samples from a source without a configured root instead of
+        # aborting the whole run with a KeyError.
+        wav_root = source_path.get(source)
+        if wav_root is None:
+            logging.warning(f"Unknown source '{source}' at index {idx} in split '{split}'. Skipping.")
+            continue
+        audio_path = os.path.join(wav_root, rel_path)
+
+        if not os.path.exists(audio_path):
+            logging.warning(f"WAV file does not exist: {audio_path}")
+            continue
+
+        if not (args.audio_min_length <= duration <= args.audio_max_length):
+            continue
+
+        record: Dict = {
+            # File name up to its first dot.
+            "segment_id": audio_path.split('/')[-1].split('.')[0],
+            "audio_path": audio_path,
+            "text": data.get('text', ''),
+            "caption": data.get('caption', ''),
+            "duration": duration,
+            "source": source
+        }
+
+        filelist.append(record)
+        total_duration += duration
+
+    logging.info(f"Total duration for split '{split}': {total_duration / 3600:.2f} hrs.")
+    logging.info(f"Total records for split '{split}': {len(filelist)}")
+    return filelist
+
+
+def save_json(filelist: List[Dict], output_path: Path) -> None:
+    """
+    Write the records to ``output_path`` as indented UTF-8 JSON.
+
+    Any failure is logged at error level and not re-raised.
+
+    Args:
+        filelist: List of dictionaries containing the records.
+        output_path: Path to the output JSON file.
+    """
+    try:
+        with output_path.open('w', encoding='utf-8') as handle:
+            json.dump(filelist, handle, ensure_ascii=False, indent=4)
+        record_count = len(filelist)
+        logging.info(f"Saved {record_count} records to '{output_path}'")
+    except Exception as exc:
+        logging.error(f"Failed to save JSON to '{output_path}': {exc}")
+
+
+def main() -> None:
+    """Export the requested dataset splits as JSON manifests.
+
+    Loads the Hugging Face dataset named by ``--hub`` (cached under
+    ``--cache_dir``), runs ``process_dataset_split`` on every requested split
+    that exists and writes each result to ``<save_dir>/jsons/<split>.json``.
+    If the dataset cannot be loaded, the error is logged and the function
+    returns without writing.
+    """
+    args = parse_args()
+    setup_logging()
+
+    out_dir: Path = Path(args.save_dir) / 'jsons'
+    out_dir.mkdir(parents=True, exist_ok=True)
+    logging.info(f"JSON files will be saved to '{out_dir}'")
+
+    logging.info("Loading dataset...")
+    try:
+        ds = load_dataset(args.hub, cache_dir=args.cache_dir)
+    except Exception as exc:
+        logging.error(f"Failed to load dataset: {exc}")
+        return
+
+    requested = args.splits
+    present = set(ds.keys())
+
+    absent = set(requested) - present
+    if absent:
+        logging.warning(f"The following splits were not found in the dataset and will be skipped: {absent}")
+
+    # Walk the splits in the order given on the command line.
+    for split in requested:
+        if split not in present:
+            continue
+        records = process_dataset_split(split, ds[split], args)
+        save_json(records, out_dir / f"{split}.json")
+
+
+if __name__ == "__main__":
+    main()
\ No newline at end of file
diff --git a/capspeech/nar/data_preprocessing/preprocess_pretrain.py b/capspeech/nar/data_preprocessing/preprocess_pretrain.py
new file mode 100644
index 0000000000000000000000000000000000000000..fb96427f39c3884a8bcc55e4e12b88d916fbe8c2
--- /dev/null
+++ b/capspeech/nar/data_preprocessing/preprocess_pretrain.py
@@ -0,0 +1,149 @@
+import json
+import argparse
+from pathlib import Path
+from typing import List, Dict, Set
+from tqdm import tqdm
+import soundfile as sf
+from datasets import load_dataset
+import logging
+import os
+
+def parse_args() -> argparse.Namespace:
+    """Build the command-line parser for this script and parse ``sys.argv``."""
+    cli = argparse.ArgumentParser(description="Prepare the CapSpeech dataset")
+
+    # Mandatory string options, registered in declaration order.
+    required_options = (
+        ('--hub', 'Huggingface repo'),
+        ('--save_dir', 'Directory to save the JSON files'),
+        ('--cache_dir', 'Cache directory for datasets'),
+        ('--libriRmix_wav_dir', 'Directories containing WAV files'),
+        ('--mls_wav_dir', 'Directories containing WAV files'),
+        ('--commonvoice_dir', 'Directories containing WAV files'),
+        ('--gigaspeech_dir', 'Directories containing WAV files'),
+        ('--emilia_dir', 'Directories containing WAV files'),
+    )
+    for flag, description in required_options:
+        cli.add_argument(flag, type=str, required=True, help=description)
+
+    # Duration window (seconds) a sample must fall into to be kept.
+    cli.add_argument('--audio_min_length', type=float, default=3.0, help='Minimum audio duration in seconds')
+    cli.add_argument('--audio_max_length', type=float, default=18.0, help='Maximum audio duration in seconds')
+
+    cli.add_argument('--splits', type=str, nargs='+', default=['train', 'val'], help='List of splits to process')
+    cli.add_argument('--debug', action='store_true', help='Enable debug mode with limited data processing')
+    return cli.parse_args()
+
+def setup_logging() -> None:
+    """Configure root logging: INFO level, timestamped lines, one stream handler."""
+    stream_handler = logging.StreamHandler()
+    line_format = '%(asctime)s [%(levelname)s] %(message)s'
+    logging.basicConfig(level=logging.INFO, format=line_format, handlers=[stream_handler])
+
+def process_dataset_split(split, dataset_split, args) -> List[Dict]:
+    """
+    Process a single dataset split and extract relevant records.
+
+    A sample is kept only when it has an ``audio_path``, its ``source`` maps
+    to one of the configured WAV roots, the resolved file exists on disk, and
+    its ``speech_duration`` lies within
+    ``[args.audio_min_length, args.audio_max_length]``.
+
+    Args:
+        split: The name of the split (e.g., 'train').
+        dataset_split: The dataset split object (indexable, supports ``len``).
+        args: Parsed command-line arguments.
+
+    Returns:
+        A list of dictionaries containing the processed records.
+    """
+    logging.info(f"Processing split: {split}")
+    filelist: List[Dict] = []
+    total_duration: float = 0.0
+    # Debug mode inspects at most 500 samples; clamp to the split size so a
+    # small split is not probed at indices it does not have.
+    num_samples: int = len(dataset_split)
+    if args.debug:
+        num_samples = min(num_samples, 500)
+    # Root directory for each value of the sample's "source" field.
+    source_path = {
+        'mls': args.mls_wav_dir,
+        'libritts-r': args.libriRmix_wav_dir,
+        'commonvoice': args.commonvoice_dir,
+        'gigaspeech': args.gigaspeech_dir,
+        'emilia': args.emilia_dir
+    }
+
+    for idx in tqdm(range(num_samples), desc=f"Processing {split}"):
+        try:
+            data = dataset_split[idx]
+        except IndexError:
+            logging.warning(f"Index {idx} out of range for split '{split}'. Skipping.")
+            continue
+
+        rel_path: str = data.get("audio_path", "")
+        # A missing or None duration counts as 0.0 seconds.
+        duration: float = data.get("speech_duration") or 0.0
+        source: str = data.get("source", "")
+
+        # Validate before joining: os.path.join(root, "") is never empty, so
+        # testing the joined path could not detect a missing audio_path.
+        if not rel_path:
+            logging.warning(f"Missing audio_path at index {idx} in split '{split}'. Skipping.")
+            continue
+
+        # Skip samples from a source without a configured root instead of
+        # aborting the whole run with a KeyError.
+        wav_root = source_path.get(source)
+        if wav_root is None:
+            logging.warning(f"Unknown source '{source}' at index {idx} in split '{split}'. Skipping.")
+            continue
+        audio_path = os.path.join(wav_root, rel_path)
+
+        if not os.path.exists(audio_path):
+            logging.warning(f"WAV file does not exist: {audio_path}")
+            continue
+
+        if not (args.audio_min_length <= duration <= args.audio_max_length):
+            continue
+
+        record: Dict = {
+            # File name up to its first dot.
+            "segment_id": audio_path.split('/')[-1].split('.')[0],
+            "audio_path": audio_path,
+            "text": data.get('text', ''),
+            "caption": data.get('caption', ''),
+            "duration": duration,
+            "source": source
+        }
+
+        filelist.append(record)
+        total_duration += duration
+
+    logging.info(f"Total duration for split '{split}': {total_duration / 3600:.2f} hrs.")
+    logging.info(f"Total records for split '{split}': {len(filelist)}")
+    return filelist
+
+
+def save_json(filelist: List[Dict], output_path: Path) -> None:
+    """
+    Write the records to ``output_path`` as indented UTF-8 JSON.
+
+    Any failure is logged at error level and not re-raised.
+
+    Args:
+        filelist: List of dictionaries containing the records.
+        output_path: Path to the output JSON file.
+    """
+    try:
+        with output_path.open('w', encoding='utf-8') as handle:
+            json.dump(filelist, handle, ensure_ascii=False, indent=4)
+        record_count = len(filelist)
+        logging.info(f"Saved {record_count} records to '{output_path}'")
+    except Exception as exc:
+        logging.error(f"Failed to save JSON to '{output_path}': {exc}")
+
+
+def main() -> None:
+    """Export the requested dataset splits as JSON manifests.
+
+    Loads the Hugging Face dataset named by ``--hub`` (cached under
+    ``--cache_dir``), runs ``process_dataset_split`` on every requested split
+    that exists and writes each result to ``<save_dir>/jsons/<split>.json``.
+    If the dataset cannot be loaded, the error is logged and the function
+    returns without writing.
+    """
+    args = parse_args()
+    setup_logging()
+
+    out_dir: Path = Path(args.save_dir) / 'jsons'
+    out_dir.mkdir(parents=True, exist_ok=True)
+    logging.info(f"JSON files will be saved to '{out_dir}'")
+
+    logging.info("Loading dataset...")
+    try:
+        ds = load_dataset(args.hub, cache_dir=args.cache_dir)
+    except Exception as exc:
+        logging.error(f"Failed to load dataset: {exc}")
+        return
+
+    requested = args.splits
+    present = set(ds.keys())
+
+    absent = set(requested) - present
+    if absent:
+        logging.warning(f"The following splits were not found in the dataset and will be skipped: {absent}")
+
+    # Walk the splits in the order given on the command line.
+    for split in requested:
+        if split not in present:
+            continue
+        records = process_dataset_split(split, ds[split], args)
+        save_json(records, out_dir / f"{split}.json")
+
+
+if __name__ == "__main__":
+    main()
\ No newline at end of file
diff --git a/capspeech/nar/data_preprocessing/process_agenttts.sh b/capspeech/nar/data_preprocessing/process_agenttts.sh
new file mode 100644
index 0000000000000000000000000000000000000000..47ffa24b3433509f9889622e6bef2732c4f2e12b
--- /dev/null
+++ b/capspeech/nar/data_preprocessing/process_agenttts.sh
@@ -0,0 +1,28 @@
+# Runs, in order: preprocess_agenttts.py, phonemize_no_se.py, caption.py and
+# filemaker_no_se.py. Script paths are relative, so run it from this directory.
+
+# Stop at the first failing step (and on unset variables) so later steps never
+# run on incomplete output.
+set -eu
+
+export CUDA_VISIBLE_DEVICES=0
+
+SAVE_DIR='./agent_data' # to save processed data
+CACHE_DIR='./cache' # to save dataset cache
+WAV_DIR='' # downloaded capspeech-agentdb wav path
+CPUS=30
+# N_WORKERS and BATCH_SIZE are not referenced by the commands below.
+N_WORKERS=8
+BATCH_SIZE=64
+HUB='OpenSound/CapSpeech'
+
+# Fail early with a clear message while the placeholder above is still empty.
+: "${WAV_DIR:?set WAV_DIR to the downloaded capspeech-agentdb wav path}"
+
+# Expansions are quoted so paths containing spaces stay single arguments.
+python preprocess_agenttts.py \
+    --hub "${HUB}" \
+    --save_dir "${SAVE_DIR}" \
+    --cache_dir "${CACHE_DIR}" \
+    --wav_dir "${WAV_DIR}" \
+    --splits train_AgentDB test_AgentDB \
+    --audio_min_length 2.0 \
+    --audio_max_length 20.0
+
+python phonemize_no_se.py \
+    --save_dir "${SAVE_DIR}" \
+    --num_cpus "${CPUS}"
+
+python caption.py \
+    --save_dir "${SAVE_DIR}"
+
+python filemaker_no_se.py \
+    --save_dir "${SAVE_DIR}"
diff --git a/capspeech/nar/data_preprocessing/process_captts.sh b/capspeech/nar/data_preprocessing/process_captts.sh
new file mode 100644
index 0000000000000000000000000000000000000000..f9e9f28e2b681c39b032fc1846f727490f8d0097
--- /dev/null
+++ b/capspeech/nar/data_preprocessing/process_captts.sh
@@ -0,0 +1,30 @@
+# Runs, in order: preprocess_captts.py, phonemize_no_se.py, caption.py and
+# filemaker_no_se.py. Script paths are relative, so run it from this directory.
+
+# Stop at the first failing step (and on unset variables) so later steps never
+# run on incomplete output.
+set -eu
+
+export CUDA_VISIBLE_DEVICES=0
+
+SAVE_DIR='./captts_data' # to save processed data
+CACHE_DIR='./cache' # to save dataset cache
+LIBRITTSR_WAV_DIR='' # downloaded libritts-r wav path
+OTHER_WAV_DIR='' # downloaded other wav path
+CPUS=30
+# N_WORKERS and BATCH_SIZE are not referenced by the commands below.
+N_WORKERS=8
+BATCH_SIZE=64
+HUB='OpenSound/CapSpeech'
+
+# Fail early with a clear message while the placeholders above are still empty.
+: "${LIBRITTSR_WAV_DIR:?set LIBRITTSR_WAV_DIR to the downloaded libritts-r wav path}"
+: "${OTHER_WAV_DIR:?set OTHER_WAV_DIR to the downloaded other wav path}"
+
+# Expansions are quoted so paths containing spaces stay single arguments.
+python preprocess_captts.py \
+    --hub "${HUB}" \
+    --save_dir "${SAVE_DIR}" \
+    --cache_dir "${CACHE_DIR}" \
+    --libriR_wav_dir "${LIBRITTSR_WAV_DIR}" \
+    --other_wav_dir "${OTHER_WAV_DIR}" \
+    --splits train_SFT_CapTTS validation_SFT_CapTTS \
+    --audio_min_length 3.0 \
+    --audio_max_length 18.0
+
+python phonemize_no_se.py \
+    --save_dir "${SAVE_DIR}" \
+    --num_cpus "${CPUS}"
+
+python caption.py \
+    --save_dir "${SAVE_DIR}"
+
+python filemaker_no_se.py \
+    --save_dir "${SAVE_DIR}"
diff --git a/capspeech/nar/data_preprocessing/process_capttsse.sh b/capspeech/nar/data_preprocessing/process_capttsse.sh
new file mode 100644
index 0000000000000000000000000000000000000000..47a3646c76e0bdaf02c063501d6d8963c80ccce8
--- /dev/null
+++ b/capspeech/nar/data_preprocessing/process_capttsse.sh
@@ -0,0 +1,28 @@
+# Runs, in order: preprocess_capttsse.py, phonemize.py, caption.py and
+# filemaker.py. Script paths are relative, so run it from this directory.
+
+# Stop at the first failing step (and on unset variables) so later steps never
+# run on incomplete output.
+set -eu
+
+export CUDA_VISIBLE_DEVICES=0
+
+SAVE_DIR='./capttsse_data' # to save processed data
+CACHE_DIR='./cache' # to save dataset cache
+LIBRITTSRMIX_WAV_DIR='' # downloaded librittsrmix wav path
+CPUS=30
+# N_WORKERS and BATCH_SIZE are not referenced by the commands below.
+N_WORKERS=8
+BATCH_SIZE=64
+HUB='OpenSound/CapSpeech'
+
+# Fail early with a clear message while the placeholder above is still empty.
+: "${LIBRITTSRMIX_WAV_DIR:?set LIBRITTSRMIX_WAV_DIR to the downloaded librittsrmix wav path}"
+
+# Expansions are quoted so paths containing spaces stay single arguments.
+python preprocess_capttsse.py \
+    --hub "${HUB}" \
+    --save_dir "${SAVE_DIR}" \
+    --cache_dir "${CACHE_DIR}" \
+    --libriRmix_wav_dir "${LIBRITTSRMIX_WAV_DIR}" \
+    --splits train_SEDB \
+    --audio_min_length 3.0 \
+    --audio_max_length 18.0
+
+python phonemize.py \
+    --save_dir "${SAVE_DIR}" \
+    --num_cpus "${CPUS}"
+
+python caption.py \
+    --save_dir "${SAVE_DIR}"
+
+python filemaker.py \
+    --save_dir "${SAVE_DIR}"
diff --git a/capspeech/nar/data_preprocessing/process_pretrain.sh b/capspeech/nar/data_preprocessing/process_pretrain.sh
new file mode 100644
index 0000000000000000000000000000000000000000..9db654b2281ed028e269ca29e63d48e43c45bd24
--- /dev/null
+++ b/capspeech/nar/data_preprocessing/process_pretrain.sh
@@ -0,0 +1,39 @@
+# Runs, in order: preprocess_pretrain.py, phonemize.py, caption.py,
+# filemaker.py and vocab.py. Script paths are relative, so run it from this
+# directory.
+
+# Stop at the first failing step (and on unset variables) so later steps never
+# run on incomplete output.
+set -eu
+
+export CUDA_VISIBLE_DEVICES=0
+
+SAVE_DIR='./pretrain_data' # to save processed data
+CACHE_DIR='./cache' # to save dataset cache
+MLS_WAV_DIR='' # downloaded mls wav path
+LIBRITTSRMIX_WAV_DIR='' # downloaded librittsrmix wav path
+GIGASPEECH_WAV_DIR='' # downloaded gigaspeech wav path
+COMMONVOICE_WAV_DIR='' # downloaded commonvoice wav path
+EMILIA_WAV_DIR='' # downloaded emilia wav path
+CPUS=30
+# N_WORKERS and BATCH_SIZE are not referenced by the commands below.
+N_WORKERS=8
+BATCH_SIZE=64
+HUB='OpenSound/CapSpeech'
+
+# Fail early with a clear message while the placeholders above are still empty.
+: "${MLS_WAV_DIR:?set MLS_WAV_DIR to the downloaded mls wav path}"
+: "${LIBRITTSRMIX_WAV_DIR:?set LIBRITTSRMIX_WAV_DIR to the downloaded librittsrmix wav path}"
+: "${GIGASPEECH_WAV_DIR:?set GIGASPEECH_WAV_DIR to the downloaded gigaspeech wav path}"
+: "${COMMONVOICE_WAV_DIR:?set COMMONVOICE_WAV_DIR to the downloaded commonvoice wav path}"
+: "${EMILIA_WAV_DIR:?set EMILIA_WAV_DIR to the downloaded emilia wav path}"
+
+# Expansions are quoted so paths containing spaces stay single arguments.
+python preprocess_pretrain.py \
+    --hub "${HUB}" \
+    --save_dir "${SAVE_DIR}" \
+    --cache_dir "${CACHE_DIR}" \
+    --libriRmix_wav_dir "${LIBRITTSRMIX_WAV_DIR}" \
+    --mls_wav_dir "${MLS_WAV_DIR}" \
+    --commonvoice_dir "${COMMONVOICE_WAV_DIR}" \
+    --gigaspeech_dir "${GIGASPEECH_WAV_DIR}" \
+    --emilia_dir "${EMILIA_WAV_DIR}" \
+    --splits train_PT validation_PT \
+    --audio_min_length 3.0 \
+    --audio_max_length 18.0
+
+python phonemize.py \
+    --save_dir "${SAVE_DIR}" \
+    --num_cpus "${CPUS}"
+
+python caption.py \
+    --save_dir "${SAVE_DIR}"
+
+python filemaker.py \
+    --save_dir "${SAVE_DIR}"
+
+python vocab.py \
+    --save_dir "${SAVE_DIR}"
diff --git a/capspeech/nar/data_preprocessing/vocab.py b/capspeech/nar/data_preprocessing/vocab.py
new file mode 100644
index 0000000000000000000000000000000000000000..1e1b85b8b8f02af878d734fa5ed075b2e041bb34
--- /dev/null
+++ b/capspeech/nar/data_preprocessing/vocab.py
@@ -0,0 +1,34 @@
+# @ hwang258@jh.edu
+
+import os
+import argparse
+
+def parse_args():
+    """Parse the command line; ``--save_dir`` is the directory ``vocab.txt`` is written to."""
+    cli = argparse.ArgumentParser(description="Create the vocab set of gigaspeech")
+    cli.add_argument(
+        '--save_dir',
+        type=str,
+        default=None,
+        help="path to the manifest, phonemes, and encodec codes dirs",
+    )
+    return cli.parse_args()
+    
+if __name__ == "__main__":
+    # Writes <save_dir>/vocab.txt with one "<index> <symbol>" entry per line.
+    args = parse_args()
+    if args.save_dir is None:
+        # os.path.join(None, ...) would fail with an unhelpful TypeError.
+        raise SystemExit("--save_dir is required")
+    savepath = os.path.join(args.save_dir, 'vocab.txt')
+    valid_symbols = [
+      'AA', 'AA0', 'AA1', 'AA2', 'AE', 'AE0', 'AE1', 'AE2', 'AH', 'AH0', 'AH1', 'AH2',
+      'AO', 'AO0', 'AO1', 'AO2', 'AW', 'AW0', 'AW1', 'AW2', 'AY', 'AY0', 'AY1', 'AY2',
+      'B', 'CH', 'D', 'DH', 'EH', 'EH0', 'EH1', 'EH2', 'ER', 'ER0', 'ER1', 'ER2', 'EY',
+      'EY0', 'EY1', 'EY2', 'F', 'G', 'HH', 'IH', 'IH0', 'IH1', 'IH2', 'IY', 'IY0', 'IY1',
+      'IY2', 'JH', 'K', 'L', 'M', 'N', 'NG', 'OW', 'OW0', 'OW1', 'OW2', 'OY', 'OY0',
+      'OY1', 'OY2', 'P', 'R', 'S', 'SH', 'T', 'TH', 'UH', 'UH0', 'UH1', 'UH2', 'UW',
+      'UW0', 'UW1', 'UW2', 'V', 'W', 'Y', 'Z', 'ZH', '<BLK>', ',', '.', '!', '?',
+      '<B_start>', '<B_end>', '<I_start>', '<I_end>'
+    ]
+
+    # Deduplicate while keeping list order. Iterating a set of strings follows
+    # hash order, which changes between interpreter runs, so the symbol ids
+    # would differ every time the file is regenerated.
+    phn_vocab = list(dict.fromkeys(valid_symbols))
+
+    with open(savepath, "w", encoding="utf-8") as f:
+        # Entries are newline-separated; the last one has no trailing newline.
+        f.write("\n".join(f"{i} {phn}" for i, phn in enumerate(phn_vocab)))
\ No newline at end of file
diff --git a/capspeech/nar/dataset/__init__.py b/capspeech/nar/dataset/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391
diff --git a/capspeech/nar/dataset/capspeech.py b/capspeech/nar/dataset/capspeech.py
new file mode 100644
index 0000000000000000000000000000000000000000..015800f8d38d32e8539be4909dca23d733d47665
--- /dev/null
+++ b/capspeech/nar/dataset/capspeech.py
@@ -0,0 +1,193 @@
+# @ hwang258@jhu.edu
+import os
+import json
+import torch
+import random
+import logging
+import shutil
+import typing as tp
+import numpy as np
+import torchaudio
+import sys
+from torch.utils.data import Dataset
+sys.path.append(os.path.abspath(os.path.join(os.path.dirname(__file__), "..")))
+
+
+def read_json(path):
+    """Load and return the JSON document stored at ``path``.
+
+    The file is decoded as UTF-8 rather than with the locale's default
+    encoding: the manifests are written as UTF-8 with ``ensure_ascii=False``,
+    so they may contain non-ASCII characters.
+    """
+    with open(path, 'r', encoding='utf-8') as f:
+        return json.load(f)
+
+
+class CapSpeech(Dataset):
+    """Torch dataset yielding phoneme ids, a waveform and two precomputed arrays per segment.
+
+    Files read (paths relative to ``dataset_dir`` unless noted):
+
+    * ``<manifest_name>/<split>.txt`` - tab-separated rows; column 0 is the
+      segment id, column 1 the tag name.
+    * ``<json_name>/<split>.json`` - records providing ``segment_id`` and
+      ``audio_path``.
+    * ``<phn_folder_name>/<segment_id>.txt`` - exactly one line of
+      space-separated phoneme symbols.
+    * ``<t5_folder_name>/<segment_id>.npz`` - array ``arr_0``, returned as
+      ``c`` (presumably a T5 caption embedding - confirm with the producer).
+    * ``<clap_emb_dir>/<tag>.npz`` - array ``arr_0``, returned as ``tag``
+      (presumably a CLAP embedding of the tag - confirm with the producer).
+    * ``vocab.txt`` or ``vocab_file`` - ``"<index> <symbol>"`` per line.
+
+    A segment that cannot be loaded yields a dictionary whose values are all
+    ``None``; ``collate`` drops such items.
+    """
+
+    def __init__(
+        self,
+        dataset_dir: str = None,
+        clap_emb_dir: str = None,
+        t5_folder_name: str = "t5",
+        phn_folder_name: str = "g2p",
+        manifest_name: str = "manifest",
+        json_name: str = "jsons",
+        dynamic_batching: bool = True,
+        text_pad_token: int = -1,
+        audio_pad_token: float = 0.0,
+        split: str = "val",
+        sr: int = 24000,
+        norm_audio: bool = False,
+        vocab_file: str = None,
+    ):
+        """Read the manifest, the JSON metadata and the phoneme vocabulary.
+
+        Args:
+            dataset_dir: Root directory of the preprocessed dataset.
+            clap_emb_dir: Directory holding one ``<tag>.npz`` per tag.
+            t5_folder_name: Sub-folder with ``<segment_id>.npz`` arrays.
+            phn_folder_name: Sub-folder with ``<segment_id>.txt`` phonemes.
+            manifest_name: Sub-folder with ``<split>.txt`` manifests.
+            json_name: Sub-folder with ``<split>.json`` metadata.
+            dynamic_batching: If true, ``collate`` pads waveforms to the
+                longest one; otherwise it stacks them as they are.
+            text_pad_token: Padding value for phoneme id sequences.
+            audio_pad_token: Padding value for waveforms.
+            split: One of ``train``, ``train_small``, ``val``, ``test``.
+            sr: Sampling rate every waveform is resampled to.
+            norm_audio: If true, scale each waveform by its peak amplitude.
+            vocab_file: Optional vocabulary path; defaults to
+                ``<dataset_dir>/vocab.txt``.
+        """
+        super().__init__()
+        self.dataset_dir = dataset_dir
+        self.clap_emb_dir = clap_emb_dir
+        self.t5_folder_name = t5_folder_name
+        self.phn_folder_name = phn_folder_name
+        self.manifest_name = manifest_name
+        self.json_name = json_name
+        self.dynamic_batching = dynamic_batching
+        self.text_pad_token = text_pad_token
+        self.audio_pad_token = torch.tensor(audio_pad_token)
+        self.split = split
+        self.sr = sr
+        self.norm_audio = norm_audio
+
+        assert self.split in ['train', 'train_small', 'val', 'test']
+        manifest_fn = os.path.join(self.dataset_dir, self.manifest_name, self.split+".txt")
+
+        meta = read_json(os.path.join(self.dataset_dir, self.json_name, self.split + ".json"))
+        # segment_id -> audio_path lookup used when loading a sample
+        self.meta = {item["segment_id"]: item["audio_path"] for item in meta}
+
+        with open(manifest_fn, "r") as rf:
+            # Blank lines are skipped so they cannot yield a row without a
+            # tag column.
+            data = [l.strip().split("\t") for l in rf if l.strip()]
+
+        # data = [item for item in data if item[2] == 'none'] # remove sound effects
+
+        self.data = [item[0] for item in data]
+        self.tag_list = [item[1] for item in data]
+
+        logging.info(f"number of data points for {self.split} split: {len(self.data)}")
+
+        # phoneme vocabulary
+        if vocab_file is None:
+            vocab_fn = os.path.join(self.dataset_dir, "vocab.txt")
+        else:
+            vocab_fn = vocab_file
+        with open(vocab_fn, "r") as f:
+            # Test the stripped line: a raw line always contains its newline,
+            # so a plain length check never filters out blank lines.
+            temp = [l.strip().split(" ") for l in f if l.strip()]
+            self.phn2num = {item[1]:int(item[0]) for item in temp}
+
+    def __len__(self):
+        """Return the number of segments listed in the manifest."""
+        return len(self.data)
+
+    @staticmethod
+    def _empty_item():
+        """Return the all-``None`` placeholder for a segment that is skipped."""
+        return {
+            "x": None,
+            "x_len": None,
+            "y": None,
+            "y_len": None,
+            "c": None,
+            "c_len": None,
+            "tag": None
+        }
+
+    def _load_audio(self, audio_path):
+        """Load ``audio_path`` as a mono waveform at ``self.sr``.
+
+        Returns:
+            The waveform with a single channel, or ``None`` when loading
+            fails or the signal contains NaN.
+        """
+        try:
+            y, sr = torchaudio.load(audio_path)
+            if y.shape[0] > 1:
+                # down-mix multi-channel audio by averaging the channels
+                y = y.mean(dim=0, keepdim=True)
+            if sr != self.sr:
+                resampler = torchaudio.transforms.Resample(orig_freq=sr, new_freq=self.sr)
+                y = resampler(y)
+            if self.norm_audio:
+                eps = 1e-9  # avoids division by zero on silent audio
+                max_val = torch.max(torch.abs(y))
+                y = y / (max_val + eps)
+            if torch.isnan(y.mean()):
+                return None
+            return y
+        except Exception as exc:
+            # Best effort: report and skip the file. Catching Exception
+            # (not everything) keeps Ctrl-C and SystemExit working.
+            logging.warning(f"Failed to load audio '{audio_path}': {exc}")
+            return None
+
+    def _load_phn_enc(self, index):
+        """Load phoneme ids, waveform and both arrays for ``self.data[index]``.
+
+        Returns:
+            ``(x, y, c, tag)``, or four ``None`` values when anything needed
+            for the segment is missing or invalid.
+        """
+        try:
+            seg_id = self.data[index]
+            pf = os.path.join(self.dataset_dir, self.phn_folder_name, seg_id+".txt")
+            audio_path = self.meta[seg_id]
+            cf = os.path.join(self.dataset_dir, self.t5_folder_name, seg_id+".npz")
+            tagf = os.path.join(self.clap_emb_dir, self.tag_list[index]+'.npz')
+            with open(pf, "r") as p:
+                phns = [l.strip() for l in p.readlines()]
+                assert len(phns) == 1, phns
+                x = [self.phn2num[item] for item in phns[0].split(" ")]
+            c = np.load(cf)['arr_0']
+            c = torch.tensor(c).squeeze()
+            tag = np.load(tagf)['arr_0']
+            tag = torch.tensor(tag).squeeze()
+            y = self._load_audio(audio_path)
+            if y is not None:
+                return x, y, c, tag
+            return None, None, None, None
+        except Exception as exc:
+            # Best effort: report and skip the segment. Catching Exception
+            # (not everything) keeps Ctrl-C and SystemExit working.
+            logging.warning(f"Failed to load segment at index {index}: {exc}")
+            return None, None, None, None
+
+    def __getitem__(self, index):
+        """Return one sample as a dictionary.
+
+        Keys: ``x`` (phoneme ids), ``x_len``, ``y`` (waveform), ``y_len``
+        (duration in seconds), ``c``, ``c_len`` and ``tag``. All values are
+        ``None`` when the segment failed to load or is too short for its
+        phoneme sequence.
+        """
+        x, y, c, tag = self._load_phn_enc(index)
+        if x is None:
+            return self._empty_item()
+        x_len, y_len, c_len = len(x), len(y[0]), len(c)
+        y_len = y_len / self.sr
+
+        # Reject segments with no more than one 256-sample frame per phoneme.
+        # NOTE(review): 256 is presumably the hop size of the acoustic
+        # features - confirm against the model configuration.
+        if y_len * self.sr / 256 <= x_len:
+            return self._empty_item()
+
+        x = torch.LongTensor(x)
+        return {
+            "x": x,
+            "x_len": x_len,
+            "y": y,
+            "y_len": y_len,
+            "c": c,
+            "c_len": c_len,
+            "tag": tag
+        }
+
+    def collate(self, batch):
+        """Merge samples into a batch, dropping the ones that failed to load.
+
+        Items whose ``x`` is ``None`` or whose ``c`` is not 2-dimensional are
+        skipped. Returns a dictionary with padded ``x``, ``y`` and ``c``,
+        their lengths ``x_lens``, ``y_lens`` and ``c_lens``, and the stacked
+        ``tag`` arrays.
+        """
+        out = {key:[] for key in batch[0]}
+        for item in batch:
+            if item['x'] is None: # deal with load failure
+                continue
+            if item['c'].ndim != 2:
+                continue
+            for key, val in item.items():
+                out[key].append(val)
+
+        res = {}
+        res["x"] = torch.nn.utils.rnn.pad_sequence(out["x"], batch_first=True, padding_value=self.text_pad_token)
+        res["x_lens"] = torch.LongTensor(out["x_len"])
+        if self.dynamic_batching:
+            res['y'] = torch.nn.utils.rnn.pad_sequence([item.transpose(1,0) for item in out['y']],padding_value=self.audio_pad_token)
+            res['y'] = res['y'].permute(1,2,0) # T B K -> B K T
+        else:
+            res['y'] = torch.stack(out['y'], dim=0)
+
+        res["y_lens"] = torch.Tensor(out["y_len"])
+        res['c'] = torch.nn.utils.rnn.pad_sequence(out['c'], batch_first=True)
+        res["c_lens"] = torch.LongTensor(out["c_len"])
+        res["tag"] = torch.stack(out['tag'], dim=0)
+        return res
+
+
+if __name__ == "__main__":
+    # debug: construct the validation split once as a smoke test
+    import argparse
+    from torch.utils.data import DataLoader
+    from accelerate import Accelerator
+
+    debug_kwargs = dict(
+        dataset_dir="./data/capspeech",
+        clap_emb_dir="./data/clap_embs/",
+        split="val",
+    )
+    dataset = CapSpeech(**debug_kwargs)
diff --git a/capspeech/nar/duration_predictor.py b/capspeech/nar/duration_predictor.py
new file mode 100644
index 0000000000000000000000000000000000000000..6e818598809789263d425ab04ed99c4a5b66a7e5
--- /dev/null
+++ b/capspeech/nar/duration_predictor.py
@@ -0,0 +1,61 @@
+from datasets import load_dataset
+from transformers import AutoTokenizer, AutoModelForSequenceClassification, Trainer, TrainingArguments
+import torch
+
+# Fine-tunes bert-base-uncased with a single-output head, using
+# speech_duration as the label and "text [SEP] caption" as the input.
+import os
+
+# load data
+dataset = load_dataset("OpenSound/CapSpeech")
+
+# load model
+model_name = "bert-base-uncased"
+tokenizer = AutoTokenizer.from_pretrained(model_name)
+model = AutoModelForSequenceClassification.from_pretrained(model_name, num_labels=1)
+
+# special tokens
+# Prefer an events.txt in the working directory; otherwise fall back to the
+# copy that sits next to this script, so the script can be started from any
+# directory.
+events_path = "events.txt"
+if not os.path.exists(events_path):
+    events_path = os.path.join(os.path.dirname(os.path.abspath(__file__)), "events.txt")
+with open(events_path, "r", encoding="utf-8") as f:
+    # Skip blank lines so they do not turn into an empty "<>" token.
+    events = [line.strip() for line in f if line.strip()]
+# "Microwave oven" -> "<microwave_oven>"
+events = ["<"+event.lower().replace(" ", "_")+">" for event in events]
+events.append("<B_start>")
+events.append("<B_end>")
+events.append("<I_start>")
+events.append("<I_end>")
+special_tokens_dict = {"additional_special_tokens": events}
+num_added_toks = tokenizer.add_special_tokens(special_tokens_dict)
+print(f"Added {num_added_toks} special tokens.")
+# The embedding matrix has to grow to cover the newly added token ids.
+model.resize_token_embeddings(len(tokenizer))
+
+# data preprocessing
+def tokenize_fn(example):
+    """Tokenize ``text`` and ``caption`` of one example as a single sequence."""
+    # You can change the delimiter if needed (e.g., "[SEP]", " | ", or nothing)
+    combined = example["text"] + " [SEP] " + example["caption"]
+    return tokenizer(combined, padding="max_length", truncation=True, max_length=400)
+
+tokenized_dataset = dataset.map(tokenize_fn)
+tokenized_dataset = tokenized_dataset.rename_column("speech_duration", "labels")
+tokenized_dataset.set_format(type="torch", columns=["input_ids", "attention_mask", "labels"])
+
+# hyperparameters
+training_args = TrainingArguments(
+    output_dir="./duration_predictor",
+    per_device_train_batch_size=256,
+    num_train_epochs=2,
+    learning_rate=1e-4,
+    warmup_steps=1000,
+    save_strategy="steps",
+    save_steps=3000,
+    evaluation_strategy="epoch",
+    logging_dir="./logs_dp",
+)
+
+# training
+trainer = Trainer(
+    model=model,
+    args=training_args,
+    train_dataset=tokenized_dataset["train_PT"],
+    eval_dataset=tokenized_dataset["validation_PT"],
+)
+trainer.train()
+
+# test
+# NOTE(review): the other splits used here carry a suffix ("train_PT",
+# "validation_PT"); confirm that the dataset really exposes a split named
+# "test", otherwise this lookup fails after training has finished.
+preds = trainer.predict(tokenized_dataset["test"])
+print("Predictions:", preds.predictions[:10])
+print("Ground Truth:", preds.label_ids[:10])
diff --git a/capspeech/nar/events.txt b/capspeech/nar/events.txt
new file mode 100644
index 0000000000000000000000000000000000000000..5a7fb37a73888498da3fdcafabbf6ae400016af3
--- /dev/null
+++ b/capspeech/nar/events.txt
@@ -0,0 +1,395 @@
+people whispering
+Microwave oven
+extending ladders
+mosquito buzzing
+dog whimpering
+coyote howling
+hair dryer drying
+Writing
+rapping
+machine gun shooting
+dog bow-wow
+dog howling
+barn swallow calling
+baby babbling
+Fireworks
+church bell ringing
+car horn
+cat caterwauling
+subway, metro, underground
+waterfall burbling
+lions roaring
+toilet flushing
+skateboarding
+wind
+ripping paper
+vacuum cleaner cleaning floors
+mouse squeaking
+keyboard typing
+playing timpani
+playing harp
+sheep bleating
+eletric blender running
+people slapping
+playing ukulele
+frog
+car engine knocking
+cat purring
+chainsaw
+Violin or fiddle
+people hiccup
+playing acoustic guitar
+donkey, ass braying
+playing french horn
+playing squash
+gibbon howling
+playing harmonica
+playing shofar
+hedge trimmer running
+playing washboard
+running electric fan
+splashing water
+playing bassoon
+people slurping
+playing accordion
+playing oboe
+popping popcorn
+glass breaking
+alarm clock ringing
+mouse click
+Laughter
+magpie calling
+playing snare drum
+people finger snapping
+ferret dooking
+tornado roaring
+Hi-hat
+lawn mowing
+church bells
+cat growling
+cheetah chirrup
+heart sounds, heartbeat
+firing muskets
+vehicle horn, car horn, honking
+turkey gobbling
+ice cream truck, ice cream van
+underwater bubbling
+footsteps on snow
+water drops
+people sobbing
+basketball bounce
+Applause
+playing sitar
+playing gong
+train
+coughing
+people screaming
+Gunshot or gunfire
+chinchilla barking
+cat hissing
+horse clip-clop
+engine
+people battle cry
+typing on computer keyboard
+playing clarinet
+driving motorcycle
+male singing
+singing bowl
+skiing
+driving buses
+alligators, crocodiles hissing
+people eating apple
+door slamming
+Flute
+raining
+Electric piano
+sliding door
+washing machine
+opening or closing car electric windows
+baby crying
+people babbling
+snake hissing
+brushing teeth
+playing tambourine
+Acoustic guitar
+clock tick
+playing castanets
+thunder
+playing didgeridoo
+playing synthesizer
+mouse clicking
+lathe spinning
+spraying water
+hen
+stream burbling
+door wood creaks
+sailing
+dog
+car engine idling
+bowling impact
+driving snowmobile
+toilet flush
+bird squawking
+playing timbales
+playing drum kit
+owl hooting
+striking pool
+Oboe
+duck quacking
+people belly laughing
+lighting firecrackers
+roller coaster running
+blowtorch igniting
+wood thrush calling
+Glockenspiel
+frog croaking
+playing harpsichord
+train horning
+plastic bottle crushing
+playing tabla
+fire crackling
+dog barking
+thunderstorm
+playing banjo
+swimming
+volcano explosion
+playing table tennis
+sea lion barking
+rowboat, canoe, kayak rowing
+Meow
+pouring water
+playing tympani
+rooster
+siren
+parrot talking
+Finger snapping
+playing steel guitar, slide guitar
+Trumpet
+tractor digging
+people coughing
+cat meowing
+Snare drum
+playing erhu
+crow cawing
+playing djembe
+whale calling
+mynah bird singing
+playing tennis
+chopping food
+golf driving
+tapping guitar
+playing cello
+dog growling
+elephant trumpeting
+sea waves
+police radio chatter
+lions growling
+playing lacrosse
+children shouting
+missile launch
+baby laughter
+air conditioning noise
+playing saxophone
+typing on typewriter
+printer printing
+race car, auto racing
+Bus
+pigeon, dove cooing
+playing violin, fiddle
+Double bass
+striking bowling
+fireworks banging
+Harmonica
+playing glockenspiel
+reversing beeps
+playing piano
+breathing
+people marching
+electric shaver, electric razor shaving
+chimpanzee pant-hooting
+cricket chirping
+bird chirping, tweeting
+using sewing machines
+crickets
+cow lowing
+playing cymbal
+vacuum cleaner
+playing zither
+train whistling
+goat bleating
+eating with cutlery
+black capped chickadee calling
+ambulance siren
+playing hockey
+dog baying
+Burping or eructation
+cupboard opening or closing
+air horn
+crying baby
+people eating crisps
+sloshing water
+goose honking
+orchestra
+people giggling
+warbler chirping
+child singing
+dinosaurs bellowing
+motorboat, speedboat acceleration
+airplane
+chicken clucking
+woodpecker pecking tree
+Drawer open or close
+people eating
+drinking sipping
+singing choir
+playing bass guitar
+playing bass drum
+car passing by
+playing tuning fork
+Squeak
+pig oinking
+Computer keyboard
+yodelling
+playing trombone
+clapping
+people sneezing
+pheasant crowing
+writing on blackboard with chalk
+Tambourine
+opening or closing car doors
+sharpen knife
+people whistling
+fireworks
+playing bagpipes
+chainsawing trees
+squishing water
+people farting
+playing electric guitar
+people booing
+female singing
+ocean burbling
+cattle mooing
+footsteps
+Knock
+wind rustling leaves
+cattle, bovinae cowbell
+Clarinet
+police car (siren)
+Fart
+cat
+sheep
+chopping wood
+tap dancing
+playing mandolin
+wind chime
+can opening
+playing hammond organ
+zebra braying
+scuba diving
+chirping birds
+playing steelpan
+playing theremin
+Keys jangling
+beat boxing
+firing cannon
+bouncing on trampoline
+door wood knock
+bathroom ventilation fan running
+snake rattling
+bull bellowing
+electric grinder grinding
+penguins braying
+otter growling
+civil defense siren
+wind noise
+people humming
+clock alarm
+disc scratching
+fire truck siren
+telephone bell ringing
+people sniggering
+playing bongo
+cap gun shooting
+opening or closing drawers
+cow
+hammering nails
+ice cracking
+foghorn
+rain
+playing badminton
+eagle screaming
+playing double bass
+insects
+people running
+planing timber
+cutting hair with electric trimmers
+Cello
+people clapping
+smoke detector beeping
+mouse pattering
+bee, wasp, etc. buzzing
+canary calling
+people burping
+Shatter
+baltimore oriole calling
+cuckoo bird calling
+snoring
+strike lighter
+people cheering
+playing bugle
+playing congas
+playing vibraphone
+hail
+rope skipping
+playing trumpet
+pig
+hand saw
+people gargling
+Scissors
+metronome
+chipmunk chirping
+playing flute
+fox barking
+crackling fire
+playing volleyball
+skidding
+Bass drum
+crow
+elk bugling
+Telephone
+Bark
+chicken crowing
+people nose blowing
+car engine starting
+pumping water
+Saxophone
+fly, housefly buzzing
+Cough
+people eating noodle
+francolin calling
+arc welding
+horse neighing
+Tearing
+helicopter
+playing electronic organ
+Cowbell
+railroad car, train wagon
+cell phone buzzing
+playing cornet
+sneezing
+engine accelerating, revving, vroom
+bird wings flapping
+playing marimba, xylophone
+playing guiro
+people crowd
+train wheels squealing
+slot machine
+laughing
+lip smacking
+forging swords
+Chime
+playing darts
+people shuffling
+Gong
+airplane flyby
+None
diff --git a/capspeech/nar/finetune.py b/capspeech/nar/finetune.py
new file mode 100644
index 0000000000000000000000000000000000000000..ef83f8ccfdf85b5974dc5e0c81bb0d19b38a0672
--- /dev/null
+++ b/capspeech/nar/finetune.py
@@ -0,0 +1,287 @@
+import os
+import time
+import random
+import argparse
+import numpy as np
+from tqdm import tqdm
+from accelerate import Accelerator
+from einops import rearrange
+from cached_path import cached_path
+
+import torch
+import torch.nn as nn
+import torch.nn.functional as F
+from torch.utils.data import DataLoader
+
+# replace this with BigVGAN
+import bigvgan
+
+from model.modules import MelSpec
+from network.crossdit import CrossDiT
+from dataset.capspeech import CapSpeech
+from utils import load_checkpoint, make_pad_mask
+from utils import get_lr_scheduler, load_yaml_with_includes
+from inference import eval_model
+
+
+def parse_args():
+    """Parse the fine-tuning command line (taken from sys.argv) into an argparse Namespace."""
+    cli = argparse.ArgumentParser()
+    add = cli.add_argument
+    # Config settings: YAML config path and the pretrained checkpoint to start from
+    add('--config-name', type=str, required=True)
+    add('--pretrained-ckpt', type=str, required=True)
+    # Training settings
+    add("--amp", type=str, default='fp16')
+    add('--epochs', type=int, default=15)
+    add('--num-workers', type=int, default=32)
+    add('--num-threads', type=int, default=1)
+    add('--eval-every-step', type=int, default=1000)
+    # save all states including optimizer every save-every-step
+    add('--save-every-step', type=int, default=1000)
+    add('--resume-from', type=str, default=None, help='Path to checkpoint to resume training')
+
+    # Log and random seed
+    add('--random-seed', type=int, default=2025)
+    add('--log-step', type=int, default=200)
+    add('--log-dir', type=str, default='./logs/')
+    add('--save-dir', type=str, default='./ckpts/')
+    return cli.parse_args()
+
+
+def setup_directories(args, params):
+    """Nest args.log_dir / args.save_dir under params['model_name'] (in place) and create both."""
+    name = params['model_name']
+    args.log_dir, args.save_dir = (os.path.join(d, name) + '/' for d in (args.log_dir, args.save_dir))
+    for folder in (args.log_dir, args.save_dir):
+        os.makedirs(folder, exist_ok=True)
+
+
+def set_device(args):
+    """Cap CPU threads and set args.device; with CUDA also seed it, allow TF32 and pin cuDNN."""
+    torch.set_num_threads(args.num_threads)
+    if not torch.cuda.is_available():
+        args.device = 'cpu'
+        return
+    args.device = 'cuda'
+    torch.cuda.manual_seed_all(args.random_seed)
+    torch.backends.cuda.matmul.allow_tf32 = True
+    if torch.backends.cudnn.is_available():  # deterministic mode on, benchmark mode off
+        torch.backends.cudnn.deterministic, torch.backends.cudnn.benchmark = True, False
+
+
+def prepare_batch(batch, mel, latent_sr):
+    """Unpack a collated batch and replace the waveform `y` by its rearranged mel features.
+
+    Returns (x, x_lens + 1, mel(y) rearranged 'b d n -> b n d', (y_lens * latent_sr).long(),
+    c, c_lens, tag).
+    """
+    x, x_lens, y, y_lens, c, c_lens, tag = (
+        batch[key] for key in ("x", "x_lens", "y", "y_lens", "c", "c_lens", "tag"))
+    x_lens = x_lens + 1  # add len for clap embedding
+    with torch.no_grad():  # feature extraction is kept out of the autograd graph
+        audio_clip = rearrange(mel(y), 'b d n -> b n d')
+        y_lens = (y_lens * latent_sr).long()
+    return x, x_lens, audio_clip, y_lens, c, c_lens, tag
+
+
+if __name__ == '__main__':
+    # Fine-tune a pretrained CrossDiT with a flow-matching loss, driven by Hugging Face Accelerate.
+    args = parse_args()
+    params = load_yaml_with_includes(args.config_name)
+
+    # random seed
+    set_device(args)
+    random.seed(args.random_seed)
+    torch.manual_seed(args.random_seed)
+
+    accelerator = Accelerator(mixed_precision=args.amp,
+                              gradient_accumulation_steps=params['opt']['accumulation_steps'],
+                              step_scheduler_with_optimizer=False)
+
+    # dataset
+    train_set = CapSpeech(**params['data']['trainset'])
+    train_loader = DataLoader(train_set, num_workers=args.num_workers,
+                              batch_size=params['opt']['batch_size'], shuffle=True,
+                              collate_fn=train_set.collate)
+
+    val_set = CapSpeech(**params['data']['valset'])
+    val_loader = DataLoader(val_set, num_workers=0,
+                            batch_size=1, shuffle=False,
+                            collate_fn=val_set.collate)
+
+    # load dit; map_location='cpu' so no rank deserializes onto the device the checkpoint was saved from
+    model = CrossDiT(**params['model'])
+    model.load_state_dict(torch.load(args.pretrained_ckpt, map_location='cpu')["model"])
+
+    # mel spectrogram - moved to the accelerator device after preparation
+    mel = MelSpec(**params['mel'])
+    latent_sr = params['mel']['target_sample_rate'] / params['mel']['hop_length']
+
+    # load vocoder
+    vocoder = bigvgan.BigVGAN.from_pretrained('nvidia/bigvgan_v2_24khz_100band_256x', use_cuda_kernel=False)
+    vocoder.remove_weight_norm()
+    vocoder = vocoder.eval().to(accelerator.device)
+
+    # prepare opt
+    optimizer = torch.optim.AdamW(model.parameters(), lr=params['opt']['learning_rate'])
+
+    if args.resume_from is not None and os.path.exists(args.resume_from):
+        checkpoint = torch.load(args.resume_from, map_location='cpu')
+        model.load_state_dict(checkpoint["model"])
+        optimizer.load_state_dict(checkpoint["optimizer"])
+        global_step = checkpoint["global_step"]
+        start_epoch = checkpoint["epoch"] + 1  # Continue from the next epoch (rest of the saved epoch is skipped)
+        print(f"Resuming training from checkpoint: {args.resume_from}, starting from epoch {start_epoch}.")
+    else:
+        global_step = 0
+        start_epoch = 0
+    # NOTE: the scheduler is always rebuilt from the config; the .pt checkpoints below store no scheduler state
+    lr_scheduler = get_lr_scheduler(optimizer, 'customized', **params['opt']['lr_scheduler'])
+
+    # Prepare with accelerator
+    (model, optimizer, lr_scheduler,
+     train_loader, val_loader) = accelerator.prepare(model, optimizer, lr_scheduler, train_loader, val_loader)
+
+    # Move mel and vocoder to the same device as model AFTER preparation
+    mel = mel.to(accelerator.device)
+    vocoder = vocoder.to(accelerator.device)
+
+    # Add synchronization point
+    accelerator.wait_for_everyone()
+
+    losses = 0.0
+
+    if accelerator.is_main_process:
+        setup_directories(args, params)
+        trainable_params = sum(param.nelement() for param in model.parameters() if param.requires_grad)
+        print("Number of trainable parameters: %.2fM" % (trainable_params / 1e6))
+
+    # Add synchronization point
+    accelerator.wait_for_everyone()
+
+    # REMOVED initial evaluation to prevent deadlock
+    # We'll evaluate after the first epoch or at the first eval step
+
+    for epoch in range(start_epoch, args.epochs):
+        model.train()
+
+        # Show the tqdm bar on the local main process only
+        progress_bar = tqdm(train_loader, disable=not accelerator.is_local_main_process)
+
+        for step, batch in enumerate(progress_bar):
+            with accelerator.accumulate(model):
+                (text, text_lens, audio_clips, audio_lens, prompt, prompt_lens, clap) = prepare_batch(batch, mel, latent_sr)
+                # prepare flow matching: noisy_x1 interpolates noise x0 -> data x1 at time t, target is x1 - x0
+                x1 = audio_clips
+                x0 = torch.randn_like(x1)
+                t = torch.rand((x1.shape[0],), dtype=x1.dtype, device=x1.device)
+                sigma = rearrange(t, 'b -> b 1 1')
+                noisy_x1 = (1 - sigma) * x0.clone() + sigma * x1.clone()
+                flow = x1.clone() - x0.clone()
+                # option: audio-prompt based zero-shot tts
+                # tts_mask = create_tts_mask(seq_len, x1.shape[1], params['opt']['mask_range'])
+                # # cond = x1.clone(), cond[tts_mask[..., None]] = 0
+                # cond = torch.where(tts_mask[..., None], torch.zeros_like(x1), x1)
+                cond = None
+
+                # prepare batch cfg: randomly blank the prompt; text/clap are only blanked together with it
+                drop_prompt = (torch.rand(x1.shape[0]) < params['opt']['drop_spk'])
+                drop_text = drop_prompt & (torch.rand(x1.shape[0]) < params['opt']['drop_text'])
+
+                prompt[drop_prompt] = 0.0
+                prompt_lens[drop_prompt] = 1
+                clap[drop_text] = 0.0
+                text[drop_text] = -1
+
+                seq_len_audio = audio_clips.shape[1]
+                pad_mask = make_pad_mask(audio_lens, seq_len_audio).to(audio_clips.device)
+
+                seq_len_prompt = prompt.shape[1]
+                prompt_mask = make_pad_mask(prompt_lens, seq_len_prompt).to(prompt.device)
+
+                pred = model(x=noisy_x1, cond=cond,
+                             prompt=prompt, clap=clap, text=text, time=t,
+                             mask=pad_mask, prompt_mask=prompt_mask)
+
+                loss = F.mse_loss(pred, flow, reduction="none")
+                loss = loss[pad_mask].mean()
+
+                accelerator.backward(loss)
+                if accelerator.sync_gradients:
+                    if 'grad_clip' in params['opt'] and params['opt']['grad_clip'] > 0:
+                        accelerator.clip_grad_norm_(model.parameters(),
+                                                    max_norm=params['opt']['grad_clip'])
+                optimizer.step()
+                lr_scheduler.step()
+                optimizer.zero_grad()
+
+            # Fixed step counting - increment only once per actual step, not per accumulation step
+            if accelerator.sync_gradients:
+                global_step += 1
+                losses += loss.item()
+
+                # Add progress bar description
+                if accelerator.is_local_main_process:
+                    progress_bar.set_description(f"Epoch {epoch+1}, Loss: {loss.item():.6f}")
+
+                if global_step % args.log_step == 0:
+                    losses = losses / args.log_step  # Calculate average loss
+
+                    if accelerator.is_main_process:
+                        current_time = time.asctime(time.localtime(time.time()))
+                        epoch_info = f'Epoch: [{epoch + 1}][{args.epochs}]'
+                        batch_info = f'Global Step: {global_step}'
+                        loss_info = f'Loss: {losses:.6f}'
+
+                        # Extract the learning rate from the optimizer
+                        lr = optimizer.param_groups[0]['lr']
+                        lr_info = f'Learning Rate: {lr:.6f}'
+
+                        log_message = f'{current_time}\n{epoch_info}    {batch_info}    {loss_info}    {lr_info}\n'
+
+                        with open(args.log_dir + 'log.txt', mode='a') as n:
+                            n.write(log_message)
+
+                    # Reset loss accumulator
+                    losses = 0.0
+
+                # Evaluation logic (checkpoints are only written on evaluation steps)
+                if global_step % args.eval_every_step == 0:
+                    # Set model to eval mode
+                    model.eval()
+
+                    # Synchronize before evaluation
+                    accelerator.wait_for_everyone()
+
+                    if accelerator.is_main_process:
+                        # Get unwrapped model for evaluation
+                        unwrapped_model = accelerator.unwrap_model(model)
+
+                        # Run evaluation without specifying device
+                        eval_model(unwrapped_model, vocoder, mel, val_loader, params,
+                                   steps=25, cfg=2.0,
+                                   sway_sampling_coef=-1.0,
+                                   # device is intentionally not passed (explicit device setting removed)
+                                   epoch=global_step, save_path=args.log_dir + 'output/', val_num=1)
+
+                        # Save model checkpoint
+                        accelerator.save({
+                            "model": unwrapped_model.state_dict(),
+                            "optimizer": optimizer.state_dict(),
+                            "epoch": epoch,
+                            "global_step": global_step,
+                        }, args.save_dir + str(global_step) + '.pt')
+
+                        # Save full state including optimizer if needed
+                        if global_step % args.save_every_step == 0:
+                            accelerator.save_state(f"{args.save_dir}{global_step}")
+
+                    # Synchronize after evaluation and saving
+                    accelerator.wait_for_everyone()
+
+                    # Set model back to train mode
+                    model.train()
+
+        # Synchronize at the end of each epoch
+        accelerator.wait_for_everyone()
diff --git a/capspeech/nar/generate.py b/capspeech/nar/generate.py
new file mode 100644
index 0000000000000000000000000000000000000000..41af40c1802ace04dd998da4f098c7fb5753cb06
--- /dev/null
+++ b/capspeech/nar/generate.py
@@ -0,0 +1,229 @@
+import os
+import time
+import random
+import argparse
+import numpy as np
+from tqdm import tqdm
+from huggingface_hub import snapshot_download
+
+import torch
+import torch.nn as nn
+import torch.nn.functional as F
+import math
+from capspeech.nar import bigvgan
+import librosa
+from capspeech.nar.utils import make_pad_mask
+from capspeech.nar.model.modules import MelSpec
+from capspeech.nar.network.crossdit import CrossDiT
+from capspeech.nar.inference import sample
+from capspeech.nar.utils import load_yaml_with_includes
+import soundfile as sf
+from transformers import T5EncoderModel, AutoTokenizer
+from g2p_en import G2p
+import laion_clap
+from transformers import AutoTokenizer, AutoModelForSequenceClassification
+import re
+import time
+
+def seed_everything(seed):
+    """Seed PYTHONHASHSEED, random, NumPy and torch (CPU + CUDA) and pin cuDNN settings."""
+    os.environ['PYTHONHASHSEED'] = str(seed)
+    for seeder in (random.seed, np.random.seed, torch.manual_seed, torch.cuda.manual_seed):
+        seeder(seed)
+    # cuDNN: benchmark mode off, deterministic mode on
+    torch.backends.cudnn.benchmark = False
+    torch.backends.cudnn.deterministic = True
+
+valid_symbols = [  # closed vocabulary kept by encode(); any other token is dropped there
+      'AA', 'AA0', 'AA1', 'AA2', 'AE', 'AE0', 'AE1', 'AE2', 'AH', 'AH0', 'AH1', 'AH2',
+      'AO', 'AO0', 'AO1', 'AO2', 'AW', 'AW0', 'AW1', 'AW2', 'AY', 'AY0', 'AY1', 'AY2',
+      'B', 'CH', 'D', 'DH', 'EH', 'EH0', 'EH1', 'EH2', 'ER', 'ER0', 'ER1', 'ER2', 'EY',
+      'EY0', 'EY1', 'EY2', 'F', 'G', 'HH', 'IH', 'IH0', 'IH1', 'IH2', 'IY', 'IY0', 'IY1',
+      'IY2', 'JH', 'K', 'L', 'M', 'N', 'NG', 'OW', 'OW0', 'OW1', 'OW2', 'OY', 'OY0',
+      'OY1', 'OY2', 'P', 'R', 'S', 'SH', 'T', 'TH', 'UH', 'UH0', 'UH1', 'UH2', 'UW',
+      'UW0', 'UW1', 'UW2', 'V', 'W', 'Y', 'Z', 'ZH', '<BLK>', ',', '.', '!', '?', 
+      '<B_start>', '<B_end>', '<I_start>', '<I_end>'  # span markers that encode() re-inserts
+    ]
+
+def encode(text, text_tokenizer):
+    """Turn a transcript into the list of phoneme symbols used as model input.
+
+    Args:
+        text: Transcript string. If it contains ``<B_start>`` (checked first)
+            or ``<I_start>``, the matching end marker must be present too and
+            everything up to the first ``>`` is dropped as the audio label.
+        text_tokenizer: Callable mapping a string to a list of phoneme strings.
+
+    Returns:
+        list[str]: tokens with every space rewritten to ``<BLK>``, restricted
+        to the entries of the module-level ``valid_symbols``.
+
+    Raises:
+        AssertionError: a start marker appears without its end marker.
+        IndexError: a marker does not survive the audio-label removal.
+    """
+    # <B_...> takes precedence over <I_...>; only the first matching pair is handled.
+    marker_pairs = (('<B_start>', '<B_end>'), ('<I_start>', '<I_end>'))
+    for start_tag, end_tag in marker_pairs:
+        if start_tag not in text:
+            continue
+        assert end_tag in text, text
+        body = text.split(">", 1)[1].strip()  # remove the audio label
+        # Cut the transcript into: before the span / inside it / after it.
+        head = body.split(start_tag)[0]
+        inner = body.split(start_tag)[1].split(end_tag)[0]
+        tail = body.split(end_tag)[1]
+        head_phn = text_tokenizer(head)
+        inner_phn = text_tokenizer(inner)
+        tail_phn = text_tokenizer(tail)
+
+        # A " " token always follows the start marker; the other separators are
+        # emitted only when the adjacent segment produced phonemes.
+        phn = [
+            *head_phn, *([" "] if len(head_phn) > 0 else []), start_tag, " ",
+            *inner_phn, *([" "] if len(inner_phn) > 0 else []), end_tag,
+            *([" "] if len(tail_phn) > 0 else []), *tail_phn,
+        ]
+        break
+    else:
+        # No span marker at all: tokenize the transcript as it is.
+        phn = text_tokenizer(text)
+
+    # Spaces become the blank symbol; anything outside the vocabulary is discarded.
+    blanked = (item.replace(' ', '<BLK>') for item in phn)
+    return [symbol for symbol in blanked if symbol in valid_symbols]
+
+def estimate_duration_range(text):
+    """Derive duration bounds from the whitespace-separated word count of `text`.
+
+    Returns (words/4.0, words/1.5, words/3.0, words/1.5) as (min, max, ref_min, ref_max).
+    """
+    n_words = len(text.strip().split())
+    upper = n_words / 1.5
+    return n_words / 4.0, upper, n_words / 3.0, upper
+
+def get_duration(text, predicted_duration):
+    """Return `predicted_duration`, or a random fallback when it leaves the estimated range."""
+    min_dur, max_dur, ref_min, ref_max = estimate_duration_range(re.sub(r"<[^>]*>", "", text))
+    # transcripts containing <I_start> get a random 0.5-2.0 allowance added to both bounds
+    event_dur = 0 if "<I_start>" not in text else random.uniform(0.5, 2.0)
+    off_range = predicted_duration < min_dur + event_dur or predicted_duration > max_dur + event_dur
+    return (round(random.uniform(ref_min, ref_max), 2) + event_dur) if off_range else predicted_duration
+
+def run(model_list, device, duration, transcript, caption, speed=1.0, steps=25, cfg=2.0):
+    """Synthesize audio for `transcript` in the style described by `caption`.
+
+    Args:
+        model_list: The 9-item list produced by load_model().
+        device: Device the input tensors are moved to and sampling runs on.
+        duration: Requested length (divided by `speed` before use); None asks
+            the duration predictor, whose output is checked by get_duration().
+        transcript: Text to speak. When it contains ``<B_start>`` or
+            ``<I_start>``, its leading ``<tag>`` becomes the CLAP text query.
+        caption: Style description fed to the caption encoder.
+        speed: Divisor applied to the duration; 0 is treated as 1.
+        steps, cfg: Forwarded unchanged to sample().
+
+    Returns:
+        The value returned by sample() (the generated waveform).
+    """
+    (model, vocoder, phn2num, text_tokenizer, clap_model,
+     duration_tokenizer, duration_model, caption_tokenizer, caption_encoder) = model_list
+    print("Start Generation...")
+    start_time = time.time()
+    tag = "none"
+    if "<B_start>" in transcript or "<I_start>" in transcript:
+        # e.g. "<Dog_barking> hi ..." -> "dog barking"
+        tag = transcript.split(">", 1)[0].strip()[1:].lower().replace("_", " ")
+
+    text_tokens = [phn2num[item] for item in encode(transcript, text_tokenizer)]
+    text = torch.LongTensor(text_tokens).unsqueeze(0).to(device)
+    # The duration predictor reads caption and transcript joined by " <NEW_SEP> ".
+    if duration is None:
+        duration_inputs = duration_tokenizer(caption + " <NEW_SEP> " + transcript, return_tensors="pt",
+                                             padding="max_length", truncation=True, max_length=400)
+
+    with torch.no_grad():
+        ori_tokens = caption_tokenizer(caption, return_tensors="pt")["input_ids"].to(device)
+        prompt = caption_encoder(input_ids=ori_tokens).last_hidden_state.squeeze().unsqueeze(0).to(device)
+        tag_embed = clap_model.get_text_embedding([tag], use_tensor=True)
+        clap = tag_embed.squeeze().unsqueeze(0).to(device)
+
+        if duration is None:
+            predicted_duration = duration_model(**duration_inputs).logits.squeeze().item()
+            duration = get_duration(transcript, predicted_duration)
+    duration = duration / (1 if speed == 0 else speed)
+    n_frames = math.ceil(duration * 24000 / 256)  # presumably sample rate / hop size - TODO confirm
+    audio_clips = torch.zeros([1, n_frames, 100]).to(device)
+    prompt_lens = torch.Tensor([prompt.shape[1]])
+    prompt_mask = make_pad_mask(prompt_lens, prompt.shape[1]).to(prompt.device)
+    gen = sample(model, vocoder,
+                 audio_clips, None, text, prompt, clap, prompt_mask,
+                 steps=steps, cfg=cfg,
+                 sway_sampling_coef=-1.0, device=device)
+
+    elapsed = time.time() - start_time
+    audio_len = gen.shape[-1] / 24000 # sampling rate fixed in this work
+    print(f"RTF: {elapsed / audio_len:.4f}")
+    return gen
+
+def load_model(device, task):
+    """Download the CapSpeech NAR assets and build everything run() needs.
+
+    Args:
+        device: Target of the DiT, vocoder and caption encoder (the CLAP model
+            and the duration predictor are not moved to it here).
+        task: "PT", "CapTTS", "EmoCapTTS", "AccCapTTS" or "AgentTTS"; selects
+            the ``nar_<task>.pt`` checkpoint.
+
+    Returns:
+        list: [model, vocoder, phn2num, text_tokenizer, clap_model,
+        duration_tokenizer, duration_model, caption_tokenizer, caption_encoder].
+
+    Raises:
+        ValueError: `task` is not a supported name (checked before downloading).
+    """
+    supported_tasks = ("PT", "CapTTS", "EmoCapTTS", "AccCapTTS", "AgentTTS")
+    if task not in supported_tasks:
+        # explicit raise instead of `assert 1 == 0`: asserts are stripped under `python -O`
+        raise ValueError(f"Unknown task {task!r}; expected one of {supported_tasks}")
+
+    print("Downloading model from Huggingface...")
+    local_dir = snapshot_download(repo_id="OpenSound/CapSpeech-models")
+    model_path = os.path.join(local_dir, f"nar_{task}.pt")
+
+    print("Loading models...")
+    params = load_yaml_with_includes(os.path.join(local_dir, "nar_pretrain.yaml"))
+    model = CrossDiT(**params['model']).to(device)
+    # map_location: load straight onto `device`, so a checkpoint saved on a GPU
+    # still opens on a host without that GPU (e.g. CPU-only inference).
+    checkpoint = torch.load(model_path, map_location=device)['model']
+    model.load_state_dict(checkpoint, strict=True)
+
+    # load vocab: each line is "<index> <symbol>"; blank lines are skipped
+    vocab_fn = os.path.join(local_dir, "vocab.txt")
+    with open(vocab_fn, "r") as f:
+        temp = [l.strip().split(" ") for l in f if l.strip()]
+        phn2num = {item[1]: int(item[0]) for item in temp}
+    # load g2p
+    text_tokenizer = G2p()
+
+    # load vocoder (use_cuda_kernel=True is an optional faster-inference setting)
+    vocoder = bigvgan.BigVGAN.from_pretrained('nvidia/bigvgan_v2_24khz_100band_256x', use_cuda_kernel=False)
+    # remove weight norm in the model and set to eval mode
+    vocoder.remove_weight_norm()
+    vocoder = vocoder.eval().to(device)
+
+    # load t5
+    caption_tokenizer = AutoTokenizer.from_pretrained("google/flan-t5-large")
+    caption_encoder = T5EncoderModel.from_pretrained("google/flan-t5-large").to(device).eval()
+
+    # load clap
+    clap_model = laion_clap.CLAP_Module(enable_fusion=False)
+    clap_model.load_ckpt(os.path.join(local_dir, "clap-630k-best.pt"))
+
+    # load duration predictor
+    duration_tokenizer = AutoTokenizer.from_pretrained(os.path.join(local_dir, "nar_duration_predictor"))
+    duration_model = AutoModelForSequenceClassification.from_pretrained(os.path.join(local_dir, "nar_duration_predictor"))
+    duration_model.eval()
+    return [model, vocoder, phn2num, text_tokenizer, clap_model, duration_tokenizer, duration_model, caption_tokenizer, caption_encoder]
diff --git a/capspeech/nar/inference.py b/capspeech/nar/inference.py
new file mode 100644
index 0000000000000000000000000000000000000000..b50c880e6ff7c93ffa02910c3a3fc5326a399762
--- /dev/null
+++ b/capspeech/nar/inference.py
@@ -0,0 +1,95 @@
+import os
+import torch
+import librosa
+import pandas as pd
+import soundfile as sf
+from tqdm import tqdm
+from torchdiffeq import odeint
+from einops import rearrange
+from capspeech.nar.utils import make_pad_mask
+
+@torch.no_grad()
+def sample(model, vocoder,
+           x, cond, text, prompt, clap, prompt_mask,
+           steps=25, cfg=2.0,
+           sway_sampling_coef=-1.0, device='cuda'):
+    """Generate a waveform by integrating the model's prediction with Euler ODE steps.
+
+    `x` is only used as the template (shape, dtype, device) for the initial
+    noise. At every step the conditional prediction is combined with a
+    null-condition prediction as ``pred + (pred - null_pred) * cfg``.
+
+    Returns:
+        The vocoder output for the final state, squeezed and converted to a
+        NumPy array on the CPU.
+    """
+    model.eval()
+    vocoder.eval()
+
+    y0 = torch.randn_like(x)
+
+    # Null conditioning: text ids all -1, zeroed clap/prompt, prompt mask set at position 0 only.
+    null_cond = dict(text=torch.ones_like(text) * -1, clap=torch.zeros_like(clap),
+                     prompt=torch.zeros_like(prompt), prompt_mask=torch.zeros_like(prompt_mask))
+    null_cond['prompt_mask'][:, 0] = 1
+
+    def guided_flow(t, state):
+        shared = dict(x=state, cond=cond, time=t, mask=None)
+        pred = model(text=text, prompt=prompt, clap=clap, prompt_mask=prompt_mask, **shared)
+        null_pred = model(**null_cond, **shared)
+        return pred + (pred - null_pred) * cfg
+
+    # `steps` time points on [0, 1], optionally warped by the sway-sampling term.
+    t = torch.linspace(0, 1, steps, device=device)
+    if sway_sampling_coef is not None:
+        t = t + sway_sampling_coef * (torch.cos(torch.pi / 2 * t) - 1 + t)
+
+    final_state = odeint(guided_flow, y0, t, method="euler")[-1]
+    vocoder_in = rearrange(final_state, 'b n d -> b d n')
+    with torch.inference_mode():
+        wav_gen = vocoder(vocoder_in)
+    # wav_gen is FloatTensor with shape [1, T_time]
+    return wav_gen.squeeze().cpu().numpy()
+
+
+def prepare_batch(batch, mel, latent_sr):
+    """Split a collated batch into model inputs, computing mel features from the audio `y`.
+
+    `x_lens` is returned incremented by one (len for the clap embedding) and
+    `y_lens` is multiplied by `latent_sr` and cast to long.
+    """
+    x, x_lens, y, y_lens, c, c_lens, tag = map(
+        batch.__getitem__, ("x", "x_lens", "y", "y_lens", "c", "c_lens", "tag"))
+    with torch.no_grad():
+        audio_clip = rearrange(mel(y), 'b d n -> b n d')
+        y_lens = (y_lens * latent_sr).long()
+    return x, x_lens + 1, audio_clip, y_lens, c, c_lens, tag
+
+@torch.no_grad()
+def eval_model(model, vocos, mel, val_loader, params,
+               steps=25, cfg=2.0,
+               sway_sampling_coef=-1.0, device='cuda',
+               epoch=0, save_path='logs/eval/', val_num=5):
+    """Generate audio for the first `val_num` validation batches and write WAV files.
+
+    Ground-truth durations are used: the mel features of each reference clip
+    are what sample() receives as `x`. Files go to ``<save_path>/<epoch>/<step>.wav``.
+    """
+    out_dir = save_path + '/' + str(epoch) + '/'
+    os.makedirs(out_dir, exist_ok=True)
+    sample_rate = params['mel']['target_sample_rate']
+    latent_sr = sample_rate / params['mel']['hop_length']
+
+    for step, batch in enumerate(tqdm(val_loader)):
+        (text, text_lens, audio_clips, audio_lens,
+         prompt, prompt_lens, clap) = prepare_batch(batch, mel, latent_sr)
+        prompt_mask = make_pad_mask(prompt_lens, prompt.shape[1]).to(prompt.device)
+
+        gen = sample(model, vocos,
+                     audio_clips, None, text, prompt, clap, prompt_mask,
+                     steps=steps, cfg=cfg,
+                     sway_sampling_coef=sway_sampling_coef, device=device)
+        sf.write(out_dir + f'{step}.wav', gen, samplerate=sample_rate)
+
+        if step + 1 >= val_num:
+            break
\ No newline at end of file
diff --git a/capspeech/nar/meldataset.py b/capspeech/nar/meldataset.py
new file mode 100644
index 0000000000000000000000000000000000000000..bcfd5d2852ccf647efec1ece3e82d23b386d8a40
--- /dev/null
+++ b/capspeech/nar/meldataset.py
@@ -0,0 +1,409 @@
+# Copyright (c) 2024 NVIDIA CORPORATION.
+#   Licensed under the MIT license.
+
+# Adapted from https://github.com/jik876/hifi-gan under the MIT license.
+#   LICENSE is in incl_licenses directory.
+
+import math
+import os
+import random
+import torch
+import torch.utils.data
+import numpy as np
+import librosa
+from librosa.filters import mel as librosa_mel_fn
+import pathlib
+from tqdm import tqdm
+from typing import List, Tuple, Optional
+import os
+import shutil
+
+class AttrDict(dict):
+    def __init__(self, *args, **kwargs):
+        super(AttrDict, self).__init__(*args, **kwargs)
+        self.__dict__ = self
+
+
+def build_env(config, config_name, path):
+    t_path = os.path.join(path, config_name)
+    if config != t_path:
+        os.makedirs(path, exist_ok=True)
+        shutil.copyfile(config, os.path.join(path, config_name))
+
+MAX_WAV_VALUE = 32767.0  # NOTE: 32768.0 - 1, chosen to prevent int16 overflow (which results in a popping sound in corner cases)
+
+
+def dynamic_range_compression(x, C=1, clip_val=1e-5):
+    return np.log(np.clip(x, a_min=clip_val, a_max=None) * C)
+
+
+def dynamic_range_decompression(x, C=1):
+    return np.exp(x) / C
+
+
+def dynamic_range_compression_torch(x, C=1, clip_val=1e-5):
+    return torch.log(torch.clamp(x, min=clip_val) * C)
+
+
+def dynamic_range_decompression_torch(x, C=1):
+    return torch.exp(x) / C
+
+
+def spectral_normalize_torch(magnitudes):  # log-compress via dynamic_range_compression_torch with its defaults (C=1, clip_val=1e-5)
+    return dynamic_range_compression_torch(magnitudes)
+
+
+def spectral_de_normalize_torch(magnitudes):  # exponentiate via dynamic_range_decompression_torch with its default C=1
+    return dynamic_range_decompression_torch(magnitudes)
+
+
+mel_basis_cache = {}  # key string built in mel_spectrogram -> mel filterbank tensor on that device
+hann_window_cache = {}  # same key -> Hann window tensor of length win_size
+
+
+def mel_spectrogram(
+    y: torch.Tensor,
+    n_fft: int,
+    num_mels: int,
+    sampling_rate: int,
+    hop_size: int,
+    win_size: int,
+    fmin: int,
+    fmax: int = None,
+    center: bool = False,
+) -> torch.Tensor:
+    """
+    Calculate the log-compressed mel spectrogram of an input signal.
+    This function uses slaney norm for the librosa mel filterbank (using librosa.filters.mel) and uses Hann window for STFT (using torch.stft). The filterbank and window are cached per parameter set and device, and the signal is reflect-padded by (n_fft - hop_size) // 2 on both sides before the STFT.
+
+    Args:
+        y (torch.Tensor): Input signal (MelDataset passes shape [1, T]). A warning is printed if any sample lies outside [-1, 1].
+        n_fft (int): FFT size.
+        num_mels (int): Number of mel bins.
+        sampling_rate (int): Sampling rate of the input signal.
+        hop_size (int): Hop size for STFT.
+        win_size (int): Window size for STFT.
+        fmin (int): Minimum frequency for mel filterbank.
+        fmax (int): Maximum frequency for mel filterbank. If None, defaults to half the sampling rate (fmax = sr / 2.0) inside librosa_mel_fn
+        center (bool): Passed to torch.stft as its `center` argument. Default is False.
+
+    Returns:
+        torch.Tensor: Mel spectrogram after natural-log compression (values are clamped to at least 1e-5 before the log).
+    """
+    if torch.min(y) < -1.0:
+        print(f"[WARNING] Min value of input waveform signal is {torch.min(y)}")
+    if torch.max(y) > 1.0:
+        print(f"[WARNING] Max value of input waveform signal is {torch.max(y)}")
+
+    device = y.device
+    key = f"{n_fft}_{num_mels}_{sampling_rate}_{hop_size}_{win_size}_{fmin}_{fmax}_{device}"
+    # first call for this key: build the filterbank and window once and keep them for later calls
+    if key not in mel_basis_cache:
+        mel = librosa_mel_fn(
+            sr=sampling_rate, n_fft=n_fft, n_mels=num_mels, fmin=fmin, fmax=fmax
+        )
+        mel_basis_cache[key] = torch.from_numpy(mel).float().to(device)
+        hann_window_cache[key] = torch.hann_window(win_size).to(device)
+
+    mel_basis = mel_basis_cache[key]
+    hann_window = hann_window_cache[key]
+    # manual reflect padding of the time axis (a channel dim is added for F.pad and removed again)
+    padding = (n_fft - hop_size) // 2
+    y = torch.nn.functional.pad(
+        y.unsqueeze(1), (padding, padding), mode="reflect"
+    ).squeeze(1)
+
+    spec = torch.stft(
+        y,
+        n_fft,
+        hop_length=hop_size,
+        win_length=win_size,
+        window=hann_window,
+        center=center,
+        pad_mode="reflect",
+        normalized=False,
+        onesided=True,
+        return_complex=True,
+    )
+    spec = torch.sqrt(torch.view_as_real(spec).pow(2).sum(-1) + 1e-9)  # magnitude; 1e-9 is added under the sqrt
+    # apply the mel filterbank to the magnitudes, then log-compress
+    mel_spec = torch.matmul(mel_basis, spec)
+    mel_spec = spectral_normalize_torch(mel_spec)
+
+    return mel_spec
+
+
+def get_mel_spectrogram(wav, h):
+    """
+    Generate mel spectrogram from a waveform using given hyperparameters.
+
+    Args:
+        wav (torch.Tensor): Input waveform.
+        h: Hyperparameters object with attributes n_fft, num_mels, sampling_rate, hop_size, win_size, fmin, fmax.
+
+    Returns:
+        torch.Tensor: Mel spectrogram.
+    """
+    return mel_spectrogram(
+        wav,
+        h.n_fft,
+        h.num_mels,
+        h.sampling_rate,
+        h.hop_size,
+        h.win_size,
+        h.fmin,
+        h.fmax,
+    )
+
+
+def get_dataset_filelist(a):
+    training_files = []
+    validation_files = []
+    list_unseen_validation_files = []
+
+    with open(a.input_training_file, "r", encoding="utf-8") as fi:
+        training_files = [
+            os.path.join(a.input_wavs_dir, x.split("|")[0] + ".wav")
+            for x in fi.read().split("\n")
+            if len(x) > 0
+        ]
+        print(f"first training file: {training_files[0]}")
+
+    with open(a.input_validation_file, "r", encoding="utf-8") as fi:
+        validation_files = [
+            os.path.join(a.input_wavs_dir, x.split("|")[0] + ".wav")
+            for x in fi.read().split("\n")
+            if len(x) > 0
+        ]
+        print(f"first validation file: {validation_files[0]}")
+
+    for i in range(len(a.list_input_unseen_validation_file)):
+        with open(a.list_input_unseen_validation_file[i], "r", encoding="utf-8") as fi:
+            unseen_validation_files = [
+                os.path.join(a.list_input_unseen_wavs_dir[i], x.split("|")[0] + ".wav")
+                for x in fi.read().split("\n")
+                if len(x) > 0
+            ]
+            print(
+                f"first unseen {i}th validation fileset: {unseen_validation_files[0]}"
+            )
+            list_unseen_validation_files.append(unseen_validation_files)
+
+    return training_files, validation_files, list_unseen_validation_files
+
+
+class MelDataset(torch.utils.data.Dataset):
+    def __init__(
+        self,
+        training_files: List[str],
+        hparams: AttrDict,
+        segment_size: int,
+        n_fft: int,
+        num_mels: int,
+        hop_size: int,
+        win_size: int,
+        sampling_rate: int,
+        fmin: int,
+        fmax: Optional[int],
+        split: bool = True,
+        shuffle: bool = True,
+        device: str = None,
+        fmax_loss: Optional[int] = None,
+        fine_tuning: bool = False,
+        base_mels_path: str = None,
+        is_seen: bool = True,
+    ):
+        self.audio_files = training_files
+        random.seed(1234)
+        if shuffle:
+            random.shuffle(self.audio_files)
+        self.hparams = hparams
+        self.is_seen = is_seen
+        if self.is_seen:
+            self.name = pathlib.Path(self.audio_files[0]).parts[0]
+        else:
+            self.name = "-".join(pathlib.Path(self.audio_files[0]).parts[:2]).strip("/")
+
+        self.segment_size = segment_size
+        self.sampling_rate = sampling_rate
+        self.split = split
+        self.n_fft = n_fft
+        self.num_mels = num_mels
+        self.hop_size = hop_size
+        self.win_size = win_size
+        self.fmin = fmin
+        self.fmax = fmax
+        self.fmax_loss = fmax_loss
+        self.device = device
+        self.fine_tuning = fine_tuning
+        self.base_mels_path = base_mels_path
+
+        print("[INFO] checking dataset integrity...")
+        for i in tqdm(range(len(self.audio_files))):
+            assert os.path.exists(
+                self.audio_files[i]
+            ), f"{self.audio_files[i]} not found"
+
+    def __getitem__(
+        self, index: int
+    ) -> Tuple[torch.Tensor, torch.Tensor, str, torch.Tensor]:
+        try:
+            filename = self.audio_files[index]
+
+            # Use librosa.load that ensures loading waveform into mono with [-1, 1] float values
+            # Audio is ndarray with shape [T_time]. Disable auto-resampling here to minimize overhead
+            # The on-the-fly resampling during training will be done only for the obtained random chunk
+            audio, source_sampling_rate = librosa.load(filename, sr=None, mono=True)
+
+            # Main logic that uses <mel, audio> pair for training BigVGAN
+            if not self.fine_tuning:
+                if self.split:  # Training step
+                    # Obtain randomized audio chunk
+                    if source_sampling_rate != self.sampling_rate:
+                        # Adjust segment size to crop if the source sr is different
+                        target_segment_size = math.ceil(
+                            self.segment_size
+                            * (source_sampling_rate / self.sampling_rate)
+                        )
+                    else:
+                        target_segment_size = self.segment_size
+
+                    # Compute upper bound index for the random chunk
+                    random_chunk_upper_bound = max(
+                        0, audio.shape[0] - target_segment_size
+                    )
+
+                    # Crop or pad audio to obtain random chunk with target_segment_size
+                    if audio.shape[0] >= target_segment_size:
+                        audio_start = random.randint(0, random_chunk_upper_bound)
+                        audio = audio[audio_start : audio_start + target_segment_size]
+                    else:
+                        audio = np.pad(
+                            audio,
+                            (0, target_segment_size - audio.shape[0]),
+                            mode="constant",
+                        )
+
+                    # Resample audio chunk to self.sampling rate
+                    if source_sampling_rate != self.sampling_rate:
+                        audio = librosa.resample(
+                            audio,
+                            orig_sr=source_sampling_rate,
+                            target_sr=self.sampling_rate,
+                        )
+                        if audio.shape[0] > self.segment_size:
+                            # trim last elements to match self.segment_size (e.g., 16385 for 44khz downsampled to 24khz -> 16384)
+                            audio = audio[: self.segment_size]
+
+                else:  # Validation step
+                    # Resample full audio clip to target sampling rate
+                    if source_sampling_rate != self.sampling_rate:
+                        audio = librosa.resample(
+                            audio,
+                            orig_sr=source_sampling_rate,
+                            target_sr=self.sampling_rate,
+                        )
+                    # Trim last elements to match audio length to self.hop_size * n for evaluation
+                    if (audio.shape[0] % self.hop_size) != 0:
+                        audio = audio[: -(audio.shape[0] % self.hop_size)]
+
+                # BigVGAN is trained using volume-normalized waveform
+                audio = librosa.util.normalize(audio) * 0.95
+
+                # Cast ndarray to torch tensor
+                audio = torch.FloatTensor(audio)
+                audio = audio.unsqueeze(0)  # [B(1), self.segment_size]
+
+                # Compute mel spectrogram corresponding to audio
+                mel = mel_spectrogram(
+                    audio,
+                    self.n_fft,
+                    self.num_mels,
+                    self.sampling_rate,
+                    self.hop_size,
+                    self.win_size,
+                    self.fmin,
+                    self.fmax,
+                    center=False,
+                )  # [B(1), self.num_mels, self.segment_size // self.hop_size]
+
+            # Fine-tuning logic that uses pre-computed mel. Example: Using TTS model-generated mel as input
+            else:
+                # For fine-tuning, assert that the waveform is in the defined sampling_rate
+                # Fine-tuning won't support on-the-fly resampling to be fool-proof (the dataset should have been prepared properly)
+                assert (
+                    source_sampling_rate == self.sampling_rate
+                ), f"For fine_tuning, waveform must be in the spcified sampling rate {self.sampling_rate}, got {source_sampling_rate}"
+
+                # Cast ndarray to torch tensor
+                audio = torch.FloatTensor(audio)
+                audio = audio.unsqueeze(0)  # [B(1), T_time]
+
+                # Load pre-computed mel from disk
+                mel = np.load(
+                    os.path.join(
+                        self.base_mels_path,
+                        os.path.splitext(os.path.split(filename)[-1])[0] + ".npy",
+                    )
+                )
+                mel = torch.from_numpy(mel)
+
+                if len(mel.shape) < 3:
+                    mel = mel.unsqueeze(0)  # ensure [B, C, T]
+
+                if self.split:
+                    frames_per_seg = math.ceil(self.segment_size / self.hop_size)
+
+                    if audio.size(1) >= self.segment_size:
+                        mel_start = random.randint(0, mel.size(2) - frames_per_seg - 1)
+                        mel = mel[:, :, mel_start : mel_start + frames_per_seg]
+                        audio = audio[
+                            :,
+                            mel_start
+                            * self.hop_size : (mel_start + frames_per_seg)
+                            * self.hop_size,
+                        ]
+
+                    # Pad pre-computed mel and audio to match length to ensuring fine-tuning without error.
+                    # NOTE: this may introduce a single-frame misalignment of the <pre-computed mel, audio>
+                    # To remove possible misalignment, it is recommended to prepare the <pre-computed mel, audio> pair where the audio length is the integer multiple of self.hop_size
+                    mel = torch.nn.functional.pad(
+                        mel, (0, frames_per_seg - mel.size(2)), "constant"
+                    )
+                    audio = torch.nn.functional.pad(
+                        audio, (0, self.segment_size - audio.size(1)), "constant"
+                    )
+
+            # Compute mel_loss used by spectral regression objective. Uses self.fmax_loss instead (usually None)
+            mel_loss = mel_spectrogram(
+                audio,
+                self.n_fft,
+                self.num_mels,
+                self.sampling_rate,
+                self.hop_size,
+                self.win_size,
+                self.fmin,
+                self.fmax_loss,
+                center=False,
+            )  # [B(1), self.num_mels, self.segment_size // self.hop_size]
+
+            # Shape sanity checks
+            assert (
+                audio.shape[1] == mel.shape[2] * self.hop_size
+                and audio.shape[1] == mel_loss.shape[2] * self.hop_size
+            ), f"Audio length must be mel frame length * hop_size. Got audio shape {audio.shape} mel shape {mel.shape} mel_loss shape {mel_loss.shape}"
+
+            return (mel.squeeze(), audio.squeeze(0), filename, mel_loss.squeeze())
+
+        # If it encounters error during loading the data, skip this sample and load random other sample to the batch
+        except Exception as e:
+            if self.fine_tuning:
+                raise e  # Terminate training if it is fine-tuning. The dataset should have been prepared properly.
+            else:
+                print(
+                    f"[WARNING] Failed to load waveform, skipping! filename: {filename} Error: {e}"
+                )
+                return self[random.randrange(len(self))]
+
+    def __len__(self):
+        return len(self.audio_files)
\ No newline at end of file
diff --git a/capspeech/nar/model/__init__.py b/capspeech/nar/model/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..f8dfdbecd777361e57529ddf922fa2ea902923d4
--- /dev/null
+++ b/capspeech/nar/model/__init__.py
@@ -0,0 +1,6 @@
+from capspeech.nar.model.cfm import CFM
+
+from capspeech.nar.model.backbones.unett import UNetT
+from capspeech.nar.model.backbones.dit import DiT
+
+from capspeech.nar.model.trainer import Trainer
diff --git a/capspeech/nar/model/backbones/README.md b/capspeech/nar/model/backbones/README.md
new file mode 100644
index 0000000000000000000000000000000000000000..155671e16fbf128a243ece9033cefd47b957af88
--- /dev/null
+++ b/capspeech/nar/model/backbones/README.md
@@ -0,0 +1,20 @@
+## Backbones quick introduction
+
+
+### unett.py
+- flat unet transformer
+- structure same as in e2-tts & voicebox paper except using rotary pos emb
+- update: allow possible abs pos emb & convnextv2 blocks for embedded text before concat
+
+### dit.py
+- adaln-zero dit
+- embedded timestep as condition
+- concatted noised_input + masked_cond + embedded_text, linear proj in
+- possible abs pos emb & convnextv2 blocks for embedded text before concat
+- possible long skip connection (first layer to last layer)
+
+### mmdit.py
+- sd3 structure
+- timestep as condition
+- left stream: text is embedded and an abs pos emb is applied
+- right stream: masked_cond & noised_input concatenated, with the same conv pos emb as unett
diff --git a/capspeech/nar/model/backbones/__init__.py b/capspeech/nar/model/backbones/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391
diff --git a/capspeech/nar/model/backbones/dit.py b/capspeech/nar/model/backbones/dit.py
new file mode 100644
index 0000000000000000000000000000000000000000..d7c822794ab0e74d78afbea29dfbf8a85b20bf63
--- /dev/null
+++ b/capspeech/nar/model/backbones/dit.py
@@ -0,0 +1,158 @@
+"""
+ein notation:
+b - batch
+n - sequence
+nt - text sequence
+nw - raw wave length
+d - dimension
+"""
+
+from __future__ import annotations
+
+import torch
+from torch import nn
+import torch.nn.functional as F
+
+from einops import repeat
+
+from x_transformers.x_transformers import RotaryEmbedding
+
+from capspeech.nar.model.modules import (
+    TimestepEmbedding,
+    ConvNeXtV2Block,
+    ConvPositionEmbedding,
+    DiTBlock,
+    AdaLayerNormZero_Final,
+    precompute_freqs_cis, get_pos_embed_indices,
+)
+
+
+# Text embedding
+
+class TextEmbedding(nn.Module):
+    def __init__(self, text_num_embeds, text_dim, conv_layers = 0, conv_mult = 2):
+        super().__init__()
+        self.text_embed = nn.Embedding(text_num_embeds + 1, text_dim)  # use 0 as filler token
+        
+        if conv_layers > 0:
+            self.extra_modeling = True
+            self.precompute_max_pos = 4096  # ~44s of 24khz audio
+            self.register_buffer("freqs_cis", precompute_freqs_cis(text_dim, self.precompute_max_pos), persistent=False)
+            self.text_blocks = nn.Sequential(*[ConvNeXtV2Block(text_dim, text_dim * conv_mult) for _ in range(conv_layers)])
+        else:
+            self.extra_modeling = False
+
+    def forward(self, text: int['b nt'], seq_len, drop_text = False):
+        batch, text_len = text.shape[0], text.shape[1]
+        text = text + 1  # use 0 as filler token. preprocess of batch pad -1, see list_str_to_idx()
+        text = text[:, :seq_len]  # curtail if character tokens are more than the mel spec tokens
+        text = F.pad(text, (0, seq_len - text_len), value = 0)
+
+        if drop_text:  # cfg for text
+            text = torch.zeros_like(text)
+
+        text = self.text_embed(text) # b n -> b n d
+
+        # possible extra modeling
+        if self.extra_modeling:
+            # sinus pos emb
+            batch_start = torch.zeros((batch,), dtype=torch.long)
+            pos_idx = get_pos_embed_indices(batch_start, seq_len, max_pos=self.precompute_max_pos)
+            text_pos_embed = self.freqs_cis[pos_idx]
+            text = text + text_pos_embed
+
+            # convnextv2 blocks
+            text = self.text_blocks(text)
+
+        return text
+
+
+# noised input audio and context mixing embedding
+
+class InputEmbedding(nn.Module):
+    def __init__(self, mel_dim, text_dim, out_dim):
+        super().__init__()
+        self.proj = nn.Linear(mel_dim * 2 + text_dim, out_dim)
+        self.conv_pos_embed = ConvPositionEmbedding(dim = out_dim)
+
+    def forward(self, x: float['b n d'], cond: float['b n d'], text_embed: float['b n d'], drop_audio_cond = False):
+        if drop_audio_cond:  # cfg for cond audio
+            cond = torch.zeros_like(cond)
+
+        x = self.proj(torch.cat((x, cond, text_embed), dim = -1))
+        x = self.conv_pos_embed(x) + x
+        return x
+    
+
+# Transformer backbone using DiT blocks
+
+class DiT(nn.Module):  # stack of DiTBlock layers conditioned on an embedded timestep; outputs mel_dim channels
+    def __init__(self, *,
+                 dim, depth = 8, heads = 8, dim_head = 64, dropout = 0.1, ff_mult = 4,
+                 mel_dim = 100, text_num_embeds = 256, text_dim = None, conv_layers = 0,
+                 long_skip_connection = False,
+    ):
+        super().__init__()
+
+        self.time_embed = TimestepEmbedding(dim)
+        if text_dim is None:
+            text_dim = mel_dim  # default: text embedding width equals the mel width
+        self.text_embed = TextEmbedding(text_num_embeds, text_dim, conv_layers = conv_layers)
+        self.input_embed = InputEmbedding(mel_dim, text_dim, dim)
+
+        self.rotary_embed = RotaryEmbedding(dim_head)
+
+        self.dim = dim
+        self.depth = depth
+        # `depth` DiTBlock layers with identical settings, applied in order in forward()
+        self.transformer_blocks = nn.ModuleList(
+            [
+                DiTBlock(
+                    dim = dim,
+                    heads = heads,
+                    dim_head = dim_head,
+                    ff_mult = ff_mult,
+                    dropout = dropout
+                )
+                for _ in range(depth)
+            ]
+        )
+        self.long_skip_connection = nn.Linear(dim * 2, dim, bias = False) if long_skip_connection else None
+
+        self.norm_out = AdaLayerNormZero_Final(dim)  # final modulation
+        self.proj_out = nn.Linear(dim, mel_dim)
+
+    def forward(
+        self,
+        x: float['b n d'],     # noised input audio
+        cond: float['b n d'],  # masked cond audio
+        text: int['b nt'],     # text
+        time: float['b'] | float[''],  # time step
+        drop_audio_cond,  # cfg for cond audio
+        drop_text,        # cfg for text
+        mask: bool['b n'] | None = None,
+    ):
+        batch, seq_len = x.shape[0], x.shape[1]
+        if time.ndim == 0:
+            time = repeat(time, ' -> b', b = batch)  # broadcast a scalar time to one value per batch item
+
+        # t: conditioning time, c: context (text + masked cond audio), x: noised input audio
+        t = self.time_embed(time)
+        text_embed = self.text_embed(text, seq_len, drop_text = drop_text)
+        x = self.input_embed(x, cond, text_embed, drop_audio_cond = drop_audio_cond)
+        # rotary embedding is computed once for seq_len and handed to every block
+        rope = self.rotary_embed.forward_from_seq_len(seq_len)
+
+        if self.long_skip_connection is not None:
+            residual = x  # keep the pre-transformer activations for the long skip
+
+        for block in self.transformer_blocks:
+            x = block(x, t, mask = mask, rope = rope)
+
+        if self.long_skip_connection is not None:
+            x = self.long_skip_connection(torch.cat((x, residual), dim = -1))  # concat last and first activations, project 2*dim -> dim
+
+        x = self.norm_out(x, t)
+        output = self.proj_out(x)  # dim -> mel_dim
+
+        return output
diff --git a/capspeech/nar/model/backbones/unett.py b/capspeech/nar/model/backbones/unett.py
new file mode 100644
index 0000000000000000000000000000000000000000..aa5eb55fbdd9678fb558828eddc60a936da4f5dd
--- /dev/null
+++ b/capspeech/nar/model/backbones/unett.py
@@ -0,0 +1,201 @@
+"""
+ein notation:
+b - batch
+n - sequence
+nt - text sequence
+nw - raw wave length
+d - dimension
+"""
+
+from __future__ import annotations
+from typing import Literal
+
+import torch
+from torch import nn
+import torch.nn.functional as F
+
+from einops import repeat, pack, unpack
+
+from x_transformers import RMSNorm
+from x_transformers.x_transformers import RotaryEmbedding
+
+from capspeech.nar.model.modules import (
+    TimestepEmbedding,
+    ConvNeXtV2Block,
+    ConvPositionEmbedding,
+    Attention,
+    AttnProcessor,
+    FeedForward,
+    precompute_freqs_cis, get_pos_embed_indices,
+)
+
+
+# Text embedding
+
+class TextEmbedding(nn.Module):
+    def __init__(self, text_num_embeds, text_dim, conv_layers = 0, conv_mult = 2):
+        super().__init__()
+        self.text_embed = nn.Embedding(text_num_embeds + 1, text_dim)  # use 0 as filler token
+
+        if conv_layers > 0:
+            self.extra_modeling = True
+            self.precompute_max_pos = 4096  # ~44s of 24khz audio
+            self.register_buffer("freqs_cis", precompute_freqs_cis(text_dim, self.precompute_max_pos), persistent=False)
+            self.text_blocks = nn.Sequential(*[ConvNeXtV2Block(text_dim, text_dim * conv_mult) for _ in range(conv_layers)])
+        else:
+            self.extra_modeling = False
+
+    def forward(self, text: int['b nt'], seq_len, drop_text = False):
+        batch, text_len = text.shape[0], text.shape[1]
+        text = text + 1  # use 0 as filler token. preprocess of batch pad -1, see list_str_to_idx()
+        text = text[:, :seq_len]  # curtail if character tokens are more than the mel spec tokens
+        text = F.pad(text, (0, seq_len - text_len), value = 0)
+
+        if drop_text:  # cfg for text
+            text = torch.zeros_like(text)
+
+        text = self.text_embed(text) # b n -> b n d
+
+        # possible extra modeling
+        if self.extra_modeling:
+            # sinus pos emb
+            batch_start = torch.zeros((batch,), dtype=torch.long)
+            pos_idx = get_pos_embed_indices(batch_start, seq_len, max_pos=self.precompute_max_pos)
+            text_pos_embed = self.freqs_cis[pos_idx]
+            text = text + text_pos_embed
+
+            # convnextv2 blocks
+            text = self.text_blocks(text)
+
+        return text
+
+
+# noised input audio and context mixing embedding
+
+class InputEmbedding(nn.Module):
+    def __init__(self, mel_dim, text_dim, out_dim):
+        super().__init__()
+        self.proj = nn.Linear(mel_dim * 2 + text_dim, out_dim)
+        self.conv_pos_embed = ConvPositionEmbedding(dim = out_dim)
+
+    def forward(self, x: float['b n d'], cond: float['b n d'], text_embed: float['b n d'], drop_audio_cond = False):
+        if drop_audio_cond:  # cfg for cond audio
+            cond = torch.zeros_like(cond)
+
+        x = self.proj(torch.cat((x, cond, text_embed), dim = -1))
+        x = self.conv_pos_embed(x) + x
+        return x
+
+
+# Flat UNet Transformer backbone
+
+class UNetT(nn.Module):  # flat transformer whose second-half layers receive skip connections from the first half
+    def __init__(self, *,
+                 dim, depth = 8, heads = 8, dim_head = 64, dropout = 0.1, ff_mult = 4,
+                 mel_dim = 100, text_num_embeds = 256, text_dim = None, conv_layers = 0,
+                 skip_connect_type: Literal['add', 'concat', 'none'] = 'concat',
+    ):
+        super().__init__()
+        assert depth % 2 == 0, "UNet-Transformer's depth should be even."
+
+        self.time_embed = TimestepEmbedding(dim)
+        if text_dim is None:
+            text_dim = mel_dim  # default: text embedding width equals the mel width
+        self.text_embed = TextEmbedding(text_num_embeds, text_dim, conv_layers = conv_layers)
+        self.input_embed = InputEmbedding(mel_dim, text_dim, dim)
+
+        self.rotary_embed = RotaryEmbedding(dim_head)
+
+        # transformer layers & skip connections
+
+        self.dim = dim
+        self.skip_connect_type = skip_connect_type
+        needs_skip_proj = skip_connect_type == 'concat'  # only 'concat' needs a 2*dim -> dim projection
+
+        self.depth = depth
+        self.layers = nn.ModuleList([])
+
+        for idx in range(depth):
+            is_later_half = idx >= (depth // 2)
+
+            attn_norm = RMSNorm(dim)
+            attn = Attention(
+                processor = AttnProcessor(),
+                dim = dim,
+                heads = heads,
+                dim_head = dim_head,
+                dropout = dropout,
+                )
+
+            ff_norm = RMSNorm(dim)
+            ff = FeedForward(dim = dim, mult = ff_mult, dropout = dropout, approximate = "tanh")
+
+            skip_proj = nn.Linear(dim * 2, dim, bias = False) if needs_skip_proj and is_later_half else None
+            # each layer is stored as [skip_proj or None, attn_norm, attn, ff_norm, ff]; forward() unpacks in this order
+            self.layers.append(nn.ModuleList([
+                skip_proj,
+                attn_norm,
+                attn,
+                ff_norm,
+                ff,
+            ]))
+
+        self.norm_out = RMSNorm(dim)
+        self.proj_out = nn.Linear(dim, mel_dim)
+
+    def forward(
+        self,
+        x: float['b n d'],     # noised input audio
+        cond: float['b n d'],  # masked cond audio
+        text: int['b nt'],     # text
+        time: float['b'] | float[''],  # time step
+        drop_audio_cond,  # cfg for cond audio
+        drop_text,        # cfg for text
+        mask: bool['b n'] | None = None,
+    ):
+        batch, seq_len = x.shape[0], x.shape[1]
+        if time.ndim == 0:
+            time = repeat(time, ' -> b', b = batch)  # broadcast a scalar time to one value per batch item
+
+        # t: conditioning time, c: context (text + masked cond audio), x: noised input audio
+        t = self.time_embed(time)
+        text_embed = self.text_embed(text, seq_len, drop_text = drop_text)
+        x = self.input_embed(x, cond, text_embed, drop_audio_cond = drop_audio_cond)
+
+        # prepend time t to input x as an extra leading token, [b n d] -> [b n+1 d]
+        x, ps = pack((t, x), 'b * d')
+        if mask is not None:
+            mask = F.pad(mask, (1, 0), value=1)  # extend the mask on the left so the time token is always kept
+
+        rope = self.rotary_embed.forward_from_seq_len(seq_len + 1)
+
+        # flat unet transformer
+        skip_connect_type = self.skip_connect_type
+        skips = []
+        for idx, (maybe_skip_proj, attn_norm, attn, ff_norm, ff) in enumerate(self.layers):
+            layer = idx + 1
+
+            # skip connection logic
+            is_first_half = layer <= (self.depth // 2)
+            is_later_half = not is_first_half
+            # first-half layers push their input; later layers pop in reverse (last pushed is used first)
+            if is_first_half:
+                skips.append(x)
+
+            if is_later_half:
+                skip = skips.pop()
+                if skip_connect_type == 'concat':
+                    x = torch.cat((x, skip), dim = -1)
+                    x = maybe_skip_proj(x)
+                elif skip_connect_type == 'add':
+                    x = x + skip
+
+            # attention and feedforward blocks
+            x = attn(attn_norm(x), rope = rope, mask = mask) + x
+            x = ff(ff_norm(x)) + x
+
+        assert len(skips) == 0
+        # drop the leading time token again before projecting to mel_dim
+        _, x = unpack(self.norm_out(x), ps, 'b * d')
+
+        return self.proj_out(x)
diff --git a/capspeech/nar/model/cfm.py b/capspeech/nar/model/cfm.py
new file mode 100644
index 0000000000000000000000000000000000000000..9dc888661f1326d5951d345c4cdddf536caf2839
--- /dev/null
+++ b/capspeech/nar/model/cfm.py
@@ -0,0 +1,279 @@
+"""
+ein notation:
+b - batch
+n - sequence
+nt - text sequence
+nw - raw wave length
+d - dimension
+"""
+
+from __future__ import annotations
+from typing import Callable
+from random import random
+
+import torch
+from torch import nn
+import torch.nn.functional as F
+from torch.nn.utils.rnn import pad_sequence
+
+from torchdiffeq import odeint
+
+from einops import rearrange
+
+from capspeech.nar.model.modules import MelSpec
+
+from capspeech.nar.model.utils import (
+    default, exists, 
+    list_str_to_idx, list_str_to_tensor, 
+    lens_to_mask, mask_from_frac_lengths,
+) 
+
+
+class CFM(nn.Module):
+    def __init__(
+        self,
+        transformer: nn.Module,
+        sigma = 0.,
+        odeint_kwargs: dict = dict(
+            # atol = 1e-5,
+            # rtol = 1e-5,
+            method = 'euler'  # 'midpoint'
+        ),
+        audio_drop_prob = 0.3,
+        cond_drop_prob = 0.2,
+        num_channels = None,
+        mel_spec_module: nn.Module | None = None,
+        mel_spec_kwargs: dict = dict(),
+        frac_lengths_mask: tuple[float, float] = (0.7, 1.),
+        vocab_char_map: dict[str: int] | None = None
+    ):
+        super().__init__()
+
+        self.frac_lengths_mask = frac_lengths_mask
+
+        # mel spec
+        self.mel_spec = default(mel_spec_module, MelSpec(**mel_spec_kwargs))
+        num_channels = default(num_channels, self.mel_spec.n_mel_channels)
+        self.num_channels = num_channels
+
+        # classifier-free guidance
+        self.audio_drop_prob = audio_drop_prob
+        self.cond_drop_prob = cond_drop_prob
+
+        # transformer
+        self.transformer = transformer
+        dim = transformer.dim
+        self.dim = dim
+
+        # conditional flow related
+        self.sigma = sigma
+
+        # sampling related
+        self.odeint_kwargs = odeint_kwargs
+
+        # vocab map for tokenization
+        self.vocab_char_map = vocab_char_map
+
+    @property
+    def device(self):
+        return next(self.parameters()).device
+
+    @torch.no_grad()
+    def sample(
+        self,
+        cond: float['b n d'] | float['b nw'],
+        text: int['b nt'] | list[str],
+        duration: int | int['b'],
+        *,
+        lens: int['b'] | None = None,
+        steps = 32,
+        cfg_strength = 1., 
+        sway_sampling_coef = None,
+        seed: int | None = None,
+        max_duration = 4096, 
+        vocoder: Callable[[float['b d n']], float['b nw']] | None = None,
+        no_ref_audio = False,
+        duplicate_test = False,
+        t_inter = 0.1,
+        edit_mask = None,
+    ):
+        self.eval()
+
+        # raw wave
+
+        if cond.ndim == 2:
+            cond = self.mel_spec(cond)
+            cond = rearrange(cond, 'b d n -> b n d')
+            assert cond.shape[-1] == self.num_channels
+
+        batch, cond_seq_len, device = *cond.shape[:2], cond.device
+        if not exists(lens):
+            lens = torch.full((batch,), cond_seq_len, device = device, dtype = torch.long)
+
+        # text
+
+        if isinstance(text, list):
+            if exists(self.vocab_char_map):
+                text = list_str_to_idx(text, self.vocab_char_map).to(device)
+            else:
+                text = list_str_to_tensor(text).to(device)
+            assert text.shape[0] == batch
+
+        if exists(text):
+            text_lens = (text != -1).sum(dim = -1)
+            lens = torch.maximum(text_lens, lens) # make sure lengths are at least those of the text characters
+
+        # duration
+
+        cond_mask = lens_to_mask(lens)
+        if edit_mask is not None:
+            cond_mask = cond_mask & edit_mask
+
+        if isinstance(duration, int):
+            duration = torch.full((batch,), duration, device = device, dtype = torch.long)
+
+        duration = torch.maximum(lens + 1, duration) # just add one token so something is generated
+        duration = duration.clamp(max = max_duration)
+        max_duration = duration.amax()
+        
+        # duplicate test corner for inner time step oberservation
+        if duplicate_test:
+            test_cond = F.pad(cond, (0, 0, cond_seq_len, max_duration - 2*cond_seq_len), value = 0.)
+            
+        cond = F.pad(cond, (0, 0, 0, max_duration - cond_seq_len), value = 0.)
+        cond_mask = F.pad(cond_mask, (0, max_duration - cond_mask.shape[-1]), value = False)
+        cond_mask = rearrange(cond_mask, '... -> ... 1')
+        step_cond = torch.where(cond_mask, cond, torch.zeros_like(cond))  # allow direct control (cut cond audio) with lens passed in
+
+        if batch > 1:
+            mask = lens_to_mask(duration)
+        else:  # save memory and speed up, as single inference need no mask currently
+            mask = None
+
+        # test for no ref audio
+        if no_ref_audio:
+            cond = torch.zeros_like(cond)
+
+        # neural ode
+
+        def fn(t, x):
+            # at each step, conditioning is fixed
+            # step_cond = torch.where(cond_mask, cond, torch.zeros_like(cond))
+
+            # predict flow
+            pred = self.transformer(x = x, cond = step_cond, text = text, time = t, mask = mask, drop_audio_cond = False, drop_text = False)
+            if cfg_strength < 1e-5:
+                return pred
+            
+            null_pred = self.transformer(x = x, cond = step_cond, text = text, time = t, mask = mask, drop_audio_cond = True, drop_text = True)
+            return pred + (pred - null_pred) * cfg_strength
+
+        # noise input
+        # to make sure batch inference result is same with different batch size, and for sure single inference
+        # still some difference maybe due to convolutional layers
+        y0 = []
+        for dur in duration:
+            if exists(seed):
+                torch.manual_seed(seed)
+            y0.append(torch.randn(dur, self.num_channels, device = self.device))
+        y0 = pad_sequence(y0, padding_value = 0, batch_first = True)
+
+        t_start = 0
+
+        # duplicate test corner for inner time step oberservation
+        if duplicate_test:
+            t_start = t_inter
+            y0 = (1 - t_start) * y0 + t_start * test_cond
+            steps = int(steps * (1 - t_start))
+
+        t = torch.linspace(t_start, 1, steps, device = self.device)
+        if sway_sampling_coef is not None:
+            t = t + sway_sampling_coef * (torch.cos(torch.pi / 2 * t) - 1 + t)
+
+        trajectory = odeint(fn, y0, t, **self.odeint_kwargs)
+        
+        sampled = trajectory[-1]
+        out = sampled
+        out = torch.where(cond_mask, cond, out)
+
+        if exists(vocoder):
+            out = rearrange(out, 'b n d -> b d n')
+            out = vocoder(out)
+
+        return out, trajectory
+
+    def forward(
+        self,
+        inp: float['b n d'] | float['b nw'], # mel or raw wave
+        text: int['b nt'] | list[str],
+        *,
+        lens: int['b'] | None = None,
+        noise_scheduler: str | None = None,
+    ):
+        # handle raw wave
+        if inp.ndim == 2:
+            inp = self.mel_spec(inp)
+            inp = rearrange(inp, 'b d n -> b n d')
+            assert inp.shape[-1] == self.num_channels
+
+        batch, seq_len, dtype, device, σ1 = *inp.shape[:2], inp.dtype, self.device, self.sigma
+
+        # handle text as string
+        if isinstance(text, list):
+            if exists(self.vocab_char_map):
+                text = list_str_to_idx(text, self.vocab_char_map).to(device)
+            else:
+                text = list_str_to_tensor(text).to(device)
+            assert text.shape[0] == batch
+
+        # lens and mask
+        if not exists(lens):
+            lens = torch.full((batch,), seq_len, device = device)
+        
+        mask = lens_to_mask(lens, length = seq_len)  # useless here, as collate_fn will pad to max length in batch
+
+        # get a random span to mask out for training conditionally
+        frac_lengths = torch.zeros((batch,), device = self.device).float().uniform_(*self.frac_lengths_mask)
+        rand_span_mask = mask_from_frac_lengths(lens, frac_lengths)
+
+        if exists(mask):
+            rand_span_mask &= mask
+
+        # mel is x1
+        x1 = inp
+
+        # x0 is gaussian noise
+        x0 = torch.randn_like(x1)
+
+        # time step
+        time = torch.rand((batch,), dtype = dtype, device = self.device)
+        # TODO. noise_scheduler
+
+        # sample xt (φ_t(x) in the paper)
+        t = rearrange(time, 'b -> b 1 1')
+        φ = (1 - t) * x0 + t * x1
+        flow = x1 - x0
+
+        # only predict what is within the random mask span for infilling
+        cond = torch.where(
+            rand_span_mask[..., None],
+            torch.zeros_like(x1), x1
+        )
+
+        # transformer and cfg training with a drop rate
+        drop_audio_cond = random() < self.audio_drop_prob  # p_drop in voicebox paper
+        if random() < self.cond_drop_prob:  # p_uncond in voicebox paper
+            drop_audio_cond = True
+            drop_text = True
+        else:
+            drop_text = False
+            
+        # if want rigourously mask out padding, record in collate_fn in dataset.py, and pass in here
+        # adding mask will use more memory, thus also need to adjust batchsampler with scaled down threshold for long sequences
+        pred = self.transformer(x = φ, cond = cond, text = text, time = time, drop_audio_cond = drop_audio_cond, drop_text = drop_text)
+
+        # flow matching loss
+        loss = F.mse_loss(pred, flow, reduction = 'none')
+        loss = loss[rand_span_mask]
+
+        return loss.mean(), cond, pred
diff --git a/capspeech/nar/model/dataset.py b/capspeech/nar/model/dataset.py
new file mode 100644
index 0000000000000000000000000000000000000000..adf8116fc8430a59da7eeaae42d8b7c3fcc4df86
--- /dev/null
+++ b/capspeech/nar/model/dataset.py
@@ -0,0 +1,257 @@
+import json
+import random
+from tqdm import tqdm
+
+import torch
+import torch.nn.functional as F
+from torch.utils.data import Dataset, Sampler
+import torchaudio
+from datasets import load_dataset, load_from_disk
+from datasets import Dataset as Dataset_
+
+from einops import rearrange
+
+from capspeech.nar.model.modules import MelSpec
+
+
+class HFDataset(Dataset):
+    def __init__(
+        self,
+        hf_dataset: Dataset,
+        target_sample_rate = 24_000,
+        n_mel_channels = 100,
+        hop_length = 256,
+    ):
+        self.data = hf_dataset
+        self.target_sample_rate = target_sample_rate
+        self.hop_length = hop_length
+        self.mel_spectrogram = MelSpec(target_sample_rate=target_sample_rate, n_mel_channels=n_mel_channels, hop_length=hop_length)
+        
+    def get_frame_len(self, index):
+        row = self.data[index]
+        audio = row['audio']['array']
+        sample_rate = row['audio']['sampling_rate']
+        return audio.shape[-1] / sample_rate * self.target_sample_rate / self.hop_length
+
+    def __len__(self):
+        return len(self.data)
+    
+    def __getitem__(self, index):
+        row = self.data[index]
+        audio = row['audio']['array']
+
+        # logger.info(f"Audio shape: {audio.shape}")
+
+        sample_rate = row['audio']['sampling_rate']
+        duration = audio.shape[-1] / sample_rate
+
+        if duration > 30 or duration < 0.3:
+            return self.__getitem__((index + 1) % len(self.data))
+        
+        audio_tensor = torch.from_numpy(audio).float()
+        
+        if sample_rate != self.target_sample_rate:
+            resampler = torchaudio.transforms.Resample(sample_rate, self.target_sample_rate)
+            audio_tensor = resampler(audio_tensor)
+        
+        audio_tensor = rearrange(audio_tensor, 't -> 1 t')
+        
+        mel_spec = self.mel_spectrogram(audio_tensor)
+        
+        mel_spec = rearrange(mel_spec, '1 d t -> d t')
+        
+        text = row['text']
+        
+        return dict(
+            mel_spec = mel_spec,
+            text = text,
+        )
+
+
+class CustomDataset(Dataset):
+    def __init__(
+        self,
+        custom_dataset: Dataset,
+        durations = None,
+        target_sample_rate = 24_000,
+        hop_length = 256,
+        n_mel_channels = 100,
+        preprocessed_mel = False,
+    ):
+        self.data = custom_dataset
+        self.durations = durations
+        self.target_sample_rate = target_sample_rate
+        self.hop_length = hop_length
+        self.preprocessed_mel = preprocessed_mel
+        if not preprocessed_mel:
+            self.mel_spectrogram = MelSpec(target_sample_rate=target_sample_rate, hop_length=hop_length, n_mel_channels=n_mel_channels)
+
+    def get_frame_len(self, index):
+        if self.durations is not None:  # Please make sure the separately provided durations are correct, otherwise 99.99% OOM
+            return self.durations[index] * self.target_sample_rate / self.hop_length
+        return self.data[index]["duration"] * self.target_sample_rate / self.hop_length
+    
+    def __len__(self):
+        return len(self.data)
+    
+    def __getitem__(self, index):
+        row = self.data[index]
+        audio_path = row["audio_path"]
+        text = row["text"]
+        duration = row["duration"]
+
+        if self.preprocessed_mel:
+            mel_spec = torch.tensor(row["mel_spec"])
+
+        else:
+            audio, source_sample_rate = torchaudio.load(audio_path)
+
+            if duration > 30 or duration < 0.3:
+                return self.__getitem__((index + 1) % len(self.data))
+            
+            if source_sample_rate != self.target_sample_rate:
+                resampler = torchaudio.transforms.Resample(source_sample_rate, self.target_sample_rate)
+                audio = resampler(audio)
+            
+            mel_spec = self.mel_spectrogram(audio)
+            mel_spec = rearrange(mel_spec, '1 d t -> d t')
+        
+        return dict(
+            mel_spec = mel_spec,
+            text = text,
+        )
+    
+
+# Dynamic Batch Sampler
+
+class DynamicBatchSampler(Sampler[list[int]]):
+    """ Extension of Sampler that will do the following:
+        1.  Change the batch size (essentially number of sequences)
+            in a batch to ensure that the total number of frames are less
+            than a certain threshold.
+        2.  Make sure the padding efficiency in the batch is high.
+    """
+
+    def __init__(self, sampler: Sampler[int], frames_threshold: int, max_samples=0, random_seed=None, drop_last: bool = False):
+        self.sampler = sampler
+        self.frames_threshold = frames_threshold
+        self.max_samples = max_samples
+
+        indices, batches = [], []
+        data_source = self.sampler.data_source
+        
+        for idx in tqdm(self.sampler, desc=f"Sorting with sampler... if slow, check whether dataset is provided with duration"):
+            indices.append((idx, data_source.get_frame_len(idx)))
+        indices.sort(key=lambda elem : elem[1])
+
+        batch = []
+        batch_frames = 0
+        for idx, frame_len in tqdm(indices, desc=f"Creating dynamic batches with {frames_threshold} audio frames per gpu"):
+            if batch_frames + frame_len <= self.frames_threshold and (max_samples == 0 or len(batch) < max_samples):
+                batch.append(idx)
+                batch_frames += frame_len
+            else:
+                if len(batch) > 0:
+                    batches.append(batch)
+                if frame_len <= self.frames_threshold:
+                    batch = [idx]
+                    batch_frames = frame_len
+                else:
+                    batch = []
+                    batch_frames = 0
+
+        if not drop_last and len(batch) > 0:
+            batches.append(batch)
+
+        del indices
+
+        # if want to have different batches between epochs, may just set a seed and log it in ckpt
+        # cuz during multi-gpu training, although the batch on per gpu not change between epochs, the formed general minibatch is different
+        # e.g. for epoch n, use (random_seed + n)
+        random.seed(random_seed)
+        random.shuffle(batches)
+
+        self.batches = batches
+
+    def __iter__(self):
+        return iter(self.batches)
+
+    def __len__(self):
+        return len(self.batches)
+
+
+# Load dataset
+
+def load_dataset(
+        dataset_name: str,
+        tokenizer: str = "pinyin",
+        dataset_type: str = "CustomDataset", 
+        audio_type: str = "raw", 
+        mel_spec_kwargs: dict = dict()
+        ) -> CustomDataset | HFDataset:
+    '''
+    dataset_type    - "CustomDataset" if you want to use tokenizer name and default data path to load for train_dataset
+                    - "CustomDatasetPath" if you just want to pass the full path to a preprocessed dataset without relying on tokenizer
+    '''
+    
+    print("Loading dataset ...")
+
+    if dataset_type == "CustomDataset":
+        if audio_type == "raw":
+            try:
+                train_dataset = load_from_disk(f"data/{dataset_name}_{tokenizer}/raw")
+            except:
+                train_dataset = Dataset_.from_file(f"data/{dataset_name}_{tokenizer}/raw.arrow")
+            preprocessed_mel = False
+        elif audio_type == "mel":
+            train_dataset = Dataset_.from_file(f"data/{dataset_name}_{tokenizer}/mel.arrow")
+            preprocessed_mel = True
+        with open(f"data/{dataset_name}_{tokenizer}/duration.json", 'r', encoding='utf-8') as f:
+            data_dict = json.load(f)
+        durations = data_dict["duration"]
+        train_dataset = CustomDataset(train_dataset, durations=durations, preprocessed_mel=preprocessed_mel, **mel_spec_kwargs)
+        
+    elif dataset_type == "CustomDatasetPath":
+        try:
+            train_dataset = load_from_disk(f"{dataset_name}/raw")
+        except:
+            train_dataset = Dataset_.from_file(f"{dataset_name}/raw.arrow")
+            
+        with open(f"{dataset_name}/duration.json", 'r', encoding='utf-8') as f:
+            data_dict = json.load(f)
+        durations = data_dict["duration"]
+        train_dataset = CustomDataset(train_dataset, durations=durations, preprocessed_mel=preprocessed_mel, **mel_spec_kwargs)
+            
+    elif dataset_type == "HFDataset":
+        print("Should manually modify the path of huggingface dataset to your need.\n" +
+              "May also the corresponding script cuz different dataset may have different format.")
+        pre, post = dataset_name.split("_")
+        train_dataset = HFDataset(load_dataset(f"{pre}/{pre}", split=f"train.{post}", cache_dir="./data"),)
+
+    return train_dataset
+
+
+# collation
+
+def collate_fn(batch):
+    mel_specs = [item['mel_spec'].squeeze(0) for item in batch]
+    mel_lengths = torch.LongTensor([spec.shape[-1] for spec in mel_specs])
+    max_mel_length = mel_lengths.amax()
+
+    padded_mel_specs = []
+    for spec in mel_specs:  # TODO. maybe records mask for attention here
+        padding = (0, max_mel_length - spec.size(-1))
+        padded_spec = F.pad(spec, padding, value = 0)
+        padded_mel_specs.append(padded_spec)
+    
+    mel_specs = torch.stack(padded_mel_specs)
+
+    text = [item['text'] for item in batch]
+    text_lengths = torch.LongTensor([len(item) for item in text])
+
+    return dict(
+        mel = mel_specs,
+        mel_lengths = mel_lengths,
+        text = text,
+        text_lengths = text_lengths,
+    )
diff --git a/capspeech/nar/model/ecapa_tdnn.py b/capspeech/nar/model/ecapa_tdnn.py
new file mode 100644
index 0000000000000000000000000000000000000000..30b611eda2dd8dc8fed4f59997e976181c07f78e
--- /dev/null
+++ b/capspeech/nar/model/ecapa_tdnn.py
@@ -0,0 +1,268 @@
+# just for speaker similarity evaluation, third-party code
+
+# From https://github.com/microsoft/UniSpeech/blob/main/downstreams/speaker_verification/models/
+# part of the code is borrowed from https://github.com/lawlict/ECAPA-TDNN
+
+import os
+import torch
+import torch.nn as nn
+import torch.nn.functional as F
+
+
+''' Res2Conv1d + BatchNorm1d + ReLU
+'''
+
+class Res2Conv1dReluBn(nn.Module):
+    '''
+    in_channels == out_channels == channels
+    '''
+
+    def __init__(self, channels, kernel_size=1, stride=1, padding=0, dilation=1, bias=True, scale=4):
+        super().__init__()
+        assert channels % scale == 0, "{} % {} != 0".format(channels, scale)
+        self.scale = scale
+        self.width = channels // scale
+        self.nums = scale if scale == 1 else scale - 1
+
+        self.convs = []
+        self.bns = []
+        for i in range(self.nums):
+            self.convs.append(nn.Conv1d(self.width, self.width, kernel_size, stride, padding, dilation, bias=bias))
+            self.bns.append(nn.BatchNorm1d(self.width))
+        self.convs = nn.ModuleList(self.convs)
+        self.bns = nn.ModuleList(self.bns)
+
+    def forward(self, x):
+        out = []
+        spx = torch.split(x, self.width, 1)
+        for i in range(self.nums):
+            if i == 0:
+                sp = spx[i]
+            else:
+                sp = sp + spx[i]
+            # Order: conv -> relu -> bn
+            sp = self.convs[i](sp)
+            sp = self.bns[i](F.relu(sp))
+            out.append(sp)
+        if self.scale != 1:
+            out.append(spx[self.nums])
+        out = torch.cat(out, dim=1)
+
+        return out
+
+
+''' Conv1d + BatchNorm1d + ReLU
+'''
+
+class Conv1dReluBn(nn.Module):
+    def __init__(self, in_channels, out_channels, kernel_size=1, stride=1, padding=0, dilation=1, bias=True):
+        super().__init__()
+        self.conv = nn.Conv1d(in_channels, out_channels, kernel_size, stride, padding, dilation, bias=bias)
+        self.bn = nn.BatchNorm1d(out_channels)
+
+    def forward(self, x):
+        return self.bn(F.relu(self.conv(x)))
+
+
+''' The SE connection of 1D case.
+'''
+
+class SE_Connect(nn.Module):
+    def __init__(self, channels, se_bottleneck_dim=128):
+        super().__init__()
+        self.linear1 = nn.Linear(channels, se_bottleneck_dim)
+        self.linear2 = nn.Linear(se_bottleneck_dim, channels)
+
+    def forward(self, x):
+        out = x.mean(dim=2)
+        out = F.relu(self.linear1(out))
+        out = torch.sigmoid(self.linear2(out))
+        out = x * out.unsqueeze(2)
+
+        return out
+
+
+''' SE-Res2Block of the ECAPA-TDNN architecture.
+'''
+
+# def SE_Res2Block(channels, kernel_size, stride, padding, dilation, scale):
+#     return nn.Sequential(
+#         Conv1dReluBn(channels, 512, kernel_size=1, stride=1, padding=0),
+#         Res2Conv1dReluBn(512, kernel_size, stride, padding, dilation, scale=scale),
+#         Conv1dReluBn(512, channels, kernel_size=1, stride=1, padding=0),
+#         SE_Connect(channels)
+#     )
+
+class SE_Res2Block(nn.Module):
+    def __init__(self, in_channels, out_channels, kernel_size, stride, padding, dilation, scale, se_bottleneck_dim):
+        super().__init__()
+        self.Conv1dReluBn1 = Conv1dReluBn(in_channels, out_channels, kernel_size=1, stride=1, padding=0)
+        self.Res2Conv1dReluBn = Res2Conv1dReluBn(out_channels, kernel_size, stride, padding, dilation, scale=scale)
+        self.Conv1dReluBn2 = Conv1dReluBn(out_channels, out_channels, kernel_size=1, stride=1, padding=0)
+        self.SE_Connect = SE_Connect(out_channels, se_bottleneck_dim)
+
+        self.shortcut = None
+        if in_channels != out_channels:
+            self.shortcut = nn.Conv1d(
+                in_channels=in_channels,
+                out_channels=out_channels,
+                kernel_size=1,
+            )
+
+    def forward(self, x):
+        residual = x
+        if self.shortcut:
+            residual = self.shortcut(x)
+
+        x = self.Conv1dReluBn1(x)
+        x = self.Res2Conv1dReluBn(x)
+        x = self.Conv1dReluBn2(x)
+        x = self.SE_Connect(x)
+
+        return x + residual
+
+
+''' Attentive weighted mean and standard deviation pooling.
+'''
+
+class AttentiveStatsPool(nn.Module):
+    def __init__(self, in_dim, attention_channels=128, global_context_att=False):
+        super().__init__()
+        self.global_context_att = global_context_att
+
+        # Use Conv1d with stride == 1 rather than Linear, then we don't need to transpose inputs.
+        if global_context_att:
+            self.linear1 = nn.Conv1d(in_dim * 3, attention_channels, kernel_size=1)  # equals W and b in the paper
+        else:
+            self.linear1 = nn.Conv1d(in_dim, attention_channels, kernel_size=1)  # equals W and b in the paper
+        self.linear2 = nn.Conv1d(attention_channels, in_dim, kernel_size=1)  # equals V and k in the paper
+
+    def forward(self, x):
+
+        if self.global_context_att:
+            context_mean = torch.mean(x, dim=-1, keepdim=True).expand_as(x)
+            context_std = torch.sqrt(torch.var(x, dim=-1, keepdim=True) + 1e-10).expand_as(x)
+            x_in = torch.cat((x, context_mean, context_std), dim=1)
+        else:
+            x_in = x
+
+        # DON'T use ReLU here! In experiments, I find ReLU hard to converge.
+        alpha = torch.tanh(self.linear1(x_in))
+        # alpha = F.relu(self.linear1(x_in))
+        alpha = torch.softmax(self.linear2(alpha), dim=2)
+        mean = torch.sum(alpha * x, dim=2)
+        residuals = torch.sum(alpha * (x ** 2), dim=2) - mean ** 2
+        std = torch.sqrt(residuals.clamp(min=1e-9))
+        return torch.cat([mean, std], dim=1)
+
+
+class ECAPA_TDNN(nn.Module):
+    def __init__(self, feat_dim=80, channels=512, emb_dim=192, global_context_att=False,
+                 feat_type='wavlm_large', sr=16000, feature_selection="hidden_states", update_extract=False, config_path=None):
+        super().__init__()
+
+        self.feat_type = feat_type
+        self.feature_selection = feature_selection
+        self.update_extract = update_extract
+        self.sr = sr
+        
+        torch.hub._validate_not_a_forked_repo=lambda a,b,c: True
+        try:
+            local_s3prl_path = os.path.expanduser("~/.cache/torch/hub/s3prl_s3prl_main")
+            self.feature_extract = torch.hub.load(local_s3prl_path, feat_type, source='local', config_path=config_path)
+        except:
+            self.feature_extract = torch.hub.load('s3prl/s3prl', feat_type)
+
+        if len(self.feature_extract.model.encoder.layers) == 24 and hasattr(self.feature_extract.model.encoder.layers[23].self_attn, "fp32_attention"):
+            self.feature_extract.model.encoder.layers[23].self_attn.fp32_attention = False
+        if len(self.feature_extract.model.encoder.layers) == 24 and hasattr(self.feature_extract.model.encoder.layers[11].self_attn, "fp32_attention"):
+            self.feature_extract.model.encoder.layers[11].self_attn.fp32_attention = False
+
+        self.feat_num = self.get_feat_num()
+        self.feature_weight = nn.Parameter(torch.zeros(self.feat_num))
+
+        if feat_type != 'fbank' and feat_type != 'mfcc':
+            freeze_list = ['final_proj', 'label_embs_concat', 'mask_emb', 'project_q', 'quantizer']
+            for name, param in self.feature_extract.named_parameters():
+                for freeze_val in freeze_list:
+                    if freeze_val in name:
+                        param.requires_grad = False
+                        break
+
+        if not self.update_extract:
+            for param in self.feature_extract.parameters():
+                param.requires_grad = False
+
+        self.instance_norm = nn.InstanceNorm1d(feat_dim)
+        # self.channels = [channels] * 4 + [channels * 3]
+        self.channels = [channels] * 4 + [1536]
+
+        self.layer1 = Conv1dReluBn(feat_dim, self.channels[0], kernel_size=5, padding=2)
+        self.layer2 = SE_Res2Block(self.channels[0], self.channels[1], kernel_size=3, stride=1, padding=2, dilation=2, scale=8, se_bottleneck_dim=128)
+        self.layer3 = SE_Res2Block(self.channels[1], self.channels[2], kernel_size=3, stride=1, padding=3, dilation=3, scale=8, se_bottleneck_dim=128)
+        self.layer4 = SE_Res2Block(self.channels[2], self.channels[3], kernel_size=3, stride=1, padding=4, dilation=4, scale=8, se_bottleneck_dim=128)
+
+        # self.conv = nn.Conv1d(self.channels[-1], self.channels[-1], kernel_size=1)
+        cat_channels = channels * 3
+        self.conv = nn.Conv1d(cat_channels, self.channels[-1], kernel_size=1)
+        self.pooling = AttentiveStatsPool(self.channels[-1], attention_channels=128, global_context_att=global_context_att)
+        self.bn = nn.BatchNorm1d(self.channels[-1] * 2)
+        self.linear = nn.Linear(self.channels[-1] * 2, emb_dim)
+
+
+    def get_feat_num(self):
+        self.feature_extract.eval()
+        wav = [torch.randn(self.sr).to(next(self.feature_extract.parameters()).device)]
+        with torch.no_grad():
+            features = self.feature_extract(wav)
+        select_feature = features[self.feature_selection]
+        if isinstance(select_feature, (list, tuple)):
+            return len(select_feature)
+        else:
+            return 1
+
+    def get_feat(self, x):
+        if self.update_extract:
+            x = self.feature_extract([sample for sample in x])
+        else:
+            with torch.no_grad():
+                if self.feat_type == 'fbank' or self.feat_type == 'mfcc':
+                    x = self.feature_extract(x) + 1e-6  # B x feat_dim x time_len
+                else:
+                    x = self.feature_extract([sample for sample in x])
+
+        if self.feat_type == 'fbank':
+            x = x.log()
+
+        if self.feat_type != "fbank" and self.feat_type != "mfcc":
+            x = x[self.feature_selection]
+            if isinstance(x, (list, tuple)):
+                x = torch.stack(x, dim=0)
+            else:
+                x = x.unsqueeze(0)
+            norm_weights = F.softmax(self.feature_weight, dim=-1).unsqueeze(-1).unsqueeze(-1).unsqueeze(-1)
+            x = (norm_weights * x).sum(dim=0)
+            x = torch.transpose(x, 1, 2) + 1e-6
+
+        x = self.instance_norm(x)
+        return x
+
+    def forward(self, x):
+        x = self.get_feat(x)
+
+        out1 = self.layer1(x)
+        out2 = self.layer2(out1)
+        out3 = self.layer3(out2)
+        out4 = self.layer4(out3)
+
+        out = torch.cat([out2, out3, out4], dim=1)
+        out = F.relu(self.conv(out))
+        out = self.bn(self.pooling(out))
+        out = self.linear(out)
+
+        return out
+
+
+def ECAPA_TDNN_SMALL(feat_dim, emb_dim=256, feat_type='wavlm_large', sr=16000, feature_selection="hidden_states", update_extract=False, config_path=None):
+    return ECAPA_TDNN(feat_dim=feat_dim, channels=512, emb_dim=emb_dim,
+                      feat_type=feat_type, sr=sr, feature_selection=feature_selection, update_extract=update_extract, config_path=config_path)
diff --git a/capspeech/nar/model/modules.py b/capspeech/nar/model/modules.py
new file mode 100644
index 0000000000000000000000000000000000000000..63900c0505a16341698832a48d01ed1172e7e205
--- /dev/null
+++ b/capspeech/nar/model/modules.py
@@ -0,0 +1,615 @@
+"""
+ein notation:
+b - batch
+n - sequence
+nt - text sequence
+nw - raw wave length
+d - dimension
+"""
+
+from __future__ import annotations
+from typing import Optional
+import math
+from torch.utils.checkpoint import checkpoint
+
+import torch
+from torch import nn
+import torch.nn.functional as F
+import torchaudio
+
+from einops import rearrange
+# from x_transformers.x_transformers import apply_rotary_pos_emb
+from inspect import isfunction
+from torch.amp import autocast
+
+
+# raw wav to mel spec
+
+class MelSpec(torch.nn.Module):
+    def __init__(self, target_sample_rate=24000, filter_length=1024, hop_length=256, n_mel_channels=100, f_min=0, f_max=12000, normalize=False, power=1, norm=None, center=True,):
+        super().__init__()
+        self.frame_length = filter_length
+        self.hop_length = hop_length
+        self.mel = torchaudio.transforms.MelSpectrogram(
+            sample_rate=target_sample_rate,
+            n_fft=filter_length,
+            win_length=filter_length,
+            hop_length=hop_length,
+            center=False,
+            power=1.0,
+            norm="slaney",
+            n_mels=n_mel_channels,
+            mel_scale="slaney",
+            f_min=0,
+            f_max=12000
+        )
+
+    @torch.no_grad()
+    def forward(self, x, target_length=None):
+        if len(x.shape) == 3:
+            x = rearrange(x, 'b 1 nw -> b nw')
+        assert len(x.shape) == 2
+        x = F.pad(x, ((self.frame_length - self.hop_length) // 2,
+                      (self.frame_length - self.hop_length) // 2), "reflect")
+        mel = self.mel(x)
+
+        target_length = mel.shape[-1] if target_length is None else target_length
+        logmel = torch.zeros(mel.shape[0], mel.shape[1], target_length).to(mel.device)
+        logmel[:, :, :mel.shape[2]] = mel
+
+        logmel = torch.log(torch.clamp(logmel, min=1e-5))
+        return logmel
+
+# class MelSpec(nn.Module):
+#     def __init__(
+#             self,
+#             filter_length=1024,
+#             hop_length=256,
+#             win_length=1024,
+#             n_mel_channels=100,
+#             target_sample_rate=24_000,
+#             normalize=False,
+#             power=2,
+#             norm='slaney',
+#             center=True,
+#             mel_scale='slaney',
+#     ):
+#         super().__init__()
+#         self.n_mel_channels = n_mel_channels
+
+#         self.mel_stft = torchaudio.transforms.MelSpectrogram(
+#             sample_rate=target_sample_rate,
+#             n_fft=filter_length,
+#             win_length=win_length,
+#             hop_length=hop_length,
+#             n_mels=n_mel_channels,
+#             power=power,
+#             center=center,
+#             normalized=normalize,
+#             norm=norm,
+#             mel_scale=mel_scale
+#         )
+
+#         self.register_buffer('dummy', torch.tensor(0), persistent=False)
+
+#     def forward(self, inp):
+#         if len(inp.shape) == 3:
+#             inp = rearrange(inp, 'b 1 nw -> b nw')
+
+#         assert len(inp.shape) == 2
+
+#         if self.dummy.device != inp.device:
+#             self.to(inp.device)
+
+#         mel = self.mel_stft(inp)
+#         mel = mel.clamp(min=1e-5).log()
+#         return mel
+
+
+# sinusoidal position embedding
+
+class SinusPositionEmbedding(nn.Module):
+    def __init__(self, dim):
+        super().__init__()
+        self.dim = dim
+
+    def forward(self, x, scale=1000):
+        device = x.device
+        half_dim = self.dim // 2
+        emb = math.log(10000) / (half_dim - 1)
+        emb = torch.exp(torch.arange(half_dim, device=device).float() * -emb)
+        emb = scale * x.unsqueeze(1) * emb.unsqueeze(0)
+        emb = torch.cat((emb.sin(), emb.cos()), dim=-1)
+        return emb
+
+
+# convolutional position embedding
+
+class ConvPositionEmbedding(nn.Module):
+    def __init__(self, dim, kernel_size=31, groups=16):
+        super().__init__()
+        assert kernel_size % 2 != 0
+        self.conv1d = nn.Sequential(
+            nn.Conv1d(dim, dim, kernel_size, groups=groups, padding=kernel_size // 2),
+            nn.Mish(),
+            nn.Conv1d(dim, dim, kernel_size, groups=groups, padding=kernel_size // 2),
+            nn.Mish(),
+        )
+
+    def forward(self, x: float['b n d'], mask: bool['b n'] | None = None):
+        if mask is not None:
+            mask = mask[..., None]
+            x = x.masked_fill(~mask, 0.)
+
+        x = rearrange(x, 'b n d -> b d n')
+        x = self.conv1d(x)
+        out = rearrange(x, 'b d n -> b n d')
+
+        if mask is not None:
+            out = out.masked_fill(~mask, 0.)
+
+        return out
+
+
+# rotary positional embedding related
+
+def precompute_freqs_cis(dim: int, end: int, theta: float = 10000.0, theta_rescale_factor=1.):
+    # proposed by reddit user bloc97, to rescale rotary embeddings to longer sequence length without fine-tuning
+    # has some connection to NTK literature
+    # https://www.reddit.com/r/LocalLLaMA/comments/14lz7j5/ntkaware_scaled_rope_allows_llama_models_to_have/
+    # https://github.com/lucidrains/rotary-embedding-torch/blob/main/rotary_embedding_torch/rotary_embedding_torch.py
+    theta *= theta_rescale_factor ** (dim / (dim - 2))
+    freqs = 1.0 / (theta ** (torch.arange(0, dim, 2)[: (dim // 2)].float() / dim))
+    t = torch.arange(end, device=freqs.device)  # type: ignore
+    freqs = torch.outer(t, freqs).float()  # type: ignore
+    freqs_cos = torch.cos(freqs)  # real part
+    freqs_sin = torch.sin(freqs)  # imaginary part
+    return torch.cat([freqs_cos, freqs_sin], dim=-1)
+
+
+def get_pos_embed_indices(start, length, max_pos, scale=1.):
+    # length = length if isinstance(length, int) else length.max()
+    scale = scale * torch.ones_like(start, dtype=torch.float32)  # in case scale is a scalar
+    pos = start.unsqueeze(1) + (
+            torch.arange(length, device=start.device, dtype=torch.float32).unsqueeze(0) *
+            scale.unsqueeze(1)).long()
+    # avoid extra long error.
+    pos = torch.where(pos < max_pos, pos, max_pos - 1)
+    return pos
+
+
+# Global Response Normalization layer (Instance Normalization ?)
+
+class GRN(nn.Module):
+    def __init__(self, dim):
+        super().__init__()
+        self.gamma = nn.Parameter(torch.zeros(1, 1, dim))
+        self.beta = nn.Parameter(torch.zeros(1, 1, dim))
+
+    def forward(self, x):
+        Gx = torch.norm(x, p=2, dim=1, keepdim=True)
+        Nx = Gx / (Gx.mean(dim=-1, keepdim=True) + 1e-6)
+        return self.gamma * (x * Nx) + self.beta + x
+
+
+# ConvNeXt-V2 Block https://github.com/facebookresearch/ConvNeXt-V2/blob/main/models/convnextv2.py
+# ref: https://github.com/bfs18/e2_tts/blob/main/rfwave/modules.py#L108
+
+class ConvNeXtV2Block(nn.Module):
+    def __init__(
+            self,
+            dim: int,
+            intermediate_dim: int,
+            dilation: int = 1,
+    ):
+        super().__init__()
+        padding = (dilation * (7 - 1)) // 2
+        self.dwconv = nn.Conv1d(dim, dim, kernel_size=7, padding=padding,
+                                groups=dim, dilation=dilation)  # depthwise conv
+        self.norm = nn.LayerNorm(dim, eps=1e-6)
+        self.pwconv1 = nn.Linear(dim, intermediate_dim)
+        # pointwise/1x1 convs, implemented with linear layers
+        self.act = nn.GELU()
+        self.grn = GRN(intermediate_dim)
+        self.pwconv2 = nn.Linear(intermediate_dim, dim)
+
+    def forward(self, x: torch.Tensor) -> torch.Tensor:
+        residual = x
+        x = x.transpose(1, 2)  # b n d -> b d n
+        x = self.dwconv(x)
+        x = x.transpose(1, 2)  # b d n -> b n d
+        x = self.norm(x)
+        x = self.pwconv1(x)
+        x = self.act(x)
+        x = self.grn(x)
+        x = self.pwconv2(x)
+        return residual + x
+
+
+# AdaLayerNormZero
+# return with modulated x for attn input, and params for later mlp modulation
+
+class AdaLayerNormZero(nn.Module):
+    def __init__(self, dim):
+        super().__init__()
+
+        self.silu = nn.SiLU()
+        self.linear = nn.Linear(dim, dim * 6)
+
+        self.norm = nn.LayerNorm(dim, elementwise_affine=False, eps=1e-6)
+
+    def forward(self, x, emb=None):
+        emb = self.linear(self.silu(emb))
+        shift_msa, scale_msa, gate_msa, shift_mlp, scale_mlp, gate_mlp = torch.chunk(emb, 6, dim=1)
+
+        x = self.norm(x) * (1 + scale_msa[:, None]) + shift_msa[:, None]
+        return x, gate_msa, shift_mlp, scale_mlp, gate_mlp
+
+
+# AdaLayerNormZero for final layer
+# returns only the modulated x, because there is no further mlp modulation
+
+class AdaLayerNormZero_Final(nn.Module):
+    def __init__(self, dim):
+        super().__init__()
+
+        self.silu = nn.SiLU()
+        self.linear = nn.Linear(dim, dim * 2)
+
+        self.norm = nn.LayerNorm(dim, elementwise_affine=False, eps=1e-6)
+
+    def forward(self, x, emb):
+        emb = self.linear(self.silu(emb))
+        scale, shift = torch.chunk(emb, 2, dim=1)
+
+        x = self.norm(x) * (1 + scale)[:, None, :] + shift[:, None, :]
+        return x
+
+
+# FeedForward
+
+class FeedForward(nn.Module):
+    def __init__(self, dim, dim_out=None, mult=4, dropout=0.,
+                 approximate: str = 'none'):
+        super().__init__()
+        inner_dim = int(dim * mult)
+        dim_out = dim_out if dim_out is not None else dim
+
+        activation = nn.GELU(approximate=approximate)
+        project_in = nn.Sequential(
+            nn.Linear(dim, inner_dim),
+            activation
+        )
+        self.ff = nn.Sequential(
+            project_in,
+            nn.Dropout(dropout),
+            nn.Linear(inner_dim, dim_out)
+        )
+
+    def forward(self, x):
+        return self.ff(x)
+
+
+# Attention with possible joint part
+# modified from diffusers/src/diffusers/models/attention_processor.py
+
+class Attention(nn.Module):
+    def __init__(
+            self,
+            processor: AttnProcessor,
+            dim: int,
+            heads: int = 8,
+            dim_head: int = 64,
+            dropout: float = 0.0,
+            qk_norm: bool = True,
+            # context_dim: Optional[int] = None,  # if not None -> joint attention
+            # context_pre_only=None,
+    ):
+        super().__init__()
+
+        if not hasattr(F, "scaled_dot_product_attention"):
+            raise ImportError("Attention equires PyTorch 2.0, to use it, please upgrade PyTorch to 2.0.")
+
+        self.processor = processor
+
+        self.dim = dim
+        self.heads = heads
+        self.inner_dim = dim_head * heads
+        self.dropout = dropout
+
+        # self.context_dim = context_dim
+        # self.context_pre_only = context_pre_only
+
+        self.to_q = nn.Linear(dim, self.inner_dim)
+        self.to_k = nn.Linear(dim, self.inner_dim)
+        self.to_v = nn.Linear(dim, self.inner_dim)
+
+        if qk_norm is None:
+            self.q_norm = None
+            self.k_norm = None
+        elif qk_norm is True:
+            self.q_norm = nn.LayerNorm(dim_head, eps=1e-6)
+            self.k_norm = nn.LayerNorm(dim_head, eps=1e-6)
+        else:
+            raise ValueError(f"Unimplemented qk_norm: {qk_norm}")
+
+        # if self.context_dim is not None:
+        #     self.to_k_c = nn.Linear(context_dim, self.inner_dim)
+        #     self.to_v_c = nn.Linear(context_dim, self.inner_dim)
+        #     if self.context_pre_only is not None:
+        #         self.to_q_c = nn.Linear(context_dim, self.inner_dim)
+
+        self.to_out = nn.ModuleList([])
+        self.to_out.append(nn.Linear(self.inner_dim, dim))
+        self.to_out.append(nn.Dropout(dropout))
+
+        # if self.context_pre_only is not None and not self.context_pre_only:
+        #     self.to_out_c = nn.Linear(self.inner_dim, dim)
+
+    def forward(self, x, c=None, mask=None,
+                rope=None, c_rope=None, ) -> torch.Tensor:
+        # if c is not None:
+        #     return self.processor(self, x, c = c, mask = mask, rope = rope, c_rope = c_rope)
+        # else:
+        #     return self.processor(self, x, mask = mask, rope = rope)
+        return self.processor(self, x=x, c=c,
+                              mask=mask, rope=rope, c_rope=c_rope)
+
+
+# Attention processor
+
+def create_mask(q_shape, k_shape, device, q_mask=None, k_mask=None):
+    def default(val, d):
+        return val if val is not None else (d() if isfunction(d) else d)
+
+    b, i, j, device = q_shape[0], q_shape[-2], k_shape[-2], device
+    q_mask = default(q_mask, torch.ones((b, i), device=device, dtype=torch.bool))
+    k_mask = default(k_mask, torch.ones((b, j), device=device, dtype=torch.bool))
+    attn_mask = rearrange(q_mask, 'b i -> b 1 i 1') * rearrange(k_mask, 'b j -> b 1 1 j')
+    return attn_mask
+
+
+def rotate_half(x):
+    x = rearrange(x, '... (d r) -> ... d r', r = 2)
+    x1, x2 = x.unbind(dim = -1)
+    x = torch.stack((-x2, x1), dim = -1)
+    return rearrange(x, '... d r -> ... (d r)')
+
+
+@autocast('cuda', enabled = False)
+def apply_rotary_pos_emb(t, freqs, scale = 1):
+    rot_dim, seq_len, orig_dtype = freqs.shape[-1], t.shape[-2], t.dtype
+
+    freqs = freqs[:, -seq_len:, :]
+    scale = scale[:, -seq_len:, :] if isinstance(scale, torch.Tensor) else scale
+
+    if t.ndim == 4 and freqs.ndim == 3:
+        freqs = rearrange(freqs, 'b n d -> b 1 n d')
+
+    # partial rotary embeddings, Wang et al. GPT-J
+    t, t_unrotated = t[..., :rot_dim], t[..., rot_dim:]
+    t = (t * freqs.cos() * scale) + (rotate_half(t) * freqs.sin() * scale)
+    out = torch.cat((t, t_unrotated), dim = -1)
+
+    return out.type(orig_dtype)
+
+
+class AttnProcessor:
+    def __init__(self):
+        pass
+
+    def __call__(
+            self,
+            attn: Attention,
+            x: float['b n d'],  # noised input x
+            mask: bool['b n'] | None = None,
+            rope=None,  # rotary position embedding
+            c=None,  # context
+            c_rope=None,  # context rope
+    ) -> torch.FloatTensor:
+
+        batch_size = x.shape[0]
+
+        if c is None:
+            c = x
+
+        # `sample` projections.
+        query = attn.to_q(x)
+        key = attn.to_k(c)
+        value = attn.to_v(c)
+
+        # attention
+        inner_dim = key.shape[-1]
+        head_dim = inner_dim // attn.heads
+        query = query.view(batch_size, -1, attn.heads, head_dim).transpose(1, 2)
+        key = key.view(batch_size, -1, attn.heads, head_dim).transpose(1, 2)
+        value = value.view(batch_size, -1, attn.heads, head_dim).transpose(1, 2)
+
+        if attn.q_norm is not None:
+            query = attn.q_norm(query)
+        if attn.k_norm is not None:
+            key = attn.k_norm(key)
+
+        # apply rotary position embedding
+        if rope is not None:
+            freqs, xpos_scale = rope
+            q_xpos_scale, k_xpos_scale = (xpos_scale, xpos_scale ** -1.0) if xpos_scale is not None else (1.0, 1.0)
+
+            query = apply_rotary_pos_emb(query, freqs, q_xpos_scale)
+            key = apply_rotary_pos_emb(key, freqs, k_xpos_scale)
+
+        # mask. e.g. inference got a batch with different target durations, mask out the padding
+        # if mask is not None:
+        #     attn_mask = mask
+        #     attn_mask = rearrange(attn_mask, 'b n -> b 1 1 n')
+        #     attn_mask = attn_mask.expand(batch_size, attn.heads, query.shape[-2], key.shape[-2])
+        # else:
+        #     attn_mask = None
+        if mask is not None:
+            attn_mask = create_mask(x.shape, c.shape,
+                                    x.device, None, mask)
+        else:
+            attn_mask = None
+
+        x = F.scaled_dot_product_attention(query, key, value, attn_mask=attn_mask,
+                                           dropout_p=0.0, is_causal=False)
+        x = x.transpose(1, 2).reshape(batch_size, -1, attn.heads * head_dim)
+        x = x.to(query.dtype)
+
+        # linear proj
+        x = attn.to_out[0](x)
+        # dropout
+        x = attn.to_out[1](x)
+
+        # if mask is not None:
+        #     mask = rearrange(mask, 'b n -> b n 1')
+        #     x = x.masked_fill(~mask, 0.)
+
+        return x
+
+
+# DiT Block
+
+class DiTBlock(nn.Module):
+
+    def __init__(self, dim, heads, dim_head,
+                 ff_mult=4, dropout=0.1,
+                 qk_norm=False,
+                 use_checkpoint=True):
+        super().__init__()
+
+        self.attn_norm = AdaLayerNormZero(dim)
+        self.attn = Attention(
+            processor=AttnProcessor(),
+            dim=dim,
+            heads=heads,
+            dim_head=dim_head,
+            dropout=dropout,
+            qk_norm=qk_norm,
+        )
+
+        self.ff_norm = nn.LayerNorm(dim, elementwise_affine=False, eps=1e-6)
+        self.ff = FeedForward(dim=dim, mult=ff_mult,
+                              dropout=dropout, approximate="tanh")
+
+        self.use_checkpoint = checkpoint
+
+    def forward(self, x, t, mask=None, rope=None):
+        if self.use_checkpoint:
+            return checkpoint(self._forward, x, t, mask, rope)
+        else:
+            return self._forward(x, t, mask, rope)
+
+    # x: noised input, t: time embedding
+    def _forward(self, x, t, mask=None, rope=None):
+        # pre-norm & modulation for attention input
+        norm, gate_msa, shift_mlp, scale_mlp, gate_mlp = self.attn_norm(x, emb=t)
+
+        # attention
+        attn_output = self.attn(x=norm, mask=mask, rope=rope)
+
+        # process attention output for input x
+        x = x + gate_msa.unsqueeze(1) * attn_output
+
+        norm = self.ff_norm(x) * (1 + scale_mlp[:, None]) + shift_mlp[:, None]
+        ff_output = self.ff(norm)
+        x = x + gate_mlp.unsqueeze(1) * ff_output
+
+        return x
+
+
+# Cross DiT Block
+class CrossDiTBlock(nn.Module):
+
+    def __init__(self, dim, heads, dim_head, ff_mult=4, dropout=0.1,
+                 qk_norm=False,
+                 use_checkpoint=True, skip=False):
+        super().__init__()
+
+        self.attn_norm = AdaLayerNormZero(dim)
+        self.attn = Attention(
+            processor=AttnProcessor(),
+            dim=dim,
+            heads=heads,
+            dim_head=dim_head,
+            dropout=dropout,
+            qk_norm=qk_norm,
+        )
+
+        self.cross_norm = nn.LayerNorm(dim, eps=1e-6)
+        self.context_norm = nn.LayerNorm(dim, eps=1e-6)
+        self.cross_attn = Attention(
+            processor=AttnProcessor(),
+            dim=dim,
+            heads=heads,
+            dim_head=dim_head,
+            dropout=dropout,
+            qk_norm=qk_norm,
+        )
+
+        # Zero out the weight
+        nn.init.constant_(self.cross_attn.to_out[0].weight, 0.0)
+        # Zero out the bias if it exists
+        if self.cross_attn.to_out[0].bias is not None:
+            nn.init.constant_(self.cross_attn.to_out[0].bias, 0.0)
+
+        self.ff_norm = nn.LayerNorm(dim, elementwise_affine=False, eps=1e-6)
+        self.ff = FeedForward(dim=dim, mult=ff_mult, dropout=dropout, approximate="tanh")
+
+        self.use_checkpoint = checkpoint
+
+        self.skip = skip
+        if self.skip:
+            self.skip_norm = nn.LayerNorm(dim*2, eps=1e-6)
+            self.skip_linear = nn.Linear(dim*2, dim)
+
+    def forward(self, x, t, mask=None, rope=None,
+                context=None, context_mask=None, skip=None):
+        if self.use_checkpoint:
+            return checkpoint(self._forward, x, t, mask, rope, context, context_mask, skip, use_reentrant=False)
+        else:
+            return self._forward(x, t, mask, rope, context, context_mask, skip)
+
+    def _forward(self, x, t, mask=None, rope=None,
+                 context=None, context_mask=None, skip=None):
+        if self.skip:
+            cat = torch.cat([x, skip], dim=-1)
+            cat = self.skip_norm(cat)
+            x = self.skip_linear(cat)
+
+        # pre-norm & modulation for attention input
+        norm, gate_msa, shift_mlp, scale_mlp, gate_mlp = self.attn_norm(x, emb=t)
+
+        # attention
+        attn_output = self.attn(x=norm, mask=mask, rope=rope)
+
+        # process attention output for input x
+        x = x + gate_msa.unsqueeze(1) * attn_output
+
+        # process cross attention
+        x = x + self.cross_attn(x=self.cross_norm(x), c=self.context_norm(context),
+                                mask=context_mask, rope=None)
+
+        norm = self.ff_norm(x) * (1 + scale_mlp[:, None]) + shift_mlp[:, None]
+        ff_output = self.ff(norm)
+        x = x + gate_mlp.unsqueeze(1) * ff_output
+
+        return x
+
+
+# time step conditioning embedding
+
+class TimestepEmbedding(nn.Module):
+    def __init__(self, dim, freq_embed_dim=256):
+        super().__init__()
+        self.time_embed = SinusPositionEmbedding(freq_embed_dim)
+        self.time_mlp = nn.Sequential(
+            nn.Linear(freq_embed_dim, dim),
+            nn.SiLU(),
+            nn.Linear(dim, dim)
+        )
+
+    def forward(self, timestep: float['b']):
+        time_hidden = self.time_embed(timestep)
+        time = self.time_mlp(time_hidden)  # b d
+        return time
diff --git a/capspeech/nar/model/trainer.py b/capspeech/nar/model/trainer.py
new file mode 100644
index 0000000000000000000000000000000000000000..cd50b8b4564f9c68fd732b693f1a9923442532c1
--- /dev/null
+++ b/capspeech/nar/model/trainer.py
@@ -0,0 +1,250 @@
+from __future__ import annotations
+
+import os
+import gc
+from tqdm import tqdm
+import wandb
+
+import torch
+from torch.optim import AdamW
+from torch.utils.data import DataLoader, Dataset, SequentialSampler
+from torch.optim.lr_scheduler import LinearLR, SequentialLR
+
+from einops import rearrange
+
+from accelerate import Accelerator
+from accelerate.utils import DistributedDataParallelKwargs
+
+from ema_pytorch import EMA
+
+from capspeech.nar.model import CFM
+from capspeech.nar.model.utils import exists, default
+from capspeech.nar.model.dataset import DynamicBatchSampler, collate_fn
+
+
+# trainer
+
+class Trainer:
+    def __init__(
+        self,
+        model: CFM,
+        epochs,
+        learning_rate,
+        num_warmup_updates = 20000,
+        save_per_updates = 1000, 
+        checkpoint_path = None,
+        batch_size = 32, 
+        batch_size_type: str = "sample",
+        max_samples = 32,
+        grad_accumulation_steps = 1,
+        max_grad_norm = 1.0,
+        noise_scheduler: str | None = None,
+        duration_predictor: torch.nn.Module | None = None,
+        wandb_project = "test_e2-tts",
+        wandb_run_name = "test_run",
+        wandb_resume_id: str = None,
+        last_per_steps = None,
+        accelerate_kwargs: dict = dict(),
+        ema_kwargs: dict = dict()
+    ):
+        
+        ddp_kwargs = DistributedDataParallelKwargs(find_unused_parameters = True)
+
+        self.accelerator = Accelerator(
+            log_with = "wandb",
+            kwargs_handlers = [ddp_kwargs],
+            gradient_accumulation_steps = grad_accumulation_steps,
+            **accelerate_kwargs
+        )
+        
+        if exists(wandb_resume_id):
+            init_kwargs={"wandb": {"resume": "allow", "name": wandb_run_name, 'id': wandb_resume_id}}
+        else:
+            init_kwargs={"wandb": {"resume": "allow", "name": wandb_run_name}}
+        self.accelerator.init_trackers(
+            project_name = wandb_project, 
+            init_kwargs=init_kwargs,
+            config={"epochs": epochs,
+                    "learning_rate": learning_rate,
+                    "num_warmup_updates": num_warmup_updates, 
+                    "batch_size": batch_size,
+                    "batch_size_type": batch_size_type,
+                    "max_samples": max_samples,
+                    "grad_accumulation_steps": grad_accumulation_steps,
+                    "max_grad_norm": max_grad_norm,
+                    "gpus": self.accelerator.num_processes,
+                    "noise_scheduler": noise_scheduler}
+            )
+
+        self.model = model
+
+        if self.is_main:
+            self.ema_model = EMA(
+                model,
+                include_online_model = False,
+                **ema_kwargs
+            )
+
+            self.ema_model.to(self.accelerator.device)
+
+        self.epochs = epochs
+        self.num_warmup_updates = num_warmup_updates
+        self.save_per_updates = save_per_updates
+        self.last_per_steps = default(last_per_steps, save_per_updates * grad_accumulation_steps)
+        self.checkpoint_path = default(checkpoint_path, 'ckpts/test_e2-tts')
+
+        self.batch_size = batch_size
+        self.batch_size_type = batch_size_type
+        self.max_samples = max_samples
+        self.grad_accumulation_steps = grad_accumulation_steps
+        self.max_grad_norm = max_grad_norm
+
+        self.noise_scheduler = noise_scheduler
+
+        self.duration_predictor = duration_predictor
+
+        self.optimizer = AdamW(model.parameters(), lr=learning_rate)
+        self.model, self.optimizer = self.accelerator.prepare(
+            self.model, self.optimizer
+        )
+
+    @property
+    def is_main(self):
+        return self.accelerator.is_main_process
+
+    def save_checkpoint(self, step, last=False):
+        self.accelerator.wait_for_everyone()
+        if self.is_main:
+            checkpoint = dict(
+                model_state_dict = self.accelerator.unwrap_model(self.model).state_dict(),
+                optimizer_state_dict = self.accelerator.unwrap_model(self.optimizer).state_dict(),
+                ema_model_state_dict = self.ema_model.state_dict(),
+                scheduler_state_dict = self.scheduler.state_dict(),
+                step = step
+            )
+            if not os.path.exists(self.checkpoint_path):
+                os.makedirs(self.checkpoint_path)
+            if last == True:
+                self.accelerator.save(checkpoint, f"{self.checkpoint_path}/model_last.pt")
+                print(f"Saved last checkpoint at step {step}")
+            else:
+                self.accelerator.save(checkpoint, f"{self.checkpoint_path}/model_{step}.pt")
+
+    def load_checkpoint(self):
+        if not exists(self.checkpoint_path) or not os.path.exists(self.checkpoint_path) or not os.listdir(self.checkpoint_path):
+            return 0
+        
+        self.accelerator.wait_for_everyone()
+        if "model_last.pt" in os.listdir(self.checkpoint_path):
+            latest_checkpoint = "model_last.pt"
+        else:
+            latest_checkpoint = sorted([f for f in os.listdir(self.checkpoint_path) if f.endswith('.pt')], key=lambda x: int(''.join(filter(str.isdigit, x))))[-1]
+        # checkpoint = torch.load(f"{self.checkpoint_path}/{latest_checkpoint}", map_location=self.accelerator.device)  # rather use accelerator.load_state ಥ_ಥ
+        checkpoint = torch.load(f"{self.checkpoint_path}/{latest_checkpoint}", weights_only=True, map_location="cpu")
+
+        if self.is_main:
+            self.ema_model.load_state_dict(checkpoint['ema_model_state_dict'])
+
+        if 'step' in checkpoint:
+            self.accelerator.unwrap_model(self.model).load_state_dict(checkpoint['model_state_dict'])
+            self.accelerator.unwrap_model(self.optimizer).load_state_dict(checkpoint['optimizer_state_dict'])
+            if self.scheduler:
+                self.scheduler.load_state_dict(checkpoint['scheduler_state_dict'])
+            step = checkpoint['step']
+        else:
+            checkpoint['model_state_dict'] = {k.replace("ema_model.", ""): v for k, v in checkpoint['ema_model_state_dict'].items() if k not in ["initted", "step"]}
+            self.accelerator.unwrap_model(self.model).load_state_dict(checkpoint['model_state_dict'])
+            step = 0
+
+        del checkpoint; gc.collect()
+        return step
+
+    def train(self, train_dataset: Dataset, num_workers=16, resumable_with_seed: int = None):
+        
+        if exists(resumable_with_seed):
+            generator = torch.Generator()
+            generator.manual_seed(resumable_with_seed)
+        else: 
+            generator = None
+
+        if self.batch_size_type == "sample":
+            train_dataloader = DataLoader(train_dataset, collate_fn=collate_fn, num_workers=num_workers, pin_memory=True, persistent_workers=True,
+                                          batch_size=self.batch_size, shuffle=True, generator=generator)
+        elif self.batch_size_type == "frame":
+            self.accelerator.even_batches = False
+            sampler = SequentialSampler(train_dataset)
+            batch_sampler = DynamicBatchSampler(sampler, self.batch_size, max_samples=self.max_samples, random_seed=resumable_with_seed, drop_last=False)
+            train_dataloader = DataLoader(train_dataset, collate_fn=collate_fn, num_workers=num_workers, pin_memory=True, persistent_workers=True,
+                                          batch_sampler=batch_sampler)
+        else:
+            raise ValueError(f"batch_size_type must be either 'sample' or 'frame', but received {self.batch_size_type}")
+        
+        #  accelerator.prepare() dispatches batches to devices;
+        #  which means the length of dataloader calculated before, should consider the number of devices
+        warmup_steps = self.num_warmup_updates * self.accelerator.num_processes  # consider a fixed warmup steps while using accelerate multi-gpu ddp
+                                                                                 # otherwise by default with split_batches=False, warmup steps change with num_processes
+        total_steps = len(train_dataloader) * self.epochs / self.grad_accumulation_steps
+        decay_steps = total_steps - warmup_steps
+        warmup_scheduler = LinearLR(self.optimizer, start_factor=1e-8, end_factor=1.0, total_iters=warmup_steps)
+        decay_scheduler = LinearLR(self.optimizer, start_factor=1.0, end_factor=1e-8, total_iters=decay_steps)
+        self.scheduler = SequentialLR(self.optimizer, 
+                                      schedulers=[warmup_scheduler, decay_scheduler],
+                                      milestones=[warmup_steps])
+        train_dataloader, self.scheduler = self.accelerator.prepare(train_dataloader, self.scheduler)  # actual steps = 1 gpu steps / gpus
+        start_step = self.load_checkpoint()
+        global_step = start_step
+
+        if exists(resumable_with_seed):
+            orig_epoch_step = len(train_dataloader)
+            skipped_epoch = int(start_step // orig_epoch_step)
+            skipped_batch = start_step % orig_epoch_step
+            skipped_dataloader = self.accelerator.skip_first_batches(train_dataloader, num_batches=skipped_batch)
+        else:
+            skipped_epoch = 0
+
+        for epoch in range(skipped_epoch, self.epochs):
+            self.model.train()
+            if exists(resumable_with_seed) and epoch == skipped_epoch:
+                progress_bar = tqdm(skipped_dataloader, desc=f"Epoch {epoch+1}/{self.epochs}", unit="step", disable=not self.accelerator.is_local_main_process, 
+                                    initial=skipped_batch, total=orig_epoch_step)
+            else:
+                progress_bar = tqdm(train_dataloader, desc=f"Epoch {epoch+1}/{self.epochs}", unit="step", disable=not self.accelerator.is_local_main_process)
+
+            for batch in progress_bar:
+                with self.accelerator.accumulate(self.model):
+                    text_inputs = batch['text']
+                    mel_spec = rearrange(batch['mel'], 'b d n -> b n d')
+                    mel_lengths = batch["mel_lengths"]
+
+                    # TODO. add duration predictor training
+                    if self.duration_predictor is not None and self.accelerator.is_local_main_process:
+                        dur_loss = self.duration_predictor(mel_spec, lens=batch.get('durations'))
+                        self.accelerator.log({"duration loss": dur_loss.item()}, step=global_step)
+
+                    loss, cond, pred = self.model(mel_spec, text=text_inputs, lens=mel_lengths, noise_scheduler=self.noise_scheduler)
+                    self.accelerator.backward(loss)
+
+                    if self.max_grad_norm > 0 and self.accelerator.sync_gradients:
+                        self.accelerator.clip_grad_norm_(self.model.parameters(), self.max_grad_norm)
+
+                    self.optimizer.step()
+                    self.scheduler.step()
+                    self.optimizer.zero_grad()
+
+                if self.is_main:
+                    self.ema_model.update()
+
+                global_step += 1
+
+                if self.accelerator.is_local_main_process:
+                    self.accelerator.log({"loss": loss.item(), "lr": self.scheduler.get_last_lr()[0]}, step=global_step)
+                
+                progress_bar.set_postfix(step=str(global_step), loss=loss.item())
+                
+                if global_step % (self.save_per_updates * self.grad_accumulation_steps) == 0:
+                    self.save_checkpoint(global_step)
+                
+                if global_step % self.last_per_steps == 0:
+                    self.save_checkpoint(global_step, last=True)
+        
+        self.accelerator.end_training()
diff --git a/capspeech/nar/model/utils.py b/capspeech/nar/model/utils.py
new file mode 100644
index 0000000000000000000000000000000000000000..98f60c64da1b161e39489e8e8023a75799c35608
--- /dev/null
+++ b/capspeech/nar/model/utils.py
@@ -0,0 +1,580 @@
+from __future__ import annotations
+
+import os
+import re
+import math
+import random
+import string
+from tqdm import tqdm
+from collections import defaultdict
+
+import matplotlib
+matplotlib.use("Agg")
+import matplotlib.pylab as plt
+
+import torch
+import torch.nn.functional as F
+from torch.nn.utils.rnn import pad_sequence
+import torchaudio
+
+import einx
+from einops import rearrange, reduce
+
+import jieba
+from pypinyin import lazy_pinyin, Style
+
+from capspeech.nar.model.ecapa_tdnn import ECAPA_TDNN_SMALL
+from capspeech.nar.model.modules import MelSpec
+
+
+# seed everything
+
+def seed_everything(seed = 0):
+    """Seed the Python and torch random generators and pin cuDNN to deterministic mode.
+
+    Also exports `seed` as the PYTHONHASHSEED environment variable.
+    """
+    random.seed(seed)
+    os.environ['PYTHONHASHSEED'] = str(seed)
+
+    # CPU generator first, then the current CUDA device, then every CUDA device
+    for seed_fn in (torch.manual_seed, torch.cuda.manual_seed, torch.cuda.manual_seed_all):
+        seed_fn(seed)
+
+    cudnn = torch.backends.cudnn
+    cudnn.deterministic = True
+    cudnn.benchmark = False
+
+# helpers
+
+def exists(v):
+    """Return True for any value other than None (falsy values such as 0 or "" count as existing)."""
+    if v is None:
+        return False
+    return True
+
+def default(v, d):
+    """Return `v`, or the fallback `d` when `v` is None."""
+    return d if v is None else v
+
+# tensor helpers
+
+def lens_to_mask(
+    t: int['b'],
+    length: int | None = None
+) -> bool['b n']:
+    """Turn per-item lengths into a boolean mask that is True at positions below each length.
+
+    When `length` is omitted, the mask width is the largest entry of `t`.
+    """
+    width = t.amax() if length is None else length
+    positions = torch.arange(width, device = t.device)
+
+    # compare every position with every length: result[b, n] = positions[n] < t[b]
+    return einx.less('n, b -> b n', positions, t)
+
+def mask_from_start_end_indices(
+    seq_len: int['b'],
+    start: int['b'],
+    end: int['b']
+):
+    """Return a boolean mask that is True where `start[b] <= position < end[b]`.
+
+    The mask width is the largest value in `seq_len`.
+    """
+    width = seq_len.max().item()
+    positions = torch.arange(width, device = start.device).long()
+
+    at_or_after_start = einx.greater_equal('n, b -> b n', positions, start)
+    before_end = einx.less('n, b -> b n', positions, end)
+    return at_or_after_start & before_end
+
+def mask_from_frac_lengths(
+    seq_len: int['b'],
+    frac_lengths: float['b']
+):
+    """Mask one randomly placed contiguous span per item, sized as a fraction of its length.
+
+    The span length is `frac_lengths * seq_len` truncated to an integer, and its
+    start is drawn uniformly so that the span stays inside the sequence.
+    """
+    span = (frac_lengths * seq_len).long()
+    latest_start = seq_len - span
+
+    # one uniform draw per item decides where its span begins
+    draw = torch.rand_like(frac_lengths)
+    begin = (latest_start * draw).long().clamp(min = 0)
+
+    return mask_from_start_end_indices(seq_len, begin, begin + span)
+
+def maybe_masked_mean(
+    t: float['b n d'],
+    mask: bool['b n'] = None
+) -> float['b d']:
+    """Average `t` over dim 1, counting only positions where `mask` is True.
+
+    Without a mask this is a plain mean over dim 1. An item whose mask has no
+    True position is divided by 1 rather than 0.
+    """
+    if mask is None:
+        return t.mean(dim = 1)
+
+    # zero out masked-off positions so they do not contribute to the sum
+    zeroed = einx.where('b n, b n d, -> b n d', mask, t, 0.)
+    total = reduce(zeroed, 'b n d -> b d', 'sum')
+    count = reduce(mask.float(), 'b n -> b', 'sum').clamp(min = 1.)
+
+    return einx.divide('b d, b -> b d', total, count)
+
+
+# simple utf-8 tokenizer, since paper went character based
+def list_str_to_tensor(
+    text: list[str],
+    padding_value = -1
+) -> int['b nt']:
+    """Encode each string as its UTF-8 byte values and pad them into one batch tensor.
+
+    Args:
+        text: strings to encode.
+        padding_value: value written past the end of the shorter strings.
+
+    Returns:
+        An int64 tensor with one row per input string.
+    """
+    # explicit dtype: torch.tensor([]) for an empty string would otherwise be float32
+    # and could turn the whole padded batch into floats
+    list_tensors = [torch.tensor(list(t.encode('UTF-8')), dtype = torch.long) for t in text]  # ByT5 style
+    return pad_sequence(list_tensors, padding_value = padding_value, batch_first = True)
+
+# char tokenizer, based on custom dataset's extracted .txt file
+def list_str_to_idx(
+    text: list[str] | list[list[str]],
+    vocab_char_map: dict[str, int],  # {char: idx}
+    padding_value = -1
+) -> int['b nt']:
+    """Map every token of every item to its vocabulary index and pad into one batch tensor.
+
+    Args:
+        text: batch of strings (iterated character by character) or of token lists.
+        vocab_char_map: token -> index table; tokens missing from it map to index 0.
+        padding_value: value written past the end of the shorter items.
+
+    Returns:
+        An int64 tensor with one row per item.
+    """
+    # explicit dtype: an empty item would otherwise yield a float32 tensor
+    list_idx_tensors = [
+        torch.tensor([vocab_char_map.get(c, 0) for c in t], dtype = torch.long)
+        for t in text
+    ]  # pinyin or char style
+    return pad_sequence(list_idx_tensors, padding_value = padding_value, batch_first = True)
+
+
+# Get tokenizer
+
+def get_tokenizer(dataset_name, tokenizer: str = "pinyin"):
+    '''
+    Load the vocabulary that goes with the chosen tokenizer.
+
+    tokenizer   - "pinyin" do g2p for only chinese characters, need .txt vocab_file
+                - "char" for char-wise tokenizer, need .txt vocab_file
+                - "byte" for utf-8 tokenizer
+                - "custom" if you're directly passing in a path to the vocab.txt you want to use
+                  (the path is given through `dataset_name`)
+    vocab_size  - if use "pinyin", all available pinyin types, common alphabets (also those with accent) and symbols
+                - if use "char", derived from unfiltered character & symbol counts of custom dataset
+                - if use "byte", set to 256 (unicode byte range)
+
+    Returns (vocab_char_map, vocab_size); vocab_char_map is None for "byte" and
+    otherwise maps each vocab line (without its newline) to its 0-based line number.
+    Raises ValueError for an unknown tokenizer name.
+    '''
+    if tokenizer == "byte":
+        return None, 256
+
+    if tokenizer in ["pinyin", "char"]:
+        vocab_path = f"data/{dataset_name}_{tokenizer}/vocab.txt"
+    elif tokenizer == "custom":
+        vocab_path = dataset_name
+    else:
+        raise ValueError(f"Unknown tokenizer {tokenizer!r}; expected 'pinyin', 'char', 'byte' or 'custom'.")
+
+    with open(vocab_path, "r", encoding="utf-8") as f:
+        # strip only the line terminator: slicing off the last character would
+        # truncate the final token when the file does not end with a newline
+        vocab_char_map = {line.rstrip("\n"): i for i, line in enumerate(f)}
+    vocab_size = len(vocab_char_map)
+
+    if tokenizer != "custom":
+        assert vocab_char_map[" "] == 0, "make sure space is of idx 0 in vocab.txt, cuz 0 is used for unknown char"
+
+    return vocab_char_map, vocab_size
+
+
+# convert char to pinyin
+
+def convert_char_to_pinyin(text_list, polyphone = True):
+    """Convert each text into a flat token list, replacing Chinese characters with pinyin.
+
+    Every text is first normalised (curly quotes -> straight quotes, ';' -> ','),
+    then segmented with jieba. Each segment is handled by one of three branches,
+    chosen from its UTF-8 byte length:
+      * one byte per character: the characters are copied one by one;
+      * three bytes per character (only when `polyphone` is true): the whole segment
+        is passed to lazy_pinyin at once;
+      * anything else: processed character by character.
+    Pinyin syllables are preceded by a " " token; the listed Chinese punctuation
+    marks are appended without one.
+
+    Returns a list holding one token list per input text.
+    """
+    final_text_list = []
+    god_knows_why_en_testset_contains_zh_quote = str.maketrans({'“': '"', '”': '"', '‘': "'", '’': "'"})  # in case librispeech (orig no-pc) test-clean
+    custom_trans = str.maketrans({';': ','})  # add custom trans here, to address oov
+    for text in text_list:
+        char_list = []
+        text = text.translate(god_knows_why_en_testset_contains_zh_quote)
+        text = text.translate(custom_trans)
+        for seg in jieba.cut(text):
+            seg_byte_len = len(bytes(seg, 'UTF-8'))
+            if seg_byte_len == len(seg):  # if pure alphabets and symbols
+                # put a space before a multi-character segment unless the previous
+                # token is already a space, colon or quote
+                if char_list and seg_byte_len > 1 and char_list[-1] not in " :'\"":
+                    char_list.append(" ")
+                char_list.extend(seg)
+            elif polyphone and seg_byte_len == 3 * len(seg):  # if pure chinese characters
+                # converting the whole segment at once gives lazy_pinyin the
+                # neighbouring characters as context
+                seg = lazy_pinyin(seg, style=Style.TONE3, tone_sandhi=True)
+                for c in seg:
+                    if c not in "。，、；：？！《》【】—…":
+                        char_list.append(" ")
+                    char_list.append(c)
+            else:  # if mixed chinese characters, alphabets and symbols
+                for c in seg:
+                    if ord(c) < 256:
+                        char_list.extend(c)
+                    else:
+                        if c not in "。，、；：？！《》【】—…":
+                            char_list.append(" ")
+                            char_list.extend(lazy_pinyin(c, style=Style.TONE3, tone_sandhi=True))
+                        else:  # if is zh punc
+                            char_list.append(c)
+        final_text_list.append(char_list)
+
+    return final_text_list
+
+
+# save spectrogram
+def save_spectrogram(spectrogram, path):
+    """Draw `spectrogram` as an image with a colorbar and save it to `path`.
+
+    The figure is always closed, even when drawing or saving raises, so that
+    repeated calls do not accumulate open figures.
+    """
+    fig = plt.figure(figsize=(12, 4))
+    try:
+        plt.imshow(spectrogram, origin='lower', aspect='auto')
+        plt.colorbar()
+        plt.savefig(path)
+    finally:
+        plt.close(fig)
+
+
+# seedtts testset metainfo: utt, prompt_text, prompt_wav, gt_text, gt_wav
+def get_seedtts_testset_metainfo(metalst):
+    """Parse a seed-tts style meta list into (utt, prompt_text, prompt_wav, gt_text, gt_wav) tuples.
+
+    Each non-blank line holds 4 or 5 fields separated by '|'. With 4 fields the
+    ground-truth wav is taken to be `<dir of metalst>/wavs/<utt>.wav`. A relative
+    prompt wav path is resolved against the directory of `metalst`.
+
+    Raises:
+        ValueError: if a non-blank line has neither 4 nor 5 fields.
+    """
+    meta_dir = os.path.dirname(metalst)
+    with open(metalst, encoding="utf-8") as f:
+        lines = f.readlines()
+
+    metainfo = []
+    for line_no, line in enumerate(lines, start=1):
+        line = line.strip()
+        if not line:
+            continue  # blank lines carry no entry
+
+        fields = line.split('|')
+        if len(fields) == 5:
+            utt, prompt_text, prompt_wav, gt_text, gt_wav = fields
+        elif len(fields) == 4:
+            utt, prompt_text, prompt_wav, gt_text = fields
+            gt_wav = os.path.join(meta_dir, "wavs", utt + ".wav")
+        else:
+            # without this, the values of the previous line would be silently reused
+            raise ValueError(f"{metalst}:{line_no}: expected 4 or 5 '|'-separated fields, got {len(fields)}")
+
+        if not os.path.isabs(prompt_wav):
+            prompt_wav = os.path.join(meta_dir, prompt_wav)
+        metainfo.append((utt, prompt_text, prompt_wav, gt_text, gt_wav))
+    return metainfo
+
+
+# librispeech test-clean metainfo: gen_utt, ref_txt, ref_wav, gen_txt, gen_wav
+def get_librispeech_test_clean_metainfo(metalst, librispeech_test_clean_path):
+    """Parse a LibriSpeech cross-sentence meta list into prompt tuples.
+
+    Each non-blank line holds six tab-separated fields:
+    ref_utt, ref_dur, ref_txt, gen_utt, gen_dur, gen_txt. Utterance ids have the
+    form `<speaker>-<chapter>-<index>` and are mapped to
+    `<librispeech_test_clean_path>/<speaker>/<chapter>/<utt>.flac`.
+
+    Returns:
+        A list of (gen_utt, ref_txt, ref_wav, " " + gen_txt, gen_wav) tuples.
+    """
+    with open(metalst, encoding="utf-8") as f:
+        lines = f.readlines()
+
+    def utt_to_flac(utt):
+        # LibriSpeech layout: <root>/<speaker>/<chapter>/<speaker>-<chapter>-<index>.flac
+        spk_id, chaptr_id, _ = utt.split('-')
+        return os.path.join(librispeech_test_clean_path, spk_id, chaptr_id, utt + '.flac')
+
+    metainfo = []
+    for line in lines:
+        line = line.strip()
+        if not line:
+            continue  # a blank or trailing empty line would otherwise break the unpacking below
+        ref_utt, _ref_dur, ref_txt, gen_utt, _gen_dur, gen_txt = line.split('\t')
+
+        # if use librispeech test-clean (no-pc), normalise the transcripts first:
+        # ref_txt = ref_txt[0] + ref_txt[1:].lower() + '.'
+        # gen_txt = gen_txt[0] + gen_txt[1:].lower() + '.'
+        ref_wav = utt_to_flac(ref_utt)
+        gen_wav = utt_to_flac(gen_utt)
+
+        metainfo.append((gen_utt, ref_txt, ref_wav, " " + gen_txt, gen_wav))
+
+    return metainfo
+
+
+# padded to max length mel batch
+def padded_mel_batch(ref_mels):
+    """Zero-pad every mel along its last axis to the longest one, stack them and swap the last two axes."""
+    longest = torch.LongTensor([m.shape[-1] for m in ref_mels]).amax()
+
+    # right-pad only: (0, k) leaves the start of the last axis untouched
+    stacked = torch.stack([
+        F.pad(m, (0, longest - m.shape[-1]), value = 0)
+        for m in ref_mels
+    ])
+    return rearrange(stacked, 'b d n -> b n d')
+
+
+# get prompts from metainfo containing: utt, prompt_text, prompt_wav, gt_text, gt_wav
+
+def get_inference_prompt(
+    metainfo,
+    speed = 1., tokenizer = "pinyin", polyphone = True,
+    target_sample_rate = 24000, n_mel_channels = 100, hop_length = 256, target_rms = 0.1,
+    use_truth_duration = False,
+    infer_batch_size = 1, num_buckets = 200, min_secs = 3, max_secs = 40,
+):
+    """Turn test-set metainfo into length-bucketed inference batches.
+
+    Args:
+        metainfo: iterable of (utt, prompt_text, prompt_wav, gt_text, gt_wav).
+        speed: divisor applied to the length of the part to generate.
+        tokenizer: "pinyin" runs convert_char_to_pinyin on the text; any other
+            value keeps the raw text.
+        polyphone: forwarded to convert_char_to_pinyin.
+        target_sample_rate, n_mel_channels, hop_length: mel settings; prompts
+            with another sample rate are resampled.
+        target_rms: prompts quieter than this RMS are scaled up to it.
+        use_truth_duration: take the generated length from `gt_wav` instead of
+            estimating it from the text lengths.
+        infer_batch_size: a bucket is emitted once the sum of its total mel
+            frames reaches this value (it counts frames, not utterances).
+        num_buckets, min_secs, max_secs: utterances are bucketed by total length
+            between `min_secs` and `max_secs`.
+
+    Returns:
+        A list of batches (utts, ref_rms_list, padded_ref_mels, ref_mel_lens,
+        total_mel_lens, final_text_list), shuffled with a fixed seed. Note that
+        this reseeds the global `random` generator.
+
+    Raises:
+        AssertionError: on a too-short prompt wav, a non-positive
+            `infer_batch_size`, or a total length outside [min_secs, max_secs].
+    """
+    prompts_all = []
+
+    min_tokens = min_secs * target_sample_rate // hop_length
+    max_tokens = max_secs * target_sample_rate // hop_length
+
+    batch_accum = [0] * num_buckets
+    utts, ref_rms_list, ref_mels, ref_mel_lens, total_mel_lens, final_text_list = \
+        ([[] for _ in range(num_buckets)] for _ in range(6))
+
+    mel_spectrogram = MelSpec(target_sample_rate=target_sample_rate, n_mel_channels=n_mel_channels, hop_length=hop_length)
+
+    # character class: each pause mark must be counted on its own. As a plain
+    # literal the pattern only matched all seven marks appearing in a row.
+    zh_pause_punc = r"[。，、；：？！]"
+
+    def emit_bucket(bucket_i):
+        # package everything collected so far for this bucket as one batch
+        prompts_all.append((
+            utts[bucket_i],
+            ref_rms_list[bucket_i],
+            padded_mel_batch(ref_mels[bucket_i]),
+            ref_mel_lens[bucket_i],
+            total_mel_lens[bucket_i],
+            final_text_list[bucket_i]
+        ))
+
+    for utt, prompt_text, prompt_wav, gt_text, gt_wav in tqdm(metainfo, desc="Processing prompts..."):
+
+        # Audio
+        ref_audio, ref_sr = torchaudio.load(prompt_wav)
+        ref_rms = torch.sqrt(torch.mean(torch.square(ref_audio)))
+        if ref_rms < target_rms:
+            ref_audio = ref_audio * target_rms / ref_rms
+        assert ref_audio.shape[-1] > 5000, f"Empty prompt wav: {prompt_wav}, or torchaudio backend issue."
+        if ref_sr != target_sample_rate:
+            resampler = torchaudio.transforms.Resample(ref_sr, target_sample_rate)
+            ref_audio = resampler(ref_audio)
+
+        # Text
+        # a prompt ending in a single-byte character gets a separating space before the target text
+        if len(prompt_text[-1].encode('utf-8')) == 1:
+            prompt_text = prompt_text + " "
+        text = [prompt_text + gt_text]
+        if tokenizer == "pinyin":
+            text_list = convert_char_to_pinyin(text, polyphone = polyphone)
+        else:
+            text_list = text
+
+        # Duration, mel frame length
+        ref_mel_len = ref_audio.shape[-1] // hop_length
+        if use_truth_duration:
+            gt_audio, gt_sr = torchaudio.load(gt_wav)
+            if gt_sr != target_sample_rate:
+                resampler = torchaudio.transforms.Resample(gt_sr, target_sample_rate)
+                gt_audio = resampler(gt_audio)
+            total_mel_len = ref_mel_len + int(gt_audio.shape[-1] / hop_length / speed)
+
+            # # test vocoder resynthesis
+            # ref_audio = gt_audio
+        else:
+            # estimate the generated length from the prompt's frames-per-byte rate;
+            # every pause mark adds 3 extra units on top of its own byte length
+            ref_text_len = len(prompt_text.encode('utf-8')) + 3 * len(re.findall(zh_pause_punc, prompt_text))
+            gen_text_len = len(gt_text.encode('utf-8')) + 3 * len(re.findall(zh_pause_punc, gt_text))
+            total_mel_len = ref_mel_len + int(ref_mel_len / ref_text_len * gen_text_len / speed)
+
+        # to mel spectrogram
+        ref_mel = mel_spectrogram(ref_audio)
+        ref_mel = rearrange(ref_mel, '1 d n -> d n')
+
+        # deal with batch
+        assert infer_batch_size > 0, "infer_batch_size should be greater than 0."
+        assert min_tokens <= total_mel_len <= max_tokens, \
+            f"Audio {utt} has duration {total_mel_len*hop_length//target_sample_rate}s out of range [{min_secs}, {max_secs}]."
+        bucket_i = math.floor((total_mel_len - min_tokens) / (max_tokens - min_tokens + 1) * num_buckets)
+
+        utts[bucket_i].append(utt)
+        ref_rms_list[bucket_i].append(ref_rms)
+        ref_mels[bucket_i].append(ref_mel)
+        ref_mel_lens[bucket_i].append(ref_mel_len)
+        total_mel_lens[bucket_i].append(total_mel_len)
+        final_text_list[bucket_i].extend(text_list)
+
+        batch_accum[bucket_i] += total_mel_len
+
+        if batch_accum[bucket_i] >= infer_batch_size:
+            emit_bucket(bucket_i)
+            # start fresh lists: the emitted batch keeps references to the old ones
+            batch_accum[bucket_i] = 0
+            utts[bucket_i], ref_rms_list[bucket_i], ref_mels[bucket_i], ref_mel_lens[bucket_i], total_mel_lens[bucket_i], final_text_list[bucket_i] = [], [], [], [], [], []
+
+    # add residual
+    for bucket_i, bucket_frames in enumerate(batch_accum):
+        if bucket_frames > 0:
+            emit_bucket(bucket_i)
+
+    # not only leave easy work for last workers
+    random.seed(666)
+    random.shuffle(prompts_all)
+
+    return prompts_all
+
+
+# get wav_res_ref_text of seed-tts test metalst
+# https://github.com/BytedanceSpeech/seed-tts-eval
+
+def get_seed_tts_test(metalst, gen_wav_dir, gpus):
+    """Collect (gen_wav, prompt_wav, gt_text) triples of a seed-tts meta list and split them over GPUs.
+
+    Each non-blank line holds 4 or 5 '|'-separated fields
+    (utt, prompt_text, prompt_wav, gt_text[, gt_wav]). Utterances without a
+    generated `<gen_wav_dir>/<utt>.wav` are skipped. A relative prompt wav path
+    is resolved against the directory of `metalst`.
+
+    Returns:
+        A list of (gpu, samples) pairs, one per entry of `gpus`.
+
+    Raises:
+        ValueError: if a non-blank line has neither 4 nor 5 fields.
+    """
+    with open(metalst, encoding="utf-8") as f:
+        lines = f.readlines()
+
+    test_set_ = []
+    for line in tqdm(lines):
+        line = line.strip()
+        if not line:
+            continue  # blank lines carry no entry
+
+        fields = line.split('|')
+        if len(fields) not in (4, 5):
+            # without this, the values of the previous line would be silently reused
+            raise ValueError(f"{metalst}: expected 4 or 5 '|'-separated fields, got {len(fields)}: {line!r}")
+        utt, _prompt_text, prompt_wav, gt_text = fields[:4]
+
+        gen_wav = os.path.join(gen_wav_dir, utt + '.wav')
+        if not os.path.exists(gen_wav):
+            continue
+        if not os.path.isabs(prompt_wav):
+            prompt_wav = os.path.join(os.path.dirname(metalst), prompt_wav)
+
+        test_set_.append((gen_wav, prompt_wav, gt_text))
+
+    num_jobs = len(gpus)
+    if num_jobs == 1:
+        return [(gpus[0], test_set_)]
+
+    # contiguous chunks; the "+ 1" makes sure every sample lands in some chunk
+    wav_per_job = len(test_set_) // num_jobs + 1
+    return [
+        (gpu, test_set_[i*wav_per_job:(i+1)*wav_per_job])
+        for i, gpu in enumerate(gpus)
+    ]
+
+
+# get librispeech test-clean cross sentence test
+
+def get_librispeech_test(metalst, gen_wav_dir, gpus, librispeech_test_clean_path, eval_ground_truth = False):
+    """Collect (gen_wav, ref_wav, gen_txt) triples of the LibriSpeech cross-sentence test and split them over GPUs.
+
+    Each non-blank line of `metalst` holds six tab-separated fields:
+    ref_utt, ref_dur, ref_txt, gen_utt, gen_dur, gen_txt.
+
+    Args:
+        metalst: path of the meta list.
+        gen_wav_dir: directory holding the generated `<gen_utt>.wav` files.
+        gpus: one entry per evaluation job.
+        librispeech_test_clean_path: root containing `<speaker>/<chapter>/<utt>.flac`.
+        eval_ground_truth: evaluate the original LibriSpeech recording of
+            `gen_utt` instead of the generated wav.
+
+    Returns:
+        A list of (gpu, samples) pairs, one per entry of `gpus`.
+
+    Raises:
+        FileNotFoundError: if a generated wav is missing (only when
+            `eval_ground_truth` is false).
+    """
+    with open(metalst, encoding="utf-8") as f:
+        lines = f.readlines()
+
+    def utt_to_flac(utt):
+        # LibriSpeech layout: <root>/<speaker>/<chapter>/<speaker>-<chapter>-<index>.flac
+        spk_id, chaptr_id, _ = utt.split('-')
+        return os.path.join(librispeech_test_clean_path, spk_id, chaptr_id, utt + '.flac')
+
+    test_set_ = []
+    for line in tqdm(lines):
+        line = line.strip()
+        if not line:
+            continue  # a blank or trailing empty line would otherwise break the unpacking below
+        ref_utt, _ref_dur, _ref_txt, gen_utt, _gen_dur, gen_txt = line.split('\t')
+
+        if eval_ground_truth:
+            gen_wav = utt_to_flac(gen_utt)
+        else:
+            gen_wav = os.path.join(gen_wav_dir, gen_utt + '.wav')
+            if not os.path.exists(gen_wav):
+                raise FileNotFoundError(f"Generated wav not found: {gen_utt}")
+
+        ref_wav = utt_to_flac(ref_utt)
+
+        test_set_.append((gen_wav, ref_wav, gen_txt))
+
+    num_jobs = len(gpus)
+    if num_jobs == 1:
+        return [(gpus[0], test_set_)]
+
+    # contiguous chunks; the "+ 1" makes sure every sample lands in some chunk
+    wav_per_job = len(test_set_) // num_jobs + 1
+    return [
+        (gpu, test_set_[i*wav_per_job:(i+1)*wav_per_job])
+        for i, gpu in enumerate(gpus)
+    ]
+
+
+# load asr model
+
+def load_asr_model(lang, ckpt_dir = ""):
+    """Build the ASR model used for WER evaluation.
+
+    Args:
+        lang: "zh" loads funasr's paraformer-zh from `<ckpt_dir>/paraformer-zh`;
+            "en" loads a faster-whisper model on CUDA in float16.
+        ckpt_dir: checkpoint location; for "en" an empty string selects "large-v3".
+
+    Raises:
+        NotImplementedError: for any other `lang` (previously this surfaced as
+            an UnboundLocalError at the return statement).
+    """
+    if lang == "zh":
+        from funasr import AutoModel
+        return AutoModel(
+            model = os.path.join(ckpt_dir, "paraformer-zh"),
+            # vad_model = os.path.join(ckpt_dir, "fsmn-vad"),
+            # punc_model = os.path.join(ckpt_dir, "ct-punc"),
+            # spk_model = os.path.join(ckpt_dir, "cam++"),
+            disable_update=True,
+            )  # following seed-tts setting
+
+    if lang == "en":
+        from faster_whisper import WhisperModel
+        model_size = "large-v3" if ckpt_dir == "" else ckpt_dir
+        return WhisperModel(model_size, device="cuda", compute_type="float16")
+
+    raise NotImplementedError(f"Unsupported lang {lang!r}: only 'zh' (funasr paraformer-zh) and 'en' (faster-whisper) are available.")
+
+
+# WER Evaluation, the way Seed-TTS does
+
+def run_asr_wer(args):
+    """Transcribe every generated wav and return its word error rate against the reference text.
+
+    `args` is a tuple (rank, lang, test_set, ckpt_dir); `test_set` yields
+    (gen_wav, prompt_wav, truth) triples. For "zh" the current CUDA device is
+    set to `rank` and texts are compared character by character; for "en"
+    CUDA_VISIBLE_DEVICES is set to `rank` and texts are lower-cased.
+
+    Raises:
+        NotImplementedError: for any other `lang`.
+    """
+    rank, lang, test_set, ckpt_dir = args
+
+    if lang == "zh":
+        import zhconv
+        torch.cuda.set_device(rank)
+    elif lang == "en":
+        os.environ["CUDA_VISIBLE_DEVICES"] = str(rank)
+    else:
+        raise NotImplementedError("lang support only 'zh' (funasr paraformer-zh), 'en' (faster-whisper-large-v3), for now.")
+
+    asr_model = load_asr_model(lang, ckpt_dir = ckpt_dir)
+
+    from zhon.hanzi import punctuation
+    from jiwer import compute_measures
+
+    # deletion table covering both Chinese and ASCII punctuation
+    drop_punct = str.maketrans('', '', punctuation + string.punctuation)
+
+    wers = []
+    for gen_wav, prompt_wav, truth in tqdm(test_set):
+        if lang == "zh":
+            res = asr_model.generate(input=gen_wav, batch_size_s=300, disable_pbar=True)
+            hypo = zhconv.convert(res[0]["text"], 'zh-cn')
+        elif lang == "en":
+            segments, _ = asr_model.transcribe(gen_wav, beam_size=5, language="en")
+            hypo = ''.join(' ' + segment.text for segment in segments)
+
+        truth = truth.translate(drop_punct).replace('  ', ' ')
+        hypo = hypo.translate(drop_punct).replace('  ', ' ')
+
+        if lang == "zh":
+            # one "word" per character
+            truth, hypo = " ".join(truth), " ".join(hypo)
+        elif lang == "en":
+            truth, hypo = truth.lower(), hypo.lower()
+
+        wers.append(compute_measures(truth, hypo)["wer"])
+
+    return wers
+
+
+# SIM Evaluation
+
+def run_sim(args):
+    """Return the cosine similarity between the embeddings of each wav pair.
+
+    `args` is a tuple (rank, test_set, ckpt_dir): `ckpt_dir` is the checkpoint
+    file loaded into the ECAPA_TDNN_SMALL model, and `test_set` yields
+    (wav1, wav2, truth) triples of which only the two wav paths are used.
+    Audio is resampled to 16 kHz; the model runs on `cuda:<rank>` when CUDA is
+    available.
+    """
+    rank, test_set, ckpt_dir = args
+    device = f"cuda:{rank}"
+
+    model = ECAPA_TDNN_SMALL(feat_dim=1024, feat_type='wavlm_large', config_path=None)
+    state_dict = torch.load(ckpt_dir, weights_only=True, map_location=lambda storage, loc: storage)
+    model.load_state_dict(state_dict['model'], strict=False)
+
+    use_gpu = torch.cuda.is_available()
+    if use_gpu:
+        model = model.cuda(device)
+    model.eval()
+
+    def load_16k(path):
+        # read, resample to 16 kHz and move next to the model
+        wav, sr = torchaudio.load(path)
+        wav = torchaudio.transforms.Resample(orig_freq=sr, new_freq=16000)(wav)
+        return wav.cuda(device) if use_gpu else wav
+
+    sim_list = []
+    for path_a, path_b, truth in tqdm(test_set):
+        wav_a = load_16k(path_a)
+        wav_b = load_16k(path_b)
+
+        with torch.no_grad():
+            emb_a = model(wav_a)
+            emb_b = model(wav_b)
+
+        sim_list.append(F.cosine_similarity(emb_a, emb_b)[0].item())
+
+    return sim_list
+
+
+# filter func for dirty data with many repetitions
+
+def repetition_found(text, length = 2, tolerance = 10):
+    """Return True when any `length`-long substring of `text` occurs more than `tolerance` times.
+
+    Occurrences are counted over every start position, so they may overlap.
+    """
+    seen = defaultdict(int)
+    last_start = len(text) - length
+    for start in range(last_start + 1):
+        chunk = text[start:start + length]
+        seen[chunk] += 1
+        # counts only grow, so the first count past the tolerance settles the answer
+        if seen[chunk] > tolerance:
+            return True
+    return False
+
+
+# load model checkpoint for inference
+
+def load_checkpoint(model, ckpt_path, device, use_ema = True):
+    """Load weights from `ckpt_path` into `model` and return the model.
+
+    A path ending in "safetensors" is read with safetensors, anything else with
+    torch.load. With `use_ema`, the state is loaded into an EMA wrapper (for
+    safetensors the file itself, otherwise the 'ema_model_state_dict' entry) and
+    copied from there into `model`; without it the 'model_state_dict' entry is
+    loaded directly.
+    """
+    from ema_pytorch import EMA
+
+    is_safetensors = ckpt_path.split(".")[-1] == "safetensors"
+    if is_safetensors:
+        from safetensors.torch import load_file
+        checkpoint = load_file(ckpt_path, device=device)
+    else:
+        checkpoint = torch.load(ckpt_path, weights_only=True, map_location=device)
+
+    if not (use_ema == True):
+        model.load_state_dict(checkpoint['model_state_dict'])
+        return model
+
+    ema_model = EMA(model, include_online_model = False).to(device)
+    ema_state = checkpoint if is_safetensors else checkpoint['ema_model_state_dict']
+    ema_model.load_state_dict(ema_state)
+    ema_model.copy_params_from_ema_to_model()
+
+    return model
\ No newline at end of file
diff --git a/capspeech/nar/network/__init__.py b/capspeech/nar/network/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391
diff --git a/capspeech/nar/network/crossdit.py b/capspeech/nar/network/crossdit.py
new file mode 100644
index 0000000000000000000000000000000000000000..702ae31aaae2ab6f705f3ed81867b16a69823f0e
--- /dev/null
+++ b/capspeech/nar/network/crossdit.py
@@ -0,0 +1,208 @@
+"""
+ein notation:
+b - batch
+n - sequence
+nt - text sequence
+nw - raw wave length
+d - dimension
+"""
+
+from __future__ import annotations
+
+import torch
+from torch import nn
+import torch.nn.functional as F
+
+from einops import repeat
+
+from x_transformers.x_transformers import RotaryEmbedding
+
+from capspeech.nar.model.modules import (
+    TimestepEmbedding,
+    ConvNeXtV2Block,
+    ConvPositionEmbedding,
+    CrossDiTBlock,
+    DiTBlock,
+    AdaLayerNormZero_Final,
+    precompute_freqs_cis, get_pos_embed_indices,
+)
+
+
+class TextEmbedding(nn.Module):
+    """Embed text token ids and fit them to a requested sequence length.
+
+    Token ids are shifted by one so that id 0 is the filler token, which is used
+    both for batch padding (-1 before the shift) and for padding up to `seq_len`.
+    With `conv_layers > 0`, positional embeddings are added and the result is
+    passed through ConvNeXtV2 blocks.
+    """
+    def __init__(self, text_num_embeds, text_dim, conv_layers = 0, conv_mult = 2):
+        super().__init__()
+        self.text_embed = nn.Embedding(text_num_embeds + 1, text_dim)  # use 0 as filler token
+
+        if conv_layers > 0:
+            self.extra_modeling = True
+            self.precompute_max_pos = 4096  # ~44s of 24khz audio
+            self.register_buffer("freqs_cis", precompute_freqs_cis(text_dim, self.precompute_max_pos), persistent=False)
+            self.text_blocks = nn.Sequential(*[ConvNeXtV2Block(text_dim, text_dim * conv_mult) for _ in range(conv_layers)])
+        else:
+            self.extra_modeling = False
+
+    def forward(self, text: int['b nt'], seq_len, drop_text=False):
+        """Return embeddings of shape (batch, seq_len, text_dim).
+
+        Text longer than `seq_len` is truncated, shorter text is padded with the
+        filler token. `drop_text` replaces every token with the filler token.
+        """
+        text = text + 1  # use 0 as filler token. preprocess of batch pad -1, see list_str_to_idx()
+        text = text[:, :seq_len]  # curtail if character tokens are more than the mel spec tokens
+
+        # measure the length AFTER truncation: with the pre-truncation length the
+        # pad amount goes negative for long texts and F.pad crops the text again
+        batch, text_len = text.shape[0], text.shape[1]
+        text = F.pad(text, (0, seq_len - text_len), value = 0)
+
+        if drop_text:  # cfg for text
+            text = torch.zeros_like(text)
+
+        text = self.text_embed(text) # b n -> b n d
+
+        # possible extra modeling
+        if self.extra_modeling:
+            # sinus pos emb
+            batch_start = torch.zeros((batch,), dtype=torch.long)
+            pos_idx = get_pos_embed_indices(batch_start, seq_len, max_pos=self.precompute_max_pos)
+            text_pos_embed = self.freqs_cis[pos_idx]
+            text = text + text_pos_embed
+
+            # convnextv2 blocks
+            text = self.text_blocks(text)
+
+        return text
+
+
+# noised input audio and context mixing embedding
+class InputEmbedding(nn.Module):
+    """Fuse noised audio, conditioning audio and text embeddings into one feature sequence."""
+    def __init__(self, mel_dim, text_dim, out_dim):
+        super().__init__()
+        # projection input is the feature-wise concat [x | cond | text_embed]
+        self.proj = nn.Linear(mel_dim * 2 + text_dim, out_dim)
+        self.conv_pos_embed = ConvPositionEmbedding(dim = out_dim)
+
+    def forward(self, x: float['b n d'], cond: float['b n d'], 
+                text_embed: float['b n d'], drop_audio_cond = False):
+        """Project the concatenated inputs and add the convolutional position embedding."""
+        # cfg for cond audio: a dropped or missing condition becomes all zeros
+        if drop_audio_cond or cond is None:
+            cond = torch.zeros_like(x)
+
+        fused = torch.cat((x, cond, text_embed), dim=-1)
+        hidden = self.proj(fused)
+        return self.conv_pos_embed(hidden) + hidden
+
+
+# Transformer backbone using DiT blocks
+
+class CrossDiT(nn.Module):
+    """Transformer backbone of CrossDiT blocks that cross-attend to a caption embedding.
+
+    The stack is `depth // 2` input blocks, one middle block and `depth // 2`
+    output blocks. With `skip=True` every output block receives the output of
+    the matching input block, paired last-in first-out.
+    """
+    def __init__(self,
+                 dim, depth=8, heads=8, dim_head=64, dropout=0.0, ff_mult=4,
+                 mel_dim=100, t5_dim=512, clap_dim=512,
+                 text_num_embeds=256, text_dim=None, conv_layers=0,
+                 skip=False, use_checkpoint=True, qk_norm=True,
+                 ):
+        super().__init__()
+
+        # text embeddings default to the mel width
+        text_dim = mel_dim if text_dim is None else text_dim
+
+        self.time_embed = TimestepEmbedding(dim)
+        self.text_embed = TextEmbedding(text_num_embeds, text_dim, conv_layers=conv_layers)
+        self.input_embed = InputEmbedding(mel_dim, text_dim, dim)
+
+        # caption features -> model width (used as cross-attention context)
+        self.caption_embedding = nn.Sequential(
+            nn.Linear(t5_dim, dim),
+            nn.SiLU(),
+            nn.Linear(dim, dim),
+        )
+
+        # clap features -> text width (prepended to the text embedding in forward)
+        self.clap_embedding = nn.Sequential(
+            nn.Linear(clap_dim, dim),
+            nn.SiLU(),
+            nn.Linear(dim, text_dim),
+        )
+
+        # NOTE: learned null embeddings for clap / prompt are not implemented
+
+        self.rotary_embed = RotaryEmbedding(dim_head)
+
+        self.dim = dim
+        self.depth = depth
+
+        self.skip = skip
+
+        # settings shared by every block; only `skip` differs between the groups
+        block_kwargs = dict(
+            dim=dim,
+            heads=heads,
+            dim_head=dim_head,
+            ff_mult=ff_mult,
+            dropout=dropout,
+            use_checkpoint=use_checkpoint,
+            qk_norm=qk_norm,
+        )
+        half_depth = depth // 2
+
+        self.in_blocks = nn.ModuleList([
+            CrossDiTBlock(skip=False, **block_kwargs) for _ in range(half_depth)
+        ])
+        self.mid_block = CrossDiTBlock(skip=False, **block_kwargs)
+        self.out_blocks = nn.ModuleList([
+            CrossDiTBlock(skip=skip, **block_kwargs) for _ in range(half_depth)
+        ])
+
+        self.norm_out = AdaLayerNormZero_Final(dim)  # final modulation
+        self.proj_out = nn.Linear(dim, mel_dim)
+
+    def forward(
+            self,
+            x: float['b n d'],  # noised input audio
+            cond: float['b n d'],  # masked cond audio
+            prompt: float['b n d'], # speech caption
+            clap: float['b n d'], # sound effects
+            text: int['b nt'],  # text
+            time: float['b'] | float[''],  # time step
+            mask: bool['b n'] | None = None,
+            prompt_mask: bool['b n'] | None = None,
+    ):
+        """Run the backbone and return the projection to `mel_dim` features.
+
+        NOTE(review): `clap` gets an extra axis via `unsqueeze(1)` before being
+        concatenated along the sequence axis, so it is presumably one vector per
+        item rather than the annotated 'b n d' -- TODO confirm with the caller.
+        """
+        batch, seq_len = x.shape[0], x.shape[1]
+        if time.ndim == 0:
+            # a scalar time step is shared by the whole batch
+            time = repeat(time, ' -> b', b=batch)
+
+        t = self.time_embed(time)
+
+        # the clap embedding takes the first sequence slot, leaving seq_len - 1 for text
+        text_embed = self.text_embed(text, seq_len-1)
+        prompt_embed = self.caption_embedding(prompt)
+        clap_embed = self.clap_embedding(clap).unsqueeze(1)
+        text_embed = torch.cat([clap_embed, text_embed], dim=1)
+
+        x = self.input_embed(x, cond, text_embed)
+
+        rope = self.rotary_embed.forward_from_seq_len(seq_len)
+
+        shared = dict(mask=mask, rope=rope, context=prompt_embed, context_mask=prompt_mask)
+
+        skips = []
+        for block in self.in_blocks:
+            x = block(x, t, **shared)
+            if self.skip:
+                skips.append(x)
+
+        x = self.mid_block(x, t, **shared)
+
+        for block in self.out_blocks:
+            skip = skips.pop() if self.skip else None
+            x = block(x, t, skip=skip, **shared)
+
+        return self.proj_out(self.norm_out(x, t))
diff --git a/capspeech/nar/train.py b/capspeech/nar/train.py
new file mode 100644
index 0000000000000000000000000000000000000000..ce11dfebd9bec35c513f53532b80ed618de75488
--- /dev/null
+++ b/capspeech/nar/train.py
@@ -0,0 +1,286 @@
+import os
+import time
+import random
+import argparse
+import numpy as np
+from tqdm import tqdm
+from accelerate import Accelerator
+from einops import rearrange
+from cached_path import cached_path
+
+import torch
+import torch.nn as nn
+import torch.nn.functional as F
+from torch.utils.data import DataLoader
+
+# replace this with BigVGAN
+import bigvgan
+
+from model.modules import MelSpec
+from network.crossdit import CrossDiT
+from dataset.capspeech import CapSpeech
+from utils import load_checkpoint, make_pad_mask
+from utils import get_lr_scheduler, load_yaml_with_includes
+from inference import eval_model
+
+
+def parse_args():
+    parser = argparse.ArgumentParser()
+
+    # Config settings
+    parser.add_argument('--config-name', type=str, required=True)
+
+    # Training settings
+    parser.add_argument("--amp", type=str, default='fp16')
+    parser.add_argument('--epochs', type=int, default=15)
+    parser.add_argument('--num-workers', type=int, default=32)
+    parser.add_argument('--num-threads', type=int, default=1)
+    parser.add_argument('--eval-every-step', type=int, default=10000)
+    # save all states including optimizer every save-every-step
+    parser.add_argument('--save-every-step', type=int, default=10000)
+    parser.add_argument('--resume-from', type=str, default=None, help='Path to checkpoint to resume training')
+
+    # Log and random seed
+    parser.add_argument('--random-seed', type=int, default=2025)
+    parser.add_argument('--log-step', type=int, default=500)
+    parser.add_argument('--log-dir', type=str, default='./logs/')
+    parser.add_argument('--save-dir', type=str, default='./ckpts/')
+    return parser.parse_args()
+
+
+def setup_directories(args, params):
+    args.log_dir = os.path.join(args.log_dir, params['model_name']) + '/'
+    args.save_dir = os.path.join(args.save_dir, params['model_name']) + '/'
+
+    os.makedirs(args.log_dir, exist_ok=True)
+    os.makedirs(args.save_dir, exist_ok=True)
+
+
+def set_device(args):
+    torch.set_num_threads(args.num_threads)
+    if torch.cuda.is_available():
+        args.device = 'cuda'
+        torch.cuda.manual_seed_all(args.random_seed)
+        torch.backends.cuda.matmul.allow_tf32 = True
+        if torch.backends.cudnn.is_available():
+            torch.backends.cudnn.deterministic = True
+            torch.backends.cudnn.benchmark = False
+    else:
+        args.device = 'cpu'
+
+
+def prepare_batch(batch, mel, latent_sr):
+    x, x_lens, y, y_lens, c, c_lens, tag = batch["x"], batch["x_lens"], batch["y"], batch["y_lens"], batch["c"], batch["c_lens"], batch["tag"]
+
+    # add len for clap embedding
+    x_lens = x_lens + 1
+
+    with torch.no_grad():
+        # print(y.mean())
+        audio_clip = mel(y)
+
+        audio_clip = rearrange(audio_clip, 'b d n -> b n d')
+        y_lens = (y_lens * latent_sr).long()
+
+    return x, x_lens, audio_clip, y_lens, c, c_lens, tag
+
+
+# Training entry point: builds data loaders, the CrossDiT model, the mel
+# front-end and the vocoder, optionally resumes from a checkpoint, then runs
+# the flow-matching training loop with periodic logging, evaluation and
+# checkpointing.
+if __name__ == '__main__':
+
+    args = parse_args()
+    params = load_yaml_with_includes(args.config_name)
+
+    # random seed
+    # set_device() seeds the CUDA generators; Python's `random` and torch are
+    # seeded below.
+    # NOTE(review): NumPy's RNG is not seeded in this script -- confirm
+    # whether the dataset relies on it.
+    set_device(args)
+    random.seed(args.random_seed)
+    torch.manual_seed(args.random_seed)
+
+    accelerator = Accelerator(mixed_precision=args.amp,
+                              gradient_accumulation_steps=params['opt']['accumulation_steps'],
+                              step_scheduler_with_optimizer=False)
+
+    # dataset
+    train_set = CapSpeech(**params['data']['trainset'])
+    train_loader = DataLoader(train_set, num_workers=args.num_workers,
+                              batch_size=params['opt']['batch_size'], shuffle=True,
+                              collate_fn=train_set.collate)
+
+    # NOTE(review): the validation loader collates with train_set.collate,
+    # not val_set.collate -- presumably equivalent; verify against CapSpeech.
+    val_set = CapSpeech(**params['data']['valset'])
+    val_loader = DataLoader(val_set, num_workers=0,
+                            batch_size=1, shuffle=False,
+                            collate_fn=train_set.collate)
+
+    # load dit
+    model = CrossDiT(**params['model'])
+
+    # mel spectrogram - move to accelerator device after preparation
+    mel = MelSpec(**params['mel'])
+    # Ratio used by prepare_batch() to scale y_lens.
+    latent_sr = params['mel']['target_sample_rate'] / params['mel']['hop_length']
+
+    # load vocoder
+    vocoder = bigvgan.BigVGAN.from_pretrained('nvidia/bigvgan_v2_24khz_100band_256x', use_cuda_kernel=False)
+    vocoder.remove_weight_norm()
+    vocoder = vocoder.eval().to(accelerator.device)
+
+    # prepare opt
+    optimizer = torch.optim.AdamW(model.parameters(), lr=params['opt']['learning_rate'])
+
+    # Resume only when a path was given AND it exists; a missing path silently
+    # falls through to a fresh run.
+    if args.resume_from is not None and os.path.exists(args.resume_from):
+        checkpoint = torch.load(args.resume_from, map_location='cpu')
+        model.load_state_dict(checkpoint["model"])
+        optimizer.load_state_dict(checkpoint["optimizer"])
+        global_step = checkpoint["global_step"]
+        # NOTE(review): checkpoints are written inside the step loop, so
+        # resuming at "epoch + 1" presumably skips the remainder of the
+        # interrupted epoch -- confirm this is intended.
+        start_epoch = checkpoint["epoch"] + 1  # Continue from the next epoch
+        print(f"Resuming training from checkpoint: {args.resume_from}, starting from epoch {start_epoch}.")
+    else:
+        global_step = 0
+        start_epoch = 0
+    
+    # NOTE(review): the scheduler is created fresh here and no scheduler state
+    # is stored in or read from the checkpoint, so a resumed run presumably
+    # restarts the schedule from its first step -- confirm.
+    lr_scheduler = get_lr_scheduler(optimizer, 'customized', **params['opt']['lr_scheduler'])
+    
+    # Prepare with accelerator
+    (model, optimizer, lr_scheduler, 
+     train_loader, val_loader) = accelerator.prepare(model, optimizer, lr_scheduler, train_loader, val_loader)
+    
+    # Move mel and vocoder to the same device as model AFTER preparation
+    mel = mel.to(accelerator.device)
+    vocoder = vocoder.to(accelerator.device)
+
+    # Add synchronization point
+    accelerator.wait_for_everyone()
+
+    # Running sum of per-step losses between two log lines.
+    losses = 0.0
+
+    # Only the main process rewrites args.log_dir / args.save_dir and creates
+    # the directories; the other processes keep the unmodified paths.
+    if accelerator.is_main_process:
+        setup_directories(args, params)
+        trainable_params = sum(param.nelement() for param in model.parameters() if param.requires_grad)
+        print("Number of trainable parameters: %.2fM" % (trainable_params / 1e6))
+    
+    # Add synchronization point
+    accelerator.wait_for_everyone()
+
+    # REMOVED initial evaluation to prevent deadlock
+    # We'll evaluate after the first epoch or at the first eval step
+
+    for epoch in range(start_epoch, args.epochs):
+        model.train()
+        
+        # Use accelerator's progress bar for correct handling in distributed setup
+        progress_bar = tqdm(train_loader, disable=not accelerator.is_local_main_process)
+        
+        for step, batch in enumerate(progress_bar):
+            with accelerator.accumulate(model):
+                (text, text_lens, audio_clips, audio_lens, prompt, prompt_lens, clap) = prepare_batch(batch, mel, latent_sr)
+                # prepare flow matching
+                # x1 is the data, x0 is Gaussian noise; the model input is the
+                # linear interpolation between them at a random time t and
+                # the regression target is x1 - x0.
+                x1 = audio_clips
+                x0 = torch.randn_like(x1)
+                t = torch.rand((x1.shape[0],), dtype=x1.dtype, device=x1.device)
+                sigma = rearrange(t, 'b -> b 1 1')
+                noisy_x1 = (1 - sigma) * x0.clone() + sigma * x1.clone()
+                flow = x1.clone() - x0.clone()
+                # option: audio-prompt based zero-shot tts
+                # tts_mask = create_tts_mask(seq_len, x1.shape[1], params['opt']['mask_range'])
+                # # cond = x1.clone(), cond[tts_mask[..., None]] = 0
+                # cond = torch.where(tts_mask[..., None], torch.zeros_like(x1), x1)
+                cond = None
+
+                # prepare batch cfg
+                # The prompt is dropped per item with probability
+                # opt.drop_spk; text and clap are dropped only for items whose
+                # prompt was dropped, with additional probability
+                # opt.drop_text.
+                drop_prompt = (torch.rand(x1.shape[0]) < params['opt']['drop_spk'])
+                drop_text = drop_prompt & (torch.rand(x1.shape[0]) < params['opt']['drop_text'])
+
+                # Dropped entries are overwritten in place: zeros for
+                # prompt/clap, -1 for text, length 1 for the prompt.
+                prompt[drop_prompt] = 0.0
+                prompt_lens[drop_prompt] = 1
+                clap[drop_text] = 0.0
+                text[drop_text] = -1
+
+                seq_len_audio = audio_clips.shape[1]
+                pad_mask = make_pad_mask(audio_lens, seq_len_audio).to(audio_clips.device)
+
+                seq_len_prompt = prompt.shape[1]
+                prompt_mask = make_pad_mask(prompt_lens, seq_len_prompt).to(prompt.device)
+
+                pred = model(x=noisy_x1, cond=cond,
+                             prompt=prompt, clap=clap, text=text, time=t,
+                             mask=pad_mask, prompt_mask=prompt_mask)
+
+                # Element-wise MSE, averaged only over positions selected by
+                # pad_mask.
+                loss = F.mse_loss(pred, flow, reduction="none")
+                loss = loss[pad_mask].mean()
+
+                accelerator.backward(loss)
+                # Clip only when gradients are synchronized and a positive
+                # opt.grad_clip is configured.
+                if accelerator.sync_gradients:
+                    if 'grad_clip' in params['opt'] and params['opt']['grad_clip'] > 0:
+                        accelerator.clip_grad_norm_(model.parameters(),
+                                                    max_norm=params['opt']['grad_clip'])
+                # NOTE(review): these three calls run on every micro-batch;
+                # with step_scheduler_with_optimizer=False the scheduler
+                # presumably advances once per micro-batch rather than once
+                # per optimizer update -- verify against the Accelerate docs.
+                optimizer.step()
+                lr_scheduler.step()
+                optimizer.zero_grad()
+
+            # Fixed step counting - increment only once per actual step, not per accumulation step
+            if accelerator.sync_gradients:
+                global_step += 1
+                # Only the loss of the micro-batch that triggered the sync is
+                # added to the running sum.
+                losses += loss.item()
+
+                # Add progress bar description
+                if accelerator.is_local_main_process:
+                    progress_bar.set_description(f"Epoch {epoch+1}, Loss: {loss.item():.6f}")
+
+                if global_step % args.log_step == 0:
+                    losses = losses / args.log_step  # Calculate average loss
+                    
+                    if accelerator.is_main_process:
+                        current_time = time.asctime(time.localtime(time.time()))
+                        epoch_info = f'Epoch: [{epoch + 1}][{args.epochs}]'
+                        batch_info = f'Global Step: {global_step}'
+                        loss_info = f'Loss: {losses:.6f}'
+
+                        # Extract the learning rate from the optimizer
+                        lr = optimizer.param_groups[0]['lr']
+                        lr_info = f'Learning Rate: {lr:.6f}'
+
+                        log_message = f'{current_time}\n{epoch_info}    {batch_info}    {loss_info}    {lr_info}\n'
+
+                        # Append to <log_dir>/log.txt.
+                        with open(args.log_dir + 'log.txt', mode='a') as n:
+                            n.write(log_message)
+
+                    # Reset loss accumulator
+                    losses = 0.0
+                
+                # Evaluation logic
+                if global_step % args.eval_every_step == 0:
+                    # Set model to eval mode
+                    model.eval()
+                    
+                    # Synchronize before evaluation
+                    accelerator.wait_for_everyone()
+                    
+                    # Evaluation and checkpoint writing happen on the main
+                    # process only; the other processes wait at the barrier
+                    # below.
+                    if accelerator.is_main_process:
+                        # Get unwrapped model for evaluation
+                        unwrapped_model = accelerator.unwrap_model(model)
+                        
+                        # Run evaluation without specifying device
+                        eval_model(unwrapped_model, vocoder, mel, val_loader, params,
+                                   steps=25, cfg=2.0,
+                                   sway_sampling_coef=-1.0, 
+                                   # Remove explicit device setting
+                                   epoch=global_step, save_path=args.log_dir + 'output/', val_num=1)
+                        
+                        # Save model checkpoint
+                        # Written as <save_dir>/<global_step>.pt with the
+                        # keys the resume branch above reads back.
+                        accelerator.save({
+                            "model": unwrapped_model.state_dict(),
+                            "optimizer": optimizer.state_dict(),
+                            "epoch": epoch,
+                            "global_step": global_step,
+                        }, args.save_dir + str(global_step) + '.pt')
+                        
+                        # Save full state including optimizer if needed
+                        # This check is nested inside the eval-step check, so
+                        # the full state is saved only at steps divisible by
+                        # both intervals.
+                        if global_step % args.save_every_step == 0:
+                            accelerator.save_state(f"{args.save_dir}{global_step}")
+                    
+                    # Synchronize after evaluation and saving
+                    accelerator.wait_for_everyone()
+                    
+                    # Set model back to train mode
+                    model.train()
+
+        # Synchronize at the end of each epoch
+        accelerator.wait_for_everyone()
diff --git a/capspeech/nar/utils/__init__.py b/capspeech/nar/utils/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..b5e5c679248fe8bb7be8f95ae7f7bffd8139109f
--- /dev/null
+++ b/capspeech/nar/utils/__init__.py
@@ -0,0 +1,2 @@
+from .utils import initialize_controlnet, load_checkpoint, get_lr_scheduler, load_yaml_with_includes, make_pad_mask
+from .mask import create_tts_mask
\ No newline at end of file
diff --git a/capspeech/nar/utils/mask.py b/capspeech/nar/utils/mask.py
new file mode 100644
index 0000000000000000000000000000000000000000..c114817fc03b4d4b7a165f54f25f2edf45e1b0de
--- /dev/null
+++ b/capspeech/nar/utils/mask.py
@@ -0,0 +1,41 @@
+import torch
+
+
+def create_tts_mask(seq_len, max_seq_len, mask_range):
+
+    bs = seq_len.size(0)
+    device = seq_len.device
+
+    # 1. Sample random fractional lengths for each sequence
+    frac_lengths = torch.zeros(bs, device=device).uniform_(*mask_range)
+
+    # 2. Convert fractional lengths to integer lengths
+    lengths = (frac_lengths * seq_len).long()
+
+    # 3. Compute valid start indices based on sequence length
+    max_start = seq_len - lengths
+
+    # 4. Sample random start positions (clamped at 0)
+    rand = torch.rand(bs, device=device)
+    start = (max_start * rand).long().clamp(min=0)
+    end = start + lengths
+
+    # 5. Build the final boolean mask
+    # max_seq_len = seq_len.max().item()
+    seq = torch.arange(max_seq_len, device=device).long()
+
+    start_mask = seq[None, :] >= start[:, None]
+    end_mask = seq[None, :] <  end[:, None]
+    mask = start_mask & end_mask
+
+    return mask
+
+
+if __name__ == "__main__":
+    # Example: 3 sequences of lengths [5, 7, 6]
+    lengths = torch.tensor([5, 7, 6])
+    mask_range = (0.7, 1.0)  # Sample fractional lengths between 30% and 70% of each seq
+
+    mask = create_tts_mask(lengths, mask_range)
+    print("Mask shape:", mask.shape)  # Should be [3, 7], since max_seq_len is 7
+    print(mask)
\ No newline at end of file
diff --git a/capspeech/nar/utils/utils.py b/capspeech/nar/utils/utils.py
new file mode 100644
index 0000000000000000000000000000000000000000..00a502bc87b17167511406c4b61212aff2e72cb6
--- /dev/null
+++ b/capspeech/nar/utils/utils.py
@@ -0,0 +1,92 @@
+import torch
+import numpy as np
+import yaml
+import os
+
+
+def load_yaml_with_includes(yaml_file):
+    def loader_with_include(loader, node):
+        # Load the included file
+        include_path = os.path.join(os.path.dirname(yaml_file), loader.construct_scalar(node))
+        with open(include_path, 'r') as f:
+            return yaml.load(f, Loader=yaml.FullLoader)
+
+    yaml.add_constructor('!include', loader_with_include, Loader=yaml.FullLoader)
+
+    with open(yaml_file, 'r') as f:
+        return yaml.load(f, Loader=yaml.FullLoader)
+
+
+def initialize_controlnet(controlnet, model):
+    model_state_dict = model.state_dict()
+    controlnet_state_dict = controlnet.state_dict()
+
+    # Create a new state_dict for controlnet
+    new_state_dict = {}
+    for k, v in controlnet_state_dict.items():
+        if k in model_state_dict and model_state_dict[k].shape == v.shape:
+            new_state_dict[k] = model_state_dict[k]
+        else:
+            print(f'new layer in controlnet: {k}')
+            new_state_dict[k] = v  # Keep the original if unmatched
+
+    # Load the new state_dict into controlnet
+    controlnet.load_state_dict(new_state_dict)
+    return controlnet
+
+
+def load_checkpoint(model, ckpt_path, device, use_ema = True):
+    ckpt_type = ckpt_path.split(".")[-1]
+    if ckpt_type == "safetensors":
+        from safetensors.torch import load_file
+        checkpoint = load_file(ckpt_path, device=device)
+    else:
+        checkpoint = torch.load(ckpt_path, weights_only=True, map_location=device)
+
+    new_state_dict = {}
+    for key, value in checkpoint.items():
+        if key.startswith('ema_model.transformer'):
+            new_key = key.replace('ema_model.transformer.', '')
+            new_state_dict[new_key] = value
+
+    load_info = model.load_state_dict(new_state_dict, strict=False)
+    # The returned object provides two lists: 'missing_keys' and 'unexpected_keys'
+    print("Missing keys:", load_info.missing_keys)
+    print("Unexpected keys:", load_info.unexpected_keys)
+    return model
+
+
+def customized_lr_scheduler(optimizer, warmup_steps=10000, decay_steps=1e6, end_factor=1e-4):
+    from torch.optim.lr_scheduler import LinearLR, SequentialLR
+    warmup_scheduler = LinearLR(optimizer,
+                                start_factor=min(1 / warmup_steps, 1),
+                                end_factor=1.0, total_iters=warmup_steps)
+
+    decay_scheduler = LinearLR(optimizer,
+                               start_factor=1.0,
+                               end_factor=end_factor,
+                               total_iters=decay_steps)
+
+    scheduler = SequentialLR(optimizer, schedulers=[warmup_scheduler, decay_scheduler], 
+                             milestones=[warmup_steps])
+    return scheduler
+
+
+def get_lr_scheduler(optimizer, name, **kwargs):
+    if name == 'customized':
+        return customized_lr_scheduler(optimizer, **kwargs)
+    elif name == 'cosine':
+        from torch.optim.lr_scheduler import CosineAnnealingLR
+        return CosineAnnealingLR(optimizer, **kwargs)
+    else:
+        raise NotImplementedError(name)
+
+
+def make_pad_mask(lengths: torch.Tensor, max_len: int = 0) -> torch.Tensor:
+    assert lengths.ndim == 1, lengths.ndim
+    max_len = max(max_len, lengths.max())
+    n = lengths.size(0)
+    seq_range = torch.arange(0, max_len, device=lengths.device)
+    expaned_lengths = seq_range.unsqueeze(0).expand(n, max_len)
+
+    return expaned_lengths <= lengths.unsqueeze(-1)
\ No newline at end of file