Text-to-Speech Models
Collection
3 items
•
Updated
•
1
This model is a fine-tuned version of CSM-1B (`unsloth/csm-1b`) for medical text-to-speech tasks. It was trained on a curated dataset of ~2,000 medical text-to-speech pairs, focusing on clinical terminology, healthcare instructions, and patient–doctor communication scenarios.
Use the code below to get started with the model.
import torch
from transformers import CsmForConditionalGeneration, AutoProcessor
import soundfile as sf
from peft import PeftModel
# Base checkpoint; the fine-tuned adapter is layered on top of it below.
model_id = "unsloth/csm-1b"
adapter_id = "khazarai/Medical-TTS"

# Fall back to CPU when no CUDA device is available.
device = "cuda" if torch.cuda.is_available() else "cpu"

# Sample rate passed to soundfile when writing the generated waveform.
sample_rate = 24000

processor = AutoProcessor.from_pretrained(model_id)
base_model = CsmForConditionalGeneration.from_pretrained(model_id, device_map=device)
model = PeftModel.from_pretrained(base_model, adapter_id)

text = "Mild dorsal angulation of the distal radius reflective of the fracture."
speaker_id = 0

# The chat template takes the speaker id (as a string) in the "role" field.
conversation = [
    {"role": str(speaker_id), "content": [{"type": "text", "text": text}]},
]

# Tokenize the conversation and move the inputs to the same device as the
# model. Using `device` here (rather than a hard-coded "cuda") keeps the
# script working on CPU-only machines.
inputs = processor.apply_chat_template(
    conversation,
    tokenize=True,
    return_dict=True,
).to(device)

audio_values = model.generate(
    **inputs,
    max_new_tokens=650,
    # play with these parameters to tweak results
    # depth_decoder_top_k=0,
    # depth_decoder_top_p=0.9,
    # depth_decoder_do_sample=True,
    # depth_decoder_temperature=0.9,
    # top_k=0,
    # top_p=1.0,
    # temperature=0.9,
    # do_sample=True,
    #########################################################
    output_audio=True,
)

# Take the first generated item, cast to float32 and move it to host memory
# so soundfile can write it as a NumPy array.
audio = audio_values[0].to(torch.float32).cpu().numpy()
sf.write("example.wav", audio, sample_rate)