Spaces:

yl4579
/

DMOSpeech2-demo

Running on Zero

File size: 9,775 Bytes

407412c

{
 "cells": [
  {
   "cell_type": "markdown",
   "id": "93caa9f6",
   "metadata": {},
   "source": [
    "## Basic Inference (teacher-guided, 8 steps)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "72e5ced3",
   "metadata": {},
   "outputs": [],
   "source": [
    "from infer import DMOInference\n",
    "import IPython.display as ipd\n",
    "import torchaudio\n",
    "import time\n",
    "\n",
    "# Initialize the model\n",
    "tts = DMOInference(\n",
    "    student_checkpoint_path=\"../ckpts/model_85000.pt\", \n",
    "    duration_predictor_path=\"../ckpts/model_1500.pt\",\n",
    "    device=\"cuda\",\n",
    "    model_type=\"F5TTS_Base\"\n",
    ")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "bc5b2634",
   "metadata": {},
   "outputs": [],
   "source": [
    "prompt_audio = \"f5_tts/infer/examples/basic/basic_ref_en.wav\"\n",
    "\n",
    "ref_text = \"Some call me nature, others call me mother nature.\"\n",
    "gen_text = \"I don't really care what you call me. I've been a silent spectator, watching species evolve, empires rise and fall. But always remember, I am mighty and enduring.\"\n",
    "\n",
    "start_time = time.time()\n",
    "# Generate with default settings\n",
    "generated_audio = tts.generate(\n",
    "    gen_text=gen_text,\n",
    "    audio_path=prompt_audio,\n",
    "    prompt_text=ref_text)\n",
    "end_time = time.time()\n",
    "\n",
    "processing_time = end_time - start_time\n",
    "audio_duration = generated_audio.shape[-1] / 24000\n",
    "rtf = processing_time / audio_duration\n",
    "\n",
    "print('\\n--------\\n')\n",
    "print('Prompt Audio: ')\n",
    "display(ipd.Audio(prompt_audio, rate=24000))\n",
    "print('Generated Audio: ')\n",
    "display(ipd.Audio(generated_audio, rate=24000))\n",
    "\n",
    "print(f\"  RTF: {rtf:.2f}x ({1/rtf:.2f}x speed)\")\n",
    "print(f\"  Processing: {processing_time:.2f}s for {audio_duration:.2f}s audio\")\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "d65b4bde",
   "metadata": {},
   "outputs": [],
   "source": [
    "prompt_audio = \"f5_tts/infer/examples/basic/basic_ref_zh.wav\"\n",
    "\n",
    "ref_text = \"对，这就是我，万人敬仰的太乙真人。\"\n",
    "gen_text = '突然，身边一阵笑声。我看着他们，意气风发地挺直了胸膛，甩了甩那稍显肉感的双臂，轻笑道：\"我身上的肉，是为了掩饰我爆棚的魅力，否则，岂不吓坏了你们呢？\"'\n",
    "\n",
    "start_time = time.time()\n",
    "# Generate with default settings\n",
    "generated_audio = tts.generate(\n",
    "    gen_text=gen_text,\n",
    "    audio_path=prompt_audio,\n",
    "    prompt_text=ref_text\n",
    ")\n",
    "end_time = time.time()\n",
    "\n",
    "processing_time = end_time - start_time\n",
    "audio_duration = generated_audio.shape[-1] / 24000\n",
    "rtf = processing_time / audio_duration\n",
    "\n",
    "print('\\n--------\\n')\n",
    "print('Prompt Audio: ')\n",
    "display(ipd.Audio(prompt_audio, rate=24000))\n",
    "print('Generated Audio: ')\n",
    "display(ipd.Audio(generated_audio, rate=24000))\n",
    "\n",
    "print(f\"  RTF: {rtf:.2f}x ({1/rtf:.2f}x speed)\")\n",
    "print(f\"  Processing: {processing_time:.2f}s for {audio_duration:.2f}s audio\")\n"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "a0b8df0f",
   "metadata": {},
   "source": [
    "## Comparision between different sampling configurations"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "3962e78c",
   "metadata": {},
   "source": [
    "#### Student only (4 steps)\n",
    "\n",
    "Need to set `teacher_steps` and `student_start_step` to 0 to enable full student sampling."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "53a232a1",
   "metadata": {},
   "outputs": [],
   "source": [
    "prompt_audio = \"f5_tts/infer/examples/basic/basic_ref_zh.wav\"\n",
    "\n",
    "ref_text = \"对，这就是我，万人敬仰的太乙真人。\"\n",
    "gen_text = '突然，身边一阵笑声。我看着他们，意气风发地挺直了胸膛，甩了甩那稍显肉感的双臂，轻笑道：\"我身上的肉，是为了掩饰我爆棚的魅力，否则，岂不吓坏了你们呢？\"'\n",
    "\n",
    "start_time = time.time()\n",
    "# Generate with default settings\n",
    "generated_audio = tts.generate(\n",
    "    gen_text=gen_text,\n",
    "    audio_path=prompt_audio,\n",
    "    prompt_text=ref_text,\n",
    "    teacher_steps=0, # set this to 0 for no teachr sampling\n",
    "    student_start_step=0, # set this to 0 for full student sampling\n",
    ")\n",
    "end_time = time.time()\n",
    "\n",
    "processing_time = end_time - start_time\n",
    "audio_duration = generated_audio.shape[-1] / 24000\n",
    "rtf = processing_time / audio_duration\n",
    "\n",
    "print('\\n--------\\n')\n",
    "print('Prompt Audio: ')\n",
    "display(ipd.Audio(prompt_audio, rate=24000))\n",
    "print('Generated Audio: ')\n",
    "display(ipd.Audio(generated_audio, rate=24000))\n",
    "\n",
    "print(f\"  RTF: {rtf:.2f}x ({1/rtf:.2f}x speed)\")\n",
    "print(f\"  Processing: {processing_time:.2f}s for {audio_duration:.2f}s audio\")"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "5cb24808",
   "metadata": {},
   "source": [
    "#### More teacher steps (16 steps)\n",
    "\n",
    "Now we use 14 steps from the teacher and 2 steps from the student to have higher diversity (16 steps total)."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "10b3620e",
   "metadata": {
    "scrolled": false
   },
   "outputs": [],
   "source": [
    "prompt_audio = \"f5_tts/infer/examples/basic/basic_ref_zh.wav\"\n",
    "\n",
    "ref_text = \"对，这就是我，万人敬仰的太乙真人。\"\n",
    "gen_text = '突然，身边一阵笑声。我看着他们，意气风发地挺直了胸膛，甩了甩那稍显肉感的双臂，轻笑道：\"我身上的肉，是为了掩饰我爆棚的魅力，否则，岂不吓坏了你们呢？\"'\n",
    "\n",
    "start_time = time.time()\n",
    "# Generate with default settings\n",
    "generated_audio = tts.generate(\n",
    "    gen_text=gen_text,\n",
    "    audio_path=prompt_audio,\n",
    "    prompt_text=ref_text,\n",
    "    teacher_steps=24, \n",
    "    teacher_stopping_time=0.3, # 0.25 means students go for the last two steps (0.26ish, 0.6ish)\n",
    "    student_start_step=2, # only two steps for students\n",
    "    verbose=True # see the number of steps used\n",
    ")\n",
    "end_time = time.time()\n",
    "\n",
    "processing_time = end_time - start_time\n",
    "audio_duration = generated_audio.shape[-1] / 24000\n",
    "rtf = processing_time / audio_duration\n",
    "\n",
    "print('\\n--------\\n')\n",
    "print('Prompt Audio: ')\n",
    "display(ipd.Audio(prompt_audio, rate=24000))\n",
    "print('Generated Audio: ')\n",
    "display(ipd.Audio(generated_audio, rate=24000))\n",
    "\n",
    "print(f\"  RTF: {rtf:.2f}x ({1/rtf:.2f}x speed)\")\n",
    "print(f\"  Processing: {processing_time:.2f}s for {audio_duration:.2f}s audio\")"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "70fd3cb1",
   "metadata": {},
   "source": [
    "#### Stochastic duration \n",
    "\n",
    "Introduce even more diversity by adding randomness to the duration"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "bb39d323",
   "metadata": {},
   "outputs": [],
   "source": [
    "prompt_audio = \"f5_tts/infer/examples/basic/basic_ref_zh.wav\"\n",
    "\n",
    "ref_text = \"对，这就是我，万人敬仰的太乙真人。\"\n",
    "gen_text = '突然，身边一阵笑声。我看着他们，意气风发地挺直了胸膛，甩了甩那稍显肉感的双臂，轻笑道：\"我身上的肉，是为了掩饰我爆棚的魅力，否则，岂不吓坏了你们呢？\"'\n",
    "\n",
    "start_time = time.time()\n",
    "# Generate with default settings\n",
    "generated_audio = tts.generate(\n",
    "    gen_text=gen_text,\n",
    "    audio_path=prompt_audio,\n",
    "    prompt_text=ref_text,\n",
    "    teacher_steps=24, \n",
    "    teacher_stopping_time=0.25, # 0.25 means students go for the last two steps (0.26ish, 0.6ish)\n",
    "    student_start_step=2, # only two steps for students\n",
    "    temperature=0.8, # set some temperature for duration sampling \n",
    ")\n",
    "end_time = time.time()\n",
    "\n",
    "processing_time = end_time - start_time\n",
    "audio_duration = generated_audio.shape[-1] / 24000\n",
    "rtf = processing_time / audio_duration\n",
    "\n",
    "print('\\n--------\\n')\n",
    "print('Prompt Audio: ')\n",
    "display(ipd.Audio(prompt_audio, rate=24000))\n",
    "print('Generated Audio: ')\n",
    "display(ipd.Audio(generated_audio, rate=24000))\n",
    "\n",
    "print(f\"  RTF: {rtf:.2f}x ({1/rtf:.2f}x speed)\")\n",
    "print(f\"  Processing: {processing_time:.2f}s for {audio_duration:.2f}s audio\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "7f109ed3",
   "metadata": {},
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "1cc13360",
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3 (ipykernel)",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.10.12"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 5
}