{
"nbformat": 4,
"nbformat_minor": 0,
"metadata": {
"colab": {
"provenance": [],
"gpuType": "T4",
"authorship_tag": "ABX9TyOcNNO6b/4X+3VXZr1bdSH6",
"include_colab_link": true
},
"kernelspec": {
"name": "python3",
"display_name": "Python 3"
},
"language_info": {
"name": "python"
},
"accelerator": "GPU",
"widgets": {
"application/vnd.jupyter.widget-state+json": {
"474d9a121acd4a4ab0d9f946db1568bc": {
"model_module": "@jupyter-widgets/controls",
"model_name": "HBoxModel",
"model_module_version": "1.5.0",
"state": {
"_dom_classes": [],
"_model_module": "@jupyter-widgets/controls",
"_model_module_version": "1.5.0",
"_model_name": "HBoxModel",
"_view_count": null,
"_view_module": "@jupyter-widgets/controls",
"_view_module_version": "1.5.0",
"_view_name": "HBoxView",
"box_style": "",
"children": [
"IPY_MODEL_576d89979ab246ed9889ca55ebef3d92",
"IPY_MODEL_700fe25008db413db8fe97821a164ab4",
"IPY_MODEL_46d2cadc00a348b69721f0a451b6ea8c"
],
"layout": "IPY_MODEL_4340f0135b5d4a2f90a507ab8230e6e7"
}
},
"576d89979ab246ed9889ca55ebef3d92": {
"model_module": "@jupyter-widgets/controls",
"model_name": "HTMLModel",
"model_module_version": "1.5.0",
"state": {
"_dom_classes": [],
"_model_module": "@jupyter-widgets/controls",
"_model_module_version": "1.5.0",
"_model_name": "HTMLModel",
"_view_count": null,
"_view_module": "@jupyter-widgets/controls",
"_view_module_version": "1.5.0",
"_view_name": "HTMLView",
"description": "",
"description_tooltip": null,
"layout": "IPY_MODEL_cead9aa9d7b34351b660f330c27ac310",
"placeholder": "",
"style": "IPY_MODEL_9552a1bfd4d34f76b6a1582b612a4ebf",
"value": "config.yaml: 100%"
}
},
"700fe25008db413db8fe97821a164ab4": {
"model_module": "@jupyter-widgets/controls",
"model_name": "FloatProgressModel",
"model_module_version": "1.5.0",
"state": {
"_dom_classes": [],
"_model_module": "@jupyter-widgets/controls",
"_model_module_version": "1.5.0",
"_model_name": "FloatProgressModel",
"_view_count": null,
"_view_module": "@jupyter-widgets/controls",
"_view_module_version": "1.5.0",
"_view_name": "ProgressView",
"bar_style": "success",
"description": "",
"description_tooltip": null,
"layout": "IPY_MODEL_b1d1d14f6aa94217b3d6dfd3daee1da8",
"max": 461,
"min": 0,
"orientation": "horizontal",
"style": "IPY_MODEL_99867e32bd674d709aaf1b0f7102ee81",
"value": 461
}
},
"46d2cadc00a348b69721f0a451b6ea8c": {
"model_module": "@jupyter-widgets/controls",
"model_name": "HTMLModel",
"model_module_version": "1.5.0",
"state": {
"_dom_classes": [],
"_model_module": "@jupyter-widgets/controls",
"_model_module_version": "1.5.0",
"_model_name": "HTMLModel",
"_view_count": null,
"_view_module": "@jupyter-widgets/controls",
"_view_module_version": "1.5.0",
"_view_name": "HTMLView",
"description": "",
"description_tooltip": null,
"layout": "IPY_MODEL_03c99e67ea69449f9a7bcda1419d0ec4",
"placeholder": "",
"style": "IPY_MODEL_380cbd29f1b84541ae20ef1f683ce69f",
"value": " 461/461 [00:00<00:00, 33.7kB/s]"
}
},
"4340f0135b5d4a2f90a507ab8230e6e7": {
"model_module": "@jupyter-widgets/base",
"model_name": "LayoutModel",
"model_module_version": "1.2.0",
"state": {
"_model_module": "@jupyter-widgets/base",
"_model_module_version": "1.2.0",
"_model_name": "LayoutModel",
"_view_count": null,
"_view_module": "@jupyter-widgets/base",
"_view_module_version": "1.2.0",
"_view_name": "LayoutView",
"align_content": null,
"align_items": null,
"align_self": null,
"border": null,
"bottom": null,
"display": null,
"flex": null,
"flex_flow": null,
"grid_area": null,
"grid_auto_columns": null,
"grid_auto_flow": null,
"grid_auto_rows": null,
"grid_column": null,
"grid_gap": null,
"grid_row": null,
"grid_template_areas": null,
"grid_template_columns": null,
"grid_template_rows": null,
"height": null,
"justify_content": null,
"justify_items": null,
"left": null,
"margin": null,
"max_height": null,
"max_width": null,
"min_height": null,
"min_width": null,
"object_fit": null,
"object_position": null,
"order": null,
"overflow": null,
"overflow_x": null,
"overflow_y": null,
"padding": null,
"right": null,
"top": null,
"visibility": null,
"width": null
}
},
"cead9aa9d7b34351b660f330c27ac310": {
"model_module": "@jupyter-widgets/base",
"model_name": "LayoutModel",
"model_module_version": "1.2.0",
"state": {
"_model_module": "@jupyter-widgets/base",
"_model_module_version": "1.2.0",
"_model_name": "LayoutModel",
"_view_count": null,
"_view_module": "@jupyter-widgets/base",
"_view_module_version": "1.2.0",
"_view_name": "LayoutView",
"align_content": null,
"align_items": null,
"align_self": null,
"border": null,
"bottom": null,
"display": null,
"flex": null,
"flex_flow": null,
"grid_area": null,
"grid_auto_columns": null,
"grid_auto_flow": null,
"grid_auto_rows": null,
"grid_column": null,
"grid_gap": null,
"grid_row": null,
"grid_template_areas": null,
"grid_template_columns": null,
"grid_template_rows": null,
"height": null,
"justify_content": null,
"justify_items": null,
"left": null,
"margin": null,
"max_height": null,
"max_width": null,
"min_height": null,
"min_width": null,
"object_fit": null,
"object_position": null,
"order": null,
"overflow": null,
"overflow_x": null,
"overflow_y": null,
"padding": null,
"right": null,
"top": null,
"visibility": null,
"width": null
}
},
"9552a1bfd4d34f76b6a1582b612a4ebf": {
"model_module": "@jupyter-widgets/controls",
"model_name": "DescriptionStyleModel",
"model_module_version": "1.5.0",
"state": {
"_model_module": "@jupyter-widgets/controls",
"_model_module_version": "1.5.0",
"_model_name": "DescriptionStyleModel",
"_view_count": null,
"_view_module": "@jupyter-widgets/base",
"_view_module_version": "1.2.0",
"_view_name": "StyleView",
"description_width": ""
}
},
"b1d1d14f6aa94217b3d6dfd3daee1da8": {
"model_module": "@jupyter-widgets/base",
"model_name": "LayoutModel",
"model_module_version": "1.2.0",
"state": {
"_model_module": "@jupyter-widgets/base",
"_model_module_version": "1.2.0",
"_model_name": "LayoutModel",
"_view_count": null,
"_view_module": "@jupyter-widgets/base",
"_view_module_version": "1.2.0",
"_view_name": "LayoutView",
"align_content": null,
"align_items": null,
"align_self": null,
"border": null,
"bottom": null,
"display": null,
"flex": null,
"flex_flow": null,
"grid_area": null,
"grid_auto_columns": null,
"grid_auto_flow": null,
"grid_auto_rows": null,
"grid_column": null,
"grid_gap": null,
"grid_row": null,
"grid_template_areas": null,
"grid_template_columns": null,
"grid_template_rows": null,
"height": null,
"justify_content": null,
"justify_items": null,
"left": null,
"margin": null,
"max_height": null,
"max_width": null,
"min_height": null,
"min_width": null,
"object_fit": null,
"object_position": null,
"order": null,
"overflow": null,
"overflow_x": null,
"overflow_y": null,
"padding": null,
"right": null,
"top": null,
"visibility": null,
"width": null
}
},
"99867e32bd674d709aaf1b0f7102ee81": {
"model_module": "@jupyter-widgets/controls",
"model_name": "ProgressStyleModel",
"model_module_version": "1.5.0",
"state": {
"_model_module": "@jupyter-widgets/controls",
"_model_module_version": "1.5.0",
"_model_name": "ProgressStyleModel",
"_view_count": null,
"_view_module": "@jupyter-widgets/base",
"_view_module_version": "1.2.0",
"_view_name": "StyleView",
"bar_color": null,
"description_width": ""
}
},
"03c99e67ea69449f9a7bcda1419d0ec4": {
"model_module": "@jupyter-widgets/base",
"model_name": "LayoutModel",
"model_module_version": "1.2.0",
"state": {
"_model_module": "@jupyter-widgets/base",
"_model_module_version": "1.2.0",
"_model_name": "LayoutModel",
"_view_count": null,
"_view_module": "@jupyter-widgets/base",
"_view_module_version": "1.2.0",
"_view_name": "LayoutView",
"align_content": null,
"align_items": null,
"align_self": null,
"border": null,
"bottom": null,
"display": null,
"flex": null,
"flex_flow": null,
"grid_area": null,
"grid_auto_columns": null,
"grid_auto_flow": null,
"grid_auto_rows": null,
"grid_column": null,
"grid_gap": null,
"grid_row": null,
"grid_template_areas": null,
"grid_template_columns": null,
"grid_template_rows": null,
"height": null,
"justify_content": null,
"justify_items": null,
"left": null,
"margin": null,
"max_height": null,
"max_width": null,
"min_height": null,
"min_width": null,
"object_fit": null,
"object_position": null,
"order": null,
"overflow": null,
"overflow_x": null,
"overflow_y": null,
"padding": null,
"right": null,
"top": null,
"visibility": null,
"width": null
}
},
"380cbd29f1b84541ae20ef1f683ce69f": {
"model_module": "@jupyter-widgets/controls",
"model_name": "DescriptionStyleModel",
"model_module_version": "1.5.0",
"state": {
"_model_module": "@jupyter-widgets/controls",
"_model_module_version": "1.5.0",
"_model_name": "DescriptionStyleModel",
"_view_count": null,
"_view_module": "@jupyter-widgets/base",
"_view_module_version": "1.2.0",
"_view_name": "StyleView",
"description_width": ""
}
},
"b5624e5cd9a54018ac0a1db436ea7202": {
"model_module": "@jupyter-widgets/controls",
"model_name": "HBoxModel",
"model_module_version": "1.5.0",
"state": {
"_dom_classes": [],
"_model_module": "@jupyter-widgets/controls",
"_model_module_version": "1.5.0",
"_model_name": "HBoxModel",
"_view_count": null,
"_view_module": "@jupyter-widgets/controls",
"_view_module_version": "1.5.0",
"_view_name": "HBoxView",
"box_style": "",
"children": [
"IPY_MODEL_ed549a2ef8ef4b28b73d8b56cbd3519d",
"IPY_MODEL_a72129f5b6774545a9e0a30f7cb1e619",
"IPY_MODEL_6166f05435dc4b59b7a36229183a4828"
],
"layout": "IPY_MODEL_dbf4ddee3a42433c98ccce6883fab83d"
}
},
"ed549a2ef8ef4b28b73d8b56cbd3519d": {
"model_module": "@jupyter-widgets/controls",
"model_name": "HTMLModel",
"model_module_version": "1.5.0",
"state": {
"_dom_classes": [],
"_model_module": "@jupyter-widgets/controls",
"_model_module_version": "1.5.0",
"_model_name": "HTMLModel",
"_view_count": null,
"_view_module": "@jupyter-widgets/controls",
"_view_module_version": "1.5.0",
"_view_name": "HTMLView",
"description": "",
"description_tooltip": null,
"layout": "IPY_MODEL_b9a2279178094b598fcd978e8edc75fa",
"placeholder": "",
"style": "IPY_MODEL_62fdbd59289c45e6a56154f9c7740c87",
"value": "pytorch_model.bin: 100%"
}
},
"a72129f5b6774545a9e0a30f7cb1e619": {
"model_module": "@jupyter-widgets/controls",
"model_name": "FloatProgressModel",
"model_module_version": "1.5.0",
"state": {
"_dom_classes": [],
"_model_module": "@jupyter-widgets/controls",
"_model_module_version": "1.5.0",
"_model_name": "FloatProgressModel",
"_view_count": null,
"_view_module": "@jupyter-widgets/controls",
"_view_module_version": "1.5.0",
"_view_name": "ProgressView",
"bar_style": "success",
"description": "",
"description_tooltip": null,
"layout": "IPY_MODEL_198ea7e06ab94c34b3c35b978ce754ff",
"max": 54365991,
"min": 0,
"orientation": "horizontal",
"style": "IPY_MODEL_0c9f4972097d49259122001e726d67bd",
"value": 54365991
}
},
"6166f05435dc4b59b7a36229183a4828": {
"model_module": "@jupyter-widgets/controls",
"model_name": "HTMLModel",
"model_module_version": "1.5.0",
"state": {
"_dom_classes": [],
"_model_module": "@jupyter-widgets/controls",
"_model_module_version": "1.5.0",
"_model_name": "HTMLModel",
"_view_count": null,
"_view_module": "@jupyter-widgets/controls",
"_view_module_version": "1.5.0",
"_view_name": "HTMLView",
"description": "",
"description_tooltip": null,
"layout": "IPY_MODEL_3e52c628e5904044a77a2cc9efb942f0",
"placeholder": "",
"style": "IPY_MODEL_4d20115a059542f9b718f4f69fb16a87",
"value": " 54.4M/54.4M [00:00<00:00, 123MB/s]"
}
},
"dbf4ddee3a42433c98ccce6883fab83d": {
"model_module": "@jupyter-widgets/base",
"model_name": "LayoutModel",
"model_module_version": "1.2.0",
"state": {
"_model_module": "@jupyter-widgets/base",
"_model_module_version": "1.2.0",
"_model_name": "LayoutModel",
"_view_count": null,
"_view_module": "@jupyter-widgets/base",
"_view_module_version": "1.2.0",
"_view_name": "LayoutView",
"align_content": null,
"align_items": null,
"align_self": null,
"border": null,
"bottom": null,
"display": null,
"flex": null,
"flex_flow": null,
"grid_area": null,
"grid_auto_columns": null,
"grid_auto_flow": null,
"grid_auto_rows": null,
"grid_column": null,
"grid_gap": null,
"grid_row": null,
"grid_template_areas": null,
"grid_template_columns": null,
"grid_template_rows": null,
"height": null,
"justify_content": null,
"justify_items": null,
"left": null,
"margin": null,
"max_height": null,
"max_width": null,
"min_height": null,
"min_width": null,
"object_fit": null,
"object_position": null,
"order": null,
"overflow": null,
"overflow_x": null,
"overflow_y": null,
"padding": null,
"right": null,
"top": null,
"visibility": null,
"width": null
}
},
"b9a2279178094b598fcd978e8edc75fa": {
"model_module": "@jupyter-widgets/base",
"model_name": "LayoutModel",
"model_module_version": "1.2.0",
"state": {
"_model_module": "@jupyter-widgets/base",
"_model_module_version": "1.2.0",
"_model_name": "LayoutModel",
"_view_count": null,
"_view_module": "@jupyter-widgets/base",
"_view_module_version": "1.2.0",
"_view_name": "LayoutView",
"align_content": null,
"align_items": null,
"align_self": null,
"border": null,
"bottom": null,
"display": null,
"flex": null,
"flex_flow": null,
"grid_area": null,
"grid_auto_columns": null,
"grid_auto_flow": null,
"grid_auto_rows": null,
"grid_column": null,
"grid_gap": null,
"grid_row": null,
"grid_template_areas": null,
"grid_template_columns": null,
"grid_template_rows": null,
"height": null,
"justify_content": null,
"justify_items": null,
"left": null,
"margin": null,
"max_height": null,
"max_width": null,
"min_height": null,
"min_width": null,
"object_fit": null,
"object_position": null,
"order": null,
"overflow": null,
"overflow_x": null,
"overflow_y": null,
"padding": null,
"right": null,
"top": null,
"visibility": null,
"width": null
}
},
"62fdbd59289c45e6a56154f9c7740c87": {
"model_module": "@jupyter-widgets/controls",
"model_name": "DescriptionStyleModel",
"model_module_version": "1.5.0",
"state": {
"_model_module": "@jupyter-widgets/controls",
"_model_module_version": "1.5.0",
"_model_name": "DescriptionStyleModel",
"_view_count": null,
"_view_module": "@jupyter-widgets/base",
"_view_module_version": "1.2.0",
"_view_name": "StyleView",
"description_width": ""
}
},
"198ea7e06ab94c34b3c35b978ce754ff": {
"model_module": "@jupyter-widgets/base",
"model_name": "LayoutModel",
"model_module_version": "1.2.0",
"state": {
"_model_module": "@jupyter-widgets/base",
"_model_module_version": "1.2.0",
"_model_name": "LayoutModel",
"_view_count": null,
"_view_module": "@jupyter-widgets/base",
"_view_module_version": "1.2.0",
"_view_name": "LayoutView",
"align_content": null,
"align_items": null,
"align_self": null,
"border": null,
"bottom": null,
"display": null,
"flex": null,
"flex_flow": null,
"grid_area": null,
"grid_auto_columns": null,
"grid_auto_flow": null,
"grid_auto_rows": null,
"grid_column": null,
"grid_gap": null,
"grid_row": null,
"grid_template_areas": null,
"grid_template_columns": null,
"grid_template_rows": null,
"height": null,
"justify_content": null,
"justify_items": null,
"left": null,
"margin": null,
"max_height": null,
"max_width": null,
"min_height": null,
"min_width": null,
"object_fit": null,
"object_position": null,
"order": null,
"overflow": null,
"overflow_x": null,
"overflow_y": null,
"padding": null,
"right": null,
"top": null,
"visibility": null,
"width": null
}
},
"0c9f4972097d49259122001e726d67bd": {
"model_module": "@jupyter-widgets/controls",
"model_name": "ProgressStyleModel",
"model_module_version": "1.5.0",
"state": {
"_model_module": "@jupyter-widgets/controls",
"_model_module_version": "1.5.0",
"_model_name": "ProgressStyleModel",
"_view_count": null,
"_view_module": "@jupyter-widgets/base",
"_view_module_version": "1.2.0",
"_view_name": "StyleView",
"bar_color": null,
"description_width": ""
}
},
"3e52c628e5904044a77a2cc9efb942f0": {
"model_module": "@jupyter-widgets/base",
"model_name": "LayoutModel",
"model_module_version": "1.2.0",
"state": {
"_model_module": "@jupyter-widgets/base",
"_model_module_version": "1.2.0",
"_model_name": "LayoutModel",
"_view_count": null,
"_view_module": "@jupyter-widgets/base",
"_view_module_version": "1.2.0",
"_view_name": "LayoutView",
"align_content": null,
"align_items": null,
"align_self": null,
"border": null,
"bottom": null,
"display": null,
"flex": null,
"flex_flow": null,
"grid_area": null,
"grid_auto_columns": null,
"grid_auto_flow": null,
"grid_auto_rows": null,
"grid_column": null,
"grid_gap": null,
"grid_row": null,
"grid_template_areas": null,
"grid_template_columns": null,
"grid_template_rows": null,
"height": null,
"justify_content": null,
"justify_items": null,
"left": null,
"margin": null,
"max_height": null,
"max_width": null,
"min_height": null,
"min_width": null,
"object_fit": null,
"object_position": null,
"order": null,
"overflow": null,
"overflow_x": null,
"overflow_y": null,
"padding": null,
"right": null,
"top": null,
"visibility": null,
"width": null
}
},
"4d20115a059542f9b718f4f69fb16a87": {
"model_module": "@jupyter-widgets/controls",
"model_name": "DescriptionStyleModel",
"model_module_version": "1.5.0",
"state": {
"_model_module": "@jupyter-widgets/controls",
"_model_module_version": "1.5.0",
"_model_name": "DescriptionStyleModel",
"_view_count": null,
"_view_module": "@jupyter-widgets/base",
"_view_module_version": "1.2.0",
"_view_name": "StyleView",
"description_width": ""
}
}
}
}
},
"cells": [
{
"cell_type": "markdown",
"metadata": {
"id": "view-in-github",
"colab_type": "text"
},
"source": [
"<img src=\"https://colab.research.google.com/assets/colab-badge.svg\" alt=\"Open In Colab\"/>"
]
},
{
"cell_type": "code",
"source": [
"# Fetch the ttts source tree; skip the clone when ./ttts already exists so the cell can be re-run.\n",
"!test -d ttts || git clone -b master https://github.com/adelacvg/ttts.git"
],
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/"
},
"id": "WB8vmGyDAPrr",
"outputId": "cee62580-1134-435f-d6ce-e68402a48a31"
},
"execution_count": 2,
"outputs": [
{
"output_type": "stream",
"name": "stdout",
"text": [
"Cloning into 'ttts'...\n",
"remote: Enumerating objects: 538, done.\u001b[K\n",
"remote: Counting objects: 100% (444/444), done.\u001b[K\n",
"remote: Compressing objects: 100% (283/283), done.\u001b[K\n",
"remote: Total 538 (delta 241), reused 346 (delta 154), pack-reused 94\u001b[K\n",
"Receiving objects: 100% (538/538), 61.11 MiB | 13.93 MiB/s, done.\n",
"Resolving deltas: 100% (250/250), done.\n"
]
}
]
},
{
"cell_type": "code",
"source": [
"# Work from the repository root: later cells use paths relative to it.\n",
"%cd ttts\n",
"# Editable install via the %pip magic so the package lands in the running kernel's environment.\n",
"%pip install -e ."
],
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/"
},
"id": "HKikTDsVA1_g",
"outputId": "1906df87-d848-4099-acec-b18c40db200a"
},
"execution_count": null,
"outputs": [
{
"output_type": "stream",
"name": "stdout",
"text": [
"/content/ttts\n",
"Obtaining file:///content/ttts\n",
" Preparing metadata (setup.py) ... \u001b[?25l\u001b[?25hdone\n",
"Installing collected packages: ttts\n",
" Running setup.py develop for ttts\n",
"Successfully installed ttts-0.1\n"
]
}
]
},
{
"cell_type": "code",
"source": [
"# Clone the model repository into ./TTTS; later cells read checkpoint files from that directory.\n",
"# Git LFS is initialised first (presumably the checkpoint files are LFS-tracked - confirm).\n",
"!git lfs install\n",
"# Skip the clone when ./TTTS already exists so the cell can be re-run.\n",
"!test -d TTTS || git clone https://huggingface.co/adelacvg/TTTS"
],
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/"
},
"id": "_KYD7eZQB4x7",
"outputId": "dc2fc2b8-5883-4d96-96b6-5df190d356f7"
},
"execution_count": 1,
"outputs": [
{
"output_type": "stream",
"name": "stdout",
"text": [
"Git LFS initialized.\n",
"Cloning into 'TTTS'...\n",
"fatal: Remote branch master not found in upstream origin\n"
]
}
]
},
{
"cell_type": "code",
"source": [
"import locale\n",
"\n",
"# Make locale.getpreferredencoding() report UTF-8 before installing\n",
"# (presumably a Colab workaround for shell commands - confirm).\n",
"locale.getpreferredencoding = lambda: \"UTF-8\"\n",
"\n",
"# Versions are pinned to the ones resolved in this notebook's recorded run for reproducibility.\n",
"%pip install pypinyin==0.50.0 einops==0.7.0 omegaconf==2.0.6 encodec==0.1.1 vocos==0.1.0 k_diffusion==0.1.1.post1"
],
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/"
},
"id": "UcOqgsONDGZ3",
"outputId": "5d0ef477-0263-4cad-cd75-da93a1482a87"
},
"execution_count": null,
"outputs": [
{
"output_type": "stream",
"name": "stdout",
"text": [
"Collecting pypinyin\n",
" Downloading pypinyin-0.50.0-py2.py3-none-any.whl (1.4 MB)\n",
"\u001b[?25l \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m0.0/1.4 MB\u001b[0m \u001b[31m?\u001b[0m eta \u001b[36m-:--:--\u001b[0m\r\u001b[2K \u001b[91m━━━━━\u001b[0m\u001b[90m╺\u001b[0m\u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m0.2/1.4 MB\u001b[0m \u001b[31m5.0 MB/s\u001b[0m eta \u001b[36m0:00:01\u001b[0m\r\u001b[2K \u001b[91m━━━━━━━━━━━━━━━━\u001b[0m\u001b[91m╸\u001b[0m\u001b[90m━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m0.6/1.4 MB\u001b[0m \u001b[31m8.3 MB/s\u001b[0m eta \u001b[36m0:00:01\u001b[0m\r\u001b[2K \u001b[91m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m\u001b[91m╸\u001b[0m\u001b[90m━━━━━━━━━\u001b[0m \u001b[32m1.0/1.4 MB\u001b[0m \u001b[31m10.0 MB/s\u001b[0m eta \u001b[36m0:00:01\u001b[0m\r\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m1.4/1.4 MB\u001b[0m \u001b[31m10.0 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
"\u001b[?25hCollecting einops\n",
" Downloading einops-0.7.0-py3-none-any.whl (44 kB)\n",
"\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m44.6/44.6 kB\u001b[0m \u001b[31m5.8 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
"\u001b[?25hCollecting omegaconf==2.0.6\n",
" Downloading omegaconf-2.0.6-py3-none-any.whl (36 kB)\n",
"Collecting encodec\n",
" Downloading encodec-0.1.1.tar.gz (3.7 MB)\n",
"\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m3.7/3.7 MB\u001b[0m \u001b[31m23.2 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
"\u001b[?25h Preparing metadata (setup.py) ... \u001b[?25l\u001b[?25hdone\n",
"Collecting vocos\n",
" Downloading vocos-0.1.0-py3-none-any.whl (24 kB)\n",
"Collecting k_diffusion\n",
" Downloading k_diffusion-0.1.1.post1-py3-none-any.whl (33 kB)\n",
"Requirement already satisfied: PyYAML>=5.1.* in /usr/local/lib/python3.10/dist-packages (from omegaconf==2.0.6) (6.0.1)\n",
"Requirement already satisfied: typing-extensions in /usr/local/lib/python3.10/dist-packages (from omegaconf==2.0.6) (4.5.0)\n",
"Requirement already satisfied: numpy in /usr/local/lib/python3.10/dist-packages (from encodec) (1.23.5)\n",
"Requirement already satisfied: torch in /usr/local/lib/python3.10/dist-packages (from encodec) (2.1.0+cu121)\n",
"Requirement already satisfied: torchaudio in /usr/local/lib/python3.10/dist-packages (from encodec) (2.1.0+cu121)\n",
"Requirement already satisfied: scipy in /usr/local/lib/python3.10/dist-packages (from vocos) (1.11.4)\n",
"Requirement already satisfied: huggingface-hub in /usr/local/lib/python3.10/dist-packages (from vocos) (0.20.2)\n",
"Collecting accelerate (from k_diffusion)\n",
" Downloading accelerate-0.26.1-py3-none-any.whl (270 kB)\n",
"\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m270.9/270.9 kB\u001b[0m \u001b[31m27.1 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
"\u001b[?25hCollecting clean-fid (from k_diffusion)\n",
" Downloading clean_fid-0.1.35-py3-none-any.whl (26 kB)\n",
"Collecting clip-anytorch (from k_diffusion)\n",
" Downloading clip_anytorch-2.6.0-py3-none-any.whl (1.4 MB)\n",
"\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m1.4/1.4 MB\u001b[0m \u001b[31m31.3 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
"\u001b[?25hCollecting dctorch (from k_diffusion)\n",
" Downloading dctorch-0.1.2-py3-none-any.whl (2.3 kB)\n",
"Collecting jsonmerge (from k_diffusion)\n",
" Downloading jsonmerge-1.9.2-py3-none-any.whl (19 kB)\n",
"Collecting kornia (from k_diffusion)\n",
" Downloading kornia-0.7.1-py2.py3-none-any.whl (756 kB)\n",
"\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m756.0/756.0 kB\u001b[0m \u001b[31m37.4 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
"\u001b[?25hRequirement already satisfied: Pillow in /usr/local/lib/python3.10/dist-packages (from k_diffusion) (9.4.0)\n",
"Requirement already satisfied: safetensors in /usr/local/lib/python3.10/dist-packages (from k_diffusion) (0.4.1)\n",
"Requirement already satisfied: scikit-image in /usr/local/lib/python3.10/dist-packages (from k_diffusion) (0.19.3)\n",
"Collecting torchdiffeq (from k_diffusion)\n",
" Downloading torchdiffeq-0.2.3-py3-none-any.whl (31 kB)\n",
"Collecting torchsde (from k_diffusion)\n",
" Downloading torchsde-0.2.6-py3-none-any.whl (61 kB)\n",
"\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m61.2/61.2 kB\u001b[0m \u001b[31m9.1 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
"\u001b[?25hRequirement already satisfied: torchvision in /usr/local/lib/python3.10/dist-packages (from k_diffusion) (0.16.0+cu121)\n",
"Requirement already satisfied: tqdm in /usr/local/lib/python3.10/dist-packages (from k_diffusion) (4.66.1)\n",
"Collecting wandb (from k_diffusion)\n",
" Downloading wandb-0.16.2-py3-none-any.whl (2.2 MB)\n",
"\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m2.2/2.2 MB\u001b[0m \u001b[31m38.3 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
"\u001b[?25hRequirement already satisfied: filelock in /usr/local/lib/python3.10/dist-packages (from torch->encodec) (3.13.1)\n",
"Requirement already satisfied: sympy in /usr/local/lib/python3.10/dist-packages (from torch->encodec) (1.12)\n",
"Requirement already satisfied: networkx in /usr/local/lib/python3.10/dist-packages (from torch->encodec) (3.2.1)\n",
"Requirement already satisfied: jinja2 in /usr/local/lib/python3.10/dist-packages (from torch->encodec) (3.1.2)\n",
"Requirement already satisfied: fsspec in /usr/local/lib/python3.10/dist-packages (from torch->encodec) (2023.6.0)\n",
"Requirement already satisfied: triton==2.1.0 in /usr/local/lib/python3.10/dist-packages (from torch->encodec) (2.1.0)\n",
"Requirement already satisfied: packaging>=20.0 in /usr/local/lib/python3.10/dist-packages (from accelerate->k_diffusion) (23.2)\n",
"Requirement already satisfied: psutil in /usr/local/lib/python3.10/dist-packages (from accelerate->k_diffusion) (5.9.5)\n",
"Requirement already satisfied: requests in /usr/local/lib/python3.10/dist-packages (from clean-fid->k_diffusion) (2.31.0)\n",
"Collecting ftfy (from clip-anytorch->k_diffusion)\n",
" Downloading ftfy-6.1.3-py3-none-any.whl (53 kB)\n",
"\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m53.4/53.4 kB\u001b[0m \u001b[31m5.7 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
"\u001b[?25hRequirement already satisfied: regex in /usr/local/lib/python3.10/dist-packages (from clip-anytorch->k_diffusion) (2023.6.3)\n",
"Requirement already satisfied: jsonschema>2.4.0 in /usr/local/lib/python3.10/dist-packages (from jsonmerge->k_diffusion) (4.19.2)\n",
"Requirement already satisfied: imageio>=2.4.1 in /usr/local/lib/python3.10/dist-packages (from scikit-image->k_diffusion) (2.31.6)\n",
"Requirement already satisfied: tifffile>=2019.7.26 in /usr/local/lib/python3.10/dist-packages (from scikit-image->k_diffusion) (2023.12.9)\n",
"Requirement already satisfied: PyWavelets>=1.1.1 in /usr/local/lib/python3.10/dist-packages (from scikit-image->k_diffusion) (1.5.0)\n",
"Collecting trampoline>=0.1.2 (from torchsde->k_diffusion)\n",
" Downloading trampoline-0.1.2-py3-none-any.whl (5.2 kB)\n",
"Requirement already satisfied: Click!=8.0.0,>=7.1 in /usr/local/lib/python3.10/dist-packages (from wandb->k_diffusion) (8.1.7)\n",
"Collecting GitPython!=3.1.29,>=1.0.0 (from wandb->k_diffusion)\n",
" Downloading GitPython-3.1.41-py3-none-any.whl (196 kB)\n",
"\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m196.4/196.4 kB\u001b[0m \u001b[31m21.8 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
"\u001b[?25hCollecting sentry-sdk>=1.0.0 (from wandb->k_diffusion)\n",
" Downloading sentry_sdk-1.39.2-py2.py3-none-any.whl (254 kB)\n",
"\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m254.1/254.1 kB\u001b[0m \u001b[31m23.0 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
"\u001b[?25hCollecting docker-pycreds>=0.4.0 (from wandb->k_diffusion)\n",
" Downloading docker_pycreds-0.4.0-py2.py3-none-any.whl (9.0 kB)\n",
"Collecting setproctitle (from wandb->k_diffusion)\n",
" Downloading setproctitle-1.3.3-cp310-cp310-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl (30 kB)\n",
"Requirement already satisfied: setuptools in /usr/local/lib/python3.10/dist-packages (from wandb->k_diffusion) (67.7.2)\n",
"Requirement already satisfied: appdirs>=1.4.3 in /usr/local/lib/python3.10/dist-packages (from wandb->k_diffusion) (1.4.4)\n",
"Requirement already satisfied: protobuf!=4.21.0,<5,>=3.19.0 in /usr/local/lib/python3.10/dist-packages (from wandb->k_diffusion) (3.20.3)\n",
"Requirement already satisfied: six>=1.4.0 in /usr/local/lib/python3.10/dist-packages (from docker-pycreds>=0.4.0->wandb->k_diffusion) (1.16.0)\n",
"Collecting gitdb<5,>=4.0.1 (from GitPython!=3.1.29,>=1.0.0->wandb->k_diffusion)\n",
" Downloading gitdb-4.0.11-py3-none-any.whl (62 kB)\n",
"\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m62.7/62.7 kB\u001b[0m \u001b[31m7.9 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
"\u001b[?25hRequirement already satisfied: attrs>=22.2.0 in /usr/local/lib/python3.10/dist-packages (from jsonschema>2.4.0->jsonmerge->k_diffusion) (23.2.0)\n",
"Requirement already satisfied: jsonschema-specifications>=2023.03.6 in /usr/local/lib/python3.10/dist-packages (from jsonschema>2.4.0->jsonmerge->k_diffusion) (2023.12.1)\n",
"Requirement already satisfied: referencing>=0.28.4 in /usr/local/lib/python3.10/dist-packages (from jsonschema>2.4.0->jsonmerge->k_diffusion) (0.32.1)\n",
"Requirement already satisfied: rpds-py>=0.7.1 in /usr/local/lib/python3.10/dist-packages (from jsonschema>2.4.0->jsonmerge->k_diffusion) (0.16.2)\n",
"Requirement already satisfied: charset-normalizer<4,>=2 in /usr/local/lib/python3.10/dist-packages (from requests->clean-fid->k_diffusion) (3.3.2)\n",
"Requirement already satisfied: idna<4,>=2.5 in /usr/local/lib/python3.10/dist-packages (from requests->clean-fid->k_diffusion) (3.6)\n",
"Requirement already satisfied: urllib3<3,>=1.21.1 in /usr/local/lib/python3.10/dist-packages (from requests->clean-fid->k_diffusion) (2.0.7)\n",
"Requirement already satisfied: certifi>=2017.4.17 in /usr/local/lib/python3.10/dist-packages (from requests->clean-fid->k_diffusion) (2023.11.17)\n",
"Requirement already satisfied: wcwidth<0.3.0,>=0.2.12 in /usr/local/lib/python3.10/dist-packages (from ftfy->clip-anytorch->k_diffusion) (0.2.12)\n",
"Requirement already satisfied: MarkupSafe>=2.0 in /usr/local/lib/python3.10/dist-packages (from jinja2->torch->encodec) (2.1.3)\n",
"Requirement already satisfied: mpmath>=0.19 in /usr/local/lib/python3.10/dist-packages (from sympy->torch->encodec) (1.3.0)\n",
"Collecting smmap<6,>=3.0.1 (from gitdb<5,>=4.0.1->GitPython!=3.1.29,>=1.0.0->wandb->k_diffusion)\n",
" Downloading smmap-5.0.1-py3-none-any.whl (24 kB)\n",
"Building wheels for collected packages: encodec\n",
" Building wheel for encodec (setup.py) ... \u001b[?25l\u001b[?25hdone\n",
" Created wheel for encodec: filename=encodec-0.1.1-py3-none-any.whl size=45759 sha256=a6f25c3141ae73377bf58efa53716b479f3299aaebcbaf213514f7955a86427f\n",
" Stored in directory: /root/.cache/pip/wheels/fc/36/cb/81af8b985a5f5e0815312d5e52b41263237af07b977e6bcbf3\n",
"Successfully built encodec\n",
"Installing collected packages: trampoline, smmap, setproctitle, sentry-sdk, pypinyin, omegaconf, ftfy, einops, docker-pycreds, gitdb, torchsde, torchdiffeq, kornia, GitPython, dctorch, accelerate, wandb, jsonmerge, encodec, clip-anytorch, clean-fid, vocos, k_diffusion\n",
"Successfully installed GitPython-3.1.41 accelerate-0.26.1 clean-fid-0.1.35 clip-anytorch-2.6.0 dctorch-0.1.2 docker-pycreds-0.4.0 einops-0.7.0 encodec-0.1.1 ftfy-6.1.3 gitdb-4.0.11 jsonmerge-1.9.2 k_diffusion-0.1.1.post1 kornia-0.7.1 omegaconf-2.0.6 pypinyin-0.50.0 sentry-sdk-1.39.2 setproctitle-1.3.3 smmap-5.0.1 torchdiffeq-0.2.3 torchsde-0.2.6 trampoline-0.1.2 vocos-0.1.0 wandb-0.16.2\n"
]
}
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"id": "9Iz6jokZ_-AB"
},
"outputs": [],
"source": [
"import torch\n",
"from pypinyin import Style, lazy_pinyin\n",
"\n",
"# Checkpoint name -> file path. Entries left as '' have no path set.\n",
"# Relative paths are resolved against the current working directory.\n",
"MODELS = {\n",
"    'vqvae.pth': 'TTTS/vae-30.pt',\n",
"    'gpt.pth': 'TTTS/gpt-70.pt',\n",
"    'clvp2.pth': '',\n",
"    'diffusion.pth': 'TTTS/diffusion-855.pt',\n",
"    # NOTE(review): home-relative path outside the cloned repositories; confirm it exists on this runtime.\n",
"    'vocoder.pth': '~/tortoise_plus_zh/ttts/pretrained_models/pytorch_model.bin',\n",
"    'rlg_auto.pth': '',\n",
"    'rlg_diffuser.pth': '',\n",
"}"
]
},
{
"cell_type": "code",
"source": [
"from ttts.gpt.voice_tokenizer import VoiceBpeTokenizer\n",
"import torch.nn.functional as F\n",
"\n",
"# Prefer the first GPU, but fall back to the CPU so the cell does not crash on a CPU-only runtime.\n",
"device = 'cuda:0' if torch.cuda.is_available() else 'cpu'\n",
"\n",
"# Sample sentences; change the index below to synthesize a different one.\n",
"SAMPLE_TEXTS = [\n",
"    \"大家好,今天来点大家想看的东西。\",\n",
"    \"霞浦县衙城镇乌旗瓦窑村水位猛涨。\",\n",
"    '高德官方网站,拥有全面、精准的地点信息,公交驾车路线规划,特色语音导航,商家团购、优惠信息。',\n",
"    '四是四,十是十,十四是十四,四十是四十。',\n",
"    '八百标兵奔北坡,炮兵并排北边跑。炮兵怕把标兵碰,标兵怕碰炮兵炮。',\n",
"    '黑化肥发灰,灰化肥发黑。黑化肥挥发会发灰;灰化肥挥发会发黑。',\n",
"    '先帝创业未半而中道崩殂,今天下三分,益州疲弊,此诚危急存亡之秋也。然侍卫之臣不懈于内,忠志之士忘身于外者,盖追先帝之殊遇,欲报之于陛下也。诚宜开张圣听,以光先帝遗德,恢弘志士之气,不宜妄自菲薄,引喻失义,以塞忠谏之路也。',\n",
"]\n",
"text = SAMPLE_TEXTS[0]\n",
"\n",
"# Convert the text to space-separated pinyin syllables with numeric tones (neutral tone written as 5).\n",
"pinyin = ' '.join(lazy_pinyin(text, style=Style.TONE3, neutral_tone_with_five=True))\n",
"tokenizer = VoiceBpeTokenizer('ttts/gpt/gpt_tts_tokenizer.json')\n",
"# Build the int32 token batch (one row) directly on the target device.\n",
"text_tokens = torch.tensor(tokenizer.encode(pinyin), dtype=torch.int32, device=device).unsqueeze(0)\n",
"text_tokens = F.pad(text_tokens, (0, 1))  # Appends one trailing 0 token; this may not be necessary.\n",
"print(pinyin)\n",
"print(text_tokens)"
],
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/"
},
"id": "SpTKi32TDQFi",
"outputId": "84292cc0-d09e-420d-c449-a746e44dade5"
},
"execution_count": null,
"outputs": [
{
"output_type": "stream",
"name": "stdout",
"text": [
"da4 jia1 hao3 , jin1 tian1 lai2 dian3 da4 jia1 xiang3 kan4 de5 dong1 xi1 。\n",
"tensor([[161, 2, 155, 2, 16, 87, 2, 43, 2, 224, 2, 171, 71, 2,\n",
" 182, 2, 188, 2, 161, 2, 155, 2, 62, 92, 2, 19, 63, 2,\n",
" 65, 2, 12, 84, 2, 228, 2, 39, 0]], device='cuda:0',\n",
" dtype=torch.int32)\n"
]
}
]
},
{
"cell_type": "code",
"source": [
"from ttts.utils.infer_utils import load_model\n",
"from ttts.vocoder.feature_extractors import MelSpectrogramFeatures\n",
"import torchaudio\n",
"\n",
"# Autoregressive GPT model, configured without DeepSpeed, KV cache or half precision.\n",
"gpt = load_model('gpt', MODELS['gpt.pth'], 'ttts/gpt/config.json', device)\n",
"gpt.post_init_gpt2_config(use_deepspeed=False, kv_cache=False, half=False)\n",
"\n",
"# Reference clip used for conditioning: keep only the first channel and resample to 24 kHz.\n",
"cond_audio = 'ttts/3.wav'\n",
"audio, sr = torchaudio.load(cond_audio)\n",
"if audio.shape[0] > 1:\n",
"    audio = audio[:1]\n",
"to_24khz = torchaudio.transforms.Resample(sr, 24000)\n",
"audio = to_24khz(audio)\n",
"\n",
"# Mel features of the reference clip, moved to the inference device.\n",
"mel_extractor = MelSpectrogramFeatures()\n",
"cond_mel = mel_extractor(audio).to(device)\n",
"print(cond_mel.shape)"
],
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/"
},
"id": "0LBf8nz2DQ-_",
"outputId": "5da3c1c4-1988-4f78-d6c3-0bb936132ea0"
},
"execution_count": null,
"outputs": [
{
"output_type": "stream",
"name": "stderr",
"text": [
"/usr/local/lib/python3.10/dist-packages/transformers/configuration_utils.py:381: UserWarning: Passing `gradient_checkpointing` to a config initialization is deprecated and will be removed in v5 Transformers. Using `model.gradient_checkpointing_enable()` instead, or if you are using the `Trainer` API, pass `gradient_checkpointing=True` in your `TrainingArguments`.\n",
" warnings.warn(\n"
]
},
{
"output_type": "stream",
"name": "stdout",
"text": [
"torch.Size([1, 100, 400])\n"
]
}
]
},
{
"cell_type": "code",
"source": [
"auto_conditioning = cond_mel\n",
"\n",
"# Single source of truth for the sampling hyper-parameters; the names below are derived from it\n",
"# so the values can no longer drift apart.\n",
"settings = {'temperature': .8, 'length_penalty': 1.0, 'repetition_penalty': 2.0,\n",
"            'top_p': .8,\n",
"            'cond_free_k': 2.0, 'diffusion_temperature': 1.0}\n",
"top_p = settings['top_p']\n",
"temperature = settings['temperature']\n",
"length_penalty = settings['length_penalty']\n",
"repetition_penalty = settings['repetition_penalty']\n",
"autoregressive_batch_size = 1\n",
"max_mel_tokens = 600\n",
"\n",
"print(auto_conditioning.shape)\n",
"print(text_tokens.shape)\n",
"\n",
"# Sampling is inference only, so skip autograd bookkeeping.\n",
"with torch.no_grad():\n",
"    codes = gpt.inference_speech(auto_conditioning, text_tokens,\n",
"                                 do_sample=True,\n",
"                                 top_p=top_p,\n",
"                                 temperature=temperature,\n",
"                                 num_return_sequences=autoregressive_batch_size,\n",
"                                 length_penalty=length_penalty,\n",
"                                 repetition_penalty=repetition_penalty,\n",
"                                 max_generate_length=max_mel_tokens)\n",
"print(codes)"
],
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/"
},
"id": "VFEHSQ42Dadt",
"outputId": "774d31e7-8619-40b7-b503-29b1db39b07a"
},
"execution_count": null,
"outputs": [
{
"output_type": "stream",
"name": "stdout",
"text": [
"torch.Size([1, 100, 400])\n",
"torch.Size([1, 37])\n",
"torch.Size([1, 37])\n",
"tensor([[2867, 40, 7537, 5986, 692, 8079, 3282, 2094, 7478, 3286, 6652, 6674,\n",
" 5798, 2868, 4153, 1419, 4593, 423, 4472, 1487, 1989, 1628, 2796, 7296,\n",
" 4683, 3228, 7038, 6446, 89, 650, 7796, 2746, 4241, 4120, 2312, 1319,\n",
" 920, 4114, 6384, 4140, 1420, 7758, 1772, 6313, 4813, 1588, 366, 7217,\n",
" 6078, 2773, 6962, 5245, 7034, 1663, 6909, 7176, 3340, 3308, 1078, 72,\n",
" 1060, 4546, 2860, 3679, 6956, 4215, 2774, 5394, 0, 8193]],\n",
" device='cuda:0')\n"
]
}
]
},
{
"cell_type": "code",
"source": [
"from vocos import Vocos\n",
"\n",
"# Load the pretrained vocoder checkpoint by its repository id.\n",
"vocos_checkpoint = \"charactr/vocos-mel-24khz\"\n",
"vocos = Vocos.from_pretrained(vocos_checkpoint)"
],
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/",
"height": 209,
"referenced_widgets": [
"474d9a121acd4a4ab0d9f946db1568bc",
"576d89979ab246ed9889ca55ebef3d92",
"700fe25008db413db8fe97821a164ab4",
"46d2cadc00a348b69721f0a451b6ea8c",
"4340f0135b5d4a2f90a507ab8230e6e7",
"cead9aa9d7b34351b660f330c27ac310",
"9552a1bfd4d34f76b6a1582b612a4ebf",
"b1d1d14f6aa94217b3d6dfd3daee1da8",
"99867e32bd674d709aaf1b0f7102ee81",
"03c99e67ea69449f9a7bcda1419d0ec4",
"380cbd29f1b84541ae20ef1f683ce69f",
"b5624e5cd9a54018ac0a1db436ea7202",
"ed549a2ef8ef4b28b73d8b56cbd3519d",
"a72129f5b6774545a9e0a30f7cb1e619",
"6166f05435dc4b59b7a36229183a4828",
"dbf4ddee3a42433c98ccce6883fab83d",
"b9a2279178094b598fcd978e8edc75fa",
"62fdbd59289c45e6a56154f9c7740c87",
"198ea7e06ab94c34b3c35b978ce754ff",
"0c9f4972097d49259122001e726d67bd",
"3e52c628e5904044a77a2cc9efb942f0",
"4d20115a059542f9b718f4f69fb16a87"
]
},
"id": "cIGDrvOHDfvS",
"outputId": "50e7c89c-d2c2-4655-c34b-0cd7d556b791"
},
"execution_count": null,
"outputs": [
{
"output_type": "stream",
"name": "stderr",
"text": [
"/usr/local/lib/python3.10/dist-packages/huggingface_hub/utils/_token.py:88: UserWarning: \n",
"The secret `HF_TOKEN` does not exist in your Colab secrets.\n",
"To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.\n",
"You will be able to reuse this secret in all of your notebooks.\n",
"Please note that authentication is recommended but still optional to access public models or datasets.\n",
" warnings.warn(\n"
]
},
{
"output_type": "display_data",
"data": {
"text/plain": [
"config.yaml: 0%| | 0.00/461 [00:00, ?B/s]"
],
"application/vnd.jupyter.widget-view+json": {
"version_major": 2,
"version_minor": 0,
"model_id": "474d9a121acd4a4ab0d9f946db1568bc"
}
},
"metadata": {}
},
{
"output_type": "display_data",
"data": {
"text/plain": [
"pytorch_model.bin: 0%| | 0.00/54.4M [00:00, ?B/s]"
],
"application/vnd.jupyter.widget-view+json": {
"version_major": 2,
"version_minor": 0,
"model_id": "b5624e5cd9a54018ac0a1db436ea7202"
}
},
"metadata": {}
}
]
},
{
"cell_type": "code",
"source": [
"# Run the GPT over the text tokens and generated codes with return_latent=True to obtain latents.\n",
"text_len = torch.tensor([text_tokens.shape[-1]], device=text_tokens.device)\n",
"scaled_code_len = torch.tensor([codes.shape[-1]*gpt.mel_length_compression], device=text_tokens.device)\n",
"with torch.no_grad():  # inference only: do not build an autograd graph\n",
"    latent = gpt(auto_conditioning, text_tokens,\n",
"                 text_len, codes,\n",
"                 scaled_code_len,\n",
"                 return_latent=True, clip_inputs=False).transpose(1,2)\n",
"latent.shape"
],
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/"
},
"id": "cR1pO-tTDjng",
"outputId": "d0e2bdfd-f0f9-4c0a-a742-a9108122b9cd"
},
"execution_count": null,
"outputs": [
{
"output_type": "execute_result",
"data": {
"text/plain": [
"torch.Size([1, 1024, 70])"
]
},
"metadata": {},
"execution_count": 10
}
]
},
{
"cell_type": "code",
"source": [
"from ttts.diffusion.train import do_spectrogram_diffusion\n",
"from ttts.utils.diffusion import SpacedDiffusion, space_timesteps, get_named_beta_schedule\n",
"from ttts.diffusion.aa_model import denormalize_tacotron_mel, normalize_tacotron_mel\n",
"\n",
"diffusion = load_model('diffusion', MODELS['diffusion.pth'], 'ttts/diffusion/config.yaml', device)\n",
"\n",
"# Sampler settings: linear beta schedule over 1000 steps, respaced via space_timesteps(1000, [50]).\n",
"diffuser_kwargs = dict(\n",
"    use_timesteps=space_timesteps(1000, [50]),\n",
"    model_mean_type='epsilon',\n",
"    model_var_type='learned_range',\n",
"    loss_type='mse',\n",
"    betas=get_named_beta_schedule('linear', 1000),\n",
"    conditioning_free=True,\n",
"    conditioning_free_k=2.,\n",
"    sampler='dpm++2m',\n",
")\n",
"diffuser = SpacedDiffusion(**diffuser_kwargs)\n",
"\n",
"# Diffuse the GPT latents into a mel spectrogram, conditioned on the normalized reference mel.\n",
"diffusion_conditioning = normalize_tacotron_mel(cond_mel)\n",
"mel = do_spectrogram_diffusion(diffusion, diffuser, latent, diffusion_conditioning, temperature=1.0)\n",
"mel = mel.detach().cpu()\n",
"\n",
"# Decode the CPU mel with the vocoder.\n",
"wav = vocos.decode(mel)"
],
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/"
},
"id": "G-bgy-MLDl3E",
"outputId": "1161bb24-49d1-495f-e80d-49234e83160e"
},
"execution_count": null,
"outputs": [
{
"output_type": "stream",
"name": "stdout",
"text": [
"base model params: 46144712\n"
]
},
{
"output_type": "stream",
"name": "stderr",
"text": [
"100%|██████████| 50/50 [00:05<00:00, 8.65it/s]\n"
]
}
]
},
{
"cell_type": "code",
"source": [
"from IPython.display import Audio\n",
"\n",
"# Write the generated waveform to disk and embed a player for it (24 kHz).\n",
"wav = wav.detach().cpu()\n",
"torchaudio.save('gen.wav', wav, 24000)\n",
"Audio(wav, rate=24000)"
],
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/",
"height": 74
},
"id": "Trjm4pQ7DoNJ",
"outputId": "bf54e60f-f588-47b4-ceb3-7abac53fa2c2"
},
"execution_count": null,
"outputs": [
{
"output_type": "execute_result",
"data": {
"text/plain": [
""
],
"text/html": [
"\n",
" \n",
" "
]
},
"metadata": {},
"execution_count": 13
}
]
}
]
}