diff --git "a/demo.ipynb" "b/demo.ipynb"
new file mode 100644--- /dev/null
+++ "b/demo.ipynb"
@@ -0,0 +1,1290 @@
+{
+ "nbformat": 4,
+ "nbformat_minor": 0,
+ "metadata": {
+ "colab": {
+ "provenance": [],
+ "gpuType": "T4",
+ "authorship_tag": "ABX9TyOcNNO6b/4X+3VXZr1bdSH6",
+ "include_colab_link": true
+ },
+ "kernelspec": {
+ "name": "python3",
+ "display_name": "Python 3"
+ },
+ "language_info": {
+ "name": "python"
+ },
+ "accelerator": "GPU",
+ "widgets": {
+ "application/vnd.jupyter.widget-state+json": {
+ "474d9a121acd4a4ab0d9f946db1568bc": {
+ "model_module": "@jupyter-widgets/controls",
+ "model_name": "HBoxModel",
+ "model_module_version": "1.5.0",
+ "state": {
+ "_dom_classes": [],
+ "_model_module": "@jupyter-widgets/controls",
+ "_model_module_version": "1.5.0",
+ "_model_name": "HBoxModel",
+ "_view_count": null,
+ "_view_module": "@jupyter-widgets/controls",
+ "_view_module_version": "1.5.0",
+ "_view_name": "HBoxView",
+ "box_style": "",
+ "children": [
+ "IPY_MODEL_576d89979ab246ed9889ca55ebef3d92",
+ "IPY_MODEL_700fe25008db413db8fe97821a164ab4",
+ "IPY_MODEL_46d2cadc00a348b69721f0a451b6ea8c"
+ ],
+ "layout": "IPY_MODEL_4340f0135b5d4a2f90a507ab8230e6e7"
+ }
+ },
+ "576d89979ab246ed9889ca55ebef3d92": {
+ "model_module": "@jupyter-widgets/controls",
+ "model_name": "HTMLModel",
+ "model_module_version": "1.5.0",
+ "state": {
+ "_dom_classes": [],
+ "_model_module": "@jupyter-widgets/controls",
+ "_model_module_version": "1.5.0",
+ "_model_name": "HTMLModel",
+ "_view_count": null,
+ "_view_module": "@jupyter-widgets/controls",
+ "_view_module_version": "1.5.0",
+ "_view_name": "HTMLView",
+ "description": "",
+ "description_tooltip": null,
+ "layout": "IPY_MODEL_cead9aa9d7b34351b660f330c27ac310",
+ "placeholder": "",
+ "style": "IPY_MODEL_9552a1bfd4d34f76b6a1582b612a4ebf",
+ "value": "config.yaml: 100%"
+ }
+ },
+ "700fe25008db413db8fe97821a164ab4": {
+ "model_module": "@jupyter-widgets/controls",
+ "model_name": "FloatProgressModel",
+ "model_module_version": "1.5.0",
+ "state": {
+ "_dom_classes": [],
+ "_model_module": "@jupyter-widgets/controls",
+ "_model_module_version": "1.5.0",
+ "_model_name": "FloatProgressModel",
+ "_view_count": null,
+ "_view_module": "@jupyter-widgets/controls",
+ "_view_module_version": "1.5.0",
+ "_view_name": "ProgressView",
+ "bar_style": "success",
+ "description": "",
+ "description_tooltip": null,
+ "layout": "IPY_MODEL_b1d1d14f6aa94217b3d6dfd3daee1da8",
+ "max": 461,
+ "min": 0,
+ "orientation": "horizontal",
+ "style": "IPY_MODEL_99867e32bd674d709aaf1b0f7102ee81",
+ "value": 461
+ }
+ },
+ "46d2cadc00a348b69721f0a451b6ea8c": {
+ "model_module": "@jupyter-widgets/controls",
+ "model_name": "HTMLModel",
+ "model_module_version": "1.5.0",
+ "state": {
+ "_dom_classes": [],
+ "_model_module": "@jupyter-widgets/controls",
+ "_model_module_version": "1.5.0",
+ "_model_name": "HTMLModel",
+ "_view_count": null,
+ "_view_module": "@jupyter-widgets/controls",
+ "_view_module_version": "1.5.0",
+ "_view_name": "HTMLView",
+ "description": "",
+ "description_tooltip": null,
+ "layout": "IPY_MODEL_03c99e67ea69449f9a7bcda1419d0ec4",
+ "placeholder": "",
+ "style": "IPY_MODEL_380cbd29f1b84541ae20ef1f683ce69f",
+ "value": " 461/461 [00:00<00:00, 33.7kB/s]"
+ }
+ },
+ "4340f0135b5d4a2f90a507ab8230e6e7": {
+ "model_module": "@jupyter-widgets/base",
+ "model_name": "LayoutModel",
+ "model_module_version": "1.2.0",
+ "state": {
+ "_model_module": "@jupyter-widgets/base",
+ "_model_module_version": "1.2.0",
+ "_model_name": "LayoutModel",
+ "_view_count": null,
+ "_view_module": "@jupyter-widgets/base",
+ "_view_module_version": "1.2.0",
+ "_view_name": "LayoutView",
+ "align_content": null,
+ "align_items": null,
+ "align_self": null,
+ "border": null,
+ "bottom": null,
+ "display": null,
+ "flex": null,
+ "flex_flow": null,
+ "grid_area": null,
+ "grid_auto_columns": null,
+ "grid_auto_flow": null,
+ "grid_auto_rows": null,
+ "grid_column": null,
+ "grid_gap": null,
+ "grid_row": null,
+ "grid_template_areas": null,
+ "grid_template_columns": null,
+ "grid_template_rows": null,
+ "height": null,
+ "justify_content": null,
+ "justify_items": null,
+ "left": null,
+ "margin": null,
+ "max_height": null,
+ "max_width": null,
+ "min_height": null,
+ "min_width": null,
+ "object_fit": null,
+ "object_position": null,
+ "order": null,
+ "overflow": null,
+ "overflow_x": null,
+ "overflow_y": null,
+ "padding": null,
+ "right": null,
+ "top": null,
+ "visibility": null,
+ "width": null
+ }
+ },
+ "cead9aa9d7b34351b660f330c27ac310": {
+ "model_module": "@jupyter-widgets/base",
+ "model_name": "LayoutModel",
+ "model_module_version": "1.2.0",
+ "state": {
+ "_model_module": "@jupyter-widgets/base",
+ "_model_module_version": "1.2.0",
+ "_model_name": "LayoutModel",
+ "_view_count": null,
+ "_view_module": "@jupyter-widgets/base",
+ "_view_module_version": "1.2.0",
+ "_view_name": "LayoutView",
+ "align_content": null,
+ "align_items": null,
+ "align_self": null,
+ "border": null,
+ "bottom": null,
+ "display": null,
+ "flex": null,
+ "flex_flow": null,
+ "grid_area": null,
+ "grid_auto_columns": null,
+ "grid_auto_flow": null,
+ "grid_auto_rows": null,
+ "grid_column": null,
+ "grid_gap": null,
+ "grid_row": null,
+ "grid_template_areas": null,
+ "grid_template_columns": null,
+ "grid_template_rows": null,
+ "height": null,
+ "justify_content": null,
+ "justify_items": null,
+ "left": null,
+ "margin": null,
+ "max_height": null,
+ "max_width": null,
+ "min_height": null,
+ "min_width": null,
+ "object_fit": null,
+ "object_position": null,
+ "order": null,
+ "overflow": null,
+ "overflow_x": null,
+ "overflow_y": null,
+ "padding": null,
+ "right": null,
+ "top": null,
+ "visibility": null,
+ "width": null
+ }
+ },
+ "9552a1bfd4d34f76b6a1582b612a4ebf": {
+ "model_module": "@jupyter-widgets/controls",
+ "model_name": "DescriptionStyleModel",
+ "model_module_version": "1.5.0",
+ "state": {
+ "_model_module": "@jupyter-widgets/controls",
+ "_model_module_version": "1.5.0",
+ "_model_name": "DescriptionStyleModel",
+ "_view_count": null,
+ "_view_module": "@jupyter-widgets/base",
+ "_view_module_version": "1.2.0",
+ "_view_name": "StyleView",
+ "description_width": ""
+ }
+ },
+ "b1d1d14f6aa94217b3d6dfd3daee1da8": {
+ "model_module": "@jupyter-widgets/base",
+ "model_name": "LayoutModel",
+ "model_module_version": "1.2.0",
+ "state": {
+ "_model_module": "@jupyter-widgets/base",
+ "_model_module_version": "1.2.0",
+ "_model_name": "LayoutModel",
+ "_view_count": null,
+ "_view_module": "@jupyter-widgets/base",
+ "_view_module_version": "1.2.0",
+ "_view_name": "LayoutView",
+ "align_content": null,
+ "align_items": null,
+ "align_self": null,
+ "border": null,
+ "bottom": null,
+ "display": null,
+ "flex": null,
+ "flex_flow": null,
+ "grid_area": null,
+ "grid_auto_columns": null,
+ "grid_auto_flow": null,
+ "grid_auto_rows": null,
+ "grid_column": null,
+ "grid_gap": null,
+ "grid_row": null,
+ "grid_template_areas": null,
+ "grid_template_columns": null,
+ "grid_template_rows": null,
+ "height": null,
+ "justify_content": null,
+ "justify_items": null,
+ "left": null,
+ "margin": null,
+ "max_height": null,
+ "max_width": null,
+ "min_height": null,
+ "min_width": null,
+ "object_fit": null,
+ "object_position": null,
+ "order": null,
+ "overflow": null,
+ "overflow_x": null,
+ "overflow_y": null,
+ "padding": null,
+ "right": null,
+ "top": null,
+ "visibility": null,
+ "width": null
+ }
+ },
+ "99867e32bd674d709aaf1b0f7102ee81": {
+ "model_module": "@jupyter-widgets/controls",
+ "model_name": "ProgressStyleModel",
+ "model_module_version": "1.5.0",
+ "state": {
+ "_model_module": "@jupyter-widgets/controls",
+ "_model_module_version": "1.5.0",
+ "_model_name": "ProgressStyleModel",
+ "_view_count": null,
+ "_view_module": "@jupyter-widgets/base",
+ "_view_module_version": "1.2.0",
+ "_view_name": "StyleView",
+ "bar_color": null,
+ "description_width": ""
+ }
+ },
+ "03c99e67ea69449f9a7bcda1419d0ec4": {
+ "model_module": "@jupyter-widgets/base",
+ "model_name": "LayoutModel",
+ "model_module_version": "1.2.0",
+ "state": {
+ "_model_module": "@jupyter-widgets/base",
+ "_model_module_version": "1.2.0",
+ "_model_name": "LayoutModel",
+ "_view_count": null,
+ "_view_module": "@jupyter-widgets/base",
+ "_view_module_version": "1.2.0",
+ "_view_name": "LayoutView",
+ "align_content": null,
+ "align_items": null,
+ "align_self": null,
+ "border": null,
+ "bottom": null,
+ "display": null,
+ "flex": null,
+ "flex_flow": null,
+ "grid_area": null,
+ "grid_auto_columns": null,
+ "grid_auto_flow": null,
+ "grid_auto_rows": null,
+ "grid_column": null,
+ "grid_gap": null,
+ "grid_row": null,
+ "grid_template_areas": null,
+ "grid_template_columns": null,
+ "grid_template_rows": null,
+ "height": null,
+ "justify_content": null,
+ "justify_items": null,
+ "left": null,
+ "margin": null,
+ "max_height": null,
+ "max_width": null,
+ "min_height": null,
+ "min_width": null,
+ "object_fit": null,
+ "object_position": null,
+ "order": null,
+ "overflow": null,
+ "overflow_x": null,
+ "overflow_y": null,
+ "padding": null,
+ "right": null,
+ "top": null,
+ "visibility": null,
+ "width": null
+ }
+ },
+ "380cbd29f1b84541ae20ef1f683ce69f": {
+ "model_module": "@jupyter-widgets/controls",
+ "model_name": "DescriptionStyleModel",
+ "model_module_version": "1.5.0",
+ "state": {
+ "_model_module": "@jupyter-widgets/controls",
+ "_model_module_version": "1.5.0",
+ "_model_name": "DescriptionStyleModel",
+ "_view_count": null,
+ "_view_module": "@jupyter-widgets/base",
+ "_view_module_version": "1.2.0",
+ "_view_name": "StyleView",
+ "description_width": ""
+ }
+ },
+ "b5624e5cd9a54018ac0a1db436ea7202": {
+ "model_module": "@jupyter-widgets/controls",
+ "model_name": "HBoxModel",
+ "model_module_version": "1.5.0",
+ "state": {
+ "_dom_classes": [],
+ "_model_module": "@jupyter-widgets/controls",
+ "_model_module_version": "1.5.0",
+ "_model_name": "HBoxModel",
+ "_view_count": null,
+ "_view_module": "@jupyter-widgets/controls",
+ "_view_module_version": "1.5.0",
+ "_view_name": "HBoxView",
+ "box_style": "",
+ "children": [
+ "IPY_MODEL_ed549a2ef8ef4b28b73d8b56cbd3519d",
+ "IPY_MODEL_a72129f5b6774545a9e0a30f7cb1e619",
+ "IPY_MODEL_6166f05435dc4b59b7a36229183a4828"
+ ],
+ "layout": "IPY_MODEL_dbf4ddee3a42433c98ccce6883fab83d"
+ }
+ },
+ "ed549a2ef8ef4b28b73d8b56cbd3519d": {
+ "model_module": "@jupyter-widgets/controls",
+ "model_name": "HTMLModel",
+ "model_module_version": "1.5.0",
+ "state": {
+ "_dom_classes": [],
+ "_model_module": "@jupyter-widgets/controls",
+ "_model_module_version": "1.5.0",
+ "_model_name": "HTMLModel",
+ "_view_count": null,
+ "_view_module": "@jupyter-widgets/controls",
+ "_view_module_version": "1.5.0",
+ "_view_name": "HTMLView",
+ "description": "",
+ "description_tooltip": null,
+ "layout": "IPY_MODEL_b9a2279178094b598fcd978e8edc75fa",
+ "placeholder": "",
+ "style": "IPY_MODEL_62fdbd59289c45e6a56154f9c7740c87",
+ "value": "pytorch_model.bin: 100%"
+ }
+ },
+ "a72129f5b6774545a9e0a30f7cb1e619": {
+ "model_module": "@jupyter-widgets/controls",
+ "model_name": "FloatProgressModel",
+ "model_module_version": "1.5.0",
+ "state": {
+ "_dom_classes": [],
+ "_model_module": "@jupyter-widgets/controls",
+ "_model_module_version": "1.5.0",
+ "_model_name": "FloatProgressModel",
+ "_view_count": null,
+ "_view_module": "@jupyter-widgets/controls",
+ "_view_module_version": "1.5.0",
+ "_view_name": "ProgressView",
+ "bar_style": "success",
+ "description": "",
+ "description_tooltip": null,
+ "layout": "IPY_MODEL_198ea7e06ab94c34b3c35b978ce754ff",
+ "max": 54365991,
+ "min": 0,
+ "orientation": "horizontal",
+ "style": "IPY_MODEL_0c9f4972097d49259122001e726d67bd",
+ "value": 54365991
+ }
+ },
+ "6166f05435dc4b59b7a36229183a4828": {
+ "model_module": "@jupyter-widgets/controls",
+ "model_name": "HTMLModel",
+ "model_module_version": "1.5.0",
+ "state": {
+ "_dom_classes": [],
+ "_model_module": "@jupyter-widgets/controls",
+ "_model_module_version": "1.5.0",
+ "_model_name": "HTMLModel",
+ "_view_count": null,
+ "_view_module": "@jupyter-widgets/controls",
+ "_view_module_version": "1.5.0",
+ "_view_name": "HTMLView",
+ "description": "",
+ "description_tooltip": null,
+ "layout": "IPY_MODEL_3e52c628e5904044a77a2cc9efb942f0",
+ "placeholder": "",
+ "style": "IPY_MODEL_4d20115a059542f9b718f4f69fb16a87",
+ "value": " 54.4M/54.4M [00:00<00:00, 123MB/s]"
+ }
+ },
+ "dbf4ddee3a42433c98ccce6883fab83d": {
+ "model_module": "@jupyter-widgets/base",
+ "model_name": "LayoutModel",
+ "model_module_version": "1.2.0",
+ "state": {
+ "_model_module": "@jupyter-widgets/base",
+ "_model_module_version": "1.2.0",
+ "_model_name": "LayoutModel",
+ "_view_count": null,
+ "_view_module": "@jupyter-widgets/base",
+ "_view_module_version": "1.2.0",
+ "_view_name": "LayoutView",
+ "align_content": null,
+ "align_items": null,
+ "align_self": null,
+ "border": null,
+ "bottom": null,
+ "display": null,
+ "flex": null,
+ "flex_flow": null,
+ "grid_area": null,
+ "grid_auto_columns": null,
+ "grid_auto_flow": null,
+ "grid_auto_rows": null,
+ "grid_column": null,
+ "grid_gap": null,
+ "grid_row": null,
+ "grid_template_areas": null,
+ "grid_template_columns": null,
+ "grid_template_rows": null,
+ "height": null,
+ "justify_content": null,
+ "justify_items": null,
+ "left": null,
+ "margin": null,
+ "max_height": null,
+ "max_width": null,
+ "min_height": null,
+ "min_width": null,
+ "object_fit": null,
+ "object_position": null,
+ "order": null,
+ "overflow": null,
+ "overflow_x": null,
+ "overflow_y": null,
+ "padding": null,
+ "right": null,
+ "top": null,
+ "visibility": null,
+ "width": null
+ }
+ },
+ "b9a2279178094b598fcd978e8edc75fa": {
+ "model_module": "@jupyter-widgets/base",
+ "model_name": "LayoutModel",
+ "model_module_version": "1.2.0",
+ "state": {
+ "_model_module": "@jupyter-widgets/base",
+ "_model_module_version": "1.2.0",
+ "_model_name": "LayoutModel",
+ "_view_count": null,
+ "_view_module": "@jupyter-widgets/base",
+ "_view_module_version": "1.2.0",
+ "_view_name": "LayoutView",
+ "align_content": null,
+ "align_items": null,
+ "align_self": null,
+ "border": null,
+ "bottom": null,
+ "display": null,
+ "flex": null,
+ "flex_flow": null,
+ "grid_area": null,
+ "grid_auto_columns": null,
+ "grid_auto_flow": null,
+ "grid_auto_rows": null,
+ "grid_column": null,
+ "grid_gap": null,
+ "grid_row": null,
+ "grid_template_areas": null,
+ "grid_template_columns": null,
+ "grid_template_rows": null,
+ "height": null,
+ "justify_content": null,
+ "justify_items": null,
+ "left": null,
+ "margin": null,
+ "max_height": null,
+ "max_width": null,
+ "min_height": null,
+ "min_width": null,
+ "object_fit": null,
+ "object_position": null,
+ "order": null,
+ "overflow": null,
+ "overflow_x": null,
+ "overflow_y": null,
+ "padding": null,
+ "right": null,
+ "top": null,
+ "visibility": null,
+ "width": null
+ }
+ },
+ "62fdbd59289c45e6a56154f9c7740c87": {
+ "model_module": "@jupyter-widgets/controls",
+ "model_name": "DescriptionStyleModel",
+ "model_module_version": "1.5.0",
+ "state": {
+ "_model_module": "@jupyter-widgets/controls",
+ "_model_module_version": "1.5.0",
+ "_model_name": "DescriptionStyleModel",
+ "_view_count": null,
+ "_view_module": "@jupyter-widgets/base",
+ "_view_module_version": "1.2.0",
+ "_view_name": "StyleView",
+ "description_width": ""
+ }
+ },
+ "198ea7e06ab94c34b3c35b978ce754ff": {
+ "model_module": "@jupyter-widgets/base",
+ "model_name": "LayoutModel",
+ "model_module_version": "1.2.0",
+ "state": {
+ "_model_module": "@jupyter-widgets/base",
+ "_model_module_version": "1.2.0",
+ "_model_name": "LayoutModel",
+ "_view_count": null,
+ "_view_module": "@jupyter-widgets/base",
+ "_view_module_version": "1.2.0",
+ "_view_name": "LayoutView",
+ "align_content": null,
+ "align_items": null,
+ "align_self": null,
+ "border": null,
+ "bottom": null,
+ "display": null,
+ "flex": null,
+ "flex_flow": null,
+ "grid_area": null,
+ "grid_auto_columns": null,
+ "grid_auto_flow": null,
+ "grid_auto_rows": null,
+ "grid_column": null,
+ "grid_gap": null,
+ "grid_row": null,
+ "grid_template_areas": null,
+ "grid_template_columns": null,
+ "grid_template_rows": null,
+ "height": null,
+ "justify_content": null,
+ "justify_items": null,
+ "left": null,
+ "margin": null,
+ "max_height": null,
+ "max_width": null,
+ "min_height": null,
+ "min_width": null,
+ "object_fit": null,
+ "object_position": null,
+ "order": null,
+ "overflow": null,
+ "overflow_x": null,
+ "overflow_y": null,
+ "padding": null,
+ "right": null,
+ "top": null,
+ "visibility": null,
+ "width": null
+ }
+ },
+ "0c9f4972097d49259122001e726d67bd": {
+ "model_module": "@jupyter-widgets/controls",
+ "model_name": "ProgressStyleModel",
+ "model_module_version": "1.5.0",
+ "state": {
+ "_model_module": "@jupyter-widgets/controls",
+ "_model_module_version": "1.5.0",
+ "_model_name": "ProgressStyleModel",
+ "_view_count": null,
+ "_view_module": "@jupyter-widgets/base",
+ "_view_module_version": "1.2.0",
+ "_view_name": "StyleView",
+ "bar_color": null,
+ "description_width": ""
+ }
+ },
+ "3e52c628e5904044a77a2cc9efb942f0": {
+ "model_module": "@jupyter-widgets/base",
+ "model_name": "LayoutModel",
+ "model_module_version": "1.2.0",
+ "state": {
+ "_model_module": "@jupyter-widgets/base",
+ "_model_module_version": "1.2.0",
+ "_model_name": "LayoutModel",
+ "_view_count": null,
+ "_view_module": "@jupyter-widgets/base",
+ "_view_module_version": "1.2.0",
+ "_view_name": "LayoutView",
+ "align_content": null,
+ "align_items": null,
+ "align_self": null,
+ "border": null,
+ "bottom": null,
+ "display": null,
+ "flex": null,
+ "flex_flow": null,
+ "grid_area": null,
+ "grid_auto_columns": null,
+ "grid_auto_flow": null,
+ "grid_auto_rows": null,
+ "grid_column": null,
+ "grid_gap": null,
+ "grid_row": null,
+ "grid_template_areas": null,
+ "grid_template_columns": null,
+ "grid_template_rows": null,
+ "height": null,
+ "justify_content": null,
+ "justify_items": null,
+ "left": null,
+ "margin": null,
+ "max_height": null,
+ "max_width": null,
+ "min_height": null,
+ "min_width": null,
+ "object_fit": null,
+ "object_position": null,
+ "order": null,
+ "overflow": null,
+ "overflow_x": null,
+ "overflow_y": null,
+ "padding": null,
+ "right": null,
+ "top": null,
+ "visibility": null,
+ "width": null
+ }
+ },
+ "4d20115a059542f9b718f4f69fb16a87": {
+ "model_module": "@jupyter-widgets/controls",
+ "model_name": "DescriptionStyleModel",
+ "model_module_version": "1.5.0",
+ "state": {
+ "_model_module": "@jupyter-widgets/controls",
+ "_model_module_version": "1.5.0",
+ "_model_name": "DescriptionStyleModel",
+ "_view_count": null,
+ "_view_module": "@jupyter-widgets/base",
+ "_view_module_version": "1.2.0",
+ "_view_name": "StyleView",
+ "description_width": ""
+ }
+ }
+ }
+ }
+ },
+ "cells": [
+ {
+ "cell_type": "markdown",
+ "metadata": {
+ "id": "view-in-github",
+ "colab_type": "text"
+ },
+ "source": [
+ ""
+ ]
+ },
+ {
+ "cell_type": "code",
+ "source": [
+ "!git clone --branch master https://github.com/adelacvg/ttts.git"
+ ],
+ "metadata": {
+ "colab": {
+ "base_uri": "https://localhost:8080/"
+ },
+ "id": "WB8vmGyDAPrr",
+ "outputId": "cee62580-1134-435f-d6ce-e68402a48a31"
+ },
+ "execution_count": 2,
+ "outputs": [
+ {
+ "output_type": "stream",
+ "name": "stdout",
+ "text": [
+ "Cloning into 'ttts'...\n",
+ "remote: Enumerating objects: 538, done.\u001b[K\n",
+ "remote: Counting objects: 100% (444/444), done.\u001b[K\n",
+ "remote: Compressing objects: 100% (283/283), done.\u001b[K\n",
+ "remote: Total 538 (delta 241), reused 346 (delta 154), pack-reused 94\u001b[K\n",
+ "Receiving objects: 100% (538/538), 61.11 MiB | 13.93 MiB/s, done.\n",
+ "Resolving deltas: 100% (250/250), done.\n"
+ ]
+ }
+ ]
+ },
+ {
+ "cell_type": "code",
+ "source": [
+ "%cd ttts\n",
+ "!pip install --editable ."
+ ],
+ "metadata": {
+ "colab": {
+ "base_uri": "https://localhost:8080/"
+ },
+ "id": "HKikTDsVA1_g",
+ "outputId": "1906df87-d848-4099-acec-b18c40db200a"
+ },
+ "execution_count": null,
+ "outputs": [
+ {
+ "output_type": "stream",
+ "name": "stdout",
+ "text": [
+ "/content/ttts\n",
+ "Obtaining file:///content/ttts\n",
+ " Preparing metadata (setup.py) ... \u001b[?25l\u001b[?25hdone\n",
+ "Installing collected packages: ttts\n",
+ " Running setup.py develop for ttts\n",
+ "Successfully installed ttts-0.1\n"
+ ]
+ }
+ ]
+ },
+ {
+ "cell_type": "code",
+ "source": [
+ "!git lfs install\n",
+ "!git clone https://huggingface.co/adelacvg/TTTS TTTS"
+ ],
+ "metadata": {
+ "colab": {
+ "base_uri": "https://localhost:8080/"
+ },
+ "id": "_KYD7eZQB4x7",
+ "outputId": "dc2fc2b8-5883-4d96-96b6-5df190d356f7"
+ },
+ "execution_count": 1,
+ "outputs": [
+ {
+ "output_type": "stream",
+ "name": "stdout",
+ "text": [
+ "Git LFS initialized.\n",
+ "Cloning into 'TTTS'...\n",
+ "fatal: Remote branch master not found in upstream origin\n"
+ ]
+ }
+ ]
+ },
+ {
+ "cell_type": "code",
+ "source": [
+ "import locale\n",
+ "locale.getpreferredencoding = lambda: \"UTF-8\"\n",
+ "%pip install pypinyin==0.50.0 einops==0.7.0 omegaconf==2.0.6 encodec==0.1.1 vocos==0.1.0 k_diffusion==0.1.1.post1"
+ ],
+ "metadata": {
+ "colab": {
+ "base_uri": "https://localhost:8080/"
+ },
+ "id": "UcOqgsONDGZ3",
+ "outputId": "5d0ef477-0263-4cad-cd75-da93a1482a87"
+ },
+ "execution_count": null,
+ "outputs": [
+ {
+ "output_type": "stream",
+ "name": "stdout",
+ "text": [
+ "Collecting pypinyin\n",
+ " Downloading pypinyin-0.50.0-py2.py3-none-any.whl (1.4 MB)\n",
+ "\u001b[?25l \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m0.0/1.4 MB\u001b[0m \u001b[31m?\u001b[0m eta \u001b[36m-:--:--\u001b[0m\r\u001b[2K \u001b[91m━━━━━\u001b[0m\u001b[90m╺\u001b[0m\u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m0.2/1.4 MB\u001b[0m \u001b[31m5.0 MB/s\u001b[0m eta \u001b[36m0:00:01\u001b[0m\r\u001b[2K \u001b[91m━━━━━━━━━━━━━━━━\u001b[0m\u001b[91m╸\u001b[0m\u001b[90m━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m0.6/1.4 MB\u001b[0m \u001b[31m8.3 MB/s\u001b[0m eta \u001b[36m0:00:01\u001b[0m\r\u001b[2K \u001b[91m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m\u001b[91m╸\u001b[0m\u001b[90m━━━━━━━━━\u001b[0m \u001b[32m1.0/1.4 MB\u001b[0m \u001b[31m10.0 MB/s\u001b[0m eta \u001b[36m0:00:01\u001b[0m\r\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m1.4/1.4 MB\u001b[0m \u001b[31m10.0 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
+ "\u001b[?25hCollecting einops\n",
+ " Downloading einops-0.7.0-py3-none-any.whl (44 kB)\n",
+ "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m44.6/44.6 kB\u001b[0m \u001b[31m5.8 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
+ "\u001b[?25hCollecting omegaconf==2.0.6\n",
+ " Downloading omegaconf-2.0.6-py3-none-any.whl (36 kB)\n",
+ "Collecting encodec\n",
+ " Downloading encodec-0.1.1.tar.gz (3.7 MB)\n",
+ "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m3.7/3.7 MB\u001b[0m \u001b[31m23.2 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
+ "\u001b[?25h Preparing metadata (setup.py) ... \u001b[?25l\u001b[?25hdone\n",
+ "Collecting vocos\n",
+ " Downloading vocos-0.1.0-py3-none-any.whl (24 kB)\n",
+ "Collecting k_diffusion\n",
+ " Downloading k_diffusion-0.1.1.post1-py3-none-any.whl (33 kB)\n",
+ "Requirement already satisfied: PyYAML>=5.1.* in /usr/local/lib/python3.10/dist-packages (from omegaconf==2.0.6) (6.0.1)\n",
+ "Requirement already satisfied: typing-extensions in /usr/local/lib/python3.10/dist-packages (from omegaconf==2.0.6) (4.5.0)\n",
+ "Requirement already satisfied: numpy in /usr/local/lib/python3.10/dist-packages (from encodec) (1.23.5)\n",
+ "Requirement already satisfied: torch in /usr/local/lib/python3.10/dist-packages (from encodec) (2.1.0+cu121)\n",
+ "Requirement already satisfied: torchaudio in /usr/local/lib/python3.10/dist-packages (from encodec) (2.1.0+cu121)\n",
+ "Requirement already satisfied: scipy in /usr/local/lib/python3.10/dist-packages (from vocos) (1.11.4)\n",
+ "Requirement already satisfied: huggingface-hub in /usr/local/lib/python3.10/dist-packages (from vocos) (0.20.2)\n",
+ "Collecting accelerate (from k_diffusion)\n",
+ " Downloading accelerate-0.26.1-py3-none-any.whl (270 kB)\n",
+ "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m270.9/270.9 kB\u001b[0m \u001b[31m27.1 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
+ "\u001b[?25hCollecting clean-fid (from k_diffusion)\n",
+ " Downloading clean_fid-0.1.35-py3-none-any.whl (26 kB)\n",
+ "Collecting clip-anytorch (from k_diffusion)\n",
+ " Downloading clip_anytorch-2.6.0-py3-none-any.whl (1.4 MB)\n",
+ "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m1.4/1.4 MB\u001b[0m \u001b[31m31.3 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
+ "\u001b[?25hCollecting dctorch (from k_diffusion)\n",
+ " Downloading dctorch-0.1.2-py3-none-any.whl (2.3 kB)\n",
+ "Collecting jsonmerge (from k_diffusion)\n",
+ " Downloading jsonmerge-1.9.2-py3-none-any.whl (19 kB)\n",
+ "Collecting kornia (from k_diffusion)\n",
+ " Downloading kornia-0.7.1-py2.py3-none-any.whl (756 kB)\n",
+ "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m756.0/756.0 kB\u001b[0m \u001b[31m37.4 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
+ "\u001b[?25hRequirement already satisfied: Pillow in /usr/local/lib/python3.10/dist-packages (from k_diffusion) (9.4.0)\n",
+ "Requirement already satisfied: safetensors in /usr/local/lib/python3.10/dist-packages (from k_diffusion) (0.4.1)\n",
+ "Requirement already satisfied: scikit-image in /usr/local/lib/python3.10/dist-packages (from k_diffusion) (0.19.3)\n",
+ "Collecting torchdiffeq (from k_diffusion)\n",
+ " Downloading torchdiffeq-0.2.3-py3-none-any.whl (31 kB)\n",
+ "Collecting torchsde (from k_diffusion)\n",
+ " Downloading torchsde-0.2.6-py3-none-any.whl (61 kB)\n",
+ "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m61.2/61.2 kB\u001b[0m \u001b[31m9.1 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
+ "\u001b[?25hRequirement already satisfied: torchvision in /usr/local/lib/python3.10/dist-packages (from k_diffusion) (0.16.0+cu121)\n",
+ "Requirement already satisfied: tqdm in /usr/local/lib/python3.10/dist-packages (from k_diffusion) (4.66.1)\n",
+ "Collecting wandb (from k_diffusion)\n",
+ " Downloading wandb-0.16.2-py3-none-any.whl (2.2 MB)\n",
+ "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m2.2/2.2 MB\u001b[0m \u001b[31m38.3 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
+ "\u001b[?25hRequirement already satisfied: filelock in /usr/local/lib/python3.10/dist-packages (from torch->encodec) (3.13.1)\n",
+ "Requirement already satisfied: sympy in /usr/local/lib/python3.10/dist-packages (from torch->encodec) (1.12)\n",
+ "Requirement already satisfied: networkx in /usr/local/lib/python3.10/dist-packages (from torch->encodec) (3.2.1)\n",
+ "Requirement already satisfied: jinja2 in /usr/local/lib/python3.10/dist-packages (from torch->encodec) (3.1.2)\n",
+ "Requirement already satisfied: fsspec in /usr/local/lib/python3.10/dist-packages (from torch->encodec) (2023.6.0)\n",
+ "Requirement already satisfied: triton==2.1.0 in /usr/local/lib/python3.10/dist-packages (from torch->encodec) (2.1.0)\n",
+ "Requirement already satisfied: packaging>=20.0 in /usr/local/lib/python3.10/dist-packages (from accelerate->k_diffusion) (23.2)\n",
+ "Requirement already satisfied: psutil in /usr/local/lib/python3.10/dist-packages (from accelerate->k_diffusion) (5.9.5)\n",
+ "Requirement already satisfied: requests in /usr/local/lib/python3.10/dist-packages (from clean-fid->k_diffusion) (2.31.0)\n",
+ "Collecting ftfy (from clip-anytorch->k_diffusion)\n",
+ " Downloading ftfy-6.1.3-py3-none-any.whl (53 kB)\n",
+ "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m53.4/53.4 kB\u001b[0m \u001b[31m5.7 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
+ "\u001b[?25hRequirement already satisfied: regex in /usr/local/lib/python3.10/dist-packages (from clip-anytorch->k_diffusion) (2023.6.3)\n",
+ "Requirement already satisfied: jsonschema>2.4.0 in /usr/local/lib/python3.10/dist-packages (from jsonmerge->k_diffusion) (4.19.2)\n",
+ "Requirement already satisfied: imageio>=2.4.1 in /usr/local/lib/python3.10/dist-packages (from scikit-image->k_diffusion) (2.31.6)\n",
+ "Requirement already satisfied: tifffile>=2019.7.26 in /usr/local/lib/python3.10/dist-packages (from scikit-image->k_diffusion) (2023.12.9)\n",
+ "Requirement already satisfied: PyWavelets>=1.1.1 in /usr/local/lib/python3.10/dist-packages (from scikit-image->k_diffusion) (1.5.0)\n",
+ "Collecting trampoline>=0.1.2 (from torchsde->k_diffusion)\n",
+ " Downloading trampoline-0.1.2-py3-none-any.whl (5.2 kB)\n",
+ "Requirement already satisfied: Click!=8.0.0,>=7.1 in /usr/local/lib/python3.10/dist-packages (from wandb->k_diffusion) (8.1.7)\n",
+ "Collecting GitPython!=3.1.29,>=1.0.0 (from wandb->k_diffusion)\n",
+ " Downloading GitPython-3.1.41-py3-none-any.whl (196 kB)\n",
+ "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m196.4/196.4 kB\u001b[0m \u001b[31m21.8 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
+ "\u001b[?25hCollecting sentry-sdk>=1.0.0 (from wandb->k_diffusion)\n",
+ " Downloading sentry_sdk-1.39.2-py2.py3-none-any.whl (254 kB)\n",
+ "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m254.1/254.1 kB\u001b[0m \u001b[31m23.0 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
+ "\u001b[?25hCollecting docker-pycreds>=0.4.0 (from wandb->k_diffusion)\n",
+ " Downloading docker_pycreds-0.4.0-py2.py3-none-any.whl (9.0 kB)\n",
+ "Collecting setproctitle (from wandb->k_diffusion)\n",
+ " Downloading setproctitle-1.3.3-cp310-cp310-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl (30 kB)\n",
+ "Requirement already satisfied: setuptools in /usr/local/lib/python3.10/dist-packages (from wandb->k_diffusion) (67.7.2)\n",
+ "Requirement already satisfied: appdirs>=1.4.3 in /usr/local/lib/python3.10/dist-packages (from wandb->k_diffusion) (1.4.4)\n",
+ "Requirement already satisfied: protobuf!=4.21.0,<5,>=3.19.0 in /usr/local/lib/python3.10/dist-packages (from wandb->k_diffusion) (3.20.3)\n",
+ "Requirement already satisfied: six>=1.4.0 in /usr/local/lib/python3.10/dist-packages (from docker-pycreds>=0.4.0->wandb->k_diffusion) (1.16.0)\n",
+ "Collecting gitdb<5,>=4.0.1 (from GitPython!=3.1.29,>=1.0.0->wandb->k_diffusion)\n",
+ " Downloading gitdb-4.0.11-py3-none-any.whl (62 kB)\n",
+ "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m62.7/62.7 kB\u001b[0m \u001b[31m7.9 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
+ "\u001b[?25hRequirement already satisfied: attrs>=22.2.0 in /usr/local/lib/python3.10/dist-packages (from jsonschema>2.4.0->jsonmerge->k_diffusion) (23.2.0)\n",
+ "Requirement already satisfied: jsonschema-specifications>=2023.03.6 in /usr/local/lib/python3.10/dist-packages (from jsonschema>2.4.0->jsonmerge->k_diffusion) (2023.12.1)\n",
+ "Requirement already satisfied: referencing>=0.28.4 in /usr/local/lib/python3.10/dist-packages (from jsonschema>2.4.0->jsonmerge->k_diffusion) (0.32.1)\n",
+ "Requirement already satisfied: rpds-py>=0.7.1 in /usr/local/lib/python3.10/dist-packages (from jsonschema>2.4.0->jsonmerge->k_diffusion) (0.16.2)\n",
+ "Requirement already satisfied: charset-normalizer<4,>=2 in /usr/local/lib/python3.10/dist-packages (from requests->clean-fid->k_diffusion) (3.3.2)\n",
+ "Requirement already satisfied: idna<4,>=2.5 in /usr/local/lib/python3.10/dist-packages (from requests->clean-fid->k_diffusion) (3.6)\n",
+ "Requirement already satisfied: urllib3<3,>=1.21.1 in /usr/local/lib/python3.10/dist-packages (from requests->clean-fid->k_diffusion) (2.0.7)\n",
+ "Requirement already satisfied: certifi>=2017.4.17 in /usr/local/lib/python3.10/dist-packages (from requests->clean-fid->k_diffusion) (2023.11.17)\n",
+ "Requirement already satisfied: wcwidth<0.3.0,>=0.2.12 in /usr/local/lib/python3.10/dist-packages (from ftfy->clip-anytorch->k_diffusion) (0.2.12)\n",
+ "Requirement already satisfied: MarkupSafe>=2.0 in /usr/local/lib/python3.10/dist-packages (from jinja2->torch->encodec) (2.1.3)\n",
+ "Requirement already satisfied: mpmath>=0.19 in /usr/local/lib/python3.10/dist-packages (from sympy->torch->encodec) (1.3.0)\n",
+ "Collecting smmap<6,>=3.0.1 (from gitdb<5,>=4.0.1->GitPython!=3.1.29,>=1.0.0->wandb->k_diffusion)\n",
+ " Downloading smmap-5.0.1-py3-none-any.whl (24 kB)\n",
+ "Building wheels for collected packages: encodec\n",
+ " Building wheel for encodec (setup.py) ... \u001b[?25l\u001b[?25hdone\n",
+ " Created wheel for encodec: filename=encodec-0.1.1-py3-none-any.whl size=45759 sha256=a6f25c3141ae73377bf58efa53716b479f3299aaebcbaf213514f7955a86427f\n",
+ " Stored in directory: /root/.cache/pip/wheels/fc/36/cb/81af8b985a5f5e0815312d5e52b41263237af07b977e6bcbf3\n",
+ "Successfully built encodec\n",
+ "Installing collected packages: trampoline, smmap, setproctitle, sentry-sdk, pypinyin, omegaconf, ftfy, einops, docker-pycreds, gitdb, torchsde, torchdiffeq, kornia, GitPython, dctorch, accelerate, wandb, jsonmerge, encodec, clip-anytorch, clean-fid, vocos, k_diffusion\n",
+ "Successfully installed GitPython-3.1.41 accelerate-0.26.1 clean-fid-0.1.35 clip-anytorch-2.6.0 dctorch-0.1.2 docker-pycreds-0.4.0 einops-0.7.0 encodec-0.1.1 ftfy-6.1.3 gitdb-4.0.11 jsonmerge-1.9.2 k_diffusion-0.1.1.post1 kornia-0.7.1 omegaconf-2.0.6 pypinyin-0.50.0 sentry-sdk-1.39.2 setproctitle-1.3.3 smmap-5.0.1 torchdiffeq-0.2.3 torchsde-0.2.6 trampoline-0.1.2 vocos-0.1.0 wandb-0.16.2\n"
+ ]
+ }
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {
+ "id": "9Iz6jokZ_-AB"
+ },
+ "outputs": [],
+ "source": [
+ "import torch\n",
+ "from pypinyin import lazy_pinyin, Style\n",
+ "\n",
+ "# Checkpoint path for each model component, keyed by a short file-style name.\n",
+ "# An empty string means no path has been filled in for that component.\n",
+ "MODELS = {\n",
+ "    'vqvae.pth': 'TTTS/vae-30.pt',\n",
+ "    'gpt.pth': 'TTTS/gpt-70.pt',\n",
+ "    'clvp2.pth': '',\n",
+ "    'diffusion.pth': 'TTTS/diffusion-855.pt',\n",
+ "    'vocoder.pth': '~/tortoise_plus_zh/ttts/pretrained_models/pytorch_model.bin',\n",
+ "    'rlg_auto.pth': '',\n",
+ "    'rlg_diffuser.pth': '',\n",
+ "}"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "source": [
+ "from ttts.gpt.voice_tokenizer import VoiceBpeTokenizer\n",
+ "import torch.nn.functional as F\n",
+ "\n",
+ "device = 'cuda:0'\n",
+ "\n",
+ "# Sentence to synthesize; the commented-out lines are alternative test inputs.\n",
+ "text = \"大家好,今天来点大家想看的东西。\"\n",
+ "# text = \"霞浦县衙城镇乌旗瓦窑村水位猛涨。\"\n",
+ "# text = '高德官方网站,拥有全面、精准的地点信息,公交驾车路线规划,特色语音导航,商家团购、优惠信息。'\n",
+ "# text = '四是四,十是十,十四是十四,四十是四十。'\n",
+ "# text = '八百标兵奔北坡,炮兵并排北边跑。炮兵怕把标兵碰,标兵怕碰炮兵炮。'\n",
+ "# text = '黑化肥发灰,灰化肥发黑。黑化肥挥发会发灰;灰化肥挥发会发黑。'\n",
+ "# text = '先帝创业未半而中道崩殂,今天下三分,益州疲弊,此诚危急存亡之秋也。然侍卫之臣不懈于内,忠志之士忘身于外者,盖追先帝之殊遇,欲报之于陛下也。诚宜开张圣听,以光先帝遗德,恢弘志士之气,不宜妄自菲薄,引喻失义,以塞忠谏之路也。'\n",
+ "\n",
+ "# Convert the text to space-separated pinyin syllables with numeric tone suffixes\n",
+ "# (neutral tone written as 5, e.g. 'de5').\n",
+ "syllables = lazy_pinyin(text, style=Style.TONE3, neutral_tone_with_five=True)\n",
+ "pinyin = ' '.join(syllables)\n",
+ "\n",
+ "# Encode the pinyin string into token ids and build a (1, seq_len) int32 tensor on `device`.\n",
+ "tokenizer = VoiceBpeTokenizer('ttts/gpt/gpt_tts_tokenizer.json')\n",
+ "token_ids = tokenizer.encode(pinyin)\n",
+ "text_tokens = torch.IntTensor(token_ids).unsqueeze(0).to(device)\n",
+ "# Append a single trailing 0 token; the padded tensor stays on `device`.\n",
+ "text_tokens = F.pad(text_tokens, (0, 1))  # This may not be necessary.\n",
+ "print(pinyin)\n",
+ "print(text_tokens)"
+ ],
+ "metadata": {
+ "colab": {
+ "base_uri": "https://localhost:8080/"
+ },
+ "id": "SpTKi32TDQFi",
+ "outputId": "84292cc0-d09e-420d-c449-a746e44dade5"
+ },
+ "execution_count": null,
+ "outputs": [
+ {
+ "output_type": "stream",
+ "name": "stdout",
+ "text": [
+ "da4 jia1 hao3 , jin1 tian1 lai2 dian3 da4 jia1 xiang3 kan4 de5 dong1 xi1 。\n",
+ "tensor([[161, 2, 155, 2, 16, 87, 2, 43, 2, 224, 2, 171, 71, 2,\n",
+ " 182, 2, 188, 2, 161, 2, 155, 2, 62, 92, 2, 19, 63, 2,\n",
+ " 65, 2, 12, 84, 2, 228, 2, 39, 0]], device='cuda:0',\n",
+ " dtype=torch.int32)\n"
+ ]
+ }
+ ]
+ },
+ {
+ "cell_type": "code",
+ "source": [
+ "from ttts.utils.infer_utils import load_model\n",
+ "from ttts.vocoder.feature_extractors import MelSpectrogramFeatures\n",
+ "import torchaudio\n",
+ "\n",
+ "# Load the GPT checkpoint listed in MODELS and configure it for inference\n",
+ "# without DeepSpeed, KV cache or half precision.\n",
+ "gpt = load_model('gpt', MODELS['gpt.pth'], 'ttts/gpt/config.json', device)\n",
+ "gpt.post_init_gpt2_config(use_deepspeed=False, kv_cache=False, half=False)\n",
+ "\n",
+ "# Reference (conditioning) clip: keep only the first channel and resample to 24 kHz.\n",
+ "cond_audio = 'ttts/3.wav'\n",
+ "audio, sr = torchaudio.load(cond_audio)\n",
+ "if audio.shape[0] > 1:\n",
+ "    audio = audio[:1]\n",
+ "resampler = torchaudio.transforms.Resample(sr, 24000)\n",
+ "audio = resampler(audio)\n",
+ "\n",
+ "# Mel spectrogram of the reference clip, moved to the inference device.\n",
+ "mel_extractor = MelSpectrogramFeatures()\n",
+ "cond_mel = mel_extractor(audio).to(device)\n",
+ "print(cond_mel.shape)"
+ ],
+ "metadata": {
+ "colab": {
+ "base_uri": "https://localhost:8080/"
+ },
+ "id": "0LBf8nz2DQ-_",
+ "outputId": "5da3c1c4-1988-4f78-d6c3-0bb936132ea0"
+ },
+ "execution_count": null,
+ "outputs": [
+ {
+ "output_type": "stream",
+ "name": "stderr",
+ "text": [
+ "/usr/local/lib/python3.10/dist-packages/transformers/configuration_utils.py:381: UserWarning: Passing `gradient_checkpointing` to a config initialization is deprecated and will be removed in v5 Transformers. Using `model.gradient_checkpointing_enable()` instead, or if you are using the `Trainer` API, pass `gradient_checkpointing=True` in your `TrainingArguments`.\n",
+ " warnings.warn(\n"
+ ]
+ },
+ {
+ "output_type": "stream",
+ "name": "stdout",
+ "text": [
+ "torch.Size([1, 100, 400])\n"
+ ]
+ }
+ ]
+ },
+ {
+ "cell_type": "code",
+ "source": [
+ "auto_conditioning = cond_mel\n",
+ "\n",
+ "# Sampling hyperparameters. `settings` is the one place the values are written\n",
+ "# down; the names below are read from it so the two cannot drift apart.\n",
+ "# 'cond_free_k' and 'diffusion_temperature' are not consumed in this cell.\n",
+ "settings = {'temperature': .8, 'length_penalty': 1.0, 'repetition_penalty': 2.0,\n",
+ "            'top_p': .8,\n",
+ "            'cond_free_k': 2.0, 'diffusion_temperature': 1.0}\n",
+ "top_p = settings['top_p']\n",
+ "temperature = settings['temperature']\n",
+ "length_penalty = settings['length_penalty']\n",
+ "repetition_penalty = settings['repetition_penalty']\n",
+ "autoregressive_batch_size = 1\n",
+ "max_mel_tokens = 600\n",
+ "\n",
+ "print(auto_conditioning.shape)\n",
+ "print(text_tokens.shape)\n",
+ "# Optional: pad the text tokens to a fixed length of 400 (currently disabled).\n",
+ "# text_tokens = F.pad(text_tokens,(0,400-text_tokens.shape[1]),value=0)\n",
+ "print(text_tokens.shape)\n",
+ "\n",
+ "# Sample the code sequence from the GPT model, conditioned on the reference mel\n",
+ "# and the text tokens.\n",
+ "codes = gpt.inference_speech(\n",
+ "    auto_conditioning, text_tokens,\n",
+ "    do_sample=True,\n",
+ "    top_p=top_p,\n",
+ "    temperature=temperature,\n",
+ "    num_return_sequences=autoregressive_batch_size,\n",
+ "    length_penalty=length_penalty,\n",
+ "    repetition_penalty=repetition_penalty,\n",
+ "    max_generate_length=max_mel_tokens,\n",
+ ")\n",
+ "print(codes)"
+ ],
+ "metadata": {
+ "colab": {
+ "base_uri": "https://localhost:8080/"
+ },
+ "id": "VFEHSQ42Dadt",
+ "outputId": "774d31e7-8619-40b7-b503-29b1db39b07a"
+ },
+ "execution_count": null,
+ "outputs": [
+ {
+ "output_type": "stream",
+ "name": "stdout",
+ "text": [
+ "torch.Size([1, 100, 400])\n",
+ "torch.Size([1, 37])\n",
+ "torch.Size([1, 37])\n",
+ "tensor([[2867, 40, 7537, 5986, 692, 8079, 3282, 2094, 7478, 3286, 6652, 6674,\n",
+ " 5798, 2868, 4153, 1419, 4593, 423, 4472, 1487, 1989, 1628, 2796, 7296,\n",
+ " 4683, 3228, 7038, 6446, 89, 650, 7796, 2746, 4241, 4120, 2312, 1319,\n",
+ " 920, 4114, 6384, 4140, 1420, 7758, 1772, 6313, 4813, 1588, 366, 7217,\n",
+ " 6078, 2773, 6962, 5245, 7034, 1663, 6909, 7176, 3340, 3308, 1078, 72,\n",
+ " 1060, 4546, 2860, 3679, 6956, 4215, 2774, 5394, 0, 8193]],\n",
+ " device='cuda:0')\n"
+ ]
+ }
+ ]
+ },
+ {
+ "cell_type": "code",
+ "source": [
+ "from vocos import Vocos\n",
+ "\n",
+ "# Pretrained vocoder; its `decode` method is used later to turn a mel into a waveform.\n",
+ "vocos_repo_id = \"charactr/vocos-mel-24khz\"\n",
+ "vocos = Vocos.from_pretrained(vocos_repo_id)"
+ ],
+ "metadata": {
+ "colab": {
+ "base_uri": "https://localhost:8080/",
+ "height": 209,
+ "referenced_widgets": [
+ "474d9a121acd4a4ab0d9f946db1568bc",
+ "576d89979ab246ed9889ca55ebef3d92",
+ "700fe25008db413db8fe97821a164ab4",
+ "46d2cadc00a348b69721f0a451b6ea8c",
+ "4340f0135b5d4a2f90a507ab8230e6e7",
+ "cead9aa9d7b34351b660f330c27ac310",
+ "9552a1bfd4d34f76b6a1582b612a4ebf",
+ "b1d1d14f6aa94217b3d6dfd3daee1da8",
+ "99867e32bd674d709aaf1b0f7102ee81",
+ "03c99e67ea69449f9a7bcda1419d0ec4",
+ "380cbd29f1b84541ae20ef1f683ce69f",
+ "b5624e5cd9a54018ac0a1db436ea7202",
+ "ed549a2ef8ef4b28b73d8b56cbd3519d",
+ "a72129f5b6774545a9e0a30f7cb1e619",
+ "6166f05435dc4b59b7a36229183a4828",
+ "dbf4ddee3a42433c98ccce6883fab83d",
+ "b9a2279178094b598fcd978e8edc75fa",
+ "62fdbd59289c45e6a56154f9c7740c87",
+ "198ea7e06ab94c34b3c35b978ce754ff",
+ "0c9f4972097d49259122001e726d67bd",
+ "3e52c628e5904044a77a2cc9efb942f0",
+ "4d20115a059542f9b718f4f69fb16a87"
+ ]
+ },
+ "id": "cIGDrvOHDfvS",
+ "outputId": "50e7c89c-d2c2-4655-c34b-0cd7d556b791"
+ },
+ "execution_count": null,
+ "outputs": [
+ {
+ "output_type": "stream",
+ "name": "stderr",
+ "text": [
+ "/usr/local/lib/python3.10/dist-packages/huggingface_hub/utils/_token.py:88: UserWarning: \n",
+ "The secret `HF_TOKEN` does not exist in your Colab secrets.\n",
+ "To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.\n",
+ "You will be able to reuse this secret in all of your notebooks.\n",
+ "Please note that authentication is recommended but still optional to access public models or datasets.\n",
+ " warnings.warn(\n"
+ ]
+ },
+ {
+ "output_type": "display_data",
+ "data": {
+ "text/plain": [
+ "config.yaml: 0%| | 0.00/461 [00:00, ?B/s]"
+ ],
+ "application/vnd.jupyter.widget-view+json": {
+ "version_major": 2,
+ "version_minor": 0,
+ "model_id": "474d9a121acd4a4ab0d9f946db1568bc"
+ }
+ },
+ "metadata": {}
+ },
+ {
+ "output_type": "display_data",
+ "data": {
+ "text/plain": [
+ "pytorch_model.bin: 0%| | 0.00/54.4M [00:00, ?B/s]"
+ ],
+ "application/vnd.jupyter.widget-view+json": {
+ "version_major": 2,
+ "version_minor": 0,
+ "model_id": "b5624e5cd9a54018ac0a1db436ea7202"
+ }
+ },
+ "metadata": {}
+ }
+ ]
+ },
+ {
+ "cell_type": "code",
+ "source": [
+ "# Length arguments for the GPT forward pass: number of text tokens, and the code\n",
+ "# count scaled by gpt.mel_length_compression (presumably a length in audio samples - TODO confirm).\n",
+ "text_lengths = torch.tensor([text_tokens.shape[-1]], device=text_tokens.device)\n",
+ "wav_lengths = torch.tensor([codes.shape[-1]*gpt.mel_length_compression], device=text_tokens.device)\n",
+ "\n",
+ "# Re-run the GPT on the sampled codes with return_latent=True, then swap dims 1 and 2.\n",
+ "gpt_latent = gpt(auto_conditioning, text_tokens, text_lengths, codes, wav_lengths,\n",
+ "                 return_latent=True, clip_inputs=False)\n",
+ "latent = gpt_latent.transpose(1,2)\n",
+ "latent.shape"
+ ],
+ "metadata": {
+ "colab": {
+ "base_uri": "https://localhost:8080/"
+ },
+ "id": "cR1pO-tTDjng",
+ "outputId": "d0e2bdfd-f0f9-4c0a-a742-a9108122b9cd"
+ },
+ "execution_count": null,
+ "outputs": [
+ {
+ "output_type": "execute_result",
+ "data": {
+ "text/plain": [
+ "torch.Size([1, 1024, 70])"
+ ]
+ },
+ "metadata": {},
+ "execution_count": 10
+ }
+ ]
+ },
+ {
+ "cell_type": "code",
+ "source": [
+ "from ttts.diffusion.train import do_spectrogram_diffusion\n",
+ "from ttts.utils.diffusion import SpacedDiffusion, space_timesteps, get_named_beta_schedule\n",
+ "from ttts.diffusion.aa_model import denormalize_tacotron_mel, normalize_tacotron_mel\n",
+ "\n",
+ "# Diffusion checkpoint listed in MODELS, loaded onto the same device as the GPT.\n",
+ "diffusion = load_model('diffusion', MODELS['diffusion.pth'], 'ttts/diffusion/config.yaml', device)\n",
+ "\n",
+ "# Sampler configuration: a linear beta schedule over 1000 steps, respaced via\n",
+ "# space_timesteps(1000, [50]) (presumably 50 sampling steps - the stored progress bar shows 50).\n",
+ "timestep_spacing = space_timesteps(1000, [50])\n",
+ "betas = get_named_beta_schedule('linear', 1000)\n",
+ "diffuser = SpacedDiffusion(\n",
+ "    use_timesteps=timestep_spacing,\n",
+ "    model_mean_type='epsilon',\n",
+ "    model_var_type='learned_range',\n",
+ "    loss_type='mse',\n",
+ "    betas=betas,\n",
+ "    conditioning_free=True,\n",
+ "    conditioning_free_k=2.,\n",
+ "    sampler='dpm++2m',\n",
+ ")\n",
+ "\n",
+ "# Generate the mel from the GPT latent, conditioned on the normalized reference mel,\n",
+ "# then move it to the CPU and decode it to a waveform with the vocoder.\n",
+ "diffusion_conditioning = normalize_tacotron_mel(cond_mel)\n",
+ "mel = do_spectrogram_diffusion(diffusion, diffuser, latent, diffusion_conditioning, temperature=1.0)\n",
+ "mel = mel.detach().cpu()\n",
+ "wav = vocos.decode(mel)"
+ ],
+ "metadata": {
+ "colab": {
+ "base_uri": "https://localhost:8080/"
+ },
+ "id": "G-bgy-MLDl3E",
+ "outputId": "1161bb24-49d1-495f-e80d-49234e83160e"
+ },
+ "execution_count": null,
+ "outputs": [
+ {
+ "output_type": "stream",
+ "name": "stdout",
+ "text": [
+ "base model params: 46144712\n"
+ ]
+ },
+ {
+ "output_type": "stream",
+ "name": "stderr",
+ "text": [
+ "100%|██████████| 50/50 [00:05<00:00, 8.65it/s]\n"
+ ]
+ }
+ ]
+ },
+ {
+ "cell_type": "code",
+ "source": [
+ "from IPython.display import Audio\n",
+ "\n",
+ "# Detach and move the waveform to the CPU once, write it to 'gen.wav' at 24 kHz,\n",
+ "# then show an inline audio player as the cell result.\n",
+ "wav = wav.detach().cpu()\n",
+ "torchaudio.save('gen.wav', wav, 24000)\n",
+ "Audio(wav, rate=24000)"
+ ],
+ "metadata": {
+ "colab": {
+ "base_uri": "https://localhost:8080/",
+ "height": 74
+ },
+ "id": "Trjm4pQ7DoNJ",
+ "outputId": "bf54e60f-f588-47b4-ceb3-7abac53fa2c2"
+ },
+ "execution_count": null,
+ "outputs": [
+ {
+ "output_type": "execute_result",
+ "data": {
+ "text/plain": [
+ ""
+ ],
+ "text/html": [
+ "\n",
+ " \n",
+ " "
+ ]
+ },
+ "metadata": {},
+ "execution_count": 13
+ }
+ ]
+ }
+ ]
+}
\ No newline at end of file