{
 "cells": [
  {
   "cell_type": "markdown",
   "id": "1647c4b213a7ceee",
   "metadata": {},
   "source": [
    "# Example usage\n",
    "\n",
    "Transcribe LibriSpeech samples with the `abr-ai/asr-19m-v2-en-32b` model, first one clip at a time and then as a batch."
   ]
  },
  {
   "cell_type": "markdown",
   "id": "de97a5a6fb11f19d",
   "metadata": {},
   "source": [
    "## Install requirements"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "947baef9c8a11d9",
   "metadata": {},
   "outputs": [],
   "source": [
    "# `%pip` (rather than `!pip`) installs into the environment of the running kernel.\n",
    "# Standard requirements:\n",
    "# %pip install datasets torch torchcodec transformers sentencepiece\n",
    "# Google Colab requirements:\n",
    "%pip install \"torchcodec~=0.7.0\""
   ]
  },
  {
   "cell_type": "markdown",
   "id": "ba0dc6c29b3331d1",
   "metadata": {},
   "source": [
    "## Automatically instantiate the model"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "96da493f87cad12e",
   "metadata": {},
   "outputs": [],
   "source": [
    "import torch\n",
    "from transformers import AutoFeatureExtractor, AutoModel, AutoTokenizer\n",
    "\n",
    "model_id = \"abr-ai/asr-19m-v2-en-32b\"\n",
    "\n",
    "# trust_remote_code=True runs Python code shipped in the model repository;\n",
    "# only enable it for repositories you trust.\n",
    "feature_extractor = AutoFeatureExtractor.from_pretrained(\n",
    "    model_id, trust_remote_code=True\n",
    ")\n",
    "model = AutoModel.from_pretrained(model_id, trust_remote_code=True)\n",
    "tokenizer = AutoTokenizer.from_pretrained(model_id, trust_remote_code=True)\n",
    "\n",
    "# Run on the GPU when one is available, otherwise fall back to the CPU.\n",
    "device = \"cuda\" if torch.cuda.is_available() else \"cpu\"\n",
    "feature_extractor = feature_extractor.to(device)\n",
    "model = model.to(device)"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "633fe80a3b1719da",
   "metadata": {},
   "source": [
    "## Load example data (LibriSpeech)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "d3019f6d09ecf524",
   "metadata": {},
   "outputs": [],
   "source": [
    "from datasets import load_dataset\n",
    "\n",
    "# streaming=True iterates over the split lazily instead of downloading it in full.\n",
    "dataset = load_dataset(\"librispeech_asr\", \"clean\", split=\"test\", streaming=True)\n",
    "samples = list(dataset.take(3))  # Take 3 examples"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "af6af3b42bd709b9",
   "metadata": {},
   "source": [
    "## Simple transcription"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "45e6144ee4b7ddc8",
   "metadata": {},
   "outputs": [],
   "source": [
    "audio = samples[0][\"audio\"][\"array\"]\n",
    "\n",
    "# Inference only: disable autograd so no computation graph is recorded.\n",
    "with torch.no_grad():\n",
    "    features = feature_extractor(audio)\n",
    "    logits = model(features)\n",
    "transcription = tokenizer.decode_from_logits(logits)\n",
    "\n",
    "print(f\"Reference text: {samples[0]['text'].lower()}\")\n",
    "print(f\"Transcription: {transcription[0]}\\n\")"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "ec0f75905a44a165",
   "metadata": {},
   "source": [
    "## Batched transcription"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "8ae6cef6e167db5",
   "metadata": {},
   "outputs": [],
   "source": [
    "audio_list = [sample[\"audio\"][\"array\"] for sample in samples]\n",
    "\n",
    "# Inference only: disable autograd so no computation graph is recorded.\n",
    "with torch.no_grad():\n",
    "    batch_features = feature_extractor(audio_list)\n",
    "    batch_outputs = model(\n",
    "        batch_features[\"input_features\"], mask=batch_features[\"mask\"]\n",
    "    )\n",
    "transcriptions = tokenizer.decode_from_logits(\n",
    "    batch_outputs[\"logits\"], mask=batch_outputs[\"mask\"]\n",
    ")\n",
    "\n",
    "# strict=True raises if the number of transcriptions differs from the number of samples.\n",
    "for sample, predicted_text in zip(samples, transcriptions, strict=True):\n",
    "    print(f\"Reference text: {sample['text'].lower()}\")\n",
    "    print(f\"Transcription: {predicted_text}\")\n",
    "    print(\"-\" * 30)"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3 (ipykernel)",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.10.18"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 5
}