import{s as So,o as jo,n as Y}from"../chunks/scheduler.25b97de1.js";import{S as xo,i as Fo,g as p,s as a,r as h,A as Jo,h as m,f as s,c as r,j as q,u,x as v,k as V,y as i,a as c,v as f,d as g,t as _,w as b}from"../chunks/index.d9030fc9.js";import{T as Tt}from"../chunks/Tip.baa67368.js";import{D as L}from"../chunks/Docstring.ffac8efa.js";import{C as Oe}from"../chunks/CodeBlock.e6cd0d95.js";import{E as De}from"../chunks/ExampleCodeBlock.22dfe688.js";import{H as B,E as Wo}from"../chunks/EditOnGithub.91d95064.js";function Zo(U){let t,T="Example:",l,d,y;return d=new Oe({props:{code:"ZnJvbSUyMHRyYW5zZm9ybWVycyUyMGltcG9ydCUyMFVuaVNwZWVjaENvbmZpZyUyQyUyMFVuaVNwZWVjaE1vZGVsJTBBJTBBJTIzJTIwSW5pdGlhbGl6aW5nJTIwYSUyMFVuaVNwZWVjaCUyMGZhY2Vib29rJTJGdW5pc3BlZWNoLWJhc2UtOTYwaCUyMHN0eWxlJTIwY29uZmlndXJhdGlvbiUwQWNvbmZpZ3VyYXRpb24lMjAlM0QlMjBVbmlTcGVlY2hDb25maWcoKSUwQSUwQSUyMyUyMEluaXRpYWxpemluZyUyMGElMjBtb2RlbCUyMCh3aXRoJTIwcmFuZG9tJTIwd2VpZ2h0cyklMjBmcm9tJTIwdGhlJTIwZmFjZWJvb2slMkZ1bmlzcGVlY2gtYmFzZS05NjBoJTIwc3R5bGUlMjBjb25maWd1cmF0aW9uJTBBbW9kZWwlMjAlM0QlMjBVbmlTcGVlY2hNb2RlbChjb25maWd1cmF0aW9uKSUwQSUwQSUyMyUyMEFjY2Vzc2luZyUyMHRoZSUyMG1vZGVsJTIwY29uZmlndXJhdGlvbiUwQWNvbmZpZ3VyYXRpb24lMjAlM0QlMjBtb2RlbC5jb25maWc=",highlighted:`<span class="hljs-meta">&gt;&gt;&gt; </span><span class="hljs-keyword">from</span> transformers <span class="hljs-keyword">import</span> UniSpeechConfig, UniSpeechModel

<span class="hljs-meta">&gt;&gt;&gt; </span><span class="hljs-comment"># Initializing a UniSpeech facebook/unispeech-base-960h style configuration</span>
<span class="hljs-meta">&gt;&gt;&gt; </span>configuration = UniSpeechConfig()

<span class="hljs-meta">&gt;&gt;&gt; </span><span class="hljs-comment"># Initializing a model (with random weights) from the facebook/unispeech-base-960h style configuration</span>
<span class="hljs-meta">&gt;&gt;&gt; </span>model = UniSpeechModel(configuration)

<span class="hljs-meta">&gt;&gt;&gt; </span><span class="hljs-comment"># Accessing the model configuration</span>
<span class="hljs-meta">&gt;&gt;&gt; </span>configuration = model.config`,wrap:!1}}),{c(){t=p("p"),t.textContent=T,l=a(),h(d.$$.fragment)},l(o){t=m(o,"P",{"data-svelte-h":!0}),v(t)!=="svelte-11lpom8"&&(t.textContent=T),l=r(o),u(d.$$.fragment,o)},m(o,w){c(o,t,w),c(o,l,w),f(d,o,w),y=!0},p:Y,i(o){y||(g(d.$$.fragment,o),y=!0)},o(o){_(d.$$.fragment,o),y=!1},d(o){o&&(s(t),s(l)),b(d,o)}}}function zo(U){let t,T=`Although the recipe for forward pass needs to be defined within this function, one should call the <code>Module</code>
instance afterwards instead of this since the former takes care of running the pre- and post-processing steps while
the latter silently ignores them.`;return{c(){t=p("p"),t.innerHTML=T},l(l){t=m(l,"P",{"data-svelte-h":!0}),v(t)!=="svelte-fincs2"&&(t.innerHTML=T)},m(l,d){c(l,t,d)},p:Y,d(l){l&&s(t)}}}function Go(U){let t,T="Example:",l,d,y;return d=new Oe({props:{code:"ZnJvbSUyMHRyYW5zZm9ybWVycyUyMGltcG9ydCUyMEF1dG9Qcm9jZXNzb3IlMkMlMjBVbmlTcGVlY2hNb2RlbCUwQWltcG9ydCUyMHRvcmNoJTBBZnJvbSUyMGRhdGFzZXRzJTIwaW1wb3J0JTIwbG9hZF9kYXRhc2V0JTBBJTBBZGF0YXNldCUyMCUzRCUyMGxvYWRfZGF0YXNldCglMjJoZi1pbnRlcm5hbC10ZXN0aW5nJTJGbGlicmlzcGVlY2hfYXNyX2RlbW8lMjIlMkMlMjAlMjJjbGVhbiUyMiUyQyUyMHNwbGl0JTNEJTIydmFsaWRhdGlvbiUyMiUyQyUyMHRydXN0X3JlbW90ZV9jb2RlJTNEVHJ1ZSklMEFkYXRhc2V0JTIwJTNEJTIwZGF0YXNldC5zb3J0KCUyMmlkJTIyKSUwQXNhbXBsaW5nX3JhdGUlMjAlM0QlMjBkYXRhc2V0LmZlYXR1cmVzJTVCJTIyYXVkaW8lMjIlNUQuc2FtcGxpbmdfcmF0ZSUwQSUwQXByb2Nlc3NvciUyMCUzRCUyMEF1dG9Qcm9jZXNzb3IuZnJvbV9wcmV0cmFpbmVkKCUyMnBhdHJpY2t2b25wbGF0ZW4lMkZ1bmlzcGVlY2gtbGFyZ2UtMTUwMGgtY3YtdGltaXQlMjIpJTBBbW9kZWwlMjAlM0QlMjBVbmlTcGVlY2hNb2RlbC5mcm9tX3ByZXRyYWluZWQoJTIycGF0cmlja3ZvbnBsYXRlbiUyRnVuaXNwZWVjaC1sYXJnZS0xNTAwaC1jdi10aW1pdCUyMiklMEElMEElMjMlMjBhdWRpbyUyMGZpbGUlMjBpcyUyMGRlY29kZWQlMjBvbiUyMHRoZSUyMGZseSUwQWlucHV0cyUyMCUzRCUyMHByb2Nlc3NvcihkYXRhc2V0JTVCMCU1RCU1QiUyMmF1ZGlvJTIyJTVEJTVCJTIyYXJyYXklMjIlNUQlMkMlMjBzYW1wbGluZ19yYXRlJTNEc2FtcGxpbmdfcmF0ZSUyQyUyMHJldHVybl90ZW5zb3JzJTNEJTIycHQlMjIpJTBBd2l0aCUyMHRvcmNoLm5vX2dyYWQoKSUzQSUwQSUyMCUyMCUyMCUyMG91dHB1dHMlMjAlM0QlMjBtb2RlbCgqKmlucHV0cyklMEElMEFsYXN0X2hpZGRlbl9zdGF0ZXMlMjAlM0QlMjBvdXRwdXRzLmxhc3RfaGlkZGVuX3N0YXRlJTBBbGlzdChsYXN0X2hpZGRlbl9zdGF0ZXMuc2hhcGUp",highlighted:`<span class="hljs-meta">&gt;&gt;&gt; </span><span class="hljs-keyword">from</span> transformers <span class="hljs-keyword">import</span> AutoProcessor, UniSpeechModel
<span class="hljs-meta">&gt;&gt;&gt; </span><span class="hljs-keyword">import</span> torch
<span class="hljs-meta">&gt;&gt;&gt; </span><span class="hljs-keyword">from</span> datasets <span class="hljs-keyword">import</span> load_dataset

<span class="hljs-meta">&gt;&gt;&gt; </span>dataset = load_dataset(<span class="hljs-string">&quot;hf-internal-testing/librispeech_asr_demo&quot;</span>, <span class="hljs-string">&quot;clean&quot;</span>, split=<span class="hljs-string">&quot;validation&quot;</span>, trust_remote_code=<span class="hljs-literal">True</span>)
<span class="hljs-meta">&gt;&gt;&gt; </span>dataset = dataset.sort(<span class="hljs-string">&quot;id&quot;</span>)
<span class="hljs-meta">&gt;&gt;&gt; </span>sampling_rate = dataset.features[<span class="hljs-string">&quot;audio&quot;</span>].sampling_rate

<span class="hljs-meta">&gt;&gt;&gt; </span>processor = AutoProcessor.from_pretrained(<span class="hljs-string">&quot;patrickvonplaten/unispeech-large-1500h-cv-timit&quot;</span>)
<span class="hljs-meta">&gt;&gt;&gt; </span>model = UniSpeechModel.from_pretrained(<span class="hljs-string">&quot;patrickvonplaten/unispeech-large-1500h-cv-timit&quot;</span>)

<span class="hljs-meta">&gt;&gt;&gt; </span><span class="hljs-comment"># audio file is decoded on the fly</span>
<span class="hljs-meta">&gt;&gt;&gt; </span>inputs = processor(dataset[<span class="hljs-number">0</span>][<span class="hljs-string">&quot;audio&quot;</span>][<span class="hljs-string">&quot;array&quot;</span>], sampling_rate=sampling_rate, return_tensors=<span class="hljs-string">&quot;pt&quot;</span>)
<span class="hljs-meta">&gt;&gt;&gt; </span><span class="hljs-keyword">with</span> torch.no_grad():
<span class="hljs-meta">... </span>    outputs = model(**inputs)

<span class="hljs-meta">&gt;&gt;&gt; </span>last_hidden_states = outputs.last_hidden_state
<span class="hljs-meta">&gt;&gt;&gt; </span><span class="hljs-built_in">list</span>(last_hidden_states.shape)
[<span class="hljs-number">1</span>, <span class="hljs-number">292</span>, <span class="hljs-number">1024</span>]`,wrap:!1}}),{c(){t=p("p"),t.textContent=T,l=a(),h(d.$$.fragment)},l(o){t=m(o,"P",{"data-svelte-h":!0}),v(t)!=="svelte-11lpom8"&&(t.textContent=T),l=r(o),u(d.$$.fragment,o)},m(o,w){c(o,t,w),c(o,l,w),f(d,o,w),y=!0},p:Y,i(o){y||(g(d.$$.fragment,o),y=!0)},o(o){_(d.$$.fragment,o),y=!1},d(o){o&&(s(t),s(l)),b(d,o)}}}function qo(U){let t,T=`Although the recipe for forward pass needs to be defined within this function, one should call the <code>Module</code>
instance afterwards instead of this since the former takes care of running the pre- and post-processing steps while
the latter silently ignores them.`;return{c(){t=p("p"),t.innerHTML=T},l(l){t=m(l,"P",{"data-svelte-h":!0}),v(t)!=="svelte-fincs2"&&(t.innerHTML=T)},m(l,d){c(l,t,d)},p:Y,d(l){l&&s(t)}}}function Vo(U){let t,T="Example:",l,d,y;return d=new Oe({props:{code:"ZnJvbSUyMHRyYW5zZm9ybWVycyUyMGltcG9ydCUyMEF1dG9Qcm9jZXNzb3IlMkMlMjBVbmlTcGVlY2hGb3JDVEMlMEFmcm9tJTIwZGF0YXNldHMlMjBpbXBvcnQlMjBsb2FkX2RhdGFzZXQlMEFpbXBvcnQlMjB0b3JjaCUwQSUwQWRhdGFzZXQlMjAlM0QlMjBsb2FkX2RhdGFzZXQoJTIyaGYtaW50ZXJuYWwtdGVzdGluZyUyRmxpYnJpc3BlZWNoX2Fzcl9kZW1vJTIyJTJDJTIwJTIyY2xlYW4lMjIlMkMlMjBzcGxpdCUzRCUyMnZhbGlkYXRpb24lMjIlMkMlMjB0cnVzdF9yZW1vdGVfY29kZSUzRFRydWUpJTBBZGF0YXNldCUyMCUzRCUyMGRhdGFzZXQuc29ydCglMjJpZCUyMiklMEFzYW1wbGluZ19yYXRlJTIwJTNEJTIwZGF0YXNldC5mZWF0dXJlcyU1QiUyMmF1ZGlvJTIyJTVELnNhbXBsaW5nX3JhdGUlMEElMEFwcm9jZXNzb3IlMjAlM0QlMjBBdXRvUHJvY2Vzc29yLmZyb21fcHJldHJhaW5lZCglMjJwYXRyaWNrdm9ucGxhdGVuJTJGdW5pc3BlZWNoLWxhcmdlLTE1MDBoLWN2LXRpbWl0JTIyKSUwQW1vZGVsJTIwJTNEJTIwVW5pU3BlZWNoRm9yQ1RDLmZyb21fcHJldHJhaW5lZCglMjJwYXRyaWNrdm9ucGxhdGVuJTJGdW5pc3BlZWNoLWxhcmdlLTE1MDBoLWN2LXRpbWl0JTIyKSUwQSUwQSUyMyUyMGF1ZGlvJTIwZmlsZSUyMGlzJTIwZGVjb2RlZCUyMG9uJTIwdGhlJTIwZmx5JTBBaW5wdXRzJTIwJTNEJTIwcHJvY2Vzc29yKGRhdGFzZXQlNUIwJTVEJTVCJTIyYXVkaW8lMjIlNUQlNUIlMjJhcnJheSUyMiU1RCUyQyUyMHNhbXBsaW5nX3JhdGUlM0RzYW1wbGluZ19yYXRlJTJDJTIwcmV0dXJuX3RlbnNvcnMlM0QlMjJwdCUyMiklMEF3aXRoJTIwdG9yY2gubm9fZ3JhZCgpJTNBJTBBJTIwJTIwJTIwJTIwbG9naXRzJTIwJTNEJTIwbW9kZWwoKippbnB1dHMpLmxvZ2l0cyUwQXByZWRpY3RlZF9pZHMlMjAlM0QlMjB0b3JjaC5hcmdtYXgobG9naXRzJTJDJTIwZGltJTNELTEpJTBBJTBBJTIzJTIwdHJhbnNjcmliZSUyMHNwZWVjaCUwQXRyYW5zY3JpcHRpb24lMjAlM0QlMjBwcm9jZXNzb3IuYmF0Y2hfZGVjb2RlKHByZWRpY3RlZF9pZHMpJTBBdHJhbnNjcmlwdGlvbiU1QjAlNUQlMEElMEFpbnB1dHMlNUIlMjJsYWJlbHMlMjIlNUQlMjAlM0QlMjBwcm9jZXNzb3IodGV4dCUzRGRhdGFzZXQlNUIwJTVEJTVCJTIydGV4dCUyMiU1RCUyQyUyMHJldHVybl90ZW5zb3JzJTNEJTIycHQlMjIpLmlucHV0X2lkcyUwQSUwQSUyMyUyMGNvbXB1dGUlMjBsb3NzJTBBbG9zcyUyMCUzRCUyMG1vZGVsKCoqaW5wdXRzKS5sb3NzJTBBcm91bmQobG9zcy5pdGVtKCklMkMlMjAyKQ==",highlighted:`<span class="hljs-meta">&gt;&gt;&gt; </span><span class="hljs-keyword">from</span> transformers <span class="hljs-keyword">import</span> AutoProcessor, UniSpeechForCTC
<span class="hljs-meta">&gt;&gt;&gt; </span><span class="hljs-keyword">from</span> datasets <span class="hljs-keyword">import</span> load_dataset
<span class="hljs-meta">&gt;&gt;&gt; </span><span class="hljs-keyword">import</span> torch

<span class="hljs-meta">&gt;&gt;&gt; </span>dataset = load_dataset(<span class="hljs-string">&quot;hf-internal-testing/librispeech_asr_demo&quot;</span>, <span class="hljs-string">&quot;clean&quot;</span>, split=<span class="hljs-string">&quot;validation&quot;</span>, trust_remote_code=<span class="hljs-literal">True</span>)
<span class="hljs-meta">&gt;&gt;&gt; </span>dataset = dataset.sort(<span class="hljs-string">&quot;id&quot;</span>)
<span class="hljs-meta">&gt;&gt;&gt; </span>sampling_rate = dataset.features[<span class="hljs-string">&quot;audio&quot;</span>].sampling_rate

<span class="hljs-meta">&gt;&gt;&gt; </span>processor = AutoProcessor.from_pretrained(<span class="hljs-string">&quot;patrickvonplaten/unispeech-large-1500h-cv-timit&quot;</span>)
<span class="hljs-meta">&gt;&gt;&gt; </span>model = UniSpeechForCTC.from_pretrained(<span class="hljs-string">&quot;patrickvonplaten/unispeech-large-1500h-cv-timit&quot;</span>)

<span class="hljs-meta">&gt;&gt;&gt; </span><span class="hljs-comment"># audio file is decoded on the fly</span>
<span class="hljs-meta">&gt;&gt;&gt; </span>inputs = processor(dataset[<span class="hljs-number">0</span>][<span class="hljs-string">&quot;audio&quot;</span>][<span class="hljs-string">&quot;array&quot;</span>], sampling_rate=sampling_rate, return_tensors=<span class="hljs-string">&quot;pt&quot;</span>)
<span class="hljs-meta">&gt;&gt;&gt; </span><span class="hljs-keyword">with</span> torch.no_grad():
<span class="hljs-meta">... </span>    logits = model(**inputs).logits
<span class="hljs-meta">&gt;&gt;&gt; </span>predicted_ids = torch.argmax(logits, dim=-<span class="hljs-number">1</span>)

<span class="hljs-meta">&gt;&gt;&gt; </span><span class="hljs-comment"># transcribe speech</span>
<span class="hljs-meta">&gt;&gt;&gt; </span>transcription = processor.batch_decode(predicted_ids)
<span class="hljs-meta">&gt;&gt;&gt; </span>transcription[<span class="hljs-number">0</span>]
<span class="hljs-string">&#x27;mister quilter is the apposl of the midle classes and weare glad to welcom his gosepl&#x27;</span>

<span class="hljs-meta">&gt;&gt;&gt; </span>inputs[<span class="hljs-string">&quot;labels&quot;</span>] = processor(text=dataset[<span class="hljs-number">0</span>][<span class="hljs-string">&quot;text&quot;</span>], return_tensors=<span class="hljs-string">&quot;pt&quot;</span>).input_ids

<span class="hljs-meta">&gt;&gt;&gt; </span><span class="hljs-comment"># compute loss</span>
<span class="hljs-meta">&gt;&gt;&gt; </span>loss = model(**inputs).loss
<span class="hljs-meta">&gt;&gt;&gt; </span><span class="hljs-built_in">round</span>(loss.item(), <span class="hljs-number">2</span>)
<span class="hljs-number">17.17</span>`,wrap:!1}}),{c(){t=p("p"),t.textContent=T,l=a(),h(d.$$.fragment)},l(o){t=m(o,"P",{"data-svelte-h":!0}),v(t)!=="svelte-11lpom8"&&(t.textContent=T),l=r(o),u(d.$$.fragment,o)},m(o,w){c(o,t,w),c(o,l,w),f(d,o,w),y=!0},p:Y,i(o){y||(g(d.$$.fragment,o),y=!0)},o(o){_(d.$$.fragment,o),y=!1},d(o){o&&(s(t),s(l)),b(d,o)}}}function No(U){let t,T=`Although the recipe for forward pass needs to be defined within this function, one should call the <code>Module</code>
instance afterwards instead of this since the former takes care of running the pre- and post-processing steps while
the latter silently ignores them.`;return{c(){t=p("p"),t.innerHTML=T},l(l){t=m(l,"P",{"data-svelte-h":!0}),v(t)!=="svelte-fincs2"&&(t.innerHTML=T)},m(l,d){c(l,t,d)},p:Y,d(l){l&&s(t)}}}function Xo(U){let t,T="Example:",l,d,y;return d=new Oe({props:{code:"ZnJvbSUyMHRyYW5zZm9ybWVycyUyMGltcG9ydCUyMEF1dG9GZWF0dXJlRXh0cmFjdG9yJTJDJTIwVW5pU3BlZWNoRm9yU2VxdWVuY2VDbGFzc2lmaWNhdGlvbiUwQWZyb20lMjBkYXRhc2V0cyUyMGltcG9ydCUyMGxvYWRfZGF0YXNldCUwQWltcG9ydCUyMHRvcmNoJTBBJTBBZGF0YXNldCUyMCUzRCUyMGxvYWRfZGF0YXNldCglMjJoZi1pbnRlcm5hbC10ZXN0aW5nJTJGbGlicmlzcGVlY2hfYXNyX2RlbW8lMjIlMkMlMjAlMjJjbGVhbiUyMiUyQyUyMHNwbGl0JTNEJTIydmFsaWRhdGlvbiUyMiUyQyUyMHRydXN0X3JlbW90ZV9jb2RlJTNEVHJ1ZSklMEFkYXRhc2V0JTIwJTNEJTIwZGF0YXNldC5zb3J0KCUyMmlkJTIyKSUwQXNhbXBsaW5nX3JhdGUlMjAlM0QlMjBkYXRhc2V0LmZlYXR1cmVzJTVCJTIyYXVkaW8lMjIlNUQuc2FtcGxpbmdfcmF0ZSUwQSUwQWZlYXR1cmVfZXh0cmFjdG9yJTIwJTNEJTIwQXV0b0ZlYXR1cmVFeHRyYWN0b3IuZnJvbV9wcmV0cmFpbmVkKCUyMnBhdHJpY2t2b25wbGF0ZW4lMkZ1bmlzcGVlY2gtbGFyZ2UtMTUwMGgtY3YtdGltaXQlMjIpJTBBbW9kZWwlMjAlM0QlMjBVbmlTcGVlY2hGb3JTZXF1ZW5jZUNsYXNzaWZpY2F0aW9uLmZyb21fcHJldHJhaW5lZCglMjJwYXRyaWNrdm9ucGxhdGVuJTJGdW5pc3BlZWNoLWxhcmdlLTE1MDBoLWN2LXRpbWl0JTIyKSUwQSUwQSUyMyUyMGF1ZGlvJTIwZmlsZSUyMGlzJTIwZGVjb2RlZCUyMG9uJTIwdGhlJTIwZmx5JTBBaW5wdXRzJTIwJTNEJTIwZmVhdHVyZV9leHRyYWN0b3IoZGF0YXNldCU1QjAlNUQlNUIlMjJhdWRpbyUyMiU1RCU1QiUyMmFycmF5JTIyJTVEJTJDJTIwc2FtcGxpbmdfcmF0ZSUzRHNhbXBsaW5nX3JhdGUlMkMlMjByZXR1cm5fdGVuc29ycyUzRCUyMnB0JTIyKSUwQSUwQXdpdGglMjB0b3JjaC5ub19ncmFkKCklM0ElMEElMjAlMjAlMjAlMjBsb2dpdHMlMjAlM0QlMjBtb2RlbCgqKmlucHV0cykubG9naXRzJTBBJTBBcHJlZGljdGVkX2NsYXNzX2lkcyUyMCUzRCUyMHRvcmNoLmFyZ21heChsb2dpdHMlMkMlMjBkaW0lM0QtMSkuaXRlbSgpJTBBcHJlZGljdGVkX2xhYmVsJTIwJTNEJTIwbW9kZWwuY29uZmlnLmlkMmxhYmVsJTVCcHJlZGljdGVkX2NsYXNzX2lkcyU1RCUwQSUwQSUyMyUyMGNvbXB1dGUlMjBsb3NzJTIwLSUyMHRhcmdldF9sYWJlbCUyMGlzJTIwZS5nLiUyMCUyMmRvd24lMjIlMEF0YXJnZXRfbGFiZWwlMjAlM0QlMjBtb2RlbC5jb25maWcuaWQybGFiZWwlNUIwJTVEJTBBaW5wdXRzJTVCJTIybGFiZWxzJTIyJTVEJTIwJTNEJTIwdG9yY2gudGVuc29yKCU1Qm1vZGVsLmNvbmZpZy5sYWJlbDJpZCU1QnRhcmdldF9sYWJlbCU1RCU1RCklMEFsb3NzJTIwJTNEJTIwbW9kZWwoKippbnB1dHMpLmxvc3M=",highlighted:`<span class="hljs-meta">&gt;&gt;&gt; </span><span class="hljs-keyword">from</span> transformers <span class="hljs-keyword">import</span> AutoFeatureExtractor, UniSpeechForSequenceClassification
<span class="hljs-meta">&gt;&gt;&gt; </span><span class="hljs-keyword">from</span> datasets <span class="hljs-keyword">import</span> load_dataset
<span class="hljs-meta">&gt;&gt;&gt; </span><span class="hljs-keyword">import</span> torch

<span class="hljs-meta">&gt;&gt;&gt; </span>dataset = load_dataset(<span class="hljs-string">&quot;hf-internal-testing/librispeech_asr_demo&quot;</span>, <span class="hljs-string">&quot;clean&quot;</span>, split=<span class="hljs-string">&quot;validation&quot;</span>, trust_remote_code=<span class="hljs-literal">True</span>)
<span class="hljs-meta">&gt;&gt;&gt; </span>dataset = dataset.sort(<span class="hljs-string">&quot;id&quot;</span>)
<span class="hljs-meta">&gt;&gt;&gt; </span>sampling_rate = dataset.features[<span class="hljs-string">&quot;audio&quot;</span>].sampling_rate

<span class="hljs-meta">&gt;&gt;&gt; </span>feature_extractor = AutoFeatureExtractor.from_pretrained(<span class="hljs-string">&quot;patrickvonplaten/unispeech-large-1500h-cv-timit&quot;</span>)
<span class="hljs-meta">&gt;&gt;&gt; </span>model = UniSpeechForSequenceClassification.from_pretrained(<span class="hljs-string">&quot;patrickvonplaten/unispeech-large-1500h-cv-timit&quot;</span>)

<span class="hljs-meta">&gt;&gt;&gt; </span><span class="hljs-comment"># audio file is decoded on the fly</span>
<span class="hljs-meta">&gt;&gt;&gt; </span>inputs = feature_extractor(dataset[<span class="hljs-number">0</span>][<span class="hljs-string">&quot;audio&quot;</span>][<span class="hljs-string">&quot;array&quot;</span>], sampling_rate=sampling_rate, return_tensors=<span class="hljs-string">&quot;pt&quot;</span>)

<span class="hljs-meta">&gt;&gt;&gt; </span><span class="hljs-keyword">with</span> torch.no_grad():
<span class="hljs-meta">... </span>    logits = model(**inputs).logits

<span class="hljs-meta">&gt;&gt;&gt; </span>predicted_class_ids = torch.argmax(logits, dim=-<span class="hljs-number">1</span>).item()
<span class="hljs-meta">&gt;&gt;&gt; </span>predicted_label = model.config.id2label[predicted_class_ids]

<span class="hljs-meta">&gt;&gt;&gt; </span><span class="hljs-comment"># compute loss - target_label is e.g. &quot;down&quot;</span>
<span class="hljs-meta">&gt;&gt;&gt; </span>target_label = model.config.id2label[<span class="hljs-number">0</span>]
<span class="hljs-meta">&gt;&gt;&gt; </span>inputs[<span class="hljs-string">&quot;labels&quot;</span>] = torch.tensor([model.config.label2id[target_label]])
<span class="hljs-meta">&gt;&gt;&gt; </span>loss = model(**inputs).loss`,wrap:!1}}),{c(){t=p("p"),t.textContent=T,l=a(),h(d.$$.fragment)},l(o){t=m(o,"P",{"data-svelte-h":!0}),v(t)!=="svelte-11lpom8"&&(t.textContent=T),l=r(o),u(d.$$.fragment,o)},m(o,w){c(o,t,w),c(o,l,w),f(d,o,w),y=!0},p:Y,i(o){y||(g(d.$$.fragment,o),y=!0)},o(o){_(d.$$.fragment,o),y=!1},d(o){o&&(s(t),s(l)),b(d,o)}}}function Ro(U){let t,T=`Although the recipe for forward pass needs to be defined within this function, one should call the <code>Module</code>
instance afterwards instead of this since the former takes care of running the pre- and post-processing steps while
the latter silently ignores them.`;return{c(){t=p("p"),t.innerHTML=T},l(l){t=m(l,"P",{"data-svelte-h":!0}),v(t)!=="svelte-fincs2"&&(t.innerHTML=T)},m(l,d){c(l,t,d)},p:Y,d(l){l&&s(t)}}}function Io(U){let t,T="Example:",l,d,y;return d=new Oe({props:{code:"aW1wb3J0JTIwdG9yY2glMEFmcm9tJTIwdHJhbnNmb3JtZXJzJTIwaW1wb3J0JTIwQXV0b0ZlYXR1cmVFeHRyYWN0b3IlMkMlMjBVbmlTcGVlY2hGb3JQcmVUcmFpbmluZyUwQSUwQWZlYXR1cmVfZXh0cmFjdG9yJTIwJTNEJTIwQXV0b0ZlYXR1cmVFeHRyYWN0b3IuZnJvbV9wcmV0cmFpbmVkKCUyMm1pY3Jvc29mdCUyRnVuaXNwZWVjaC1sYXJnZS0xNTAwaC1jdiUyMiklMEFtb2RlbCUyMCUzRCUyMFVuaVNwZWVjaEZvclByZVRyYWluaW5nLmZyb21fcHJldHJhaW5lZCglMjJtaWNyb3NvZnQlMkZ1bmlzcGVlY2gtbGFyZ2UtMTUwMGgtY3YlMjIpJTBBJTIzJTIwVE9ETyUzQSUyMEFkZCUyMGZ1bGwlMjBwcmV0cmFpbmluZyUyMGV4YW1wbGU=",highlighted:`<span class="hljs-meta">&gt;&gt;&gt; </span><span class="hljs-keyword">import</span> torch
<span class="hljs-meta">&gt;&gt;&gt; </span><span class="hljs-keyword">from</span> transformers <span class="hljs-keyword">import</span> AutoFeatureExtractor, UniSpeechForPreTraining

<span class="hljs-meta">&gt;&gt;&gt; </span>feature_extractor = AutoFeatureExtractor.from_pretrained(<span class="hljs-string">&quot;microsoft/unispeech-large-1500h-cv&quot;</span>)
<span class="hljs-meta">&gt;&gt;&gt; </span>model = UniSpeechForPreTraining.from_pretrained(<span class="hljs-string">&quot;microsoft/unispeech-large-1500h-cv&quot;</span>)
<span class="hljs-meta">&gt;&gt;&gt; </span><span class="hljs-comment"># <span class="hljs-doctag">TODO:</span> Add full pretraining example</span>`,wrap:!1}}),{c(){t=p("p"),t.textContent=T,l=a(),h(d.$$.fragment)},l(o){t=m(o,"P",{"data-svelte-h":!0}),v(t)!=="svelte-11lpom8"&&(t.textContent=T),l=r(o),u(d.$$.fragment,o)},m(o,w){c(o,t,w),c(o,l,w),f(d,o,w),y=!0},p:Y,i(o){y||(g(d.$$.fragment,o),y=!0)},o(o){_(d.$$.fragment,o),y=!1},d(o){o&&(s(t),s(l)),b(d,o)}}}function Lo(U){let t,T,l,d,y,o,w,Ae,ne,to=`The UniSpeech model was proposed in <a href="https://arxiv.org/abs/2101.07597" rel="nofollow">UniSpeech: Unified Speech Representation Learning with Labeled and Unlabeled Data</a> by Chengyi Wang, Yu Wu, Yao Qian, Kenichi Kumatani, Shujie Liu, Furu Wei, Michael
Zeng, Xuedong Huang.`,Ke,se,oo="The abstract from the paper is the following:",et,ae,no=`<em>In this paper, we propose a unified pre-training approach called UniSpeech to learn speech representations with both
unlabeled and labeled data, in which supervised phonetic CTC learning and phonetically-aware contrastive
self-supervised learning are conducted in a multi-task learning manner. The resultant representations can capture
information more correlated with phonetic structures and improve the generalization across languages and domains. We
evaluate the effectiveness of UniSpeech for cross-lingual representation learning on public CommonVoice corpus. The
results show that UniSpeech outperforms self-supervised pretraining and supervised transfer learning for speech
recognition by a maximum of 13.4% and 17.8% relative phone error rate reductions respectively (averaged over all
testing languages). The transferability of UniSpeech is also demonstrated on a domain-shift speech recognition task,
i.e., a relative word error rate reduction of 6% against the previous approach.</em>`,tt,re,so=`This model was contributed by <a href="https://huggingface.co/patrickvonplaten" rel="nofollow">patrickvonplaten</a>. The Authors’ code can be
found <a href="https://github.com/microsoft/UniSpeech/tree/main/UniSpeech" rel="nofollow">here</a>.`,ot,ie,nt,le,ao=`<li>UniSpeech is a speech model that accepts a float array corresponding to the raw waveform of the speech signal. Please
use <a href="/docs/transformers/v4.47.1/en/model_doc/wav2vec2#transformers.Wav2Vec2Processor">Wav2Vec2Processor</a> for the feature extraction.</li> <li>UniSpeech model can be fine-tuned using connectionist temporal classification (CTC) so the model output has to be
decoded using <a href="/docs/transformers/v4.47.1/en/model_doc/wav2vec2#transformers.Wav2Vec2CTCTokenizer">Wav2Vec2CTCTokenizer</a>.</li>`,st,ce,at,de,ro='<li><a href="../tasks/audio_classification">Audio classification task guide</a></li> <li><a href="../tasks/asr">Automatic speech recognition task guide</a></li>',rt,pe,it,j,me,wt,je,io=`This is the configuration class to store the configuration of a <a href="/docs/transformers/v4.47.1/en/model_doc/unispeech#transformers.UniSpeechModel">UniSpeechModel</a>. It is used to instantiate a
UniSpeech model according to the specified arguments, defining the model architecture. Instantiating a
configuration with the defaults will yield a similar configuration to that of the UniSpeech
<a href="https://huggingface.co/microsoft/unispeech-large-1500h-cv" rel="nofollow">microsoft/unispeech-large-1500h-cv</a> architecture.`,Ut,xe,lo=`Configuration objects inherit from <a href="/docs/transformers/v4.47.1/en/main_classes/configuration#transformers.PretrainedConfig">PretrainedConfig</a> and can be used to control the model outputs. Read the
documentation from <a href="/docs/transformers/v4.47.1/en/main_classes/configuration#transformers.PretrainedConfig">PretrainedConfig</a> for more information.`,Mt,P,lt,he,ct,H,ue,Ct,Fe,co="Output type of <code>UniSpeechForPreTrainingOutput</code>, with potential hidden states and attentions.",dt,fe,pt,C,ge,kt,Je,po=`The bare UniSpeech Model transformer outputting raw hidden-states without any specific head on top.
UniSpeech was proposed in <a href="https://arxiv.org/abs/2101.07597" rel="nofollow">UniSpeech: Unified Speech Representation Learning with Labeled and Unlabeled
Data</a> by Chengyi Wang, Yu Wu, Yao Qian, Kenichi Kumatani, Shujie Liu, Furu Wei,
Michael Zeng, Xuedong Huang.`,$t,We,mo=`This model inherits from <a href="/docs/transformers/v4.47.1/en/main_classes/model#transformers.PreTrainedModel">PreTrainedModel</a>. Check the superclass documentation for the generic methods the
library implements for all its models (such as downloading or saving, etc.).`,St,Ze,ho=`This model is a PyTorch <a href="https://pytorch.org/docs/stable/nn.html#torch.nn.Module" rel="nofollow">torch.nn.Module</a> sub-class. Use
it as a regular PyTorch Module and refer to the PyTorch documentation for all matters related to general usage and
behavior.`,jt,W,_e,xt,ze,uo='The <a href="/docs/transformers/v4.47.1/en/model_doc/unispeech#transformers.UniSpeechModel">UniSpeechModel</a> forward method, overrides the <code>__call__</code> special method.',Ft,Q,Jt,E,mt,be,ht,k,ve,Wt,Ge,fo=`UniSpeech Model with a <code>language modeling</code> head on top for Connectionist Temporal Classification (CTC).
UniSpeech was proposed in <a href="https://arxiv.org/abs/2101.07597" rel="nofollow">UniSpeech: Unified Speech Representation Learning with Labeled and Unlabeled
Data</a> by Chengyi Wang, Yu Wu, Yao Qian, Kenichi Kumatani, Shujie Liu, Furu Wei,
Michael Zeng, Xuedong Huang.`,Zt,qe,go=`This model inherits from <a href="/docs/transformers/v4.47.1/en/main_classes/model#transformers.PreTrainedModel">PreTrainedModel</a>. Check the superclass documentation for the generic methods the
library implements for all its models (such as downloading or saving, etc.).`,zt,Ve,_o=`This model is a PyTorch <a href="https://pytorch.org/docs/stable/nn.html#torch.nn.Module" rel="nofollow">torch.nn.Module</a> sub-class. Use
it as a regular PyTorch Module and refer to the PyTorch documentation for all matters related to general usage and
behavior.`,Gt,Z,ye,qt,Ne,bo='The <a href="/docs/transformers/v4.47.1/en/model_doc/unispeech#transformers.UniSpeechForCTC">UniSpeechForCTC</a> forward method, overrides the <code>__call__</code> special method.',Vt,D,Nt,O,ut,Te,ft,M,we,Xt,Xe,vo=`UniSpeech Model with a sequence classification head on top (a linear layer over the pooled output) for tasks like
SUPERB Keyword Spotting.`,Rt,Re,yo=`UniSpeech was proposed in <a href="https://arxiv.org/abs/2101.07597" rel="nofollow">UniSpeech: Unified Speech Representation Learning with Labeled and Unlabeled
Data</a> by Chengyi Wang, Yu Wu, Yao Qian, Kenichi Kumatani, Shujie Liu, Furu Wei,
Michael Zeng, Xuedong Huang.`,It,Ie,To=`This model inherits from <a href="/docs/transformers/v4.47.1/en/main_classes/model#transformers.PreTrainedModel">PreTrainedModel</a>. Check the superclass documentation for the generic methods the
library implements for all its models (such as downloading or saving, etc.).`,Lt,Le,wo=`This model is a PyTorch <a href="https://pytorch.org/docs/stable/nn.html#torch.nn.Module" rel="nofollow">torch.nn.Module</a> sub-class. Use
it as a regular PyTorch Module and refer to the PyTorch documentation for all matters related to general usage and
behavior.`,Bt,z,Ue,Ht,Be,Uo='The <a href="/docs/transformers/v4.47.1/en/model_doc/unispeech#transformers.UniSpeechForSequenceClassification">UniSpeechForSequenceClassification</a> forward method, overrides the <code>__call__</code> special method.',Yt,A,Pt,K,gt,Me,_t,$,Ce,Qt,He,Mo=`UniSpeech Model with a vector-quantization module and ctc loss for pre-training.
UniSpeech was proposed in <a href="https://arxiv.org/abs/2101.07597" rel="nofollow">UniSpeech: Unified Speech Representation Learning with Labeled and Unlabeled
Data</a> by Chengyi Wang, Yu Wu, Yao Qian, Kenichi Kumatani, Shujie Liu, Furu Wei,
Michael Zeng, Xuedong Huang.`,Et,Ye,Co=`This model inherits from <a href="/docs/transformers/v4.47.1/en/main_classes/model#transformers.PreTrainedModel">PreTrainedModel</a>. Check the superclass documentation for the generic methods the
library implements for all its models (such as downloading or saving, etc.).`,Dt,Pe,ko=`This model is a PyTorch <a href="https://pytorch.org/docs/stable/nn.html#torch.nn.Module" rel="nofollow">torch.nn.Module</a> sub-class. Use
it as a regular PyTorch Module and refer to the PyTorch documentation for all matters related to general usage and
behavior.`,Ot,G,ke,At,Qe,$o='The <a href="/docs/transformers/v4.47.1/en/model_doc/unispeech#transformers.UniSpeechForPreTraining">UniSpeechForPreTraining</a> forward method, overrides the <code>__call__</code> special method.',Kt,ee,eo,te,bt,$e,vt,Ee,yt;return y=new B({props:{title:"UniSpeech",local:"unispeech",headingTag:"h1"}}),w=new B({props:{title:"Overview",local:"overview",headingTag:"h2"}}),ie=new B({props:{title:"Usage tips",local:"usage-tips",headingTag:"h2"}}),ce=new B({props:{title:"Resources",local:"resources",headingTag:"h2"}}),pe=new B({props:{title:"UniSpeechConfig",local:"transformers.UniSpeechConfig",headingTag:"h2"}}),me=new L({props:{name:"class transformers.UniSpeechConfig",anchor:"transformers.UniSpeechConfig",parameters:[{name:"vocab_size",val:" = 32"},{name:"hidden_size",val:" = 768"},{name:"num_hidden_layers",val:" = 12"},{name:"num_attention_heads",val:" = 12"},{name:"intermediate_size",val:" = 3072"},{name:"hidden_act",val:" = 'gelu'"},{name:"hidden_dropout",val:" = 0.1"},{name:"activation_dropout",val:" = 0.1"},{name:"attention_dropout",val:" = 0.1"},{name:"feat_proj_dropout",val:" = 0.0"},{name:"feat_quantizer_dropout",val:" = 0.0"},{name:"final_dropout",val:" = 0.1"},{name:"layerdrop",val:" = 0.1"},{name:"initializer_range",val:" = 0.02"},{name:"layer_norm_eps",val:" = 1e-05"},{name:"feat_extract_norm",val:" = 'group'"},{name:"feat_extract_activation",val:" = 'gelu'"},{name:"conv_dim",val:" = (512, 512, 512, 512, 512, 512, 512)"},{name:"conv_stride",val:" = (5, 2, 2, 2, 2, 2, 2)"},{name:"conv_kernel",val:" = (10, 3, 3, 3, 3, 2, 2)"},{name:"conv_bias",val:" = False"},{name:"num_conv_pos_embeddings",val:" = 128"},{name:"num_conv_pos_embedding_groups",val:" = 16"},{name:"do_stable_layer_norm",val:" = False"},{name:"apply_spec_augment",val:" = True"},{name:"mask_time_prob",val:" = 0.05"},{name:"mask_time_length",val:" = 10"},{name:"mask_time_min_masks",val:" = 2"},{name:"mask_feature_prob",val:" = 0.0"},{name:"mask_feature_length",val:" = 10"},{name:"mask_feature_min_masks",val:" = 0"},{name:"num_codevectors_per_group",val:" = 320"},{name:"num_codevector_groups",val:" = 2"},{name:"contrastive_logits_temperature",val:" = 0.1"},{name:"num_negatives",val:" = 100"},{name:"codevector_dim",val:" = 256"},{name:"proj_codevector_dim",val:" = 256"},{name:"diversity_loss_weight",val:" = 0.1"},{name:"ctc_loss_reduction",val:" = 'mean'"},{name:"ctc_zero_infinity",val:" = False"},{name:"use_weighted_layer_sum",val:" = False"},{name:"classifier_proj_size",val:" = 256"},{name:"num_ctc_classes",val:" = 80"},{name:"pad_token_id",val:" = 0"},{name:"bos_token_id",val:" = 1"},{name:"eos_token_id",val:" = 2"},{name:"replace_prob",val:" = 0.5"},{name:"**kwargs",val:""}],parametersDescription:[{anchor:"transformers.UniSpeechConfig.vocab_size",description:`<strong>vocab_size</strong> (<code>int</code>, <em>optional</em>, defaults to 32) &#x2014;
Vocabulary size of the UniSpeech model. Defines the number of different tokens that can be represented by
the <code>input_ids</code> passed when calling the forward method of
<a href="/docs/transformers/v4.47.1/en/model_doc/unispeech#transformers.UniSpeechModel">UniSpeechModel</a>.`,name:"vocab_size"},{anchor:"transformers.UniSpeechConfig.hidden_size",description:`<strong>hidden_size</strong> (<code>int</code>, <em>optional</em>, defaults to 768) &#x2014;
Dimensionality of the encoder layers and the pooler layer.`,name:"hidden_size"},{anchor:"transformers.UniSpeechConfig.num_hidden_layers",description:`<strong>num_hidden_layers</strong> (<code>int</code>, <em>optional</em>, defaults to 12) &#x2014;
Number of hidden layers in the Transformer encoder.`,name:"num_hidden_layers"},{anchor:"transformers.UniSpeechConfig.num_attention_heads",description:`<strong>num_attention_heads</strong> (<code>int</code>, <em>optional</em>, defaults to 12) &#x2014;
Number of attention heads for each attention layer in the Transformer encoder.`,name:"num_attention_heads"},{anchor:"transformers.UniSpeechConfig.intermediate_size",description:`<strong>intermediate_size</strong> (<code>int</code>, <em>optional</em>, defaults to 3072) &#x2014;
Dimensionality of the &#x201C;intermediate&#x201D; (i.e., feed-forward) layer in the Transformer encoder.`,name:"intermediate_size"},{anchor:"transformers.UniSpeechConfig.hidden_act",description:`<strong>hidden_act</strong> (<code>str</code> or <code>function</code>, <em>optional</em>, defaults to <code>&quot;gelu&quot;</code>) &#x2014;
The non-linear activation function (function or string) in the encoder and pooler. If string, <code>&quot;gelu&quot;</code>,
<code>&quot;relu&quot;</code>, <code>&quot;selu&quot;</code> and <code>&quot;gelu_new&quot;</code> are supported.`,name:"hidden_act"},{anchor:"transformers.UniSpeechConfig.hidden_dropout",description:`<strong>hidden_dropout</strong> (<code>float</code>, <em>optional</em>, defaults to 0.1) &#x2014;
The dropout probability for all fully connected layers in the embeddings, encoder, and pooler.`,name:"hidden_dropout"},{anchor:"transformers.UniSpeechConfig.activation_dropout",description:`<strong>activation_dropout</strong> (<code>float</code>, <em>optional</em>, defaults to 0.1) &#x2014;
The dropout ratio for activations inside the fully connected layer.`,name:"activation_dropout"},{anchor:"transformers.UniSpeechConfig.attention_dropout",description:`<strong>attention_dropout</strong> (<code>float</code>, <em>optional</em>, defaults to 0.1) &#x2014;
The dropout ratio for the attention probabilities.`,name:"attention_dropout"},{anchor:"transformers.UniSpeechConfig.feat_proj_dropout",description:`<strong>feat_proj_dropout</strong> (<code>float</code>, <em>optional</em>, defaults to 0.0) &#x2014;
The dropout probability for output of the feature encoder.`,name:"feat_proj_dropout"},{anchor:"transformers.UniSpeechConfig.feat_quantizer_dropout",description:`<strong>feat_quantizer_dropout</strong> (<code>float</code>, <em>optional</em>, defaults to 0.0) &#x2014;
The dropout probability for the output of the feature encoder that&#x2019;s used by the quantizer.`,name:"feat_quantizer_dropout"},{anchor:"transformers.UniSpeechConfig.final_dropout",description:`<strong>final_dropout</strong> (<code>float</code>, <em>optional</em>, defaults to 0.1) &#x2014;
The dropout probability for the final projection layer of <a href="/docs/transformers/v4.47.1/en/model_doc/unispeech#transformers.UniSpeechForCTC">UniSpeechForCTC</a>.`,name:"final_dropout"},{anchor:"transformers.UniSpeechConfig.layerdrop",description:`<strong>layerdrop</strong> (<code>float</code>, <em>optional</em>, defaults to 0.1) &#x2014;
The LayerDrop probability. See the <a href="https://arxiv.org/abs/1909.11556" rel="nofollow">LayerDrop paper</a> for more
details.`,name:"layerdrop"},{anchor:"transformers.UniSpeechConfig.initializer_range",description:`<strong>initializer_range</strong> (<code>float</code>, <em>optional</em>, defaults to 0.02) &#x2014;
The standard deviation of the truncated_normal_initializer for initializing all weight matrices.`,name:"initializer_range"},{anchor:"transformers.UniSpeechConfig.layer_norm_eps",description:`<strong>layer_norm_eps</strong> (<code>float</code>, <em>optional</em>, defaults to 1e-05) &#x2014;
The epsilon used by the layer normalization layers.`,name:"layer_norm_eps"},{anchor:"transformers.UniSpeechConfig.feat_extract_norm",description:`<strong>feat_extract_norm</strong> (<code>str</code>, <em>optional</em>, defaults to <code>&quot;group&quot;</code>) &#x2014;
The norm to be applied to 1D convolutional layers in feature encoder. One of <code>&quot;group&quot;</code> for group
normalization of only the first 1D convolutional layer or <code>&quot;layer&quot;</code> for layer normalization of all 1D
convolutional layers.`,name:"feat_extract_norm"},{anchor:"transformers.UniSpeechConfig.feat_extract_activation",description:"<strong>feat_extract_activation</strong> (<code>str</code>, <em>optional</em>, defaults to <code>&quot;gelu&quot;</code>) &#x2014; The non-linear activation function (function or string) in the 1D convolutional layers of the feature extractor. If string, <code>&quot;gelu&quot;</code>, <code>&quot;relu&quot;</code>, <code>&quot;selu&quot;</code> and <code>&quot;gelu_new&quot;</code> are supported.",name:"feat_extract_activation"},{anchor:"transformers.UniSpeechConfig.conv_dim",description:`<strong>conv_dim</strong> (<code>Tuple[int]</code> or <code>List[int]</code>, <em>optional</em>, defaults to <code>(512, 512, 512, 512, 512, 512, 512)</code>) &#x2014;
A tuple of integers defining the number of input and output channels of each 1D convolutional layer in the
feature encoder. The length of <em>conv_dim</em> defines the number of 1D convolutional layers.`,name:"conv_dim"},{anchor:"transformers.UniSpeechConfig.conv_stride",description:`<strong>conv_stride</strong> (<code>Tuple[int]</code> or <code>List[int]</code>, <em>optional</em>, defaults to <code>(5, 2, 2, 2, 2, 2, 2)</code>) &#x2014;
A tuple of integers defining the stride of each 1D convolutional layer in the feature encoder. The length
of <em>conv_stride</em> defines the number of convolutional layers and has to match the length of <em>conv_dim</em>.`,name:"conv_stride"},{anchor:"transformers.UniSpeechConfig.conv_kernel",description:`<strong>conv_kernel</strong> (<code>Tuple[int]</code> or <code>List[int]</code>, <em>optional</em>, defaults to <code>(10, 3, 3, 3, 3, 2, 2)</code>) &#x2014;
A tuple of integers defining the kernel size of each 1D convolutional layer in the feature encoder. The
length of <em>conv_kernel</em> defines the number of convolutional layers and has to match the length of
<em>conv_dim</em>.`,name:"conv_kernel"},{anchor:"transformers.UniSpeechConfig.conv_bias",description:`<strong>conv_bias</strong> (<code>bool</code>, <em>optional</em>, defaults to <code>False</code>) &#x2014;
Whether the 1D convolutional layers have a bias.`,name:"conv_bias"},{anchor:"transformers.UniSpeechConfig.num_conv_pos_embeddings",description:`<strong>num_conv_pos_embeddings</strong> (<code>int</code>, <em>optional</em>, defaults to 128) &#x2014;
Number of convolutional positional embeddings. Defines the kernel size of 1D convolutional positional
embeddings layer.`,name:"num_conv_pos_embeddings"},{anchor:"transformers.UniSpeechConfig.num_conv_pos_embedding_groups",description:`<strong>num_conv_pos_embedding_groups</strong> (<code>int</code>, <em>optional</em>, defaults to 16) &#x2014;
Number of groups of 1D convolutional positional embeddings layer.`,name:"num_conv_pos_embedding_groups"},{anchor:"transformers.UniSpeechConfig.do_stable_layer_norm",description:`<strong>do_stable_layer_norm</strong> (<code>bool</code>, <em>optional</em>, defaults to <code>False</code>) &#x2014;
Whether to apply <em>stable</em> layer norm architecture of the Transformer encoder. <code>do_stable_layer_norm is True</code> corresponds to applying layer norm before the attention layer, whereas <code>do_stable_layer_norm is False</code> corresponds to applying layer norm after the attention layer.`,name:"do_stable_layer_norm"},{anchor:"transformers.UniSpeechConfig.apply_spec_augment",description:`<strong>apply_spec_augment</strong> (<code>bool</code>, <em>optional</em>, defaults to <code>True</code>) &#x2014;
Whether to apply <em>SpecAugment</em> data augmentation to the outputs of the feature encoder. For reference see
<a href="https://arxiv.org/abs/1904.08779" rel="nofollow">SpecAugment: A Simple Data Augmentation Method for Automatic Speech
Recognition</a>.`,name:"apply_spec_augment"},{anchor:"transformers.UniSpeechConfig.mask_time_prob",description:`<strong>mask_time_prob</strong> (<code>float</code>, <em>optional</em>, defaults to 0.05) &#x2014;
Percentage (between 0 and 1) of all feature vectors along the time axis which will be masked. The masking
procedure generates &#x201C;mask_time_prob*len(time_axis)/mask_time_length&#x201D; independent masks over the axis. If
reasoning from the probability of each feature vector to be chosen as the start of the vector span to be
masked, <em>mask_time_prob</em> should be <code>prob_vector_start*mask_time_length</code>. Note that overlap may decrease the actual percentage of masked vectors. This is only relevant if <code>apply_spec_augment is True</code>.`,name:"mask_time_prob"},{anchor:"transformers.UniSpeechConfig.mask_time_length",description:`<strong>mask_time_length</strong> (<code>int</code>, <em>optional</em>, defaults to 10) &#x2014;
Length of vector span along the time axis.`,name:"mask_time_length"},{anchor:"transformers.UniSpeechConfig.mask_time_min_masks",description:`<strong>mask_time_min_masks</strong> (<code>int</code>, <em>optional</em>, defaults to 2) &#x2014;
The minimum number of masks of length <code>mask_time_length</code> generated along the time axis, each time step,
irrespective of <code>mask_time_prob</code>. Only relevant if &#x201C;mask_time_prob*len(time_axis)/mask_time_length &lt;
mask_time_min_masks&#x201D;`,name:"mask_time_min_masks"},{anchor:"transformers.UniSpeechConfig.mask_feature_prob",description:`<strong>mask_feature_prob</strong> (<code>float</code>, <em>optional</em>, defaults to 0.0) &#x2014;
Percentage (between 0 and 1) of all feature vectors along the feature axis which will be masked. The
masking procedure generates &#x201C;mask_feature_prob*len(feature_axis)/mask_feature_length&#x201D; independent masks over
the axis. If reasoning from the probability of each feature vector to be chosen as the start of the vector
span to be masked, <em>mask_feature_prob</em> should be <code>prob_vector_start*mask_feature_length</code>. Note that overlap may decrease the actual percentage of masked vectors. This is only relevant if <code>apply_spec_augment is
True</code>.`,name:"mask_feature_prob"},{anchor:"transformers.UniSpeechConfig.mask_feature_length",description:`<strong>mask_feature_length</strong> (<code>int</code>, <em>optional</em>, defaults to 10) &#x2014;
Length of vector span along the feature axis.`,name:"mask_feature_length"},{anchor:"transformers.UniSpeechConfig.mask_feature_min_masks",description:`<strong>mask_feature_min_masks</strong> (<code>int</code>, <em>optional</em>, defaults to 0) &#x2014;
The minimum number of masks of length <code>mask_feature_length</code> generated along the feature axis, each time
step, irrespective of <code>mask_feature_prob</code>. Only relevant if
&#x201D;mask_feature_prob*len(feature_axis)/mask_feature_length &lt; mask_feature_min_masks&#x201D;`,name:"mask_feature_min_masks"},{anchor:"transformers.UniSpeechConfig.num_codevectors_per_group",description:`<strong>num_codevectors_per_group</strong> (<code>int</code>, <em>optional</em>, defaults to 320) &#x2014;
Number of entries in each quantization codebook (group).`,name:"num_codevectors_per_group"},{anchor:"transformers.UniSpeechConfig.num_codevector_groups",description:`<strong>num_codevector_groups</strong> (<code>int</code>, <em>optional</em>, defaults to 2) &#x2014;
Number of codevector groups for product codevector quantization.`,name:"num_codevector_groups"},{anchor:"transformers.UniSpeechConfig.contrastive_logits_temperature",description:`<strong>contrastive_logits_temperature</strong> (<code>float</code>, <em>optional</em>, defaults to 0.1) &#x2014;
The temperature <em>kappa</em> in the contrastive loss.`,name:"contrastive_logits_temperature"},{anchor:"transformers.UniSpeechConfig.num_negatives",description:`<strong>num_negatives</strong> (<code>int</code>, <em>optional</em>, defaults to 100) &#x2014;
Number of negative samples for the contrastive loss.`,name:"num_negatives"},{anchor:"transformers.UniSpeechConfig.codevector_dim",description:`<strong>codevector_dim</strong> (<code>int</code>, <em>optional</em>, defaults to 256) &#x2014;
Dimensionality of the quantized feature vectors.`,name:"codevector_dim"},{anchor:"transformers.UniSpeechConfig.proj_codevector_dim",description:`<strong>proj_codevector_dim</strong> (<code>int</code>, <em>optional</em>, defaults to 256) &#x2014;
Dimensionality of the final projection of both the quantized and the transformer features.`,name:"proj_codevector_dim"},{anchor:"transformers.UniSpeechConfig.diversity_loss_weight",description:`<strong>diversity_loss_weight</strong> (<code>float</code>, <em>optional</em>, defaults to 0.1) &#x2014;
The weight of the codebook diversity loss component.`,name:"diversity_loss_weight"},{anchor:"transformers.UniSpeechConfig.ctc_loss_reduction",description:`<strong>ctc_loss_reduction</strong> (<code>str</code>, <em>optional</em>, defaults to <code>&quot;mean&quot;</code>) &#x2014;
Specifies the reduction to apply to the output of <code>torch.nn.CTCLoss</code>. Only relevant when training an
instance of <a href="/docs/transformers/v4.47.1/en/model_doc/unispeech#transformers.UniSpeechForCTC">UniSpeechForCTC</a>.`,name:"ctc_loss_reduction"},{anchor:"transformers.UniSpeechConfig.ctc_zero_infinity",description:`<strong>ctc_zero_infinity</strong> (<code>bool</code>, <em>optional</em>, defaults to <code>False</code>) &#x2014;
Whether to zero infinite losses and the associated gradients of <code>torch.nn.CTCLoss</code>. Infinite losses mainly
occur when the inputs are too short to be aligned to the targets. Only relevant when training an instance
of <a href="/docs/transformers/v4.47.1/en/model_doc/unispeech#transformers.UniSpeechForCTC">UniSpeechForCTC</a>.`,name:"ctc_zero_infinity"},{anchor:"transformers.UniSpeechConfig.use_weighted_layer_sum",description:`<strong>use_weighted_layer_sum</strong> (<code>bool</code>, <em>optional</em>, defaults to <code>False</code>) &#x2014;
Whether to use a weighted average of layer outputs with learned weights. Only relevant when using an
instance of <a href="/docs/transformers/v4.47.1/en/model_doc/unispeech#transformers.UniSpeechForSequenceClassification">UniSpeechForSequenceClassification</a>.`,name:"use_weighted_layer_sum"},{anchor:"transformers.UniSpeechConfig.classifier_proj_size",description:`<strong>classifier_proj_size</strong> (<code>int</code>, <em>optional</em>, defaults to 256) &#x2014;
Dimensionality of the projection before token mean-pooling for classification.`,name:"classifier_proj_size"},{anchor:"transformers.UniSpeechConfig.num_ctc_classes",description:`<strong>num_ctc_classes</strong> (<code>int</code>, <em>optional</em>, defaults to 80) &#x2014;
Specifies the number of classes (phoneme tokens and blank token) for phoneme-level CTC loss. Only relevant
when using an instance of <a href="/docs/transformers/v4.47.1/en/model_doc/unispeech#transformers.UniSpeechForPreTraining">UniSpeechForPreTraining</a>.`,name:"num_ctc_classes"},{anchor:"transformers.UniSpeechConfig.pad_token_id",description:`<strong>pad_token_id</strong> (<code>int</code>, <em>optional</em>, defaults to 0) &#x2014;
The id of the padding token.`,name:"pad_token_id"},{anchor:"transformers.UniSpeechConfig.bos_token_id",description:`<strong>bos_token_id</strong> (<code>int</code>, <em>optional</em>, defaults to 1) &#x2014;
The id of the &#x201C;beginning-of-sequence&#x201D; token.`,name:"bos_token_id"},{anchor:"transformers.UniSpeechConfig.eos_token_id",description:`<strong>eos_token_id</strong> (<code>int</code>, <em>optional</em>, defaults to 2) &#x2014;
The id of the &#x201C;end-of-sequence&#x201D; token.`,name:"eos_token_id"},{anchor:"transformers.UniSpeechConfig.replace_prob",description:`<strong>replace_prob</strong> (<code>float</code>, <em>optional</em>, defaults to 0.5) &#x2014;
Probability that the transformer feature is replaced by the quantized feature during pretraining.`,name:"replace_prob"}],source:"https://github.com/huggingface/transformers/blob/v4.47.1/src/transformers/models/unispeech/configuration_unispeech.py#L27"}}),P=new De({props:{anchor:"transformers.UniSpeechConfig.example",$$slots:{default:[Zo]},$$scope:{ctx:U}}}),he=new B({props:{title:"UniSpeech specific outputs",local:"transformers.models.unispeech.modeling_unispeech.UniSpeechForPreTrainingOutput",headingTag:"h2"}}),ue=new L({props:{name:"class transformers.models.unispeech.modeling_unispeech.UniSpeechForPreTrainingOutput",anchor:"transformers.models.unispeech.modeling_unispeech.UniSpeechForPreTrainingOutput",parameters:[{name:"loss",val:": typing.Optional[torch.FloatTensor] = None"},{name:"projected_states",val:": FloatTensor = None"},{name:"projected_quantized_states",val:": FloatTensor = None"},{name:"codevector_perplexity",val:": FloatTensor = None"},{name:"hidden_states",val:": typing.Optional[typing.Tuple[torch.FloatTensor]] = None"},{name:"attentions",val:": typing.Optional[typing.Tuple[torch.FloatTensor]] = None"}],parametersDescription:[{anchor:"transformers.models.unispeech.modeling_unispeech.UniSpeechForPreTrainingOutput.loss",description:`<strong>loss</strong> (<em>optional</em>, returned when model is in train mode, <code>torch.FloatTensor</code> of shape <code>(1,)</code>) &#x2014;
Total loss as the sum of the contrastive loss (L_m) and the diversity loss (L_d) as stated in the <a href="https://arxiv.org/pdf/2006.11477.pdf" rel="nofollow">official
paper</a>.`,name:"loss"},{anchor:"transformers.models.unispeech.modeling_unispeech.UniSpeechForPreTrainingOutput.projected_states",description:`<strong>projected_states</strong> (<code>torch.FloatTensor</code> of shape <code>(batch_size, sequence_length, config.proj_codevector_dim)</code>) &#x2014;
Hidden-states of the model projected to <em>config.proj_codevector_dim</em> that can be used to predict the masked
projected quantized states.`,name:"projected_states"},{anchor:"transformers.models.unispeech.modeling_unispeech.UniSpeechForPreTrainingOutput.projected_quantized_states",description:`<strong>projected_quantized_states</strong> (<code>torch.FloatTensor</code> of shape <code>(batch_size, sequence_length, config.proj_codevector_dim)</code>) &#x2014;
Quantized extracted feature vectors projected to <em>config.proj_codevector_dim</em> representing the positive
target vectors for contrastive loss.`,name:"projected_quantized_states"},{anchor:"transformers.models.unispeech.modeling_unispeech.UniSpeechForPreTrainingOutput.hidden_states",description:`<strong>hidden_states</strong> (<code>tuple(torch.FloatTensor)</code>, <em>optional</em>, returned when <code>output_hidden_states=True</code> is passed or when <code>config.output_hidden_states=True</code>) &#x2014;
Tuple of <code>torch.FloatTensor</code> (one for the output of the embeddings + one for the output of each layer) of
shape <code>(batch_size, sequence_length, hidden_size)</code>.</p>
<p>Hidden-states of the model at the output of each layer plus the initial embedding outputs.`,name:"hidden_states"},{anchor:"transformers.models.unispeech.modeling_unispeech.UniSpeechForPreTrainingOutput.attentions",description:`<strong>attentions</strong> (<code>tuple(torch.FloatTensor)</code>, <em>optional</em>, returned when <code>output_attentions=True</code> is passed or when <code>config.output_attentions=True</code>) &#x2014;
Tuple of <code>torch.FloatTensor</code> (one for each layer) of shape <code>(batch_size, num_heads, sequence_length, sequence_length)</code>.</p>
<p>Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
heads.`,name:"attentions"}],source:"https://github.com/huggingface/transformers/blob/v4.47.1/src/transformers/models/unispeech/modeling_unispeech.py#L67"}}),fe=new B({props:{title:"UniSpeechModel",local:"transformers.UniSpeechModel",headingTag:"h2"}}),ge=new L({props:{name:"class transformers.UniSpeechModel",anchor:"transformers.UniSpeechModel",parameters:[{name:"config",val:": UniSpeechConfig"}],parametersDescription:[{anchor:"transformers.UniSpeechModel.config",description:`<strong>config</strong> (<a href="/docs/transformers/v4.47.1/en/model_doc/unispeech#transformers.UniSpeechConfig">UniSpeechConfig</a>) &#x2014; Model configuration class with all the parameters of the model.
Initializing with a config file does not load the weights associated with the model, only the
configuration. Check out the <a href="/docs/transformers/v4.47.1/en/main_classes/model#transformers.PreTrainedModel.from_pretrained">from_pretrained()</a> method to load the model weights.`,name:"config"}],source:"https://github.com/huggingface/transformers/blob/v4.47.1/src/transformers/models/unispeech/modeling_unispeech.py#L1344"}}),_e=new L({props:{name:"forward",anchor:"transformers.UniSpeechModel.forward",parameters:[{name:"input_values",val:": typing.Optional[torch.Tensor]"},{name:"attention_mask",val:": typing.Optional[torch.Tensor] = None"},{name:"mask_time_indices",val:": typing.Optional[torch.FloatTensor] = None"},{name:"output_attentions",val:": typing.Optional[bool] = None"},{name:"output_hidden_states",val:": typing.Optional[bool] = None"},{name:"return_dict",val:": typing.Optional[bool] = None"}],parametersDescription:[{anchor:"transformers.UniSpeechModel.forward.input_values",description:`<strong>input_values</strong> (<code>torch.FloatTensor</code> of shape <code>(batch_size, sequence_length)</code>) &#x2014;
Float values of input raw speech waveform. Values can be obtained by loading a <code>.flac</code> or <code>.wav</code> audio file
into an array of type <code>List[float]</code> or a <code>numpy.ndarray</code>, <em>e.g.</em> via the soundfile library (<code>pip install soundfile</code>). To prepare the array into <code>input_values</code>, the <a href="/docs/transformers/v4.47.1/en/model_doc/auto#transformers.AutoProcessor">AutoProcessor</a> should be used for padding and
conversion into a tensor of type <code>torch.FloatTensor</code>. See <a href="/docs/transformers/v4.47.1/en/model_doc/wav2vec2#transformers.Wav2Vec2Processor.__call__">Wav2Vec2Processor.<strong>call</strong>()</a> for details.`,name:"input_values"},{anchor:"transformers.UniSpeechModel.forward.attention_mask",description:`<strong>attention_mask</strong> (<code>torch.LongTensor</code> of shape <code>(batch_size, sequence_length)</code>, <em>optional</em>) &#x2014;
Mask to avoid performing convolution and attention on padding token indices. Mask values selected in <code>[0, 1]</code>:</p>
<ul>
<li>1 for tokens that are <strong>not masked</strong>,</li>
<li>0 for tokens that are <strong>masked</strong>.</li>
</ul>
<p><a href="../glossary#attention-mask">What are attention masks?</a></p>
<div class="course-tip course-tip-orange bg-gradient-to-br dark:bg-gradient-to-r before:border-orange-500 dark:before:border-orange-800 from-orange-50 dark:from-gray-900 to-white dark:to-gray-950 border border-orange-50 text-orange-700 dark:text-gray-400">
						
<p><code>attention_mask</code> should only be passed if the corresponding processor has <code>config.return_attention_mask == True</code>. For all models whose processor has <code>config.return_attention_mask == False</code>, <code>attention_mask</code> should
<strong>not</strong> be passed to avoid degraded performance when doing batched inference. For such models
<code>input_values</code> should simply be padded with 0 and passed without <code>attention_mask</code>. Be aware that these
models also yield slightly different results depending on whether <code>input_values</code> is padded or not.</p>

					</div>`,name:"attention_mask"},{anchor:"transformers.UniSpeechModel.forward.output_attentions",description:`<strong>output_attentions</strong> (<code>bool</code>, <em>optional</em>) &#x2014;
Whether or not to return the attentions tensors of all attention layers. See <code>attentions</code> under returned
tensors for more detail.`,name:"output_attentions"},{anchor:"transformers.UniSpeechModel.forward.output_hidden_states",description:`<strong>output_hidden_states</strong> (<code>bool</code>, <em>optional</em>) &#x2014;
Whether or not to return the hidden states of all layers. See <code>hidden_states</code> under returned tensors for
more detail.`,name:"output_hidden_states"},{anchor:"transformers.UniSpeechModel.forward.return_dict",description:`<strong>return_dict</strong> (<code>bool</code>, <em>optional</em>) &#x2014;
Whether or not to return a <a href="/docs/transformers/v4.47.1/en/main_classes/output#transformers.utils.ModelOutput">ModelOutput</a> instead of a plain tuple.`,name:"return_dict"}],source:"https://github.com/huggingface/transformers/blob/v4.47.1/src/transformers/models/unispeech/modeling_unispeech.py#L1413",returnDescription:`<script context="module">export const metadata = 'undefined';<\/script>


<p>A <a
  href="/docs/transformers/v4.47.1/en/main_classes/output#transformers.modeling_outputs.Wav2Vec2BaseModelOutput"
>transformers.modeling_outputs.Wav2Vec2BaseModelOutput</a> or a tuple of
<code>torch.FloatTensor</code> (if <code>return_dict=False</code> is passed or when <code>config.return_dict=False</code>) comprising various
elements depending on the configuration (<a
  href="/docs/transformers/v4.47.1/en/model_doc/unispeech#transformers.UniSpeechConfig"
>UniSpeechConfig</a>) and inputs.</p>
<ul>
<li>
<p><strong>last_hidden_state</strong> (<code>torch.FloatTensor</code> of shape <code>(batch_size, sequence_length, hidden_size)</code>) — Sequence of hidden-states at the output of the last layer of the model.</p>
</li>
<li>
<p><strong>extract_features</strong> (<code>torch.FloatTensor</code> of shape <code>(batch_size, sequence_length, conv_dim[-1])</code>) — Sequence of extracted feature vectors of the last convolutional layer of the model.</p>
</li>
<li>
<p><strong>hidden_states</strong> (<code>tuple(torch.FloatTensor)</code>, <em>optional</em>, returned when <code>output_hidden_states=True</code> is passed or when <code>config.output_hidden_states=True</code>) — Tuple of <code>torch.FloatTensor</code> (one for the output of the embeddings + one for the output of each layer) of
shape <code>(batch_size, sequence_length, hidden_size)</code>.</p>
<p>Hidden-states of the model at the output of each layer plus the initial embedding outputs.</p>
</li>
<li>
<p><strong>attentions</strong> (<code>tuple(torch.FloatTensor)</code>, <em>optional</em>, returned when <code>output_attentions=True</code> is passed or when <code>config.output_attentions=True</code>) — Tuple of <code>torch.FloatTensor</code> (one for each layer) of shape <code>(batch_size, num_heads, sequence_length, sequence_length)</code>.</p>
<p>Attention weights after the attention softmax, used to compute the weighted average in the self-attention
heads.</p>
</li>
</ul>
`,returnType:`<script context="module">export const metadata = 'undefined';<\/script>


<p><a
  href="/docs/transformers/v4.47.1/en/main_classes/output#transformers.modeling_outputs.Wav2Vec2BaseModelOutput"
>transformers.modeling_outputs.Wav2Vec2BaseModelOutput</a> or <code>tuple(torch.FloatTensor)</code></p>
`}}),Q=new Tt({props:{$$slots:{default:[zo]},$$scope:{ctx:U}}}),E=new De({props:{anchor:"transformers.UniSpeechModel.forward.example",$$slots:{default:[Go]},$$scope:{ctx:U}}}),be=new B({props:{title:"UniSpeechForCTC",local:"transformers.UniSpeechForCTC",headingTag:"h2"}}),ve=new L({props:{name:"class transformers.UniSpeechForCTC",anchor:"transformers.UniSpeechForCTC",parameters:[{name:"config",val:""},{name:"target_lang",val:": typing.Optional[str] = None"}],parametersDescription:[{anchor:"transformers.UniSpeechForCTC.config",description:`<strong>config</strong> (<a href="/docs/transformers/v4.47.1/en/model_doc/unispeech#transformers.UniSpeechConfig">UniSpeechConfig</a>) &#x2014; Model configuration class with all the parameters of the model.
Initializing with a config file does not load the weights associated with the model, only the
configuration. Check out the <a href="/docs/transformers/v4.47.1/en/main_classes/model#transformers.PreTrainedModel.from_pretrained">from_pretrained()</a> method to load the model weights.`,name:"config"},{anchor:"transformers.UniSpeechForCTC.target_lang",description:`<strong>target_lang</strong> (<code>str</code>, <em>optional</em>) &#x2014;
Language id of adapter weights. Adapter weights are stored in the format adapter.&lt;lang&gt;.safetensors or
adapter.&lt;lang&gt;.bin. Only relevant when using an instance of <a href="/docs/transformers/v4.47.1/en/model_doc/unispeech#transformers.UniSpeechForCTC">UniSpeechForCTC</a> with adapters. Uses &#x2018;eng&#x2019;
by default.`,name:"target_lang"}],source:"https://github.com/huggingface/transformers/blob/v4.47.1/src/transformers/models/unispeech/modeling_unispeech.py#L1615"}}),ye=new L({props:{name:"forward",anchor:"transformers.UniSpeechForCTC.forward",parameters:[{name:"input_values",val:": typing.Optional[torch.Tensor]"},{name:"attention_mask",val:": typing.Optional[torch.Tensor] = None"},{name:"output_attentions",val:": typing.Optional[bool] = None"},{name:"output_hidden_states",val:": typing.Optional[bool] = None"},{name:"return_dict",val:": typing.Optional[bool] = None"},{name:"labels",val:": typing.Optional[torch.Tensor] = None"}],parametersDescription:[{anchor:"transformers.UniSpeechForCTC.forward.input_values",description:`<strong>input_values</strong> (<code>torch.FloatTensor</code> of shape <code>(batch_size, sequence_length)</code>) &#x2014;
Float values of input raw speech waveform. Values can be obtained by loading a <code>.flac</code> or <code>.wav</code> audio file
into an array of type <code>List[float]</code> or a <code>numpy.ndarray</code>, <em>e.g.</em> via the soundfile library (<code>pip install soundfile</code>). To prepare the array into <code>input_values</code>, the <a href="/docs/transformers/v4.47.1/en/model_doc/auto#transformers.AutoProcessor">AutoProcessor</a> should be used for padding and
conversion into a tensor of type <code>torch.FloatTensor</code>. See <a href="/docs/transformers/v4.47.1/en/model_doc/wav2vec2#transformers.Wav2Vec2Processor.__call__">Wav2Vec2Processor.__call__()</a> for details.`,name:"input_values"},{anchor:"transformers.UniSpeechForCTC.forward.attention_mask",description:`<strong>attention_mask</strong> (<code>torch.LongTensor</code> of shape <code>(batch_size, sequence_length)</code>, <em>optional</em>) &#x2014;
Mask to avoid performing convolution and attention on padding token indices. Mask values selected in <code>[0, 1]</code>:</p>
<ul>
<li>1 for tokens that are <strong>not masked</strong>,</li>
<li>0 for tokens that are <strong>masked</strong>.</li>
</ul>
<p><a href="../glossary#attention-mask">What are attention masks?</a></p>
<div class="course-tip course-tip-orange bg-gradient-to-br dark:bg-gradient-to-r before:border-orange-500 dark:before:border-orange-800 from-orange-50 dark:from-gray-900 to-white dark:to-gray-950 border border-orange-50 text-orange-700 dark:text-gray-400">
						
<p><code>attention_mask</code> should only be passed if the corresponding processor has <code>config.return_attention_mask == True</code>. For all models whose processor has <code>config.return_attention_mask == False</code>, <code>attention_mask</code> should
<strong>not</strong> be passed to avoid degraded performance when doing batched inference. For such models
<code>input_values</code> should simply be padded with 0 and passed without <code>attention_mask</code>. Be aware that these
models also yield slightly different results depending on whether <code>input_values</code> is padded or not.</p>

					</div>`,name:"attention_mask"},{anchor:"transformers.UniSpeechForCTC.forward.output_attentions",description:`<strong>output_attentions</strong> (<code>bool</code>, <em>optional</em>) &#x2014;
Whether or not to return the attention tensors of all attention layers. See <code>attentions</code> under returned
tensors for more detail.`,name:"output_attentions"},{anchor:"transformers.UniSpeechForCTC.forward.output_hidden_states",description:`<strong>output_hidden_states</strong> (<code>bool</code>, <em>optional</em>) &#x2014;
Whether or not to return the hidden states of all layers. See <code>hidden_states</code> under returned tensors for
more detail.`,name:"output_hidden_states"},{anchor:"transformers.UniSpeechForCTC.forward.return_dict",description:`<strong>return_dict</strong> (<code>bool</code>, <em>optional</em>) &#x2014;
Whether or not to return a <a href="/docs/transformers/v4.47.1/en/main_classes/output#transformers.utils.ModelOutput">ModelOutput</a> instead of a plain tuple.`,name:"return_dict"},{anchor:"transformers.UniSpeechForCTC.forward.labels",description:`<strong>labels</strong> (<code>torch.LongTensor</code> of shape <code>(batch_size, target_length)</code>, <em>optional</em>) &#x2014;
Labels for connectionist temporal classification. Note that <code>target_length</code> has to be smaller than or equal to
the sequence length of the output logits. Indices are selected in <code>[-100, 0, ..., config.vocab_size - 1]</code>.
All labels set to <code>-100</code> are ignored (masked); the loss is only computed for labels in <code>[0, ..., config.vocab_size - 1]</code>.`,name:"labels"}],source:"https://github.com/huggingface/transformers/blob/v4.47.1/src/transformers/models/unispeech/modeling_unispeech.py#L1698",returnDescription:`<script context="module">export const metadata = 'undefined';<\/script>


<p>A <a
  href="/docs/transformers/v4.47.1/en/main_classes/output#transformers.modeling_outputs.CausalLMOutput"
>transformers.modeling_outputs.CausalLMOutput</a> or a tuple of
<code>torch.FloatTensor</code> (if <code>return_dict=False</code> is passed or when <code>config.return_dict=False</code>) comprising various
elements depending on the configuration (<a
  href="/docs/transformers/v4.47.1/en/model_doc/unispeech#transformers.UniSpeechConfig"
>UniSpeechConfig</a>) and inputs.</p>
<ul>
<li>
<p><strong>loss</strong> (<code>torch.FloatTensor</code> of shape <code>(1,)</code>, <em>optional</em>, returned when <code>labels</code> is provided) — Connectionist temporal classification (CTC) loss.</p>
</li>
<li>
<p><strong>logits</strong> (<code>torch.FloatTensor</code> of shape <code>(batch_size, sequence_length, config.vocab_size)</code>) — Prediction scores of the language modeling head (scores for each vocabulary token before SoftMax).</p>
</li>
<li>
<p><strong>hidden_states</strong> (<code>tuple(torch.FloatTensor)</code>, <em>optional</em>, returned when <code>output_hidden_states=True</code> is passed or when <code>config.output_hidden_states=True</code>) — Tuple of <code>torch.FloatTensor</code> (one for the output of the embeddings, if the model has an embedding layer, +
one for the output of each layer) of shape <code>(batch_size, sequence_length, hidden_size)</code>.</p>
<p>Hidden-states of the model at the output of each layer plus the optional initial embedding outputs.</p>
</li>
<li>
<p><strong>attentions</strong> (<code>tuple(torch.FloatTensor)</code>, <em>optional</em>, returned when <code>output_attentions=True</code> is passed or when <code>config.output_attentions=True</code>) — Tuple of <code>torch.FloatTensor</code> (one for each layer) of shape <code>(batch_size, num_heads, sequence_length, sequence_length)</code>.</p>
<p>Attention weights after the attention softmax, used to compute the weighted average in the self-attention
heads.</p>
</li>
</ul>
`,returnType:`<script context="module">export const metadata = 'undefined';<\/script>


<p><a
  href="/docs/transformers/v4.47.1/en/main_classes/output#transformers.modeling_outputs.CausalLMOutput"
>transformers.modeling_outputs.CausalLMOutput</a> or <code>tuple(torch.FloatTensor)</code></p>
`}}),D=new Tt({props:{$$slots:{default:[qo]},$$scope:{ctx:U}}}),O=new De({props:{anchor:"transformers.UniSpeechForCTC.forward.example",$$slots:{default:[Vo]},$$scope:{ctx:U}}}),Te=new B({props:{title:"UniSpeechForSequenceClassification",local:"transformers.UniSpeechForSequenceClassification",headingTag:"h2"}}),we=new L({props:{name:"class transformers.UniSpeechForSequenceClassification",anchor:"transformers.UniSpeechForSequenceClassification",parameters:[{name:"config",val:""}],parametersDescription:[{anchor:"transformers.UniSpeechForSequenceClassification.config",description:`<strong>config</strong> (<a href="/docs/transformers/v4.47.1/en/model_doc/unispeech#transformers.UniSpeechConfig">UniSpeechConfig</a>) &#x2014; Model configuration class with all the parameters of the model.
Initializing with a config file does not load the weights associated with the model, only the
configuration. Check out the <a href="/docs/transformers/v4.47.1/en/main_classes/model#transformers.PreTrainedModel.from_pretrained">from_pretrained()</a> method to load the model weights.`,name:"config"}],source:"https://github.com/huggingface/transformers/blob/v4.47.1/src/transformers/models/unispeech/modeling_unispeech.py#L1777"}}),Ue=new L({props:{name:"forward",anchor:"transformers.UniSpeechForSequenceClassification.forward",parameters:[{name:"input_values",val:": typing.Optional[torch.Tensor]"},{name:"attention_mask",val:": typing.Optional[torch.Tensor] = None"},{name:"output_attentions",val:": typing.Optional[bool] = None"},{name:"output_hidden_states",val:": typing.Optional[bool] = None"},{name:"return_dict",val:": typing.Optional[bool] = None"},{name:"labels",val:": typing.Optional[torch.Tensor] = None"}],parametersDescription:[{anchor:"transformers.UniSpeechForSequenceClassification.forward.input_values",description:`<strong>input_values</strong> (<code>torch.FloatTensor</code> of shape <code>(batch_size, sequence_length)</code>) &#x2014;
Float values of input raw speech waveform. Values can be obtained by loading a <code>.flac</code> or <code>.wav</code> audio file
into an array of type <code>List[float]</code> or a <code>numpy.ndarray</code>, <em>e.g.</em> via the soundfile library (<code>pip install soundfile</code>). To prepare the array into <code>input_values</code>, the <a href="/docs/transformers/v4.47.1/en/model_doc/auto#transformers.AutoProcessor">AutoProcessor</a> should be used for padding and
conversion into a tensor of type <code>torch.FloatTensor</code>. See <a href="/docs/transformers/v4.47.1/en/model_doc/wav2vec2#transformers.Wav2Vec2Processor.__call__">Wav2Vec2Processor.__call__()</a> for details.`,name:"input_values"},{anchor:"transformers.UniSpeechForSequenceClassification.forward.attention_mask",description:`<strong>attention_mask</strong> (<code>torch.LongTensor</code> of shape <code>(batch_size, sequence_length)</code>, <em>optional</em>) &#x2014;
Mask to avoid performing convolution and attention on padding token indices. Mask values selected in <code>[0, 1]</code>:</p>
<ul>
<li>1 for tokens that are <strong>not masked</strong>,</li>
<li>0 for tokens that are <strong>masked</strong>.</li>
</ul>
<p><a href="../glossary#attention-mask">What are attention masks?</a></p>
<div class="course-tip course-tip-orange bg-gradient-to-br dark:bg-gradient-to-r before:border-orange-500 dark:before:border-orange-800 from-orange-50 dark:from-gray-900 to-white dark:to-gray-950 border border-orange-50 text-orange-700 dark:text-gray-400">
						
<p><code>attention_mask</code> should only be passed if the corresponding processor has <code>config.return_attention_mask == True</code>. For all models whose processor has <code>config.return_attention_mask == False</code>, <code>attention_mask</code> should
<strong>not</strong> be passed to avoid degraded performance when doing batched inference. For such models
<code>input_values</code> should simply be padded with 0 and passed without <code>attention_mask</code>. Be aware that these
models also yield slightly different results depending on whether <code>input_values</code> is padded or not.</p>

					</div>`,name:"attention_mask"},{anchor:"transformers.UniSpeechForSequenceClassification.forward.output_attentions",description:`<strong>output_attentions</strong> (<code>bool</code>, <em>optional</em>) &#x2014;
Whether or not to return the attention tensors of all attention layers. See <code>attentions</code> under returned
tensors for more detail.`,name:"output_attentions"},{anchor:"transformers.UniSpeechForSequenceClassification.forward.output_hidden_states",description:`<strong>output_hidden_states</strong> (<code>bool</code>, <em>optional</em>) &#x2014;
Whether or not to return the hidden states of all layers. See <code>hidden_states</code> under returned tensors for
more detail.`,name:"output_hidden_states"},{anchor:"transformers.UniSpeechForSequenceClassification.forward.return_dict",description:`<strong>return_dict</strong> (<code>bool</code>, <em>optional</em>) &#x2014;
Whether or not to return a <a href="/docs/transformers/v4.47.1/en/main_classes/output#transformers.utils.ModelOutput">ModelOutput</a> instead of a plain tuple.`,name:"return_dict"},{anchor:"transformers.UniSpeechForSequenceClassification.forward.labels",description:`<strong>labels</strong> (<code>torch.LongTensor</code> of shape <code>(batch_size,)</code>, <em>optional</em>) &#x2014;
Labels for computing the sequence classification/regression loss. Indices should be in <code>[0, ..., config.num_labels - 1]</code>. If <code>config.num_labels == 1</code> a regression loss is computed (Mean-Square loss); if
<code>config.num_labels &gt; 1</code> a classification loss is computed (Cross-Entropy).`,name:"labels"}],source:"https://github.com/huggingface/transformers/blob/v4.47.1/src/transformers/models/unispeech/modeling_unispeech.py#L1832",returnDescription:`<script context="module">export const metadata = 'undefined';<\/script>


<p>A <a
  href="/docs/transformers/v4.47.1/en/main_classes/output#transformers.modeling_outputs.SequenceClassifierOutput"
>transformers.modeling_outputs.SequenceClassifierOutput</a> or a tuple of
<code>torch.FloatTensor</code> (if <code>return_dict=False</code> is passed or when <code>config.return_dict=False</code>) comprising various
elements depending on the configuration (<a
  href="/docs/transformers/v4.47.1/en/model_doc/unispeech#transformers.UniSpeechConfig"
>UniSpeechConfig</a>) and inputs.</p>
<ul>
<li>
<p><strong>loss</strong> (<code>torch.FloatTensor</code> of shape <code>(1,)</code>, <em>optional</em>, returned when <code>labels</code> is provided) — Classification (or regression if config.num_labels==1) loss.</p>
</li>
<li>
<p><strong>logits</strong> (<code>torch.FloatTensor</code> of shape <code>(batch_size, config.num_labels)</code>) — Classification (or regression if config.num_labels==1) scores (before SoftMax).</p>
</li>
<li>
<p><strong>hidden_states</strong> (<code>tuple(torch.FloatTensor)</code>, <em>optional</em>, returned when <code>output_hidden_states=True</code> is passed or when <code>config.output_hidden_states=True</code>) — Tuple of <code>torch.FloatTensor</code> (one for the output of the embeddings, if the model has an embedding layer, +
one for the output of each layer) of shape <code>(batch_size, sequence_length, hidden_size)</code>.</p>
<p>Hidden-states of the model at the output of each layer plus the optional initial embedding outputs.</p>
</li>
<li>
<p><strong>attentions</strong> (<code>tuple(torch.FloatTensor)</code>, <em>optional</em>, returned when <code>output_attentions=True</code> is passed or when <code>config.output_attentions=True</code>) — Tuple of <code>torch.FloatTensor</code> (one for each layer) of shape <code>(batch_size, num_heads, sequence_length, sequence_length)</code>.</p>
<p>Attention weights after the attention softmax, used to compute the weighted average in the self-attention
heads.</p>
</li>
</ul>
`,returnType:`<script context="module">export const metadata = 'undefined';<\/script>


<p><a
  href="/docs/transformers/v4.47.1/en/main_classes/output#transformers.modeling_outputs.SequenceClassifierOutput"
>transformers.modeling_outputs.SequenceClassifierOutput</a> or <code>tuple(torch.FloatTensor)</code></p>
`}}),A=new Tt({props:{$$slots:{default:[No]},$$scope:{ctx:U}}}),K=new De({props:{anchor:"transformers.UniSpeechForSequenceClassification.forward.example",$$slots:{default:[Xo]},$$scope:{ctx:U}}}),Me=new B({props:{title:"UniSpeechForPreTraining",local:"transformers.UniSpeechForPreTraining",headingTag:"h2"}}),Ce=new L({props:{name:"class transformers.UniSpeechForPreTraining",anchor:"transformers.UniSpeechForPreTraining",parameters:[{name:"config",val:": UniSpeechConfig"}],parametersDescription:[{anchor:"transformers.UniSpeechForPreTraining.config",description:`<strong>config</strong> (<a href="/docs/transformers/v4.47.1/en/model_doc/unispeech#transformers.UniSpeechConfig">UniSpeechConfig</a>) &#x2014; Model configuration class with all the parameters of the model.
Initializing with a config file does not load the weights associated with the model, only the
configuration. Check out the <a href="/docs/transformers/v4.47.1/en/main_classes/model#transformers.PreTrainedModel.from_pretrained">from_pretrained()</a> method to load the model weights.`,name:"config"}],source:"https://github.com/huggingface/transformers/blob/v4.47.1/src/transformers/models/unispeech/modeling_unispeech.py#L1469"}}),ke=new L({props:{name:"forward",anchor:"transformers.UniSpeechForPreTraining.forward",parameters:[{name:"input_values",val:": typing.Optional[torch.Tensor]"},{name:"attention_mask",val:": typing.Optional[torch.Tensor] = None"},{name:"output_attentions",val:": typing.Optional[bool] = None"},{name:"output_hidden_states",val:": typing.Optional[bool] = None"},{name:"return_dict",val:": typing.Optional[bool] = None"}],parametersDescription:[{anchor:"transformers.UniSpeechForPreTraining.forward.input_values",description:`<strong>input_values</strong> (<code>torch.FloatTensor</code> of shape <code>(batch_size, sequence_length)</code>) &#x2014;
Float values of input raw speech waveform. Values can be obtained by loading a <code>.flac</code> or <code>.wav</code> audio file
into an array of type <code>List[float]</code> or a <code>numpy.ndarray</code>, <em>e.g.</em> via the soundfile library (<code>pip install soundfile</code>). To prepare the array into <code>input_values</code>, the <a href="/docs/transformers/v4.47.1/en/model_doc/auto#transformers.AutoProcessor">AutoProcessor</a> should be used for padding and
conversion into a tensor of type <code>torch.FloatTensor</code>. See <a href="/docs/transformers/v4.47.1/en/model_doc/wav2vec2#transformers.Wav2Vec2Processor.__call__">Wav2Vec2Processor.__call__()</a> for details.`,name:"input_values"},{anchor:"transformers.UniSpeechForPreTraining.forward.attention_mask",description:`<strong>attention_mask</strong> (<code>torch.LongTensor</code> of shape <code>(batch_size, sequence_length)</code>, <em>optional</em>) &#x2014;
Mask to avoid performing convolution and attention on padding token indices. Mask values selected in <code>[0, 1]</code>:</p>
<ul>
<li>1 for tokens that are <strong>not masked</strong>,</li>
<li>0 for tokens that are <strong>masked</strong>.</li>
</ul>
<p><a href="../glossary#attention-mask">What are attention masks?</a></p>
<div class="course-tip course-tip-orange bg-gradient-to-br dark:bg-gradient-to-r before:border-orange-500 dark:before:border-orange-800 from-orange-50 dark:from-gray-900 to-white dark:to-gray-950 border border-orange-50 text-orange-700 dark:text-gray-400">
						
<p><code>attention_mask</code> should only be passed if the corresponding processor has <code>config.return_attention_mask == True</code>. For all models whose processor has <code>config.return_attention_mask == False</code>, <code>attention_mask</code> should
<strong>not</strong> be passed to avoid degraded performance when doing batched inference. For such models
<code>input_values</code> should simply be padded with 0 and passed without <code>attention_mask</code>. Be aware that these
models also yield slightly different results depending on whether <code>input_values</code> is padded or not.</p>

					</div>`,name:"attention_mask"},{anchor:"transformers.UniSpeechForPreTraining.forward.output_attentions",description:`<strong>output_attentions</strong> (<code>bool</code>, <em>optional</em>) &#x2014;
Whether or not to return the attention tensors of all attention layers. See <code>attentions</code> under returned
tensors for more detail.`,name:"output_attentions"},{anchor:"transformers.UniSpeechForPreTraining.forward.output_hidden_states",description:`<strong>output_hidden_states</strong> (<code>bool</code>, <em>optional</em>) &#x2014;
Whether or not to return the hidden states of all layers. See <code>hidden_states</code> under returned tensors for
more detail.`,name:"output_hidden_states"},{anchor:"transformers.UniSpeechForPreTraining.forward.return_dict",description:`<strong>return_dict</strong> (<code>bool</code>, <em>optional</em>) &#x2014;
Whether or not to return a <a href="/docs/transformers/v4.47.1/en/main_classes/output#transformers.utils.ModelOutput">ModelOutput</a> instead of a plain tuple.`,name:"return_dict"},{anchor:"transformers.UniSpeechForPreTraining.forward.mask_time_indices",description:`<strong>mask_time_indices</strong> (<code>torch.BoolTensor</code> of shape <code>(batch_size, sequence_length)</code>, <em>optional</em>) &#x2014;
Indices to mask extracted features for contrastive loss. When in training mode, the model learns to predict
masked extracted features in <em>config.proj_codevector_dim</em> space.`,name:"mask_time_indices"},{anchor:"transformers.UniSpeechForPreTraining.forward.sampled_negative_indices",description:`<strong>sampled_negative_indices</strong> (<code>torch.BoolTensor</code> of shape <code>(batch_size, sequence_length, num_negatives)</code>, <em>optional</em>) &#x2014;
Indices indicating which quantized target vectors are used as negatively sampled vectors in the contrastive loss.
Required input for pre-training.`,name:"sampled_negative_indices"}],source:"https://github.com/huggingface/transformers/blob/v4.47.1/src/transformers/models/unispeech/modeling_unispeech.py#L1533",returnDescription:`<script context="module">export const metadata = 'undefined';<\/script>


<p>A <a
  href="/docs/transformers/v4.47.1/en/model_doc/unispeech#transformers.models.unispeech.modeling_unispeech.UniSpeechForPreTrainingOutput"
>transformers.models.unispeech.modeling_unispeech.UniSpeechForPreTrainingOutput</a> or a tuple of
<code>torch.FloatTensor</code> (if <code>return_dict=False</code> is passed or when <code>config.return_dict=False</code>) comprising various
elements depending on the configuration (<a
  href="/docs/transformers/v4.47.1/en/model_doc/unispeech#transformers.UniSpeechConfig"
>UniSpeechConfig</a>) and inputs.</p>
<ul>
<li>
<p><strong>loss</strong> (<em>optional</em>, returned when the model is in training mode, <code>torch.FloatTensor</code> of shape <code>(1,)</code>) — Total loss as the sum of the contrastive loss (L_m) and the diversity loss (L_d) as stated in the <a
  href="https://arxiv.org/pdf/2006.11477.pdf"
  rel="nofollow"
>official
paper</a>.</p>
</li>
<li>
<p><strong>projected_states</strong> (<code>torch.FloatTensor</code> of shape <code>(batch_size, sequence_length, config.proj_codevector_dim)</code>) — Hidden-states of the model projected to <em>config.proj_codevector_dim</em> that can be used to predict the masked
projected quantized states.</p>
</li>
<li>
<p><strong>projected_quantized_states</strong> (<code>torch.FloatTensor</code> of shape <code>(batch_size, sequence_length, config.proj_codevector_dim)</code>) — Quantized extracted feature vectors projected to <em>config.proj_codevector_dim</em> representing the positive
target vectors for contrastive loss.</p>
</li>
<li>
<p><strong>hidden_states</strong> (<code>tuple(torch.FloatTensor)</code>, <em>optional</em>, returned when <code>output_hidden_states=True</code> is passed or when <code>config.output_hidden_states=True</code>) — Tuple of <code>torch.FloatTensor</code> (one for the output of the embeddings + one for the output of each layer) of
shape <code>(batch_size, sequence_length, hidden_size)</code>.</p>
<p>Hidden-states of the model at the output of each layer plus the initial embedding outputs.</p>
</li>
<li>
<p><strong>attentions</strong> (<code>tuple(torch.FloatTensor)</code>, <em>optional</em>, returned when <code>output_attentions=True</code> is passed or when <code>config.output_attentions=True</code>) — Tuple of <code>torch.FloatTensor</code> (one for each layer) of shape <code>(batch_size, num_heads, sequence_length, sequence_length)</code>.</p>
<p>Attention weights after the attention softmax, used to compute the weighted average in the self-attention
heads.</p>
</li>
</ul>
`,returnType:`<script context="module">export const metadata = 'undefined';<\/script>


<p><a
  href="/docs/transformers/v4.47.1/en/model_doc/unispeech#transformers.models.unispeech.modeling_unispeech.UniSpeechForPreTrainingOutput"
>transformers.models.unispeech.modeling_unispeech.UniSpeechForPreTrainingOutput</a> or <code>tuple(torch.FloatTensor)</code></p>
`}}),ee=new Tt({props:{$$slots:{default:[Ro]},$$scope:{ctx:U}}}),te=new De({props:{anchor:"transformers.UniSpeechForPreTraining.forward.example",$$slots:{default:[Io]},$$scope:{ctx:U}}}),$e=new Wo({props:{source:"https://github.com/huggingface/transformers/blob/main/docs/source/en/model_doc/unispeech.md"}}),{c(){t=p("meta"),T=a(),l=p("p"),d=a(),h(y.$$.fragment),o=a(),h(w.$$.fragment),Ae=a(),ne=p("p"),ne.innerHTML=to,Ke=a(),se=p("p"),se.textContent=oo,et=a(),ae=p("p"),ae.innerHTML=no,tt=a(),re=p("p"),re.innerHTML=so,ot=a(),h(ie.$$.fragment),nt=a(),le=p("ul"),le.innerHTML=ao,st=a(),h(ce.$$.fragment),at=a(),de=p("ul"),de.innerHTML=ro,rt=a(),h(pe.$$.fragment),it=a(),j=p("div"),h(me.$$.fragment),wt=a(),je=p("p"),je.innerHTML=io,Ut=a(),xe=p("p"),xe.innerHTML=lo,Mt=a(),h(P.$$.fragment),lt=a(),h(he.$$.fragment),ct=a(),H=p("div"),h(ue.$$.fragment),Ct=a(),Fe=p("p"),Fe.innerHTML=co,dt=a(),h(fe.$$.fragment),pt=a(),C=p("div"),h(ge.$$.fragment),kt=a(),Je=p("p"),Je.innerHTML=po,$t=a(),We=p("p"),We.innerHTML=mo,St=a(),Ze=p("p"),Ze.innerHTML=ho,jt=a(),W=p("div"),h(_e.$$.fragment),xt=a(),ze=p("p"),ze.innerHTML=uo,Ft=a(),h(Q.$$.fragment),Jt=a(),h(E.$$.fragment),mt=a(),h(be.$$.fragment),ht=a(),k=p("div"),h(ve.$$.fragment),Wt=a(),Ge=p("p"),Ge.innerHTML=fo,Zt=a(),qe=p("p"),qe.innerHTML=go,zt=a(),Ve=p("p"),Ve.innerHTML=_o,Gt=a(),Z=p("div"),h(ye.$$.fragment),qt=a(),Ne=p("p"),Ne.innerHTML=bo,Vt=a(),h(D.$$.fragment),Nt=a(),h(O.$$.fragment),ut=a(),h(Te.$$.fragment),ft=a(),M=p("div"),h(we.$$.fragment),Xt=a(),Xe=p("p"),Xe.textContent=vo,Rt=a(),Re=p("p"),Re.innerHTML=yo,It=a(),Ie=p("p"),Ie.innerHTML=To,Lt=a(),Le=p("p"),Le.innerHTML=wo,Bt=a(),z=p("div"),h(Ue.$$.fragment),Ht=a(),Be=p("p"),Be.innerHTML=Uo,Yt=a(),h(A.$$.fragment),Pt=a(),h(K.$$.fragment),gt=a(),h(Me.$$.fragment),_t=a(),$=p("div"),h(Ce.$$.fragment),Qt=a(),He=p("p"),He.innerHTML=Mo,Et=a(),Ye=p("p"),Ye.innerHTML=Co,Dt=a(),Pe=p("p"),Pe.innerHTML=ko,Ot=a(),G=p("div"),h(ke.$$.fragment),At=a(),Qe=p("p"),Qe.innerHTML=$o,Kt=a(),h(ee.$$.fragment),eo=a(),h(te.$$.fragment),bt=a(),h($e.$$.fragment),vt=a(),Ee=p("p"),this.h()},l(e){const n=Jo("svelte-u9bgzb",document.head);t=m(n,"META",{name:!0,content:!0}),n.forEach(s),T=r(e),l=m(e,"P",{}),q(l).forEach(s),d=r(e),u(y.$$.fragment,e),o=r(e),u(w.$$.fragment,e),Ae=r(e),ne=m(e,"P",{"data-svelte-h":!0}),v(ne)!=="svelte-1baphde"&&(ne.innerHTML=to),Ke=r(e),se=m(e,"P",{"data-svelte-h":!0}),v(se)!=="svelte-vfdo9a"&&(se.textContent=oo),et=r(e),ae=m(e,"P",{"data-svelte-h":!0}),v(ae)!=="svelte-1fmim7a"&&(ae.innerHTML=no),tt=r(e),re=m(e,"P",{"data-svelte-h":!0}),v(re)!=="svelte-b741d9"&&(re.innerHTML=so),ot=r(e),u(ie.$$.fragment,e),nt=r(e),le=m(e,"UL",{"data-svelte-h":!0}),v(le)!=="svelte-rmbmdt"&&(le.innerHTML=ao),st=r(e),u(ce.$$.fragment,e),at=r(e),de=m(e,"UL",{"data-svelte-h":!0}),v(de)!=="svelte-11qmliz"&&(de.innerHTML=ro),rt=r(e),u(pe.$$.fragment,e),it=r(e),j=m(e,"DIV",{class:!0});var N=q(j);u(me.$$.fragment,N),wt=r(N),je=m(N,"P",{"data-svelte-h":!0}),v(je)!=="svelte-gx7t9i"&&(je.innerHTML=io),Ut=r(N),xe=m(N,"P",{"data-svelte-h":!0}),v(xe)!=="svelte-jt0smb"&&(xe.innerHTML=lo),Mt=r(N),u(P.$$.fragment,N),N.forEach(s),lt=r(e),u(he.$$.fragment,e),ct=r(e),H=m(e,"DIV",{class:!0});var Se=q(H);u(ue.$$.fragment,Se),Ct=r(Se),Fe=m(Se,"P",{"data-svelte-h":!0}),v(Fe)!=="svelte-5t8cec"&&(Fe.innerHTML=co),Se.forEach(s),dt=r(e),u(fe.$$.fragment,e),pt=r(e),C=m(e,"DIV",{class:!0});var 
x=q(C);u(ge.$$.fragment,x),kt=r(x),Je=m(x,"P",{"data-svelte-h":!0}),v(Je)!=="svelte-1nzv48r"&&(Je.innerHTML=po),$t=r(x),We=m(x,"P",{"data-svelte-h":!0}),v(We)!=="svelte-158kkql"&&(We.innerHTML=mo),St=r(x),Ze=m(x,"P",{"data-svelte-h":!0}),v(Ze)!=="svelte-68lg8f"&&(Ze.innerHTML=ho),jt=r(x),W=m(x,"DIV",{class:!0});var X=q(W);u(_e.$$.fragment,X),xt=r(X),ze=m(X,"P",{"data-svelte-h":!0}),v(ze)!=="svelte-g4j2s2"&&(ze.innerHTML=uo),Ft=r(X),u(Q.$$.fragment,X),Jt=r(X),u(E.$$.fragment,X),X.forEach(s),x.forEach(s),mt=r(e),u(be.$$.fragment,e),ht=r(e),k=m(e,"DIV",{class:!0});var F=q(k);u(ve.$$.fragment,F),Wt=r(F),Ge=m(F,"P",{"data-svelte-h":!0}),v(Ge)!=="svelte-fx6v10"&&(Ge.innerHTML=fo),Zt=r(F),qe=m(F,"P",{"data-svelte-h":!0}),v(qe)!=="svelte-158kkql"&&(qe.innerHTML=go),zt=r(F),Ve=m(F,"P",{"data-svelte-h":!0}),v(Ve)!=="svelte-68lg8f"&&(Ve.innerHTML=_o),Gt=r(F),Z=m(F,"DIV",{class:!0});var R=q(Z);u(ye.$$.fragment,R),qt=r(R),Ne=m(R,"P",{"data-svelte-h":!0}),v(Ne)!=="svelte-1vy38ww"&&(Ne.innerHTML=bo),Vt=r(R),u(D.$$.fragment,R),Nt=r(R),u(O.$$.fragment,R),R.forEach(s),F.forEach(s),ut=r(e),u(Te.$$.fragment,e),ft=r(e),M=m(e,"DIV",{class:!0});var S=q(M);u(we.$$.fragment,S),Xt=r(S),Xe=m(S,"P",{"data-svelte-h":!0}),v(Xe)!=="svelte-awb8qe"&&(Xe.textContent=vo),Rt=r(S),Re=m(S,"P",{"data-svelte-h":!0}),v(Re)!=="svelte-m5ulxa"&&(Re.innerHTML=yo),It=r(S),Ie=m(S,"P",{"data-svelte-h":!0}),v(Ie)!=="svelte-158kkql"&&(Ie.innerHTML=To),Lt=r(S),Le=m(S,"P",{"data-svelte-h":!0}),v(Le)!=="svelte-68lg8f"&&(Le.innerHTML=wo),Bt=r(S),z=m(S,"DIV",{class:!0});var I=q(z);u(Ue.$$.fragment,I),Ht=r(I),Be=m(I,"P",{"data-svelte-h":!0}),v(Be)!=="svelte-p2wl4s"&&(Be.innerHTML=Uo),Yt=r(I),u(A.$$.fragment,I),Pt=r(I),u(K.$$.fragment,I),I.forEach(s),S.forEach(s),gt=r(e),u(Me.$$.fragment,e),_t=r(e),$=m(e,"DIV",{class:!0});var J=q($);u(Ce.$$.fragment,J),Qt=r(J),He=m(J,"P",{"data-svelte-h":!0}),v(He)!=="svelte-1odph5s"&&(He.innerHTML=Mo),Et=r(J),Ye=m(J,"P",{"data-svelte-h":!0}),v(Ye)!=="svelte-158kkql"&&(Ye.innerHTML=Co),Dt=r(J),Pe=m(J,"P",{"data-svelte-h":!0}),v(Pe)!=="svelte-68lg8f"&&(Pe.innerHTML=ko),Ot=r(J),G=m(J,"DIV",{class:!0});var oe=q(G);u(ke.$$.fragment,oe),At=r(oe),Qe=m(oe,"P",{"data-svelte-h":!0}),v(Qe)!=="svelte-1a3rs2w"&&(Qe.innerHTML=$o),Kt=r(oe),u(ee.$$.fragment,oe),eo=r(oe),u(te.$$.fragment,oe),oe.forEach(s),J.forEach(s),bt=r(e),u($e.$$.fragment,e),vt=r(e),Ee=m(e,"P",{}),q(Ee).forEach(s),this.h()},h(){V(t,"name","hf:doc:metadata"),V(t,"content",Bo),V(j,"class","docstring border-l-2 border-t-2 pl-4 pt-3.5 border-gray-100 rounded-tl-xl mb-6 mt-8"),V(H,"class","docstring border-l-2 border-t-2 pl-4 pt-3.5 border-gray-100 rounded-tl-xl mb-6 mt-8"),V(W,"class","docstring border-l-2 border-t-2 pl-4 pt-3.5 border-gray-100 rounded-tl-xl mb-6 mt-8"),V(C,"class","docstring border-l-2 border-t-2 pl-4 pt-3.5 border-gray-100 rounded-tl-xl mb-6 mt-8"),V(Z,"class","docstring border-l-2 border-t-2 pl-4 pt-3.5 border-gray-100 rounded-tl-xl mb-6 mt-8"),V(k,"class","docstring border-l-2 border-t-2 pl-4 pt-3.5 border-gray-100 rounded-tl-xl mb-6 mt-8"),V(z,"class","docstring border-l-2 border-t-2 pl-4 pt-3.5 border-gray-100 rounded-tl-xl mb-6 mt-8"),V(M,"class","docstring border-l-2 border-t-2 pl-4 pt-3.5 border-gray-100 rounded-tl-xl mb-6 mt-8"),V(G,"class","docstring border-l-2 border-t-2 pl-4 pt-3.5 border-gray-100 rounded-tl-xl mb-6 mt-8"),V($,"class","docstring border-l-2 border-t-2 pl-4 pt-3.5 border-gray-100 rounded-tl-xl mb-6 
mt-8")},m(e,n){i(document.head,t),c(e,T,n),c(e,l,n),c(e,d,n),f(y,e,n),c(e,o,n),f(w,e,n),c(e,Ae,n),c(e,ne,n),c(e,Ke,n),c(e,se,n),c(e,et,n),c(e,ae,n),c(e,tt,n),c(e,re,n),c(e,ot,n),f(ie,e,n),c(e,nt,n),c(e,le,n),c(e,st,n),f(ce,e,n),c(e,at,n),c(e,de,n),c(e,rt,n),f(pe,e,n),c(e,it,n),c(e,j,n),f(me,j,null),i(j,wt),i(j,je),i(j,Ut),i(j,xe),i(j,Mt),f(P,j,null),c(e,lt,n),f(he,e,n),c(e,ct,n),c(e,H,n),f(ue,H,null),i(H,Ct),i(H,Fe),c(e,dt,n),f(fe,e,n),c(e,pt,n),c(e,C,n),f(ge,C,null),i(C,kt),i(C,Je),i(C,$t),i(C,We),i(C,St),i(C,Ze),i(C,jt),i(C,W),f(_e,W,null),i(W,xt),i(W,ze),i(W,Ft),f(Q,W,null),i(W,Jt),f(E,W,null),c(e,mt,n),f(be,e,n),c(e,ht,n),c(e,k,n),f(ve,k,null),i(k,Wt),i(k,Ge),i(k,Zt),i(k,qe),i(k,zt),i(k,Ve),i(k,Gt),i(k,Z),f(ye,Z,null),i(Z,qt),i(Z,Ne),i(Z,Vt),f(D,Z,null),i(Z,Nt),f(O,Z,null),c(e,ut,n),f(Te,e,n),c(e,ft,n),c(e,M,n),f(we,M,null),i(M,Xt),i(M,Xe),i(M,Rt),i(M,Re),i(M,It),i(M,Ie),i(M,Lt),i(M,Le),i(M,Bt),i(M,z),f(Ue,z,null),i(z,Ht),i(z,Be),i(z,Yt),f(A,z,null),i(z,Pt),f(K,z,null),c(e,gt,n),f(Me,e,n),c(e,_t,n),c(e,$,n),f(Ce,$,null),i($,Qt),i($,He),i($,Et),i($,Ye),i($,Dt),i($,Pe),i($,Ot),i($,G),f(ke,G,null),i(G,At),i(G,Qe),i(G,Kt),f(ee,G,null),i(G,eo),f(te,G,null),c(e,bt,n),f($e,e,n),c(e,vt,n),c(e,Ee,n),yt=!0},p(e,[n]){const N={};n&2&&(N.$$scope={dirty:n,ctx:e}),P.$set(N);const Se={};n&2&&(Se.$$scope={dirty:n,ctx:e}),Q.$set(Se);const x={};n&2&&(x.$$scope={dirty:n,ctx:e}),E.$set(x);const X={};n&2&&(X.$$scope={dirty:n,ctx:e}),D.$set(X);const F={};n&2&&(F.$$scope={dirty:n,ctx:e}),O.$set(F);const R={};n&2&&(R.$$scope={dirty:n,ctx:e}),A.$set(R);const S={};n&2&&(S.$$scope={dirty:n,ctx:e}),K.$set(S);const I={};n&2&&(I.$$scope={dirty:n,ctx:e}),ee.$set(I);const J={};n&2&&(J.$$scope={dirty:n,ctx:e}),te.$set(J)},i(e){yt||(g(y.$$.fragment,e),g(w.$$.fragment,e),g(ie.$$.fragment,e),g(ce.$$.fragment,e),g(pe.$$.fragment,e),g(me.$$.fragment,e),g(P.$$.fragment,e),g(he.$$.fragment,e),g(ue.$$.fragment,e),g(fe.$$.fragment,e),g(ge.$$.fragment,e),g(_e.$$.fragment,e),g(Q.$$.fragment,e),g(E.$$.fragment,e),g(be.$$.fragment,e),g(ve.$$.fragment,e),g(ye.$$.fragment,e),g(D.$$.fragment,e),g(O.$$.fragment,e),g(Te.$$.fragment,e),g(we.$$.fragment,e),g(Ue.$$.fragment,e),g(A.$$.fragment,e),g(K.$$.fragment,e),g(Me.$$.fragment,e),g(Ce.$$.fragment,e),g(ke.$$.fragment,e),g(ee.$$.fragment,e),g(te.$$.fragment,e),g($e.$$.fragment,e),yt=!0)},o(e){_(y.$$.fragment,e),_(w.$$.fragment,e),_(ie.$$.fragment,e),_(ce.$$.fragment,e),_(pe.$$.fragment,e),_(me.$$.fragment,e),_(P.$$.fragment,e),_(he.$$.fragment,e),_(ue.$$.fragment,e),_(fe.$$.fragment,e),_(ge.$$.fragment,e),_(_e.$$.fragment,e),_(Q.$$.fragment,e),_(E.$$.fragment,e),_(be.$$.fragment,e),_(ve.$$.fragment,e),_(ye.$$.fragment,e),_(D.$$.fragment,e),_(O.$$.fragment,e),_(Te.$$.fragment,e),_(we.$$.fragment,e),_(Ue.$$.fragment,e),_(A.$$.fragment,e),_(K.$$.fragment,e),_(Me.$$.fragment,e),_(Ce.$$.fragment,e),_(ke.$$.fragment,e),_(ee.$$.fragment,e),_(te.$$.fragment,e),_($e.$$.fragment,e),yt=!1},d(e){e&&(s(T),s(l),s(d),s(o),s(Ae),s(ne),s(Ke),s(se),s(et),s(ae),s(tt),s(re),s(ot),s(nt),s(le),s(st),s(at),s(de),s(rt),s(it),s(j),s(lt),s(ct),s(H),s(dt),s(pt),s(C),s(mt),s(ht),s(k),s(ut),s(ft),s(M),s(gt),s(_t),s($),s(bt),s(vt),s(Ee)),s(t),b(y,e),b(w,e),b(ie,e),b(ce,e),b(pe,e),b(me),b(P),b(he,e),b(ue),b(fe,e),b(ge),b(_e),b(Q),b(E),b(be,e),b(ve),b(ye),b(D),b(O),b(Te,e),b(we),b(Ue),b(A),b(K),b(Me,e),b(Ce),b(ke),b(ee),b(te),b($e,e)}}}const Bo='{"title":"UniSpeech","local":"unispeech","sections":[{"title":"Overview","local":"overview","sections":[],"depth":2},{"title":"Usage 
tips","local":"usage-tips","sections":[],"depth":2},{"title":"Resources","local":"resources","sections":[],"depth":2},{"title":"UniSpeechConfig","local":"transformers.UniSpeechConfig","sections":[],"depth":2},{"title":"UniSpeech specific outputs","local":"transformers.models.unispeech.modeling_unispeech.UniSpeechForPreTrainingOutput","sections":[],"depth":2},{"title":"UniSpeechModel","local":"transformers.UniSpeechModel","sections":[],"depth":2},{"title":"UniSpeechForCTC","local":"transformers.UniSpeechForCTC","sections":[],"depth":2},{"title":"UniSpeechForSequenceClassification","local":"transformers.UniSpeechForSequenceClassification","sections":[],"depth":2},{"title":"UniSpeechForPreTraining","local":"transformers.UniSpeechForPreTraining","sections":[],"depth":2}],"depth":1}';function Ho(U){return jo(()=>{new URLSearchParams(window.location.search).get("fw")}),[]}class Ko extends xo{constructor(t){super(),Fo(this,t,Ho,Lo,So,{})}}export{Ko as component};
