Spaces:

tobiasc
/

conex

Build error

App Files Files Community

conex / espnet2 /tts /duration_calculator.py

tobiasc

Initial commit

ad16788 over 2 years ago

raw

history blame contribute delete

2.28 kB

	# -*- coding: utf-8 -*-

	# Copyright 2020 Nagoya University (Tomoki Hayashi)
	# Apache 2.0 (http://www.apache.org/licenses/LICENSE-2.0)

	"""Duration calculator for ESPnet2."""

	from typing import Tuple

	import torch


	class DurationCalculator(torch.nn.Module):
	    """Duration calculator module.

	    Converts attention weights into a duration for each input (the number
	    of attention rows whose largest weight falls on that input) together
	    with a scalar focus rate (the mean of the row-wise maximum weights).
	    """

	    def __init__(self):
	        """Initialize duration calculator."""
	        super().__init__()

	    @torch.no_grad()
	    def forward(self, att_ws: torch.Tensor) -> Tuple[torch.Tensor, torch.Tensor]:
	        """Convert attention weight to durations.

	        Args:
	            att_ws (Tensor): Attention weight tensor (L, T) or
	                (#layers, #heads, L, T).

	        Returns:
	            LongTensor: Duration of each input (T,).
	            Tensor: Focus rate value.

	        Raises:
	            ValueError: If ``att_ws`` is neither 2- nor 4-dimensional.

	        """
	        duration = self._calculate_duration(att_ws)
	        focus_rate = self._calculate_focus_rete(att_ws)

	        return duration, focus_rate

	    @staticmethod
	    def _calculate_focus_rete(att_ws: torch.Tensor) -> torch.Tensor:
	        """Calculate the focus rate of the attention weights.

	        The method name keeps its historical spelling ("rete") so that
	        existing callers and subclasses continue to work.

	        Args:
	            att_ws (Tensor): Attention weight tensor (L, T) or
	                (#layers, #heads, L, T).

	        Returns:
	            Tensor: Scalar focus rate. For 2-D input this is the mean over
	                rows of the maximum weight in each row. For 4-D input the
	                same value is computed per head and the largest one over
	                all layers and heads is returned.

	        Raises:
	            ValueError: If ``att_ws`` is neither 2- nor 4-dimensional.

	        """
	        n_dims = att_ws.dim()
	        if n_dims == 2:
	            # tacotron 2 case -> (L, T)
	            return att_ws.max(dim=-1)[0].mean()
	        if n_dims == 4:
	            # transformer case -> (#layers, #heads, L, T)
	            return att_ws.max(dim=-1)[0].mean(dim=-1).max()
	        raise ValueError("att_ws should be 2 or 4 dimensional tensor.")

	    @staticmethod
	    def _calculate_duration(att_ws: torch.Tensor) -> torch.Tensor:
	        """Calculate the duration of each input from attention weights.

	        Args:
	            att_ws (Tensor): Attention weight tensor (L, T) or
	                (#layers, #heads, L, T).

	        Returns:
	            LongTensor: Duration of each input (T,). Entry ``i`` counts the
	                rows of the (L, T) attention matrix whose argmax is ``i``.

	        Raises:
	            ValueError: If ``att_ws`` is neither 2- nor 4-dimensional.

	        """
	        n_dims = att_ws.dim()
	        if n_dims == 4:
	            # transformer case -> (#layers, #heads, L, T)
	            # get the most diagonal head according to focus rate
	            att_ws = att_ws.flatten(0, 1)  # (#layers * #heads, L, T)
	            diagonal_scores = att_ws.max(dim=-1)[0].mean(dim=-1)  # (#layers * #heads,)
	            att_ws = att_ws[diagonal_scores.argmax()]  # (L, T)
	        elif n_dims != 2:
	            raise ValueError("att_ws should be 2 or 4 dimensional tensor.")
	        # tacotron 2 case -> (L, T) needs no preprocessing

	        # calculate duration from 2d attention weight
	        # The argmax is computed once and compared against every input index
	        # in a single broadcast, instead of being recomputed per input.
	        peak_idx = att_ws.argmax(dim=-1)  # (L,)
	        input_idx = torch.arange(att_ws.shape[1], device=att_ws.device)  # (T,)
	        durations = peak_idx.unsqueeze(0).eq(input_idx.unsqueeze(1)).sum(dim=-1)
	        return durations.view(-1)