Spaces: Running on T4

yuta0306 committed • Commit 565faca • 0 Parent(s)

first commit

Files changed:
- .gitattributes +35 -0
- .gitignore +4 -0
- Dockerfile +66 -0
- README.md +11 -0
- fam/__init__.py +0 -0
- fam/llm/.fast_inference.py.swp +0 -0
- fam/llm/__init__.py +0 -0
- fam/llm/adapters/__init__.py +2 -0
- fam/llm/adapters/base.py +5 -0
- fam/llm/adapters/flattened_encodec.py +38 -0
- fam/llm/adapters/tilted_encodec.py +45 -0
- fam/llm/decoders.py +99 -0
- fam/llm/enhancers.py +102 -0
- fam/llm/fast_inference.py +151 -0
- fam/llm/fast_inference_utils.py +453 -0
- fam/llm/fast_model.py +261 -0
- fam/llm/inference.py +710 -0
- fam/llm/layers/__init__.py +3 -0
- fam/llm/layers/attn.py +185 -0
- fam/llm/layers/combined.py +52 -0
- fam/llm/layers/layers.py +72 -0
- fam/llm/mixins/__init__.py +2 -0
- fam/llm/mixins/causal.py +511 -0
- fam/llm/mixins/non_causal.py +67 -0
- fam/llm/model.py +524 -0
- fam/llm/sample.py +731 -0
- fam/llm/serving.py +197 -0
- fam/llm/utils.py +91 -0
- fam/py.typed +0 -0
- fam/quantiser/__init__.py +0 -0
- fam/quantiser/audio/__init__.py +0 -0
- fam/quantiser/audio/speaker_encoder/__init__.py +0 -0
- fam/quantiser/audio/speaker_encoder/audio.py +22 -0
- fam/quantiser/audio/speaker_encoder/ckpt/.gitattributes +1 -0
- fam/quantiser/audio/speaker_encoder/ckpt/ckpt.pt +3 -0
- fam/quantiser/audio/speaker_encoder/model.py +123 -0
- fam/quantiser/text/tokenise.py +32 -0
- fam/ui/app.py +201 -0
- requirements.txt +138 -0
- setup.py +6 -0
.gitattributes
ADDED
@@ -0,0 +1,35 @@
+*.7z filter=lfs diff=lfs merge=lfs -text
+*.arrow filter=lfs diff=lfs merge=lfs -text
+*.bin filter=lfs diff=lfs merge=lfs -text
+*.bz2 filter=lfs diff=lfs merge=lfs -text
+*.ckpt filter=lfs diff=lfs merge=lfs -text
+*.ftz filter=lfs diff=lfs merge=lfs -text
+*.gz filter=lfs diff=lfs merge=lfs -text
+*.h5 filter=lfs diff=lfs merge=lfs -text
+*.joblib filter=lfs diff=lfs merge=lfs -text
+*.lfs.* filter=lfs diff=lfs merge=lfs -text
+*.mlmodel filter=lfs diff=lfs merge=lfs -text
+*.model filter=lfs diff=lfs merge=lfs -text
+*.msgpack filter=lfs diff=lfs merge=lfs -text
+*.npy filter=lfs diff=lfs merge=lfs -text
+*.npz filter=lfs diff=lfs merge=lfs -text
+*.onnx filter=lfs diff=lfs merge=lfs -text
+*.ot filter=lfs diff=lfs merge=lfs -text
+*.parquet filter=lfs diff=lfs merge=lfs -text
+*.pb filter=lfs diff=lfs merge=lfs -text
+*.pickle filter=lfs diff=lfs merge=lfs -text
+*.pkl filter=lfs diff=lfs merge=lfs -text
+*.pt filter=lfs diff=lfs merge=lfs -text
+*.pth filter=lfs diff=lfs merge=lfs -text
+*.rar filter=lfs diff=lfs merge=lfs -text
+*.safetensors filter=lfs diff=lfs merge=lfs -text
+saved_model/**/* filter=lfs diff=lfs merge=lfs -text
+*.tar.* filter=lfs diff=lfs merge=lfs -text
+*.tar filter=lfs diff=lfs merge=lfs -text
+*.tflite filter=lfs diff=lfs merge=lfs -text
+*.tgz filter=lfs diff=lfs merge=lfs -text
+*.wasm filter=lfs diff=lfs merge=lfs -text
+*.xz filter=lfs diff=lfs merge=lfs -text
+*.zip filter=lfs diff=lfs merge=lfs -text
+*.zst filter=lfs diff=lfs merge=lfs -text
+*tfevents* filter=lfs diff=lfs merge=lfs -text
.gitignore
ADDED
@@ -0,0 +1,4 @@
+/.venv/
+/models/
+/**/__pycache__/
+/**/*.pyc
Dockerfile
ADDED
@@ -0,0 +1,66 @@
+FROM nvidia/cuda:12.1.1-cudnn8-devel-ubuntu22.04
+
+ENV DEBIAN_FRONTEND=noninteractive
+RUN apt-get update && \
+    apt-get upgrade -y && \
+    apt-get install -y --no-install-recommends \
+    libgirepository1.0-dev \
+    git \
+    git-lfs \
+    wget \
+    curl \
+    # python build dependencies \
+    build-essential \
+    libssl-dev \
+    zlib1g-dev \
+    libbz2-dev \
+    libreadline-dev \
+    libsqlite3-dev \
+    libncursesw5-dev \
+    xz-utils \
+    tk-dev \
+    libxml2-dev \
+    libxmlsec1-dev \
+    libffi-dev \
+    liblzma-dev \
+    # nightly dependencies \
+    libdbus-glib-1-dev \
+    libpng-dev \
+    libjpeg-dev \
+    libcairo2-dev \
+    # gradio dependencies \
+    ffmpeg \
+    # fairseq2 dependencies \
+    libsndfile-dev && \
+    apt-get clean && \
+    rm -rf /var/lib/apt/lists/*
+
+RUN useradd -m -u 1000 user
+USER user
+ENV HOME=/home/user \
+    PATH=/home/user/.local/bin:${PATH}
+WORKDIR ${HOME}/app
+
+COPY --chown=user . $HOME/app
+RUN curl https://pyenv.run | bash
+ENV PATH=${HOME}/.pyenv/shims:${HOME}/.pyenv/bin:${PATH}
+ARG PYTHON_VERSION=3.10.13
+RUN pyenv install ${PYTHON_VERSION} && \
+    pyenv global ${PYTHON_VERSION} && \
+    pyenv rehash && \
+    pip install --no-cache-dir -U pip setuptools wheel
+RUN pip install packaging && \
+    pip install -r ${HOME}/app/requirements.txt && \
+    pip install -U flash-attn gradio spacy transformers fastapi tyro julius audiocraft tiktoken hf-transfer && \
+    pip install -U --pre torch torchaudio --index-url https://download.pytorch.org/whl/nightly/cu121
+ENV PYTHONPATH=${HOME}/app \
+    PYTHONUNBUFFERED=1 \
+    HF_HUB_ENABLE_HF_TRANSFER=1 \
+    GRADIO_ALLOW_FLAGGING=never \
+    GRADIO_NUM_PORTS=1 \
+    GRADIO_SERVER_NAME=0.0.0.0 \
+    GRADIO_THEME=huggingface \
+    TQDM_POSITION=-1 \
+    TQDM_MININTERVAL=1 \
+    SYSTEM=spaces
+CMD python fam/llm/serving.py --huggingface_repo_id kotoba-tech/kotoba-speech-v0.1 & python fam/ui/app.py
README.md
ADDED
@@ -0,0 +1,11 @@
+---
+title: Kotoba Voice Testing
+emoji: 🚀
+colorFrom: red
+colorTo: green
+sdk: docker
+pinned: false
+license: apache-2.0
+---
+
+Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
fam/__init__.py
ADDED
File without changes
fam/llm/.fast_inference.py.swp
ADDED
Binary file (16.4 kB)
fam/llm/__init__.py
ADDED
File without changes
fam/llm/adapters/__init__.py
ADDED
@@ -0,0 +1,2 @@
+from fam.llm.adapters.flattened_encodec import FlattenedInterleavedEncodec2Codebook
+from fam.llm.adapters.tilted_encodec import TiltedEncodec
fam/llm/adapters/base.py
ADDED
@@ -0,0 +1,5 @@
+from abc import ABC
+
+
+class BaseDataAdapter(ABC):
+    pass
fam/llm/adapters/flattened_encodec.py
ADDED
@@ -0,0 +1,38 @@
+from fam.llm.adapters.base import BaseDataAdapter
+
+
+class FlattenedInterleavedEncodec2Codebook(BaseDataAdapter):
+    def __init__(self, end_of_audio_token):
+        self._end_of_audio_token = end_of_audio_token
+
+    def decode(self, tokens: list[list[int]]) -> tuple[list[int], list[list[int]]]:
+        assert len(tokens) == 1
+        tokens = tokens[0]
+
+        text_ids = []
+        extracted_audio_ids = [[], []]
+
+        for t in tokens:
+            if t < self._end_of_audio_token:
+                extracted_audio_ids[0].append(t)
+            elif t >= self._end_of_audio_token and t < 2 * self._end_of_audio_token:
+                extracted_audio_ids[1].append(t - self._end_of_audio_token)
+            # We ignore t = 2 * self._end_of_audio_token, as it is the end of audio token
+            elif t > 2 * self._end_of_audio_token:
+                text_ids.append(t)
+
+        if len(set([len(x) for x in extracted_audio_ids])) != 1:
+            min_len = min([len(x) for x in extracted_audio_ids])
+            max_len = max([len(x) for x in extracted_audio_ids])
+            print("WARNING: Number of tokens at each hierarchy must be of the same length!")
+            print(f"Truncating to min length of {min_len} tokens from {max_len} max.")
+            print([len(x) for x in extracted_audio_ids])
+            extracted_audio_ids = [x[:min_len] for x in extracted_audio_ids]
+
+        return text_ids[:-1], extracted_audio_ids
+
+    def encode(self, text_tokens: list[int], audio_tokens: list[list[int]]):
+        """
+        Performs the required combination and padding as needed.
+        """
+        raise NotImplementedError
fam/llm/adapters/tilted_encodec.py
ADDED
@@ -0,0 +1,45 @@
+from fam.llm.adapters.base import BaseDataAdapter
+
+
+class TiltedEncodec(BaseDataAdapter):
+    def __init__(self, end_of_audio_token):
+        self._end_of_audio_token = end_of_audio_token
+
+    def decode(self, tokens: list[list[int]]) -> tuple[list[int], list[list[int]]]:
+        assert len(tokens) > 1
+
+        text_ids = []
+        extracted_audio_ids = []
+
+        extracted_audio_ids.append([])
+        # Handle first hierarchy as special case as it contains text tokens as well
+        # TODO: maybe it doesn't need a special case, and can be handled on its own :)
+        for t in tokens[0]:
+            if t > self._end_of_audio_token:
+                text_ids.append(t)
+            elif t < self._end_of_audio_token:
+                extracted_audio_ids[0].append(t)
+
+        # Handle the rest of the hierarchies
+        for i in range(1, len(tokens)):
+            token_hierarchy_ids = tokens[i]
+            extracted_audio_ids.append([])
+            for t in token_hierarchy_ids:
+                if t < self._end_of_audio_token:
+                    extracted_audio_ids[i].append(t)
+
+        if len(set([len(x) for x in extracted_audio_ids])) != 1:
+            min_len = min([len(x) for x in extracted_audio_ids])
+            max_len = max([len(x) for x in extracted_audio_ids])
+            print("WARNING: Number of tokens at each hierarchy must be of the same length!")
+            print(f"Truncating to min length of {min_len} tokens from {max_len} max.")
+            print([len(x) for x in extracted_audio_ids])
+            extracted_audio_ids = [x[:min_len] for x in extracted_audio_ids]
+
+        return text_ids[:-1], extracted_audio_ids
+
+    def encode(self, text_tokens: list[int], audio_tokens: list[list[int]]):
+        """
+        Performs the required combination and padding as needed.
+        """
+        raise NotImplementedError
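A quick sanity sketch of the flattened adapter above (the token values are made up for illustration; with end_of_audio_token=1024, ids below 1024 belong to codebook 0, ids in [1024, 2048) are codebook-1 ids shifted up by 1024, 2 * 1024 marks end of audio, and larger ids are text):

    from fam.llm.adapters import FlattenedInterleavedEncodec2Codebook

    adapter = FlattenedInterleavedEncodec2Codebook(end_of_audio_token=1024)
    # interleaved stream: cb0, cb1, cb0, cb1, ..., then text ids
    text_ids, audio_ids = adapter.decode([[5, 1030, 7, 1042, 2050, 2051]])
    assert text_ids == [2050]              # the last text id is dropped by text_ids[:-1]
    assert audio_ids == [[5, 7], [6, 18]]  # codebook-1 ids are shifted back down by 1024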
fam/llm/decoders.py
ADDED
@@ -0,0 +1,99 @@
+import os
+import pathlib
+import uuid
+from abc import ABC, abstractmethod
+from typing import Callable, Optional, Union
+
+import julius
+import torch
+from audiocraft.data.audio import audio_read, audio_write
+from audiocraft.models import MultiBandDiffusion  # type: ignore
+
+
+class Decoder(ABC):
+    @abstractmethod
+    def decode(self, tokens: list[int], ref_audio_path: Optional[str] = None, causal: Optional[bool] = None):
+        raise NotImplementedError
+
+
+class EncodecDecoder(Decoder):
+    def __init__(
+        self,
+        tokeniser_decode_fn: Callable[[list[int]], str],
+        data_adapter_fn: Callable[[list[list[int]]], tuple[list[int], list[list[int]]]],
+        output_dir: str,
+    ):
+        self._mbd_bandwidth = 6  # 1.5
+        self._mbd_sample_rate = 24_000
+        self._end_of_audio_token = 1024
+        self._num_codebooks = 8
+        self.mbd = MultiBandDiffusion.get_mbd_24khz(bw=self._mbd_bandwidth)
+
+        self.tokeniser_decode_fn = tokeniser_decode_fn
+        self._data_adapter_fn = data_adapter_fn
+
+        self.output_dir = pathlib.Path(output_dir).resolve()
+        os.makedirs(self.output_dir, exist_ok=True)
+
+    def _save_audio(self, name: str, wav: torch.Tensor):
+        audio_write(
+            name,
+            wav.squeeze(0).cpu(),
+            self._mbd_sample_rate,
+            strategy="loudness",
+            loudness_compressor=True,
+        )
+
+    def get_tokens(self, audio_path: str) -> list[list[int]]:
+        """
+        Utility method to get tokens from audio. Useful when you want to test reconstruction in some form (e.g.
+        limited codebook reconstruction or sampling from the second stage model only).
+        """
+        wav, sr = audio_read(audio_path)
+        if sr != self._mbd_sample_rate:
+            wav = julius.resample_frac(wav, sr, self._mbd_sample_rate)
+        if wav.ndim == 2:
+            wav = wav.unsqueeze(1)
+        wav = wav.to("cuda")
+        tokens = self.mbd.codec_model.encode(wav)
+        tokens = tokens[0][0]
+        return tokens.tolist()
+
+    def decode(
+        self, tokens: list[list[int]], causal: bool = True, ref_audio_path: Optional[str] = None
+    ) -> Union[str, torch.Tensor]:
+        # TODO: this has strange behaviour -- if causal is True, it returns tokens; if causal is False, it SAVES the audio file.
+        text_ids, extracted_audio_ids = self._data_adapter_fn(tokens)
+        text = self.tokeniser_decode_fn(text_ids)
+        print(f"Text: {text}")
+
+        tokens = torch.tensor(extracted_audio_ids, device="cuda").unsqueeze(0)
+
+        if tokens.shape[1] < self._num_codebooks:
+            tokens = torch.cat(
+                [tokens, *[torch.ones_like(tokens[0:1, 0:1]) * 0] * (self._num_codebooks - tokens.shape[1])], dim=1
+            )
+
+        if causal:
+            return tokens
+        else:
+            with torch.amp.autocast(device_type="cuda", dtype=torch.float32):
+                wav = self.mbd.tokens_to_wav(tokens)
+                # NOTE: we couldn't just return wav here as it goes through loudness compression etc :)
+
+            if wav.shape[-1] < 9600:
+                # this causes a problem for the code below, and is also odd :)
+                # first happened for tokens (1, 8, 28) -> wav (1, 1, 8960) (~320x factor in time dimension!)
+                raise Exception("wav predicted is shorter than 400ms!")
+
+            try:
+                wav_file_name = self.output_dir / f"synth_{text.replace(' ', '_')[:25]}_{uuid.uuid4()}"
+                self._save_audio(wav_file_name, wav)
+                print(f"\nSaved audio to {wav_file_name}.wav")
+                return wav_file_name
+            except Exception as e:
+                print(f"Failed to save audio! Reason: {e}")
+                wav_file_name = self.output_dir / f"synth_{uuid.uuid4()}"
+                self._save_audio(wav_file_name, wav)
+                print(f"\nSaved audio to {wav_file_name}.wav")
+                return wav_file_name
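A hedged sketch of using EncodecDecoder.get_tokens above to tokenise reference audio (constructing the decoder instantiates MultiBandDiffusion, so this needs a CUDA device and the audiocraft weights; "reference.wav" and the stub tokeniser are hypothetical):

    from fam.llm.adapters import TiltedEncodec
    from fam.llm.decoders import EncodecDecoder

    decoder = EncodecDecoder(
        tokeniser_decode_fn=lambda ids: "",  # stub: no text ids in this round-trip test
        data_adapter_fn=TiltedEncodec(end_of_audio_token=1024).decode,
        output_dir="outputs",
    )
    codes = decoder.get_tokens("reference.wav")  # per-codebook lists of EnCodec ids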
fam/llm/enhancers.py
ADDED
@@ -0,0 +1,102 @@
+import os
+from abc import ABC
+from typing import Literal, Optional
+
+from df.enhance import enhance, init_df, load_audio, save_audio
+from pydub import AudioSegment
+
+
+def convert_to_wav(input_file: str, output_file: str):
+    """Convert an audio file to WAV format
+
+    Args:
+        input_file (str): path to input audio file
+        output_file (str): path to output WAV file
+
+    """
+    # Detect the format of the input file
+    format = input_file.split(".")[-1].lower()
+
+    # Read the audio file
+    audio = AudioSegment.from_file(input_file, format=format)
+
+    # Export as WAV
+    audio.export(output_file, format="wav")
+
+
+def make_output_file_path(audio_file: str, tag: str, ext: Optional[str] = None) -> str:
+    """Generate the output file path
+
+    Args:
+        audio_file (str): path to input audio file
+        tag (str): tag to append to the output file name
+        ext (str, optional): extension of the output file. Defaults to None.
+
+    Returns:
+        str: path to output file
+    """
+
+    directory = "./enhanced"
+    # Get the name of the input file
+    filename = os.path.basename(audio_file)
+
+    # Get the name of the input file without the extension
+    filename_without_extension = os.path.splitext(filename)[0]
+
+    # Get the extension of the input file
+    extension = ext or os.path.splitext(filename)[1]
+
+    # Generate the output file path
+    output_file = os.path.join(directory, filename_without_extension + tag + extension)
+
+    return output_file
+
+
+class BaseEnhancer(ABC):
+    """Base class for audio enhancers"""
+
+    def __init__(self, *args, **kwargs):
+        raise NotImplementedError
+
+    def __call__(self, audio_file: str, output_file: Optional[str] = None) -> str:
+        raise NotImplementedError
+
+    def get_output_file(self, audio_file: str, tag: str, ext: Optional[str] = None) -> str:
+        output_file = make_output_file_path(audio_file, tag, ext=ext)
+        os.makedirs(os.path.dirname(output_file), exist_ok=True)
+        return output_file
+
+
+class DFEnhancer(BaseEnhancer):
+    def __init__(self, *args, **kwargs):
+        self.model, self.df_state, _ = init_df()
+
+    def __call__(self, audio_file: str, output_file: Optional[str] = None) -> str:
+        output_file = output_file or self.get_output_file(audio_file, "_df")
+
+        audio, _ = load_audio(audio_file, sr=self.df_state.sr())
+
+        enhanced = enhance(self.model, self.df_state, audio)
+
+        save_audio(output_file, enhanced, self.df_state.sr())
+
+        return output_file
+
+
+def get_enhancer(enhancer_name: Literal["df"]) -> BaseEnhancer:
+    """Get an audio enhancer
+
+    Args:
+        enhancer_name (Literal["df"]): name of the audio enhancer
+
+    Raises:
+        ValueError: if the enhancer name is not recognised
+
+    Returns:
+        BaseEnhancer: audio enhancer
+    """
+
+    if enhancer_name == "df":
+        return DFEnhancer()
+    else:
+        raise ValueError(f"Unknown enhancer name: {enhancer_name}")
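Minimal usage of the enhancer factory above, assuming DeepFilterNet's df package is installed ("speech.wav" is a hypothetical input file):

    from fam.llm.enhancers import get_enhancer

    enhancer = get_enhancer("df")
    cleaned = enhancer("speech.wav")  # writes ./enhanced/speech_df.wav by default
    print(cleaned)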
fam/llm/fast_inference.py
ADDED
@@ -0,0 +1,151 @@
+import argparse
+import os
+import shutil
+import tempfile
+import time
+from pathlib import Path
+
+import librosa
+import torch
+from huggingface_hub import snapshot_download
+
+from fam.llm.adapters import FlattenedInterleavedEncodec2Codebook
+from fam.llm.decoders import EncodecDecoder
+from fam.llm.fast_inference_utils import build_model, main
+from fam.llm.inference import (
+    EncodecDecoder,
+    InferenceConfig,
+    Model,
+    TiltedEncodec,
+    TrainedBPETokeniser,
+    get_cached_embedding,
+    get_cached_file,
+    get_enhancer,
+)
+from fam.llm.utils import (
+    check_audio_file,
+    get_default_dtype,
+    get_device,
+    normalize_text,
+)
+
+
+class TTS:
+    def __init__(
+        self, model_name: str = "kotoba-tech/kotoba-speech-v0.1", *, seed: int = 1337, output_dir: str = "outputs", first_model_path=None,
+    ):
+        """
+        model_name (str): refers to the model identifier from the Hugging Face Model Hub (https://huggingface.co/kotoba-tech/)
+        """
+
+        # NOTE: this needs to come first so that we don't change global state when we want to use
+        # the torch.compiled-model.
+        self._dtype = get_default_dtype()
+        self._device = get_device()
+        self._model_dir = snapshot_download(repo_id=model_name)
+        self.first_stage_adapter = FlattenedInterleavedEncodec2Codebook(end_of_audio_token=1024)
+        self.output_dir = output_dir
+        os.makedirs(self.output_dir, exist_ok=True)
+
+        second_stage_ckpt_path = f"{self._model_dir}/second_stage.pt"
+        config_second_stage = InferenceConfig(
+            ckpt_path=second_stage_ckpt_path,
+            num_samples=1,
+            seed=seed,
+            device=self._device,
+            dtype=self._dtype,
+            compile=False,
+            init_from="resume",
+            output_dir=self.output_dir,
+        )
+        data_adapter_second_stage = TiltedEncodec(end_of_audio_token=1024)
+        self.llm_second_stage = Model(
+            config_second_stage, TrainedBPETokeniser, EncodecDecoder, data_adapter_fn=data_adapter_second_stage.decode
+        )
+        self.enhancer = get_enhancer("df")
+
+        self.precision = {"float16": torch.float16, "bfloat16": torch.bfloat16}[self._dtype]
+        self.model, self.tokenizer, self.smodel, self.model_size = build_model(
+            precision=self.precision,
+            checkpoint_path=Path(f"{self._model_dir}/first_stage.pt"),
+            spk_emb_ckpt_path=Path(f"{self._model_dir}/speaker_encoder.pt"),
+            device=self._device,
+            compile=True,
+            compile_prefill=True,
+            first_model_path=first_model_path,
+        )
+
+
+    def synthesise(self, text: str, spk_ref_path: str, top_p=0.95, guidance_scale=3.0, temperature=1.0) -> str:
+        """
+        text: Text to speak
+        spk_ref_path: Path to speaker reference file. Min. 30s of audio required. Supports both local paths & public URIs. Audio formats: wav, flac & mp3
+        top_p: Top p for sampling applied to first-stage model. Range [0.9, 1.0] is good. This is a measure of speech stability - improves text following for a challenging speaker
+        guidance_scale: Guidance scale [1.0, 3.0] for sampling. This is a measure of speaker similarity - how closely to match speaker identity and speech style.
+        temperature: Temperature for sampling applied to both LLMs (first & second stage)
+
+        returns: path to speech .wav file
+        """
+        text = normalize_text(text)
+        spk_ref_path = get_cached_file(spk_ref_path)
+        check_audio_file(spk_ref_path)
+        spk_emb = get_cached_embedding(
+            spk_ref_path,
+            self.smodel,
+        ).to(device=self._device, dtype=self.precision)
+
+        start = time.time()
+        # first stage LLM
+        tokens = main(
+            model=self.model,
+            tokenizer=self.tokenizer,
+            model_size=self.model_size,
+            prompt=text,
+            spk_emb=spk_emb,
+            top_p=torch.tensor(top_p, device=self._device, dtype=self.precision),
+            guidance_scale=torch.tensor(guidance_scale, device=self._device, dtype=self.precision),
+            temperature=torch.tensor(temperature, device=self._device, dtype=self.precision),
+        )
+        text_ids, extracted_audio_ids = self.first_stage_adapter.decode([tokens])
+
+        b_speaker_embs = spk_emb.unsqueeze(0)
+
+        # second stage LLM + multi-band diffusion model
+        wav_files = self.llm_second_stage(
+            texts=[text],
+            encodec_tokens=[torch.tensor(extracted_audio_ids, dtype=torch.int32, device=self._device).unsqueeze(0)],
+            speaker_embs=b_speaker_embs,
+            batch_size=1,
+            guidance_scale=None,
+            top_p=None,
+            top_k=200,
+            temperature=1.0,
+            max_new_tokens=None,
+        )
+
+        # enhance using deepfilternet
+        wav_file = wav_files[0]
+        with tempfile.NamedTemporaryFile(suffix=".wav") as enhanced_tmp:
+            self.enhancer(str(wav_file) + ".wav", enhanced_tmp.name)
+            shutil.copy2(enhanced_tmp.name, str(wav_file) + ".wav")
+            print(f"\nSaved audio to {wav_file}.wav")
+
+        # calculating real-time factor (RTF)
+        time_to_synth_s = time.time() - start
+        audio, sr = librosa.load(str(wav_file) + ".wav")
+        duration_s = librosa.get_duration(y=audio, sr=sr)
+        print(f"\nTotal time to synth (s): {time_to_synth_s}")
+        print(f"Real-time factor: {time_to_synth_s / duration_s:.2f}")
+
+        return str(wav_file) + ".wav"
+
+
+if __name__ == "__main__":
+    parser = argparse.ArgumentParser(description="Kotoba-Speech text-to-speech demo")
+    parser.add_argument("--in_file", default="/home/data/reazon_large-v2_denoise/large.jsonl", help="Name of the file")
+    parser.add_argument("--text", type=str, default="This is a demo for text-to-speech.", help="input text")
+    parser.add_argument("--spk_ref_path", type=str, default="assets/bria.mp3", help="speaker path")
+    parser.add_argument("--first_model_path", type=str, default=None, help="path to the first model")
+    args = parser.parse_args()
+    tts = TTS(first_model_path=args.first_model_path)
+    tts.synthesise(text=args.text, spk_ref_path=args.spk_ref_path)
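Programmatic use of the TTS class above, mirroring the __main__ block; "my_speaker.wav" is a hypothetical reference clip (the docstring asks for at least 30 s of audio):

    from fam.llm.fast_inference import TTS

    tts = TTS()  # downloads kotoba-tech/kotoba-speech-v0.1 and compiles the first stage
    wav_path = tts.synthesise(
        text="This is a demo for text-to-speech.",
        spk_ref_path="my_speaker.wav",
        top_p=0.95,
        guidance_scale=3.0,
    )
    print(wav_path)  # path to the enhanced .wav file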
fam/llm/fast_inference_utils.py
ADDED
@@ -0,0 +1,453 @@
+# Copyright (c) Kotoba Technologies, Inc. and affiliates.
+# All rights reserved.
+#
+# Redistribution and use in source and binary forms, with or without modification, are permitted
+# provided that the following conditions are met:
+#
+# 1. Redistributions of source code must retain the above copyright notice, this list of
+#    conditions and the following disclaimer.
+#
+# 2. Redistributions in binary form must reproduce the above copyright notice, this
+#    list of conditions and the following disclaimer in the documentation and/or other
+#    materials provided with the distribution.
+#
+# 3. Neither the name of the copyright holder nor the names of its contributors
+#    may be used to endorse or promote products derived from this software without
+#    specific prior written permission.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS “AS IS” AND ANY EXPRESS OR
+# IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND
+# FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR
+# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+# DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+# CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+import itertools
+import gc
+import time
+from pathlib import Path
+from typing import Optional, Tuple
+
+import torch
+import torch._dynamo.config
+import torch._inductor.config
+import tqdm
+
+
+def device_sync(device):
+    if "cuda" in device:
+        torch.cuda.synchronize()
+    elif "cpu" in device:
+        pass
+    else:
+        print(f"device={device} is not yet supported")
+
+
+torch._inductor.config.coordinate_descent_tuning = True
+torch._inductor.config.triton.unique_kernel_names = True
+torch._inductor.config.fx_graph_cache = (
+    True  # Experimental feature to reduce compilation times, will be on by default in future
+)
+
+# imports need to happen after setting above flags
+from fam.llm.fast_model import Transformer
+from fam.quantiser.audio.speaker_encoder.model import SpeakerEncoder
+from fam.quantiser.text.tokenise import TrainedBPETokeniser
+
+
+def multinomial_sample_one_no_sync(
+    probs_sort,
+):  # Does multinomial sampling without a cuda synchronization
+    q = torch.empty_like(probs_sort).exponential_(1)
+    return torch.argmax(probs_sort / q, dim=-1, keepdim=True).to(dtype=torch.int)
+
+
+def top_p_sample(logits: torch.Tensor, top_p: torch.Tensor):
+    # ref: huggingface/transformers
+
+    sorted_logits, sorted_indices = torch.sort(logits, descending=False)
+    cumulative_probs = sorted_logits.softmax(dim=-1).cumsum(dim=-1)
+
+    # Remove tokens with cumulative top_p above the threshold (tokens with 0 are kept)
+    sorted_indices_to_remove = cumulative_probs <= (1 - top_p)
+    # Keep at least min_tokens_to_keep
+    sorted_indices_to_remove[-1:] = 0
+
+    # scatter sorted tensors to original indexing
+    indices_to_remove = sorted_indices_to_remove.scatter(0, sorted_indices, sorted_indices_to_remove)
+    scores = logits.masked_fill(indices_to_remove, -float("Inf"))
+    return scores
+
+
+def logits_to_probs(
+    logits,
+    *,
+    temperature: torch.Tensor,
+    top_p: Optional[torch.Tensor] = None,
+    top_k: Optional[torch.Tensor] = None,
+):
+    logits = logits / torch.max(temperature, 1e-5 * torch.ones_like(temperature))
+
+    if top_k is not None:
+        v, _ = torch.topk(logits, min(top_k, logits.size(-1)))
+        pivot = v.select(-1, -1).unsqueeze(-1)
+        logits = torch.where(logits < pivot, -float("Inf"), logits)
+
+    if top_p is not None:
+        logits = top_p_sample(logits, top_p)
+
+    probs = torch.nn.functional.softmax(logits, dim=-1)
+
+    return probs
+
+
+def sample(
+    logits,
+    guidance_scale: torch.Tensor,
+    temperature: torch.Tensor,
+    top_p: Optional[torch.Tensor] = None,
+    top_k: Optional[torch.Tensor] = None,
+):
+    # (b, t, vocab_size)
+    logits = logits[:, -1]
+    logits_cond, logits_uncond_spkemb = logits.split(logits.size(0) // 2, dim=0)
+    logits = guidance_scale * logits_cond + (1 - guidance_scale) * logits_uncond_spkemb
+    probs = logits_to_probs(logits[0], temperature=temperature, top_p=top_p, top_k=top_k)
+    idx_next = multinomial_sample_one_no_sync(probs)
+    return idx_next, probs
+
+
+def prefill(
+    model: Transformer,
+    x: torch.Tensor,
+    spk_emb: torch.Tensor,
+    input_pos: torch.Tensor,
+    **sampling_kwargs,
+) -> torch.Tensor:
+    # input_pos: [B, S]
+    logits = model(x, spk_emb, input_pos)
+    return sample(logits, **sampling_kwargs)[0]
+
+
+def decode_one_token(
+    model: Transformer,
+    x: torch.Tensor,
+    spk_emb: torch.Tensor,
+    input_pos: torch.Tensor,
+    **sampling_kwargs,
+) -> Tuple[torch.Tensor, torch.Tensor]:
+    # input_pos: [B, 1]
+    assert input_pos.shape[-1] == 1
+    logits = model(x, spk_emb, input_pos)
+    return sample(logits, **sampling_kwargs)
+
+
+def decode_n_tokens(
+    model: Transformer,
+    cur_token: torch.Tensor,
+    spk_emb: torch.Tensor,
+    input_pos: torch.Tensor,
+    num_new_tokens: int,
+    callback=lambda _: _,
+    return_probs: bool = False,
+    end_of_audio_token: int = 2048,
+    **sampling_kwargs,
+):
+    new_tokens, new_probs = [], []
+    for i in tqdm.tqdm(range(num_new_tokens)):
+        if (cur_token == end_of_audio_token).any():
+            break
+        with torch.backends.cuda.sdp_kernel(
+            enable_flash=False, enable_mem_efficient=False, enable_math=True
+        ):  # Actually better for Inductor to codegen attention here
+            next_token, next_prob = decode_one_token(model, cur_token, spk_emb, input_pos, **sampling_kwargs)
+        input_pos += 1
+        new_tokens.append(next_token.clone())
+        callback(new_tokens[-1])
+        if return_probs:
+            new_probs.append(next_prob.clone())
+        cur_token = next_token.view(1, -1).repeat(2, 1)
+
+    return new_tokens, new_probs
+
+
+def model_forward(model, x, spk_emb, input_pos):
+    return model(x, spk_emb, input_pos)
+
+
+@torch.no_grad()
+def generate(
+    model: Transformer,
+    prompt: torch.Tensor,
+    spk_emb: torch.Tensor,
+    *,
+    max_new_tokens: Optional[int] = None,
+    callback=lambda x: x,
+    end_of_audio_token: int = 2048,
+    **sampling_kwargs,
+) -> torch.Tensor:
+    """
+    Takes a conditioning sequence (prompt) as input and continues to generate as many tokens as requested.
+    """
+    # create an empty tensor of the expected final shape and fill in the current tokens
+    T = prompt.size(0)
+    if max_new_tokens is None:
+        max_seq_length = model.config.block_size
+    else:
+        max_seq_length = T + max_new_tokens
+        max_seq_length = min(max_seq_length, model.config.block_size)
+    max_new_tokens = max_seq_length - T
+    if max_new_tokens <= 0:
+        raise ValueError("Prompt is too long to generate more tokens")
+
+    device, dtype = prompt.device, prompt.dtype
+
+    seq = torch.clone(prompt)
+    input_pos = torch.arange(0, T, device=device)
+
+    next_token = prefill(model, prompt.view(1, -1).repeat(2, 1), spk_emb, input_pos, **sampling_kwargs)
+    seq = torch.cat([seq, next_token.view(1)])
+
+    input_pos = torch.tensor([T], device=device, dtype=torch.int)
+
+    generated_tokens, _ = decode_n_tokens(
+        model,
+        next_token.view(1, -1).repeat(2, 1),
+        spk_emb,
+        input_pos,
+        max_new_tokens - 1,
+        callback=callback,
+        end_of_audio_token=end_of_audio_token,
+        **sampling_kwargs,
+    )
+    seq = torch.cat([seq, torch.cat(generated_tokens)])
+
+    return seq
+
+
+def encode_tokens(tokenizer, string, device="cuda"):
+    tokens = tokenizer.encode(string)
+    return torch.tensor(tokens, dtype=torch.int, device=device)
+
+
+def _load_model(checkpoint_path, spk_emb_ckpt_path, device, precision, first_model_path=None, unwanted_prefix="_orig_mod."):
+    ##### MODEL
+    with torch.device("meta"):
+        model = Transformer.from_name("kotoba-speech-v0.1")
+
+    # TODO(quantization): enable
+    # if "int8" in str(checkpoint_path):
+    #     print("Using int8 weight-only quantization!")
+    #     from quantize import WeightOnlyInt8QuantHandler
+    #     simple_quantizer = WeightOnlyInt8QuantHandler(model)
+    #     model = simple_quantizer.convert_for_runtime()
+    # from quantize import WeightOnlyInt8QuantHandler
+
+    # if "int4" in str(checkpoint_path):
+    #     print("Using int4 quantization!")
+    #     path_comps = checkpoint_path.name.split(".")
+    #     assert path_comps[-2].startswith("g")
+    #     groupsize = int(path_comps[-2][1:])
+    #     from quantize import WeightOnlyInt4QuantHandler
+    #     simple_quantizer = WeightOnlyInt4QuantHandler(model, groupsize)
+    #     model = simple_quantizer.convert_for_runtime()
+
+    checkpoint = torch.load(str(checkpoint_path), mmap=True, weights_only=False)
+
+    ###### TOKENIZER
+    tokenizer_info = checkpoint.get("meta", {}).get("tokenizer", {})
+    tokenizer = TrainedBPETokeniser(**tokenizer_info)
+
+    if first_model_path is not None:
+        trained_ckpt = torch.load(str(first_model_path), mmap=True, weights_only=False)
+        state_dict = trained_ckpt["state_dict"]
+        del checkpoint
+        gc.collect()
+        torch.cuda.empty_cache()
+    else:
+        checkpoint = torch.load(str(checkpoint_path), mmap=True, weights_only=False)
+        if "state_dict" in checkpoint.keys():
+            state_dict = checkpoint["state_dict"]
+        else:
+            state_dict = checkpoint["model"]
+    # convert Kotoba-Speech model weights naming to gptfast naming
+    for k, v in list(state_dict.items()):
+        if k.startswith(unwanted_prefix):
+            state_dict[k[len(unwanted_prefix) :]] = state_dict.pop(k)
+    state_dict["tok_embeddings.weight"] = state_dict.pop("transformer.wtes.0.weight")
+    state_dict["pos_embeddings.weight"] = state_dict.pop("transformer.wpe.weight")
+    state_dict["output.weight"] = state_dict.pop("lm_heads.0.weight")
+    state_dict["norm.weight"] = state_dict.pop("transformer.ln_f.weight")
+    for k, v in list(state_dict.items()):
+        if k.startswith("transformer.h."):
+            state_dict[k.replace("transformer.h.", "layers.")] = state_dict.pop(k)
+            k = k.replace("transformer.h.", "layers.")
+        if ".attn.c_attn." in k:
+            state_dict[k.replace(".attn.c_attn.", ".attention.wqkv.")] = state_dict.pop(k)
+            k = k.replace(".attn.c_attn.", ".attention.wqkv.")
+        if ".attn.c_proj." in k:
+            state_dict[k.replace(".attn.c_proj.", ".attention.wo.")] = state_dict.pop(k)
+            k = k.replace(".attn.c_proj.", ".attention.wo.")
+        if ".mlp.swiglu.w1." in k:
+            state_dict[k.replace(".mlp.swiglu.w1.", ".feed_forward.swiglu.w1.")] = state_dict.pop(k)
+            k = k.replace(".mlp.swiglu.w1.", ".feed_forward.swiglu.w1.")
+        if ".mlp.swiglu.w3." in k:
+            state_dict[k.replace(".mlp.swiglu.w3.", ".feed_forward.swiglu.w3.")] = state_dict.pop(k)
+            k = k.replace(".mlp.swiglu.w3.", ".feed_forward.swiglu.w3.")
+        if ".ln_1." in k:
+            state_dict[k.replace(".ln_1.", ".attention_norm.")] = state_dict.pop(k)
+            k = k.replace(".ln_1.", ".attention_norm.")
+        if ".ln_2." in k:
+            state_dict[k.replace(".ln_2.", ".ffn_norm.")] = state_dict.pop(k)
+            k = k.replace(".ln_2.", ".ffn_norm.")
+        if ".mlp.c_proj." in k:
+            state_dict[k.replace(".mlp.c_proj.", ".feed_forward.w2.")] = state_dict.pop(k)
+            k = k.replace(".mlp.c_proj.", ".feed_forward.w2.")
+
+    model.load_state_dict(state_dict, assign=True)
+    # simple_quantizer = WeightOnlyInt8QuantHandler(model)
+    # quantized_state_dict = simple_quantizer.create_quantized_state_dict()
+    # model = simple_quantizer.convert_for_runtime()
+    # model.load_state_dict(quantized_state_dict, assign=True)
+    model = model.to(device=device, dtype=precision)
+
+    ###### SPEAKER EMBEDDER
+    # TODO: fix!
+    smodel = SpeakerEncoder(
+        weights_fpath=spk_emb_ckpt_path,
+        device=device,
+        eval=True,
+        verbose=False,
+    )
+    return model.eval(), tokenizer, smodel
+
+
+def build_model(
+    *,
+    precision: torch.dtype,
+    checkpoint_path: Path = Path(""),
+    spk_emb_ckpt_path: Path = Path(""),
+    compile_prefill: bool = False,
+    compile: bool = True,
+    device: str = "cuda",
+    first_model_path: Optional[str] = None,
+):
+    assert checkpoint_path.is_file(), checkpoint_path
+
+    print(f"Using device={device}")
+
+    print("Loading model ...")
+    t0 = time.time()
+    if first_model_path is None:
+        # model, tokenizer, smodel = _load_model(checkpoint_path, spk_emb_ckpt_path, device, precision)
+        model, tokenizer, smodel = _load_model(
+            checkpoint_path, spk_emb_ckpt_path, device, precision, unwanted_prefix="first_stage_model_transformer."
+        )
+
+    else:
+        model, tokenizer, smodel = _load_model(checkpoint_path, spk_emb_ckpt_path, device, precision, first_model_path, unwanted_prefix="first_stage_model_transformer.")
+
+
+    device_sync(device=device)  # MKG
+    print(f"Time to load model: {time.time() - t0:.02f} seconds")
+
+    torch.manual_seed(1234)
+    model_size = sum([p.numel() * p.dtype.itemsize for p in itertools.chain(model.parameters(), model.buffers())])
+
+    with torch.device(device):
+        model.setup_spk_cond_mask()
+        model.setup_caches(max_batch_size=2, max_seq_length=model.config.block_size)
+
+    if compile:
+        print("Compiling... Can take up to 2 mins.")
+        global decode_one_token, prefill
+        decode_one_token = torch.compile(
+            decode_one_token,
+            mode="max-autotune",
+            fullgraph=True,
+        )
+
+        if compile_prefill:
+            prefill = torch.compile(
+                prefill,
+                fullgraph=True,
+                dynamic=True,
+            )
+
+    encoded = encode_tokens(tokenizer, "Hello, what's up?", device=device)
+    spk_emb = torch.randn((1, 256), device=device, dtype=precision)
+
+    device_sync(device=device)  # MKG
+    t0 = time.perf_counter()
+    y = generate(
+        model,
+        encoded,
+        spk_emb,
+        max_new_tokens=200,
+        callback=lambda x: x,
+        temperature=torch.tensor(1.0, device=device, dtype=precision),
+        top_k=None,
+        top_p=torch.tensor(0.95, device=device, dtype=precision),
+        guidance_scale=torch.tensor(3.0, device=device, dtype=precision),
+        end_of_audio_token=9999,  # don't end early for compilation stage.
+    )
+
+    device_sync(device=device)  # MKG
+
+    print(f"Compilation time: {time.perf_counter() - t0:.2f} seconds")
+
+    return model, tokenizer, smodel, model_size
+
+
+def main(
+    *,
+    model,
+    tokenizer,
+    model_size,
+    prompt: str,
+    guidance_scale: torch.Tensor,
+    temperature: torch.Tensor,
+    spk_emb: torch.Tensor,
+    top_k: Optional[torch.Tensor] = None,
+    top_p: Optional[torch.Tensor] = None,
+    device: str = "cuda",
+) -> list:
+    """Generates text samples based on a pre-trained Transformer model and tokenizer."""
+
+    encoded = encode_tokens(tokenizer, prompt, device=device)
+    prompt_length = encoded.size(0)
+
+    aggregate_metrics: dict = {
+        "tokens_per_sec": [],
+    }
+
+    device_sync(device=device)  # MKG
+
+    if True:
+        callback = lambda x: x
+        t0 = time.perf_counter()
+
+        y = generate(
+            model,
+            encoded,
+            spk_emb,
+            callback=callback,
+            temperature=temperature,
+            top_k=top_k,
+            top_p=top_p,
+            guidance_scale=guidance_scale,
+        )
+
+        device_sync(device=device)  # MKG
+        t = time.perf_counter() - t0
+
+        tokens_generated = y.size(0) - prompt_length
+        tokens_sec = tokens_generated / t
+        aggregate_metrics["tokens_per_sec"].append(tokens_sec)
+        print(f"Time for 1st stage LLM inference: {t:.02f} sec total, {tokens_sec:.02f} tokens/sec")
+        print(f"Bandwidth achieved: {model_size * tokens_sec / 1e9:.02f} GB/s")
+        # print(f"Average tokens/sec: {torch.mean(torch.tensor(aggregate_metrics['tokens_per_sec'])).item():.2f}")
+        print(f"Memory used: {torch.cuda.max_memory_reserved() / 1e9:.02f} GB\n")
+
+    return y.tolist()
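A small standalone check of the nucleus (top-p) filter above, with illustrative values; note that sample() applies this filtering after mixing the conditional and speaker-unconditional streams as guidance_scale * cond + (1 - guidance_scale) * uncond:

    import torch

    from fam.llm.fast_inference_utils import top_p_sample

    logits = torch.tensor([2.0, 1.0, 0.5, -1.0])
    filtered = top_p_sample(logits, top_p=torch.tensor(0.9))
    # ids outside the top-0.9 cumulative probability mass are set to -inf,
    # so the softmax renormalises over the surviving ids only
    probs = torch.softmax(filtered, dim=-1)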
fam/llm/fast_model.py
ADDED
@@ -0,0 +1,261 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
# Copyright (c) Kotoba Technologies, Inc. and affiliates.
|
2 |
+
# All rights reserved.
|
3 |
+
#
|
4 |
+
# Redistribution and use in source and binary forms, with or without modification, are permitted
|
5 |
+
# provided that the following conditions are met:
|
6 |
+
#
|
7 |
+
# 1. Redistributions of source code must retain the above copyright notice, this list of
|
8 |
+
# conditions and the following disclaimer.
|
9 |
+
#
|
10 |
+
# 2. Redistributions in binary form must reproduce the above copyright notice, this
|
11 |
+
# list of conditions and the following disclaimer in the documentation and/or other
|
12 |
+
# materials provided with the distribution.
|
13 |
+
#
|
14 |
+
# 3. Neither the name of the copyright holder nor the names of its contributors
|
15 |
+
# may be used to endorse or promote products derived from this software without
|
16 |
+
# specific prior written permission.
|
17 |
+
#
|
18 |
+
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS “AS IS” AND ANY EXPRESS OR
|
19 |
+
# IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND
|
20 |
+
# FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR
|
21 |
+
# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
|
22 |
+
# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
|
23 |
+
# DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
|
24 |
+
# CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
|
25 |
+
# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
26 |
+
from dataclasses import dataclass
|
27 |
+
from functools import reduce
|
28 |
+
from math import gcd
|
29 |
+
from typing import Optional, Tuple
|
30 |
+
|
31 |
+
import torch
|
32 |
+
import torch.nn as nn
|
33 |
+
from torch import Tensor
|
34 |
+
from torch.nn import functional as F
|
35 |
+
|
36 |
+
from fam.llm.utils import get_default_dtype
|
37 |
+
|
38 |
+
import logging
|
39 |
+
|
40 |
+
# Adjust the logging level
|
41 |
+
logger = logging.getLogger("torch")
|
42 |
+
logger.setLevel(logging.ERROR)
|
43 |
+
|
44 |
+
|
45 |
+
def find_multiple(n: int, *args: Tuple[int]) -> int:
|
46 |
+
k = reduce(lambda x, y: x * y // gcd(x, y), args + (1,))
|
47 |
+
if n % k == 0:
|
48 |
+
return n
|
49 |
+
return n + k - (n % k)
|
50 |
+
|
51 |
+
|
52 |
+
@dataclass
|
53 |
+
class ModelArgs:
|
54 |
+
block_size: int = 2048
|
55 |
+
vocab_size: int = 32000
|
56 |
+
n_layer: int = 32
|
57 |
+
n_head: int = 32
|
58 |
+
dim: int = 4096
|
59 |
+
speaker_emb_dim: int = 256
|
60 |
+
intermediate_size: int = None
|
61 |
+
n_local_heads: int = -1
|
62 |
+
head_dim: int = 64
|
63 |
+
norm_eps: float = 1e-5
|
64 |
+
dtype: torch.dtype = torch.bfloat16
|
65 |
+
|
66 |
+
def __post_init__(self):
|
67 |
+
if self.n_local_heads == -1:
|
68 |
+
self.n_local_heads = self.n_head
|
69 |
+
if self.intermediate_size is None:
|
70 |
+
hidden_dim = 4 * self.dim
|
71 |
+
n_hidden = int(2 * hidden_dim / 3)
|
72 |
+
self.intermediate_size = find_multiple(n_hidden, 256)
|
73 |
+
self.head_dim = self.dim // self.n_head
|
74 |
+
|
75 |
+
self.dtype = {"float16": torch.float16, "bfloat16": torch.bfloat16}[get_default_dtype()]
|
76 |
+
|
77 |
+
@classmethod
|
78 |
+
def from_name(cls, name: str):
|
79 |
+
if name in transformer_configs:
|
80 |
+
return cls(**transformer_configs[name])
|
81 |
+
# fuzzy search
|
82 |
+
config = [config for config in transformer_configs if config in str(name).upper() or config in str(name)]
|
83 |
+
assert len(config) == 1, name
|
84 |
+
return cls(**transformer_configs[config[0]])
|
85 |
+
|
86 |
+
|
87 |
+
transformer_configs = {
|
88 |
+
"kotoba-speech-v0.1": dict(
|
89 |
+
n_layer=24,
|
90 |
+
n_head=16,
|
91 |
+
dim=2048,
|
92 |
+
vocab_size=2562,
|
93 |
+
),
|
94 |
+
}
|
95 |
+
|
96 |
+
|
97 |
+
class KVCache(nn.Module):
|
98 |
+
def __init__(self, max_batch_size, max_seq_length, n_heads, head_dim, dtype):
|
99 |
+
super().__init__()
|
100 |
+
cache_shape = (max_batch_size, n_heads, max_seq_length, head_dim)
|
101 |
+
self.register_buffer("k_cache", torch.zeros(cache_shape, dtype=dtype))
|
102 |
+
self.register_buffer("v_cache", torch.zeros(cache_shape, dtype=dtype))
|
103 |
+
|
104 |
+
def update(self, input_pos, k_val, v_val):
|
105 |
+
# input_pos: [S], k_val: [B, H, S, D]
|
106 |
+
        assert input_pos.shape[0] == k_val.shape[2]

        k_out = self.k_cache
        v_out = self.v_cache
        k_out[:, :, input_pos] = k_val
        v_out[:, :, input_pos] = v_val

        return k_out, v_out


class Transformer(nn.Module):
    def __init__(self, config: ModelArgs) -> None:
        super().__init__()
        self.config = config

        self.tok_embeddings = nn.Embedding(config.vocab_size, config.dim)
        self.pos_embeddings = nn.Embedding(config.block_size, config.dim)
        self.speaker_cond_pos = nn.Linear(config.speaker_emb_dim, config.dim, bias=False)
        self.layers = nn.ModuleList(TransformerBlock(config) for _ in range(config.n_layer))
        self.norm = RMSNorm(config.dim, eps=config.norm_eps)
        self.output = nn.Linear(config.dim, config.vocab_size, bias=False)

        self.mask_cache: Optional[Tensor] = None
        self.max_batch_size = -1
        self.max_seq_length = -1

    def setup_spk_cond_mask(self):
        self.spk_cond_mask = torch.zeros((2, 1, self.config.dim), dtype=torch.bool)
        self.spk_cond_mask[0] = 1

    def setup_caches(self, max_batch_size, max_seq_length):
        if self.max_seq_length >= max_seq_length and self.max_batch_size >= max_batch_size:
            return
        head_dim = self.config.dim // self.config.n_head
        max_seq_length = find_multiple(max_seq_length, 8)
        self.max_seq_length = max_seq_length
        self.max_batch_size = max_batch_size
        for b in self.layers:
            b.attention.kv_cache = KVCache(
                max_batch_size, max_seq_length, self.config.n_local_heads, head_dim, dtype=self.config.dtype
            )

        self.causal_mask = torch.tril(torch.ones(self.max_seq_length, self.max_seq_length, dtype=torch.bool))

    def forward(self, idx: Tensor, spk_emb: Tensor, input_pos: Tensor) -> Tensor:
        mask = self.causal_mask[None, None, input_pos]
        x = (
            self.tok_embeddings(idx)
            + self.pos_embeddings(input_pos)
            # masking for speaker condition free guidance
            + self.speaker_cond_pos(spk_emb) * self.spk_cond_mask
        )

        for i, layer in enumerate(self.layers):
            x = layer(x, input_pos, mask)
        x = self.norm(x)
        logits = self.output(x)
        return logits

    @classmethod
    def from_name(cls, name: str):
        return cls(ModelArgs.from_name(name))


class TransformerBlock(nn.Module):
    def __init__(self, config: ModelArgs) -> None:
        super().__init__()
        self.attention = Attention(config)
        self.feed_forward = FeedForward(config)
        self.ffn_norm = RMSNorm(config.dim, config.norm_eps)
        self.attention_norm = RMSNorm(config.dim, config.norm_eps)

    def forward(self, x: Tensor, input_pos: Tensor, mask: Tensor) -> Tensor:
        h = x + self.attention(self.attention_norm(x), mask, input_pos)
        out = h + self.feed_forward(self.ffn_norm(h))
        return out


class Attention(nn.Module):
    def __init__(self, config: ModelArgs):
        super().__init__()
        assert config.dim % config.n_head == 0

        total_head_dim = (config.n_head + 2 * config.n_local_heads) * config.head_dim
        # key, query, value projections for all heads, but in a batch
        self.wqkv = nn.Linear(config.dim, total_head_dim, bias=False)
        self.wo = nn.Linear(config.dim, config.dim, bias=False)
        self.kv_cache = None

        self.n_head = config.n_head
        self.head_dim = config.head_dim
        self.n_local_heads = config.n_local_heads
        self.dim = config.dim

    def forward(
        self,
        x: Tensor,
        mask: Tensor,
        input_pos: Optional[Tensor] = None,
    ) -> Tensor:
        bsz, seqlen, _ = x.shape

        kv_size = self.n_local_heads * self.head_dim
        q, k, v = self.wqkv(x).split([self.dim, kv_size, kv_size], dim=-1)

        q = q.view(bsz, seqlen, self.n_head, self.head_dim)
        k = k.view(bsz, seqlen, self.n_local_heads, self.head_dim)
        v = v.view(bsz, seqlen, self.n_local_heads, self.head_dim)

        q, k, v = map(lambda x: x.transpose(1, 2), (q, k, v))

        if self.kv_cache is not None:
            k, v = self.kv_cache.update(input_pos, k, v)

        k = k.repeat_interleave(self.n_head // self.n_local_heads, dim=1)
        v = v.repeat_interleave(self.n_head // self.n_local_heads, dim=1)
        y = F.scaled_dot_product_attention(q, k, v, attn_mask=mask, dropout_p=0.0)

        y = y.transpose(1, 2).contiguous().view(bsz, seqlen, self.dim)

        y = self.wo(y)
        return y


class SwiGLU(nn.Module):
    def __init__(self, config: ModelArgs) -> None:
        super().__init__()
        self.w1 = nn.Linear(config.dim, config.intermediate_size, bias=False)
        self.w3 = nn.Linear(config.dim, config.intermediate_size, bias=False)

    def forward(self, x: Tensor) -> Tensor:
        return F.silu(self.w1(x)) * self.w3(x)


class FeedForward(nn.Module):
    def __init__(self, config: ModelArgs) -> None:
        super().__init__()
        self.swiglu = SwiGLU(config)
        self.w2 = nn.Linear(config.intermediate_size, config.dim, bias=False)

    def forward(self, x: Tensor) -> Tensor:
        return self.w2(self.swiglu(x))


class RMSNorm(nn.Module):
    def __init__(self, dim: int, eps: float = 1e-5):
        super().__init__()
        self.eps = eps
        self.weight = nn.Parameter(torch.ones(dim))

    def _norm(self, x):
        return x * torch.rsqrt(torch.mean(x * x, dim=-1, keepdim=True) + self.eps)

    def forward(self, x: Tensor) -> Tensor:
        output = self._norm(x.float()).type_as(x)
        return output * self.weight
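
The cache update above is the static-allocation trick: `setup_caches` sizes every layer's `KVCache` to a fixed `max_seq_length` (rounded up with `find_multiple`), and `update` scatter-writes into it at `input_pos`, so tensor shapes never change between the prefill pass and the one-token decode steps. A minimal, self-contained sketch of the same pattern, with toy sizes of my own choosing rather than values from any model config:

# --- illustrative sketch, not part of fam/llm/fast_model.py ---
import torch

B, H, S, D = 1, 2, 16, 4  # batch, kv-heads, max_seq_length, head_dim (toy sizes)
k_cache = torch.zeros(B, H, S, D)  # pre-allocated once, never resized

prefill_pos = torch.arange(3)  # prefill: write positions 0..2 in one shot
k_cache[:, :, prefill_pos] = torch.randn(B, H, 3, D)

next_pos = torch.tensor([3])  # decode: one new position per step, written in place
k_cache[:, :, next_pos] = torch.randn(B, H, 1, D)

Positions that have not been written yet still hold zeros; `Transformer.forward` keeps attention away from them by slicing the precomputed `causal_mask` with the same `input_pos`.
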
fam/llm/inference.py
ADDED
@@ -0,0 +1,710 @@
import dataclasses
import hashlib
import json
import os
import pathlib
import shutil
import subprocess
import tempfile
import time
from contextlib import nullcontext
from dataclasses import dataclass
from typing import List, Literal, Optional, Tuple, Type, Union

import torch
import tqdm
import tqdm.contrib.concurrent
import tyro
from huggingface_hub import snapshot_download

from fam.llm.adapters import FlattenedInterleavedEncodec2Codebook, TiltedEncodec
from fam.llm.decoders import Decoder, EncodecDecoder
from fam.llm.enhancers import BaseEnhancer, get_enhancer
from fam.llm.model import GPT, GPTConfig
from fam.llm.utils import check_audio_file, get_default_dtype, normalize_text
from fam.quantiser.audio.speaker_encoder.model import SpeakerEncoder
from fam.quantiser.text.tokenise import TrainedBPETokeniser


@dataclass
class InferenceConfig:
    ckpt_path: str  # path to checkpoint
    output_dir: str
    num_samples: int = 10  # number of samples to draw
    seed: int = 1337  # random seed
    device: str = "cuda"
    dtype: str = "bfloat16"
    compile: bool = False
    init_from: str = "resume"  # either 'resume' (from an out_dir) or a gpt2 variant (e.g. 'gpt2-xl')

    def __str__(self):
        field_strs = []
        for field in dataclasses.fields(self):
            value = getattr(self, field.name)
            field_strs.append(f"  {field.name}: {value}")

        return "InferenceConfig:\n" + "\n".join(field_strs)


class Model:
    def __init__(
        self,
        config: InferenceConfig,
        tokenizer_cls: Type[TrainedBPETokeniser],
        decoder_cls: Type[Decoder],
        data_adapter_fn,
        use_kv_cache: Optional[Literal["vanilla"]] = None,
    ):
        # TODO: disentangle the encodec stuff and numbers etc from the rest of this code (esp. at encoder-only / second-stage model inference)
        # TODO: remove magic number
        self._encodec_codes_pad_token = 1024
        self._num_encodec_codebooks = 8
        self.config = config
        self.use_kv_cache = use_kv_cache

        torch.manual_seed(config.seed)
        torch.cuda.manual_seed(config.seed)
        torch.backends.cuda.matmul.allow_tf32 = config.dtype != "float32"  # allow tf32 on matmul
        torch.backends.cudnn.allow_tf32 = config.dtype != "float32"  # allow tf32 on cudnn
        device_type = "cuda" if "cuda" in config.device else "cpu"  # for later use in torch.autocast
        self.ptdtype = {
            "float32": torch.float32,
            "tfloat32": torch.float32,
            "bfloat16": torch.bfloat16,
            "float16": torch.float16,
        }[config.dtype]
        self._ctx = (
            nullcontext() if device_type == "cpu" else torch.amp.autocast(device_type=device_type, dtype=self.ptdtype)
        )

        self.use_bpe_tokenizer = False
        self.load_meta = None
        self.speaker_cond = None
        self.meta = None
        self.model = None
        self.checkpoint_config = None
        self.vocab_sizes = None
        self.smodel = None

        self._init_model()

        self.tokenizer = tokenizer_cls(**self.meta["tokenizer"])
        self.decoder = decoder_cls(
            tokeniser_decode_fn=self.tokenizer.decode,
            output_dir=self.config.output_dir,
            data_adapter_fn=data_adapter_fn,
        )

    def _init_model(self):
        if self.config.init_from == "resume":
            # init from a model saved in a specific directory
            checkpoint = torch.load(self.config.ckpt_path, map_location=self.config.device)
            self.vocab_sizes = checkpoint["model_args"]["vocab_sizes"]

            self.load_meta = False
            self.speaker_cond = False

            if "config" in checkpoint:
                self.checkpoint_config = checkpoint["config"]

                self.meta = checkpoint["meta"]
                self.load_meta = True

            if self.load_meta:
                self.use_bpe_tokenizer = "stoi" not in self.meta or "itos" not in self.meta
                self.speaker_cond = self.meta.get("speaker_cond")

            if self.speaker_cond:
                speaker_emb_size = self.meta["speaker_emb_size"]

            model_args = checkpoint["model_args"]
            if "causal" in self.checkpoint_config and self.checkpoint_config["causal"] is False:
                self._encodec_ctx_window = model_args["block_size"]

            gptconf = GPTConfig(**model_args)

            # TODO: rename `speaker_emb_dim` to `speaker_emb_size`.
            self.model = GPT(gptconf, speaker_emb_dim=speaker_emb_size if self.speaker_cond else None)
            state_dict = checkpoint["model"]
            unwanted_prefix = "_orig_mod."
            for k, v in list(state_dict.items()):
                if k.startswith(unwanted_prefix):
                    state_dict[k[len(unwanted_prefix) :]] = state_dict.pop(k)
            self.model.load_state_dict(state_dict)

        # model
        self.model.eval()
        self.model.to(self.config.device)

        if self.config.compile:
            from einops._torch_specific import allow_ops_in_compiled_graph

            allow_ops_in_compiled_graph()
            self.model = torch.compile(self.model)  # type: ignore

        if self.use_kv_cache is not None:
            if "causal" in self.checkpoint_config and self.checkpoint_config["causal"] is False:
                raise Exception("kv_cache not supported for non-causal models!")

            if self.use_kv_cache == "vanilla":
                self.model.enable_kv_cache()
            else:
                raise NotImplementedError(f"kv_cache type {self.use_kv_cache} not implemented!")

    def causal_sample(
        self,
        *,
        texts: list[str],
        batch_size: int,
        max_new_tokens: int,
        temperature: Optional[float],
        top_k: Optional[int],
        top_p: Optional[float],
        speaker_embs: Optional[torch.Tensor] = None,
        guidance_scale: Optional[float] = None,
    ) -> list[torch.Tensor]:
        """
        Returns a list of torch.Tensors of tokens. Each tensor is of shape (1, c, t) where c is the number of codebooks.
        Any flattening / interleaving / tilting gets reversed before the output is returned.
        """
        if speaker_embs is not None:
            assert len(texts) == len(speaker_embs)

        encoded_texts = [self.tokenizer.encode(text) for text in texts]

        ## create multiple hierarchies and get seq_lens
        seq_lens = []
        xs = []
        for i, encoded_text in enumerate(encoded_texts):
            encoded_text = torch.tensor([encoded_text], dtype=torch.long, device=self.config.device)
            # TODO: remove magic number
            xs.append(
                torch.cat(
                    # [1st hierarchy of text, *remaining hierarchies of padded tokens]
                    # TODO: self.vocab_sizes should be from the model config?
                    [encoded_text, *[torch.ones_like(encoded_text) * 1024] * (len(self.vocab_sizes) - 1)],
                    dim=0,
                ).unsqueeze(0)
            )  # b x [(b=1, c, t)]
            seq_lens.append(xs[-1].shape[-1])
        max_len = max(seq_lens)
        assert len(xs) == len(seq_lens)

        ## equalise the shapes in the batch. we can use torch.zeros as tokens > seq_lens will be masked out.
        x = torch.zeros((len(encoded_texts), xs[0].shape[1], max_len), dtype=torch.long, device=self.config.device)
        for i, _xs in enumerate(xs):
            assert _xs.shape[-1] == seq_lens[i]
            x[i, :, : seq_lens[i]] = _xs

        ## check that the input is correct
        for i in range(x.shape[0]):
            assert x[i, 0, : seq_lens[i]].tolist() == encoded_texts[i]

            # TODO: remove magic number
            if x.shape[1] > 1:
                assert set(x[i, 1, : seq_lens[i]].tolist()) == set([1024])

        assert x.shape[0] == speaker_embs.shape[0] if speaker_embs is not None else True

        if self.speaker_cond is False:
            speaker_embs = None

        # run sampling loop
        with torch.no_grad():
            with self._ctx:  # type: ignore
                to_return = []
                for k in range(self.config.num_samples):
                    assert seq_lens is not None
                    assert batch_size is not None

                    if max(seq_lens) + max_new_tokens >= self.model.config.block_size:
                        raise Exception(
                            f"max_new_tokens {max_new_tokens} too large! Choose {self.model.config.block_size - max(seq_lens) - 1} instead."
                        )

                    y = self.model.generate(
                        x,
                        max_new_tokens,
                        seq_lens=seq_lens,
                        temperature=temperature,
                        top_k=top_k,
                        top_p=top_p,
                        speaker_embs=speaker_embs,
                        batch_size=batch_size,
                        guidance_scale=guidance_scale,
                        dtype=self.ptdtype,
                        end_of_audio_token=self.tokenizer.offset - 1,
                        end_of_text_token=self.tokenizer.eot_token,
                    )
                    for i in range(len(y)):
                        to_return.append(self.decoder.decode(tokens=y[i].tolist(), causal=True))

        return to_return

    def non_causal_sample(
        self,
        *,
        texts: list[str],
        encodec_tokens: list[torch.Tensor],
        batch_size: int,
        top_k: Optional[int],
        temperature: Optional[float],
        speaker_embs: Optional[torch.Tensor] = None,
    ) -> list[str]:
        """
        Returns paths to saved audio files.
        """
        if speaker_embs is not None:
            assert len(texts) == len(speaker_embs)

        encoded_texts = [self.tokenizer.encode(text) for text in texts]

        # setup input
        # TODO: same code is used during data prep. refactor
        padded_hierarchies_inputs = []
        for encoded_text, encodec_token in zip(encoded_texts, encodec_tokens):
            x = torch.tensor(encoded_text, dtype=torch.long, device=self.config.device)[
                None, None, ...
            ]  # (b=1, c=1, t)

            # TODO: should only happen if decoder is EncodecDecoder?
            assert encodec_token.shape[0] == 1
            encodec_token = encodec_token[0].tolist()  # (b=1, c, t) -> (c, t)
            assert len(encodec_token) >= 1 and len(encodec_token) <= self._num_encodec_codebooks

            ## setup hierarchies of tokens
            # TODO: refactor and merge with code in processing.py
            text_tokens = encoded_text  # (t,)

            hierarchies_in = []
            hierarchies_in.append(text_tokens + encodec_token[0] + [self._encodec_codes_pad_token])
            hierarchies_in.append(
                [self._encodec_codes_pad_token] * len(text_tokens) + encodec_token[1] + [self._encodec_codes_pad_token]
            )

            ## adding padding / cutting to the right size as needed
            # TODO: refactor and merge with code in processing.py
            padded_hierarchies_input = []
            for _, t_hierarchy in enumerate(hierarchies_in):
                assert len(t_hierarchy) == len(hierarchies_in[0])
                if len(t_hierarchy) < self._encodec_ctx_window:
                    padded_hierarchies_input.append(
                        t_hierarchy + [self._encodec_codes_pad_token] * (self._encodec_ctx_window - len(t_hierarchy))
                    )
                elif len(t_hierarchy) > self._encodec_ctx_window:
                    padded_hierarchies_input.append(t_hierarchy[: self._encodec_ctx_window])
                else:
                    padded_hierarchies_input.append(t_hierarchy)

            padded_hierarchies_inputs.append(padded_hierarchies_input)

        ## check that the input is correct
        in_x = torch.tensor(padded_hierarchies_inputs, dtype=torch.long, device=self.config.device)
        assert in_x.shape[0] == speaker_embs.shape[0] if speaker_embs is not None else True

        if self.speaker_cond is False:
            speaker_embs = None

        # run sampling loop
        with torch.no_grad():
            with self._ctx:  # type: ignore
                to_return = []
                for k in range(self.config.num_samples):
                    y = self.model.generate(
                        in_x,
                        None,
                        temperature=temperature,
                        top_k=top_k,
                        # TODO: handle separate top_p for this model explicitly
                        top_p=None,
                        speaker_embs=speaker_embs,
                        batch_size=batch_size,
                        guidance_scale=None,
                    )

                    b_tokens = torch.cat([in_x, y], dim=1)
                    for tokens in b_tokens:
                        try:
                            to_return.append(self.decoder.decode(tokens=tokens.tolist(), causal=False))
                        except Exception as e:
                            print("failed to run MBD.")
                            print(f"reason: {str(e)}")
                            to_return.append(None)

        return to_return

    def __call__(
        self,
        *,
        texts: list[str],
        batch_size: int,
        max_new_tokens: Optional[int],
        top_k: Optional[int],
        top_p: Optional[float],
        temperature: Optional[float],
        encodec_tokens: Optional[list[torch.Tensor]] = None,
        speaker_embs: Optional[torch.Tensor] = None,
        guidance_scale: Optional[float] = None,
    ):
        if self.checkpoint_config.get("causal", True):
            return self.causal_sample(
                texts=texts,
                batch_size=batch_size,
                speaker_embs=speaker_embs,
                guidance_scale=guidance_scale,
                max_new_tokens=max_new_tokens,
                top_k=top_k,
                top_p=top_p,
                temperature=temperature,
            )
        else:
            assert encodec_tokens is not None
            assert guidance_scale is None
            assert max_new_tokens is None
            assert top_p is None

            return self.non_causal_sample(
                texts=texts,
                encodec_tokens=encodec_tokens,
                batch_size=batch_size,
                speaker_embs=speaker_embs,
                top_k=top_k,
                temperature=temperature,
            )


def save_result_metadata(wav_path, ref_path, text, first_stage_ckpt_path, second_stage_ckpt_path):
    if first_stage_ckpt_path is None or second_stage_ckpt_path is None:
        return
    json.dump(
        {
            "speaker": ref_path,
            "text": text,
        },
        pathlib.Path(str(wav_path) + ".json").open("w"),
    )


def get_cached_file(file_or_uri: str):
    """
    If it's a URL, download it to a local cache file and return that path.
    Otherwise return the path as is.
    """
    is_uri = file_or_uri.startswith("http")

    cache_path = None
    if is_uri:
        ext = pathlib.Path(file_or_uri).suffix
        # hash the file path to get the cache name
        _cache_name = "audio_" + hashlib.md5(file_or_uri.encode("utf-8")).hexdigest() + ext

        os.makedirs(os.path.expanduser("~/.cache/fam/"), exist_ok=True)
        cache_path = os.path.expanduser(f"~/.cache/fam/{_cache_name}")

        if not os.path.exists(cache_path):
            command = f"curl -o {cache_path} {file_or_uri}"
            subprocess.run(command, shell=True, check=True)
    else:
        if os.path.exists(file_or_uri):
            cache_path = file_or_uri
        else:
            raise FileNotFoundError(f"File {file_or_uri} not found!")
    return cache_path


def get_cached_embedding(local_file_path: str, spkemb_model):
    if not os.path.exists(local_file_path):
        raise FileNotFoundError(f"File {local_file_path} not found!")

    # hash the file path to get the cache name
    _cache_name = "embedding_" + hashlib.md5(local_file_path.encode("utf-8")).hexdigest() + ".pt"

    os.makedirs(os.path.expanduser("~/.cache/fam/"), exist_ok=True)
    cache_path = os.path.expanduser(f"~/.cache/fam/{_cache_name}")

    if not os.path.exists(cache_path):
        spk_emb = spkemb_model.embed_utterance_from_file(local_file_path, numpy=False).unsqueeze(0)  # (b=1, c)
        torch.save(spk_emb, cache_path)
    else:
        spk_emb = torch.load(cache_path)

    return spk_emb
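
Both helpers above implement the same content-addressed disk cache under `~/.cache/fam/`: the key is the md5 of the path or URL, so repeated runs against the same reference audio skip the download and the speaker-encoder forward pass. A hypothetical usage sketch (the URL is a placeholder and `spkemb_model` is assumed to be an already-constructed SpeakerEncoder, neither ships with the repo):

# --- illustrative sketch, not part of fam/llm/inference.py ---
local_path = get_cached_file("https://example.com/reference.wav")  # downloads once, reuses ~/.cache/fam/audio_<md5>.wav
spk_emb = get_cached_embedding(local_path, spkemb_model)  # computes once, reuses ~/.cache/fam/embedding_<md5>.pt
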

def _sample_utterance_batch(
    texts: list[str],
    spk_cond_paths: list[Optional[str]],
    spkemb_model,
    first_stage_model,
    second_stage_model,
    enhancer: Optional[Union[Literal["df"], BaseEnhancer]],
    first_stage_ckpt_path: str,
    second_stage_ckpt_path: str,
    guidance_scale: Optional[Tuple[float, float]],
    max_new_tokens: int,
    top_k: Optional[int],
    top_p: Optional[float],
    temperature: Optional[float],
    batch_size: int = 128,
) -> List[str]:
    speaker_embs = []
    refs = spk_cond_paths.copy()

    # multithreaded loop to cache all the files
    spk_cond_paths = tqdm.contrib.concurrent.thread_map(
        get_cached_file, spk_cond_paths, desc="getting cached speaker ref files"
    )

    for i, (text, spk_cond_path) in tqdm.tqdm(
        enumerate(zip(texts, spk_cond_paths)), total=len(texts), desc="calculating speaker embeddings"
    ):
        texts[i] = normalize_text(text)
        speaker_embs.append(get_cached_embedding(spk_cond_path, spkemb_model) if spk_cond_path else None)

    b_speaker_embs = torch.cat(speaker_embs, dim=0)

    start = time.time()
    b_tokens = first_stage_model(
        texts=texts,
        speaker_embs=b_speaker_embs,
        batch_size=batch_size,
        guidance_scale=guidance_scale,
        top_p=top_p,
        top_k=top_k,
        temperature=temperature,
        max_new_tokens=max_new_tokens,
    )

    # TODO: set batch size for second stage model!
    wav_files = second_stage_model(
        texts=texts,
        encodec_tokens=b_tokens,
        speaker_embs=b_speaker_embs,
        batch_size=batch_size,
        guidance_scale=None,
        top_p=None,
        top_k=top_k,
        temperature=temperature,
        max_new_tokens=None,
    )

    for text, tokens, speaker_embs, ref_name, wav_file in zip(texts, b_tokens, b_speaker_embs, refs, wav_files):
        if wav_file is None:
            continue

        with tempfile.NamedTemporaryFile(suffix=".wav") as enhanced_tmp:
            if enhancer is not None:
                enhancer = get_enhancer(enhancer) if isinstance(enhancer, str) else enhancer
                enhancer(str(wav_file) + ".wav", enhanced_tmp.name)
                # copy enhanced_tmp.name back to wav_file
                print(f"copying enhanced file from {enhanced_tmp.name} to {str(wav_file) + '.wav'}.")
                shutil.copy2(enhanced_tmp.name, str(wav_file) + ".wav")

            save_result_metadata(
                wav_file,
                ref_name,
                text,
                first_stage_ckpt_path,
                second_stage_ckpt_path,
            )

    print(f"time_to_synth_s: {time.time() - start}")
    return [str(w) + ".wav" if not str(w).endswith(".wav") else str(w) for w in wav_files]


def sample_utterance(
    text: str,
    spk_cond_path: Optional[str],
    spkemb_model,
    first_stage_model,
    second_stage_model,
    enhancer: Optional[Union[Literal["df"], BaseEnhancer]],
    first_stage_ckpt_path: str,
    second_stage_ckpt_path: str,
    guidance_scale: Optional[Tuple[float, float]],
    max_new_tokens: int,
    top_k: Optional[int],
    top_p: Optional[float],
    temperature: Optional[float],
) -> str:
    # NOTE: supports max. 220 characters atm.
    # Long-form synthesis coming soon...
    MAX_CHARS = 220
    if len(text) > MAX_CHARS:
        print(
            f"\n***WARNING: Max {MAX_CHARS} characters supported. Provided: {len(text)}. Truncating and generating speech... This can lead to unpredictable speech at the end.***"
        )

    return _sample_utterance_batch(
        texts=[text],
        spk_cond_paths=[spk_cond_path],
        spkemb_model=spkemb_model,
        first_stage_model=first_stage_model,
        second_stage_model=second_stage_model,
        enhancer=enhancer,
        first_stage_ckpt_path=first_stage_ckpt_path,
        second_stage_ckpt_path=second_stage_ckpt_path,
        batch_size=1,
        guidance_scale=guidance_scale,
        max_new_tokens=max_new_tokens,
        top_k=top_k,
        top_p=top_p,
        temperature=temperature,
    )[0]


def build_models(config_first_stage, config_second_stage, model_dir, device, use_kv_cache):
    smodel = SpeakerEncoder(
        weights_fpath=os.path.join(model_dir, "speaker_encoder.pt"), device=device, eval=True, verbose=False
    )
    data_adapter = FlattenedInterleavedEncodec2Codebook(end_of_audio_token=1024)
    llm_first_stage = Model(
        config_first_stage,
        TrainedBPETokeniser,
        EncodecDecoder,
        data_adapter_fn=data_adapter.decode,
        use_kv_cache=use_kv_cache,
    )
    data_adapter_second_stage = TiltedEncodec(end_of_audio_token=1024)
    llm_second_stage = Model(
        config_second_stage, TrainedBPETokeniser, EncodecDecoder, data_adapter_fn=data_adapter_second_stage.decode
    )
    return smodel, llm_first_stage, llm_second_stage


def get_first_stage_path(model_dir: str):
    """Absolute path to checkpoint for the first stage model."""
    return os.path.join(os.path.expanduser(model_dir), "first_stage.pt")


def get_second_stage_path(model_dir: str):
    """Absolute path to checkpoint for the second stage model."""
    return os.path.join(os.path.expanduser(model_dir), "second_stage.pt")


@dataclass
class SamplingControllerConfig:
    """
    Sample from a trained model.
    """

    spk_cond_path: str
    """Path to speaker reference file. Min. 30s of audio required. Supports both local paths & public URIs. Audio formats: wav, flac & mp3"""

    huggingface_repo_id: str = "kotoba-tech/kotoba-speech-v0.1"
    """Hugging Face Hub repo id to download the model checkpoints from."""

    text: str = (
        "This is a demo of text to speech by MetaVoice-1B, an open-source foundational audio model by MetaVoice."
    )
    """Text to synthesise."""

    num_samples: int = 1
    """Number of samples to generate from each model."""

    max_new_tokens: int = 864
    """Maximum number of new tokens to generate from the first stage model."""

    temperature: float = 1.0
    """Temperature for sampling applied to both models."""

    top_k: Optional[int] = None
    """Top k for sampling applied to both models."""

    top_p: Optional[float] = 0.95
    """Top p for sampling applied to first-stage model."""

    seed: int = 1337
    """Random seed for sampling."""

    device: Literal["cuda", "cpu"] = "cuda"
    """Device to use for sampling."""

    dtype: Literal["bfloat16", "float16", "float32", "tfloat32"] = get_default_dtype()
    """Data type to use for sampling."""

    compile: bool = False
    """Whether to compile the model using PyTorch 2.0."""

    enhancer: Optional[Literal["df"]] = "df"
    """Enhancer to use for post-processing."""

    init_from: str = "resume"
    """Either 'resume' (from an out_dir) or a gpt2 variant (e.g. 'gpt2-xl')."""

    use_kv_cache: Optional[Literal["vanilla"]] = "vanilla"
    """Type of kv caching to use for inference: 1) [none] no kv caching, 2) [vanilla] torch attention with a hand-implemented kv-cache."""

    output_dir: str = "samples/"
    """Relative path to output directory"""

    guidance_scale: Optional[Tuple[float, float]] = (3.0, 1.0)
    """Guidance scale for sampling: (speaker conditioning guidance_scale, prompt conditioning guidance scale)."""

    batch_size: int = 128
    """Batch size to use for sampling. Note that the batch size gets doubled when guidance is used. For an H100 and the 1B model,
    1 w/ guidance and 1 w/o guidance work well (without kv-caching). With kv-caching, 128 (w/o guidance) and
    64 (w/ guidance) work well."""


if __name__ == "__main__":
    # TODO: add support for batch sampling via CLI. Function has been implemented above.
    sampling_config = tyro.cli(SamplingControllerConfig, use_underscores=True)

    check_audio_file(sampling_config.spk_cond_path)

    model_dir = snapshot_download(repo_id=sampling_config.huggingface_repo_id)
    first_stage_ckpt_path = get_first_stage_path(model_dir)
    second_stage_ckpt_path = get_second_stage_path(model_dir)

    config_first_stage = InferenceConfig(
        ckpt_path=first_stage_ckpt_path,
        num_samples=sampling_config.num_samples,
        seed=sampling_config.seed,
        device=sampling_config.device,
        dtype=sampling_config.dtype,
        compile=sampling_config.compile,
        init_from=sampling_config.init_from,
        output_dir=sampling_config.output_dir,
    )

    config_second_stage = InferenceConfig(
        ckpt_path=second_stage_ckpt_path,
        num_samples=sampling_config.num_samples,
        seed=sampling_config.seed,
        device=sampling_config.device,
        dtype=sampling_config.dtype,
        compile=sampling_config.compile,
        init_from=sampling_config.init_from,
        output_dir=sampling_config.output_dir,
    )

    sampling_config.max_new_tokens *= (
        2  # deal with max_new_tokens for flattened interleaving! (should scale with num_codebooks?)
    )

    # define models
    smodel, llm_first_stage, llm_second_stage = build_models(
        config_first_stage,
        config_second_stage,
        model_dir=model_dir,
        device=sampling_config.device,
        use_kv_cache=sampling_config.use_kv_cache,
    )

    sample_utterance(
        sampling_config.text,
        os.path.expanduser(sampling_config.spk_cond_path),
        smodel,
        llm_first_stage,
        llm_second_stage,
        sampling_config.enhancer,
        first_stage_ckpt_path,
        second_stage_ckpt_path,
        sampling_config.guidance_scale,
        max_new_tokens=sampling_config.max_new_tokens,
        top_k=sampling_config.top_k,
        top_p=sampling_config.top_p,
        temperature=sampling_config.temperature,
    )
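
One detail worth unpacking from the `__main__` block: `max_new_tokens` is doubled because the first-stage model emits two EnCodec codebooks flattened and interleaved into one stream, so every audio frame costs two tokens. A back-of-envelope check (the 75 Hz frame rate is the standard 24 kHz EnCodec setting and is an assumption on my part, not something stated in this file):

# --- illustrative arithmetic, not part of fam/llm/inference.py ---
max_new_tokens = 864 * 2      # doubled for flattened interleaving of 2 codebooks
frames = max_new_tokens // 2  # 864 audio frames
print(frames / 75)            # ~11.5 seconds of audio per sample, under the assumed frame rate
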
fam/llm/layers/__init__.py
ADDED
@@ -0,0 +1,3 @@
from fam.llm.layers.attn import SelfAttention
from fam.llm.layers.combined import Block
from fam.llm.layers.layers import MLP, LayerNorm, RMSNorm, SwiGLU
fam/llm/layers/attn.py
ADDED
@@ -0,0 +1,185 @@
import warnings

import torch
import torch.nn as nn
from torch.nn import functional as F


class SelfAttention(nn.Module):
    def __init__(self, config):
        """
        Initializes the SelfAttention module.

        Args:
            config: An object containing the configuration parameters for the SelfAttention module.
        """
        super().__init__()
        self._validate_config(config)
        self._initialize_parameters(config)

    def empty_kv_cache(self, batch_size: int, kv_cache_maxlen: int, dtype: torch.dtype):
        """
        Empties the key-value cache.

        Args:
            batch_size: The batch size.
            kv_cache_maxlen: The maximum length of the key-value cache.
            dtype: The data type of the cache.

        Raises:
            Exception: If trying to empty the KV cache when it is disabled.
        """
        if self.kv_cache_enabled is False:
            raise Exception("Trying to empty KV cache when it is disabled")

        # register so that the cache moves devices along with the module
        # TODO: get rid of re-allocation.
        self.register_buffer(
            "kv_cache",
            torch.zeros(
                2,
                batch_size,
                kv_cache_maxlen,
                self.n_head,
                self.n_embd // self.n_head,
                dtype=dtype,
                device=self.c_attn.weight.device,
            ),
            persistent=False,
        )

        self.kv_cache_first_empty_index = 0

    def _initialize_parameters(self, config):
        """
        Initializes the parameters of the SelfAttention module.

        Args:
            config: An object containing the configuration parameters for the SelfAttention module.
        """
        # key, query, value projections for all heads, but in a batch
        self.c_attn = nn.Linear(config.n_embd, 3 * config.n_embd, bias=config.bias)

        # output projection
        self.c_proj = nn.Linear(config.n_embd, config.n_embd, bias=config.bias)

        # regularization
        self.resid_dropout = nn.Dropout(config.dropout)
        self.n_head = config.n_head
        self.n_embd = config.n_embd
        self.dropout = config.dropout
        self.causal = config.causal
        self.attn_kernel_type = config.attn_kernel_type
        self.attn_dropout = nn.Dropout(config.dropout)

        self.kv_cache_enabled = False

    def _validate_config(self, config):
        """
        Validates the configuration parameters.

        Args:
            config: An object containing the configuration parameters for the SelfAttention module.

        Raises:
            AssertionError: If the embedding dimension is not divisible by the number of heads.
        """
        assert config.n_embd % config.n_head == 0, "Embedding dimension must be divisible by number of heads"

    def _update_kv_cache(self, q, k, v):
        """
        Updates the key-value cache.

        Args:
            q: The query tensor.
            k: The key tensor.
            v: The value tensor.

        Returns:
            The updated key and value tensors.

        Raises:
            AssertionError: If the dimensions of the query, key, and value tensors are not compatible.
        """
        q_time, k_time, v_time = q.shape[1], k.shape[1], v.shape[1]

        if self.kv_cache_first_empty_index == 0:
            assert q_time == k_time and q_time == v_time
        else:
            assert (
                q_time == 1
            ), f"Only one query at a time is supported, but got q_time={q_time} for kv_cache_first_empty_index={self.kv_cache_first_empty_index}"

        self.kv_cache[0, :, self.kv_cache_first_empty_index : self.kv_cache_first_empty_index + q_time] = k
        self.kv_cache[1, :, self.kv_cache_first_empty_index : self.kv_cache_first_empty_index + q_time] = v
        self.kv_cache_first_empty_index += q_time

        k = self.kv_cache[0, :, : self.kv_cache_first_empty_index]
        v = self.kv_cache[1, :, : self.kv_cache_first_empty_index]

        return k, v

    def _torch_attn(self, c_x: torch.Tensor) -> torch.Tensor:
        """
        Performs attention using the torch.nn.functional.scaled_dot_product_attention function.

        Args:
            c_x: The input tensor.

        Returns:
            The output tensor.
        """
        q, k, v = c_x.split(1, dim=2)  # q, k, v of shape (B, T, 1, nh, hs)
        q = q.squeeze(2)  # (B, T, nh, hs)
        k = k.squeeze(2)  # (B, T, nh, hs)
        v = v.squeeze(2)  # (B, T, nh, hs)

        # if kv-caching and causal, for the "prefill" stage, we need to use a causal mask, and
        # use no mask for the "one time step" parts.
        # calculate this before updating kv_caching so we have the right value for kv_cache_first_empty_index
        is_causal_attn_mask = self.causal and (not self.kv_cache_enabled or self.kv_cache_first_empty_index == 0)

        if self.kv_cache_enabled:
            k, v = self._update_kv_cache(q, k, v)

        q = q.transpose(1, 2)  # (B, nh, T, hs)
        k = k.transpose(1, 2)  # (B, nh, T, hs)
        v = v.transpose(1, 2)  # (B, nh, T, hs)
        y = torch.nn.functional.scaled_dot_product_attention(
            q,
            k,
            v,
            attn_mask=None,
            dropout_p=self.dropout if self.training else 0,
            is_causal=is_causal_attn_mask,
        ).transpose(
            1, 2
        )  # (B, nh, T, hs) -> (B, T, nh, hs)

        return y

    def forward(self, x):
        """
        Performs the forward pass of the SelfAttention module.

        Args:
            x: The input tensor.

        Returns:
            The output tensor.
        """
        B, T, C = x.size()  # batch size, sequence length, embedding dimensionality (n_embd)

        # calculate query, key, values for all heads in batch and move head forward to be the batch dim
        c_x = self.c_attn(x).view(B, T, 3, self.n_head, C // self.n_head)  # (B, T, 3, nh, hs)

        # causal self-attention;
        if self.attn_kernel_type == "torch_attn":
            y = self._torch_attn(c_x)
        else:
            raise Exception(f"Unknown attention kernel type: {self.attn_kernel_type}")

        y = y.contiguous().view(B, T, C)  # re-assemble all head outputs side by side: (B, T, nh, hs) -> (B, T, hs * nh)
        # output projection
        y = self.resid_dropout(self.c_proj(y))
        return y
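
A minimal sketch (not repo code) of driving `SelfAttention` with its hand-rolled kv-cache: one prefill pass over the prompt with a causal mask, then one-token decode steps that attend over the cache with no mask. The `SimpleNamespace` stands in for the real config object, carrying only the fields this module reads; in the full model the `kv_cache_enabled` flag is flipped by the model-level `enable_kv_cache()` (see inference.py) rather than by hand.

# --- illustrative sketch, not part of fam/llm/layers/attn.py ---
from types import SimpleNamespace

import torch

from fam.llm.layers.attn import SelfAttention

cfg = SimpleNamespace(n_embd=64, n_head=4, bias=False, dropout=0.0, causal=True, attn_kernel_type="torch_attn")
attn = SelfAttention(cfg).eval()

attn.kv_cache_enabled = True  # normally set via the model's enable_kv_cache()
attn.empty_kv_cache(batch_size=1, kv_cache_maxlen=32, dtype=torch.float32)

prompt = torch.randn(1, 8, cfg.n_embd)
y = attn(prompt)                # prefill: is_causal mask over all 8 positions
step = torch.randn(1, 1, cfg.n_embd)
y = attn(step)                  # decode: one query attends to the 9 cached positions
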
fam/llm/layers/combined.py
ADDED
@@ -0,0 +1,52 @@
import torch.nn as nn

from fam.llm.layers.attn import SelfAttention
from fam.llm.layers.layers import MLP, LayerNorm, RMSNorm


class Block(nn.Module):
    """
    Block class represents a single block in the model.

    Args:
        config (object): Configuration object containing parameters for the block.

    Attributes:
        ln_1 (object): Layer normalization for the attention layer.
        ln_2 (object): Layer normalization for the feed-forward layer.
        attn (object): Self-attention layer.
        mlp (object): Multi-layer perceptron layer.

    Methods:
        forward(x): Performs forward pass through the block.
    """

    def __init__(self, config):
        super().__init__()
        if config.norm_type == "rmsnorm":
            if config.rmsnorm_eps is None:
                raise Exception("RMSNorm requires rmsnorm_eps to be set")
            self.ln_1 = RMSNorm(config.n_embd, eps=config.rmsnorm_eps)  # attn norm
            self.ln_2 = RMSNorm(config.n_embd, eps=config.rmsnorm_eps)  # ffn norm
        elif config.norm_type == "layernorm":
            self.ln_1 = LayerNorm(config.n_embd, bias=config.bias)  # attn norm
            self.ln_2 = LayerNorm(config.n_embd, bias=config.bias)  # ffn norm
        else:
            raise Exception(f"Unknown norm type: {config.norm_type}")
        self.attn = SelfAttention(config)

        self.mlp = MLP(config)

    def forward(self, x):
        """
        Performs forward pass through the block.

        Args:
            x (tensor): Input tensor.

        Returns:
            tensor: Output tensor after passing through the block.
        """
        x = x + self.attn(self.ln_1(x))
        x = x + self.mlp(self.ln_2(x))
        return x
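
Since `Block` only touches a handful of config fields, it can be smoke-tested standalone; the residual wiring is the usual pre-norm ordering, `x + attn(ln(x))` then `x + mlp(ln(x))`. A sketch (not repo code) with a `SimpleNamespace` standing in for the real config and arbitrary illustrative field values:

# --- illustrative sketch, not part of fam/llm/layers/combined.py ---
from types import SimpleNamespace

import torch

from fam.llm.layers.combined import Block

cfg = SimpleNamespace(
    n_embd=64,
    n_head=4,
    bias=False,
    dropout=0.0,
    causal=True,
    attn_kernel_type="torch_attn",
    norm_type="rmsnorm",
    rmsnorm_eps=1e-5,
    nonlinearity_type="swiglu",
    swiglu_multiple_of=32,
)
x = torch.randn(2, 10, cfg.n_embd)
out = Block(cfg)(x)          # shape is preserved by the two residual branches
assert out.shape == x.shape
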
fam/llm/layers/layers.py
ADDED
@@ -0,0 +1,72 @@
import math

import torch
import torch.nn as nn
from torch.nn import functional as F


class LayerNorm(nn.Module):
    """LayerNorm but with an optional bias. PyTorch doesn't support simply bias=False"""

    def __init__(self, ndim, bias):
        super().__init__()
        self.weight = nn.Parameter(torch.ones(ndim))
        self.bias = nn.Parameter(torch.zeros(ndim)) if bias else None

    def forward(self, input):
        return F.layer_norm(input, self.weight.shape, self.weight, self.bias, 1e-5)


class RMSNorm(torch.nn.Module):
    def __init__(self, ndim: int, eps: float):
        super().__init__()
        self.eps = eps
        self.weight = nn.Parameter(torch.ones(ndim))

    def _norm(self, x):
        return x * torch.rsqrt(x.pow(2).mean(-1, keepdim=True) + self.eps)

    def forward(self, x):
        return self._norm(x) * self.weight


class SwiGLU(nn.Module):
    def __init__(self, in_dim, out_dim, bias) -> None:
        super().__init__()
        self.w1 = nn.Linear(in_dim, out_dim, bias=bias)
        self.w3 = nn.Linear(in_dim, out_dim, bias=bias)

    def forward(self, x):
        return F.silu(self.w1(x)) * self.w3(x)


class MLP(nn.Module):
    def __init__(self, config):
        super().__init__()
        self.non_linearity = config.nonlinearity_type
        hidden_dim = 4 * config.n_embd
        if config.nonlinearity_type == "gelu":
            self.c_fc = nn.Linear(config.n_embd, hidden_dim, bias=config.bias)
            self.gelu = nn.GELU()
            self.c_proj = nn.Linear(hidden_dim, config.n_embd, bias=config.bias)
        elif config.nonlinearity_type == "swiglu":
            if config.swiglu_multiple_of is None:
                raise Exception("SwiGLU requires swiglu_multiple_of to be set")
            hidden_dim = int(2 * hidden_dim / 3)
            hidden_dim = config.swiglu_multiple_of * math.ceil(hidden_dim / config.swiglu_multiple_of)
            # set name to `c_proj` so that the right initialisation gets applied to it in GPT.__init__()
            self.swiglu = SwiGLU(config.n_embd, hidden_dim, bias=config.bias)
            self.c_proj = nn.Linear(hidden_dim, config.n_embd, bias=config.bias)
        else:
            raise Exception(f"Unknown nonlinearity type: {config.nonlinearity_type}")
        self.dropout = nn.Dropout(config.dropout)

    def forward(self, x):
        if self.non_linearity == "gelu":
            x = self.c_fc(x)
            x = self.gelu(x)
        elif self.non_linearity == "swiglu":
            x = self.swiglu(x)
        x = self.c_proj(x)
        x = self.dropout(x)
        return x
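
The `swiglu` branch of `MLP` shrinks the hidden width by 2/3 before rounding up, keeping the three-matrix SwiGLU at roughly the parameter count of the two-matrix GELU MLP, and the rounding to `swiglu_multiple_of` keeps the matrix shapes hardware-friendly. A worked example of that sizing arithmetic, with illustrative values not read from any checkpoint:

# --- illustrative arithmetic, not part of fam/llm/layers/layers.py ---
import math

n_embd, multiple_of = 2048, 256   # hypothetical config values
hidden = 4 * n_embd               # 8192: the gelu MLP width
hidden = int(2 * hidden / 3)      # 5461: shrink so 3 matrices cost ~the same as 2
hidden = multiple_of * math.ceil(hidden / multiple_of)
print(hidden)                     # 5632
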
fam/llm/mixins/__init__.py
ADDED
@@ -0,0 +1,2 @@
from fam.llm.mixins.causal import CausalInferenceMixin
from fam.llm.mixins.non_causal import NonCausalInferenceMixin
fam/llm/mixins/causal.py
ADDED
@@ -0,0 +1,511 @@
from typing import Optional, Tuple

import numpy as np
import torch
import tqdm
from torch.nn import functional as F


def top_p_sample(prob_dist: torch.Tensor, top_p: float):
    sorted_probs, sorted_indices = torch.sort(prob_dist, descending=True, dim=-1)
    cum_sum_probs = torch.cumsum(sorted_probs, dim=-1)  # (b, vocab_size)

    sorted_indices_to_remove = cum_sum_probs > top_p

    # shift the indices to the right to keep also the first token above the threshold
    sorted_indices_to_remove[:, 1:] = sorted_indices_to_remove[:, :-1].clone()
    sorted_indices_to_remove[:, 0] = 0
    sorted_indices_to_remove = sorted_indices_to_remove.bool()

    # replace probs to be removed with 0 in the sorted_probs
    sorted_probs[sorted_indices_to_remove] = 0

    # reverse the sorting process
    reversed_indices = torch.argsort(sorted_indices)
    prob_dist = torch.gather(sorted_probs, -1, reversed_indices)

    # normalize
    prob_dist = prob_dist / prob_dist.sum(dim=-1, keepdim=True)

    return prob_dist
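
A toy check of `top_p_sample` (not repo code, meant to be run alongside the function above): with `top_p=0.8` the tail of the distribution is zeroed, the first token that crosses the cumulative threshold is kept by the shift trick, and the survivors are renormalised.

# --- illustrative sketch, not part of fam/llm/mixins/causal.py ---
import torch

probs = torch.tensor([[0.4, 0.3, 0.2, 0.06, 0.04]])
print(top_p_sample(probs, top_p=0.8))
# tensor([[0.4444, 0.3333, 0.2222, 0.0000, 0.0000]])
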

class CausalInferenceMixin:
    """
    Mixin class for performing inference in a causal language model.

    This mixin provides methods for predicting the next token in a sequence, sampling from the model,
    and applying token prediction masks.

    Attributes:
        None

    Methods:
        _sample_next_token: Predicts the next token in the sequence.
        _create_token_pred_mask: Creates a token prediction mask based on sequence lengths.
        _apply_token_pred_mask: Applies a token prediction mask to the next token predictions.
        _sample_batch: Samples a batch of tokens from the model.
        _sort_for_batching: Sorts the input sequences for efficient batching.
        _causal_sample: Generates a sequence of tokens using causal sampling.
    """

    @torch.no_grad()
    def _sample_next_token(
        self,
        *,
        idx: torch.Tensor,
        speaker_embs: Optional[torch.Tensor],
        temperature: float,
        top_k: Optional[int],
        top_p: Optional[float],
        guidance_scale: Optional[float],
    ) -> torch.Tensor:
        """
        Predict the next token in the sequence.

        Args:
            idx (torch.Tensor): Initial sequence indices of shape (batch, num_hierarchies, time).
            speaker_embs (Optional[torch.Tensor]): Speaker embeddings. Set to `None` if using an unconditional model.
            temperature (float): Sampling temperature.
            top_k (Optional[int]): Top-k filtering threshold. Set to `None` to disable top-k filtering.
            top_p (Optional[float]): Nucleus sampling threshold. Set to `None` to disable it.
            guidance_scale (Optional[float]): Scale factor for the guidance loss. Set to `None` to disable guidance.

        Returns:
            torch.Tensor: Next index in the sequence after sampling. Shape: (batch, num_hierarchies).
        """
        if top_k is not None and top_p is not None:
            raise ValueError("Only one of top_k and top_p can be set")

        # if the sequence context is growing too long we must crop it at block_size
        idx_cond = idx if idx.size(-1) <= self.config.block_size else idx[:, :, -self.config.block_size :]

        # forward the model to get the logits for the index in the sequence
        list_logits, _ = self(
            idx_cond, speaker_embs=speaker_embs
        )  # list with len num_hierarchies of (b,1,vocab_size) tensors

        if guidance_scale is not None:
            assert idx_cond.shape[0] % 2 == 0
            assert list_logits[0].shape[0] % 2 == 0

            for i, logits in enumerate(list_logits):
                logits_cond, logits_uncond = logits.split(logits.shape[0] // 2, dim=0)
                list_logits[i] = (guidance_scale) * logits_cond + (1 - guidance_scale) * logits_uncond

            assert list_logits[0].shape[0] == idx_cond.shape[0] // 2

        # pluck the logits at the final step and scale by desired temperature
        list_logits = [
            logits[:, -1, :] / temperature for logits in list_logits
        ]  # list with len num_hierarchies of (b,vocab_size) tensors

        # optionally crop the logits to only the top k options
        if top_k is not None:
            for i in range(len(list_logits)):
                logits = list_logits[i]
                v, _ = torch.topk(
                    logits, min(top_k, logits.size(-1))
                )  # returns a descending sorted list of values and indices of top_k values
                logits[logits < v[:, [-1]]] = -float("Inf")  # set all logits below the smallest top_k value to -Inf
                list_logits[i] = logits

        # apply softmax to convert logits to (normalized) probabilities
        probs = [
            F.softmax(logits, dim=-1) for logits in list_logits
        ]  # list of len num_hierarchies of (b,vocab_size) tensors

        if top_p is not None:
            for i in range(len(probs)):
                probs[i] = top_p_sample(probs[i], top_p)

        # sample from the distribution
        idx_next = [
            torch.multinomial(prob, num_samples=1) for prob in probs
        ]  # list of len num_hierarchies of (b,1) tensors
        idx_next = torch.cat(idx_next, dim=-1)  # (b, num_hierarchies) tensor

        return idx_next  # (b, num_hierarchies) tensor
+
|
135 |
+
@torch.no_grad()
|
136 |
+
def _create_token_pred_mask(self, idx: torch.Tensor, seq_lens: list[int]) -> torch.Tensor:
|
137 |
+
"""
|
138 |
+
Creates a token prediction mask based on sequence lengths.
|
139 |
+
|
140 |
+
Args:
|
141 |
+
idx (torch.Tensor): Initial sequence indices of shape (batch, num_hierarchies, time).
|
142 |
+
seq_lens (list[int]): List of sequence lengths for each sequence in idx.
|
143 |
+
|
144 |
+
Returns:
|
145 |
+
torch.Tensor: Token prediction mask of shape (batch, time).
|
146 |
+
"""
|
147 |
+
token_pred_mask = torch.zeros((idx.shape[0], idx.shape[-1]), dtype=torch.bool, device=idx.device)
|
148 |
+
for i in range(len(seq_lens)):
|
149 |
+
token_pred_mask[i, : seq_lens[i]] = True
|
150 |
+
|
151 |
+
assert (token_pred_mask[:, : min(seq_lens)] == 1).all()
|
152 |
+
|
153 |
+
return token_pred_mask
|
154 |
+
|
155 |
+
@torch.no_grad()
|
156 |
+
def _apply_token_pred_mask(
|
157 |
+
self, *, idx_next: torch.Tensor, orig_input_at_t: torch.Tensor, token_pred_mask_at_t: torch.Tensor
|
158 |
+
) -> torch.Tensor:
|
159 |
+
"""
|
160 |
+
Applies a token prediction mask to the next token predictions.
|
161 |
+
|
162 |
+
Args:
|
163 |
+
idx_next (torch.Tensor): Next token predictions of shape (batch, num_hierarchies).
|
164 |
+
orig_input_at_t (torch.Tensor): Original input at time step t of shape (batch, num_hierarchies).
|
165 |
+
token_pred_mask_at_t (torch.Tensor): Token prediction mask at time step t of shape (batch, 1).
|
166 |
+
|
167 |
+
Returns:
|
168 |
+
torch.Tensor: Updated next token predictions after applying the token prediction mask.
|
169 |
+
"""
|
170 |
+
idx_next = idx_next * (~token_pred_mask_at_t) + orig_input_at_t * token_pred_mask_at_t
|
171 |
+
|
172 |
+
return idx_next
|
173 |
+
|
    @torch.no_grad()
    def _sample_batch(
        self,
        *,
        idx: torch.Tensor,
        max_new_tokens: int,
        seq_lens: list[int],
        temperature: float,
        top_k: Optional[int],
        top_p: Optional[float],
        speaker_embs: Optional[torch.Tensor],
        guidance_scale: Optional[float],
    ):
        """
        Samples a batch of tokens from the model.

        Args:
            idx (torch.Tensor): Initial sequence indices of shape (batch, num_hierarchies, time).
            max_new_tokens (int): Maximum number of NEW tokens to generate (in addition to largest sequence in idx).
            seq_lens (list[int]): List of sequence lengths for each sequence in idx.
            temperature (float): Sampling temperature.
            top_k (Optional[int]): Top-k filtering threshold. Set to `None` to disable top-k filtering.
            top_p (Optional[float]): Nucleus sampling threshold. Set to `None` to disable it.
            speaker_embs (Optional[torch.Tensor]): Speaker embeddings. Set to `None` if using an unconditional model.
            guidance_scale (Optional[float]): Scale factor for the guidance loss. Set to `None` to disable guidance.

        Returns:
            torch.Tensor: Generated sequence indices of shape (batch, num_hierarchies, time).
        """
        assert max(seq_lens) <= idx.shape[-1]
        token_pred_mask = self._create_token_pred_mask(idx, seq_lens)
        input = torch.clone(idx)

        min_seq_lens = min(seq_lens)
        idx = idx[:, :, :min_seq_lens]

        if guidance_scale is not None:
            if speaker_embs is None:
                raise Exception("Guidance is only supported for conditional models")

            # create speaker embeddings equivalent to the batch size, filling with None
            # for second half to do unconditional generation.
            speaker_embs = list(speaker_embs) + [None] * (speaker_embs.shape[0])

        for timestep in tqdm.tqdm(range(min_seq_lens, min_seq_lens + max_new_tokens), desc="tokens: "):
            if (self.kv_cache_enabled is True) and (timestep > min_seq_lens):
                idx_input = idx[:, :, -1:]
            else:
                idx_input = idx

            if guidance_scale is not None:
                # TODO: fix: will cause a problem with kv-caching as it's not expecting larger batch-size.
                if timestep == min_seq_lens:
                    print("[hack!!!!] Guidance is on, so we're doubling batch size!")

                # replicate idx in the batch dimension
                idx_input = (
                    idx_input.unsqueeze(0).repeat(2, 1, 1, 1).reshape(-1, idx_input.shape[1], idx_input.shape[2])
                )

                # sanity checks
                assert idx_input.shape[0] % 2 == 0

            idx_next = self._sample_next_token(
                idx=idx_input,
                speaker_embs=speaker_embs,
                temperature=temperature,
                top_k=top_k,
                top_p=top_p,
                guidance_scale=guidance_scale,
            )  # (b, num_hierarchies)

            assert idx_next.shape[0] == idx.shape[0]

            if timestep < token_pred_mask.shape[-1]:
                idx_next = self._apply_token_pred_mask(
                    idx_next=idx_next,
                    orig_input_at_t=input[:, :, timestep],
                    token_pred_mask_at_t=token_pred_mask[:, [timestep]],
                )

            idx_next = idx_next.unsqueeze(-1)  # (b, num_hierarchies, T=1) tensor
            # append sampled index to the running sequence and continue
            idx = torch.cat((idx, idx_next), dim=2)

        return idx
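    # Editor's note (a hedged sketch, not part of the original file): the batch
    # doubling above feeds conditional rows in the first half and unconditional rows
    # in the second half, so `_sample_next_token` can blend them as
    # `guidance_scale * cond + (1 - guidance_scale) * uncond`. With toy numbers:
    #
    #   >>> cond, uncond, g = 1.0, -2.0, 3.0
    #   >>> g * cond + (1 - g) * uncond
    #   7.0
    #
    # i.e. a guidance_scale > 1 pushes the logits away from the unconditional prediction.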
    @torch.no_grad()
    def _sort_for_batching(
        self,
        *,
        idx: torch.Tensor,
        seq_lens: list[int],
        speaker_embs: Optional[torch.Tensor],
        batch_size: int,
        max_new_tokens: int,
    ) -> Tuple[list[int], list[int], torch.Tensor, list[int], Optional[torch.Tensor], int]:
        """
        Sorts the input sequences for efficient batching.

        Args:
            idx (torch.Tensor): Initial sequence indices of shape (batch, num_hierarchies, time).
            seq_lens (list[int]): List of sequence lengths for each sequence in idx.
            speaker_embs (Optional[torch.Tensor]): Speaker embeddings. Set to `None` if using an unconditional model.
            batch_size (int): Batch size for sampling. idx is split into batches of this size for sampling.
            max_new_tokens (int): Maximum number of NEW tokens to generate (in addition to largest sequence in idx).

        Returns:
            Tuple[list[int], list[int], torch.Tensor, list[int], Optional[torch.Tensor], int]:
                - sorted_indices (list[int]): List of indices of the input sequences that transform it into sorted order.
                - invert_sorted_indices (list[int]): List of indices to invert the sorted sequences back to the original order.
                - idx (torch.Tensor): Input sequence indices in sorted order.
                - seq_lens (list[int]): Sequence lengths in sorted order.
                - speaker_embs (Optional[torch.Tensor]): Speaker embeddings in sorted order.
                - max_token_len (int): Effective maximum number of tokens to generate.
        """
        assert len(seq_lens) == idx.shape[0]
        assert max(seq_lens) <= idx.shape[-1]

        sorted_indices = np.argsort(seq_lens)
        inverted_sorted_indices = np.zeros(len(seq_lens), dtype=np.int32)
        inverted_sorted_indices[sorted_indices] = np.arange(len(seq_lens), dtype=np.int32)

        idx = idx[sorted_indices]
        seq_lens = [seq_lens[i] for i in sorted_indices]
        speaker_embs = speaker_embs[sorted_indices] if speaker_embs is not None else None
        max_token_len = 0

        # figure out effective max_tokens to generate
        for start_index in range(0, len(seq_lens), batch_size):
            end_index = min(start_index + batch_size, len(seq_lens))
            batch_seq_lens = seq_lens[start_index:end_index]
            # random heuristic...
            # TODO: fix!
            max_token_len = max(max_token_len, min(batch_seq_lens) + max_new_tokens)

        return sorted_indices, inverted_sorted_indices, idx, seq_lens, speaker_embs, max_token_len
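    # Editor's illustration (a minimal sketch, not part of the original file) of the
    # argsort / inverse-permutation pattern used above to undo the sort later:
    #
    #   >>> import numpy as np
    #   >>> seq_lens = [5, 2, 9, 1]
    #   >>> order = np.argsort(seq_lens)               # array([3, 1, 0, 2])
    #   >>> inverse = np.zeros(4, dtype=np.int32)
    #   >>> inverse[order] = np.arange(4, dtype=np.int32)
    #   >>> np.array(sorted(seq_lens))[inverse]        # restores the original order
    #   array([5, 2, 9, 1])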
    @torch.no_grad()
    def _causal_sample(
        self,
        *,
        idx: torch.Tensor,
        max_new_tokens: int,
        seq_lens: list[int],
        temperature: float,
        top_k: Optional[int],
        top_p: Optional[float],
        speaker_embs: Optional[torch.Tensor],
        batch_size: int,
        guidance_scale: Optional[float] = None,
    ) -> torch.Tensor:
        """
        Generates a sequence of tokens using causal sampling.

        Args:
            idx (torch.Tensor): Initial sequence indices of shape (batch, num_hierarchies, time).
            max_new_tokens (int): Maximum number of NEW tokens to generate (in addition to largest sequence in idx).
            seq_lens (list[int]): List of sequence lengths for each sequence in idx.
            temperature (float): Sampling temperature.
            top_k (Optional[int]): Top-k filtering threshold. Set to `None` to disable top-k filtering.
            top_p (Optional[float]): Nucleus sampling threshold. Set to `None` to disable it.
            speaker_embs (Optional[torch.Tensor]): Speaker embeddings. Set to `None` if using an unconditional model.
            batch_size (int): Batch size for sampling. idx is split into batches of this size for sampling.
            guidance_scale (Optional[float]): Scale factor for the guidance loss. Set to `None` to disable guidance.

        Returns:
            torch.Tensor: Generated sequence indices of shape (batch, num_hierarchies, time).
        """
        (
            _,
            invert_sorted_indices,
            idx,
            seq_lens,
            speaker_embs,
            max_token_len,
        ) = self._sort_for_batching(
            idx=idx, seq_lens=seq_lens, speaker_embs=speaker_embs, batch_size=batch_size, max_new_tokens=max_new_tokens
        )

        return_idx = torch.zeros((len(seq_lens), idx.size(1), max_token_len), dtype=torch.long, device=idx.device)

        for start_index in tqdm.tqdm(range(0, len(seq_lens), batch_size), desc="batch: "):
            end_index = min(start_index + batch_size, len(seq_lens))

            kv_batch_size = end_index - start_index
            if guidance_scale is not None:
                kv_batch_size = 2 * kv_batch_size

            if self.kv_cache_enabled:
                print("!!!! USING KV-CACHING ASSUMED TORCH.BFLOAT16")
                self.empty_kv_cache(
                    batch_size=kv_batch_size,
                    kv_cache_maxlen=self.config.block_size,
                    dtype=torch.bfloat16,
                )

            batch_seq_lens = seq_lens[start_index:end_index]
            batch_max_new_tokens = max_token_len - min(batch_seq_lens)

            batch_idx = idx[start_index:end_index]
            batch_speaker_embs = speaker_embs[start_index:end_index] if speaker_embs is not None else None

            batch_idx = self._sample_batch(
                idx=batch_idx,
                max_new_tokens=batch_max_new_tokens,
                seq_lens=batch_seq_lens,
                temperature=temperature,
                top_k=top_k,
                top_p=top_p,
                speaker_embs=batch_speaker_embs,
                guidance_scale=guidance_scale,
            )
            return_idx[start_index:end_index] = batch_idx

        return return_idx[invert_sorted_indices]
    def empty_kv_cache(self, *, batch_size: int, kv_cache_maxlen: int, dtype: torch.dtype):
        """
        Empties key-value (KV) cache for causal attention.

        Args:
            batch_size (int): The batch size.
            kv_cache_maxlen (int): The maximum length of the KV cache.
            dtype (torch.dtype): The data type of the KV cache.

        Raises:
            Exception: If KV cache is enabled for non-causal attention.
        """
        if self.kv_cache_enabled is False:
            raise Exception("KV cache is not enabled")
        if self.config.causal is False:
            raise Exception("KV cache is not supported for non-causal attention")

        self.kv_pos = 0
        for block in self.transformer.h:
            block.attn.empty_kv_cache(batch_size=batch_size, kv_cache_maxlen=kv_cache_maxlen, dtype=dtype)

    def enable_kv_cache(self):
        """
        Enables key-value (KV) cache for causal attention.

        Raises:
            Exception: If KV cache is enabled for non-causal attention.
        """
        if self.config.causal is False:
            raise Exception("KV cache is not supported for non-causal attention")

        self.kv_cache_enabled = True
        for block in self.transformer.h:
            block.attn.kv_cache_enabled = True

    def disable_kv_cache(self):
        """
        Disables the key-value cache for the transformer and all its blocks.
        """
        self.kv_cache_enabled = False
        for block in self.transformer.h:
            block.attn.kv_cache_enabled = False
            block.attn.kv_cache = None
            block.attn.kv_cache_first_empty_index = 0
    @torch.no_grad()
    def _slow_causal_sampling_loop(
        self,
        idx: torch.Tensor,
        max_new_tokens: int,
        temperature: float = 1.0,
        top_k: Optional[int] = None,
        top_p: Optional[float] = None,
        speaker_embs: Optional[torch.Tensor] = None,
        guidance_scale: Optional[float] = None,
    ):
        """
        Old non-batched version of causal sampling. Kept for testing / reference.

        Take a conditioning sequence of indices idx (LongTensor of shape (b,n_head,t)) and complete
        the sequence max_new_tokens times, feeding the predictions back into the model each time.
        Most likely you'll want to make sure to be in model.eval() mode of operation for this.
        """
        assert idx.dim() == 3, "idx must be a batch of sequences of hierarchical tokens"
        assert idx.size(0) == 1, "can only do one sequence at a time for now"
        assert top_p is None, "nucleus sampling not supported yet with _slow_causal_sampling_loop"

        if self.config.causal is not True:
            raise Exception("Causal sampling is only supported for causal models")

        if self.kv_cache_enabled:
            print("!!!! USING KV-CACHING ASSUMED TORCH.BFLOAT16")
            self.empty_kv_cache(
                batch_size=1,
                kv_cache_maxlen=self.config.block_size,
                dtype=torch.bfloat16,
            )

        for i in range(max_new_tokens):
            # if the sequence context is growing too long we must crop it at block_size
            # (crop along the time dimension of the (b, c, t) tensor)
            idx_cond = idx if idx.size(-1) <= self.config.block_size else idx[:, :, -self.config.block_size :]

            if self.kv_cache_enabled:
                if i > 0:
                    idx_cond = idx_cond[:, :, -1:]

            # forward the model to get the logits for the index in the sequence
            list_logits, _ = self(idx_cond, speaker_embs=speaker_embs)

            if guidance_scale is not None:
                # we've already checked that kv-caching is not switched on
                # so this should be ok.
                list_logits_uncond, _ = self(idx_cond, speaker_embs=None)
                list_logits = [
                    (guidance_scale) * logits + (1 - guidance_scale) * logits_uncond
                    for logits, logits_uncond in zip(list_logits, list_logits_uncond)
                ]

            # pluck the logits at the final step and scale by desired temperature
            list_logits = [logits[:, -1, :] / temperature for logits in list_logits]

            # optionally crop the logits to only the top k options
            if top_k is not None:
                for j in range(len(list_logits)):
                    logits = list_logits[j]
                    v, _ = torch.topk(logits, min(top_k, logits.size(-1)))
                    logits[logits < v[:, [-1]]] = -float("Inf")
                    list_logits[j] = logits

            # apply softmax to convert logits to (normalized) probabilities
            probs = [F.softmax(logits, dim=-1) for logits in list_logits]
            # sample from the distribution
            idx_next = torch.tensor(
                [torch.multinomial(prob, num_samples=1) for prob in probs], device=idx.device
            )  # (c, 1)
            # append sampled index to the running sequence and continue
            idx = torch.cat((idx, idx_next.unsqueeze(0).unsqueeze(-1)), dim=2)

        return idx
fam/llm/mixins/non_causal.py
ADDED
@@ -0,0 +1,67 @@
from typing import Optional

import torch
from torch.nn import functional as F


class NonCausalInferenceMixin:
    """
    Mixin class for non-causal inference in a language model.

    This class provides methods for performing non-causal sampling using a language model.
    """

    @torch.no_grad()
    def _non_causal_sample(
        self, *, idx: torch.Tensor, speaker_embs: Optional[torch.Tensor], temperature: float, top_k: int
    ):
        """
        Perform non-causal sampling.

        Args:
            idx (torch.Tensor): Input tensor of shape (batch_size, num_in_hierarchies, sequence_length).
            speaker_embs (Optional[torch.Tensor]): Speaker embeddings tensor of shape (batch_size, embedding_size).
            temperature (float): Temperature parameter for scaling the logits.
            top_k (int): Number of top options to consider.

        Returns:
            torch.Tensor: Sampled output tensor of shape (batch_size, num_out_hierarchies, sequence_length).
        """
        b, c, t = idx.size()
        assert t == self.config.block_size, f"input size {t} != config.block_size {self.config.block_size}"
        # forward the model to get the logits for the index in the sequence
        list_logits, _ = self(idx, speaker_embs=speaker_embs)  # c x (b, t, vocab_size)

        # scale by desired temperature
        list_logits = [logits / temperature for logits in list_logits]  # c x (b, t, vocab_size)

        # optionally crop the logits to only the top k options
        if top_k is not None:
            for i in range(len(list_logits)):
                logits = list_logits[i]  # (b, t, vocab_size)

                v, _ = torch.topk(logits, min(top_k, logits.size(-1)))  # (b, t, top_k)
                logits[logits < v[:, :, [-1]]] = -float("Inf")
                list_logits[i] = logits  # (b, t, vocab_size)
                assert logits.shape[0] == b and logits.shape[1] == t

        # apply softmax to convert logits to (normalized) probabilities
        # TODO: check shapes here!
        probs = [F.softmax(logits, dim=-1) for logits in list_logits]  # c x (b, t, vocab_size)
        assert probs[0].shape[0] == b and probs[0].shape[1] == t

        outs = []
        for b_prob in probs:  # c x (b, t, vocab_size) -> (b, t, vocab_size)
            out = [
                torch.multinomial(prob, num_samples=1).transpose(0, 1).unsqueeze(0) for prob in b_prob
            ]  # b x (t, vocab_size) -> b x (t, 1) -> b x (1, t) -> b x (1, 1, t)
            assert len(out) == b and out[0].shape[0] == 1 and out[0].shape[1] == 1 and out[0].shape[2] == t
            out = torch.cat(out, dim=0)  # (b, 1, t)
            assert out.shape[0] == b and out.shape[1] == 1 and out.shape[2] == t
            outs.append(out)

        out = torch.cat(outs, dim=1)  # (b, c, t)
        assert out.shape[0] == b and out.shape[2] == t

        return out
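
The reshaping in `_non_causal_sample` above is easiest to see in isolation. The
snippet below is an editor's sketch (not part of the repository): it samples one
token per timestep from a single hierarchy's probabilities, mirroring the
(t, vocab) -> (t, 1) -> (1, t) -> (1, 1, t) comment trail in the file.

import torch
from torch.nn import functional as F

b, t, vocab = 2, 4, 10
probs = F.softmax(torch.randn(b, t, vocab), dim=-1)  # one hierarchy's probabilities

# per batch element: each of the t rows is a categorical distribution over vocab
out = [torch.multinomial(p, num_samples=1).transpose(0, 1).unsqueeze(0) for p in probs]
out = torch.cat(out, dim=0)  # (b, 1, t)
print(out.shape)  # torch.Size([2, 1, 4])
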
fam/llm/model.py
ADDED
@@ -0,0 +1,524 @@
import inspect
import math
from dataclasses import dataclass, field
from typing import Literal, Optional, Union

import torch
import torch.nn as nn
import tqdm
from einops import rearrange
from torch.nn import functional as F

from fam.llm.layers import Block, LayerNorm, RMSNorm
from fam.llm.mixins import CausalInferenceMixin, NonCausalInferenceMixin

END_OF_TEXT_TOKEN = 1537


def _select_spkemb(spkemb, mask):
    _, examples, _ = spkemb.shape
    mask = torch.nn.functional.one_hot(mask.long(), num_classes=examples).to(spkemb)  # shape: (batch, time, examples)
    spkemb = spkemb.transpose(1, 2)  # b ex c -> b c ex
    mask = mask.transpose(1, 2)  # b t ex -> b ex t
    return torch.bmm(spkemb, mask).transpose(1, 2)  # b c t -> b t c
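
# Editor's illustration (a minimal sketch, not part of the original file) of
# _select_spkemb: the one-hot + bmm acts as a differentiable gather that picks,
# per timestep, one of the packed speaker embeddings.
#
#   >>> import torch
#   >>> spkemb = torch.tensor([[[1.0, 1.0], [2.0, 2.0]]])  # (batch=1, examples=2, c=2)
#   >>> mask = torch.tensor([[0, 0, 1]])                   # which example each timestep uses
#   >>> _select_spkemb(spkemb, mask)
#   tensor([[[1., 1.],
#            [1., 1.],
#            [2., 2.]]])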
26 |
+
|
27 |
+
@dataclass
|
28 |
+
class GPTConfig:
|
29 |
+
block_size: int = 1024
|
30 |
+
vocab_sizes: list = field(default_factory=list)
|
31 |
+
target_vocab_sizes: Optional[list] = None
|
32 |
+
n_layer: int = 12
|
33 |
+
n_head: int = 12
|
34 |
+
n_embd: int = 768
|
35 |
+
dropout: float = 0.0
|
36 |
+
spkemb_dropout: float = 0.0
|
37 |
+
bias: bool = True # True: bias in Linears and LayerNorms, like GPT-2. False: a bit better and faster
|
38 |
+
causal: bool = (
|
39 |
+
True # auto-regressive or not, i.e. whether to have attention mask that prevents attending to future tokens
|
40 |
+
)
|
41 |
+
spk_emb_on_text: bool = True # whether to add speaker embedding conditioning to text tokens or not
|
42 |
+
norm_type: str = "layernorm" # "rmsnorm" or "layernorm
|
43 |
+
rmsnorm_eps: Optional[float] = None # only used for rmsnorm
|
44 |
+
nonlinearity_type: str = "gelu" # "gelu" or "swiglu"
|
45 |
+
swiglu_multiple_of: Optional[int] = None # MLP hidden layer (using SwiGLU) will be multiple of this
|
46 |
+
attn_kernel_type: Literal["torch_attn"] = "torch_attn"
|
47 |
+
#Literal["fa2", "torch_attn", "hand"] = "fa2"
|
48 |
+
kv_cache_enabled: bool = False # whether to use key-value cache for attention
|
49 |
+
|
50 |
+
|
51 |
+
def _check_speaker_emb_dims(
|
52 |
+
speaker_embs: Union[list, torch.Tensor], expected_speaker_emb_dim: int, expected_batch_size: int
|
53 |
+
) -> Union[torch.Tensor, list]:
|
54 |
+
"""
|
55 |
+
Checks that the speaker embedding dimensions are correct, and reshapes them if necessary.
|
56 |
+
"""
|
57 |
+
if type(speaker_embs) == list:
|
58 |
+
b_se = len(speaker_embs)
|
59 |
+
for i, s in enumerate(speaker_embs):
|
60 |
+
if s is not None:
|
61 |
+
emb_dim = s.shape[-1]
|
62 |
+
if s.ndim == 1:
|
63 |
+
speaker_embs[i] = speaker_embs[i].unsqueeze(0)
|
64 |
+
else:
|
65 |
+
if speaker_embs.ndim == 2:
|
66 |
+
# if we have a single speaker embedding for the whole sequence,
|
67 |
+
# add a dummy dimension for backwards compatibility
|
68 |
+
speaker_embs = speaker_embs[:, None, :]
|
69 |
+
|
70 |
+
# num_examples is the number of utterances packed into this sequence
|
71 |
+
b_se, num_examples, emb_dim = speaker_embs.size()
|
72 |
+
|
73 |
+
assert b_se == expected_batch_size, f"Batch size mismatch: {b_se} != {expected_batch_size}"
|
74 |
+
assert (
|
75 |
+
emb_dim == expected_speaker_emb_dim
|
76 |
+
), f"Speaker embedding dimension mismatch: {emb_dim} != {expected_speaker_emb_dim}"
|
77 |
+
|
78 |
+
return speaker_embs
|
79 |
+
|
80 |
+
|
81 |
+
class GPT(nn.Module, NonCausalInferenceMixin, CausalInferenceMixin):
|
82 |
+
def __init__(self, config: GPTConfig, speaker_emb_dim: Optional[int] = None):
|
83 |
+
"""
|
84 |
+
Initialize the GPT model.
|
85 |
+
|
86 |
+
Args:
|
87 |
+
config (GPTConfig): Configuration object for the model.
|
88 |
+
speaker_emb_dim (Optional[int]): Dimension of the speaker embedding. Default is None.
|
89 |
+
"""
|
90 |
+
super().__init__()
|
91 |
+
assert config.vocab_sizes is not None
|
92 |
+
assert config.block_size is not None
|
93 |
+
self.config = config
|
94 |
+
|
95 |
+
self.kv_cache_enabled = False # disabled by default
|
96 |
+
self.kv_pos = 0
|
97 |
+
|
98 |
+
self.speaker_emb_dim = speaker_emb_dim
|
99 |
+
self.spk_emb_on_text = config.spk_emb_on_text
|
100 |
+
if self.config.causal is True and self.spk_emb_on_text is False:
|
101 |
+
print("!!!!!!!!!!!!!!!!!!")
|
102 |
+
print(
|
103 |
+
f"!!!!!!!! Using DEFAULT of {END_OF_TEXT_TOKEN} as end of text token to find speaker cond masking!! You likely need to change this."
|
104 |
+
)
|
105 |
+
print("!!!!!!!!!!!!!!!!!!")
|
106 |
+
if self.config.causal is False and self.spk_emb_on_text is False:
|
107 |
+
raise Exception(
|
108 |
+
"Cannot use speaker embedding masking with non-causal model. This is unexpected. Check for relevant changes required in code before proceeding."
|
109 |
+
)
|
110 |
+
|
111 |
+
if config.norm_type == "rmsnorm":
|
112 |
+
if config.rmsnorm_eps is None:
|
113 |
+
raise Exception("RMSNorm requires rmsnorm_eps to be set")
|
114 |
+
ln_f = RMSNorm(config.n_embd, eps=config.rmsnorm_eps)
|
115 |
+
elif config.norm_type == "layernorm":
|
116 |
+
ln_f = LayerNorm(config.n_embd, bias=config.bias)
|
117 |
+
else:
|
118 |
+
raise Exception(f"Unknown norm type: {config.norm_type}")
|
119 |
+
|
120 |
+
self.transformer = nn.ModuleDict(
|
121 |
+
dict(
|
122 |
+
wtes=nn.ModuleList([nn.Embedding(vsize, config.n_embd,) for vsize in config.vocab_sizes]),
|
123 |
+
wpe=nn.Embedding(config.block_size, config.n_embd),
|
124 |
+
drop=nn.Dropout(config.dropout),
|
125 |
+
h=nn.ModuleList([Block(config) for _ in range(config.n_layer)]),
|
126 |
+
ln_f=ln_f,
|
127 |
+
)
|
128 |
+
)
|
129 |
+
if speaker_emb_dim is not None:
|
130 |
+
self.speaker_cond_pos = nn.Linear(speaker_emb_dim, config.n_embd, bias=False) # ここで256->2048
|
131 |
+
|
132 |
+
self.lm_heads = nn.ModuleList()
|
133 |
+
if config.target_vocab_sizes is not None:
|
134 |
+
assert config.causal is False
|
135 |
+
else:
|
136 |
+
assert config.causal is True
|
137 |
+
|
138 |
+
for vsize in config.vocab_sizes if config.target_vocab_sizes is None else config.target_vocab_sizes:
|
139 |
+
self.lm_heads.append(nn.Linear(config.n_embd, vsize, bias=False))
|
140 |
+
|
141 |
+
if config.target_vocab_sizes is None:
|
142 |
+
for i in range(len(config.vocab_sizes)):
|
143 |
+
# TODO: do we not need to take the transpose here?
|
144 |
+
# https://paperswithcode.com/method/weight-tying
|
145 |
+
self.lm_heads[i].weight = self.transformer.wtes[i].weight # type: ignore
|
146 |
+
assert len(self.lm_heads) == len(
|
147 |
+
self.transformer.wtes # type: ignore
|
148 |
+
), f"Number of heads ({len(self.lm_heads)}) must match number of one-hot embedding matrics ({len(self.transformer.wtes)})." # type: ignore
|
149 |
+
# - causal
|
150 |
+
# GPT(
|
151 |
+
# (transformer): ModuleDict(
|
152 |
+
# (wtes): ModuleList(
|
153 |
+
# (0): Embedding(2562, 2048)
|
154 |
+
# )
|
155 |
+
# (wpe): Embedding(2048, 2048)
|
156 |
+
# (drop): Dropout(p=0.0, inplace=False)
|
157 |
+
# (h): ModuleList(
|
158 |
+
# (0-23): 24 x Block(
|
159 |
+
# (ln_1): RMSNorm()
|
160 |
+
# (ln_2): RMSNorm()
|
161 |
+
# (attn): SelfAttention(
|
162 |
+
# (c_attn): Linear(in_features=2048, out_features=6144, bias=False)
|
163 |
+
# (c_proj): Linear(in_features=2048, out_features=2048, bias=False)
|
164 |
+
# (resid_dropout): Dropout(p=0.0, inplace=False)
|
165 |
+
# )
|
166 |
+
# (mlp): MLP(
|
167 |
+
# (swiglu): SwiGLU(
|
168 |
+
# (w1): Linear(in_features=2048, out_features=5632, bias=False)
|
169 |
+
# (w3): Linear(in_features=2048, out_features=5632, bias=False)
|
170 |
+
# )
|
171 |
+
# (c_proj): Linear(in_features=5632, out_features=2048, bias=False)
|
172 |
+
# (dropout): Dropout(p=0.0, inplace=False)
|
173 |
+
# )
|
174 |
+
# )
|
175 |
+
# )
|
176 |
+
# (ln_f): RMSNorm()
|
177 |
+
# )
|
178 |
+
# (speaker_cond_pos): Linear(in_features=256, out_features=2048, bias=False)
|
179 |
+
# (lm_heads): ModuleList(
|
180 |
+
# (0): Linear(in_features=2048, out_features=2562, bias=False)
|
181 |
+
# )
|
182 |
+
# )
|
183 |
+
# GPTConfig(block_size=2048, vocab_sizes=[2562], target_vocab_sizes=None, n_layer=24, n_head=16, n_embd=2048, dropout=0.0, spkemb_dropout=0.1, bias=False, causal=True, spk_emb_on_text=True, norm_type='rmsnorm', rmsnorm_eps=1e-05, nonlinearity_type='swiglu', swiglu_multiple_of=256, attn_kernel_type='torch_attn', kv_cache_enabled=False)
|
184 |
+
#
|
185 |
+
# - non causal
|
186 |
+
# GPT(
|
187 |
+
# (transformer): ModuleDict(
|
188 |
+
# (wtes): ModuleList(
|
189 |
+
# (0): Embedding(1538, 384)
|
190 |
+
# (1): Embedding(1025, 384)
|
191 |
+
# )
|
192 |
+
# (wpe): Embedding(1024, 384)
|
193 |
+
# (drop): Dropout(p=0.0, inplace=False)
|
194 |
+
# (h): ModuleList(
|
195 |
+
# (0-5): 6 x Block(
|
196 |
+
# (ln_1): LayerNorm()
|
197 |
+
# (ln_2): LayerNorm()
|
198 |
+
# (attn): SelfAttention(
|
199 |
+
# (c_attn): Linear(in_features=384, out_features=1152, bias=False)
|
200 |
+
# (c_proj): Linear(in_features=384, out_features=384, bias=False)
|
201 |
+
# (resid_dropout): Dropout(p=0.0, inplace=False)
|
202 |
+
# )
|
203 |
+
# (mlp): MLP(
|
204 |
+
# (c_fc): Linear(in_features=384, out_features=1536, bias=False)
|
205 |
+
# (gelu): GELU(approximate='none')
|
206 |
+
# (c_proj): Linear(in_features=1536, out_features=384, bias=False)
|
207 |
+
# (dropout): Dropout(p=0.0, inplace=False)
|
208 |
+
# )
|
209 |
+
# )
|
210 |
+
# )
|
211 |
+
# (ln_f): LayerNorm()
|
212 |
+
# )
|
213 |
+
# (speaker_cond_pos): Linear(in_features=256, out_features=384, bias=False)
|
214 |
+
# (lm_heads): ModuleList(
|
215 |
+
# (0-5): 6 x Linear(in_features=384, out_features=1025, bias=False)
|
216 |
+
# )
|
217 |
+
# )
|
218 |
+
# GPTConfig(block_size=1024, vocab_sizes=[1538, 1025], target_vocab_sizes=[1025, 1025, 1025, 1025, 1025, 1025], n_layer=6, n_head=6, n_embd=384, dropout=0.0, spkemb_dropout=0.0, bias=False, causal=False, spk_emb_on_text=True, norm_type='layernorm', rmsnorm_eps=None, nonlinearity_type='gelu', swiglu_multiple_of=None, attn_kernel_type='fa2', kv_cache_enabled=False)
|
219 |
+
# if config.causal is False:
|
220 |
+
# embed()
|
221 |
+
# init all weights
|
222 |
+
self.apply(self._init_weights)
|
223 |
+
# apply special scaled init to the residual projections, per GPT-2 paper
|
224 |
+
for pn, p in self.named_parameters():
|
225 |
+
if pn.endswith("c_proj.weight"):
|
226 |
+
torch.nn.init.normal_(p, mean=0.0, std=0.02 / math.sqrt(2 * config.n_layer))
|
227 |
+
|
228 |
+
# report number of parameters
|
229 |
+
print("number of parameters: %.2fM" % (self.get_num_params() / 1e6,))
|
230 |
+
|
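    # Editor's note on the weight-tying TODO above (a minimal sketch, not part of the
    # original file): no transpose is needed because nn.Linear stores its weight as
    # (out_features, in_features), which already matches nn.Embedding's
    # (num_embeddings, embedding_dim).
    #
    #   >>> import torch.nn as nn
    #   >>> emb = nn.Embedding(10, 4)
    #   >>> head = nn.Linear(4, 10, bias=False)
    #   >>> head.weight = emb.weight                # both now share one (10, 4) parameter
    #   >>> head.weight.data_ptr() == emb.weight.data_ptr()
    #   True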
    def get_num_params(self, non_embedding=True):
        """
        Return the number of parameters in the model.
        For non-embedding count (default), the position embeddings get subtracted.
        The token embeddings would too, except due to the parameter sharing these
        params are actually used as weights in the final layer, so we include them.
        """
        n_params = sum(p.numel() for p in self.parameters())
        if non_embedding:
            n_params -= self.transformer.wpe.weight.numel()
        return n_params

    def _init_weights(self, module):
        if isinstance(module, nn.Linear):
            torch.nn.init.normal_(module.weight, mean=0.0, std=0.02)
            if module.bias is not None:
                torch.nn.init.zeros_(module.bias)
        elif isinstance(module, nn.Embedding):
            torch.nn.init.normal_(module.weight, mean=0.0, std=0.02)

    def _mask_spk_emb_on_text(self, idx: torch.Tensor, spk_emb: torch.Tensor) -> torch.Tensor:
        """
        This is in a separate function so we can test it easily.
        """
        # find index of end of text token in each sequence, then generate a binary mask
        # of shape (b, 1, t) to mask out the speaker embedding for all tokens before the end of text token.
        # Note: this does NOT mask the <end_of_text_token> token. This is important so that the first audio token predicted
        # has speaker information to use.

        # Check in channel dimension 0 as this is usually the first hierarchy where we put the text tokens.
        is_end_of_text = idx[:, 0, :] == END_OF_TEXT_TOKEN
        # use > 0, in case end_of_text_token is repeated for any reason.
        mask = (torch.cumsum(is_end_of_text, dim=-1) > 0).float()
        spk_emb = spk_emb * mask[:, :, None]

        return spk_emb
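    # Editor's illustration (a minimal sketch, not part of the original file) of the
    # cumsum trick in _mask_spk_emb_on_text: the mask switches on AT the end-of-text
    # token and stays on, so text positions lose speaker conditioning.
    #
    #   >>> import torch
    #   >>> is_end_of_text = torch.tensor([[0, 0, 1, 0, 0]])
    #   >>> (torch.cumsum(is_end_of_text, dim=-1) > 0).float()
    #   tensor([[0., 0., 1., 1., 1.]])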
    def forward(
        self,
        idx,
        targets=None,
        speaker_embs=None,
        embedding=None,
        speaker_emb_mask=None,
        loss_reduce: Literal["mean", "none"] = "mean",
    ):
        device = idx.device
        b, num_hierarchies, t = idx.size()

        if speaker_embs is not None:
            speaker_embs = _check_speaker_emb_dims(
                speaker_embs=speaker_embs, expected_speaker_emb_dim=self.speaker_emb_dim, expected_batch_size=b
            )

        assert (
            t <= self.config.block_size
        ), f"Cannot forward sequence of length {t}, block size is only {self.config.block_size}"

        if self.kv_cache_enabled:
            if self.kv_pos == 0:
                pos = torch.arange(0, t, dtype=torch.long, device=device)
                self.kv_pos += t
            else:
                assert t == 1, "KV cache is only supported for single token inputs"
                pos = torch.tensor([self.kv_pos], dtype=torch.long, device=device)  # shape (1)
                self.kv_pos += 1
        else:
            pos = torch.arange(0, t, dtype=torch.long, device=device)  # shape (t)

        # forward the GPT model itself
        # assert num_hierarchies == len(
        #     self.transformer.wtes
        # ), f"Input tensor has {num_hierarchies} hierarchies, but model has {len(self.transformer.wtes)} set of input embeddings."

        # embed the tokens, positional encoding, and speaker embedding
        tok_emb = torch.zeros((b, t, self.config.n_embd), device=device)
        # ends up swapping (B, num_hierarchies, t) tokens -> (B, t, c) embeddings.
        for i, wte in enumerate(self.transformer.wtes):
            mask_pad = idx[:, i, :] == -1  # indices of padding (-1) elements
            masked_idx = idx[:, i, :].clone()
            masked_idx[mask_pad] = 0
            embedded_idx = wte(masked_idx)
            embedded_idx[mask_pad] = 0  # zero out the embeddings at padded positions
            tok_emb += embedded_idx
        pos_emb = self.transformer.wpe(pos)  # position embeddings of shape (t, n_embd)

        spk_emb = 0.0
        if speaker_embs is not None:
            if type(speaker_embs) == list:
                assert speaker_emb_mask is None
                assert self.training is False
                assert self.spk_emb_on_text is True
                spk_emb = []
                for speaker_emb_row in speaker_embs:
                    if speaker_emb_row is not None:
                        spk_emb.append(self.speaker_cond_pos(speaker_emb_row.unsqueeze(0)))
                        assert spk_emb[-1].shape == (1, 1, self.config.n_embd), f"spk_emb[-1].shape={spk_emb[-1].shape}"
                    else:
                        spk_emb.append(torch.zeros((1, 1, self.config.n_embd), device=device, dtype=pos_emb.dtype))
                spk_emb = torch.cat(spk_emb, dim=0)

                assert (
                    spk_emb.ndim == 3 and spk_emb.shape[1] == 1 and spk_emb.shape[0] == b
                ), f"spk_emb.ndim={spk_emb.ndim}, spk_emb.shape={spk_emb.shape}, len(speaker_embs)={len(speaker_embs)}"
            else:
                speakers_embedded = self.speaker_cond_pos(speaker_embs)  # shape (b, num_examples, c)

                if speaker_emb_mask is not None:
                    spk_emb = _select_spkemb(speakers_embedded, speaker_emb_mask)
                    assert spk_emb.shape == (b, t, self.config.n_embd)
                else:
                    spk_emb = speakers_embedded
                    # if we don't have a mask, we assume that the speaker embedding is the same for all tokens
                    # then num_examples dimension just becomes the time dimension
                    assert spk_emb.ndim == 3 and spk_emb.shape[1] == 1

                if self.training and self.config.spkemb_dropout > 0.0:
                    # Remove speaker conditioning at random.
                    dropout = torch.ones_like(speakers_embedded) * (
                        torch.rand(speakers_embedded.shape[0], 1, 1, device=device) >= self.config.spkemb_dropout
                    )
                    spk_emb = torch.where(dropout == 0, torch.zeros_like(speakers_embedded), speakers_embedded)

            if self.spk_emb_on_text is False:
                assert speaker_emb_mask is None, "Not implemented for spk_emb_on_text=False"
                spk_emb = self._mask_spk_emb_on_text(idx, spk_emb)
        elif embedding is not None:
            spk_emb = self.speaker_cond_pos(embedding)

        # TODO: implement causal attention mask here
        # memo (shapes, for b, t, d=2048): tok_emb=(b,t,d), pos_emb=(t,d), spk_emb=(b,1,d)
        # train:  tok_emb.shape=torch.Size([128, 187, 2048]), pos_emb.shape=torch.Size([187, 2048]), spk_emb.shape=torch.Size([128, 1, 1, 187]) <- spk_emb should presumably be (b,1,2048)?
        # sample: tok_emb.shape=torch.Size([2, 369, 2048]), pos_emb.shape=torch.Size([369, 2048]), spk_emb.shape=torch.Size([2, 1, 2048])
        x = self.transformer.drop(tok_emb + pos_emb + spk_emb)
        for block in self.transformer.h:
            x = block(x)
        x = self.transformer.ln_f(x)

        if targets is not None:
            # if we are given some desired targets also calculate the loss
            list_logits = [lm_head(x) for lm_head in self.lm_heads]

            losses = [
                F.cross_entropy(
                    logits.view(-1, logits.size(-1)),
                    targets[:, i, :].contiguous().view(-1),
                    ignore_index=-1,
                    reduction=loss_reduce,
                )
                for i, logits in enumerate(list_logits)
            ]
            # TODO: should we do this better without stack somehow?
            losses = torch.stack(losses)
            if loss_reduce == "mean":
                losses = losses.mean()
            else:
                losses = rearrange(losses, "h (b t) -> b h t", h=len(self.lm_heads), b=b, t=t)
        else:
            # inference-time mini-optimization: only forward the lm_head on the very last position
            if self.config.causal:
                list_logits = [
                    lm_head(x[:, [-1], :]) for lm_head in self.lm_heads
                ]  # note: using list [-1] to preserve the time dim
            else:
                list_logits = [lm_head(x) for lm_head in self.lm_heads]
            losses = None

        return list_logits, losses
    def configure_optimizers(self, weight_decay, learning_rate, betas, device_type):
        # start with all of the candidate parameters
        param_dict = {pn: p for pn, p in self.named_parameters()}
        # filter out those that do not require grad
        param_dict = {pn: p for pn, p in param_dict.items() if p.requires_grad}
        # create optim groups. Any parameter that is 2D will be weight decayed, otherwise no.
        # i.e. all weight tensors in matmuls + embeddings decay, all biases and layernorms don't.
        decay_params = [p for n, p in param_dict.items() if p.dim() >= 2]
        nodecay_params = [p for n, p in param_dict.items() if p.dim() < 2]
        optim_groups = [
            {"params": decay_params, "weight_decay": weight_decay},
            {"params": nodecay_params, "weight_decay": 0.0},
        ]
        num_decay_params = sum(p.numel() for p in decay_params)
        num_nodecay_params = sum(p.numel() for p in nodecay_params)
        print(f"num decayed parameter tensors: {len(decay_params)}, with {num_decay_params:,} parameters")
        print(f"num non-decayed parameter tensors: {len(nodecay_params)}, with {num_nodecay_params:,} parameters")
        # Create AdamW optimizer and use the fused version if it is available
        fused_available = "fused" in inspect.signature(torch.optim.AdamW).parameters
        use_fused = fused_available and device_type == "cuda"
        extra_args = dict(fused=True) if use_fused else dict()
        optimizer = torch.optim.AdamW(optim_groups, lr=learning_rate, betas=betas, **extra_args)
        print(f"using fused AdamW: {use_fused}")

        return optimizer
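    # Editor's illustration (a minimal sketch, not part of the original file) of the
    # dim-based weight-decay split above: matmul/embedding weights decay, while biases
    # and norm gains do not.
    #
    #   >>> import torch.nn as nn
    #   >>> m = nn.Sequential(nn.Linear(4, 4), nn.LayerNorm(4))
    #   >>> [n for n, p in m.named_parameters() if p.dim() >= 2]
    #   ['0.weight']
    #   >>> [n for n, p in m.named_parameters() if p.dim() < 2]
    #   ['0.bias', '1.weight', '1.bias']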
    @torch.no_grad()
    def generate(
        self,
        idx: torch.Tensor,
        max_new_tokens: int,
        seq_lens: Optional[list] = None,
        temperature: float = 1.0,
        top_k: Optional[int] = None,
        top_p: Optional[float] = None,
        speaker_embs: Optional[torch.Tensor] = None,
        batch_size: Optional[int] = None,
        guidance_scale: Optional[float] = None,
    ):
        """
        Take a conditioning sequence of indices idx (LongTensor of shape (b,num_hierarchies,t)) and complete
        the sequence max_new_tokens times, feeding the predictions back into the model each time.
        Most likely you'll want to make sure to be in model.eval() mode of operation for this.
        """
        assert idx.dim() == 3, "idx must be a batch of sequences of hierarchical tokens"

        if self.config.causal:
            if seq_lens is None or batch_size is None:
                raise Exception("seq_lens and batch_size must be provided for causal sampling")

            return self._causal_sample(
                idx=idx,
                max_new_tokens=max_new_tokens,
                seq_lens=seq_lens,
                temperature=temperature,
                top_k=top_k,
                top_p=top_p,
                speaker_embs=speaker_embs,
                batch_size=batch_size,
                guidance_scale=guidance_scale,
            )

        else:
            if seq_lens is not None:
                raise Exception("seq_lens is not supported yet for non-causal sampling")

            if batch_size is None:
                raise Exception("batch_size must be provided for non-causal sampling")

            if guidance_scale is not None:
                raise Exception("guidance_scale is not supported for non-causal sampling")

            if top_p is not None:
                raise Exception("top_p is not supported for non-causal sampling")

            out = []
            for start_index in tqdm.tqdm(range(0, idx.shape[0], batch_size), desc="non-causal batching"):
                end_index = min(start_index + batch_size, idx.shape[0])
                out.append(
                    self._non_causal_sample(
                        idx=idx[start_index:end_index],
                        speaker_embs=speaker_embs[start_index:end_index] if speaker_embs is not None else None,
                        temperature=temperature,
                        top_k=top_k,
                    )
                )
            return torch.cat(out, dim=0)
fam/llm/sample.py
ADDED
@@ -0,0 +1,731 @@
import dataclasses
import hashlib
import json
import os
import pathlib
import shutil
import subprocess
import tempfile
from contextlib import nullcontext
from dataclasses import dataclass
from typing import List, Literal, Optional, Type, Union

import librosa
import torch
import tqdm
import tqdm.contrib.concurrent
import tyro
from huggingface_hub import snapshot_download

from fam.llm.adapters import FlattenedInterleavedEncodec2Codebook, TiltedEncodec
from fam.llm.decoders import Decoder, EncodecDecoder
from fam.llm.enhancers import BaseEnhancer, get_enhancer
from fam.llm.model import GPT, GPTConfig
from fam.llm.utils import normalize_text
from fam.quantiser.audio.speaker_encoder.model import SpeakerEncoder
from fam.quantiser.text.tokenise import TrainedBPETokeniser


@dataclass
class InferenceConfig:
    ckpt_path: str  # path to checkpoint
    output_dir: str
    num_samples: int = 10  # number of samples to draw
    seed: int = 1337  # random seed
    device: str = "cuda"
    dtype: str = "bfloat16"
    compile: bool = False
    init_from: str = "resume"  # either 'resume' (from an out_dir) or a gpt2 variant (e.g. 'gpt2-xl')

    def __str__(self):
        field_strs = []
        for field in dataclasses.fields(self):
            value = getattr(self, field.name)
            field_strs.append(f"  {field.name}: {value}")

        return "InferenceConfig:\n" + "\n".join(field_strs)
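
# Editor's illustration (a hedged sketch, not part of the original file; the paths
# are hypothetical) of constructing the config consumed by `Model` below:
#
#   config = InferenceConfig(
#       ckpt_path="./ckpts/first_stage.pt",  # hypothetical checkpoint location
#       output_dir="./samples",
#       num_samples=2,
#       device="cuda",
#       dtype="bfloat16",
#   )
#   print(config)  # uses the __str__ above to list all fields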
class Model:
    """
    Class to sample from a trained model.
    """

    def __init__(
        self,
        config: InferenceConfig,
        tokenizer_cls: Type[TrainedBPETokeniser],
        decoder_cls: Type[Decoder],
        data_adapter_fn,
        use_kv_cache: Optional[Literal["none", "flash_decoding", "vanilla"]] = None,
        first_model_path=None,
    ):
        # TODO: disentangle the encodec stuff and numbers etc with rest of this code (esp at encoder-only / second stage model inference)
        # TODO: remove magic number
        self._encodec_codes_pad_token = 1024
        self._num_encodec_codebooks = 8
        self.config = config
        self.use_kv_cache = use_kv_cache

        torch.manual_seed(config.seed)
        torch.cuda.manual_seed(config.seed)
        torch.backends.cuda.matmul.allow_tf32 = True if config.dtype != "float32" else False  # allow tf32 on matmul
        torch.backends.cudnn.allow_tf32 = True if config.dtype != "float32" else False  # allow tf32 on cudnn
        device_type = "cuda" if "cuda" in config.device else "cpu"  # for later use in torch.autocast
        ptdtype = {
            "float32": torch.float32,
            "tfloat32": torch.float32,
            "bfloat16": torch.bfloat16,
            "float16": torch.float16,
        }[config.dtype]
        self._ctx = (
            nullcontext() if device_type == "cpu" else torch.amp.autocast(device_type=device_type, dtype=ptdtype)
        )

        self.use_bpe_tokenizer = False
        self.load_meta = None
        self.speaker_cond = None
        self.meta = None
        self.model = None
        self.checkpoint_config = None
        self.vocab_sizes = None
        self.smodel = None
        self.first_model_path = first_model_path

        self._init_model()

        self.tokenizer = tokenizer_cls(**self.meta["tokenizer"])
        self.decoder = decoder_cls(
            tokeniser_decode_fn=self.tokenizer.decode,
            output_dir=self.config.output_dir,
            data_adapter_fn=data_adapter_fn,
        )

    def _init_model(self):
        if self.config.init_from == "resume":
            # init from a model saved in a specific directory
            checkpoint = torch.load(self.config.ckpt_path, map_location=self.config.device)
            self.vocab_sizes = checkpoint["model_args"]["vocab_sizes"]

            self.load_meta = False
            self.speaker_cond = False

            if "config" in checkpoint:
                self.checkpoint_config = checkpoint["config"]

                self.meta = checkpoint["meta"]
                self.load_meta = True

            if self.load_meta:
                self.use_bpe_tokenizer = "stoi" not in self.meta or "itos" not in self.meta
                self.speaker_cond = self.meta.get("speaker_cond")

            if self.speaker_cond:
                speaker_emb_size = self.meta["speaker_emb_size"]

            model_args = checkpoint["model_args"]
            if "causal" in self.checkpoint_config and self.checkpoint_config["causal"] is False:
                self._encodec_ctx_window = model_args["block_size"]

            gptconf = GPTConfig(**model_args)

            # TODO: rename `speaker_emb_dim` to `speaker_emb_size`.
            self.model = GPT(gptconf, speaker_emb_dim=speaker_emb_size if self.speaker_cond else None)
            if not getattr(self.config, "train_from_scratch", False):
                state_dict = checkpoint["model"]
                unwanted_prefix = "_orig_mod."
                for k, v in list(state_dict.items()):
                    if k.startswith(unwanted_prefix):
                        state_dict[k[len(unwanted_prefix) :]] = state_dict.pop(k)
                self.model.load_state_dict(state_dict)
            else:
                print("training a model from scratch!!")

            if self.first_model_path is not None:
                if self.checkpoint_config.get("causal", True):
                    new_dict = {}
                    state_dict = torch.load(self.first_model_path)["state_dict"]
                    for key, val in state_dict.items():
                        if "first_stage_model_transformer" in key:
                            new_dict[key.replace("first_stage_model_transformer.", "")] = val
                    self.model.load_state_dict(new_dict)

        # model
        self.model.eval()
        self.model.to(self.config.device)

        if self.config.compile:
            from einops._torch_specific import allow_ops_in_compiled_graph

            allow_ops_in_compiled_graph()
            self.model = torch.compile(self.model)  # type: ignore

        if self.use_kv_cache is not None:
            if "causal" in self.checkpoint_config and self.checkpoint_config["causal"] is False:
                raise Exception("kv_cache not supported for non-causal models!")

            if self.use_kv_cache == "flash_decoding":
                self.model.enable_kv_cache()
                for block in self.model.transformer.h:
                    block.attn.attn_kernel_type = "fd"
            elif self.use_kv_cache == "vanilla":
                for block in self.model.transformer.h:
                    if block.attn.attn_kernel_type != "fa2":
                        raise Exception(
                            f"kv_cache only supported for flash attention 2 but found {block.attn.attn_kernel_type} inside model!"
                        )
                self.model.enable_kv_cache()
            else:
                raise NotImplementedError(f"kv_cache type {self.use_kv_cache} not implemented!")

    def causal_sample(
        self,
        *,
        texts: list[str],
        batch_size: int,
        max_new_tokens: int,
        temperature: Optional[float],
        top_k: Optional[int],
        top_p: Optional[float],
        speaker_embs: Optional[torch.Tensor] = None,
        guidance_scale: Optional[float] = None,
    ) -> list[torch.Tensor]:
        """
        Returns a list of torch.Tensors of tokens. Each tensor is of shape (1, c, t) where c is the number of codebooks.
        Any flattening / interleaving / tilting gets reversed before the output is returned.
        """
        if speaker_embs is not None:
            assert len(texts) == len(speaker_embs)

        encoded_texts = [self.tokenizer.encode(text) for text in texts]

        ## create multiple hierarchies and get seq_lens
        seq_lens = []
        xs = []
        for i, encoded_text in enumerate(encoded_texts):
            encoded_text = torch.tensor([encoded_text], dtype=torch.long, device=self.config.device)
            # TODO: remove magic number
            xs.append(
                torch.cat(
                    # [1st hierarchy of text, *remaining hierarchies of padded tokens]
                    # TODO: self.vocab_sizes should be from the model config?
                    [encoded_text, *[torch.ones_like(encoded_text) * 1024] * (len(self.vocab_sizes) - 1)],
                    dim=0,
                ).unsqueeze(0)
            )  # b x [(b=1, c, t)]
            seq_lens.append(xs[-1].shape[-1])
        max_len = max(seq_lens)
        assert len(xs) == len(seq_lens)

        ## equalise the shapes in the batch. we can use torch.zeros as tokens > seq_lens will be masked out.
        x = torch.zeros((len(encoded_texts), xs[0].shape[1], max_len), dtype=torch.long, device=self.config.device)
        for i, _xs in enumerate(xs):
            assert _xs.shape[-1] == seq_lens[i]
            x[i, :, : seq_lens[i]] = _xs

        ## check that the input is correct
        for i in range(x.shape[0]):
            assert x[i, 0, : seq_lens[i]].tolist() == encoded_texts[i]

            # TODO: remove magic number
            if x.shape[1] > 1:
                assert set(x[i, 1, : seq_lens[i]].tolist()) == set([1024])

        assert x.shape[0] == speaker_embs.shape[0] if speaker_embs is not None else True

        if self.speaker_cond is False:
            speaker_embs = None

        # run sampling loop
        with torch.no_grad():
            with self._ctx:  # type: ignore
                to_return = []
                for k in range(self.config.num_samples):
                    assert seq_lens is not None
                    assert batch_size is not None

                    if max(seq_lens) + max_new_tokens >= self.model.config.block_size:
                        raise Exception(
                            f"max_new_tokens {max_new_tokens} too large! Choose {self.model.config.block_size - max(seq_lens) - 1} instead."
                        )

                    y = self.model.generate(
                        x,
                        max_new_tokens,
                        seq_lens=seq_lens,
                        temperature=temperature,
                        top_k=top_k,
                        top_p=top_p,
                        speaker_embs=speaker_embs,
                        batch_size=batch_size,
                        guidance_scale=guidance_scale,
                    )
                    for i in range(len(y)):
                        to_return.append(self.decoder.decode(tokens=y[i].tolist(), causal=True))

                return to_return
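
    # Example of the input layout built by causal_sample above: for a 2-hierarchy
    # model and encoded text [5, 9, 2] (illustrative ids), each batch element
    # becomes a (1, c=2, t=3) tensor:
    #   hierarchy 0 (text tokens): [   5,    9,    2]
    #   hierarchy 1 (padding):     [1024, 1024, 1024]
    # where 1024 is the padding / end-of-audio token id. Shorter elements in the
    # batch are right-padded with zeros up to max_len and masked out via seq_lens.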

    def non_causal_sample(
        self,
        *,
        texts: list[str],
        encodec_tokens: list[torch.Tensor],
        batch_size: int,
        top_k: Optional[int],
        temperature: Optional[float],
        speaker_embs: Optional[torch.Tensor] = None,
    ) -> list[str]:
        """
        Returns paths to saved audio files.
        """
        if speaker_embs is not None:
            assert len(texts) == len(speaker_embs)

        encoded_texts = [self.tokenizer.encode(text) for text in texts]

        # setup input
        # TODO: same code is used during data prep. refactor
        padded_hierarchies_inputs = []
        for encoded_text, encodec_token in zip(encoded_texts, encodec_tokens):
            x = torch.tensor(encoded_text, dtype=torch.long, device=self.config.device)[
                None, None, ...
            ]  # (b=1, c=1, t)

            # TODO: should this only happen if the decoder is an EncodecDecoder?
            assert encodec_token.shape[0] == 1
            encodec_token = encodec_token[0].tolist()  # (b=1, c, t) -> (c, t)
            assert len(encodec_token) >= 1 and len(encodec_token) <= self._num_encodec_codebooks

            ## setup hierarchies of tokens
            # TODO: refactor and merge with code in processing.py
            text_tokens = encoded_text  # (t,)

            # print(f'{encodec_tokens[0].shape=}, {len(encodec_tokens)}')
            hierarchies_in = []
            hierarchies_in.append(text_tokens + encodec_token[0] + [self._encodec_codes_pad_token])
            hierarchies_in.append(
                [self._encodec_codes_pad_token] * len(text_tokens) + encodec_token[1] + [self._encodec_codes_pad_token]
            )

            ## adding padding / cutting to the right size as needed
            # TODO: refactor and merge with code in processing.py
            padded_hierarchies_input = []
            for _, t_hierarchy in enumerate(hierarchies_in):
                assert len(t_hierarchy) == len(hierarchies_in[0])
                if len(t_hierarchy) < self._encodec_ctx_window:
                    padded_hierarchies_input.append(
                        t_hierarchy + [self._encodec_codes_pad_token] * (self._encodec_ctx_window - len(t_hierarchy))
                    )
                elif len(t_hierarchy) > self._encodec_ctx_window:
                    padded_hierarchies_input.append(t_hierarchy[: self._encodec_ctx_window])
                else:
                    padded_hierarchies_input.append(t_hierarchy)

            padded_hierarchies_inputs.append(padded_hierarchies_input)

        ## check that the input is correct
        in_x = torch.tensor(padded_hierarchies_inputs, dtype=torch.long, device=self.config.device)
        assert in_x.shape[0] == speaker_embs.shape[0] if speaker_embs is not None else True

        if self.speaker_cond is False:
            speaker_embs = None

        # run sampling loop
        with torch.no_grad():
            with self._ctx:  # type: ignore
                to_return = []
                for k in range(self.config.num_samples):
                    y = self.model.generate(
                        in_x,
                        None,
                        temperature=temperature,
                        top_k=top_k,
                        # TODO: handle separate top_p for this model explicitly
                        top_p=None,
                        speaker_embs=speaker_embs,
                        batch_size=batch_size,
                        guidance_scale=None,
                    )

                    b_tokens = torch.cat([in_x, y], dim=1)
                    for tokens in b_tokens:
                        try:
                            to_return.append(self.decoder.decode(tokens=tokens.tolist(), causal=False))
                        except Exception as e:
                            print("failed to run MBD.")
                            print(f"reason: {str(e)}")
                            to_return.append(None)

                return to_return

    def __call__(
        self,
        *,
        texts: list[str],
        batch_size: int,
        max_new_tokens: Optional[int],
        top_k: Optional[int],
        top_p: Optional[float],
        temperature: Optional[float],
        encodec_tokens: Optional[list[torch.Tensor]] = None,
        speaker_embs: Optional[torch.Tensor] = None,
        guidance_scale: Optional[float] = None,
    ):
        if self.checkpoint_config.get("causal", True):
            return self.causal_sample(
                texts=texts,
                batch_size=batch_size,
                speaker_embs=speaker_embs,
                guidance_scale=guidance_scale,
                max_new_tokens=max_new_tokens,
                top_k=top_k,
                top_p=top_p,
                temperature=temperature,
            )
        else:
            assert encodec_tokens is not None
            assert guidance_scale is None
            assert max_new_tokens is None
            assert top_p is None

            return self.non_causal_sample(
                texts=texts,
                encodec_tokens=encodec_tokens,
                batch_size=batch_size,
                speaker_embs=speaker_embs,
                top_k=top_k,
                temperature=temperature,
            )


def save_result_metadata(wav_path, ref_path, text, first_stage_ckpt_path, second_stage_ckpt_path):
    if first_stage_ckpt_path is None or second_stage_ckpt_path is None:
        return
    json.dump(
        {
            "speaker": ref_path,
            "text": text,
        },
        pathlib.Path(str(wav_path) + ".json").open("w"),
    )


def get_cached_file(file_or_uri: str):
    """
    If it's a remote (http) URI, download it to a local cache file and return that path.
    Otherwise return the path as is.
    """
    is_uri = file_or_uri.startswith("http")

    cache_path = None
    if is_uri:
        ext = pathlib.Path(file_or_uri).suffix
        # hash the file path to get the cache name
        _cache_name = "audio_" + hashlib.md5(file_or_uri.encode("utf-8")).hexdigest() + ext

        os.makedirs(os.path.expanduser("~/.cache/fam/"), exist_ok=True)
        cache_path = os.path.expanduser(f"~/.cache/fam/{_cache_name}")

        if not os.path.exists(cache_path):
            command = f"curl -o {cache_path} {file_or_uri}"
            subprocess.run(command, shell=True, check=True)
    else:
        if os.path.exists(file_or_uri):
            cache_path = file_or_uri
        else:
            raise FileNotFoundError(f"File {file_or_uri} not found!")

    # check that the audio file loads; the min. 30s duration check is currently disabled
    audio, sr = librosa.load(cache_path)
    # assert librosa.get_duration(y=audio, sr=sr) >= 30, "Speaker reference audio file needs to be >= 30s in duration."

    return cache_path
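
# Example of the caching scheme above: a remote reference like
#   get_cached_file("https://example.com/ref.wav")   # illustrative URL
# is downloaded once to ~/.cache/fam/audio_<md5(url)>.wav, and the same local path
# is returned on every subsequent call with the same URL.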


def get_cached_embedding(local_file_path: str, spkemb_model):
    if not os.path.exists(local_file_path):
        raise FileNotFoundError(f"File {local_file_path} not found!")

    # hash the file path to get the cache name
    _cache_name = "embedding_" + hashlib.md5(local_file_path.encode("utf-8")).hexdigest() + ".pt"

    os.makedirs(os.path.expanduser("~/.cache/fam/"), exist_ok=True)
    cache_path = os.path.expanduser(f"~/.cache/fam/{_cache_name}")

    if not os.path.exists(cache_path):
        spk_emb = spkemb_model.embed_utterance_from_file(local_file_path, numpy=False).unsqueeze(0)  # (b=1, c)
        torch.save(spk_emb, cache_path)
    else:
        spk_emb = torch.load(cache_path)

    return spk_emb


def _sample_utterance_batch(
    texts: list[str],
    spk_cond_paths: list[Optional[str]],
    spkemb_model,
    first_stage_model,
    second_stage_model,
    enhancer: Optional[Union[Literal["df"], BaseEnhancer]],
    first_stage_ckpt_path: str,
    second_stage_ckpt_path: str,
    guidance_scale: Optional[float],
    max_new_tokens: int,
    top_k: Optional[int],
    top_p: Optional[float],
    temperature: Optional[float],
    batch_size: int = 128,
) -> List[str]:
    speaker_embs = []
    refs = spk_cond_paths.copy()

    # multithreaded loop to cache all the files
    spk_cond_paths = tqdm.contrib.concurrent.thread_map(
        get_cached_file, spk_cond_paths, desc="getting cached speaker ref files"
    )

    for i, (text, spk_cond_path) in tqdm.tqdm(
        enumerate(zip(texts, spk_cond_paths)), total=len(texts), desc="calculating speaker embeddings"
    ):
        texts[i] = normalize_text(text)
        speaker_embs.append(get_cached_embedding(spk_cond_path, spkemb_model) if spk_cond_path else None)

    b_speaker_embs = torch.cat(speaker_embs, dim=0)
    b_tokens = first_stage_model(
        texts=texts,
        speaker_embs=b_speaker_embs,
        batch_size=batch_size,
        guidance_scale=guidance_scale,
        top_p=top_p,
        top_k=top_k,
        temperature=temperature,
        max_new_tokens=max_new_tokens,
    )

    # TODO: set batch size for second stage model!
    wav_files = second_stage_model(
        texts=texts,
        encodec_tokens=b_tokens,
        speaker_embs=b_speaker_embs,
        batch_size=batch_size,
        guidance_scale=None,
        top_p=None,
        top_k=top_k,
        temperature=temperature,
        max_new_tokens=None,
    )

    for text, tokens, speaker_embs, ref_name, wav_file in zip(texts, b_tokens, b_speaker_embs, refs, wav_files):
        if wav_file is None:
            continue

        with tempfile.NamedTemporaryFile(suffix=".wav") as enhanced_tmp:
            if enhancer is not None:
                enhancer = get_enhancer(enhancer) if isinstance(enhancer, str) else enhancer
                enhancer(str(wav_file) + ".wav", enhanced_tmp.name)
                # copy enhanced_tmp.name back to wav_file
                print(f"copying enhanced file from {enhanced_tmp.name} to {str(wav_file) + '.wav'}.")
                shutil.copy2(enhanced_tmp.name, str(wav_file) + ".wav")

            save_result_metadata(
                wav_file,
                ref_name,
                text,
                first_stage_ckpt_path,
                second_stage_ckpt_path,
            )
    return [str(w) + ".wav" if not str(w).endswith(".wav") else str(w) for w in wav_files]


def sample_utterance(
    text: str,
    spk_cond_path: Optional[str],
    spkemb_model,
    first_stage_model,
    second_stage_model,
    enhancer: Optional[Union[Literal["df"], BaseEnhancer]],
    first_stage_ckpt_path: str,
    second_stage_ckpt_path: str,
    guidance_scale: Optional[float],
    max_new_tokens: int,
    top_k: Optional[int],
    top_p: Optional[float],
    temperature: Optional[float],
) -> str:
    # NOTE: supports max. 220 characters atm.
    # Long-form synthesis coming soon...
    MAX_CHARS = 220
    if len(text) > MAX_CHARS:
        print(
            f"\n***WARNING: Max {MAX_CHARS} characters supported. Provided: {len(text)}. Truncating and generating speech... This can lead to unpredictable speech at the end.***"
        )

    return _sample_utterance_batch(
        texts=[text],
        spk_cond_paths=[spk_cond_path],
        spkemb_model=spkemb_model,
        first_stage_model=first_stage_model,
        second_stage_model=second_stage_model,
        enhancer=enhancer,
        first_stage_ckpt_path=first_stage_ckpt_path,
        second_stage_ckpt_path=second_stage_ckpt_path,
        batch_size=1,
        guidance_scale=guidance_scale,
        max_new_tokens=max_new_tokens,
        top_k=top_k,
        top_p=top_p,
        temperature=temperature,
    )[0]


def build_models(config_first_stage, config_second_stage, device, use_kv_cache, first_model_path=None):
    smodel = SpeakerEncoder(device=device, eval=True, verbose=False)
    data_adapter = FlattenedInterleavedEncodec2Codebook(end_of_audio_token=1024)
    llm_first_stage = Model(
        config_first_stage,
        TrainedBPETokeniser,
        EncodecDecoder,
        data_adapter_fn=data_adapter.decode,
        use_kv_cache=use_kv_cache,
        first_model_path=first_model_path,
    )
    data_adapter_second_stage = TiltedEncodec(end_of_audio_token=1024)
    llm_second_stage = Model(
        config_second_stage, TrainedBPETokeniser, EncodecDecoder, data_adapter_fn=data_adapter_second_stage.decode
    )
    return smodel, llm_first_stage, llm_second_stage


def get_first_stage_path(model_dir: str):
    """Absolute path to checkpoint for the first stage model."""
    return os.path.join(os.path.expanduser(model_dir), "first_stage.pt")


def get_second_stage_path(model_dir: str):
    """Absolute path to checkpoint for the second stage model."""
    return os.path.join(os.path.expanduser(model_dir), "second_stage.pt")


@dataclass
class SamplingControllerConfig:
    """
    Sample from a trained model.
    """

    huggingface_repo_id: str
    """Hugging Face repo id to download the model checkpoints from."""

    spk_cond_path: str
    """Path to speaker reference file. Min. 30s of audio required. Supports both local paths & public URIs. Audio formats: wav, flac & mp3"""

    text: str = "コトバテクノロジーズのミッションは、音声基盤モデルを作ることです。"
    """Text to synthesise."""

    num_samples: int = 1
    """Number of samples to generate from each model."""

    max_new_tokens: int = 864
    """Maximum number of new tokens to generate from the first stage model."""

    temperature: float = 1.0
    """Temperature for sampling applied to both models."""

    top_k: Optional[int] = None
    """Top k for sampling applied to both models."""

    top_p: Optional[float] = 0.95
    """Top p for sampling applied to the first-stage model."""

    seed: int = 1337
    """Random seed for sampling."""

    device: Literal["cuda", "cpu"] = "cuda"
    """Device to use for sampling."""

    dtype: Literal["bfloat16", "float16", "float32", "tfloat32"] = "bfloat16"
    """Data type to use for sampling."""

    compile: bool = False
    """Whether to compile the model using PyTorch 2.0."""

    enhancer: Optional[Literal["df"]] = "df"
    """Enhancer to use for post-processing."""

    init_from: str = "resume"
    """Either 'resume' (from an out_dir) or a gpt2 variant (e.g. 'gpt2-xl')."""

    use_kv_cache: Optional[Literal["flash_decoding", "vanilla"]] = None
    """Type of kv caching to use for inference: 1) [none] no kv caching, 2) [flash_decoding] use the
    flash decoding kernel, 3) [vanilla] use flash attention 2 with a hand-implemented kv-cache."""

    output_dir: str = "samples/"
    """Relative path to output directory."""

    guidance_scale: Optional[float] = 3.0
    """Guidance scale for sampling."""

    batch_size: int = 128
    """Batch size to use for sampling. Note that the batch size gets doubled when guidance is used. For an H100 and a 1B model,
    1 w/ guidance and 1 w/o guidance work well (without kv-caching). With kv-caching, 128 (w/o guidance) and
    64 (w/ guidance) work well."""

    first_model_path: Optional[str] = None
    """Path to a first-stage checkpoint to load weights from (see the state-dict loading above)."""


if __name__ == "__main__":
    # TODO: add support for batch sampling via CLI. Function has been implemented above.
    sampling_config = tyro.cli(SamplingControllerConfig, use_underscores=True)

    model_dir = snapshot_download(repo_id=sampling_config.huggingface_repo_id)
    first_stage_ckpt_path = get_first_stage_path(model_dir)
    second_stage_ckpt_path = get_second_stage_path(model_dir)

    config_first_stage = InferenceConfig(
        ckpt_path=first_stage_ckpt_path,
        num_samples=sampling_config.num_samples,
        seed=sampling_config.seed,
        device=sampling_config.device,
        dtype=sampling_config.dtype,
        compile=sampling_config.compile,
        init_from=sampling_config.init_from,
        output_dir=sampling_config.output_dir,
    )

    config_second_stage = InferenceConfig(
        ckpt_path=second_stage_ckpt_path,
        num_samples=sampling_config.num_samples,
        seed=sampling_config.seed,
        device=sampling_config.device,
        dtype=sampling_config.dtype,
        compile=sampling_config.compile,
        init_from=sampling_config.init_from,
        output_dir=sampling_config.output_dir,
    )

    sampling_config.max_new_tokens *= (
        2  # deal with max_new_tokens for flattened interleaving! (should scale with num_codebooks?)
    )

    # define models
    smodel, llm_first_stage, llm_second_stage = build_models(
        config_first_stage,
        config_second_stage,
        sampling_config.device,
        sampling_config.use_kv_cache,
        sampling_config.first_model_path,
    )

    print("Synthesising utterance...")
    sample_utterance(
        sampling_config.text,
        os.path.expanduser(sampling_config.spk_cond_path),
        smodel,
        llm_first_stage,
        llm_second_stage,
        sampling_config.enhancer,
        first_stage_ckpt_path,
        second_stage_ckpt_path,
        sampling_config.guidance_scale,
        max_new_tokens=sampling_config.max_new_tokens,
        top_k=sampling_config.top_k,
        top_p=sampling_config.top_p,
        temperature=sampling_config.temperature,
    )
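
For reference, a sketch of how the CLI above might be invoked (the repo id and speaker path are placeholders; since tyro is called with use_underscores=True, flag names match the SamplingControllerConfig field names):

    python fam/llm/sample.py \
        --huggingface_repo_id <repo_id> \
        --spk_cond_path <30s_reference.wav> \
        --text "コトバテクノロジーズのミッションは、音声基盤モデルを作ることです。"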
fam/llm/serving.py
ADDED
@@ -0,0 +1,197 @@
import json
import logging
import os
import shlex
import subprocess
import tempfile
from pathlib import Path
from typing import Literal, Optional

import fastapi
import fastapi.middleware.cors
import torch
import tyro
import uvicorn
from attr import dataclass
from fastapi import Request
from fastapi.responses import Response
from huggingface_hub import snapshot_download

from fam.llm.sample import (
    InferenceConfig,
    Model,
    build_models,
    get_first_stage_path,
    get_second_stage_path,
    # sample_utterance,
)
from fam.llm.fast_inference import TTS

logger = logging.getLogger(__name__)


## Setup FastAPI server.
app = fastapi.FastAPI()


@dataclass
class ServingConfig:
    huggingface_repo_id: str
    """Hugging Face repo id to download the model checkpoints from."""

    max_new_tokens: int = 864 * 2
    """Maximum number of new tokens to generate from the first stage model."""

    temperature: float = 1.0
    """Temperature for sampling applied to both models."""

    top_k: int = 200
    """Top k for sampling applied to both models."""

    seed: int = 1337
    """Random seed for sampling."""

    dtype: Literal["bfloat16", "float16", "float32", "tfloat32"] = "bfloat16"
    """Data type to use for sampling."""

    enhancer: Optional[Literal["df"]] = "df"
    """Enhancer to use for post-processing."""

    port: int = 58003


# Singleton
class _GlobalState:
    config: ServingConfig
    tts: TTS


GlobalState = _GlobalState()


@dataclass(frozen=True)
class TTSRequest:
    text: str
    guidance: Optional[float] = 3.0
    top_p: Optional[float] = 0.95
    speaker_ref_path: Optional[str] = None
    top_k: Optional[int] = None


def sample_utterance(
    text: str,
    spk_cond_path: str | None,
    guidance_scale,
    max_new_tokens,
    top_k,
    top_p,
    temperature,
) -> str:
    # NOTE: max_new_tokens and top_k are accepted for API compatibility but are not
    # forwarded to TTS.synthesise below.
    return GlobalState.tts.synthesise(
        text,
        spk_cond_path,
        top_p=top_p,
        guidance_scale=guidance_scale,
        temperature=temperature,
    )


@app.post("/tts", response_class=Response)
async def text_to_speech(req: Request):
    audiodata = await req.body()
    payload = None
    wav_out_path = None

    try:
        headers = req.headers
        payload = headers["X-Payload"]
        payload = json.loads(payload)
        tts_req = TTSRequest(**payload)
        with tempfile.NamedTemporaryFile(suffix=".wav") as wav_tmp:
            if tts_req.speaker_ref_path is None:
                wav_path = _convert_audiodata_to_wav_path(audiodata, wav_tmp)
            else:
                wav_path = tts_req.speaker_ref_path
            wav_out_path = sample_utterance(
                tts_req.text,
                wav_path,
                guidance_scale=tts_req.guidance,
                max_new_tokens=GlobalState.config.max_new_tokens,
                temperature=GlobalState.config.temperature,
                top_k=tts_req.top_k,
                top_p=tts_req.top_p,
            )
        with open(wav_out_path, "rb") as f:
            return Response(content=f.read(), media_type="audio/wav")
    except Exception as e:
        # traceback_str = "".join(traceback.format_tb(e.__traceback__))
        logger.exception(f"Error processing request {payload}")
        return Response(
            content="Something went wrong. Please try again in a few mins or contact us on Discord",
            status_code=500,
        )
    finally:
        if wav_out_path is not None:
            Path(wav_out_path).unlink(missing_ok=True)


def _convert_audiodata_to_wav_path(audiodata, wav_tmp):
    with tempfile.NamedTemporaryFile() as unknown_format_tmp:
        assert unknown_format_tmp.write(audiodata) > 0
        unknown_format_tmp.flush()

        subprocess.check_output(
            # arbitrary 2 minute cutoff
            shlex.split(f"ffmpeg -t 120 -y -i {unknown_format_tmp.name} -f wav {wav_tmp.name}")
        )

        return wav_tmp.name


if __name__ == "__main__":
    # This has to be here to avoid some weird audiocraft shenanigans messing up matplotlib
    from fam.llm.enhancers import get_enhancer

    for name in logging.root.manager.loggerDict:
        logger = logging.getLogger(name)
        logger.setLevel(logging.INFO)
    logging.root.setLevel(logging.INFO)

    GlobalState.config = tyro.cli(ServingConfig)
    app.add_middleware(
        fastapi.middleware.cors.CORSMiddleware,
        allow_origins=["*", f"http://localhost:{GlobalState.config.port}", "http://localhost:3000"],
        allow_credentials=True,
        allow_methods=["*"],
        allow_headers=["*"],
    )

    device = "cuda" if torch.cuda.is_available() else "cpu"
    common_config = dict(
        num_samples=1,
        seed=1337,
        device=device,
        dtype=GlobalState.config.dtype,
        compile=False,
        init_from="resume",
        output_dir=tempfile.mkdtemp(),
    )
    model_dir = snapshot_download(repo_id=GlobalState.config.huggingface_repo_id)
    config1 = InferenceConfig(
        ckpt_path=get_first_stage_path(model_dir),
        **common_config,
    )

    config2 = InferenceConfig(
        ckpt_path=get_second_stage_path(model_dir),
        **common_config,
    )

    # NOTE: config1 / config2 are constructed here but not passed anywhere below;
    # TTS() loads its own checkpoints.
    GlobalState.tts = TTS()

    # start server
    uvicorn.run(
        app,
        host="127.0.0.1",
        port=GlobalState.config.port,
        log_level="info",
    )
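
For reference, a minimal client sketch for the /tts endpoint above (the server must be running on the configured port; the speaker URL is a placeholder). It mirrors the request format used by fam/ui/app.py: sampling parameters travel as JSON in the X-Payload header, and raw reference-audio bytes go in the request body only when speaker_ref_path is not set:

    import json
    import requests

    payload = {
        "text": "こんにちは。",
        "guidance": 3.0,
        "top_p": 0.95,
        "speaker_ref_path": "https://example.com/speaker.wav",  # placeholder
    }
    resp = requests.post(
        "http://127.0.0.1:58003/tts",
        headers={"Content-Type": "audio/wav", "X-Payload": json.dumps(payload)},
        data=None,  # put raw audio bytes here instead, when speaker_ref_path is None
    )
    if resp.status_code == 200:
        with open("out.wav", "wb") as f:
            f.write(resp.content)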
fam/llm/utils.py
ADDED
@@ -0,0 +1,91 @@
import os
import re
import subprocess
import tempfile

import librosa
import torch


def normalize_text(text: str) -> str:
    # maps typographic unicode punctuation (keyed by codepoint) to ASCII equivalents
    unicode_conversion = {
        8175: "'",
        8189: "'",
        8190: "'",
        8208: "-",
        8209: "-",
        8210: "-",
        8211: "-",
        8212: "-",
        8213: "-",
        8214: "||",
        8216: "'",
        8217: "'",
        8218: ",",
        8219: "`",
        8220: '"',
        8221: '"',
        8222: ",,",
        8223: '"',
        8228: ".",
        8229: "..",
        8230: "...",
        8242: "'",
        8243: '"',
        8245: "'",
        8246: '"',
        180: "'",
        2122: "TM",  # Trademark
    }

    text = text.translate(unicode_conversion)

    non_bpe_chars = set([c for c in list(text) if ord(c) >= 256])
    # if len(non_bpe_chars) > 0:
    #     non_bpe_points = [(c, ord(c)) for c in non_bpe_chars]
    #     raise ValueError(f"Non-BPE single token characters found: {non_bpe_points}")

    text = text.replace("\t", " ")
    text = text.replace("\n", " ")
    text = text.replace("*", " ")
    text = text.strip()
    text = re.sub(r"\s\s+", " ", text)  # remove multiple spaces
    return text


def check_audio_file(path_or_uri, threshold_s=10):  # default used to be 30
    if "http" in path_or_uri:
        temp_fd, filepath = tempfile.mkstemp()
        os.close(temp_fd)  # Close the file descriptor; curl will create a new connection
        curl_command = ["curl", "-L", path_or_uri, "-o", filepath]
        subprocess.run(curl_command, check=True)
    else:
        filepath = path_or_uri

    audio, sr = librosa.load(filepath)
    duration_s = librosa.get_duration(y=audio, sr=sr)
    if duration_s < threshold_s:
        raise Exception(
            f"The audio file is too short. Please provide an audio file that is at least {threshold_s} seconds long to proceed."
        )

    # Clean up the temporary file if it was created
    if "http" in path_or_uri:
        os.remove(filepath)


def get_default_dtype() -> str:
    """Compute default 'dtype' based on GPU architecture"""
    if torch.cuda.is_available():
        for i in range(torch.cuda.device_count()):
            device_properties = torch.cuda.get_device_properties(i)
            # float16 for Tesla/Turing (compute capability <= 7), bfloat16 for newer architectures
            dtype = "float16" if device_properties.major <= 7 else "bfloat16"
    else:
        dtype = "float16"

    print(f"using dtype={dtype}")
    return dtype


def get_device() -> str:
    return "cuda" if torch.cuda.is_available() else "cpu"
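
As a quick check of normalize_text above: typographic punctuation is mapped back to ASCII by codepoint, newlines become spaces, and runs of whitespace collapse:

    assert normalize_text("It\u2019s  a \u201ctest\u201d\n") == 'It\'s a "test"'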
fam/py.typed
ADDED
File without changes
fam/quantiser/__init__.py
ADDED
File without changes
fam/quantiser/audio/__init__.py
ADDED
File without changes
fam/quantiser/audio/speaker_encoder/__init__.py
ADDED
File without changes
fam/quantiser/audio/speaker_encoder/audio.py
ADDED
@@ -0,0 +1,22 @@
import librosa
import numpy as np

mel_window_length = 25
mel_window_step = 10
mel_n_channels = 40
sampling_rate = 16000


def wav_to_mel_spectrogram(wav):
    """
    Derives a mel spectrogram ready to be used by the encoder from a preprocessed audio waveform.
    Note: this is not a log-mel spectrogram.
    """
    frames = librosa.feature.melspectrogram(
        y=wav,
        sr=sampling_rate,
        n_fft=int(sampling_rate * mel_window_length / 1000),
        hop_length=int(sampling_rate * mel_window_step / 1000),
        n_mels=mel_n_channels,
    )
    return frames.astype(np.float32).T
fam/quantiser/audio/speaker_encoder/ckpt/.gitattributes
ADDED
@@ -0,0 +1 @@
*.pt filter=lfs diff=lfs merge=lfs -text
fam/quantiser/audio/speaker_encoder/ckpt/ckpt.pt
ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:bc7ff82ef75becd495aab2ede3a8220da393a717f178ae9534df355a6173bbca
size 17090379
fam/quantiser/audio/speaker_encoder/model.py
ADDED
@@ -0,0 +1,123 @@
import os
from time import perf_counter as timer
from typing import List, Optional, Union

import librosa
import numpy as np
import torch
from torch import nn

from fam.quantiser.audio.speaker_encoder import audio

DEFAULT_SPKENC_CKPT_PATH = os.path.join(os.path.dirname(os.path.abspath(__file__)), "ckpt/ckpt.pt")

mel_window_step = 10
mel_n_channels = 40
sampling_rate = 16000
partials_n_frames = 160
model_hidden_size = 256
model_embedding_size = 256
model_num_layers = 3


class SpeakerEncoder(nn.Module):
    def __init__(
        self,
        weights_fpath: Optional[str] = None,
        device: Optional[Union[str, torch.device]] = None,
        verbose: bool = True,
        eval: bool = False,
    ):
        super().__init__()

        # Define the network
        self.lstm = nn.LSTM(mel_n_channels, model_hidden_size, model_num_layers, batch_first=True)
        self.linear = nn.Linear(model_hidden_size, model_embedding_size)
        self.relu = nn.ReLU()

        # Get the target device
        if device is None:
            device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
        elif isinstance(device, str):
            device = torch.device(device)
        self.device = device

        start = timer()
        if eval and weights_fpath is None:
            weights_fpath = DEFAULT_SPKENC_CKPT_PATH

        if weights_fpath is not None:
            checkpoint = torch.load(weights_fpath, map_location="cpu")

            self.load_state_dict(checkpoint["model_state"], strict=False)
            self.to(device)

            if eval:
                self.eval()

            if verbose:
                print("Loaded the speaker embedding model on %s in %.2f seconds." % (device.type, timer() - start))

    def forward(self, mels: torch.FloatTensor):
        _, (hidden, _) = self.lstm(mels)
        embeds_raw = self.relu(self.linear(hidden[-1]))
        return embeds_raw / torch.norm(embeds_raw, dim=1, keepdim=True)

    @staticmethod
    def compute_partial_slices(n_samples: int, rate, min_coverage):
        # Compute how many frames separate two partial utterances
        samples_per_frame = int((sampling_rate * mel_window_step / 1000))
        n_frames = int(np.ceil((n_samples + 1) / samples_per_frame))
        frame_step = int(np.round((sampling_rate / rate) / samples_per_frame))

        # Compute the slices
        wav_slices, mel_slices = [], []
        steps = max(1, n_frames - partials_n_frames + frame_step + 1)
        for i in range(0, steps, frame_step):
            mel_range = np.array([i, i + partials_n_frames])
            wav_range = mel_range * samples_per_frame
            mel_slices.append(slice(*mel_range))
            wav_slices.append(slice(*wav_range))

        # Evaluate whether extra padding is warranted or not
        last_wav_range = wav_slices[-1]
        coverage = (n_samples - last_wav_range.start) / (last_wav_range.stop - last_wav_range.start)
        if coverage < min_coverage and len(mel_slices) > 1:
            mel_slices = mel_slices[:-1]
            wav_slices = wav_slices[:-1]

        return wav_slices, mel_slices

    def embed_utterance(self, wav: np.ndarray, return_partials=False, rate=1.3, min_coverage=0.75, numpy: bool = True):
        wav_slices, mel_slices = self.compute_partial_slices(len(wav), rate, min_coverage)
        max_wave_length = wav_slices[-1].stop
        if max_wave_length >= len(wav):
            wav = np.pad(wav, (0, max_wave_length - len(wav)), "constant")

        mel = audio.wav_to_mel_spectrogram(wav)
        mels = np.array([mel[s] for s in mel_slices])
        with torch.no_grad():
            mels = torch.from_numpy(mels).to(self.device)  # type: ignore
            partial_embeds = self(mels)

        if numpy:
            partial_embeds = partial_embeds.cpu().numpy()
            raw_embed = np.mean(partial_embeds, axis=0)
            embed = raw_embed / np.linalg.norm(raw_embed, 2)
        else:
            raw_embed = partial_embeds.mean(dim=0)
            embed = raw_embed / torch.linalg.norm(raw_embed, 2)

        if return_partials:
            return embed, partial_embeds, wav_slices
        return embed

    def embed_speaker(self, wavs: List[np.ndarray], **kwargs):
        raw_embed = np.mean([self.embed_utterance(wav, return_partials=False, **kwargs) for wav in wavs], axis=0)
        return raw_embed / np.linalg.norm(raw_embed, 2)

    def embed_utterance_from_file(self, fpath: str, numpy: bool) -> torch.Tensor:
        wav_tgt, _ = librosa.load(fpath, sr=16000)
        wav_tgt, _ = librosa.effects.trim(wav_tgt, top_db=20)
        embedding = self.embed_utterance(wav_tgt, numpy=numpy)
        return embedding
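
A minimal usage sketch for the encoder above (the wav path is a placeholder): with eval=True and no explicit checkpoint it loads the bundled ckpt/ckpt.pt and returns a unit-norm 256-dimensional speaker embedding:

    from fam.quantiser.audio.speaker_encoder.model import SpeakerEncoder

    enc = SpeakerEncoder(device="cpu", eval=True, verbose=False)
    emb = enc.embed_utterance_from_file("reference.wav", numpy=True)  # placeholder path
    print(emb.shape)  # (256,)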
fam/quantiser/text/tokenise.py
ADDED
@@ -0,0 +1,32 @@
import tiktoken


class TrainedBPETokeniser:
    def __init__(self, name, pat_str, mergeable_ranks, special_tokens, offset=None) -> None:
        self.tokenizer = tiktoken.Encoding(
            name=name,
            pat_str=pat_str,
            mergeable_ranks=mergeable_ranks,
            special_tokens=special_tokens,
        )
        self.offset = offset

    def encode(self, text: str) -> list[int]:
        # note: we add an end-of-text token!
        tokens = self.tokenizer.encode(text) + [self.tokenizer.eot_token]
        if self.offset is not None:
            tokens = [x + self.offset for x in tokens]

        return tokens

    def decode(self, tokens: list[int]):
        if self.offset is not None:
            tokens = [x - self.offset for x in tokens]
        return self.tokenizer.decode(tokens)

    @property
    def eot_token(self):
        if self.offset is not None:
            return self.tokenizer.eot_token + self.offset
        else:
            return self.tokenizer.eot_token
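
A sketch of the offset round-trip (the base encoding here is a stand-in pulled from tiktoken's registry via private attributes; in this repo the constructor arguments come from the trained tokenizer config stored with the checkpoints):

    import tiktoken

    base = tiktoken.get_encoding("gpt2")  # stand-in for the trained BPE
    tok = TrainedBPETokeniser(
        name="demo",
        pat_str=base._pat_str,
        mergeable_ranks=base._mergeable_ranks,
        special_tokens=base._special_tokens,
        offset=512,
    )
    ids = tok.encode("hello")               # BPE ids + eot token, all shifted up by 512
    assert tok.decode(ids[:-1]) == "hello"  # decode shifts back down before detokenising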
fam/ui/app.py
ADDED
@@ -0,0 +1,201 @@
import io
import json
import os

import gradio as gr
import requests
import soundfile as sf

API_SERVER_URL = "http://127.0.0.1:58003/tts"
RADIO_CHOICES = ["Preset voices", "Upload target voice", "Record your voice"]
MAX_CHARS = 220
PRESET_VOICES = {
    # female
    "Ava": "https://cdn.themetavoice.xyz/speakers/ava.flac",
    "Bria": "https://cdn.themetavoice.xyz/speakers/bria.mp3",
    # male
    "Alex": "https://cdn.themetavoice.xyz/speakers/alex.mp3",
    "Jacob": "https://cdn.themetavoice.xyz/speakers/jacob.wav",
}


def denormalise_top_p(top_p):
    # maps the 0-10 UI slider to top_p in the range [0.9, 1.0]
    return round(0.9 + top_p / 100, 2)


def denormalise_guidance(guidance):
    # maps the 1-5 UI slider to guidance in the range [1.0, 3.0]
    return 1 + ((guidance - 1) * (3 - 1)) / (5 - 1)
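
# Worked examples for the two mappings above:
#   denormalise_top_p(5.0)    = round(0.9 + 5 / 100, 2)           = 0.95
#   denormalise_guidance(5.0) = 1 + ((5 - 1) * (3 - 1)) / (5 - 1) = 3.0
# so the slider defaults (5.0 and 5.0) correspond to top_p=0.95 and guidance=3.0,
# matching the TTSRequest defaults in fam/llm/serving.py.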

def _handle_edge_cases(to_say, upload_target):
    if not to_say:
        raise gr.Error("Please provide text to synthesise")

    def _check_file_size(path):
        if not path:
            return
        filesize = os.path.getsize(path)
        filesize_mb = filesize / 1024 / 1024
        if filesize_mb >= 50:
            raise gr.Error(
                f"Please upload a sample less than 50MB for voice cloning. Provided: {round(filesize_mb)} MB"
            )

    _check_file_size(upload_target)


def tts(to_say, top_p, guidance, toggle, preset_dropdown, upload_target, record_target):
    d_top_p = denormalise_top_p(top_p)
    d_guidance = denormalise_guidance(guidance)

    _handle_edge_cases(to_say, upload_target)

    to_say = to_say if len(to_say) < MAX_CHARS else to_say[:MAX_CHARS]

    custom_target_path = None
    if toggle == RADIO_CHOICES[1]:
        custom_target_path = upload_target
    elif toggle == RADIO_CHOICES[2]:
        custom_target_path = record_target

    config = {
        "text": to_say,
        "guidance": d_guidance,
        "top_p": d_top_p,
        "speaker_ref_path": PRESET_VOICES[preset_dropdown] if toggle == RADIO_CHOICES[0] else None,
    }
    headers = {"Content-Type": "audio/wav", "X-Payload": json.dumps(config)}
    if not custom_target_path:
        response = requests.post(API_SERVER_URL, headers=headers, data=None)
    else:
        with open(custom_target_path, "rb") as f:
            data = f.read()
        response = requests.post(API_SERVER_URL, headers=headers, data=data)

    wav, sr = None, None
    if response.status_code == 200:
        audio_buffer = io.BytesIO(response.content)
        audio_buffer.seek(0)
        wav, sr = sf.read(audio_buffer, dtype="float32")
    else:
        print(f"Something went wrong. response status code: {response.status_code}")

    return sr, wav


def change_voice_selection_layout(choice):
    index = RADIO_CHOICES.index(choice)
    return [
        gr.update(visible=True) if i == index else gr.update(visible=False)
        for i in range(len(RADIO_CHOICES))
    ]


title = "# TTS by Kotoba-Speech"

description = """
<strong>Kotoba-Speech v0.1</strong>は、1.2Bのトランスフォーマーに基づく音声生成モデルです。
以下の機能をサポートしています:
\n
* 日本語における滑らかなテキスト読み上げ生成
* スピーチプロンプトを通じたOne-shot音声クローニング

Kotoba Technologiesは、公開されたモデルを商用可能なApache 2.0ライセンスで公開します。
推論およびモデルコードは、Meta-Voiceをベースに作られており、学習コードは弊社のGitHubで近日中に公開する予定です。
Kotoba Technologiesは、音声基盤モデルの開発に取り組んでおり、今後もモデルの公開を行なっていきます。是非、[Discord Community](https://discord.gg/qPVFqhGN7Z)に参加してご意見ください!

<strong>Kotoba-Speech v0.1</strong> is a 1.2B Transformer-based speech generative model. It supports the following properties:
\n
* Fluent text-to-speech generation in Japanese
* One-shot voice cloning through a speech prompt

We are releasing our model under the Apache 2.0 license. Our inference and model code is adapted from Meta-Voice, and we will release our training code on our GitHub repository shortly.
Kotoba Technologies is committed to developing speech foundation models, and we'll continue releasing models. Please join [our Discord](https://discord.gg/qPVFqhGN7Z) to contribute to our community.
"""

with gr.Blocks(title="TTS by Kotoba-Speech") as demo:
    gr.Markdown(title)

    with gr.Row():
        gr.Markdown(description)

    with gr.Row():
        with gr.Column():
            to_say = gr.TextArea(
                label="What should I say!?",
                lines=4,
                value="コトバテクノロジーズのミッションは、音声基盤モデルを作ることです。",
            )

            with gr.Row(), gr.Column():
                # voice settings
                top_p = gr.Slider(
                    value=5.0,
                    minimum=0.0,
                    maximum=10.0,
                    step=1.0,
                    label="Speech Stability - improves text following for a challenging speaker",
                )
                guidance = gr.Slider(
                    value=5.0,
                    minimum=1.0,
                    maximum=5.0,
                    step=1.0,
                    label="Speaker similarity - How closely to match speaker identity and speech style.",
                )

                # voice select
                toggle = gr.Radio(choices=RADIO_CHOICES, label="Choose voice", value=RADIO_CHOICES[0])

            with gr.Row(visible=True) as row_1:
                preset_dropdown = gr.Dropdown(
                    PRESET_VOICES.keys(), label="Preset voices", value=list(PRESET_VOICES.keys())[0]
                )
                with gr.Accordion("Preview: Preset voices", open=False):
                    for label, path in PRESET_VOICES.items():
                        gr.Audio(value=path, label=label)

            with gr.Row(visible=False) as row_2:
                upload_target = gr.Audio(
                    sources=["upload"],
                    type="filepath",
                    label="Upload a clean sample to clone. Sample should contain 1 speaker, be between 10-90 seconds and not contain background noise.",
                    min_length=10,
                    max_length=90,
                )

            with gr.Row(visible=False) as row_3:
                record_target = gr.Audio(
                    sources=["microphone"],
                    type="filepath",
                    label="Record your voice with a microphone to clone. Sample should contain 1 speaker, be between 10-90 seconds and not contain background noise.",
                    min_length=10,
                    max_length=90,
                )

            toggle.change(
                change_voice_selection_layout,
                inputs=toggle,
                outputs=[row_1, row_2, row_3],
            )

        with gr.Column():
            speech = gr.Audio(
                type="numpy",
                label="Kotoba-Speech says...",
            )

    submit = gr.Button("Generate Speech")
    submit.click(
        fn=tts,
        inputs=[to_say, top_p, guidance, toggle, preset_dropdown, upload_target, record_target],
        outputs=speech,
    )


demo.queue(default_concurrency_limit=2)
# demo.launch()
demo.launch(server_name="0.0.0.0", server_port=3000, share=True)
requirements.txt
ADDED
@@ -0,0 +1,138 @@
anyio==4.0.0
argon2-cffi==23.1.0
argon2-cffi-bindings==21.2.0
arrow==1.3.0
asttokens==2.4.1
async-lru==2.0.4
attrs==23.1.0
Babel==2.13.1
beautifulsoup4==4.12.2
bleach==6.1.0
blinker==1.4
certifi==2022.12.7
cffi==1.16.0
charset-normalizer==2.1.1
comm==0.2.0
cryptography==3.4.8
dbus-python==1.2.18
debugpy==1.8.0
decorator==5.1.1
deepfilternet
defusedxml==0.7.1
distro==1.7.0
entrypoints==0.4
exceptiongroup==1.1.3
executing==2.0.1
fastjsonschema==2.18.1
filelock==3.9.0
fqdn==1.5.1
fsspec
gradio==4.21.0
httplib2==0.20.2
idna==3.4
importlib-metadata==4.6.4
ipykernel==6.26.0
ipython==8.17.2
ipython-genutils==0.2.0
ipywidgets==8.1.1
isoduration==20.11.0
jedi==0.19.1
jeepney==0.7.1
Jinja2==3.1.2
json5==0.9.14
jsonpointer==2.4
jsonschema==4.19.2
jsonschema-specifications==2023.7.1
jupyter-archive==3.4.0
jupyter-contrib-core==0.4.2
jupyter-contrib-nbextensions==0.7.0
jupyter-events==0.9.0
jupyter-highlight-selected-word==0.2.0
jupyter-lsp==2.2.0
jupyter-nbextensions-configurator==0.6.3
jupyter_client==7.4.9
jupyter_core==5.5.0
jupyter_server==2.10.0
jupyter_server_terminals==0.4.4
jupyterlab==4.0.8
jupyterlab-pygments==0.2.2
jupyterlab-widgets==3.0.9
jupyterlab_server==2.25.0
keyring==23.5.0
launchpadlib==1.10.16
lazr.restfulclient==0.14.4
lazr.uri==1.0.6
librosa
lxml==4.9.3
Mako==1.1.3
Markdown==3.3.6
MarkupSafe==2.1.2
matplotlib-inline==0.1.6
mistune==3.0.2
more-itertools==8.10.0
mpmath==1.3.0
nbclassic==1.0.0
nbclient==0.9.0
nbconvert==7.11.0
nbformat==5.9.2
nest-asyncio==1.5.8
networkx==3.0
notebook==6.5.5
notebook_shim==0.2.3
numpy==1.24.1
oauthlib==3.2.0
overrides==7.4.0
packaging==23.2
pandocfilters==1.5.0
parso==0.8.3
pexpect==4.8.0
Pillow==9.3.0
platformdirs==3.11.0
prometheus-client==0.18.0
prompt-toolkit==3.0.39
psutil==5.9.6
ptyprocess==0.7.0
pure-eval==0.2.2
pycparser==2.21
Pygments==2.16.1
PyGObject==3.42.1
PyJWT==2.3.0
pyparsing==2.4.7
# python-apt==2.4.0
python-dateutil==2.8.2
python-json-logger==2.0.7
PyYAML==6.0.1
pyzmq==24.0.1
referencing==0.30.2
requests==2.31.0
rfc3339-validator==0.1.4
rfc3986-validator==0.1.1
rpds-py==0.12.0
SecretStorage==3.3.1
Send2Trash==1.8.2
six==1.16.0
sniffio==1.3.0
soundfile==0.12.1
soupsieve==2.5
stack-data==0.6.3
sympy==1.12
terminado==0.17.1
tinycss2==1.2.1
tomli==2.0.1
torch
torchaudio
torchvision
tornado==6.3.3
traitlets==5.13.0
triton==2.1.0
types-python-dateutil==2.8.19.14
typing_extensions
uri-template==1.3.0
urllib3==1.26.13
wadllib==1.3.6
wcwidth==0.2.9
webcolors==1.13
webencodings==0.5.1
websocket-client==1.6.4
widgetsnbextension==4.0.9
zipp==1.0.0
setup.py
ADDED
@@ -0,0 +1,6 @@
from setuptools import find_packages, setup  # type: ignore

setup(
    name="fam",
    packages=find_packages(".", exclude=["tests"]),
)
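
With this setup.py, the `fam` package (tests excluded from packaging) can be installed for local development in editable mode:

    pip install -e .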