SAMPLE_RATE = 32000


def IDLE(length=1.0, out_wav='out.wav', sr=SAMPLE_RATE):
    """
    function: write `length` seconds of silence to a .wav file
    @params:
        length: duration of the silence, in seconds
        out_wav: path of the .wav file to create
        sr: sample rate, used to size the buffer and to write the file
    """
    idle = np.zeros(int(length * sr))
    # The file must be written at the same rate that sized the buffer;
    # otherwise a non-default `sr` produces a clip of the wrong duration.
    WRITE_AUDIO(idle, name=out_wav, sr=sr)


def LOUDNESS_NORM(audio, sr=32000, volumn=-25):
    """
    function: peak-normalize audio, then loudness-normalize it to a target
    @params:
        audio: np.array of samples
        sr: sample rate handed to the loudness meter
        volumn: target loudness passed to pyloudnorm (parameter name kept
            as spelled, because callers pass it by keyword)
    return: the normalized np.array
    """
    # peak normalize audio to -10 dB
    peak_normalized_audio = pyln.normalize.peak(audio, -10.0)
    # measure the loudness first
    meter = pyln.Meter(sr)  # create BS.1770 meter
    loudness = meter.integrated_loudness(peak_normalized_audio)
    # loudness normalize audio to the requested target (`volumn`)
    return pyln.normalize.loudness(peak_normalized_audio, loudness, volumn)


def WRITE_AUDIO(wav, name=None, sr=SAMPLE_RATE):
    """
    function: write audio numpy to a 16-bit PCM .wav file
    @params:
        wav: np.array [samples]; if it has more than one dimension only
            `wav[0]` is written
        name: output path, defaults to 'output.wav'
        sr: sample rate stored in the file
    """
    if name is None:
        name = 'output.wav'

    wav = np.asarray(wav)
    if wav.ndim > 1:
        wav = wav[0]

    # declipping: rescale into a NEW array so the caller's buffer is left
    # untouched (an in-place `*=` also fails outright on integer arrays).
    # `initial=0.0` keeps an empty signal from raising inside np.max.
    max_value = np.max(np.abs(wav), initial=0.0)
    if max_value > 1:
        wav = wav * (0.9 / max_value)

    write(name, sr, np.round(wav * 32767).astype(np.int16))


def READ_AUDIO_NUMPY(wav, sr=SAMPLE_RATE):
    """
    function: read an audio file as a numpy array, resampled to `sr`
    @params:
        wav: path of the audio file
        sr: sample rate the returned samples should have
    return: np.array [samples], taken from index 0 of the loaded tensor
        (the first channel in torchaudio's channel-first layout)
    """
    waveform, sample_rate = torchaudio.load(wav)

    if sample_rate != sr:
        waveform = torchaudio.functional.resample(waveform, orig_freq=sample_rate, new_freq=sr)

    return waveform[0].numpy()


def MIX(wavs=None, out_wav='out.wav', sr=SAMPLE_RATE):
    """
    function: overlay several wav files at absolute offsets and write the mix
    @params:
        wavs: [[wav_name, absolute_offset_in_seconds], ...]; when omitted the
            historical placeholder [['1.wav', 0.], ['2.wav', 10.]] is used
        out_wav: path of the mixed .wav file
        sr: sample rate used for reading, offset conversion and writing
    Negative offsets are not handled specially.
    """
    if wavs is None:
        # Same default as before, built per call instead of as a shared
        # mutable default argument.
        wavs = [['1.wav', 0.], ['2.wav', 10.]]

    # Decode every input once (it used to be loaded twice: once to size the
    # buffer and once to mix) and honour `sr` while reading.
    tracks = [(int(offset * sr), READ_AUDIO_NUMPY(path, sr=sr)) for path, offset in wavs]

    max_length = max(start + len(samples) for start, samples in tracks)
    template_wav = np.zeros(max_length)

    for start, samples in tracks:
        # mix
        template_wav[start:start + len(samples)] += samples

    WRITE_AUDIO(template_wav, name=out_wav, sr=sr)
def CAT(wavs, out_wav='out.wav'):
    """
    function: concatenate wav files back to back and write the result
    @params:
        wavs: List of wav file ['1.wav', '2.wav', ...]
        out_wav: path of the concatenated .wav file
    raises: ValueError when `wavs` is empty
    """
    segments = [READ_AUDIO_NUMPY(path) for path in wavs]
    if not segments:
        raise ValueError('CAT() needs at least one wav file')

    WRITE_AUDIO(np.concatenate(segments, axis=-1), name=out_wav)


def COMPUTE_LEN(wav):
    """
    function: duration of an audio file in seconds
    @params:
        wav: path of the audio file
    """
    samples = READ_AUDIO_NUMPY(wav)
    # READ_AUDIO_NUMPY resamples to SAMPLE_RATE by default, so divide by the
    # same constant rather than a second hard-coded copy of its value.
    return len(samples) / SAMPLE_RATE


def _post_to_service(port, endpoint, payload):
    """
    function: POST `payload` as JSON to a local service, raise on failure
    @params:
        port: port of the service on 127.0.0.1
        endpoint: route name without the leading slash
        payload: dict sent as the JSON body
    raises: RuntimeError when the service does not answer with status 200
    Not decorated with @retry; the public wrappers below own the retries.
    """
    response = requests.post(f'http://127.0.0.1:{port}/{endpoint}', json=payload)

    # An error response is not guaranteed to be JSON (an unhandled exception
    # in a service yields a plain error page); decode defensively so the real
    # failure is reported instead of a JSON decode error or KeyError.
    try:
        body = response.json()
    except ValueError:
        body = {}
    if not isinstance(body, dict):
        body = {}

    if response.status_code == 200:
        print('Success:', body.get('message'))
        return

    error = body.get('API error') or f'HTTP {response.status_code}: {response.text[:200]}'
    print('Error:', error)
    raise RuntimeError(error)


@retry(stop_max_attempt_number=5, wait_fixed=2000)
def TTM(text, length=10, volume=-28, out_wav='out.wav'):
    """
    function: ask the Text-to-Music service to render `text` into `out_wav`
    @params:
        text: music description
        length: requested length, forwarded to the service as a string
        volume: requested volume, forwarded to the service as a string
        out_wav: path the service should write to
    raises: RuntimeError (after the configured retries) on a service error
    """
    data = {
        'text': f'{text}',
        'length': f'{length}',
        'volume': f'{volume}',
        'output_wav': f'{out_wav}',
    }
    _post_to_service(ttm_port, 'generate_music', data)


@retry(stop_max_attempt_number=5, wait_fixed=2000)
def TTA(text, length=5, volume=-35, out_wav='out.wav'):
    """
    function: ask the Text-to-Audio service to render `text` into `out_wav`
    @params:
        text: sound description
        length: requested length, forwarded to the service as a string
        volume: requested volume, forwarded to the service as a string
        out_wav: path the service should write to
    raises: RuntimeError (after the configured retries) on a service error
    """
    data = {
        'text': f'{text}',
        'length': f'{length}',
        'volume': f'{volume}',
        'output_wav': f'{out_wav}',
    }
    _post_to_service(tta_port, 'generate_audio', data)


@retry(stop_max_attempt_number=5, wait_fixed=2000)
def TTS(text, speaker='news_anchor', volume=-20, out_wav='out.wav', enhanced=enable_sr, speaker_id='', speaker_npz=''):
    """
    function: ask the Text-to-Speech service to speak `text` into `out_wav`
    @params:
        text: text to speak
        speaker: accepted for backward compatibility; not sent to the service
        volume: requested volume, forwarded to the service as a string
        out_wav: path the service should write to
        enhanced: when truthy, run speech restoration (SR) on `out_wav`
        speaker_id: forwarded to the service as-is
        speaker_npz: forwarded to the service as-is
    raises: RuntimeError (after the configured retries) on a service error
    """
    data = {
        'text': f'{text}',
        'speaker_id': f'{speaker_id}',
        'speaker_npz': f'{speaker_npz}',
        'volume': f'{volume}',
        'output_wav': f'{out_wav}',
    }
    _post_to_service(tts_port, 'generate_speech', data)

    if enhanced:
        SR(processfile=out_wav)


@retry(stop_max_attempt_number=5, wait_fixed=2000)
def SR(processfile):
    """
    function: ask the Speech-Restoration service to fix `processfile`
    @params:
        processfile: path of the audio file to restore
    raises: RuntimeError (after the configured retries) on a service error
    """
    data = {'processfile': f'{processfile}'}
    _post_to_service(sr_port, 'fix_audio', data)


@retry(stop_max_attempt_number=5, wait_fixed=2000)
def VP(wav_path, out_dir):
    """
    function: ask the Voice-Parser service to parse `wav_path` into `out_dir`
    @params:
        wav_path: path of the voice recording
        out_dir: directory the service should write its output to
    raises: RuntimeError (after the configured retries) on a service error
    """
    data = {
        'wav_path': f'{wav_path}',
        'out_dir': f'{out_dir}',
    }
    _post_to_service(vp_port, 'parse_voice', data)
def _parse_generation_request(data):
    """Extract (text, length, volume, output_wav) from a request body.

    Raises KeyError when 'text' is missing, TypeError when `data` is not a
    mapping, and ValueError when 'length' or 'volume' is not numeric.
    """
    text = data['text']
    length = float(data.get('length', 5.0))
    volume = float(data.get('volume', -35))
    output_wav = data.get('output_wav', 'out.wav')
    return text, length, volume, output_wav


@app.route('/generate_audio', methods=['POST'])
def generate_audio():
    """Render the prompt with AudioGen and write it to the requested file.

    Responds with a JSON 'message' on success and a JSON 'API error' on
    failure (400 for a malformed request, 500 for a generation error).
    """
    # Validate inside a handler of our own so a malformed request is answered
    # with the JSON error shape the client parses, not an HTML error page.
    try:
        text, length, volume, output_wav = _parse_generation_request(request.json)
    except (KeyError, TypeError, ValueError) as e:
        return jsonify({'API error': f'Invalid request: {e!r}'}), 400

    logging.info(f'TTA (AudioGen): Prompt: {text}, length: {length} seconds, volume: {volume} dB')

    try:
        tta_model.set_generation_params(duration=length)
        wav = tta_model.generate([text])
        wav = torchaudio.functional.resample(wav, orig_freq=16000, new_freq=32000)

        wav = wav.squeeze().cpu().detach().numpy()
        wav = fade(LOUDNESS_NORM(wav, volumn=volume))
        WRITE_AUDIO(wav, name=output_wav)

        # Return success message and the filename of the generated audio
        return jsonify({'message': f'Text-to-Audio generated successfully | {text}', 'file': output_wav})

    except Exception as e:
        # Keep the traceback in the service log; the client only sees str(e)
        logging.exception('Text-to-Audio generation failed')
        return jsonify({'API error': str(e)}), 500


@app.route('/generate_music', methods=['POST'])
def generate_music():
    """Render the prompt with MusicGen and write it to the requested file.

    Responds with a JSON 'message' on success and a JSON 'API error' on
    failure (400 for a malformed request, 500 for a generation error).
    """
    try:
        text, length, volume, output_wav = _parse_generation_request(request.json)
    except (KeyError, TypeError, ValueError) as e:
        return jsonify({'API error': f'Invalid request: {e!r}'}), 400

    logging.info(f'TTM (MusicGen): Prompt: {text}, length: {length} seconds, volume: {volume} dB')

    try:
        ttm_model.set_generation_params(duration=length)
        wav = ttm_model.generate([text])
        # First item, first row of the generated batch; no resampling is
        # applied on this path.
        wav = wav[0][0].cpu().detach().numpy()
        wav = fade(LOUDNESS_NORM(wav, volumn=volume))
        WRITE_AUDIO(wav, name=output_wav)

        # Return success message and the filename of the generated audio
        return jsonify({'message': f'Text-to-Music generated successfully | {text}', 'file': output_wav})

    except Exception as e:
        # Return error message if something goes wrong
        logging.exception('Text-to-Music generation failed')
        return jsonify({'API error': str(e)}), 500


if __name__ == '__main__':
    # `config` is already loaded at import time, so it is not re-read here.
    tta_service_port = config['Text-to-Audio']['service-port']
    # Bug fix: this used to read the Text-to-Audio entry a second time, so
    # the consistency check below could never fail.
    ttm_service_port = config['Text-to-Music']['service-port']

    # Both routes are served by this one Flask app, hence a single port.
    if tta_service_port != ttm_service_port:
        msg = 'Ports of TTA and TTM should be same if you are using Audiocraft ...'
        logging.error(msg)
        raise ValueError(msg)

    app.run(debug=False, port=tta_service_port)
@app.route('/generate_speech', methods=['POST'])
def generate_speech():
    """Synthesize the posted text with Bark and write it to the requested file.

    The text is split into sentences, each sentence is generated separately,
    and the pieces are joined with 0.1 s of silence after each one. The result
    is resampled to 32 kHz, speed-perturbed, loudness-normalized and written.

    Responds with a JSON 'message' on success and a JSON 'API error' on
    failure (400 for a malformed or empty request, 500 for a generation
    error).
    """
    # Validate inside a handler of our own so a malformed request is answered
    # with the JSON error shape the client parses, not an HTML error page.
    try:
        data = request.json
        text = data['text']
        speaker_id = data['speaker_id']
        speaker_npz = data['speaker_npz']
        volume = float(data.get('volume', -35))
        output_wav = data.get('output_wav', 'out.wav')
    except (KeyError, TypeError, ValueError) as e:
        return jsonify({'API error': f'Invalid request: {e!r}'}), 400

    logging.info(f'TTS (Bark): Speaker: {speaker_id}, Volume: {volume} dB, Prompt: {text}')

    try:
        # Generate audio using the global model and processor
        text = text.replace('\n', ' ').strip()
        sentences = nltk.sent_tokenize(text)
        if not sentences:
            # Nothing to synthesize; torch.cat([]) below would fail with a
            # far less helpful message.
            return jsonify({'API error': 'No text to synthesize'}), 400

        silence = torch.zeros(int(0.1 * SAMPLE_RATE), device=device).unsqueeze(0)  # 0.1 second of silence

        pieces = []
        for sentence in sentences:
            inputs = processor(sentence, voice_preset=speaker_npz).to(device)
            # NOTE: you must run the line below, otherwise you will see the runtime error
            # RuntimeError: view size is not compatible with input tensor's size and stride (at least one dimension spans across two contiguous subspaces). Use .reshape(...) instead.
            inputs['history_prompt']['coarse_prompt'] = inputs['history_prompt']['coarse_prompt'].transpose(0, 1).contiguous().transpose(0, 1)

            with torch.inference_mode():
                # TODO: min_eos_p?
                output = model.generate(
                    **inputs,
                    do_sample=True,
                    semantic_temperature=SEMANTIC_TEMPERATURE,
                    coarse_temperature=COARSE_TEMPERATURE,
                    fine_temperature=FINE_TEMPERATURE,
                )

            pieces += [output, silence]

        result_audio = torch.cat(pieces, dim=1)
        wav_tensor = result_audio.to(dtype=torch.float32).cpu()
        wav = torchaudio.functional.resample(wav_tensor, orig_freq=SAMPLE_RATE, new_freq=32000)
        wav = speed_perturb(wav.float())[0].squeeze(0)
        wav = wav.numpy()
        wav = LOUDNESS_NORM(wav, volumn=volume)
        WRITE_AUDIO(wav, name=output_wav)

        # Return success message and the filename of the generated audio
        return jsonify({'message': f'Text-to-Speech generated successfully | {speaker_id}: {text}', 'file': output_wav})

    except Exception as e:
        # Bug fix: a stray `raise e` used to sit here, which made the JSON
        # error response below unreachable and sent a non-JSON 500 page to
        # the client. Log the traceback and answer in the documented shape.
        logging.exception('Text-to-Speech generation failed')
        return jsonify({'API error': str(e)}), 500


if __name__ == '__main__':
    service_port = config['Text-to-Speech']['service-port']
    app.run(debug=False, port=service_port)
cuda-nvml-dev=11.8.86=0 + - cuda-nvprof=11.8.87=0 + - cuda-nvprune=11.8.86=0 + - cuda-nvrtc=11.8.89=0 + - cuda-nvrtc-dev=11.8.89=0 + - cuda-nvtx=11.8.86=0 + - cuda-nvvp=11.8.87=0 + - cuda-profiler-api=11.8.86=0 + - cuda-sanitizer-api=11.8.86=0 + - cuda-toolkit=11.8.0=0 + - cuda-tools=11.8.0=0 + - cuda-visual-tools=11.8.0=0 + - gds-tools=1.4.0.31=0 + - ld_impl_linux-64=2.40=h41732ed_0 + - libcublas=11.11.3.6=0 + - libcublas-dev=11.11.3.6=0 + - libcufft=10.9.0.58=0 + - libcufft-dev=10.9.0.58=0 + - libcufile=1.4.0.31=0 + - libcufile-dev=1.4.0.31=0 + - libcurand=10.3.0.86=0 + - libcurand-dev=10.3.0.86=0 + - libcusolver=11.4.1.48=0 + - libcusolver-dev=11.4.1.48=0 + - libcusparse=11.7.5.86=0 + - libcusparse-dev=11.7.5.86=0 + - libffi=3.4.2=h7f98852_5 + - libgcc-ng=13.1.0=he5830b7_0 + - libgomp=13.1.0=he5830b7_0 + - libnpp=11.8.0.86=0 + - libnpp-dev=11.8.0.86=0 + - libnsl=2.0.0=h7f98852_0 + - libnvjpeg=11.9.0.86=0 + - libnvjpeg-dev=11.9.0.86=0 + - libsqlite=3.42.0=h2797004_0 + - libuuid=2.38.1=h0b41bf4_0 + - libzlib=1.2.13=hd590300_5 + - ncurses=6.4=hcb278e6_0 + - nsight-compute=2022.3.0.22=0 + - openssl=3.1.1=hd590300_1 + - pip=23.1.2=pyhd8ed1ab_0 + - python=3.8.17=he550d4f_0_cpython + - readline=8.2=h8228510_1 + - setuptools=68.0.0=pyhd8ed1ab_0 + - tk=8.6.12=h27826a3_0 + - wheel=0.40.0=pyhd8ed1ab_0 + - xz=5.2.6=h166bdaf_0 + - pip: + - aiofiles==23.1.0 + - aiohttp==3.8.4 + - aiosignal==1.3.1 + - altair==5.0.1 + - antlr4-python3-runtime==4.9.3 + - anyio==3.7.1 + - appdirs==1.4.4 + - async-timeout==4.0.2 + - attrs==23.1.0 + - audioread==3.0.0 + - av==10.0.0 + - blinker==1.6.2 + - blis==0.7.9 + - catalogue==2.0.8 + - certifi==2023.5.7 + - cffi==1.15.1 + - charset-normalizer==3.2.0 + - click==8.1.5 + - cloudpickle==2.2.1 + - cmake==3.26.4 + - colorlog==6.7.0 + - confection==0.1.0 + - contourpy==1.1.0 + - cycler==0.11.0 + - cymem==2.0.7 + - cython==0.29.36 + - decorator==5.1.1 + - demucs==4.0.0 + - diffq==0.2.4 + - docopt==0.6.2 + - dora-search==0.1.12 + - einops==0.6.1 + - 
encodec==0.1.1 + - exceptiongroup==1.1.2 + - fastapi==0.100.0 + - ffmpy==0.3.0 + - filelock==3.12.2 + - flashy==0.0.2 + - flask==2.3.2 + - fonttools==4.41.0 + - frozenlist==1.4.0 + - fsspec==2023.6.0 + - future==0.18.3 + - gradio==3.36.1 + - gradio-client==0.2.9 + - h11==0.14.0 + - httpcore==0.17.3 + - httpx==0.24.1 + - huggingface-hub==0.16.4 + - hydra-colorlog==1.2.0 + - hydra-core==1.3.2 + - idna==3.4 + - importlib-metadata==6.8.0 + - importlib-resources==6.0.0 + - itsdangerous==2.1.2 + - jinja2==3.1.2 + - joblib==1.3.1 + - jsonschema==4.18.3 + - jsonschema-specifications==2023.6.1 + - julius==0.2.7 + - kiwisolver==1.4.4 + - lameenc==1.5.1 + - langcodes==3.3.0 + - lazy-loader==0.3 + - librosa==0.10.0.post2 + - lightning-utilities==0.9.0 + - linkify-it-py==2.0.2 + - lit==16.0.6 + - llvmlite==0.40.1 + - markdown-it-py==2.2.0 + - markupsafe==2.1.3 + - matplotlib==3.7.2 + - mdit-py-plugins==0.3.3 + - mdurl==0.1.2 + - mpmath==1.3.0 + - msgpack==1.0.5 + - multidict==6.0.4 + - murmurhash==1.0.9 + - mypy-extensions==1.0.0 + - networkx==3.1 + - num2words==0.5.12 + - numba==0.57.1 + - numpy==1.24.4 + - nvidia-cublas-cu11==11.10.3.66 + - nvidia-cuda-cupti-cu11==11.7.101 + - nvidia-cuda-nvrtc-cu11==11.7.99 + - nvidia-cuda-runtime-cu11==11.7.99 + - nvidia-cudnn-cu11==8.5.0.96 + - nvidia-cufft-cu11==10.9.0.58 + - nvidia-curand-cu11==10.2.10.91 + - nvidia-cusolver-cu11==11.4.0.1 + - nvidia-cusparse-cu11==11.7.4.91 + - nvidia-nccl-cu11==2.14.3 + - nvidia-nvtx-cu11==11.7.91 + - omegaconf==2.3.0 + - openunmix==1.2.1 + - orjson==3.9.2 + - packaging==23.1 + - pandas==2.0.3 + - pathy==0.10.2 + - pillow==10.0.0 + - pkgutil-resolve-name==1.3.10 + - pooch==1.6.0 + - preshed==3.0.8 + - pycparser==2.21 + - pydantic==1.10.11 + - pydub==0.25.1 + - pygments==2.15.1 + - pyloudnorm==0.1.1 + - pyparsing==3.0.9 + - pyre-extensions==0.0.29 + - python-dateutil==2.8.2 + - python-multipart==0.0.6 + - pytz==2023.3 + - pyyaml==6.0 + - referencing==0.29.1 + - regex==2023.6.3 + - requests==2.31.0 + - 
retrying==1.3.4 + - rpds-py==0.8.10 + - safetensors==0.3.1 + - scikit-learn==1.3.0 + - scipy==1.10.1 + - semantic-version==2.10.0 + - sentencepiece==0.1.99 + - six==1.16.0 + - smart-open==6.3.0 + - sniffio==1.3.0 + - soundfile==0.12.1 + - soxr==0.3.5 + - spacy==3.5.2 + - spacy-legacy==3.0.12 + - spacy-loggers==1.0.4 + - srsly==2.4.6 + - starlette==0.27.0 + - submitit==1.4.5 + - sympy==1.12 + - thinc==8.1.10 + - threadpoolctl==3.2.0 + - tokenizers==0.13.3 + - toolz==0.12.0 + - torch==2.0.1 + - torchaudio==2.0.2 + - torchmetrics==1.0.1 + - tqdm==4.65.0 + - transformers==4.31.0 + - treetable==0.2.5 + - triton==2.0.0 + - typer==0.7.0 + - typing-extensions==4.7.1 + - typing-inspect==0.9.0 + - tzdata==2023.3 + - uc-micro-py==1.0.2 + - urllib3==2.0.3 + - uvicorn==0.22.0 + - wasabi==1.1.2 + - websockets==11.0.3 + - werkzeug==2.3.6 + - xformers==0.0.20 + - yarl==1.9.2 + - zipp==3.16.2 diff --git a/Envs/Bark.yml b/Envs/Bark.yml new file mode 100644 index 0000000000000000000000000000000000000000..d42277c54fc08efb003e24cb486bdf8468ea3970 --- /dev/null +++ b/Envs/Bark.yml @@ -0,0 +1,180 @@ +name: Bark +channels: + - conda-forge + - defaults +dependencies: + - _libgcc_mutex=0.1=conda_forge + - _openmp_mutex=4.5=2_gnu + - bzip2=1.0.8=h7f98852_4 + - ca-certificates=2023.5.7=hbcca054_0 + - ld_impl_linux-64=2.40=h41732ed_0 + - libffi=3.4.2=h7f98852_5 + - libgcc-ng=13.1.0=he5830b7_0 + - libgomp=13.1.0=he5830b7_0 + - libnsl=2.0.0=h7f98852_0 + - libsqlite=3.42.0=h2797004_0 + - libuuid=2.38.1=h0b41bf4_0 + - libzlib=1.2.13=hd590300_5 + - ncurses=6.4=hcb278e6_0 + - openssl=3.1.1=hd590300_1 + - pip=23.1.2=pyhd8ed1ab_0 + - python=3.8.17=he550d4f_0_cpython + - readline=8.2=h8228510_1 + - setuptools=68.0.0=pyhd8ed1ab_0 + - tk=8.6.12=h27826a3_0 + - wheel=0.40.0=pyhd8ed1ab_0 + - xz=5.2.6=h166bdaf_0 + - pip: + - aiohttp==3.8.5 + - aiosignal==1.3.1 + - altair==5.0.1 + - appdirs==1.4.4 + - asttokens==2.2.1 + - async-timeout==4.0.3 + - attrs==23.1.0 + - audioread==3.0.0 + - backcall==0.2.0 + - 
backports-zoneinfo==0.2.1 + - blinker==1.6.2 + - boto3==1.28.3 + - botocore==1.31.3 + - cachetools==5.3.1 + - certifi==2023.5.7 + - cffi==1.15.1 + - charset-normalizer==3.2.0 + - click==8.1.5 + - cmake==3.26.4 + - coloredlogs==15.0.1 + - contourpy==1.1.0 + - cycler==0.11.0 + - datasets==2.14.4 + - decorator==5.1.1 + - dill==0.3.7 + - einops==0.6.1 + - encodec==0.1.1 + - executing==1.2.0 + - filelock==3.12.2 + - fire==0.5.0 + - flask==2.3.2 + - fonttools==4.41.0 + - frozenlist==1.4.0 + - fsspec==2023.6.0 + - funcy==2.0 + - future==0.18.3 + - gitdb==4.0.10 + - gitpython==3.1.32 + - huggingface-hub==0.16.4 + - humanfriendly==10.0 + - idna==3.4 + - importlib-metadata==6.8.0 + - importlib-resources==6.0.0 + - ipdb==0.13.13 + - ipython==8.12.2 + - itsdangerous==2.1.2 + - jedi==0.19.0 + - jinja2==3.1.2 + - jmespath==1.0.1 + - joblib==1.3.1 + - jsonschema==4.18.3 + - jsonschema-specifications==2023.6.1 + - kiwisolver==1.4.4 + - lazy-loader==0.3 + - librosa==0.10.0.post2 + - lit==16.0.6 + - llvmlite==0.40.1 + - markdown-it-py==3.0.0 + - markupsafe==2.1.3 + - matplotlib==3.7.2 + - matplotlib-inline==0.1.6 + - mdurl==0.1.2 + - mpmath==1.3.0 + - msgpack==1.0.5 + - multidict==6.0.4 + - multiprocess==0.70.15 + - networkx==3.1 + - nltk==3.8.1 + - numba==0.57.1 + - numpy==1.24.4 + - nvidia-cublas-cu11==11.10.3.66 + - nvidia-cuda-cupti-cu11==11.7.101 + - nvidia-cuda-nvrtc-cu11==11.7.99 + - nvidia-cuda-runtime-cu11==11.7.99 + - nvidia-cudnn-cu11==8.5.0.96 + - nvidia-cufft-cu11==10.9.0.58 + - nvidia-curand-cu11==10.2.10.91 + - nvidia-cusolver-cu11==11.4.0.1 + - nvidia-cusparse-cu11==11.7.4.91 + - nvidia-nccl-cu11==2.14.3 + - nvidia-nvtx-cu11==11.7.91 + - optimum==1.11.1 + - packaging==23.1 + - pandas==2.0.3 + - parso==0.8.3 + - pexpect==4.8.0 + - pickleshare==0.7.5 + - pillow==9.5.0 + - pkgutil-resolve-name==1.3.10 + - pooch==1.6.0 + - progressbar==2.5 + - prompt-toolkit==3.0.39 + - protobuf==4.23.4 + - ptyprocess==0.7.0 + - pure-eval==0.2.2 + - pyarrow==12.0.1 + - pycparser==2.21 + 
- pydeck==0.8.1b0 + - pygments==2.15.1 + - pyloudnorm==0.1.1 + - pympler==1.0.1 + - pyparsing==3.0.9 + - python-dateutil==2.8.2 + - pytz==2023.3 + - pytz-deprecation-shim==0.1.0.post0 + - pyyaml==6.0 + - referencing==0.29.1 + - regex==2023.6.3 + - requests==2.31.0 + - resampy==0.4.2 + - retrying==1.3.4 + - rich==13.4.2 + - rpds-py==0.8.10 + - s3transfer==0.6.1 + - safetensors==0.3.1 + - scikit-learn==1.3.0 + - scipy==1.10.1 + - sentencepiece==0.1.99 + - six==1.16.0 + - smmap==5.0.0 + - soundfile==0.12.1 + - soxr==0.3.5 + - stack-data==0.6.2 + - streamlit==1.24.1 + - suno-bark==0.1.5 + - sympy==1.12 + - tenacity==8.2.2 + - termcolor==2.3.0 + - threadpoolctl==3.2.0 + - tokenizers==0.13.3 + - toml==0.10.2 + - tomli==2.0.1 + - toolz==0.12.0 + - torch==2.0.1 + - torchaudio==2.0.2 + - torchlibrosa==0.0.7 + - tornado==6.3.2 + - tqdm==4.65.0 + - traitlets==5.9.0 + - transformers==4.31.0 + - triton==2.0.0 + - typing-extensions==4.7.1 + - tzdata==2023.3 + - tzlocal==4.3.1 + - urllib3==1.26.16 + - validators==0.20.0 + - watchdog==3.0.0 + - wcwidth==0.2.6 + - werkzeug==2.3.6 + - xxhash==3.3.0 + - yarl==1.9.2 + - zipp==3.16.1 +prefix: /home/zzk/Workspace/miniconda3/envs/Bark diff --git a/Envs/VoiceFixer.yml b/Envs/VoiceFixer.yml new file mode 100644 index 0000000000000000000000000000000000000000..b8222a8d2dc130495d8de7a67328e8bbb9ce56f5 --- /dev/null +++ b/Envs/VoiceFixer.yml @@ -0,0 +1,123 @@ +name: VoiceFixer +channels: + - defaults +dependencies: + - _libgcc_mutex=0.1=main + - _openmp_mutex=5.1=1_gnu + - ca-certificates=2023.05.30=h06a4308_0 + - ld_impl_linux-64=2.38=h1181459_1 + - libffi=3.4.4=h6a678d5_0 + - libgcc-ng=11.2.0=h1234567_1 + - libgomp=11.2.0=h1234567_1 + - libstdcxx-ng=11.2.0=h1234567_1 + - ncurses=6.4=h6a678d5_0 + - openssl=3.0.9=h7f8727e_0 + - pip=23.2.1=py38h06a4308_0 + - python=3.8.17=h955ad1f_0 + - readline=8.2=h5eee18b_0 + - setuptools=68.0.0=py38h06a4308_0 + - sqlite=3.41.2=h5eee18b_0 + - tk=8.6.12=h1ccaba5_0 + - wheel=0.38.4=py38h06a4308_0 + - 
xz=5.4.2=h5eee18b_0 + - zlib=1.2.13=h5eee18b_0 + - pip: + - altair==5.0.1 + - attrs==23.1.0 + - audioread==3.0.0 + - backports-zoneinfo==0.2.1 + - blinker==1.6.2 + - cachetools==5.3.1 + - certifi==2023.5.7 + - cffi==1.15.1 + - charset-normalizer==3.2.0 + - click==8.1.5 + - cmake==3.27.0 + - contourpy==1.1.0 + - cycler==0.11.0 + - decorator==5.1.1 + - filelock==3.12.2 + - flask==2.3.2 + - fonttools==4.38.0 + - gitdb==4.0.10 + - gitpython==3.1.32 + - idna==3.4 + - importlib-metadata==6.7.0 + - importlib-resources==5.12.0 + - itsdangerous==2.1.2 + - jinja2==3.1.2 + - joblib==1.3.1 + - jsonschema==4.17.3 + - jsonschema-specifications==2023.7.1 + - kiwisolver==1.4.4 + - librosa==0.8.1 + - lit==16.0.6 + - llvmlite==0.39.1 + - markdown-it-py==2.2.0 + - markupsafe==2.1.3 + - matplotlib==3.5.3 + - mdurl==0.1.2 + - mpmath==1.3.0 + - networkx==3.1 + - numba==0.56.4 + - numpy==1.21.6 + - nvidia-cublas-cu11==11.10.3.66 + - nvidia-cuda-cupti-cu11==11.7.101 + - nvidia-cuda-nvrtc-cu11==11.7.99 + - nvidia-cuda-runtime-cu11==11.7.99 + - nvidia-cudnn-cu11==8.5.0.96 + - nvidia-cufft-cu11==10.9.0.58 + - nvidia-curand-cu11==10.2.10.91 + - nvidia-cusolver-cu11==11.4.0.1 + - nvidia-cusparse-cu11==11.7.4.91 + - nvidia-nccl-cu11==2.14.3 + - nvidia-nvtx-cu11==11.7.91 + - packaging==23.1 + - pandas==1.3.5 + - pillow==9.5.0 + - pkgutil-resolve-name==1.3.10 + - platformdirs==3.9.1 + - pooch==1.7.0 + - progressbar==2.5 + - protobuf==4.23.4 + - pyarrow==12.0.1 + - pycparser==2.21 + - pydeck==0.8.1b0 + - pygments==2.15.1 + - pympler==1.0.1 + - pyparsing==3.1.0 + - pyrsistent==0.19.3 + - python-dateutil==2.8.2 + - pytz==2023.3 + - pytz-deprecation-shim==0.1.0.post0 + - pyyaml==6.0.1 + - referencing==0.30.0 + - requests==2.31.0 + - resampy==0.4.2 + - retrying==1.3.4 + - rich==13.4.2 + - rpds-py==0.9.2 + - scikit-learn==1.0.2 + - scipy==1.7.3 + - six==1.16.0 + - smmap==5.0.0 + - soundfile==0.12.1 + - streamlit==1.23.1 + - sympy==1.12 + - tenacity==8.2.2 + - threadpoolctl==3.1.0 + - toml==0.10.2 + - 
toolz==0.12.0 + - torch==1.13.1 + - torchlibrosa==0.0.7 + - tornado==6.2 + - triton==2.0.0 + - typing-extensions==4.7.1 + - tzdata==2023.3 + - tzlocal==4.3.1 + - urllib3==2.0.3 + - validators==0.20.0 + - voicefixer==0.1.2 + - watchdog==3.0.0 + - werkzeug==2.3.6 + - zipp==3.15.0 diff --git a/Envs/WavJourney.yml b/Envs/WavJourney.yml new file mode 100644 index 0000000000000000000000000000000000000000..d4ce3c6438bb763dcf851739a17e95e6b9437a91 --- /dev/null +++ b/Envs/WavJourney.yml @@ -0,0 +1,248 @@ +name: WavJourney +channels: + - conda-forge + - defaults +dependencies: + - _libgcc_mutex=0.1=conda_forge + - _openmp_mutex=4.5=2_gnu + - aom=3.5.0=h27087fc_0 + - bzip2=1.0.8=h7f98852_4 + - ca-certificates=2023.7.22=hbcca054_0 + - cairo=1.16.0=hbbf8b49_1016 + - dav1d=1.2.1=hd590300_0 + - expat=2.5.0=hcb278e6_1 + - ffmpeg=6.0.0=gpl_hdbbbd96_103 + - font-ttf-dejavu-sans-mono=2.37=hab24e00_0 + - font-ttf-inconsolata=3.000=h77eed37_0 + - font-ttf-source-code-pro=2.038=h77eed37_0 + - font-ttf-ubuntu=0.83=hab24e00_0 + - fontconfig=2.14.2=h14ed4e7_0 + - fonts-conda-ecosystem=1=0 + - fonts-conda-forge=1=0 + - freetype=2.12.1=hca18f0e_1 + - fribidi=1.0.10=h36c2ea0_0 + - gettext=0.21.1=h27087fc_0 + - gmp=6.2.1=h58526e2_0 + - gnutls=3.7.8=hf3e180e_0 + - graphite2=1.3.13=h58526e2_1001 + - harfbuzz=7.3.0=hdb3a94d_0 + - icu=72.1=hcb278e6_0 + - lame=3.100=h166bdaf_1003 + - ld_impl_linux-64=2.40=h41732ed_0 + - libass=0.17.1=hc9aadba_0 + - libdrm=2.4.114=h166bdaf_0 + - libexpat=2.5.0=hcb278e6_1 + - libffi=3.4.2=h7f98852_5 + - libgcc-ng=13.1.0=he5830b7_0 + - libglib=2.76.4=hebfc3b9_0 + - libgomp=13.1.0=he5830b7_0 + - libiconv=1.17=h166bdaf_0 + - libidn2=2.3.4=h166bdaf_0 + - libnsl=2.0.0=h7f98852_0 + - libopus=1.3.1=h7f98852_1 + - libpciaccess=0.17=h166bdaf_0 + - libpng=1.6.39=h753d276_0 + - libsqlite=3.42.0=h2797004_0 + - libstdcxx-ng=13.1.0=hfd8a6a1_0 + - libtasn1=4.19.0=h166bdaf_0 + - libunistring=0.9.10=h7f98852_0 + - libuuid=2.38.1=h0b41bf4_0 + - libva=2.19.0=hd590300_0 + - 
libvpx=1.13.0=hcb278e6_0 + - libxcb=1.15=h0b41bf4_0 + - libxml2=2.11.5=h0d562d8_0 + - libzlib=1.2.13=hd590300_5 + - ncurses=6.4=hcb278e6_0 + - nettle=3.8.1=hc379101_1 + - openh264=2.3.1=hcb278e6_2 + - openssl=3.1.2=hd590300_0 + - p11-kit=0.24.1=hc5aa10d_0 + - pcre2=10.40=hc3806b6_0 + - pip=23.2=pyhd8ed1ab_0 + - pixman=0.40.0=h36c2ea0_0 + - pthread-stubs=0.4=h36c2ea0_1001 + - python=3.8.17=he550d4f_0_cpython + - readline=8.2=h8228510_1 + - setuptools=68.0.0=pyhd8ed1ab_0 + - svt-av1=1.6.0=h59595ed_0 + - tk=8.6.12=h27826a3_0 + - wheel=0.40.0=pyhd8ed1ab_1 + - x264=1!164.3095=h166bdaf_2 + - x265=3.5=h924138e_3 + - xorg-fixesproto=5.0=h7f98852_1002 + - xorg-kbproto=1.0.7=h7f98852_1002 + - xorg-libice=1.1.1=hd590300_0 + - xorg-libsm=1.2.4=h7391055_0 + - xorg-libx11=1.8.6=h8ee46fc_0 + - xorg-libxau=1.0.11=hd590300_0 + - xorg-libxdmcp=1.1.3=h7f98852_0 + - xorg-libxext=1.3.4=h0b41bf4_2 + - xorg-libxfixes=5.0.3=h7f98852_1004 + - xorg-libxrender=0.9.11=hd590300_0 + - xorg-renderproto=0.11.1=h7f98852_1002 + - xorg-xextproto=7.3.0=h0b41bf4_1003 + - xorg-xproto=7.0.31=h7f98852_1007 + - xz=5.2.6=h166bdaf_0 + - zlib=1.2.13=hd590300_5 + - pip: + - accelerate==0.21.0 + - aiofiles==23.1.0 + - aiohttp==3.8.5 + - aiosignal==1.3.1 + - altair==5.0.1 + - annotated-types==0.5.0 + - antlr4-python3-runtime==4.8 + - anyio==3.7.1 + - appdirs==1.4.4 + - asttokens==2.2.1 + - async-timeout==4.0.2 + - attrs==23.1.0 + - audiolm-pytorch==1.1.4 + - audioread==3.0.0 + - backcall==0.2.0 + - beartype==0.15.0 + - bitarray==2.8.1 + - blinker==1.6.2 + - certifi==2023.5.7 + - cffi==1.15.1 + - charset-normalizer==3.2.0 + - click==8.1.6 + - cmake==3.26.4 + - colorama==0.4.6 + - contourpy==1.1.0 + - cycler==0.11.0 + - cython==3.0.0 + - decorator==5.1.1 + - einops==0.6.1 + - ema-pytorch==0.2.3 + - encodec==0.1.1 + - exceptiongroup==1.1.2 + - executing==1.2.0 + - fairseq==0.12.2 + - fastapi==0.100.1 + - ffmpy==0.3.1 + - filelock==3.12.2 + - flask==2.3.2 + - fonttools==4.42.0 + - frozenlist==1.4.0 + - 
fsspec==2023.6.0 + - future==0.18.3 + - gradio==3.39.0 + - gradio-client==0.3.0 + - h11==0.14.0 + - httpcore==0.17.3 + - httpx==0.24.1 + - huggingface-hub==0.16.4 + - hydra-core==1.0.7 + - idna==3.4 + - importlib-metadata==6.8.0 + - importlib-resources==6.0.0 + - ipdb==0.13.13 + - ipython==8.12.2 + - itsdangerous==2.1.2 + - jedi==0.18.2 + - jinja2==3.1.2 + - joblib==1.3.1 + - json5==0.9.14 + - jsonschema==4.18.6 + - jsonschema-specifications==2023.7.1 + - kiwisolver==1.4.4 + - lazy-loader==0.3 + - librosa==0.10.0.post2 + - linkify-it-py==2.0.2 + - lion-pytorch==0.1.2 + - lit==16.0.6 + - llvmlite==0.40.1 + - local-attention==1.8.6 + - lxml==4.9.3 + - markdown-it-py==2.2.0 + - markupsafe==2.1.3 + - matplotlib==3.7.2 + - matplotlib-inline==0.1.6 + - mdit-py-plugins==0.3.3 + - mdurl==0.1.2 + - mpmath==1.3.0 + - msgpack==1.0.5 + - multidict==6.0.4 + - networkx==3.1 + - nltk==3.8.1 + - numba==0.57.1 + - numpy==1.24.4 + - nvidia-cublas-cu11==11.10.3.66 + - nvidia-cuda-cupti-cu11==11.7.101 + - nvidia-cuda-nvrtc-cu11==11.7.99 + - nvidia-cuda-runtime-cu11==11.7.99 + - nvidia-cudnn-cu11==8.5.0.96 + - nvidia-cufft-cu11==10.9.0.58 + - nvidia-curand-cu11==10.2.10.91 + - nvidia-cusolver-cu11==11.4.0.1 + - nvidia-cusparse-cu11==11.7.4.91 + - nvidia-nccl-cu11==2.14.3 + - nvidia-nvtx-cu11==11.7.91 + - omegaconf==2.0.6 + - openai==0.27.8 + - orjson==3.9.2 + - packaging==23.1 + - pandas==2.0.3 + - parso==0.8.3 + - pexpect==4.8.0 + - pickleshare==0.7.5 + - pillow==10.0.0 + - pkgutil-resolve-name==1.3.10 + - pooch==1.6.0 + - portalocker==2.7.0 + - prompt-toolkit==3.0.39 + - psutil==5.9.5 + - ptyprocess==0.7.0 + - pure-eval==0.2.2 + - pycparser==2.21 + - pydantic==2.1.1 + - pydantic-core==2.4.0 + - pydub==0.25.1 + - pygments==2.15.1 + - pyloudnorm==0.1.1 + - pyparsing==3.0.9 + - python-dateutil==2.8.2 + - python-multipart==0.0.6 + - pytz==2023.3 + - pyyaml==6.0.1 + - referencing==0.30.1 + - regex==2023.6.3 + - requests==2.31.0 + - retrying==1.3.4 + - rpds-py==0.9.2 + - sacrebleu==2.3.1 + 
- safetensors==0.3.2 + - scikit-learn==1.3.0 + - scipy==1.10.1 + - semantic-version==2.10.0 + - sentencepiece==0.1.99 + - six==1.16.0 + - sniffio==1.3.0 + - soundfile==0.12.1 + - soxr==0.3.5 + - stack-data==0.6.2 + - starlette==0.27.0 + - sympy==1.12 + - tabulate==0.9.0 + - threadpoolctl==3.2.0 + - tokenizers==0.13.3 + - tomli==2.0.1 + - toolz==0.12.0 + - torch==2.0.1 + - torchaudio==2.0.2 + - tqdm==4.65.0 + - traitlets==5.9.0 + - transformers==4.31.0 + - triton==2.0.0 + - typing-extensions==4.7.1 + - tzdata==2023.3 + - uc-micro-py==1.0.2 + - urllib3==2.0.4 + - uvicorn==0.23.2 + - vector-quantize-pytorch==1.6.30 + - wcwidth==0.2.6 + - websockets==11.0.3 + - werkzeug==2.3.6 + - yarl==1.9.2 + - zipp==3.16.2 diff --git a/EnvsSetup/AudioCraft.sh b/EnvsSetup/AudioCraft.sh new file mode 100644 index 0000000000000000000000000000000000000000..ea4ca518bc3e275e9983ee8608f651b063f8a3a5 --- /dev/null +++ b/EnvsSetup/AudioCraft.sh @@ -0,0 +1,16 @@ +conda env create -f Envs/AudioCraft.yml +conda run --live-stream -n AudioCraft pip install -U git+https://git@github.com/facebookresearch/audiocraft@c5157b5bf14bf83449c17ea1eeb66c19fb4bc7f0#egg=audiocraft +# Could not load library libcudnn_cnn_infer.so.8. 
+# Error: libnvrtc.so: cannot open shared object file: No such file or directory +CONDAENV=AudioCraft +source activate ${CONDAENV} +conda install -c "nvidia/label/cuda-11.8.0" cuda-toolkit +python3 -m pip install nvidia-cudnn-cu11==8.5.0.96 +source deactivate +mkdir -p $CONDA_PREFIX/envs/${CONDAENV}/etc/conda/activate.d +echo 'CUDNN_PATH=$(dirname $(python -c "import nvidia.cudnn;print(nvidia.cudnn.__file__)"))' >> $CONDA_PREFIX/envs/${CONDAENV}/etc/conda/activate.d/env_vars.sh +echo 'export LD_LIBRARY_PATH=$LD_LIBRARY_PATH:$CONDA_PREFIX/lib/:$CUDNN_PATH/lib' >> $CONDA_PREFIX/envs/${CONDAENV}/etc/conda/activate.d/env_vars.sh +source $CONDA_PREFIX/envs/${CONDAENV}/etc/conda/activate.d/env_vars.sh + +# If you're using WSL2, you can add the following into ~/.bashrc +# export LD_LIBRARY_PATH=/usr/lib/wsl/lib:$LD_LIBRARY_PATH diff --git a/EnvsSetup/Bark.sh b/EnvsSetup/Bark.sh new file mode 100644 index 0000000000000000000000000000000000000000..3cefd458d1a84be3d2196c5d5906536b79339a7a --- /dev/null +++ b/EnvsSetup/Bark.sh @@ -0,0 +1 @@ +conda env create -f Envs/Bark.yml diff --git a/EnvsSetup/VoiceFixer.sh b/EnvsSetup/VoiceFixer.sh new file mode 100644 index 0000000000000000000000000000000000000000..3597b5f00348c673c11994b111f7b7eb0290c2ab --- /dev/null +++ b/EnvsSetup/VoiceFixer.sh @@ -0,0 +1 @@ +conda env create -f Envs/VoiceFixer.yml diff --git a/EnvsSetup/WavJourney.sh b/EnvsSetup/WavJourney.sh new file mode 100644 index 0000000000000000000000000000000000000000..c15d05a869d00e8960fc53b6aabf98622f4ab9fa --- /dev/null +++ b/EnvsSetup/WavJourney.sh @@ -0,0 +1 @@ +conda env create -f Envs/WavJourney.yml diff --git a/README.md b/README.md index c72ecec8ff70b0d0d78f74dda21f55301a68c976..e6960c1d5b2dccbb80e47af51b9922b6f0328a12 100644 --- a/README.md +++ b/README.md @@ -1,13 +1,47 @@ ---- -title: WavJourney -emoji: 🔥 -colorFrom: blue -colorTo: purple -sdk: gradio -sdk_version: 3.40.1 -app_file: app.py -pinned: false -license: cc-by-nc-nd-4.0 ---- - -Check out the 
configuration reference at https://huggingface.co/docs/hub/spaces-config-reference +# How to run WavJourney? +1. Install the environments by running the bash scripts in `EnvsSetup/` +2. Start the API services; the service logs are written to the `services_logs/` folder + ```bash + python scripts/start_services.py + ``` +3. Run the AudioJourney client; the generated scripts and audio are saved in `output/[datetime]_[instruction text]/` + ```bash + conda activate AudioJourney + python audiojourney_cli.py -f --instruction "News channel BBC broadcast about Trump playing street fighter 6 against Biden" + ``` +4. Kill the API services + ```bash +python scripts/kill_services.py + ``` + +5. Start the UI + ```bash +sh scripts/start_ui.sh + ``` + + +# Voice Presets +You can add voice presets to WavJourney to customize the voice actors. Simply provide the voice id, a description and a sample wav file, and WavJourney will pick the voice automatically based on the audio script. + +Predefined system voice presets are in `data/voice_presets`, whereas session voice presets are in each session's individual folder. See the example below: + +- 📂 **project_folder** + - 📂 **data** + - 📂 **voice_presets** <-- system voice presets + - 📄 **metadata.json** <-- system voice preset metadata + - 📂 **npz** + - 📂 **output** + - 📂 **sessions** + - 📂 **session_1** + - 📂 **voice_presets** <-- session voice presets + - 📄 **metadata.json** <-- session voice preset metadata + - 📂 **npz** + - 📂 **session_2** + - **...** + +## Add voice to system voice presets via command line +It's recommended to manage voice presets via the UI. However, you can also add a voice preset via the command line.
@app.route('/fix_audio', methods=['POST'])
def fix_audio():
    """Restore the speech file named in the POST body, overwriting it in place.

    Expects a JSON body with a ``processfile`` key holding the path of the
    audio file to restore.

    Returns:
        A JSON success message with the processed file name, or a JSON
        ``{'API error': ...}`` payload with HTTP status 500 if restoration
        fails.
    """
    data = request.json
    processfile = data['processfile']

    logging.info(f'Fixing {processfile} ...')

    try:
        # Input and output are the same path, so the restored audio replaces
        # the original file.
        vf.restore(input=processfile, output=processfile, cuda=True, mode=0)

        # Return success message and the filename of the restored audio
        return jsonify({'message': 'Speech restored successfully', 'file': processfile})

    except Exception as e:
        # Record the full traceback in the service log; the HTTP response
        # only carries the short message.
        logging.exception(f'Failed to fix {processfile}')
        return jsonify({'API error': str(e)}), 500
@app.route('/parse_voice', methods=['POST'])
def parse_voice():
    """Extract a voice-preset embedding for the wav file named in the POST body.

    Expects a JSON body with ``wav_path`` (the input audio file) and
    ``out_dir`` (the directory passed to the parser for its output).

    Returns:
        A JSON success message, or a JSON ``{'API error': ...}`` payload with
        HTTP status 500 if parsing fails.
    """
    data = request.json
    wav_path = data['wav_path']
    out_dir = data['out_dir']

    logging.info(f'Parsing {wav_path} ...')

    try:
        vp.extract_acoustic_embed(wav_path, out_dir)

        # Return success message naming the parsed file
        return jsonify({'message': f'Sucessfully parsed {wav_path}'})

    except Exception as e:
        # Record the full traceback in the service log; the HTTP response
        # only carries the short message.
        logging.exception(f'Failed to parse {wav_path}')
        return jsonify({'API error': str(e)}), 500
'__main__': + app.run(debug=False, port=service_port) + diff --git a/VoiceParser/customtokenizer.py b/VoiceParser/customtokenizer.py new file mode 100644 index 0000000000000000000000000000000000000000..aa2e7d49bab149dfe3cb43db5502a4c5b40821c1 --- /dev/null +++ b/VoiceParser/customtokenizer.py @@ -0,0 +1,202 @@ +""" +Custom tokenizer model. +Author: https://www.github.com/gitmylo/ +License: MIT +""" + +import json +import os.path +from zipfile import ZipFile +from typing import Union + + +import numpy +import torch +from torch import nn, optim +from torch.serialization import MAP_LOCATION + + +class CustomTokenizer(nn.Module): + def __init__(self, hidden_size=1024, input_size=768, output_size=10000, version=0): + super(CustomTokenizer, self).__init__() + next_size = input_size + if version == 0: + self.lstm = nn.LSTM(input_size, hidden_size, 2, batch_first=True) + next_size = hidden_size + if version == 1: + self.lstm = nn.LSTM(input_size, hidden_size, 2, batch_first=True) + self.intermediate = nn.Linear(hidden_size, 4096) + next_size = 4096 + + self.fc = nn.Linear(next_size, output_size) + self.softmax = nn.LogSoftmax(dim=1) + self.optimizer: optim.Optimizer = None + self.lossfunc = nn.CrossEntropyLoss() + self.input_size = input_size + self.hidden_size = hidden_size + self.output_size = output_size + self.version = version + + def forward(self, x): + x, _ = self.lstm(x) + if self.version == 1: + x = self.intermediate(x) + x = self.fc(x) + x = self.softmax(x) + return x + + @torch.no_grad() + def get_token(self, x): + """ + Used to get the token for the first + :param x: An array with shape (N, input_size) where N is a whole number greater or equal to 1, and input_size is the input size used when creating the model. + :return: An array with shape (N,) where N is the same as N from the input. Every number in the array is a whole number in range 0...output_size - 1 where output_size is the output size used when creating the model. 
def train_step(self, x_train, y_train, log_loss=False):
    """Run one optimisation step on a single (features, tokens) pair.

    :param x_train: Input features, passed straight to ``self(...)``.
    :param y_train: Target token ids, one per frame.
    :param log_loss: When true, print the loss of this step.

    If predictions and targets differ in length, the longer one is trimmed:
    targets lose their leading entries, predictions lose their trailing rows.
    ``prepare_training`` must have been called so that ``self.optimizer`` is set.
    """
    optimizer = self.optimizer
    lossfunc = self.lossfunc

    # Zero the gradients
    self.zero_grad()

    # Forward pass
    y_pred = self(x_train)

    y_train_len = len(y_train)
    y_pred_len = y_pred.shape[0]

    if y_train_len > y_pred_len:
        diff = y_train_len - y_pred_len
        y_train = y_train[diff:]
    elif y_train_len < y_pred_len:
        diff = y_pred_len - y_train_len
        y_pred = y_pred[:-diff, :]

    # Build the one-hot targets on the same device as the predictions
    # instead of assuming CUDA, so the step also works on CPU or on a
    # non-default GPU.
    device = y_pred.device
    labels = torch.as_tensor(y_train, device=device).long()
    y_train_hot = torch.zeros(len(labels), self.output_size, device=device)
    y_train_hot[torch.arange(len(labels), device=device), labels] = 1

    # Calculate the loss
    loss = lossfunc(y_pred, y_train_hot)

    # Print loss
    if log_loss:
        print('Loss', loss.item())

    # Backward pass
    loss.backward()

    # Update the weights
    optimizer.step()
def auto_train(data_path, save_path='model.pth', load_model: Union[str, None] = None, save_epochs=1):
    """Train a CustomTokenizer on the paired files in ``<data_path>/ready``, forever.

    Files are matched by the integer prefix before the first underscore:
    ``<n>_semantic_features.npy`` holds the inputs and ``<n>_semantic.npy``
    holds the target tokens. After every ``save_epochs`` passes over the data
    the model is saved to ``<data_path>/<save_path>`` and to an epoch-numbered
    copy next to it. The loop never returns; stop it by interrupting the
    process.

    :param data_path: Directory containing the ``ready`` sub-folder; also
        where checkpoints are written.
    :param save_path: Checkpoint file name, relative to ``data_path``.
    :param load_model: Optional path of a checkpoint to resume from. The
        parameter used to be misspelled ``lload_model`` while the body read
        ``load_model``, so every call failed with a NameError.
    :param save_epochs: Number of passes over the data between saves.
    :raises ValueError: If a file in ``ready`` has no integer prefix.
    """
    data_x, data_y = {}, {}

    if load_model and os.path.isfile(load_model):
        print('Loading model from', load_model)
        model_training = CustomTokenizer.load_from_checkpoint(load_model, 'cuda')
    else:
        print('Creating new model.')
        model_training = CustomTokenizer(version=1).to('cuda')
    save_path = os.path.join(data_path, save_path)
    base_save_path = '.'.join(save_path.split('.')[:-1])

    sem_string = '_semantic.npy'
    feat_string = '_semantic_features.npy'

    ready = os.path.join(data_path, 'ready')
    for input_file in os.listdir(ready):
        full_path = os.path.join(ready, input_file)
        # A non-numeric prefix raises ValueError, which is left to propagate.
        number = int(input_file.split("_")[0])
        if input_file.endswith(sem_string):
            data_y[number] = numpy.load(full_path)
        elif input_file.endswith(feat_string):
            data_x[number] = numpy.load(full_path)

    model_training.prepare_training()
    epoch = 1

    while True:
        for _ in range(save_epochs):
            step = 0
            for key in range(max(len(data_x), len(data_y))):
                x = data_x.get(key)
                y = data_y.get(key)
                if x is None or y is None:
                    print(f'The training data does not match. key={key}')
                    continue
                # Print loss every 50 steps
                model_training.train_step(torch.tensor(x).to('cuda'), torch.tensor(y).to('cuda'), step % 50 == 0)
                step += 1
        save_p = save_path
        save_p_2 = f'{base_save_path}_epoch_{epoch}.pth'
        model_training.save(save_p)
        model_training.save(save_p_2)
        print(f'Epoch {epoch} completed')
        epoch += 1
class VoiceParser():
    """Turn a reference wav file into a voice-prompt ``.npz`` file.

    Combines a HuBERT feature extractor, a tokenizer that maps the features
    to semantic tokens, and an EnCodec model that produces the audio codes.
    """

    def __init__(self, device='cpu'):
        """Download the checkpoints if needed and load all three models on ``device``."""
        model = ('quantifier_hubert_base_ls960_14.pth', 'tokenizer.pth')

        hubert_model = CustomHubert(HuBERTManager.make_sure_hubert_installed(), device=device)
        quant_model = CustomTokenizer.load_from_checkpoint(
            HuBERTManager.make_sure_tokenizer_installed(model=model[0], local_file=model[1]),
            device,
        )
        encodec_model = EncodecModel.encodec_model_24khz()
        encodec_model.set_target_bandwidth(6.0)

        self.hubert_model = hubert_model
        self.quant_model = quant_model
        self.encodec_model = encodec_model.to(device)
        self.device = device
        print('Loaded VoiceParser models!')

    def extract_acoustic_embed(self, wav_path, npz_dir):
        """Write the semantic/coarse/fine prompts of ``wav_path`` to an ``.npz`` file.

        Params:
            wav_path: path of the input audio file
            npz_dir: directory for the output file (created if missing)
        Returns:
            Path of the written file: ``npz_dir`` plus the input file's base
            name with its extension replaced by ``.npz``.
        """
        wav, sr = torchaudio.load(wav_path)

        wav_hubert = wav.to(self.device)

        # Downmix any multi-channel input to mono, not only exact stereo.
        if wav_hubert.shape[0] > 1:
            wav_hubert = wav_hubert.mean(0, keepdim=True)

        semantic_vectors = self.hubert_model.forward(wav_hubert, input_sample_hz=sr)
        semantic_tokens = self.quant_model.get_token(semantic_vectors)
        wav = convert_audio(wav, sr, self.encodec_model.sample_rate, 1).unsqueeze(0)

        wav = wav.to(self.device)

        with torch.no_grad():
            encoded_frames = self.encodec_model.encode(wav)

        codes = torch.cat([encoded[0] for encoded in encoded_frames], dim=-1).squeeze()

        codes = codes.cpu()
        semantic_tokens = semantic_tokens.cpu()

        # splitext handles extensions of any length ('.wav', '.flac', ...),
        # where slicing off four characters only worked for three-letter ones.
        stem = os.path.splitext(os.path.basename(wav_path))[0]
        os.makedirs(npz_dir, exist_ok=True)
        npz_path = os.path.join(npz_dir, stem + '.npz')

        np.savez(
            npz_path,
            semantic_prompt=semantic_tokens,
            fine_prompt=codes,
            coarse_prompt=codes[:2, :]  # the first two code rows
        )

        return npz_path

    def read_json_file(self, json_path):
        """Load and return the parsed content of the JSON file at ``json_path``."""
        with open(json_path, 'r') as file:
            return json.load(file)

    def parse_voice_json(self, voice_json, output_dir):
        """
        Parse a voice json file, generate the corresponding output json and npz files
        Params:
            voice_json: path of a json file or List of json nodes
            output_dir: output dir for new json and npz files (created if missing)

        Each node's 'wav' entry is replaced by an 'npz' entry pointing at the
        generated file. When a list is passed, its nodes are modified in place.
        """
        if not isinstance(voice_json, list):
            # If voice_json is a file path (str), read the JSON file
            voice_json = self.read_json_file(voice_json)

        for item in voice_json:
            item['npz'] = self.extract_acoustic_embed(wav_path=item['wav'], npz_dir=output_dir)
            del item['wav']

        os.makedirs(output_dir, exist_ok=True)
        output_json = os.path.join(output_dir, 'metadata.json')

        with open(output_json, 'w') as file:
            json.dump(voice_json, file, indent=4)
def exists(val):
    """Tell whether ``val`` is anything other than ``None``."""
    if val is None:
        return False
    return True


def default(val, d):
    """Return ``val``, or the fallback ``d`` when ``val`` is ``None``."""
    if exists(val):
        return val
    return d
def check_json_script(data):
    """Validate the structure of a parsed audio script.

    ``data`` is an iterable of audio nodes (dicts). Every node needs a
    ``layout`` ('foreground' or 'background') and an ``audio_type``.
    Foreground nodes need the attributes required for their type. Background
    nodes need ``id`` and ``action``; a 'begin' action additionally needs the
    attributes required for its type, and the only other accepted action is
    'end'.

    Raises:
        ValueError: on the first node that breaks one of these rules; the
            message includes the offending node.
    """
    foreground_mandatory_attrs_map = {
        'music': ['vol', 'len', 'desc'],
        'sound_effect': ['vol', 'len', 'desc'],
        'speech': ['vol', 'text'],
    }
    background_mandatory_attrs_map = {
        'music': ['vol', 'desc'],
        'sound_effect': ['vol', 'desc'],
    }

    def check_by_audio_type(audio, mandatory_attrs_map, audio_str):
        # The audio type must be allowed in this layout and carry every
        # attribute that layout requires for it.
        if audio['audio_type'] not in mandatory_attrs_map:
            # This message was missing its f-prefix and printed a literal
            # '{audio_str}' instead of the node.
            raise ValueError(f'audio_type is not allowed in this layout, audio={audio_str}')
        for attr_name in mandatory_attrs_map[audio['audio_type']]:
            if attr_name not in audio:
                raise ValueError(f'{attr_name} does not exist, audio={audio_str}')

    # Check json's format
    for audio in data:
        audio_str = json5.dumps(audio, indent=None)
        if 'layout' not in audio:
            raise ValueError(f'layout missing, audio={audio_str}')
        elif 'audio_type' not in audio:
            raise ValueError(f'audio_type missing, audio={audio_str}')
        elif audio['layout'] == 'foreground':
            check_by_audio_type(audio, foreground_mandatory_attrs_map, audio_str)
        elif audio['layout'] == 'background':
            if 'id' not in audio:
                raise ValueError(f'id not in background audio, audio={audio_str}')
            if 'action' not in audio:
                raise ValueError(f'action not in background audio, audio={audio_str}')
            if audio['action'] == 'begin':
                check_by_audio_type(audio, background_mandatory_attrs_map, audio_str)
            elif audio['action'] != 'end':
                raise ValueError(f'Unknown action, audio={audio_str}')
        else:
            raise ValueError(f'Unknown layout, audio={audio_str}')
class AudioCodeGenerator:
    """Generate a Python script that renders an audio script into wav files.

    The generated script calls TTA/TTM/TTS for every clip, concatenates the
    foreground clips, and mixes the background clips over them at the offsets
    given by the foreground clip lengths.
    """

    def __init__(self):
        # Running index per wav category, used to build unique file names.
        # It is not reset between parse_and_generate calls.
        self.wav_counters = {
            'bg_sound_effect': 0,
            'bg_music': 0,
            'idle': 0,
            'fg_sound_effect': 0,
            'fg_music': 0,
            'fg_speech': 0,
        }
        self.code = ''

    def append_code(self, content):
        """Append ``content`` plus a newline to the generated code."""
        self.code = f'{self.code}{content}\n'

    def generate_code(self, fg_audios, bg_audios, output_path, result_filename):
        """Append the complete generation script to ``self.code``.

        Params:
            fg_audios: foreground audio nodes, in playback order
            bg_audios: background audio nodes carrying 'begin_fg_audio_id'
                and 'end_fg_audio_id'
            output_path: path object (``.absolute()`` is called on it); the
                wavs go to its 'audio' sub-folder
            result_filename: base name (without '.wav') of the final mix
        Raises:
            ValueError: if a node has an audio type that cannot be generated
                for its layout.

        User-supplied strings are written with ``repr`` so quotes or
        backslashes in a description cannot break the generated script.
        """
        def get_wav_name(audio):
            # <layout>_<type>_<running index>_<abbreviated description>.wav
            audio_type = audio['audio_type']
            layout = 'fg' if audio['layout'] == 'foreground' else 'bg'
            wav_type = f'{layout}_{audio_type}'
            desc = audio['text'] if 'text' in audio else audio['desc']
            desc = utils.text_to_abbrev_prompt(desc)
            wav_filename = f'{wav_type}_{self.wav_counters[wav_type]}_{desc}.wav'
            self.wav_counters[wav_type] += 1
            return wav_filename

        header = f'''
import sys
sys.path.append('../AudioJourney')

import os
import datetime

from APIs import TTM, TTS, TTA, MIX, CAT, COMPUTE_LEN


fg_audio_lens = []
wav_path = \"{output_path.absolute()}/audio\"
os.makedirs(wav_path, exist_ok=True)

'''
        self.append_code(header)

        fg_audio_wavs = []
        for fg_audio in fg_audios:
            wav_name = get_wav_name(fg_audio)
            audio_type = fg_audio['audio_type']
            if audio_type == 'sound_effect':
                self.append_code(
                    f'TTA(text={fg_audio["desc"]!r}, length={fg_audio["len"]}, volume={fg_audio["vol"]}, '
                    f'out_wav=os.path.join(wav_path, {wav_name!r}))'
                )
            elif audio_type == 'music':
                self.append_code(
                    f'TTM(text={fg_audio["desc"]!r}, length={fg_audio["len"]}, volume={fg_audio["vol"]}, '
                    f'out_wav=os.path.join(wav_path, {wav_name!r}))'
                )
            elif audio_type == 'speech':
                voice = self.char_to_voice_map[fg_audio["character"]]
                npz_path = voice["npz_path"]
                # Existing files are referenced by absolute path; anything
                # else is passed through unchanged.
                npz_full_path = os.path.abspath(npz_path) if os.path.exists(npz_path) else npz_path
                self.append_code(
                    f'TTS(text={fg_audio["text"]!r}, speaker_id={voice["id"]!r}, volume={fg_audio["vol"]}, '
                    f'out_wav=os.path.join(wav_path, {wav_name!r}), speaker_npz={npz_full_path!r})'
                )
            else:
                # Without this the script would measure a wav that was never generated.
                raise ValueError(f'Unknown foreground audio_type, audio={fg_audio}')
            fg_audio_wavs.append(wav_name)
            self.append_code(f'fg_audio_lens.append(COMPUTE_LEN(os.path.join(wav_path, {wav_name!r})))\n')

        # cat all foreground audio together
        self.append_code('fg_audio_wavs = []')
        for wav_filename in fg_audio_wavs:
            self.append_code(f'fg_audio_wavs.append(os.path.join(wav_path, {wav_filename!r}))')
        self.append_code('CAT(wavs=fg_audio_wavs, out_wav=os.path.join(wav_path, "foreground.wav"))')

        bg_audio_wavs = []
        self.append_code('\nbg_audio_offsets = []')
        for bg_audio in bg_audios:
            wav_name = get_wav_name(bg_audio)
            begin_id = bg_audio["begin_fg_audio_id"]
            end_id = bg_audio["end_fg_audio_id"]
            # A background clip lasts as long as the foreground clips it
            # spans and starts after all the foreground clips before it.
            self.append_code(f'bg_audio_len = sum(fg_audio_lens[{begin_id}:{end_id}])')
            self.append_code(f'bg_audio_offset = sum(fg_audio_lens[:{begin_id}])')
            if bg_audio['audio_type'] == 'sound_effect':
                self.append_code(
                    f'TTA(text={bg_audio["desc"]!r}, volume={bg_audio["vol"]}, length=bg_audio_len, '
                    f'out_wav=os.path.join(wav_path, {wav_name!r}))'
                )
            elif bg_audio['audio_type'] == 'music':
                self.append_code(
                    f'TTM(text={bg_audio["desc"]!r}, volume={bg_audio["vol"]}, length=bg_audio_len, '
                    f'out_wav=os.path.join(wav_path, {wav_name!r}))'
                )
            else:
                raise ValueError(f'Unknown background audio_type, audio={bg_audio}')
            bg_audio_wavs.append(wav_name)
            self.append_code('bg_audio_offsets.append(bg_audio_offset)\n')
        self.append_code('bg_audio_wavs = []')
        for wav_filename in bg_audio_wavs:
            self.append_code(f'bg_audio_wavs.append(os.path.join(wav_path, {wav_filename!r}))')

        self.append_code('bg_audio_wav_offset_pairs = list(zip(bg_audio_wavs, bg_audio_offsets))')
        self.append_code('bg_audio_wav_offset_pairs.append((os.path.join(wav_path, "foreground.wav"), 0))')
        self.append_code(f'MIX(wavs=bg_audio_wav_offset_pairs, out_wav=os.path.join(wav_path, "{result_filename}.wav"))')

    def init_char_to_voice_map(self, filename):
        """Load the character-to-voice mapping from the JSON5 file ``filename``."""
        with open(filename, 'r') as file:
            self.char_to_voice_map = json5.load(file)

    def parse_and_generate(self, script_filename, char_to_voice_map_filename, output_path, result_filename='result'):
        """Validate the script file and return the generated code as a string.

        Params:
            script_filename: path of the JSON5 audio script
            char_to_voice_map_filename: path of the character-to-voice mapping
            output_path: path object for the output folder
            result_filename: base name (without '.wav') of the final mix
        Raises:
            ValueError: if the script fails validation.
        """
        self.code = ''
        self.init_char_to_voice_map(char_to_voice_map_filename)

        with open(script_filename, 'r') as file:
            data = json5.load(file)

        check_json_script(data)
        fg_audios, bg_audios = collect_and_check_audio_data(data)
        self.generate_code(fg_audios, bg_audios, output_path, result_filename)
        return self.code
to be created by the generated code, default: current path" + ) + args = parser.parse_args() + + if not os.path.isfile(args.script): + print(f"File {args.script} does not exist.") + return + + output_path = Path(args.path) + audio_code_generator = AudioCodeGenerator() + code = audio_code_generator.parse_and_generate(args.script, args.character_to_voice_map, output_path) + print(code) + +if __name__ == "__main__": + main() diff --git a/data/voice_presets/metadata.json b/data/voice_presets/metadata.json new file mode 100644 index 0000000000000000000000000000000000000000..5a56985a1b1b9d5ddba4c8f626482d73288b57ab --- /dev/null +++ b/data/voice_presets/metadata.json @@ -0,0 +1,47 @@ +{ + "Female1": { + "id": "Female1", + "desc": "a normal female adult voice, British accent", + "npz_path": "v2/en_speaker_9" + }, + "Female2": { + "id": "Female2", + "desc": "a normal female adult voice, American accent", + "npz_path": "v2/de_speaker_3" + }, + "Male1": { + "id": "Male1", + "desc": "a normal male adult voice, British accent", + "npz_path": "v2/en_speaker_1" + }, + "Male2": { + "id": "Male2", + "desc": "a normal male adult voice, American accent", + "npz_path": "v2/en_speaker_2" + }, + "News_Male": { + "id": "News_Male", + "desc": "a male voice of a news anchor, suitable for news scenarios", + "npz_path": "data/voice_presets/npz/news_male_speaker.npz" + }, + "News_Female": { + "id": "News_Female", + "desc": "a female voice of a news anchor, suitable for news scenarios", + "npz_path": "data/voice_presets/npz/news_female_speaker.npz" + }, + "News_Female_Out": { + "id": "News_Female_Out", + "desc": "a female voice of an off-site news reporter, suitable for news scenario", + "npz_path": "data/voice_presets/npz/news_female_speaker_outside.npz" + }, + "child": { + "id": "child", + "desc": "a small young boy voice", + "npz_path": "data/voice_presets/npz/child_boy.npz" + }, + "old_man": { + "id": "old_man", + "desc": "a voice of an old man", + "npz_path": 
"data/voice_presets/npz/elder_morgen.npz" + } +} \ No newline at end of file diff --git a/data/voice_presets/npz/biden.npz b/data/voice_presets/npz/biden.npz new file mode 100644 index 0000000000000000000000000000000000000000..2875740b59b7d50f99f23b391dc7fc55924319bd Binary files /dev/null and b/data/voice_presets/npz/biden.npz differ diff --git a/data/voice_presets/npz/boris.npz b/data/voice_presets/npz/boris.npz new file mode 100644 index 0000000000000000000000000000000000000000..c9e2545c639993f89fb8b6c2784b8268d9273d3d Binary files /dev/null and b/data/voice_presets/npz/boris.npz differ diff --git a/data/voice_presets/npz/boy_Tom_Hiddleston.npz b/data/voice_presets/npz/boy_Tom_Hiddleston.npz new file mode 100644 index 0000000000000000000000000000000000000000..f7cd70b4e00914acd764c1ddc5cf3ea1068aa37d Binary files /dev/null and b/data/voice_presets/npz/boy_Tom_Hiddleston.npz differ diff --git a/data/voice_presets/npz/child_boy.npz b/data/voice_presets/npz/child_boy.npz new file mode 100644 index 0000000000000000000000000000000000000000..6b539a8712b75e1f010b7193a0fcd0f7fb11fca8 Binary files /dev/null and b/data/voice_presets/npz/child_boy.npz differ diff --git a/data/voice_presets/npz/cnn_male_speaker.npz b/data/voice_presets/npz/cnn_male_speaker.npz new file mode 100644 index 0000000000000000000000000000000000000000..e2fbc1f557d2b89456d68c3cf43d3d92c7bb20eb Binary files /dev/null and b/data/voice_presets/npz/cnn_male_speaker.npz differ diff --git a/data/voice_presets/npz/elder_morgen.npz b/data/voice_presets/npz/elder_morgen.npz new file mode 100644 index 0000000000000000000000000000000000000000..7de1e16a9f2edd82350c4645e51819b7964c74ac Binary files /dev/null and b/data/voice_presets/npz/elder_morgen.npz differ diff --git a/data/voice_presets/npz/girl_Anne_Hathaway.npz b/data/voice_presets/npz/girl_Anne_Hathaway.npz new file mode 100644 index 0000000000000000000000000000000000000000..daf033b42d8b979ced44da2b276546e10f5f8e2a Binary files /dev/null and 
b/data/voice_presets/npz/girl_Anne_Hathaway.npz differ diff --git a/data/voice_presets/npz/mark_professor.npz b/data/voice_presets/npz/mark_professor.npz new file mode 100644 index 0000000000000000000000000000000000000000..07725992a1f72931b8ddd4d34cbcac3883d09eec Binary files /dev/null and b/data/voice_presets/npz/mark_professor.npz differ diff --git a/data/voice_presets/npz/news_female_speaker.npz b/data/voice_presets/npz/news_female_speaker.npz new file mode 100644 index 0000000000000000000000000000000000000000..013dea952ecc7c34c601bfc8d25fc367cffb5584 Binary files /dev/null and b/data/voice_presets/npz/news_female_speaker.npz differ diff --git a/data/voice_presets/npz/news_female_speaker_outside.npz b/data/voice_presets/npz/news_female_speaker_outside.npz new file mode 100644 index 0000000000000000000000000000000000000000..2cd5f5964c778742ab200bf71f8bad83db6fdec3 Binary files /dev/null and b/data/voice_presets/npz/news_female_speaker_outside.npz differ diff --git a/data/voice_presets/npz/news_male_speaker.npz b/data/voice_presets/npz/news_male_speaker.npz new file mode 100644 index 0000000000000000000000000000000000000000..744b9afe7ca626c1e3f70adaea1a13c67bb0796c Binary files /dev/null and b/data/voice_presets/npz/news_male_speaker.npz differ diff --git a/data/voice_presets/npz/trump.npz b/data/voice_presets/npz/trump.npz new file mode 100644 index 0000000000000000000000000000000000000000..bc8e350e2efd42638e5343a93ad14d153248dd5e Binary files /dev/null and b/data/voice_presets/npz/trump.npz differ diff --git a/parse_voice.py b/parse_voice.py new file mode 100644 index 0000000000000000000000000000000000000000..9583f402cfb23aede18d421befd2508633b1d23c --- /dev/null +++ b/parse_voice.py @@ -0,0 +1,31 @@ +import os +import argparse +from VoiceParser.model import VoiceParser + +def main(): + parser = argparse.ArgumentParser() + parser.add_argument('--wav-path', type=str, help="Path of a wav file") + parser.add_argument('--wav-dir', type=str, help="Directory of wav 
files") + parser.add_argument('--out-dir', type=str, help="Directory of output npz files") + args = parser.parse_args() + + if (args.wav_path is None and args.wav_dir is None) or (args.wav_path is not None and args.wav_dir is not None): + parser.error("Please provide either '--wav-path' or '--wav-dir', but not both.") + + out_dir = args.out_dir + + model = VoiceParser(device='cpu') + + if args.wav_path is not None: + model.extract_acoustic_embed(args.wav_path, out_dir) + print(f'Sucessfully parsed {args.wav_path}') + else: + wav_name_list = os.listdir(args.wav_dir) + for wav_name in wav_name_list: + wav_path = os.path.join(args.wav_dir, wav_name) + model.extract_acoustic_embed(wav_path, out_dir) + print(f'Sucessfully parsed {wav_path}') + + +if __name__ == '__main__': + main() diff --git a/pipeline.py b/pipeline.py new file mode 100644 index 0000000000000000000000000000000000000000..76793e4b47260b83b2c94d9f68f7d951fb60066a --- /dev/null +++ b/pipeline.py @@ -0,0 +1,235 @@ +import argparse +import datetime +import os +import subprocess +from string import Template +import openai +import re +from pathlib import Path +import glob +from utils import get_key +import pickle +import time +import json5 +from retrying import retry +from code_generator import check_json_script, collect_and_check_audio_data +from tabulate import tabulate +import random +import string + +import utils +import voice_presets +from code_generator import AudioCodeGenerator + +USE_OPENAI_CACHE = True +openai_cache = [] +if USE_OPENAI_CACHE: + os.makedirs('cache', exist_ok=True) + for cache_file in glob.glob('cache/*.pkl'): + with open(cache_file, 'rb') as file: + openai_cache.append(pickle.load(file)) + +openai.api_key = get_key() + +def chat_with_gpt(prompt): + if USE_OPENAI_CACHE: + filtered_object = list(filter(lambda x: x['prompt'] == prompt, openai_cache)) + if len(filtered_object) > 0: + response = filtered_object[0]['response'] + return response + chat = openai.ChatCompletion.create( + # 
model="gpt-3.5-turbo", + model="gpt-4", + messages=[ + { + "role": "system", + "content": "You are a helpful assistant." + }, + { + "role": "user", + "content": prompt + } + ] + ) + if USE_OPENAI_CACHE: + cache_obj = { + 'prompt': prompt, + 'response': chat['choices'][0]['message']['content'] + } + with open(f'cache/{time.time()}.pkl', 'wb') as _openai_cache: + pickle.dump(cache_obj, _openai_cache) + openai_cache.append(cache_obj) + + return chat['choices'][0]['message']['content'] + + +def get_file_content(filename): + with open(filename, 'r') as file: + return file.read().strip() + + +def write_to_file(filename, content): + with open(filename, 'w') as file: + file.write(content) + + +def extract_substring_with_quotes(input_string, quotes="'''"): + pattern = f"{quotes}(.*?){quotes}" + matches = re.findall(pattern, input_string, re.DOTALL) + return matches + + +def try_extract_content_from_quotes(content): + if "'''" in content: + return extract_substring_with_quotes(content)[0] + elif "```" in content: + return extract_substring_with_quotes(content, quotes="```")[0] + else: + return content + +def maybe_get_content_from_file(content_or_filename): + if os.path.exists(content_or_filename): + with open(content_or_filename, 'r') as file: + return file.read().strip() + return content_or_filename + + + +# Pipeline Interface Guidelines: +# +# Init calls: +# - Init calls must be called before running the actual steps +# - init_session() is called every time a gradio webpage is loaded +# +# Single Step: +# - takes input (file or content) and output path as input +# - most of time just returns output content +# +# Compositional Step: +# - takes session_id as input (you have session_id, you have all the paths) +# - run a series of steps + +# This is called for every new gradio webpage + +def init_session(session_id=''): + def uid8(): + return ''.join(random.choices(string.ascii_lowercase + string.digits, k=8)) + + if session_id == '': + session_id = 
f'{datetime.datetime.now().strftime("%Y%m%d%H%M%S")}_{uid8()}' + # create the paths + os.makedirs(utils.get_session_voice_preset_path(session_id)) + os.makedirs(utils.get_session_audio_path(session_id)) + return session_id + +@retry(stop_max_attempt_number=3) +def input_text_to_json_script_with_retry(complete_prompt_path): + print(" trying ...") + complete_prompt = get_file_content(complete_prompt_path) + json_response = try_extract_content_from_quotes(chat_with_gpt(complete_prompt)) + json_data = json5.loads(json_response) + + try: + check_json_script(json_data) + collect_and_check_audio_data(json_data) + except Exception as err: + print(f'JSON ERROR: {err}') + retry_complete_prompt = f'{complete_prompt}\n```\n{json_response}```\nThe script above has format error(s). Return the fixed script.\n\nScript:\n' + write_to_file(complete_prompt_path, retry_complete_prompt) + raise err + + return json_response + +# Step 1: input_text to json +def input_text_to_json_script(input_text, output_path): + print('Step 1: Writing audio script with LLM ...') + input_text = maybe_get_content_from_file(input_text) + text_to_audio_script_prompt = get_file_content('prompts/text_to_json.prompt') + prompt = f'{text_to_audio_script_prompt}\n\nInput text: {input_text}\n\nScript:\n' + complete_prompt_path = output_path / 'complete_input_text_to_audio_script.prompt' + write_to_file(complete_prompt_path, prompt) + audio_script_response = input_text_to_json_script_with_retry(complete_prompt_path) + generated_audio_script_filename = output_path / 'audio_script.json' + write_to_file(generated_audio_script_filename, audio_script_response) + return audio_script_response + +# Step 2: json to char-voice map +def json_script_to_char_voice_map(json_script, voices, output_path): + def create_complete_char_voice_map(char_voice_map): + return + print('Step 2: Parsing character voice with LLM...') + json_script_content = maybe_get_content_from_file(json_script) + prompt = 
get_file_content('prompts/audio_script_to_character_voice_map.prompt') + presets_str = '\n'.join(f"{preset['id']}: {preset['desc']}" for preset in voices.values()) + prompt = Template(prompt).substitute(voice_and_desc=presets_str) + prompt = f"{prompt}\n\nAudio script:\n'''\n{json_script_content}\n'''\n\noutput:\n" + write_to_file(output_path / 'complete_audio_script_to_char_voice_map.prompt', prompt) + char_voice_map_response = try_extract_content_from_quotes(chat_with_gpt(prompt)) + char_voice_map = json5.loads(char_voice_map_response) + # enrich char_voice_map with voice preset metadata + complete_char_voice_map = {c: voices[char_voice_map[c]] for c in char_voice_map} + char_voice_map_filename = output_path / 'character_voice_map.json' + write_to_file(char_voice_map_filename, json5.dumps(complete_char_voice_map)) + return complete_char_voice_map + +# Step 3: json to py code +def json_script_and_char_voice_map_to_audio_gen_code(json_script_filename, char_voice_map_filename, output_path, result_filename): + print('Step 3: Compiling audio script to Python program ...') + audio_code_generator = AudioCodeGenerator() + code = audio_code_generator.parse_and_generate( + json_script_filename, + char_voice_map_filename, + output_path, + result_filename + ) + write_to_file(output_path / 'audio_generation.py', code) + +# Step 4: py code to final wav +def audio_code_gen_to_result(audio_gen_code_path): + print('Step 4: Start running Python program ...') + audio_gen_code_filename = audio_gen_code_path / 'audio_generation.py' + os.system(f'python {audio_gen_code_filename}') + +# Function call used by Gradio: input_text to json +def generate_json_file(session_id, input_text): + output_path = utils.get_session_path(session_id) + # Step 1 + return input_text_to_json_script(input_text, output_path) + +# Function call used by Gradio: json to result wav +def generate_audio(session_id, json_script): + output_path = utils.get_session_path(session_id) + output_audio_path = 
utils.get_session_audio_path(session_id) + voices = voice_presets.get_merged_voice_presets(session_id) + + # Step 2 + json_script_to_char_voice_map(json_script, voices, output_path) + # Step 3 + json_script_filename = output_path / 'audio_script.json' + char_voice_map_filename = output_path / 'character_voice_map.json' + result_wav_basename = f'res_{session_id}' + json_script_and_char_voice_map_to_audio_gen_code(json_script_filename, char_voice_map_filename, output_path, result_wav_basename) + # Step 4 + audio_code_gen_to_result(output_path) + + result_wav_filename = output_audio_path / f'{result_wav_basename}.wav' + print(f'Done all processes, result: {result_wav_filename}') + return result_wav_filename + +# Convenient function call used by wavjourney_cli +def full_steps(session_id, input_text): + json_script = generate_json_file(session_id, input_text) + return generate_audio(session_id, json_script) + +def convert_json_to_md(audio_script_response): + audio_json_data = json5.loads(audio_script_response) + table = [[node.get(field, 'N/A') for field in ["audio_type", "layout", "id", "character", "action", 'vol']] + + [node.get("desc", "N/A") if node.get("audio_type") != "speech" else node.get("text", "N/A")] + + [node.get("len", "Auto") if "len" in node else "Auto"] + for i, node in enumerate(audio_json_data)] + + headers = ["Audio Type", "Layout", "ID", "Character", "Action", 'Volume', "Description", "Length" ] + + # Tabulate + table_txt = tabulate(table, headers, tablefmt="github") + return table_txt diff --git a/prompts/audio_script_to_character_voice_map.prompt b/prompts/audio_script_to_character_voice_map.prompt new file mode 100644 index 0000000000000000000000000000000000000000..3e3634613c0998a617d4ea70c90d814661c73d4f --- /dev/null +++ b/prompts/audio_script_to_character_voice_map.prompt @@ -0,0 +1,11 @@ +Given an audio script in json format, for each character appeared in the "character" attribute, you should map the character to a "voice type" according to 
his/her lines and the voice type's features. Each character must be mapped to a different voice type, and each voice type must be from one of the following (each line in the format of "[voice_type_id]: [voice_type_description]"): +$voice_and_desc + +Output should be in the format of json, like: +''' +{ + "character_1": "voice_type_1", + "character_2": "voice_type_2", + ... +} +''' \ No newline at end of file diff --git a/prompts/audio_script_to_json.prompt b/prompts/audio_script_to_json.prompt new file mode 100644 index 0000000000000000000000000000000000000000..b7f867e2b309ad590da1dad8b2f44e0fc05f5f90 --- /dev/null +++ b/prompts/audio_script_to_json.prompt @@ -0,0 +1,74 @@ +Given an audio script, adapt it into a json file. You must go through each line of the script, and try your best to convert it to a json object or multiple json objects. + +Each json object represents an audio. There are three types of audios: sound effect, music, and speech. For each audio, there are two types of layouts: foreground and background. Foreground audios are played sequentially, and background audios are environmental sounds or music which are played while the foreground audios are being played. + +While going through each line of the script, you have choices as below: +- For character lines, you need to convert it to a speech audio. Note that a speech audio can only be foreground. Example: +From +``` +News Anchor: Good evening, this is BBC News. +``` +To +``` +{"audio_type": "speech", "layout": "foreground", "character": "News Anchor", "vol": -15, "text": "Good evening, this is BBC News."}, +``` +- For sound effects, you need to convert it to a sound_effect audio. Especially, you need to figure out its length according to the script's context, and put it into "len". 
Example: +From +``` +(SFX: Airport beeping sound) +``` +to +``` +{"audio_type": "sound_effect", "layout": "foreground", "vol": -35, "len": 2, "desc": "Airport beeping sound"}, +``` +- For music, you need to convert it to a music audio. Especially, you need to figure out its length according to the script's context, and put it into "len". Example: +From +``` +(SFX: Uplifting newsroom music) +``` +to +``` +{"audio_type": "music", "layout": "foreground", "vol": -35, "len": 10, "desc": "Uplifting newsroom music"}, +``` + +When a sound effect or music is an environmental sound played in the background, you should set its layout to "background". You must give the background audio a unique id, and you must figure out the end of the background audio according to the context and indicate it explicitly. Example: +From +``` +... +(SFX: Airport ambiance, people walking) +Airport Announcer: Ladies and Gentlemen, attention please! +... +``` +to +``` +... +{"audio_type": "sound_effect", "layout": "background", "id":1, "action": "begin", "vol": -35, "desc": "Airport ambiance, people walking"}, +[foreground audio] +... +{"audio_type": "sound_effect", "layout": "background", "id":1, "action": "end"}, +... +``` + +When a line contains multiple sound effects and pieces of music, you need to decompose it into multiple audios. Example: +From +``` +... +(SFX: A classy restaurant, low chatter, clinking silverware, jazz music playing) +... +``` +to +``` +... +{"audio_type": "sound_effect", "layout": "background", "id":1, "action": "begin", "vol": -35, "desc": "low chatter"}, +{"audio_type": "sound_effect", "layout": "background", "id":2, "action": "begin", "vol": -35, "desc": "clinking silverware"}, +{"audio_type": "music", "layout": "background", "id":3, "action": "begin", "vol": -35, "desc": "jazz music"}, +... 
+{"audio_type": "sound_effect", "layout": "background", "id":1, "action": "end"}, +{"audio_type": "sound_effect", "layout": "background", "id":2, "action": "end"}, +{"audio_type": "music", "layout": "background", "id":3, "action": "end"}, +... +``` + +The final json object contains a list of all the audio objects. + +Script: \ No newline at end of file diff --git a/prompts/script_to_json.prompt b/prompts/script_to_json.prompt new file mode 100644 index 0000000000000000000000000000000000000000..3cd5b9fd1e4261813a672a56932cb86c48d61503 --- /dev/null +++ b/prompts/script_to_json.prompt @@ -0,0 +1,58 @@ +Convert an audio script line to another format. Each line will be converted to a simple json format. Below are the examples of conversion of each line. + +Example line 1: +''' +[Background music 1 begins, -35dB: Uplifting newsroom music] +''' +convert to: +''' +{"voice_type": "back_ground_music", "id": 1, "state": "begin", "volume": -35, "desc": "Uplifting newsroom music"}, +''' +Example line 2: +''' +[Background music 1 ends] +''' +convert to: +''' +{"voice_type": "back_ground_music", "id": 1, "state": "end"}, +''' +Example line 3: +''' +[Background sound effect 2 begins, -35dB: Crowds cheering and arcade ambiance] +''' +convert to: +''' +{"voice_type": "back_ground_sound_effect", "id": 2, "state": "begin", "volume": -35, "desc": "Crowds cheering and arcade ambiance"}, +''' +Example line 4: +''' +[Background sound effect 2 ends] +''' +convert to: +''' +{"voice_type": "back_ground_sound_effect", "id": 2, "state": "end"}, +''' +Example line 5: +''' +News Anchor, -15dB: Good evening, this is BBC News. 
+''' +convert to: +''' +{"voice_type": "speech", "character": "News Anchor", "volume": -15, "desc": "Good evening, this is BBC News."}, +''' +Example line 6: +''' +[Sound effect, 3s, -15dB: Keyboard typing and mouse clicking] +''' +convert to: +''' +{"voice_type": "sound_effect", "length": 3, "volume": -15, "desc": "Keyboard typing and mouse clicking"}, +''' +Example line 7: +''' +[Sound music, 10s, -15dB: Uplifting newsroom music] +''' +convert to: +''' +{"voice_type": "music", "length": 10, "volume": -15, "desc": "Uplifting newsroom music"}, +''' diff --git a/prompts/text_to_audio_script.prompt b/prompts/text_to_audio_script.prompt new file mode 100644 index 0000000000000000000000000000000000000000..074e0f286802d4a3c2321f7b708db665555bd012 --- /dev/null +++ b/prompts/text_to_audio_script.prompt @@ -0,0 +1,34 @@ +I want you to act as a audio script writer. I'll give you an instruction which is a general idea and you will make it a short audio script. + +The script should follow the rules below: +- For dialogs, each line must contain the character's name, its volume in decibel (human voices are usually around -15dB) and the line, example: +''' +Darth Vader, -16dB: Luke, I'm your father. 
+''' +- For foreground sound effect, you must wrap the line with brackets and start with "Sound effect, ", and you should give the duration of the sound effect in seconds, and you should specify the volume you want in decibel (for foreground sound effects it's usually around -15dB), and you should give very detailed description of the sound effect, example: +''' +[Sound effect, 2s, -15dB: Airport beeping sound] +''' +- For foreground music, you must wrap the line with brackets and start with "Music, ", and you should give the duration of the music in seconds, and you should specify the volume you want in decibel (for foreground music it's usually around -15dB), and you should give very detailed description of the music, example: +''' +[Music, 10s, -15dB: 80's Rock and Roll music] +''' +- For background sound effects, you must wrap the line with brackets and start with "Background sound effect" followed by its id, and you must always explicitly indicate the start and end of the sound effects, and you should specify the volume you want in decibel (for background sound effect it's usually around -35dB), and you should give very detailed description of the sound effect, example: +''' +[Background sound effect 1 begins, -34dB: Airport ambiance, including footsteps, luggage rolling, and distant airplane engine] +... +[Background sound effect 1 ends] +''' +- For background music, you must wrap the line with brackets and start with "Background music" followed by its id, and you must always explicitly indicate the start and end of the music, and you should specify the volume you want in decibel (for background music it's usually around -35dB), and you should give very detailed description of the music, example: +''' +[Background music 1 begins, -35dB: Uplifting newsroom music] +... 
+[Background music 1 ends] +''' +- For music and sound effect, you can not name the element outside these: +["Sound effect, ", +"Music, ", +"Background sound effect" followed by its id, +"Background music" followed by its id] +such as "Foreground sound effect", "Foreground music" is forbidden + diff --git a/prompts/text_to_json.prompt b/prompts/text_to_json.prompt new file mode 100644 index 0000000000000000000000000000000000000000..3b92b40fa3e95e0a941e60278b81da41291368f0 --- /dev/null +++ b/prompts/text_to_json.prompt @@ -0,0 +1,31 @@ +I want you to act as a audio script writer. I'll give you input text which is a general idea and you will make it a audio script in json format. Instructions: +- Each line represents an audio. There are three types of audios: sound effect, music, and speech. For each audio, there are only two types of layouts: foreground and background. Foreground audios are played sequentially, and background audios are environmental sounds or music which are played while the foreground audios are being played. +- Sound effects can be either foreground or background. For sound effects, you must provide its layout, volume, length (in seconds), and detailed description of the real-world sound effect. Example: +''' +- The description of sound effects should not contain a specific person. +{"audio_type": "sound_effect", "layout": "foreground", "vol": -35, "len": 2, "desc": "Airport beeping sound"}, +''' +- Music can be either foreground or background. For music, you must provide its layout, volume, length (in seconds), and detailed description of the music. Example: +''' +{"audio_type": "music", "layout": "foreground", "vol": -35, "len": 10, "desc": "Uplifting newsroom music"}, +''' +- Speechs can only be foreground. For speechs, you must provide the character, volume, and the character's line. You do not need to specify the length of the speech. 
Example: +''' +{"audio_type": "speech", "layout": "foreground", "character": "News Anchor", "vol": -15, "text": "Good evening, this is BBC News. In today's breaking news, we have an unexpected turn of events in the political arena"}, +''' +- The description of speechs should not contain anything other than the lines, such as actions, expressions, emotions etc. +- For background sound audio, you must specify the beginning and the end of a background audio in separate lines to indicate when the audio begins and when it ends. Example for background sound effect (for background music it's similar): +''' +{"audio_type": "sound_effect", "layout": "background", "id":1, "action": "begin", "vol": -35, "desc": "Airport ambiance, people walking"}, +[foreground audio 1] +[foreground audio 2] +... +{"audio_type": "sound_effect", "layout": "background", "id":1, "action": "end"}, +''' +- Each background audio must have a unique id. +- You do not specify the length of a background audio. +- A background audio must be wrapped around at least one foreground audio. +- If a background sound effect has multiple sounds, please decompose it into multiple background sound effects. +- At the same time there must be at most only one audio with type music playing, either foreground or background. +- The volume of background sound effect/music is usually around -35 ~ -40 dB +- The output json must be a list as the root node containing all the audio nodes, and must be wrapped with triple quotes '''. 
diff --git a/scripts/download_models.py b/scripts/download_models.py new file mode 100644 index 0000000000000000000000000000000000000000..b8350543984ba93df51ed83c9f11f527ea6b667a --- /dev/null +++ b/scripts/download_models.py @@ -0,0 +1,31 @@ +import yaml +import os + +# Read the YAML file +with open('config.yaml', 'r') as file: + config = yaml.safe_load(file) + +# Extract values for each application +tts_env = config['Text-to-Speech']['env'] + +ttm_env = config['Text-to-Music']['env'] +ttm_model_size = config['Text-to-Music']['model_size'] + +tta_env = config['Text-to-Audio']['env'] + +sr_env = config['Speech-Restoration']['env'] + +# Downloading the TTS models +print('Step 1: Downloading TTS model ...') +os.system(f'conda run --live-stream -n {tts_env} python -c \'from transformers import BarkModel; BarkModel.from_pretrained("suno/bark")\'') + +print('Step 2: Downloading TTA model ...') +os.system(f'conda run --live-stream -n {tta_env} python -c \'from audiocraft.models import AudioGen; tta_model = AudioGen.get_pretrained("facebook/audiogen-medium")\'') + +print('Step 3: Downloading TTM model ...') +os.system(f'conda run --live-stream -n {ttm_env} python -c \'from audiocraft.models import MusicGen; tta_model = MusicGen.get_pretrained("facebook/musicgen-{ttm_model_size}")\'') + +print('Step 4: Downloading SR model ...') +os.system(f'conda run --live-stream -n {sr_env} python -c \'from voicefixer import VoiceFixer; vf = VoiceFixer()\'') + +print('All models successfully downloaded!') diff --git a/scripts/kill_services.py b/scripts/kill_services.py new file mode 100644 index 0000000000000000000000000000000000000000..9df01d4734575db23b21501100fa1bb4d4ca63c0 --- /dev/null +++ b/scripts/kill_services.py @@ -0,0 +1,28 @@ +import yaml +import os + +# Read the YAML file +with open('config.yaml', 'r') as file: + config = yaml.safe_load(file) + +# Extract values for each application +tts_port = config['Text-to-Speech']['service-port'] + +ttm_port = 
config['Text-to-Music']['service-port'] + +tta_port = config['Text-to-Audio']['service-port'] + +sr_port = config['Speech-Restoration']['service-port'] + +vp_port = config['Voice-Parser']['service-port'] + + +# Execute the commands +os.system(f'kill $(lsof -t -i :{tts_port})') +os.system(f'kill $(lsof -t -i :{tta_port})') +os.system(f'kill $(lsof -t -i :{ttm_port})') +os.system(f'kill $(lsof -t -i :{sr_port})') +os.system(f'kill $(lsof -t -i :{vp_port})') + + + diff --git a/scripts/restart_services.sh b/scripts/restart_services.sh new file mode 100644 index 0000000000000000000000000000000000000000..7081c44df4ab8f1fdc710b40293f178986e3b06c --- /dev/null +++ b/scripts/restart_services.sh @@ -0,0 +1,2 @@ +python scripts/kill_services.py +python scripts/start_services.py \ No newline at end of file diff --git a/scripts/start_services.py b/scripts/start_services.py new file mode 100644 index 0000000000000000000000000000000000000000..37d273a0da8d0710810e1d9f02c9797349f2acc4 --- /dev/null +++ b/scripts/start_services.py @@ -0,0 +1,41 @@ +import yaml +import os + +# Read the YAML file +with open('config.yaml', 'r') as file: + config = yaml.safe_load(file) + +os.makedirs('services_logs', exist_ok=True) + +# Extract values for each application +tts_model = config['Text-to-Speech']['model'] +tts_env = config['Text-to-Speech']['env'] + +ttm_model = config['Text-to-Music']['model'] +ttm_env = config['Text-to-Music']['env'] + +tta_model = config['Text-to-Audio']['model'] +tta_env = config['Text-to-Audio']['env'] + +sr_model = config['Speech-Restoration']['model'] +sr_env = config['Speech-Restoration']['env'] +enable_sr = config['Speech-Restoration']['Enable'] + +vp_model = config['Voice-Parser']['model'] +vp_env = config['Voice-Parser']['env'] + +# Execute the commands +os.system(f'nohup conda run --live-stream -n {tts_env} python {tts_model}/app.py > services_logs/meta_tts.out 2>&1 &') +os.system(f'nohup conda run --live-stream -n {vp_env} python {vp_model}/app.py > 
services_logs/meta_vp.out 2>&1 &') + +if enable_sr: + os.system(f'nohup conda run --live-stream -n {sr_env} python {sr_model}/app.py > services_logs/meta_sr.out 2>&1 &') + +# Using AudioCraft for TTA & TTM +if tta_env == ttm_env: + os.system(f'nohup conda run --live-stream -n {ttm_env} python {ttm_model}/app.py > services_logs/meta_tta_ttm.out 2>&1 &') + +# Using AudioLDM for TTA, MusicGen for TTM +if tta_env != ttm_env: + os.system(f'nohup conda run --live-stream -n {tta_env} python {tta_model}/app.py > services_logs/meta_tta.out 2>&1 &') + os.system(f'nohup conda run --live-stream -n {ttm_env} python {ttm_model}/app.py > services_logs/meta_ttm.out 2>&1 &') diff --git a/scripts/start_ui.sh b/scripts/start_ui.sh new file mode 100644 index 0000000000000000000000000000000000000000..6cb9a176602af1cc4acd9d88f0f7887459f4691b --- /dev/null +++ b/scripts/start_ui.sh @@ -0,0 +1 @@ +conda run --live-stream -n WavJourney gradio ui_client.py \ No newline at end of file diff --git a/ui_client.py b/ui_client.py new file mode 100644 index 0000000000000000000000000000000000000000..60b38389017c5e2cb2cbe217b27ee853bea0d461 --- /dev/null +++ b/ui_client.py @@ -0,0 +1,273 @@ +import pdb +import shutil + +import gradio as gr + +import pipeline +import utils +from pipeline import generate_json_file, generate_audio +from voice_presets import load_voice_presets_metadata, add_session_voice_preset, \ + remove_session_voice_preset + +import openai + +VOICE_PRESETS_HEADERS = ['ID', 'Description'] +DELETE_FILE_WHEN_DO_CLEAR = False +DEBUG = False + + +def generate_script_fn(instruction, _state: gr.State): + try: + session_id = _state['session_id'] + json_script = generate_json_file(session_id, instruction) + table_text = pipeline.convert_json_to_md(json_script) + except Exception as e: + gr.Warning(str(e)) + print(f"Generating script error: {str(e)}") + return [None, gr.Button.update(interactive=False), _state, gr.Button.update(interactive=True)] + _state = { + **_state, + 'session_id': 
session_id, + 'json_script': json_script + } + return [ + table_text, + _state, + gr.Button.update(interactive=True), + gr.Button.update(interactive=True), + gr.Button.update(interactive=True), + gr.Button.update(interactive=True), + ] + + +def generate_audio_fn(state): + btn_state = gr.Button.update(interactive=True) + try: + audio_path = generate_audio(**state) + return [ + gr.make_waveform(str(audio_path)), + btn_state, + btn_state, + btn_state, + btn_state, + ] + except Exception as e: + print(f"Generation audio error: {str(e)}") + gr.Warning(str(e)) + return [ + None, + btn_state, + btn_state, + btn_state, + btn_state, + ] + + +def clear_fn(state): + if DELETE_FILE_WHEN_DO_CLEAR: + shutil.rmtree('output', ignore_errors=True) + state = {'session_id': pipeline.init_session()} + return [gr.Textbox.update(value=''), gr.Video.update(value=None), + gr.Markdown.update(value=''), gr.Button.update(interactive=False), gr.Button.update(interactive=False), + state, gr.Dataframe.update(visible=False), gr.Button.update(visible=False), + gr.Textbox.update(value=''), gr.Textbox.update(value=''), gr.File.update(value=None)] + + +def textbox_listener(textbox_input): + if len(textbox_input) > 0: + return gr.Button.update(interactive=True) + else: + return gr.Button.update(interactive=False) + + +def get_voice_preset_to_list(state: gr.State): + if state.__class__ == dict: + session_id = state['session_id'] + else: + session_id = state.value['session_id'] + voice_presets = load_voice_presets_metadata( + utils.get_session_voice_preset_path(session_id), + safe_if_metadata_not_exist=True + ) + dataframe = [] + for key in voice_presets.keys(): + row = [key, voice_presets[key]['desc']] + dataframe.append(row) + return dataframe + + +def df_on_select(evt: gr.SelectData): + print(f"You selected {evt.value} at {evt.index} from {evt.target}") + return {'selected_voice_preset': evt.index} + + +def del_voice_preset(selected_voice_presets, ui_state, dataframe): + gr_visible = 
gr.Dataframe.update(visible=True) + btn_visible = gr.Button.update(visible=True) + current_presets = get_voice_preset_to_list(ui_state) + if selected_voice_presets['selected_voice_preset'] is None or \ + selected_voice_presets['selected_voice_preset'][0] > len(current_presets) - 1: + gr.Warning('None row is selected') + return [current_presets, gr_visible, btn_visible, selected_voice_presets] + # Do the real file deletion + index = selected_voice_presets['selected_voice_preset'][0] + vp_id = dataframe['ID'][index] + remove_session_voice_preset(vp_id, ui_state['session_id']) + current_presets = get_voice_preset_to_list(ui_state) + gr.Dataframe.update(value=current_presets) + if len(current_presets) == 0: + gr_visible = gr.Dataframe.update(visible=False) + btn_visible = gr.Button.update(visible=False) + selected_voice_presets['selected_voice_preset'] = None + return [current_presets, gr_visible, btn_visible, selected_voice_presets] + + +def get_system_voice_presets(): + system_presets = load_voice_presets_metadata(utils.get_system_voice_preset_path()) + data = [] + for k, v in system_presets.items(): + data.append([k, v['desc']]) + # headers = ['id', 'description'] + # table_txt = tabulate(data, headers, tablefmt="github") + return data + + +def set_openai_key(key): + openai.api_key = key + return key + + +def add_voice_preset(vp_id, vp_desc, file, ui_state, added_voice_preset): + if vp_id is None or vp_desc is None or file is None or vp_id.strip() == '' or vp_desc.strip() == '': + gr.Warning('please complete all three fields') + else: + count: int = added_voice_preset['count'] + # check if greater than 3 + session_id = ui_state['session_id'] + file_path = file.name + print(f'session {session_id}, id {id}, desc {vp_desc}, file {file_path}') + # Do adding ... 
+ try: + add_session_voice_preset(vp_id, vp_desc, file_path, session_id) + added_voice_preset['count'] = count + 1 + except Exception as exception: + gr.Warning(str(exception)) + # After added + dataframe = get_voice_preset_to_list(ui_state) + df_visible = gr.Dataframe.update(visible=True) + del_visible = gr.Button.update(visible=True) + if len(dataframe) == 0: + df_visible = gr.Dataframe.update(visible=False) + del_visible = gr.Button.update(visible=False) + return [gr.Textbox.update(value=''), gr.Textbox.update(value=''), gr.File.update(value=None), + ui_state, added_voice_preset, dataframe, gr.Button.update(interactive=True), + df_visible, del_visible] + + +with gr.Blocks() as interface: + system_voice_presets = get_system_voice_presets() + # State + ui_state = gr.State(value={'session_id': pipeline.init_session()}) + selected_voice_presets = gr.State(value={'selected_voice_preset': None}) + added_voice_preset_state = gr.State(value={'added_file': None, 'count': 0}) + # UI Component + key_text_input = gr.Textbox(label='Please Enter OPENAI Key for acessing GPT4', lines=1, placeholder="Input instruction here.", + value='') + text_input_value = '' if DEBUG is False else "News channel BBC broadcast about Trump playing street fighter 6 against Biden" + text_input = gr.Textbox(label='Input', lines=2, placeholder="Input instruction here.", + value=text_input_value) + markdown_output = gr.Markdown(label='Audio Script', lines=2) + generate_script_btn = gr.Button(value='Generate Script', interactive=False) + audio_output = gr.Video(type='filepath') + generate_audio_btn = gr.Button(value='Generate Audio', interactive=False) + clear_btn = gr.ClearButton(value='Clear Inputs') + # System Voice Presets + gr.Markdown(label='System Voice Presets', value='# System Voice Presets') + system_markdown_voice_presets = gr.Dataframe(label='System Voice Presets', headers=VOICE_PRESETS_HEADERS, + value=system_voice_presets) + # User Voice Preset Related + gr.Markdown(label='User Voice 
Presets', value='# User Voice Presets') + get_voice_preset_to_list(ui_state) + voice_presets_df = gr.Dataframe(headers=VOICE_PRESETS_HEADERS, col_count=len(VOICE_PRESETS_HEADERS), + value=get_voice_preset_to_list(ui_state), interactive=False, visible=False) + # voice_presets_ds = gr.Dataset(components=[gr.Dataframe(visible=True)], samples=get_voice_preset_to_list(ui_state)) + del_voice_btn = gr.Button(value='Delete Selected Voice Preset', visible=False) + gr.Markdown(label='Add Voice Preset', value='## Add Voice Preset') + vp_text_id = gr.Textbox(label='Id', lines=1, placeholder="Input voice preset id here.") + vp_text_desc = gr.Textbox(label='Desc', lines=1, placeholder="Input description here.") + vp_file = gr.File(label='Wav File', type='file', description='Upload your wav file here.', file_types=['.wav'], + interactive=True) + vp_submit = gr.Button(label='Upload Voice Preset', value="Upload Voice Preset") + # events + key_text_input.change(fn=set_openai_key, inputs=[key_text_input], outputs=[key_text_input]) + text_input.change(fn=textbox_listener, inputs=[text_input], outputs=[generate_script_btn]) + generate_audio_btn.click( + fn=generate_audio_fn, + inputs=[ui_state], + outputs=[ + audio_output, + generate_audio_btn, + generate_script_btn, + clear_btn, + vp_submit, + ], + api_name='audio_journey', + ) + generate_audio_btn.click( + fn=lambda _: [ + gr.Button.update(interactive=False), + gr.Button.update(interactive=False), + gr.Button.update(interactive=False), + gr.Button.update(interactive=False), + ], + outputs=[ + generate_audio_btn, + generate_script_btn, + clear_btn, + vp_submit, + ] + ) + clear_btn.click(fn=clear_fn, inputs=ui_state, + outputs=[text_input, audio_output, markdown_output, generate_audio_btn, generate_script_btn, + ui_state, voice_presets_df, del_voice_btn, + vp_text_id, vp_text_desc, vp_file]) + generate_script_btn.click( + fn=generate_script_fn, inputs=[text_input, ui_state], + outputs=[ + markdown_output, + ui_state, + 
generate_audio_btn, + generate_script_btn, + clear_btn, + vp_submit, + ] + ) + generate_script_btn.click( + fn=lambda _: [ + gr.Button.update(interactive=False), + gr.Button.update(interactive=False), + gr.Button.update(interactive=False), + gr.Button.update(interactive=False), + ], + outputs=[ + generate_audio_btn, + generate_script_btn, + clear_btn, + vp_submit, + ] + ) + voice_presets_df.select(df_on_select, outputs=[selected_voice_presets]) + voice_presets_df.update(lambda x: print(x)) + del_voice_btn.click(del_voice_preset, inputs=[selected_voice_presets, ui_state, voice_presets_df], + outputs=[voice_presets_df, voice_presets_df, del_voice_btn, selected_voice_presets]) + # user voice preset upload + vp_submit.click(add_voice_preset, inputs=[vp_text_id, vp_text_desc, vp_file, ui_state, added_voice_preset_state], + outputs=[vp_text_id, vp_text_desc, vp_file, ui_state, added_voice_preset_state, voice_presets_df, + vp_submit, + voice_presets_df, del_voice_btn]) + vp_submit.click(lambda _: gr.Button.update(interactive=False), inputs=[vp_submit]) + # debug only + # print_state_btn = gr.Button(value='Print State') + # print_state_btn.click(fn=lambda state, state2: print(state, state2), inputs=[ui_state, selected_voice_presets]) +interface.queue(concurrency_count=5) +interface.launch() diff --git a/utils.py b/utils.py new file mode 100644 index 0000000000000000000000000000000000000000..c7506663873b08d2f7ab7cccc232770e5d38f8a1 --- /dev/null +++ b/utils.py @@ -0,0 +1,67 @@ +import re +import torch +import numpy as np +import yaml +from pathlib import Path + + +#### path related code BEGIN #### +def get_session_path(session_id): + return Path(f'output/sessions/{session_id}') + +def get_system_voice_preset_path(): + return Path('data/voice_presets') + +def get_session_voice_preset_path(session_id): + return Path(f'{get_session_path(session_id)}/voice_presets') + +def get_session_audio_path(session_id): + return Path(f'{get_session_path(session_id)}/audio') + +def 
rescale_to_match_energy(segment1, segment2): + ratio = get_energy_ratio(segment1, segment2) + recaled_segment1 = segment1 / ratio + return recaled_segment1.numpy() +#### path related code END #### + +def text_to_abbrev_prompt(input_text): + return re.sub(r'[^a-zA-Z_]', '', '_'.join(input_text.split()[:5])) + +def get_energy(x): + return np.mean(x ** 2) + + +def get_energy_ratio(segment1, segment2): + energy1 = get_energy(segment1) + energy2 = max(get_energy(segment2), 1e-10) + ratio = (energy1 / energy2) ** 0.5 + ratio = torch.tensor(ratio) + ratio = torch.clamp(ratio, 0.02, 50) + return ratio + +def fade(audio_data, fade_duration=2, sr=32000): + audio_duration = audio_data.shape[0] / sr + + # automated choose fade duration + if audio_duration >=8: + # keep fade_duration 2 + pass + else: + fade_duration = audio_duration / 5 + + fade_sampels = int(sr * fade_duration) + fade_in = np.linspace(0, 1, fade_sampels) + fade_out = np.linspace(1, 0, fade_sampels) + + audio_data_fade_in = audio_data[:fade_sampels] * fade_in + audio_data_fade_out = audio_data[-fade_sampels:] * fade_out + + audio_data_faded = np.concatenate((audio_data_fade_in, audio_data[len(fade_in):-len(fade_out)], audio_data_fade_out)) + return audio_data_faded + +def get_key(config='config.yaml'): + with open('config.yaml', 'r') as file: + config = yaml.safe_load(file) + openai_key = config['OpenAI-Key'] + return openai_key + diff --git a/voice_presets.py b/voice_presets.py new file mode 100644 index 0000000000000000000000000000000000000000..edcd4b9c9c7b6cfea7c01e58a8a1a954c1b333b1 --- /dev/null +++ b/voice_presets.py @@ -0,0 +1,96 @@ +import os +import json, json5 +from pathlib import Path + +import utils +from APIs import VP + + +def save_voice_presets_metadata(voice_presets_path, metadata): + with open(voice_presets_path / 'metadata.json', 'w') as f: + json.dump(metadata, f, indent=4) + +def load_voice_presets_metadata(voice_presets_path, safe_if_metadata_not_exist=False): + metadata_full_path = 
voice_presets_path / 'metadata.json' + + if safe_if_metadata_not_exist: + if not os.path.exists(metadata_full_path): + return {} + + with open(metadata_full_path, 'r') as f: + presets = json5.load(f) + + return presets + +# return system voice presets and session voice presets individually, each in a list +def get_voice_presets(session_id): + system_presets, session_presets = [], [] + + # Load system presets + system_presets = load_voice_presets_metadata(utils.get_system_voice_preset_path()) + + # Load session presets + session_presets = load_voice_presets_metadata( + utils.get_session_voice_preset_path(session_id), + safe_if_metadata_not_exist=True + ) + + return system_presets, session_presets + +# return merged voice presets in a {voice_preset_name: voice_preset} dict +def get_merged_voice_presets(session_id): + system_presets, session_presets = get_voice_presets(session_id) + res = {} + for preset in list(system_presets.values()) + list(session_presets.values()): + res[preset['id']] = preset # session presets with the same id will cover that of system presets + return res + +def add_voice_preset(voice_presets_path, presets, id, desc, wav_file_path): + if id in presets: + raise KeyError(f'{id} already in voice preset, path={voice_presets_path}!') + + # Convert wav to npz + npz_path = voice_presets_path / 'npz' + VP(wav_file_path, npz_path) + npz_file_path = npz_path / f'{Path(wav_file_path).stem}.npz' + + presets[id] = { + 'id': id, + 'desc': desc, + 'npz_path': str(npz_file_path) + } + save_voice_presets_metadata(voice_presets_path, presets) + return presets[id] + +def add_session_voice_preset(id, desc, wav_file_path, session_id): + voice_presets_path = utils.get_session_voice_preset_path(session_id) + os.makedirs(voice_presets_path / 'npz', exist_ok=True) + presets = load_voice_presets_metadata(voice_presets_path, safe_if_metadata_not_exist=True) + if len(presets) >= 3: + raise ValueError(f'session voice presets size exceed 3') + if id in presets: + raise 
KeyError(f'{id} already in voice preset, path={voice_presets_path}!') + + return add_voice_preset(voice_presets_path, presets, id, desc, wav_file_path) + +def add_system_voice_preset(id, desc, wav_file_path): + voice_presets_path = utils.get_system_voice_preset_path() + presets = load_voice_presets_metadata(voice_presets_path) + return add_voice_preset(voice_presets_path, presets, id, desc, wav_file_path) + +# if session_id set to '', we are removing system voice presets +def remove_session_voice_preset(id, session_id): + voice_presets_path = utils.get_session_voice_preset_path(session_id) + presets = load_voice_presets_metadata( + voice_presets_path, + safe_if_metadata_not_exist=True + ) + preset = presets.pop(id) + npz_path = preset['npz_path'] + + try: + os.remove(npz_path) + except FileNotFoundError: + print(f"INFO: trying to delete {npz_path} which does not exist, path={voice_presets_path}.") + + save_voice_presets_metadata(voice_presets_path, presets) \ No newline at end of file diff --git a/wavjourney_cli.py b/wavjourney_cli.py new file mode 100644 index 0000000000000000000000000000000000000000..f778c490d4dd3e5c63293263839518a7e1c6b08e --- /dev/null +++ b/wavjourney_cli.py @@ -0,0 +1,23 @@ +import time +import argparse + +import pipeline + +parser = argparse.ArgumentParser() +parser.add_argument('-f', '--full', action='store_true', help='Go through the full process') +parser.add_argument('--input-text', type=str, default='', help='input text or text file') +parser.add_argument('--session-id', type=str, default='', help='session id, if set to empty, system will allocate an id') +args = parser.parse_args() + +if args.full: + input_text = args.input_text + + start_time = time.time() + session_id = pipeline.init_session(args.session_id) + + print(f"Session {session_id} is created.") + + pipeline.full_steps(session_id, input_text) + end_time = time.time() + + print(f"WavJourney took {end_time - start_time:.2f} seconds to complete.") diff --git a/webapp/app.prompt 
b/webapp/app.prompt new file mode 100644 index 0000000000000000000000000000000000000000..506494290194a33a2b148bf95dd9a1b82a15b388 --- /dev/null +++ b/webapp/app.prompt @@ -0,0 +1,18 @@ +write a web app in python and flask and bootstrap. + +The UI: +- input textbox named "InputTextbox" on top +- "Generate All" button named GenerateAllButton on the same row as InputTextbox +- a button "Text -> Script" called TextToScriptButton +- A big textbox named "ScriptTextbox". The textbox should be set to wrap-word mode +- A split line +- a button "Script -> HAML" called ScriptToHAMLButton +- A big textbox named "HAMLTextbox". The textbox should be set to wrap-word mode and display text with HTML syntax format. +- A split line +- a button "HAML -> Python Code" called HAMLToPythonCodeButton +- A big textbox named "PythonCodeTextbox". The textbox should be set to wrap-word mode and display text with python syntax format. + +Behaviors: +- When the user clicks TextToScriptButton, it will call the ChatGPT API: concatenate the prompt read from "prompts/text_to_audio_script.prompt" with the content from InputTextbox, send it to ChatGPT, and output ChatGPT's response to ScriptTextbox. +- When the user clicks ScriptToHAMLButton, it will call the ChatGPT API: concatenate the prompt read from "prompts/audio_script_to_HAML.prompt" with the content from ScriptTextbox, send it to ChatGPT, and output ChatGPT's response to HAMLTextbox. +- When the user clicks HAMLToPythonCodeButton, it will get the content of HAMLTextbox and pipe the content to the python script convert_haml_to_py_code.py, and return the script's output to PythonCodeTextbox. 
diff --git a/webapp/app.py b/webapp/app.py new file mode 100644 index 0000000000000000000000000000000000000000..6c82cd7d204e52a2e1f1157a8405febe2ff6b75f --- /dev/null +++ b/webapp/app.py @@ -0,0 +1,43 @@ +from flask import Flask, request, render_template +import os +import subprocess + +app = Flask(__name__) + +def call_chatgpt(prompt_file, input_text): + # Your actual function to call the ChatGPT API will go here + # For now, return a placeholder string + with open(prompt_file, 'r') as file: + prompt = file.read() + return f"Prompt: {prompt}\nInput: {input_text}" + +def call_convert_script(input_text): + # Your actual function to call the script will go here + # For now, return a placeholder string + # Run the script and capture the output + process = subprocess.Popen(['python', '../convert_haml_to_py_code.py'], stdin=subprocess.PIPE, stdout=subprocess.PIPE, stderr=subprocess.PIPE, universal_newlines=True) + output, error = process.communicate(input=input_text) + return output + error + +@app.route('/', methods=['GET', 'POST']) +def index(): + if request.method == 'POST': + input_text = request.form.get('InputTextbox', '') + script_text = request.form.get('ScriptTextbox', '') + haml_text = request.form.get('HAMLTextbox', '') + python_code_text = request.form.get('PythonCodeTextbox', '') + if 'TextToScriptButton' in request.form: + script_text = call_chatgpt('../prompts/text_to_audio_script.prompt', input_text) + + elif 'ScriptToHAMLButton' in request.form: + haml_text = call_chatgpt('../prompts/audio_script_to_HAML.prompt', script_text) + + elif 'HAMLToPythonCodeButton' in request.form: + python_code_text = call_convert_script(haml_text) + + return render_template('index.html', haml_text=haml_text, python_code_text=python_code_text, script_text=script_text, input_text=input_text) + + return render_template('index.html') + +if __name__ == '__main__': + app.run(debug=True) diff --git a/webapp/templates/index.html b/webapp/templates/index.html new file mode 100644 
index 0000000000000000000000000000000000000000..d054580f9f13a21e44a5253aa1228023d8dbc137 --- /dev/null +++ b/webapp/templates/index.html @@ -0,0 +1,30 @@ + + +
+ + + +