# Mostly from: https://github.com/adefossez/seewav
# Original author: adefossez
import math
import subprocess
import tempfile
from pathlib import Path

import cairo
import gradio as gr
import numpy as np
from pydub import AudioSegment


def read_audio(audio, seek=None, duration=None):
    """
    Read the `audio` file, starting at `seek` (or 0) seconds for `duration`
    (or all) seconds.

    Returns a tuple `(wav, samplerate)` where `wav` is `float[channels, samples]`.
    """
    audio_segment = AudioSegment.from_file(audio)
    channels = audio_segment.channels
    samplerate = audio_segment.frame_rate

    # pydub segments are sliced in milliseconds.
    if seek is not None:
        audio_segment = audio_segment[int(seek * 1000):]
    if duration is not None:
        audio_segment = audio_segment[:int(duration * 1000)]

    samples = audio_segment.get_array_of_samples()
    wav = np.array(samples, dtype=np.float32)
    # pydub returns multi-channel samples interleaved (L0, R0, L1, R1, ...),
    # so group them per frame first and then transpose to [channels, samples].
    return wav.reshape(-1, channels).T, samplerate


def sigmoid(x):
    """Return the logistic sigmoid of `x` (scalar or numpy array)."""
    return 1 / (1 + np.exp(-x))


def envelope(wav, window, stride):
    """
    Extract the envelope of the waveform `wav` (float[samples]), using average
    pooling with `window` samples and the given `stride`.

    `stride` must be at least 1. Returns a float array with one value per frame.
    """
    wav = np.pad(wav, window // 2)
    # Average of the positive part of each window.
    out = np.array([
        np.maximum(wav[off: off + window], 0).mean()
        for off in range(0, len(wav) - window, stride)
    ])
    # Some form of audio compressor based on the sigmoid.
    return 1.9 * (sigmoid(2.5 * out) - 0.5)


def draw_env(envs, out, fg_colors, bg_color, size):
    """
    Internal function, draw a single frame (two frames for stereo) using cairo
    and save it to the `out` file as png.

    `envs` is a list of envelopes over channels, each env is a float[bars]
    representing the height of the envelope to draw. Each entry will be
    represented by a bar. `fg_colors` holds one rgb color per envelope,
    `bg_color` is the rgb background color and `size` is `(width, height)`
    in pixels.
    """
    surface = cairo.ImageSurface(cairo.FORMAT_ARGB32, *size)
    ctx = cairo.Context(surface)
    # Work in a normalized [0, 1] x [0, 1] coordinate system.
    ctx.scale(*size)

    ctx.set_source_rgb(*bg_color)
    ctx.rectangle(0, 0, 1, 1)
    ctx.fill()

    K = len(envs)  # Number of waves to draw (waves are stacked vertically)
    T = len(envs[0])  # Number of time steps
    pad_ratio = 0.1  # spacing ratio between 2 bars
    width = 1.0 / (T * (1 + 2 * pad_ratio))
    pad = pad_ratio * width
    delta = 2 * pad + width

    ctx.set_line_width(width)
    for step in range(T):
        x = pad + step * delta
        for i in range(K):
            half = 0.5 * envs[i][step]  # (semi-)height of the bar
            half /= K  # as we stack K waves vertically
            midrule = (1 + 2 * i) / (2 * K)  # midrule of i-th wave

            # Upper half of the bar, fully opaque.
            ctx.set_source_rgb(*fg_colors[i])
            ctx.move_to(x, midrule - half)
            ctx.line_to(x, midrule)
            ctx.stroke()

            # Lower half, slightly shorter and translucent.
            ctx.set_source_rgba(*fg_colors[i], 0.8)
            ctx.move_to(x, midrule)
            ctx.line_to(x, midrule + 0.9 * half)
            ctx.stroke()

    surface.write_to_png(out)


def interpole(x1, y1, x2, y2, x):
    """Linearly interpolate at `x` on the line through (x1, y1) and (x2, y2)."""
    return y1 + (y2 - y1) * (x - x1) / (x2 - x1)


def visualize(
    progress,
    audio,
    tmp,
    out,
    seek=None,
    duration=None,
    rate=60,
    bars=50,
    speed=4,
    time=0.4,
    oversample=3,
    fg_color=(0.2, 0.2, 0.2),
    fg_color2=(0.5, 0.3, 0.6),
    bg_color=(1, 1, 1),
    size=(400, 400),
    stereo=False,
):
    """
    Generate the visualisation for the `audio` file, using a `tmp` folder and
    saving the final video in `out`. Returns `out`.

    `progress` is a callable wrapping an iterable to report progress
    (e.g. `gr.Progress().tqdm`).
    `seek` and `duration` give the extract location if any.
    `rate` is the framerate of the output video.
    `bars` is the number of bars in the animation.
    `speed` is the base speed of transition. Depending on volume, actual speed
    will vary between 0.5 and 2 times it.
    `time` amount of audio shown at once on a frame.
    `oversample` higher values will lead to more frequent changes.
    `fg_color` is the rgb color to use for the foreground.
    `fg_color2` is the rgb color to use for the second wav if stereo is set.
    `bg_color` is the rgb color to use for the background.
    `size` is the `(width, height)` in pixels to generate.
    `stereo` is whether to create 2 waves.

    Raises `gr.Error` when the audio cannot be read, when `stereo` is requested
    for non-stereo audio, or when the audio is too short to produce a frame.
    """
    # UI sliders may deliver floats; slicing, cairo and ffmpeg need integers.
    bars = int(bars)
    size = (int(size[0]), int(size[1]))
    # ffmpeg runs with `cwd=tmp`, so a relative audio path must be made absolute.
    audio_path = str(Path(audio).resolve())

    try:
        wav, sr = read_audio(audio, seek=seek, duration=duration)
    except (IOError, ValueError) as err:
        raise gr.Error(str(err)) from err

    # wavs is a list of wav over channels
    if stereo:
        if wav.shape[0] != 2:
            raise gr.Error("stereo requires stereo audio file")
        wavs = [wav[0], wav[1]]
    else:
        wavs = [wav.mean(0)]

    for i, channel in enumerate(wavs):
        std = channel.std()
        # Silent (or empty) audio has no positive deviation; skip normalisation
        # instead of filling the signal with NaN.
        if std > 0:
            wavs[i] = channel / std

    # Clamp to 1 so that many bars / low sample rates cannot yield a zero stride.
    window = max(1, int(sr * time / bars))
    stride = max(1, int(window / oversample))

    # envs is a list of env over channels
    envs = []
    for channel in wavs:
        env = envelope(channel, window, stride)
        # Pad so that the two `bars`-wide slices read below always exist.
        env = np.pad(env, (bars // 2, 2 * bars))
        envs.append(env)

    audio_seconds = len(wavs[0]) / sr
    frames = int(rate * audio_seconds)
    if frames < 1:
        raise gr.Error("Audio is too short to visualize.")
    smooth = np.hanning(bars)

    gr.Info("Generating the frames...")
    for idx in progress(range(frames)):
        # Position in the envelope, measured in blocks of `bars` entries.
        pos = (((idx / rate)) * sr) / stride / bars
        off = int(pos)
        loc = pos - off
        denvs = []
        for env in envs:
            env1 = env[off * bars: (off + 1) * bars]
            env2 = env[(off + 1) * bars: (off + 2) * bars]

            # we want loud parts to be updated faster
            maxvol = math.log10(1e-4 + env2.max()) * 10
            speedup = np.clip(interpole(-6, 0.5, 0, 2, maxvol), 0.5, 2)
            w = sigmoid(speed * speedup * (loc - 0.5))
            denv = (1 - w) * env1 + w * env2
            denv *= smooth
            denvs.append(denv)
        draw_env(denvs, tmp / f"{idx:06d}.png", (fg_color, fg_color2), bg_color, size)

    gr.Info("Encoding the animation video...")
    subprocess.run(
        [
            "ffmpeg", "-y", "-loglevel", "panic",
            "-r", str(rate),
            "-f", "image2",
            "-s", f"{size[0]}x{size[1]}",
            "-i", "%06d.png",
            "-i", audio_path,
            "-c:a", "aac",
            "-vcodec", "libx264",
            "-crf", "10",
            "-pix_fmt", "yuv420p",
            str(out.resolve()),
        ],
        check=True,
        cwd=tmp,
    )
    return out


def parse_color(colorstr):
    """
    Given comma separated rgb components, return a 3-tuple of floats.

    Raises `gr.Error` when `colorstr` is not exactly three floats.
    """
    try:
        r, g, b = [float(i) for i in colorstr.split(",")]
    except ValueError as err:
        raise gr.Error(
            "Format for color is 3 floats separated by commas 0.xx,0.xx,0.xx, rgb order"
        ) from err
    return r, g, b


def hex_to_rgb(hex_color):
    """
    Convert a `#rrggbb` (or shorthand `#rgb`) hex color into an `(r, g, b)`
    tuple of floats in the 0..1 range.
    """
    hex_color = hex_color.lstrip('#')
    if len(hex_color) == 3:
        # Expand shorthand notation: "abc" -> "aabbcc".
        hex_color = "".join(digit * 2 for digit in hex_color)
    r = int(hex_color[0:2], 16) / 255.0
    g = int(hex_color[2:4], 16) / 255.0
    b = int(hex_color[4:6], 16) / 255.0
    return (r, g, b)


def do_viz(
    inp_aud,
    inp_bgcolor,
    inp_color1,
    inp_nbars,
    inp_vidw,
    inp_vidh,
    progress=gr.Progress(),
):
    """
    Gradio callback: render the audio file `inp_aud` into an mp4 and return
    the path of the generated video.

    `inp_bgcolor` and `inp_color1` are hex color strings, `inp_nbars` is the
    number of bars and `inp_vidw` / `inp_vidh` the video size in pixels.
    Raises `gr.Error` when no audio file was provided.
    """
    if not inp_aud:
        raise gr.Error("Please provide an audio file.")

    # Reserve the output path, then close the handle before ffmpeg overwrites
    # the file. It is not deleted because it is the return value.
    with tempfile.NamedTemporaryFile(suffix=".mp4", delete=False) as out:
        out_path = Path(out.name)

    # The frames directory is removed as soon as the video has been encoded.
    with tempfile.TemporaryDirectory() as tmp:
        return visualize(
            progress.tqdm,
            inp_aud,
            Path(tmp),
            out_path,
            bars=int(inp_nbars),
            fg_color=hex_to_rgb(inp_color1),
            bg_color=hex_to_rgb(inp_bgcolor),
            size=(int(inp_vidw), int(inp_vidh)),
        )


ABOUT = """
# seewav GUI

> Have an audio clip but need a video (e.g. for X/Twitter)? **Convert audio into a video!**

An online graphical user interface for [seewav](https://github.com/adefossez/seewav).
"""

with gr.Blocks() as demo:
    gr.Markdown(ABOUT)
    with gr.Row():
        # Left column: inputs and settings.
        with gr.Column():
            inp_aud = gr.Audio(type='filepath')
            with gr.Group():
                inp_color1 = gr.ColorPicker(
                    label="Color",
                    info="Color of the top waveform",
                    value="#00237E",
                    interactive=True,
                )
                inp_bgcolor = gr.ColorPicker(
                    label="Background Color",
                    info="Color of the background",
                    value="#000000",
                    interactive=True,
                )
            with gr.Accordion("Advanced Configuration", open=False):
                inp_nbars = gr.Slider(
                    label="Num. Bars",
                    value=50,
                    interactive=True,
                    minimum=5,
                    maximum=1500,
                )
                inp_vidw = gr.Slider(
                    label="Video Width",
                    value=400,
                    interactive=True,
                    minimum=100,
                    maximum=3000,
                )
                inp_vidh = gr.Slider(
                    label="Video Height",
                    value=400,
                    interactive=True,
                    minimum=100,
                    maximum=3000,
                )
            inp_go = gr.Button("Visualize", variant="primary")
        # Right column: rendered video.
        with gr.Column():
            out_vid = gr.Video(interactive=False)
    inp_go.click(
        do_viz,
        inputs=[
            inp_aud,
            inp_bgcolor,
            inp_color1,
            inp_nbars,
            inp_vidw,
            inp_vidh,
        ],
        outputs=[out_vid],
    )

demo.queue(api_open=False, default_concurrency_limit=20).launch(show_api=False)