kevinwang676 committed on
Commit
53bed0d
1 Parent(s): 4e3a496

Update app.py

Files changed (1)
  1. app.py +199 -0
app.py CHANGED
@@ -18,6 +18,11 @@ import torch
 import librosa
 import util
 
+import matplotlib.pyplot as plt
+from PIL import Image, ImageDraw, ImageFont
+from moviepy.editor import *
+from moviepy.video.io.VideoFileClip import VideoFileClip
+
 from config import device
 from infer_pack.models import (
     SynthesizerTrnMs256NSFsid,
@@ -99,6 +104,186 @@ for model_name in config_json.get('models'):
     ))
 print(f'Models loaded: {len(loaded_models)}')
 
+def make_bars_image(height_values, index, new_height):
+
+    # Define the size of the image
+    width = 512
+    height = new_height
+
+    # Create a new image with a transparent background
+    image = Image.new('RGBA', (width, height), color=(0, 0, 0, 0))
+
+    # Get the image drawing context
+    draw = ImageDraw.Draw(image)
+
+    # Define the rectangle width and spacing
+    rect_width = 2
+    spacing = 2
+
+    # Define the list of height values for the rectangles
+    #height_values = [20, 40, 60, 80, 100, 80, 60, 40]
+    num_bars = len(height_values)
+    # Calculate the total width of the rectangles and the spacing
+    total_width = num_bars * rect_width + (num_bars - 1) * spacing
+
+    # Calculate the starting position for the first rectangle
+    start_x = int((width - total_width) / 2)
+    # Define the buffer size
+    buffer_size = 80
+    # Draw the rectangles from left to right
+    x = start_x
+    for i, height in enumerate(height_values):
+
+        # Define the rectangle coordinates
+        y0 = buffer_size
+        y1 = height + buffer_size
+        x0 = x
+        x1 = x + rect_width
+
+        # Draw the rectangle
+        draw.rectangle([x0, y0, x1, y1], fill='white')
+
+        # Move to the next rectangle position
+        if i < num_bars - 1:
+            x += rect_width + spacing
+
+
+    # Rotate the image by 180 degrees
+    image = image.rotate(180)
+
+    # Mirror the image
+    image = image.transpose(Image.FLIP_LEFT_RIGHT)
+
+    # Save the image
+    image.save('audio_bars_'+ str(index) + '.png')
+
+    return 'audio_bars_'+ str(index) + '.png'
+
+def db_to_height(db_value):
+    # Scale the dB value to a range between 0 and 1
+    scaled_value = (db_value + 80) / 80
+
+    # Convert the scaled value to a height between 0 and 50
+    height = scaled_value * 50
+
+    return height
+
+def infer(title, audio_in, image_in):
+    # Load the audio file
+    audio_path = audio_in
+    audio_data, sr = librosa.load(audio_path)
+
+    # Get the duration in seconds
+    duration = librosa.get_duration(y=audio_data, sr=sr)
+
+    # Extract the audio data for the desired time
+    start_time = 0 # start time in seconds
+    end_time = duration # end time in seconds
+
+    start_index = int(start_time * sr)
+    end_index = int(end_time * sr)
+
+    audio_data = audio_data[start_index:end_index]
+
+    # Compute the short-time Fourier transform
+    hop_length = 512
+
+
+    stft = librosa.stft(audio_data, hop_length=hop_length)
+    spectrogram = librosa.amplitude_to_db(np.abs(stft), ref=np.max)
+
+    # Get the frequency values
+    freqs = librosa.fft_frequencies(sr=sr, n_fft=stft.shape[0])
+
+    # Select the indices of the frequency values that correspond to the desired frequencies
+    n_freqs = 114
+    freq_indices = np.linspace(0, len(freqs) - 1, n_freqs, dtype=int)
+
+    # Extract the dB values for the desired frequencies
+    db_values = []
+    for i in range(spectrogram.shape[1]):
+        db_values.append(list(zip(freqs[freq_indices], spectrogram[freq_indices, i])))
+
+    # Print the dB values for the first time frame
+    print(db_values[0])
+
+    proportional_values = []
+
+    for frame in db_values:
+        proportional_frame = [db_to_height(db) for f, db in frame]
+        proportional_values.append(proportional_frame)
+
+    print(proportional_values[0])
+    print("AUDIO CHUNK: " + str(len(proportional_values)))
+
+    # Open the background image
+    background_image = Image.open(image_in)
+
+    # Resize the image while keeping its aspect ratio
+    bg_width, bg_height = background_image.size
+    aspect_ratio = bg_width / bg_height
+    new_width = 512
+    new_height = int(new_width / aspect_ratio)
+    resized_bg = background_image.resize((new_width, new_height))
+
+    # Apply black cache for better visibility of the white text
+    bg_cache = Image.open('black_cache.png')
+    resized_bg.paste(bg_cache, (0, resized_bg.height - bg_cache.height), mask=bg_cache)
+
+    # Create a new ImageDraw object
+    draw = ImageDraw.Draw(resized_bg)
+
+    # Define the text to be added
+    text = title
+    font = ImageFont.truetype("Lato-Regular.ttf", 16)
+    text_color = (255, 255, 255) # white color
+
+    # Calculate the position of the text
+    text_width, text_height = draw.textsize(text, font=font)
+    x = 30
+    y = new_height - 70
+
+    # Draw the text on the image
+    draw.text((x, y), text, fill=text_color, font=font)
+
+    # Save the resized image
+    resized_bg.save('resized_background.jpg')
+
+    generated_frames = []
+    for i, frame in enumerate(proportional_values):
+        bars_img = make_bars_image(frame, i, new_height)
+        bars_img = Image.open(bars_img)
+        # Paste the audio bars image on top of the background image
+        fresh_bg = Image.open('resized_background.jpg')
+        fresh_bg.paste(bars_img, (0, 0), mask=bars_img)
+        # Save the image
+        fresh_bg.save('audio_bars_with_bg' + str(i) + '.jpg')
+        generated_frames.append('audio_bars_with_bg' + str(i) + '.jpg')
+    print(generated_frames)
+
+    # Create a video clip from the images
+    clip = ImageSequenceClip(generated_frames, fps=len(generated_frames)/(end_time-start_time))
+    audio_clip = AudioFileClip(audio_in)
+    clip = clip.set_audio(audio_clip)
+    # Set the output codec
+    codec = 'libx264'
+    audio_codec = 'aac'
+    # Save the video to a file
+    clip.write_videofile("my_video.mp4", codec=codec, audio_codec=audio_codec)
+
+    retimed_clip = VideoFileClip("my_video.mp4")
+
+    # Set the desired frame rate
+    new_fps = 25
+
+    # Create a new clip with the new frame rate
+    new_clip = retimed_clip.set_fps(new_fps)
+
+    # Save the new clip as a new video file
+    new_clip.write_videofile("my_video_retimed.mp4", codec=codec, audio_codec=audio_codec)
+
+    return "my_video_retimed.mp4"
+
 # Command line test
 def command_line_test():
     command = "df -h /home/user/app"
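As a quick reference for the scaling in db_to_height above: librosa.amplitude_to_db with ref=np.max clips values to roughly -80 dB (floor) through 0 dB (peak), which the helper maps onto bar heights of 0 to 50 pixels. A small sanity check, assuming it is run in the same module:

    # (db + 80) / 80 * 50  =>  -80 dB -> 0 px, -40 dB -> 25 px, 0 dB -> 50 px
    assert db_to_height(-80) == 0
    assert db_to_height(-40) == 25
    assert db_to_height(0) == 50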
 
@@ -408,6 +593,20 @@ with gr.Blocks(theme=theme, css=css) as App:
         new_song = gr.Audio(label="Full song", type="filepath")
         vc_audio_submit.click(fn=voice_changer, inputs=[vc_audio_input, vc_model_index, vc_pitch_adjust, vc_f0_method, vc_feat_ratio], outputs=[vc_audio_output, vc_audio_message], show_progress=True, queue=True)
         full_song.click(fn=mix, inputs=[vc_audio_output, as_audio_no_vocals], outputs=[new_song])
+
+
+    with gr.Tab("📺 - 音乐视频"):  # "Music video" tab
+        with gr.Row():
+            with gr.Column():
+                inp1 = gr.Textbox(label="为视频配上精彩的文案吧(选填)")  # "Add a caption for your video (optional)"
+                inp2 = new_song
+                inp3 = gr.Image(source='upload', type='filepath', label="上传一张背景图片吧")  # "Upload a background image"
+                btn = gr.Button("生成您的专属音乐视频吧")  # "Generate your own music video"
+
+            with gr.Column():
+                out1 = gr.Video(label='您的专属音乐视频')  # "Your music video"
+                btn.click(fn=infer, inputs=[inp1, inp2, inp3], outputs=[out1])
+
     gr.Markdown("### <center>注意❗:请不要生成会对个人以及组织造成侵害的内容,此程序仅供科研、学习及个人娱乐使用。</center>")
     gr.HTML('''
     <div class="footer">
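A minimal way to exercise the new video path outside the Gradio tab is sketched below; the audio and image file names are placeholders, and it assumes black_cache.png and Lato-Regular.ttf are available in the working directory, as infer expects:

    # Hypothetical smoke test for infer(); file names below are placeholders.
    if __name__ == "__main__":
        video_path = infer("My song", "song.mp3", "cover.jpg")
        print("Video written to:", video_path)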