Spaces:
Running
Running
yuantao-infini-ai
committed on
Commit
•
72e6273
1
Parent(s):
40b01dd
Init commit
Browse files- README.md +1 -1
- app.py +289 -0
- requirements.txt +1 -0
README.md
CHANGED
@@ -4,7 +4,7 @@ emoji: 🐠
|
|
4 |
colorFrom: red
|
5 |
colorTo: blue
|
6 |
sdk: gradio
|
7 |
-
sdk_version: 5.
|
8 |
app_file: app.py
|
9 |
pinned: false
|
10 |
license: apache-2.0
|
|
|
4 |
colorFrom: red
|
5 |
colorTo: blue
|
6 |
sdk: gradio
|
7 |
+
sdk_version: 5.3.0
|
8 |
app_file: app.py
|
9 |
pinned: false
|
10 |
license: apache-2.0
|
app.py
ADDED
@@ -0,0 +1,289 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
# -*- encoding: utf-8 -*-
|
2 |
+
# File: app.py
|
3 |
+
# Description: None
|
4 |
+
|
5 |
+
|
6 |
+
from copy import deepcopy
|
7 |
+
from typing import Dict, List
|
8 |
+
from PIL import Image
|
9 |
+
import io
|
10 |
+
import subprocess
|
11 |
+
import requests
|
12 |
+
import json
|
13 |
+
import base64
|
14 |
+
import gradio as gr
|
15 |
+
import librosa
|
16 |
+
|
17 |
+
|
18 |
+
# Recognized media extensions (lowercase, matched with str.endswith).
IMAGE_EXTENSIONS = (".jpg", ".jpeg", ".png", ".bmp", ".tiff", ".webp")
VIDEO_EXTENSIONS = (".mp4", ".mkv", ".mov", ".avi", ".flv", ".wmv", ".webm", ".m4v")
# Fixed: "flac" was missing its leading dot, so any filename merely ending in
# the letters "flac" (e.g. "x.notflac") was accepted as audio.
AUDIO_EXTENSIONS = (".mp3", ".wav", ".flac", ".m4a")

# Default generation parameters used to initialise the sampling sliders.
DEFAULT_SAMPLING_PARAMS = {
    "top_p": 0.8,
    "top_k": 100,
    "temperature": 0.7,
    "do_sample": True,
    "num_beams": 1,
    "repetition_penalty": 1.2,
}
# Upper bound (and default) for generated tokens per response.
MAX_NEW_TOKENS = 1024
|
31 |
+
|
32 |
+
|
33 |
+
|
34 |
+
def load_image_to_base64(image_path):
    """Read an image file, force RGB mode, and return it as a base64-encoded PNG string."""
    buffer = io.BytesIO()
    with Image.open(image_path) as img:
        # PNG re-encode requires a consistent mode; convert anything non-RGB.
        rgb = img if img.mode == 'RGB' else img.convert('RGB')
        rgb.save(buffer, format='PNG')
    return base64.b64encode(buffer.getvalue()).decode('utf-8')
|
43 |
+
|
44 |
+
def wav_to_bytes_with_ffmpeg(wav_file_path):
    """Transcode an audio file to WAV via ffmpeg and return it base64-encoded.

    Raises:
        RuntimeError: if ffmpeg exits non-zero. Previously errors were
        swallowed (stderr discarded, return code unchecked) and the caller
        received base64 of empty/partial output.
    """
    process = subprocess.Popen(
        ['ffmpeg', '-i', wav_file_path, '-f', 'wav', '-'],
        stdout=subprocess.PIPE,
        stderr=subprocess.PIPE
    )
    out, err = process.communicate()
    if process.returncode != 0:
        raise RuntimeError(
            f"ffmpeg failed ({process.returncode}) for {wav_file_path}: "
            f"{err.decode('utf-8', 'replace')}"
        )
    return base64.b64encode(out).decode('utf-8')
|
52 |
+
|
53 |
+
def parse_sse_response(response):
    """Yield the 'text' field of each SSE data event until the '[DONE]' sentinel."""
    for raw in response.iter_lines():
        if not raw:
            continue  # keep-alive / separator lines carry no event
        decoded = raw.decode('utf-8')
        if not decoded.startswith('data: '):
            continue
        payload = decoded[len('data: '):]
        if payload == '[DONE]':
            break
        try:
            event = json.loads(payload)
            yield event['text']
        except json.JSONDecodeError:
            raise gr.Error(f"Failed to parse JSON: {payload}")
|
66 |
+
|
67 |
+
def history2messages(history: List[Dict]) -> List[Dict]:
    """Collapse gradio chat history into API chat messages.

    Consecutive user items (text / image / audio) are merged into one user
    message whose "content" dict carries a key per modality; assistant items
    are copied through unchanged. A trailing unanswered user message is kept.
    """
    messages = []
    pending = dict()
    for entry in history:
        if entry["role"] == "assistant":
            # Flush the accumulated user message before the assistant turn.
            if pending:
                messages.append(deepcopy(pending))
                pending = dict()
            messages.append(deepcopy(entry))
            continue

        pending.setdefault("role", "user")
        pending.setdefault("content", dict())

        # Items appended by check_messages carry a metadata title for media;
        # plain text items may lack the key entirely.
        entry.setdefault("metadata", {"title": None})
        title = entry["metadata"]["title"]
        if title is None:
            pending["content"]["text"] = entry["content"]
        elif title == "image":
            pending["content"]["image"] = load_image_to_base64(entry["content"][0])
        elif title == "audio":
            pending["content"]["audio"] = wav_to_bytes_with_ffmpeg(entry["content"][0])
    if pending:
        messages.append(pending)
    return messages
|
97 |
+
|
98 |
+
def check_messages(history, message, audio):
    """Validate pending user input and append it to the chat history.

    Accepts at most one image and one audio source (file upload OR microphone
    recording, not both). Audio/video must be 0 < duration <= 30 seconds.

    Args:
        history: gradio messages-format chat history (mutated in place).
        message: MultimodalTextbox value ({"text": ..., "files": [...]}).
        audio: filepath from the gr.Audio widget, or None.

    Returns:
        (history, cleared non-interactive input box, cleared audio widget).

    Raises:
        gr.Error: on empty input, unsupported file type, or too many files.
    """
    has_text = message["text"] and message["text"].strip()
    has_files = len(message["files"]) > 0
    has_audio = audio is not None

    if not (has_text or has_files or has_audio):
        raise gr.Error("请输入文字或上传音频/图片后再发送。")

    audios = []
    images = []

    for file_msg in message["files"]:
        if file_msg.endswith(AUDIO_EXTENSIONS) or file_msg.endswith(VIDEO_EXTENSIONS):
            # NOTE(review): `filename` kwarg is deprecated in librosa >= 0.10
            # (renamed to `path`) — confirm against the installed version.
            duration = librosa.get_duration(filename=file_msg)
            if duration > 30:
                raise gr.Error("音频时长不能超过30秒。")
            if duration == 0:
                raise gr.Error("音频时长不能为0秒。")
            audios.append(file_msg)
        elif file_msg.endswith(IMAGE_EXTENSIONS):
            images.append(file_msg)
        else:
            filename = file_msg.split("/")[-1]
            # Fixed: the message printed the literal "(unknown)" instead of
            # interpolating the offending file name.
            raise gr.Error(f"Unsupported file type: {filename}. It should be an image or audio file.")

    if len(audios) > 1:
        raise gr.Error("Please upload only one audio file.")

    if len(images) > 1:
        raise gr.Error("Please upload only one image file.")

    if audio is not None:
        if len(audios) > 0:
            raise gr.Error("Please upload only one audio file or record audio.")
        audios.append(audio)

    # Append media before text so history2messages merges them into a single
    # user message.
    for image in images:
        history.append({"role": "user", "content": (image,), "metadata": {"title": "image"}})

    for audio in audios:
        history.append({"role": "user", "content": (audio,), "metadata": {"title": "audio"}})

    if message["text"]:
        history.append({"role": "user", "content": message["text"]})

    return history, gr.MultimodalTextbox(value=None, interactive=False), None
|
145 |
+
|
146 |
+
def bot(
    history: list,
    top_p: float,
    top_k: int,
    temperature: float,
    repetition_penalty: float,
    max_new_tokens: int = MAX_NEW_TOKENS,
    regenerate: bool = False,
):
    """Stream an assistant reply for `history` from the remote chat endpoint.

    Yields the history with the (growing) assistant message appended so
    gradio renders tokens as they arrive. When `regenerate` is True the last
    history entry is dropped first and the reply is produced again.
    """
    if history and regenerate:
        history = history[:-1]

    if not history:
        return history

    msgs = history2messages(history)

    # NOTE(review): hard-coded inference endpoint — consider reading it from
    # an environment variable instead of baking an IP into the code.
    API_URL = "http://8.152.0.142:8000/v1/chat"

    payload = {
        "messages": msgs,
        "sampling_params": {
            "top_p": top_p,
            "top_k": top_k,
            "temperature": temperature,
            "repetition_penalty": repetition_penalty,
            "max_new_tokens": max_new_tokens
        }
    }

    response = requests.post(
        API_URL,
        json=payload,
        headers={'Accept': 'text/event-stream'},
        stream=True
    )
    # Fail fast on HTTP errors instead of feeding an error page to the SSE
    # parser (which would raise a confusing JSON-parse gr.Error).
    response.raise_for_status()

    response_text = ""
    for text in parse_sse_response(response):
        response_text += text
        yield history + [{"role": "assistant", "content": response_text}]
|
190 |
+
|
191 |
+
def change_state(state):
    """Flip a boolean visibility state; return the component update and the new value."""
    toggled = not state
    return gr.update(visible=toggled), toggled
|
193 |
+
|
194 |
+
def reset_user_input():
    """Return an update that clears the text input component."""
    cleared = gr.update(value="")
    return cleared
|
196 |
+
|
197 |
+
if __name__ == "__main__":
    # Build and launch the gradio demo UI.
    with gr.Blocks(theme=gr.themes.Soft()) as demo:
        gr.Markdown(
            f"""
# 🪐 Chat with <a href="https://github.com/infinigence/Infini-Megrez-Omni">Megrez-3B-Omni</a>
"""
        )
        # Main conversation pane (messages format, matching the dict-based
        # history used by check_messages/bot).
        # NOTE(review): `bubble_full_width` is deprecated in recent gradio
        # releases — confirm against the pinned sdk_version.
        chatbot = gr.Chatbot(elem_id="chatbot", bubble_full_width=False, type="messages", height='48vh')

        # Tracks whether the sampling-parameter panel is currently shown.
        sampling_params_group_hidden_state = gr.State(False)


        with gr.Row(equal_height=True):
            # Combined text + file prompt box; files restricted to image/audio.
            chat_input = gr.MultimodalTextbox(
                file_count="multiple",
                placeholder="Enter your prompt or upload image/audio here, then press ENTER...",
                show_label=False,
                scale=8,
                file_types=["image", "audio"],
                interactive=True,
                # stop_btn=True,
            )
        with gr.Row(equal_height=True):
            # Microphone or uploaded audio, capped at 30 s (matches the
            # duration check in check_messages).
            audio_input = gr.Audio(
                sources=["microphone", "upload"],
                type="filepath",
                scale=1,
                max_length=30
            )
        with gr.Row(equal_height=True):
            with gr.Column(scale=1, min_width=150):
                with gr.Row(equal_height=True):
                    regenerate_btn = gr.Button("Regenerate", variant="primary")
                    # Clears input box, audio widget, and the conversation.
                    clear_btn = gr.ClearButton(
                        [chat_input, audio_input, chatbot],
                    )

        with gr.Row():
            sampling_params_toggle_btn = gr.Button("Sampling Parameters")

        # Collapsible panel holding the generation controls; hidden initially.
        with gr.Group(visible=False) as sampling_params_group:
            with gr.Row():
                temperature = gr.Slider(
                    minimum=0, maximum=1.2, value=DEFAULT_SAMPLING_PARAMS["temperature"], label="Temperature"
                )
                repetition_penalty = gr.Slider(
                    minimum=0,
                    maximum=2,
                    value=DEFAULT_SAMPLING_PARAMS["repetition_penalty"],
                    label="Repetition Penalty",
                )

            with gr.Row():
                top_p = gr.Slider(minimum=0, maximum=1, value=DEFAULT_SAMPLING_PARAMS["top_p"], label="Top-p")
                top_k = gr.Slider(minimum=0, maximum=1000, value=DEFAULT_SAMPLING_PARAMS["top_k"], label="Top-k")

            with gr.Row():
                max_new_tokens = gr.Slider(
                    minimum=1,
                    maximum=MAX_NEW_TOKENS,
                    value=MAX_NEW_TOKENS,
                    label="Max New Tokens",
                    interactive=True,
                )

        # Toggle the panel's visibility and remember the new state.
        sampling_params_toggle_btn.click(
            change_state,
            sampling_params_group_hidden_state,
            [sampling_params_group, sampling_params_group_hidden_state],
        )

        # Submit: validate/append user input, clearing the input widgets and
        # disabling the textbox while the bot responds.
        chat_msg = chat_input.submit(
            check_messages,
            [chatbot, chat_input, audio_input],
            [chatbot, chat_input, audio_input],
        )

        # Then stream the assistant reply into the chatbot.
        bot_msg = chat_msg.then(
            bot,
            inputs=[chatbot, top_p, top_k, temperature, repetition_penalty, max_new_tokens],
            outputs=chatbot,
            api_name="bot_response",
        )

        # Re-enable the input box once streaming finishes.
        bot_msg.then(lambda: gr.MultimodalTextbox(interactive=True), None, [chat_input])

        # Regenerate: re-run bot with regenerate=True (drops the last
        # history entry before generating again).
        regenerate_btn.click(
            bot,
            inputs=[chatbot, top_p, top_k, temperature, repetition_penalty, max_new_tokens, gr.State(True)],
            outputs=chatbot,
        )

    demo.launch()
|
requirements.txt
ADDED
@@ -0,0 +1 @@
|
|
|
|
|
1 |
+
librosa
|