How to use from the Transformers library
# Use a pipeline as a high-level helper
from transformers import pipeline

# Single-turn chat prompt in the role/content message format.
messages = [{"role": "user", "content": "Who are you?"}]

pipe = pipeline(task="text-generation", model="yujiepan/qwen2-audio-tiny-random")
pipe(messages)
# Load model directly
from transformers import AutoModelForSeq2SeqLM, AutoProcessor

model = AutoModelForSeq2SeqLM.from_pretrained("yujiepan/qwen2-audio-tiny-random")
processor = AutoProcessor.from_pretrained("yujiepan/qwen2-audio-tiny-random")

# Single-turn chat prompt in the role/content message format.
messages = [{"role": "user", "content": "Who are you?"}]

# Apply the chat template with tokenization enabled, returning PyTorch tensors,
# then move the result to the model's device.
inputs = processor.apply_chat_template(
    messages,
    add_generation_prompt=True,
    tokenize=True,
    return_dict=True,
    return_tensors="pt",
)
inputs = inputs.to(model.device)

outputs = model.generate(**inputs, max_new_tokens=40)

# Decode only the tokens that come after the prompt.
prompt_length = inputs["input_ids"].shape[-1]
print(processor.decode(outputs[0][prompt_length:]))
Quick Links

This model is for debugging. It is randomly initialized using the config from Qwen/Qwen2-Audio-7B-Instruct, but with a smaller size.

Code:

import os
from typing import Dict

import requests
import torch
import transformers
from PIL import Image
from torchvision import io
from transformers import (AutoConfig, AutoModelForCausalLM, AutoProcessor,
                          AutoTokenizer, GenerationConfig,
                          Qwen2AudioForConditionalGeneration, pipeline,
                          set_seed)

# Checkpoint whose config is reused, and the location the tiny copy is written to.
model_id = "Qwen/Qwen2-Audio-7B-Instruct"
repo_id = "yujiepan/qwen2-audio-tiny-random"
save_path = f"/tmp/{repo_id}"

config = AutoConfig.from_pretrained(model_id, trust_remote_code=True)

# Override the size-related fields of the audio sub-config.
audio_overrides = {
    "encoder_layers": 2,
    "encoder_attention_heads": 2,
    "encoder_ffn_dim": 32,
    "d_model": 16,
}
for field_name, field_value in audio_overrides.items():
    setattr(config.audio_config, field_name, field_value)

# Override the size-related fields of the text sub-config.
text_overrides = {
    "num_hidden_layers": 2,
    "intermediate_size": 32,
    "hidden_size": 16,
    "num_attention_heads": 2,
    "num_key_value_heads": 1,
}
for field_name, field_value in text_overrides.items():
    setattr(config.text_config, field_name, field_value)

# Build the model from the modified config (no pretrained weights are loaded),
# then cast to bfloat16, move to the GPU and switch to eval mode.
model = Qwen2AudioForConditionalGeneration(config=config)
model = model.to(torch.bfloat16)
model = model.cuda()
model = model.eval()
model.generation_config = GenerationConfig.from_pretrained(
    model_id,
    trust_remote_code=True,
)

# Seed the RNG, then fill every parameter with uniform values in [-0.3, 0.3].
# Parameters are visited in sorted-name order so the draws happen in a fixed order.
set_seed(42)
named_params = dict(model.named_parameters())
with torch.no_grad():
    for param_name in sorted(named_params):
        torch.nn.init.uniform_(named_params[param_name], -0.3, 0.3)

# Save the model together with the original checkpoint's processor.
processor = AutoProcessor.from_pretrained(model_id)
model.save_pretrained(save_path)
processor.save_pretrained(save_path)
os.system(f"ls -alh {save_path}")


def try_inference():
    """Reload the checkpoint saved at ``save_path`` and run one generation.

    Builds a three-turn conversation containing two audio clips, downloads
    each clip, feeds the rendered prompt and the waveforms through the
    processor, generates with ``max_length=256`` and prints the decoded
    continuation (prompt tokens stripped).
    """
    from io import BytesIO
    from urllib.request import urlopen

    import librosa
    processor = AutoProcessor.from_pretrained(save_path)
    model = Qwen2AudioForConditionalGeneration.from_pretrained(
        save_path, device_map="auto")
    conversation = [
        {"role": "user", "content": [
            {"type": "audio", "audio_url": "https://qianwen-res.oss-cn-beijing.aliyuncs.com/Qwen2-Audio/audio/guess_age_gender.wav"},
        ]},
        {"role": "assistant", "content": "Yes, the speaker is female and in her twenties."},
        {"role": "user", "content": [
            {"type": "audio", "audio_url": "https://qianwen-res.oss-cn-beijing.aliyuncs.com/Qwen2-Audio/audio/translate_to_chinese.wav"},
        ]},
    ]
    text = processor.apply_chat_template(
        conversation, add_generation_prompt=True, tokenize=False)

    # Collect the waveform of every audio element, in conversation order.
    sampling_rate = processor.feature_extractor.sampling_rate
    audios = []
    for message in conversation:
        content = message["content"]
        if not isinstance(content, list):
            continue
        for ele in content:
            if ele["type"] != "audio":
                continue
            # Close the HTTP response as soon as the bytes are read.
            with urlopen(ele["audio_url"]) as response:
                raw_audio = response.read()
            waveform, _ = librosa.load(BytesIO(raw_audio), sr=sampling_rate)
            audios.append(waveform)

    inputs = processor(text=text, audios=audios,
                       return_tensors="pt", padding=True)
    # Move every tensor in the batch to the model's device. Assigning to
    # `inputs.input_ids` would only set an attribute on the batch object and
    # leave the tensors unpacked by `**inputs` where they were.
    inputs = inputs.to(model.device)

    generate_ids = model.generate(**inputs, max_length=256)
    # Drop the prompt tokens so only the newly generated part is decoded.
    prompt_length = inputs["input_ids"].size(1)
    generate_ids = generate_ids[:, prompt_length:]

    response = processor.batch_decode(
        generate_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False)[0]
    print(response)


# Run the inference check against the checkpoint saved at save_path.
try_inference()
Downloads last month
30
Safetensors
Model size
5.03M params
Tensor type
BF16
·
Inference Providers NEW
This model isn't deployed by any Inference Provider. 🙋 Ask for provider support

Collection including yujiepan/qwen2-audio-tiny-random