# Spaces:
#
# AiCoderv2
# /
#
# deepseek-ai
#
# Runtime error
#
# File size: 34,951 Bytes
#
# 5b21692

import io
import os

os.environ['VLLM_USE_V1'] = '0'
os.environ['VLLM_WORKER_MULTIPROC_METHOD'] = 'spawn'
from argparse import ArgumentParser

import gradio as gr
import gradio.processing_utils as processing_utils
import modelscope_studio.components.antd as antd
import modelscope_studio.components.base as ms
import numpy as np
import soundfile as sf
from gradio_client import utils as client_utils
from qwen_omni_utils import process_mm_info

import base64
import numpy as np
from scipy.io import wavfile  # 使用 scipy 保存 wav 文件，更简单支持 int16

import soundfile as sf
from openai import OpenAI

import base64

import os
import oss2
import json
import time
import subprocess
import numpy as np

# Number of attempts made for a single OSS operation before giving up.
OSS_RETRY = 10
# Intended pause between OSS retry attempts, in seconds.
OSS_RETRY_DELAY = 3
# Bits per PCM sample; divided by 8 to obtain bytes per sample.
WAV_BIT_RATE = 16
# Environment variables are always strings, so coerce to int: the rate is
# multiplied with WAV_BIT_RATE to compute byte offsets, and a str value would
# silently turn that arithmetic into string repetition.
WAV_SAMPLE_RATE = int(os.environ.get("WAV_SAMPLE_RATE", 16000))

# OSS_CONFIG_PATH = "/mnt/workspace/feizi.wx/.oss_config.json"

# OSS connection settings and credentials are read from the environment at import time.
# os.getenv() yields None when the variable is unset ...
endpoint = os.getenv("OSS_ENDPOINT")
region = os.getenv("OSS_REGION")
bucket_name = os.getenv("OSS_BUCKET_NAME")
# ... whereas os.environ[...] raises KeyError, so these three are mandatory.
API_KEY = os.environ['API_KEY']
OSS_ACCESS_KEY_ID = os.environ['OSS_ACCESS_KEY_ID']
OSS_ACCESS_KEY_SECRET = os.environ['OSS_ACCESS_KEY_SECRET']
# Placeholder value; only interpolated into the startup message printed by OSSReader.__init__.
OSS_CONFIG_PATH = {}

class OSSReader:
    """Retrying convenience wrapper around ``oss2.Bucket`` for files and audio.

    Every ``oss://<bucket>/<key>`` path handled by this class is mapped onto the
    object key ``studio-temp/Qwen3-Omni-Demo/<key>`` inside ``<bucket>`` (see
    ``_parse_oss_path``), so callers pass paths relative to that prefix.
    """

    def __init__(self):
        # Build one oss2.Bucket client per supported bucket, keyed by bucket name.
        self.bucket2object = {
            bucket_name: oss2.Bucket(oss2.Auth(OSS_ACCESS_KEY_ID, OSS_ACCESS_KEY_SECRET), endpoint, bucket_name),
        }
        print(f"Loaded OSS config from: {OSS_CONFIG_PATH}\nSupported buckets: {list(self.bucket2object.keys())}")

    def _parse_oss_path(self, oss_path):
        """Split an ``oss://bucket/key`` path into ``(bucket_name, object_key)``.

        The returned object key is always prefixed with
        ``studio-temp/Qwen3-Omni-Demo/``.

        Raises:
            AssertionError: if ``oss_path`` does not start with ``oss://``.
            ValueError: if the path has no ``/`` after the bucket name.
        """
        assert oss_path.startswith("oss://"), f"Invalid oss path {oss_path}"
        bucket_name, object_key = oss_path.split("oss://")[-1].split("/", 1)
        object_key = f"studio-temp/Qwen3-Omni-Demo/{object_key}"
        return bucket_name, object_key

    def _retry_operation(self, func, *args, retries=None, delay=None, **kwargs):
        """Call ``func(*args, **kwargs)``, retrying on any exception.

        Args:
            func: Callable to invoke.
            retries: Maximum number of attempts; ``None`` means ``OSS_RETRY``.
            delay: Seconds to sleep between attempts; ``None`` means
                ``OSS_RETRY_DELAY``.

        Returns:
            Whatever ``func`` returns on its first successful call.

        Raises:
            The exception of the final attempt once all attempts have failed.
        """
        if retries is None:
            retries = OSS_RETRY
        if delay is None:
            # The pause between attempts is OSS_RETRY_DELAY, not the attempt count OSS_RETRY.
            delay = OSS_RETRY_DELAY
        for attempt in range(retries):
            try:
                return func(*args, **kwargs)
            except Exception as e:
                print(f"Retry: {attempt} Error: {str(e)}")
                if attempt == retries - 1:
                    raise  # keep the original traceback
                time.sleep(delay)

    def get_public_url(self, oss_path):
        """Return a signed HTTPS GET URL (600 s expiry) for the object.

        Any ``-internal`` substring is stripped from the signed URL.
        """
        bucket_name, object_key = self._parse_oss_path(oss_path)
        url = self._retry_operation(self.bucket2object[bucket_name].sign_url, 'GET', object_key, 600,
                                    slash_safe=True).replace('http://', 'https://')
        return url.replace("-internal", '')

    def file_exists(self, oss_path):
        """Return whether the object exists on OSS."""
        bucket_name, object_key = self._parse_oss_path(oss_path)
        return self._retry_operation(self.bucket2object[bucket_name].object_exists, object_key)

    def download_file(self, oss_path, local_path):
        """Download the OSS object to ``local_path``."""
        bucket_name, object_key = self._parse_oss_path(oss_path)
        self._retry_operation(self.bucket2object[bucket_name].get_object_to_file, object_key, local_path)

    def upload_file(self, local_path, oss_path, overwrite=True):
        """Upload a local file to OSS.

        Returns:
            True on success; False when the upload failed or when it was skipped
            because ``overwrite`` is False and the object already exists.

        Raises:
            FileNotFoundError: if ``local_path`` does not exist.
        """
        bucket_name, object_key = self._parse_oss_path(oss_path)
        # The local source file must exist.
        if not os.path.exists(local_path):
            raise FileNotFoundError(f"Local file {local_path} does not exist")
        # Only probe the target when overwriting is disallowed.
        if not overwrite and self.file_exists(oss_path):
            print(f"File {oss_path} already exists, skip upload")
            return False
        # Perform the upload; failures are reported through the return value.
        try:
            self._retry_operation(
                self.bucket2object[bucket_name].put_object_from_file,
                object_key,
                local_path
            )
            return True
        except Exception as e:
            print(f"Upload failed: {str(e)}")
            return False

    def upload_audio_from_array(self, data, sample_rate, oss_path, overwrite=True):
        """Encode ``data`` as a mono 16-bit PCM WAV in memory and upload it to OSS.

        ``data`` is clipped to [-1, 1] and scaled by 32767 before the int16 cast.

        Returns:
            True on success; False when the upload failed or was skipped because
            ``overwrite`` is False and the object already exists.
        """
        bucket_name, object_key = self._parse_oss_path(oss_path)

        # Only probe the target when overwriting is disallowed.
        if not overwrite and self.file_exists(oss_path):
            print(f"File {oss_path} already exists, skip upload")
            return False

        try:
            # Build the WAV container in memory with BytesIO.
            import wave
            from io import BytesIO

            byte_io = BytesIO()
            with wave.open(byte_io, 'wb') as wf:
                wf.setnchannels(1)  # mono
                wf.setsampwidth(2)  # 16-bit PCM
                wf.setframerate(sample_rate)
                # Convert float samples to int16 and write them as frames.
                data_int16 = np.clip(data, -1, 1) * 32767
                data_int16 = data_int16.astype(np.int16)
                wf.writeframes(data_int16.tobytes())

            # Upload the encoded bytes to OSS.
            self._retry_operation(
                self.bucket2object[bucket_name].put_object,
                object_key,
                byte_io.getvalue()
            )
            return True
        except Exception as e:
            print(f"Upload failed: {str(e)}")
            return False

    def get_object(self, oss_path):
        """Return the result of the bucket's ``get_object`` call for the object."""
        bucket_name, object_key = self._parse_oss_path(oss_path)
        return self._retry_operation(self.bucket2object[bucket_name].get_object, object_key)

    def read_text_file(self, oss_path):
        """Read the OSS object and decode it as UTF-8 text."""
        bucket_name, object_key = self._parse_oss_path(oss_path)
        result = self._retry_operation(self.bucket2object[bucket_name].get_object, object_key)
        return result.read().decode('utf-8')

    def read_audio_file(self, oss_path):
        """Decode an OSS audio object with ffmpeg.

        Returns:
            ``(samples, sample_rate)`` where ``samples`` is a mono float32 numpy
            array resampled to ``WAV_SAMPLE_RATE``.

        Raises:
            RuntimeError: if ffmpeg exits with a non-zero status.
        """
        bucket_name, object_key = self._parse_oss_path(oss_path)
        result = self._retry_operation(self.bucket2object[bucket_name].get_object, object_key)
        # WAV_SAMPLE_RATE may be a str when it came from the environment.
        sample_rate = int(WAV_SAMPLE_RATE)
        # ffmpeg reads the audio from stdin and writes raw float32 PCM to stdout.
        command = [
            'ffmpeg',
            '-i', '-',  # input from pipe
            '-ar', str(sample_rate),  # output sample rate
            '-ac', '1',  # mono
            '-f', 'f32le',  # raw little-endian float32 output
            '-'  # output to pipe
        ]
        process = subprocess.Popen(
            command,
            stdin=subprocess.PIPE,
            stdout=subprocess.PIPE,
            stderr=subprocess.PIPE
        )
        # Feed the encoded bytes and collect the decoded PCM.
        stdout_data, stderr_data = process.communicate(input=result.read())
        if process.returncode != 0:
            raise RuntimeError(f"FFmpeg error: {stderr_data.decode('utf-8')}")
        # Interpret the PCM bytes as a numpy array.
        wav_data = np.frombuffer(stdout_data, dtype=np.float32)
        return wav_data, sample_rate

    def get_wav_duration_from_bin(self, oss_path):
        """Return the duration in seconds implied by the ``.ar16k.bin`` companion's size.

        The size is divided by ``16000 * 2`` bytes per second.
        """
        oss_bin_path = oss_path + ".ar16k.bin"
        bucket_name, object_key = self._parse_oss_path(oss_bin_path)
        metadata = self._retry_operation(self.bucket2object[bucket_name].get_object_meta, object_key)
        duration = float(metadata.headers['Content-Length']) / (16000 * 2)
        return duration

    def read_wavdata_from_oss(self, oss_path, start=None, end=None, force_bin=False):
        """Read audio samples as a float32 array, optionally only a time window.

        Args:
            oss_path: Path of the audio object; its ``.ar16k.bin`` companion holds
                the int16 PCM that is read when available.
            start, end: Window in seconds. When either is None the whole file is
                read (from the bin file if present, else decoded via ffmpeg).
            force_bin: When True and no window is given, require the bin file.

        Raises:
            ValueError: if ``force_bin`` is set and the bin file does not exist.
        """
        bucket_name, object_key = self._parse_oss_path(oss_path)
        oss_bin_key = object_key + ".ar16k.bin"
        if start is None or end is None:
            if self.bucket2object[bucket_name].object_exists(oss_bin_key):
                wav_data = self._retry_operation(self.bucket2object[bucket_name].get_object, oss_bin_key).read()
            elif not force_bin:
                wav_data, _ = self.read_audio_file(oss_path)
            else:
                raise ValueError(f"Cannot find bin file for {oss_path}")
        else:
            # int() guards against WAV_SAMPLE_RATE being a str taken from the environment.
            bytes_per_second = int(WAV_SAMPLE_RATE) * (WAV_BIT_RATE // 8)
            # Convert the time window to byte offsets.
            start_offset = round(start * bytes_per_second)
            end_offset = round(end * bytes_per_second)
            # Samples are 2 bytes wide: start on an even byte so the int16 view
            # below is not shifted by half a sample.
            start_offset -= start_offset % 2
            # The byte range is inclusive, so an even difference means an odd
            # byte count; drop the trailing byte to keep whole samples.
            if not (end_offset - start_offset) % 2:
                end_offset -= 1
            # Ranged request: fetch only the requested bytes.
            wav_data = self._retry_operation(self.bucket2object[bucket_name].get_object,
                                             oss_bin_key,
                                             byte_range=(start_offset, end_offset),
                                             headers={'x-oss-range-behavior': 'standard'}).read()
        if not isinstance(wav_data, np.ndarray):
            # Raw bytes from the bin file are int16 PCM; scale to [-1, 1).
            wav_data = np.frombuffer(wav_data, np.int16).flatten() / 32768.0
        return wav_data.astype(np.float32)

    def _list_files_by_suffix(self, oss_dir, suffix):
        """Recursively collect objects under ``oss_dir`` whose key ends with ``suffix``.

        Returns a list of ``oss://<bucket>/<full object key>`` strings.
        """
        bucket_name, dir_key = self._parse_oss_path(oss_dir)
        file_list = []

        def _recursive_list(prefix):
            for obj in oss2.ObjectIterator(self.bucket2object[bucket_name], prefix=prefix, delimiter='/'):
                if obj.is_prefix():  # a "directory": descend into it
                    _recursive_list(obj.key)
                elif obj.key.endswith(suffix):
                    file_list.append(f"oss://{bucket_name}/{obj.key}")

        _recursive_list(dir_key)
        return file_list

    def list_files_by_suffix(self, oss_dir, suffix):
        """Retrying wrapper around ``_list_files_by_suffix``."""
        return self._retry_operation(self._list_files_by_suffix, oss_dir, suffix)

    def _list_files_by_prefix(self, oss_dir, file_prefix):
        """Recursively collect objects under ``oss_dir`` whose base name starts with ``file_prefix``.

        Returns a list of ``oss://<bucket>/<full object key>`` strings.
        """
        bucket_name, dir_key = self._parse_oss_path(oss_dir)
        file_list = []

        def _recursive_list(prefix):
            for obj in oss2.ObjectIterator(self.bucket2object[bucket_name], prefix=prefix, delimiter='/'):
                if obj.is_prefix():  # a "directory": descend into it
                    _recursive_list(obj.key)
                elif os.path.basename(obj.key).startswith(file_prefix):
                    file_list.append(f"oss://{bucket_name}/{obj.key}")

        _recursive_list(dir_key)
        return file_list

    def list_files_by_prefix(self, oss_dir, file_prefix):
        """Retrying wrapper around ``_list_files_by_prefix``."""
        return self._retry_operation(self._list_files_by_prefix, oss_dir, file_prefix)


def encode_base64(base64_path):
    """Return the contents of the file at ``base64_path`` as a base64 text string."""
    with open(base64_path, "rb") as source:
        raw_bytes = source.read()
    encoded = base64.b64encode(raw_bytes)
    return encoded.decode("utf-8")


def _load_model_processor(args):
    """Create the OpenAI-compatible client that the demo uses for inference.

    Args:
        args: Parsed CLI namespace. Currently not consulted: requests go to the
            remote endpoint below, so no local device placement is configured.

    Returns:
        A ``(client, None)`` tuple; the second slot is the processor, which this
        API-backed variant does not have.
    """
    model = OpenAI(
        # If no environment variable is configured, replace the next line with
        # your Alibaba Cloud Bailian API key: api_key="sk-xxx",
        api_key=API_KEY,
        base_url="https://dashscope.aliyuncs.com/compatible-mode/v1",
    )

    return model, None


# Module-level OSS client, constructed at import time; format_history() inside
# _launch_demo uses it to upload media and obtain signed URLs.
oss_reader = OSSReader()


def _launch_demo(args, model, processor):
    """Build the Gradio UI for the Qwen3-Omni demo and start serving it.

    Args:
        args: Parsed CLI namespace; reads ``ui_language``, ``share``,
            ``inbrowser``, ``server_port`` and ``server_name``.
        model: OpenAI-compatible client; ``model.chat.completions.create`` is
            called with streaming enabled.
        processor: Unused here; kept so the caller's signature stays the same.
    """
    import uuid  # local import: only needed to name uploaded media objects

    # Voice settings: UI label -> voice id sent to the API.
    VOICE_OPTIONS = {
        "芊悦 Cherry": "Cherry",
        "晨煦 Ethan": "Ethan",
        "詹妮弗 Jennifer": "Jennifer",
        "甜茶 Ryan": "Ryan",
        "卡捷琳娜 Katerina": "Katerina",
        "不吃鱼 Nofish": "Nofish",
        "墨讲师 Elias": "Elias",
        "南京-老李 Li": "Li",
        "陕西-秦川 Marcus": "Marcus",
        "闽南-阿杰 Roy": "Roy",
        "天津-李彼得 Peter": "Peter",
        "四川-程川 Eric": "Eric",
        "粤语-阿强 Rocky": "Rocky",
        "粤语-阿清 Kiki": "Kiki",
        "四川-晴儿 Sunny": "Sunny",
        "上海-阿珍 Jada": "Jada",
        "北京-晓东 Dylan": "Dylan",
    }
    DEFAULT_VOICE = '芊悦 Cherry'

    default_system_prompt = ''

    language = args.ui_language

    def get_text(text: str, cn_text: str):
        """Return ``cn_text`` when the UI language is 'zh', otherwise ``text``."""
        if language == 'zh':
            return cn_text
        return text

    def to_mp4(path):
        """Transcode a ``.webm`` file to ``.mp4`` with ffmpeg.

        Any other path (including None) is returned unchanged.

        Raises:
            subprocess.CalledProcessError: if ffmpeg exits with a non-zero status.
        """
        if not (path and path.endswith(".webm")):
            return path  # already mp4 (or some other format), or None
        # Swap only the trailing extension; str.replace would also rewrite a
        # ".webm" occurring earlier in the path, e.g. in a directory name.
        mp4_path = path[:-len(".webm")] + ".mp4"
        subprocess.run([
            "ffmpeg", "-y",
            "-i", path,
            "-c:v", "libx264",  # encode video as H.264
            "-preset", "ultrafast",  # fastest encoding preset
            "-tune", "fastdecode",  # optimise for fast decoding downstream
            "-pix_fmt", "yuv420p",  # widely compatible pixel format
            "-c:a", "aac",  # audio codec
            "-b:a", "128k",  # cap the audio bitrate
            "-threads", "0",  # let ffmpeg use all threads
            "-f", "mp4",
            mp4_path
        ], check=True, stdout=subprocess.DEVNULL, stderr=subprocess.DEVNULL)
        return mp4_path

    def format_history(history: list, system_prompt: str):
        """Convert the chatbot history into OpenAI-style chat messages.

        Consecutive user entries are merged into one user message whose media
        parts precede its text parts. Media files are uploaded to OSS and
        referenced by signed URL. Assistant entries whose content is not a
        string are not replayed.
        """
        print(history)
        messages = []
        if system_prompt != "":
            messages.append({"role": "system", "content": [{"type": "text", "text": system_prompt}]})

        pending_user_content = []

        def flush_user_turn():
            """Emit the buffered user parts as one message, media first, then text."""
            if not pending_user_content:
                return
            media_items = [part for part in pending_user_content if part["type"] != "text"]
            text_items = [part for part in pending_user_content if part["type"] == "text"]
            messages.append({
                "role": "user",
                "content": media_items + text_items
            })
            pending_user_content.clear()

        for item in history:
            role = item['role']
            content = item['content']

            if role != "user":
                # A non-user entry closes the user turn collected so far. The
                # same media-first ordering is applied here as for the final
                # turn, so a turn is serialised identically on every request.
                flush_user_turn()
                if isinstance(content, str):
                    messages.append({
                        "role": role,
                        "content": [{"type": "text", "text": content}]
                    })
                continue

            if isinstance(content, str):
                pending_user_content.append({"type": "text", "text": content})
            elif isinstance(content, (list, tuple)):
                for file_path in content:
                    # get_mimetype may return None for unknown extensions;
                    # treat that as "not a media file" instead of crashing.
                    mime_type = client_utils.get_mimetype(file_path) or ""

                    if mime_type.startswith("image"):
                        media_type = "image_url"
                    elif mime_type.startswith("video"):
                        media_type = "video_url"
                        file_path = to_mp4(file_path)
                    elif mime_type.startswith("audio"):
                        media_type = "input_audio"
                    else:
                        # Unrecognised file type: pass the path along as text.
                        pending_user_content.append({
                            "type": "text",
                            "text": file_path
                        })
                        continue

                    # Upload under a fresh random name and reference it by signed URL.
                    request_id = str(uuid.uuid4())
                    oss_path = f"oss://{bucket_name}//studio-temp/Qwen3-Omni-Demo/" + request_id
                    oss_reader.upload_file(file_path, oss_path)
                    media_url = oss_reader.get_public_url(oss_path)
                    if media_type == "input_audio":
                        pending_user_content.append({
                            "type": "input_audio",
                            "input_audio": {
                                "data": media_url,
                                "format": "wav",
                            },
                        })
                    else:
                        # image_url / video_url parts share the same shape.
                        pending_user_content.append({
                            "type": media_type,
                            media_type: {
                                "url": media_url
                            },
                        })

        flush_user_turn()
        return messages

    def predict(messages, voice_choice=DEFAULT_VOICE, temperature=0.7, top_p=0.8, top_k=20, return_audio=False,
                enable_thinking=False):
        """Stream a completion and yield UI events.

        Yields dicts of the form ``{"type": "text", "data": <text so far>}`` while
        streaming and, when audio was returned, one final
        ``{"type": "audio", "data": <cached wav path>}``.

        Audio output is disabled whenever ``enable_thinking`` is set.
        """
        if enable_thinking:
            return_audio = False

        # Arguments common to both request flavours.
        request_kwargs = dict(
            model="qwen3-omni-flash",
            messages=messages,
            stream_options={"include_usage": True},
            stream=True,
            temperature=temperature,
            top_p=top_p,
        )
        if return_audio:
            completion = model.chat.completions.create(
                modalities=["text", "audio"],
                audio={"voice": VOICE_OPTIONS[voice_choice], "format": "wav"},
                extra_body={'enable_thinking': False, "top_k": top_k},
                **request_kwargs,
            )
        else:
            completion = model.chat.completions.create(
                modalities=["text"],
                extra_body={'enable_thinking': enable_thinking, "top_k": top_k},
                **request_kwargs,
            )

        audio_chunks = []  # base64 fragments, joined once after streaming
        output_text = ""
        reasoning_content = "<think>\n\n"  # full reasoning trace
        answer_content = ""  # full answer text
        is_answering = False  # True once answer tokens start arriving
        print(return_audio, enable_thinking)
        for chunk in completion:
            if not chunk.choices:
                # Chunks without choices carry the usage statistics.
                print(chunk.usage)
                continue

            delta = chunk.choices[0].delta
            audio = getattr(delta, "audio", None)
            if audio is not None:
                # An audio delta carries either a base64 "data" fragment or a
                # "transcript" fragment; check explicitly rather than relying
                # on a blanket except to tell them apart.
                data = audio.get("data")
                if isinstance(data, str):
                    audio_chunks.append(data)
                else:
                    transcript = audio.get("transcript")
                    if transcript:
                        output_text += transcript
                        yield {"type": "text", "data": output_text}
            elif enable_thinking:
                if hasattr(delta, "reasoning_content") and delta.reasoning_content is not None:
                    if not is_answering:
                        print(delta.reasoning_content, end="", flush=True)
                    reasoning_content += delta.reasoning_content
                    yield {"type": "text", "data": reasoning_content}
                if hasattr(delta, "content") and delta.content:
                    if not is_answering:
                        # First answer token: close the reasoning section.
                        reasoning_content += "\n\n</think>\n\n"
                        is_answering = True
                    answer_content += delta.content
                    yield {"type": "text", "data": reasoning_content + answer_content}
            else:
                if hasattr(delta, "content") and delta.content:
                    output_text += delta.content
                    yield {"type": "text", "data": output_text}

        audio_string = "".join(audio_chunks)
        if audio_string != "":
            # Decode only when audio was actually received.
            wav_bytes = base64.b64decode(audio_string)
            audio_np = np.frombuffer(wav_bytes, dtype=np.int16)
            wav_io = io.BytesIO()
            sf.write(wav_io, audio_np, samplerate=24000, format="WAV")
            wav_io.seek(0)
            wav_bytes = wav_io.getvalue()
            audio_path = processing_utils.save_bytes_to_cache(
                wav_bytes, "audio.wav", cache_dir=demo.GRADIO_CACHE)
            yield {"type": "audio", "data": audio_path}

    def media_predict(audio, video, history, system_prompt, voice_choice, temperature, top_p, top_k, return_audio=False,
                      enable_thinking=False):
        """Handler for the "Online" tab.

        Yields ``(microphone, webcam, media_chatbot, submit_btn, stop_btn)``
        updates: inputs are cleared, Submit is hidden and Stop shown while the
        request runs, and the buttons are restored by the final yield.
        """
        # First yield: clear inputs and switch to the "running" button state.
        yield (
            None,  # microphone
            None,  # webcam
            history,  # media_chatbot
            gr.update(visible=False),  # submit_btn
            gr.update(visible=True),  # stop_btn
        )

        for media_file in (audio, video):
            if media_file:
                history.append({"role": "user", "content": (media_file,)})

        # Show the user's media in the chat right away. The request is still in
        # flight (upload and transcoding happen next), so Submit stays hidden
        # and Stop stays visible.
        yield (
            None,  # microphone
            None,  # webcam
            history,  # media_chatbot
            gr.update(visible=False),  # submit_btn
            gr.update(visible=True),  # stop_btn
        )

        formatted_history = format_history(history=history,
                                           system_prompt=system_prompt, )

        history.append({"role": "assistant", "content": ""})

        for chunk in predict(formatted_history, voice_choice, temperature, top_p, top_k, return_audio, enable_thinking):
            print('chunk', chunk)
            if chunk["type"] == "text":
                history[-1]["content"] = chunk["data"]
                yield (
                    None,  # microphone
                    None,  # webcam
                    history,  # media_chatbot
                    gr.update(visible=False),  # submit_btn
                    gr.update(visible=True),  # stop_btn
                )
            if chunk["type"] == "audio":
                history.append({
                    "role": "assistant",
                    "content": gr.Audio(chunk["data"])
                })

        # Final yield: restore the idle button state.
        yield (
            None,  # microphone
            None,  # webcam
            history,  # media_chatbot
            gr.update(visible=True),  # submit_btn
            gr.update(visible=False),  # stop_btn
        )

    def chat_predict(text, audio, image, video, history, system_prompt, voice_choice, temperature, top_p, top_k,
                     return_audio=False, enable_thinking=False):
        """Handler for the "Offline" tab.

        Yields ``(text_input, audio_input, image_input, video_input, chatbot)``
        updates: the inputs are cleared once, then only the chatbot changes.
        """
        # Append the provided inputs as separate user entries: audio, text, image, video.
        if audio:
            history.append({"role": "user", "content": (audio,)})

        if text:
            history.append({"role": "user", "content": text})

        if image:
            history.append({"role": "user", "content": (image,)})

        if video:
            history.append({"role": "user", "content": (video,)})

        formatted_history = format_history(history=history,
                                           system_prompt=system_prompt)

        # Clear all four inputs and show the new user entries.
        yield None, None, None, None, history

        history.append({"role": "assistant", "content": ""})
        for chunk in predict(formatted_history, voice_choice, temperature, top_p, top_k, return_audio, enable_thinking):
            print('chat_predict chunk', chunk)

            if chunk["type"] == "text":
                history[-1]["content"] = chunk["data"]
                yield gr.skip(), gr.skip(), gr.skip(), gr.skip(), history
            if chunk["type"] == "audio":
                history.append({
                    "role": "assistant",
                    "content": gr.Audio(chunk["data"])
                })
        yield gr.skip(), gr.skip(), gr.skip(), gr.skip(), history

    # --- UI layout ---
    with gr.Blocks(theme=gr.themes.Soft(font=[gr.themes.GoogleFont("Source Sans Pro"), "Arial", "sans-serif"]),
                   css=".gradio-container {max-width: none !important;}") as demo:
        gr.Markdown("# Qwen3-Omni Demo")
        gr.Markdown(
            "**Instructions**: Interact with the model through text, audio, images, or video. Use the tabs to switch between Online and Offline chat modes.")
        gr.Markdown(
            "**使用说明**：1️⃣ 点击音频录制按钮，或摄像头-录制按钮 2️⃣ 输入音频或者视频 3️⃣ 点击提交并等待模型的回答")

        with gr.Row(equal_height=False):
            # Left column: generation parameters shared by both tabs.
            with gr.Column(scale=1):
                gr.Markdown("### ⚙️ Parameters (参数)")
                system_prompt_textbox = gr.Textbox(label="System Prompt", value=default_system_prompt, lines=4,
                                                   max_lines=8)
                voice_choice = gr.Dropdown(label="Voice Choice", choices=VOICE_OPTIONS, value=DEFAULT_VOICE,
                                           visible=True)
                return_audio = gr.Checkbox(
                    label="Return Audio （返回语音）",
                    value=True,
                    interactive=True,
                    elem_classes="checkbox-large"
                )
                enable_thinking = gr.Checkbox(
                    label="Enable Thinking （启用思维链）",
                    value=False,
                    interactive=True,
                    elem_classes="checkbox-large"
                )
                temperature = gr.Slider(label="Temperature", minimum=0.1, maximum=2.0, value=0.6, step=0.1)
                top_p = gr.Slider(label="Top P", minimum=0.05, maximum=1.0, value=0.95, step=0.05)
                top_k = gr.Slider(label="Top K", minimum=1, maximum=100, value=20, step=1)

            # Right column: the two chat modes.
            with gr.Column(scale=3):
                with gr.Tabs():
                    with gr.TabItem("Online"):
                        with gr.Row():
                            with gr.Column(scale=1):
                                gr.Markdown("### Audio-Video Input (音视频输入)")
                                microphone = gr.Audio(sources=['microphone'], type="filepath",
                                                      label="Record Audio (录制音频)")
                                webcam = gr.Video(sources=['webcam', "upload"],
                                                  label="Record/Upload Video (录制/上传视频)",
                                                  elem_classes="media-upload")
                                with gr.Row():
                                    submit_btn_online = gr.Button("Submit (提交)", variant="primary", scale=2)
                                    stop_btn_online = gr.Button("Stop (停止)", visible=False, scale=1)
                                clear_btn_online = gr.Button("Clear History (清除历史)")
                            with gr.Column(scale=2):
                                # History is exchanged as a list of role/content dicts (type="messages").
                                media_chatbot = gr.Chatbot(label="Chat History (对话历史)", type="messages", height=650,
                                                           layout="panel", bubble_full_width=False,
                                                           allow_tags=["think"], render=False)
                                media_chatbot.render()

                        def clear_history_online():
                            """Reset the chat history and both media inputs."""
                            return [], None, None

                        submit_event_online = submit_btn_online.click(
                            fn=media_predict,
                            inputs=[microphone, webcam, media_chatbot, system_prompt_textbox, voice_choice, temperature,
                                    top_p, top_k, return_audio, enable_thinking],
                            outputs=[microphone, webcam, media_chatbot, submit_btn_online, stop_btn_online]
                        )
                        # Stop cancels the running generation and restores the idle button state.
                        stop_btn_online.click(fn=lambda: (gr.update(visible=True), gr.update(visible=False)),
                                              outputs=[submit_btn_online, stop_btn_online],
                                              cancels=[submit_event_online], queue=False)
                        clear_btn_online.click(fn=clear_history_online, outputs=[media_chatbot, microphone, webcam])

                    with gr.TabItem("Offline"):
                        # History is exchanged as a list of role/content dicts (type="messages").
                        chatbot = gr.Chatbot(label="Chat History (对话历史)", type="messages", height=550,
                                             layout="panel", bubble_full_width=False, allow_tags=["think"],
                                             render=False)
                        chatbot.render()

                        with gr.Accordion("📎 Click to upload multimodal files (点击上传多模态文件)", open=False):
                            with gr.Row():
                                audio_input = gr.Audio(sources=["upload", 'microphone'], type="filepath", label="Audio",
                                                       elem_classes="media-upload")
                                image_input = gr.Image(sources=["upload", 'webcam'], type="filepath", label="Image",
                                                       elem_classes="media-upload")
                                video_input = gr.Video(sources=["upload", 'webcam'], label="Video",
                                                       elem_classes="media-upload")

                        with gr.Row():
                            text_input = gr.Textbox(show_label=False,
                                                    placeholder="Enter text or upload files and press Submit... (输入文本或者上传文件并点击提交)",
                                                    scale=7)
                            submit_btn_offline = gr.Button("Submit (提交)", variant="primary", scale=1)
                            stop_btn_offline = gr.Button("Stop (停止)", visible=False, scale=1)
                            clear_btn_offline = gr.Button("Clear (清空) ", scale=1)

                        def clear_history_offline():
                            """Reset the chat history and all four inputs."""
                            return [], None, None, None, None

                        submit_event_offline = gr.on(
                            triggers=[submit_btn_offline.click, text_input.submit],
                            fn=chat_predict,
                            inputs=[text_input, audio_input, image_input, video_input, chatbot, system_prompt_textbox,
                                    voice_choice, temperature, top_p, top_k, return_audio, enable_thinking],
                            outputs=[text_input, audio_input, image_input, video_input, chatbot]
                        )
                        stop_btn_offline.click(fn=lambda: (gr.update(visible=True), gr.update(visible=False)),
                                               outputs=[submit_btn_offline, stop_btn_offline],
                                               cancels=[submit_event_offline], queue=False)
                        clear_btn_offline.click(fn=clear_history_offline,
                                                outputs=[chatbot, text_input, audio_input, image_input, video_input])

        gr.HTML("""
            <style>
                .media-upload { min-height: 160px; border: 2px dashed #ccc; border-radius: 8px; display: flex; align-items: center; justify-content: center; }
                .media-upload:hover { border-color: #666; }
            </style>
        """)

    demo.queue(default_concurrency_limit=100, max_size=100).launch(max_threads=100,
                                                                   ssr_mode=False,
                                                                   share=args.share,
                                                                   inbrowser=args.inbrowser,
                                                                   # ssl_certfile="examples/offline_inference/qwen3_omni_moe/cert.pem",
                                                                   # ssl_keyfile="examples/offline_inference/qwen3_omni_moe/key.pem",
                                                                   # ssl_verify=False,
                                                                   server_port=args.server_port,
                                                                   server_name=args.server_name, )


# Default value of the -c/--checkpoint-path CLI option defined in _get_args().
DEFAULT_CKPT_PATH = "Qwen/Qwen3-Omni-30B-A3B-Instruct"


def _get_args():
    """Parse the demo's command-line options and return the resulting namespace."""
    cli = ArgumentParser()

    cli.add_argument('-c',
                     '--checkpoint-path',
                     type=str,
                     default=DEFAULT_CKPT_PATH,
                     help='Checkpoint name or path, default to %(default)r')

    # Boolean switches, registered in the order they are listed in --help.
    switches = (
        ('--cpu-only', 'Run demo with CPU only'),
        ('--flash-attn2', 'Enable flash_attention_2 when loading the model.'),
        ('--use-transformers', 'Use transformers for inference.'),
        ('--share', 'Create a publicly shareable link for the interface.'),
        ('--inbrowser', 'Automatically launch the interface in a new tab on the default browser.'),
    )
    for flag, description in switches:
        cli.add_argument(flag, action='store_true', default=False, help=description)

    # Server binding and UI language.
    cli.add_argument('--server-port', type=int, default=7860, help='Demo server port.')
    cli.add_argument('--server-name', type=str, default='0.0.0.0', help='Demo server name.')
    cli.add_argument('--ui-language', type=str, choices=['en', 'zh'], default='zh',
                     help='Display language for the UI.')

    return cli.parse_args()


if __name__ == "__main__":
    # Script entry point: parse the CLI flags, create the client, then launch the UI.
    cli_args = _get_args()
    client, unused_processor = _load_model_processor(cli_args)
    _launch_demo(cli_args, client, unused_processor)