jjw0126 committed
Commit 84ff315 · verified · 1 Parent(s): 71d6013

Upload files

This view is limited to 50 files because it contains too many changes. See the raw diff for the full change set.

Files changed (50)
  1. inference/infer.py +317 -0
  2. inference/infer_ola_internvl.py +448 -0
  3. inference/infer_ola_internvl_audio.py +244 -0
  4. inference/infer_ola_internvl_audio_ckpt.py +245 -0
  5. inference/infer_ola_internvl_copy.py +318 -0
  6. inference/infer_ola_internvl_text_visual.py +409 -0
  7. inference/log.txt +480 -0
  8. inference/log1.txt +339 -0
  9. ola.egg-info/PKG-INFO +265 -0
  10. ola.egg-info/SOURCES.txt +44 -0
  11. ola.egg-info/dependency_links.txt +1 -0
  12. ola.egg-info/requires.txt +40 -0
  13. ola.egg-info/top_level.txt +4 -0
  14. ola/__pycache__/arguments.cpython-312.pyc +0 -0
  15. ola/__pycache__/constants.cpython-312.pyc +0 -0
  16. ola/__pycache__/conversation.cpython-312.pyc +0 -0
  17. ola/__pycache__/mm_utils.cpython-312.pyc +0 -0
  18. ola/__pycache__/utils.cpython-312.pyc +0 -0
  19. ola/arguments.py +65 -0
  20. ola/constants.py +14 -0
  21. ola/conversation.py +266 -0
  22. ola/datasets/__init__.py +0 -0
  23. ola/datasets/__pycache__/__init__.cpython-312.pyc +0 -0
  24. ola/datasets/__pycache__/preprocess.cpython-312.pyc +0 -0
  25. ola/datasets/preprocess.py +234 -0
  26. ola/mm_utils.py +271 -0
  27. ola/model/__init__.py +2 -0
  28. ola/model/__pycache__/__init__.cpython-312.pyc +0 -0
  29. ola/model/__pycache__/builder.cpython-312.pyc +0 -0
  30. ola/model/__pycache__/ola_arch.cpython-312.pyc +0 -0
  31. ola/model/builder.py +97 -0
  32. ola/model/builder_back.py +294 -0
  33. ola/model/language_model/__pycache__/conversation.cpython-312.pyc +0 -0
  34. ola/model/language_model/__pycache__/ola_qwen.cpython-312.pyc +0 -0
  35. ola/model/language_model/__pycache__/ola_qwen3.cpython-312.pyc +0 -0
  36. ola/model/language_model/conversation.py +403 -0
  37. ola/model/language_model/ola_qwen.py +237 -0
  38. ola/model/language_model/ola_qwen3.py +466 -0
  39. ola/model/multimodal_encoder/__pycache__/builder.cpython-312.pyc +0 -0
  40. ola/model/multimodal_encoder/__pycache__/configuration_intern_vit.cpython-312.pyc +0 -0
  41. ola/model/multimodal_encoder/__pycache__/internvl_vit.cpython-312.pyc +0 -0
  42. ola/model/multimodal_encoder/__pycache__/oryx_vit.cpython-312.pyc +0 -0
  43. ola/model/multimodal_encoder/builder.py +16 -0
  44. ola/model/multimodal_encoder/configuration_intern_vit.py +119 -0
  45. ola/model/multimodal_encoder/internvl_vit.py +435 -0
  46. ola/model/multimodal_encoder/oryx_vit.py +1075 -0
  47. ola/model/multimodal_projector/__pycache__/builder.cpython-312.pyc +0 -0
  48. ola/model/multimodal_projector/__pycache__/internvl_projector.cpython-312.pyc +0 -0
  49. ola/model/multimodal_projector/__pycache__/pooler_projector.cpython-312.pyc +0 -0
  50. ola/model/multimodal_projector/builder.py +177 -0
inference/infer.py ADDED
@@ -0,0 +1,317 @@
1
+ import os
2
+
3
+ os.environ['LOWRES_RESIZE'] = '384x32'
4
+ os.environ['HIGHRES_BASE'] = '0x32'
5
+ os.environ['VIDEO_RESIZE'] = "0x64"
6
+ os.environ['VIDEO_MAXRES'] = "480"
7
+ os.environ['VIDEO_MINRES'] = "288"
8
+ os.environ['MAXRES'] = '1536'
9
+ os.environ['MINRES'] = '0'
10
+ os.environ['FORCE_NO_DOWNSAMPLE'] = '1'
11
+ os.environ['LOAD_VISION_EARLY'] = '1'
12
+ os.environ['PAD2STRIDE'] = '1'
13
+
14
+ import gradio as gr
15
+ import torch
16
+ import re
17
+ from decord import VideoReader, cpu
18
+ from PIL import Image
19
+ import numpy as np
20
+ import transformers
21
+ import moviepy as mp
22
+ from typing import Dict, Optional, Sequence, List
23
+ import librosa
24
+ import whisper
25
+ from ola.conversation import conv_templates, SeparatorStyle
26
+ from ola.model.builder import load_pretrained_model
27
+ from ola.datasets.preprocess import tokenizer_image_token, tokenizer_speech_image_token, tokenizer_speech_question_image_token, tokenizer_speech_token
28
+ from ola.mm_utils import KeywordsStoppingCriteria, process_anyres_video, process_anyres_highres_image
29
+ from ola.constants import IGNORE_INDEX, DEFAULT_IMAGE_TOKEN, IMAGE_TOKEN_INDEX, DEFAULT_SPEECH_TOKEN, SPEECH_TOKEN_INDEX
30
+ import argparse
31
+
32
+ parser = argparse.ArgumentParser()
33
+ parser.add_argument('--model_path', type=str, default='/data1/cxy/model/THUdyh/Ola-7b')
34
+ parser.add_argument('--text', type=str, default="What does the speech say?")
35
+ parser.add_argument('--audio_path', type=str, default="/data1/cxy/dataset/english.mp3")
36
+ parser.add_argument('--image_path', type=str, default=None)
37
+ parser.add_argument('--video_path', type=str, default=None)
38
+ args = parser.parse_args()
39
+
40
+ model_path = args.model_path
41
+ tokenizer, model, image_processor, _ = load_pretrained_model(model_path, None)
42
+ model = model.to('cuda').eval()
43
+ model = model.bfloat16()
44
+ # breakpoint()
45
+ USE_SPEECH=False
46
+ cur_dir = os.path.dirname(os.path.abspath(__file__))
47
+
48
+ def load_audio(audio_file_name):
49
+ speech_wav, samplerate = librosa.load(audio_file_name, sr=16000)
50
+ if len(speech_wav.shape) > 1:
51
+ speech_wav = speech_wav[:, 0]
52
+ speech_wav = speech_wav.astype(np.float32)
53
+ CHUNK_LIM = 480000
54
+ SAMPLE_RATE = 16000
55
+ speechs = []
56
+ speech_wavs = []
57
+
58
+ if len(speech_wav) <= CHUNK_LIM:
59
+ speech = whisper.pad_or_trim(speech_wav)
60
+ speech_wav = whisper.pad_or_trim(speech_wav)
61
+ speechs.append(speech)
62
+ speech_wavs.append(torch.from_numpy(speech_wav).unsqueeze(0))
63
+ else:
64
+ for i in range(0, len(speech_wav), CHUNK_LIM):
65
+ chunk = speech_wav[i : i + CHUNK_LIM]
66
+ if len(chunk) < CHUNK_LIM:
67
+ chunk = whisper.pad_or_trim(chunk)
68
+ speechs.append(chunk)
69
+ speech_wavs.append(torch.from_numpy(chunk).unsqueeze(0))
70
+ mels = []
71
+ for chunk in speechs:
72
+ chunk = whisper.log_mel_spectrogram(chunk, n_mels=128).permute(1, 0).unsqueeze(0)
73
+ mels.append(chunk)
74
+
75
+ mels = torch.cat(mels, dim=0)
76
+ speech_wavs = torch.cat(speech_wavs, dim=0)
77
+ if mels.shape[0] > 25:
78
+ mels = mels[:25]
79
+ speech_wavs = speech_wavs[:25]
80
+
81
+ speech_length = torch.LongTensor([mels.shape[1]] * mels.shape[0])
82
+ speech_chunks = torch.LongTensor([mels.shape[0]])
83
+ return mels, speech_length, speech_chunks, speech_wavs
84
+
85
+ def extract_audio(videos_file_path):
86
+ my_clip = mp.VideoFileClip(videos_file_path)
87
+ return my_clip.audio
88
+
89
+ image_path = args.image_path
90
+ audio_path = args.audio_path
91
+ video_path = args.video_path
92
+ text = args.text
93
+
94
+ if video_path is not None:
95
+ modality = "video"
96
+ visual = video_path
97
+ assert image_path is None
98
+
99
+ elif image_path is not None:
100
+ visual = image_path
101
+ modality = "image"
102
+ assert video_path is None
103
+
104
+ elif audio_path is not None:
105
+ modality = "text"
106
+
107
+
108
+ # input audio and video, do not parse audio in the video, else parse audio in the video
109
+ if audio_path:
110
+ USE_SPEECH = True
111
+ elif modality == "video":
112
+ USE_SPEECH = True
113
+ else:
114
+ USE_SPEECH = False
115
+
116
+ speechs = []
117
+ speech_lengths = []
118
+ speech_wavs = []
119
+ speech_chunks = []
120
+ if modality == "video":
121
+ vr = VideoReader(visual, ctx=cpu(0))
122
+ total_frame_num = len(vr)
123
+ fps = round(vr.get_avg_fps())
124
+ uniform_sampled_frames = np.linspace(0, total_frame_num - 1, 64, dtype=int)
125
+ frame_idx = uniform_sampled_frames.tolist()
126
+ spare_frames = vr.get_batch(frame_idx).asnumpy()
127
+ video = [Image.fromarray(frame) for frame in spare_frames]
128
+ elif modality == "image":
129
+ image = [Image.open(visual)]
130
+ image_sizes = [image[0].size]
131
+ else:
132
+ images = [torch.zeros(1, 3, 224, 224).to(dtype=torch.bfloat16, device='cuda', non_blocking=True)]
133
+ images_highres = [torch.zeros(1, 3, 224, 224).to(dtype=torch.bfloat16, device='cuda', non_blocking=True)]
134
+ image_sizes = [(224, 224)]
135
+
136
+
137
+ if USE_SPEECH and audio_path:
138
+ audio_path = audio_path
139
+ speech, speech_length, speech_chunk, speech_wav = load_audio(audio_path)
140
+ speechs.append(speech.bfloat16().to('cuda'))
141
+ speech_lengths.append(speech_length.to('cuda'))
142
+ speech_chunks.append(speech_chunk.to('cuda'))
143
+ speech_wavs.append(speech_wav.to('cuda'))
144
+ print('load audio')
145
+ elif USE_SPEECH and not audio_path:
146
+ # parse audio in the video
147
+ audio = extract_audio(visual)
148
+ audio.write_audiofile("./video_audio.wav")
149
+ video_audio_path = './video_audio.wav'
150
+ speech, speech_length, speech_chunk, speech_wav = load_audio(video_audio_path)
151
+ speechs.append(speech.bfloat16().to('cuda'))
152
+ speech_lengths.append(speech_length.to('cuda'))
153
+ speech_chunks.append(speech_chunk.to('cuda'))
154
+ speech_wavs.append(speech_wav.to('cuda'))
155
+ else:
156
+ speechs = [torch.zeros(1, 3000, 128).bfloat16().to('cuda')]
157
+ speech_lengths = [torch.LongTensor([3000]).to('cuda')]
158
+ speech_wavs = [torch.zeros([1, 480000]).to('cuda')]
159
+ speech_chunks = [torch.LongTensor([1]).to('cuda')]
160
+
161
+ conv_mode = "qwen_1_5"
162
+ if text:
163
+ qs = text
164
+ else:
165
+ qs = ''
166
+
167
+ if USE_SPEECH and audio_path and image_path: # image + speech instruction
168
+ qs = DEFAULT_IMAGE_TOKEN + "\n" + "User's question in speech: " + DEFAULT_SPEECH_TOKEN + '\n'
169
+ elif USE_SPEECH and video_path: # video + audio
170
+ qs = DEFAULT_SPEECH_TOKEN + DEFAULT_IMAGE_TOKEN + "\n" + qs
171
+ elif USE_SPEECH and audio_path: # audio + text
172
+ qs = DEFAULT_SPEECH_TOKEN + "\n" + qs
173
+ elif image_path or video_path: # image / video
174
+ qs = DEFAULT_IMAGE_TOKEN + "\n" + qs
175
+ elif text: # text
176
+ qs = qs
177
+
178
+ conv = conv_templates[conv_mode].copy()
179
+ conv.append_message(conv.roles[0], qs)
180
+ conv.append_message(conv.roles[1], None)
181
+ prompt = conv.get_prompt()
182
+ if USE_SPEECH and audio_path and image_path: # image + speech instruction
183
+ input_ids = tokenizer_speech_question_image_token(prompt, tokenizer, IMAGE_TOKEN_INDEX, return_tensors="pt").unsqueeze(0).to('cuda')
184
+ elif USE_SPEECH and video_path: # video + audio
185
+ input_ids = tokenizer_speech_image_token(prompt, tokenizer, IMAGE_TOKEN_INDEX, return_tensors="pt").unsqueeze(0).to('cuda')
186
+ elif USE_SPEECH and audio_path: # audio + text
187
+ # breakpoint()
188
+ input_ids = tokenizer_speech_token(prompt, tokenizer, SPEECH_TOKEN_INDEX, return_tensors="pt").unsqueeze(0).to('cuda')
189
+ else:
190
+ input_ids = tokenizer_image_token(prompt, tokenizer, IMAGE_TOKEN_INDEX, return_tensors="pt").unsqueeze(0).to('cuda')
191
+
192
+ if modality == "video":
193
+ video_processed = []
194
+ for idx, frame in enumerate(video):
195
+ image_processor.do_resize = False
196
+ image_processor.do_center_crop = False
197
+ frame = process_anyres_video(frame, image_processor)
198
+
199
+ if frame_idx is not None and idx in frame_idx:
200
+ video_processed.append(frame.unsqueeze(0))
201
+ elif frame_idx is None:
202
+ video_processed.append(frame.unsqueeze(0))
203
+
204
+ if frame_idx is None:
205
+ frame_idx = np.arange(0, len(video_processed), dtype=int).tolist()
206
+
207
+ video_processed = torch.cat(video_processed, dim=0).bfloat16().to("cuda")
208
+ video_processed = (video_processed, video_processed)
209
+
210
+ video_data = (video_processed, (384, 384), "video")
211
+ elif modality == "image":
212
+ image_processor.do_resize = False
213
+ image_processor.do_center_crop = False
214
+ image_tensor, image_highres_tensor = [], []
215
+ for visual in image:
216
+ image_tensor_, image_highres_tensor_ = process_anyres_highres_image(visual, image_processor)
217
+ image_tensor.append(image_tensor_)
218
+ image_highres_tensor.append(image_highres_tensor_)
219
+ if all(x.shape == image_tensor[0].shape for x in image_tensor):
220
+ image_tensor = torch.stack(image_tensor, dim=0)
221
+ if all(x.shape == image_highres_tensor[0].shape for x in image_highres_tensor):
222
+ image_highres_tensor = torch.stack(image_highres_tensor, dim=0)
223
+ if type(image_tensor) is list:
224
+ image_tensor = [_image.bfloat16().to("cuda") for _image in image_tensor]
225
+ else:
226
+ image_tensor = image_tensor.bfloat16().to("cuda")
227
+ if type(image_highres_tensor) is list:
228
+ image_highres_tensor = [_image.bfloat16().to("cuda") for _image in image_highres_tensor]
229
+ else:
230
+ image_highres_tensor = image_highres_tensor.bfloat16().to("cuda")
231
+
232
+ pad_token_ids = 151643
233
+
234
+ attention_masks = input_ids.ne(pad_token_ids).long().to('cuda')
235
+ stop_str = conv.sep if conv.sep_style != SeparatorStyle.TWO else conv.sep2
236
+ keywords = [stop_str]
237
+ stopping_criteria = KeywordsStoppingCriteria(keywords, tokenizer, input_ids)
238
+
239
+ gen_kwargs = {}
240
+
241
+ if "max_new_tokens" not in gen_kwargs:
242
+ gen_kwargs["max_new_tokens"] = 1024
243
+ if "temperature" not in gen_kwargs:
244
+ gen_kwargs["temperature"] = 0.2
245
+ if "top_p" not in gen_kwargs:
246
+ gen_kwargs["top_p"] = None
247
+ if "num_beams" not in gen_kwargs:
248
+ gen_kwargs["num_beams"] = 1
249
+ # breakpoint()
250
+ with torch.inference_mode():
251
+ if modality == "video":
252
+ output_ids = model.generate(
253
+ inputs=input_ids,
254
+ images=video_data[0][0],
255
+ images_highres=video_data[0][1],
256
+ modalities=video_data[2],
257
+ speech=speechs,
258
+ speech_lengths=speech_lengths,
259
+ speech_chunks=speech_chunks,
260
+ speech_wav=speech_wavs,
261
+ attention_mask=attention_masks,
262
+ use_cache=True,
263
+ stopping_criteria=[stopping_criteria],
264
+ do_sample=True if gen_kwargs["temperature"] > 0 else False,
265
+ temperature=gen_kwargs["temperature"],
266
+ top_p=gen_kwargs["top_p"],
267
+ num_beams=gen_kwargs["num_beams"],
268
+ max_new_tokens=gen_kwargs["max_new_tokens"],
269
+ )
270
+ elif modality == "image":
271
+ output_ids = model.generate(
272
+ inputs=input_ids,
273
+ images=image_tensor,
274
+ images_highres=image_highres_tensor,
275
+ image_sizes=image_sizes,
276
+ modalities=['image'],
277
+ speech=speechs,
278
+ speech_lengths=speech_lengths,
279
+ speech_chunks=speech_chunks,
280
+ speech_wav=speech_wavs,
281
+ attention_mask=attention_masks,
282
+ use_cache=True,
283
+ stopping_criteria=[stopping_criteria],
284
+ do_sample=True if gen_kwargs["temperature"] > 0 else False,
285
+ temperature=gen_kwargs["temperature"],
286
+ top_p=gen_kwargs["top_p"],
287
+ num_beams=gen_kwargs["num_beams"],
288
+ max_new_tokens=gen_kwargs["max_new_tokens"],
289
+ )
290
+ elif modality == "text":
291
+ output_ids = model.generate(
292
+ input_ids,
293
+ images=images,
294
+ images_highres=images_highres,
295
+ image_sizes=image_sizes,
296
+ modalities=['text'],
297
+ speech=speechs,
298
+ speech_lengths=speech_lengths,
299
+ speech_chunks=speech_chunks,
300
+ speech_wav=speech_wavs,
301
+ attention_mask=attention_masks,
302
+ use_cache=True,
303
+ stopping_criteria=[stopping_criteria],
304
+ do_sample=True if gen_kwargs["temperature"] > 0 else False,
305
+ temperature=gen_kwargs["temperature"],
306
+ top_p=gen_kwargs["top_p"],
307
+ num_beams=gen_kwargs["num_beams"],
308
+ max_new_tokens=gen_kwargs["max_new_tokens"],
309
+ )
310
+
311
+ outputs = tokenizer.batch_decode(output_ids, skip_special_tokens=True)[0]
312
+ outputs = outputs.strip()
313
+ if outputs.endswith(stop_str):
314
+ outputs = outputs[:-len(stop_str)]
315
+ outputs = outputs.strip()
316
+
317
+ print(outputs)
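
For reference, a minimal standalone sketch (illustrative, not one of the uploaded files) of the tensor shapes that load_audio() in inference/infer.py returns for a clip shorter than the 30-second CHUNK_LIM, assuming a recent openai-whisper build that supports n_mels=128 (which the script itself requires); the 5 seconds of silence is only a stand-in input.

import numpy as np
import torch
import whisper

# Stand-in input: 5 s of 16 kHz mono silence, shorter than CHUNK_LIM (480000 samples).
speech_wav = np.zeros(16000 * 5, dtype=np.float32)
padded = whisper.pad_or_trim(speech_wav)               # pad/trim to 480000 samples (30 s)
mel = whisper.log_mel_spectrogram(padded, n_mels=128)  # shape (128, 3000)
mel = mel.permute(1, 0).unsqueeze(0)                   # shape (1, 3000, 128), as built in the script

speech_length = torch.LongTensor([mel.shape[1]] * mel.shape[0])  # tensor([3000])
speech_chunks = torch.LongTensor([mel.shape[0]])                 # tensor([1])
print(mel.shape, speech_length, speech_chunks)
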
inference/infer_ola_internvl.py ADDED
@@ -0,0 +1,448 @@
1
+ import os
2
+
3
+ os.environ['LOWRES_RESIZE'] = '384x32'
4
+ os.environ['HIGHRES_BASE'] = '0x32'
5
+ os.environ['VIDEO_RESIZE'] = "0x64"
6
+ os.environ['VIDEO_MAXRES'] = "480"
7
+ os.environ['VIDEO_MINRES'] = "288"
8
+ os.environ['MAXRES'] = '1536'
9
+ os.environ['MINRES'] = '0'
10
+ os.environ['FORCE_NO_DOWNSAMPLE'] = '1'
11
+ os.environ['LOAD_VISION_EARLY'] = '1'
12
+ os.environ['PAD2STRIDE'] = '1'
13
+ os.environ['CUDA_LAUNCH_BLOCKING'] = '1'
14
+ import os
15
+ import sys
16
+ from pathlib import Path
17
+ import math
18
+ import numpy as np
19
+ import torch
20
+ import torchvision.transforms as T
21
+ from decord import VideoReader, cpu # temporarily commented out, focusing on speech functionality testing
22
+ from PIL import Image
23
+ from torchvision.transforms.functional import InterpolationMode
24
+ from transformers import AutoModel, AutoTokenizer
25
+ from contextlib import redirect_stdout
26
+ import io
27
+ import librosa
28
+ import whisper
29
+ import moviepy as mp
30
+ import torch
31
+ from transformers import AutoTokenizer, AutoConfig, AutoModel
32
+
33
+ # pure text
34
+ # image + text
35
+ # video + text
36
+ # audio + text
37
+ # video + audio + text
38
+
39
+ IMAGENET_MEAN = (0.485, 0.456, 0.406)
40
+ IMAGENET_STD = (0.229, 0.224, 0.225)
41
+ import gradio as gr
42
+ import torch
43
+ import re
44
+ from decord import VideoReader, cpu
45
+ from PIL import Image
46
+ import numpy as np
47
+ import transformers
48
+ import moviepy as mp
49
+ from typing import Dict, Optional, Sequence, List
50
+ import librosa
51
+ import whisper
52
+ from ola.conversation import conv_templates, SeparatorStyle
53
+ from ola.model.builder import load_pretrained_model
54
+ from ola.datasets.preprocess import tokenizer_image_token, tokenizer_speech_image_token, tokenizer_speech_question_image_token, tokenizer_speech_token
55
+ from ola.mm_utils import KeywordsStoppingCriteria, process_anyres_video, process_anyres_highres_image
56
+ from ola.constants import IGNORE_INDEX, DEFAULT_IMAGE_TOKEN, IMAGE_TOKEN_INDEX, DEFAULT_SPEECH_TOKEN, SPEECH_TOKEN_INDEX
57
+ import argparse
58
+
59
+ parser = argparse.ArgumentParser()
60
+ parser.add_argument('--model_path', type=str, default='/data1/cxy/plm-v/modeling/internvl3_5-2B')
61
+ parser.add_argument('--text', type=str, default="What does the speech say?")
62
+ parser.add_argument('--audio_path', type=str, default=None)
63
+ parser.add_argument('--image_path', type=str, default=None)
64
+ parser.add_argument('--video_path', type=str, default=None)
65
+ args = parser.parse_args()
66
+
67
+ model_path = args.model_path
68
+ tokenizer, model, image_processor, _ = load_pretrained_model(model_path,'ola_internvl', None)
69
+ model = model.to('cuda').eval()
70
+ model = model.bfloat16()
71
+
72
+ resource_path = "/data1/cxy/plm-v/modeling/example/"
73
+ # set the max number of tiles in `max_num`
74
+ def find_closest_aspect_ratio(aspect_ratio, target_ratios, width, height, image_size):
75
+ best_ratio_diff = float('inf')
76
+ best_ratio = (1, 1)
77
+ area = width * height
78
+ for ratio in target_ratios:
79
+ target_aspect_ratio = ratio[0] / ratio[1]
80
+ ratio_diff = abs(aspect_ratio - target_aspect_ratio)
81
+ if ratio_diff < best_ratio_diff:
82
+ best_ratio_diff = ratio_diff
83
+ best_ratio = ratio
84
+ elif ratio_diff == best_ratio_diff:
85
+ if area > 0.5 * image_size * image_size * ratio[0] * ratio[1]:
86
+ best_ratio = ratio
87
+ return best_ratio
88
+ def build_transform(input_size):
89
+ MEAN, STD = IMAGENET_MEAN, IMAGENET_STD
90
+ transform = T.Compose([
91
+ T.Lambda(lambda img: img.convert('RGB') if img.mode != 'RGB' else img),
92
+ T.Resize((input_size, input_size), interpolation=InterpolationMode.BICUBIC),
93
+ T.ToTensor(),
94
+ T.Normalize(mean=MEAN, std=STD)
95
+ ])
96
+ return transform
97
+
98
+ def dynamic_preprocess(image, min_num=1, max_num=12, image_size=448, use_thumbnail=False):
99
+ orig_width, orig_height = image.size
100
+ aspect_ratio = orig_width / orig_height
101
+
102
+ # calculate the existing image aspect ratio
103
+ target_ratios = set(
104
+ (i, j) for n in range(min_num, max_num + 1) for i in range(1, n + 1) for j in range(1, n + 1) if
105
+ i * j <= max_num and i * j >= min_num)
106
+ target_ratios = sorted(target_ratios, key=lambda x: x[0] * x[1])
107
+
108
+ # find the closest aspect ratio to the target
109
+ target_aspect_ratio = find_closest_aspect_ratio(
110
+ aspect_ratio, target_ratios, orig_width, orig_height, image_size)
111
+
112
+ # calculate the target width and height
113
+ target_width = image_size * target_aspect_ratio[0]
114
+ target_height = image_size * target_aspect_ratio[1]
115
+ blocks = target_aspect_ratio[0] * target_aspect_ratio[1]
116
+
117
+ # resize the image
118
+ resized_img = image.resize((target_width, target_height))
119
+ processed_images = []
120
+ for i in range(blocks):
121
+ box = (
122
+ (i % (target_width // image_size)) * image_size,
123
+ (i // (target_width // image_size)) * image_size,
124
+ ((i % (target_width // image_size)) + 1) * image_size,
125
+ ((i // (target_width // image_size)) + 1) * image_size
126
+ )
127
+ # split the image
128
+ split_img = resized_img.crop(box)
129
+ processed_images.append(split_img)
130
+ assert len(processed_images) == blocks
131
+ if use_thumbnail and len(processed_images) != 1:
132
+ thumbnail_img = image.resize((image_size, image_size))
133
+ processed_images.append(thumbnail_img)
134
+ return processed_images
135
+
136
+
137
+ def load_image(image_file, input_size=448, max_num=12):
138
+ image = Image.open(image_file).convert('RGB')
139
+ transform = build_transform(input_size=input_size)
140
+ images = dynamic_preprocess(image, image_size=input_size, use_thumbnail=True, max_num=max_num)
141
+ pixel_values = [transform(image) for image in images]
142
+ pixel_values = torch.stack(pixel_values)
143
+ return pixel_values
144
+
145
+
146
+
147
+ pixel_values = load_image(f'{resource_path}image1.jpg', max_num=12).to(torch.bfloat16).cuda()
148
+ # breakpoint()
149
+ generation_config = dict(max_new_tokens=1024, do_sample=True)
150
+
151
+
152
+ # breakpoint()
153
+ question = 'Hello, who are you?'
154
+ response, history = model.chat(tokenizer, None, question, generation_config, history=None, return_history=True)
155
+ print(f'User: {question}\nAssistant: {response}')
156
+
157
+
158
+
159
+ # Multimodal inference tests
160
+ print("\n" + "="*80)
161
+ print("🧪 开始多模态推理测试")
162
+ print("="*80)
163
+
164
+ def test_inference(test_name, question, pixel_values_input=None, speech_input=None, speech_lengths_input=None, num_patches_list=None):
165
+ """统一的推理测试函数"""
166
+ print(f"\n{'='*60}")
167
+ print(f"🧪 测试: {test_name}")
168
+ print(f"📝 问题: {question}")
169
+ print(f"{'='*60}")
170
+
171
+ try:
172
+ # Prepare arguments
173
+ chat_kwargs = {
174
+ 'tokenizer': tokenizer,
175
+ 'pixel_values': pixel_values_input,
176
+ 'question': question,
177
+ 'generation_config': generation_config,
178
+ 'verbose': True
179
+ }
180
+
181
+ # If video data is present, add the num_patches_list argument
182
+ if num_patches_list is not None:
183
+ chat_kwargs['num_patches_list'] = num_patches_list
184
+
185
+ # If speech data is present, add the speech arguments
186
+ if speech_input is not None:
187
+ chat_kwargs.update({
188
+ 'speech': speech_input, # mel spectrogram, consumed by Whisper
189
+ 'speech_lengths': speech_lengths_input,
190
+ 'speech_wav': speech_wavs, # raw audio waveform, consumed by BEATs
191
+ })
192
+
193
+ # Run inference
194
+ # breakpoint()
195
+ response = model.chat(**chat_kwargs)
196
+
197
+ print(f"✅ 推理成功!")
198
+ print(f"🤖 回复: {response}")
199
+
200
+ return True, response
201
+
202
+ except Exception as e:
203
+ print(f"❌ 推理失败: {str(e)}")
204
+ import traceback
205
+ traceback.print_exc()
206
+ return False, str(e)
207
+
208
+ # Test 1: Pure Text (should work; uses the trained InternVL)
209
+ success1, response1 = test_inference(
210
+ test_name="Pure Text",
211
+ question="Hello, who are you? Please introduce yourself briefly.",
212
+ pixel_values_input=None,
213
+ speech_input=None,
214
+ speech_lengths_input=None
215
+ )
216
+
217
+ # 测试2: Text & Image - Visual only (应该正常,使用训练好的InternVL)
218
+ # success2, response2 = test_inference(
219
+ # test_name="Text & Image (Visual only)",
220
+ # question="<image>\nPlease describe this image in detail.",
221
+ # pixel_values_input=pixel_values,
222
+ # speech_input=None,
223
+ # speech_lengths_input=None
224
+ # )
225
+
226
+ print("\n" + "="*60)
227
+ print("🔄 准备Speech相关测试 (可能输出乱码,因为speech部分未训练)")
228
+ print("="*60)
229
+ def get_index(bound, fps, max_frame, first_idx=0, num_segments=32):
230
+ if bound:
231
+ start, end = bound[0], bound[1]
232
+ else:
233
+ start, end = -100000, 100000
234
+ start_idx = max(first_idx, round(start * fps))
235
+ end_idx = min(round(end * fps), max_frame)
236
+ seg_size = float(end_idx - start_idx) / num_segments
237
+ frame_indices = np.array([
238
+ int(start_idx + (seg_size / 2) + np.round(seg_size * idx))
239
+ for idx in range(num_segments)
240
+ ])
241
+ return frame_indices
242
+
243
+ def load_video(video_path, bound=None, input_size=448, max_num=1, num_segments=32):
244
+ vr = VideoReader(video_path, ctx=cpu(0), num_threads=1)
245
+ max_frame = len(vr) - 1
246
+ fps = float(vr.get_avg_fps())
247
+
248
+ pixel_values_list, num_patches_list = [], []
249
+ transform = build_transform(input_size=input_size)
250
+ frame_indices = get_index(bound, fps, max_frame, first_idx=0, num_segments=num_segments)
251
+ for frame_index in frame_indices:
252
+ img = Image.fromarray(vr[frame_index].asnumpy()).convert('RGB')
253
+ img = dynamic_preprocess(img, image_size=input_size, use_thumbnail=True, max_num=max_num)
254
+ pixel_values = [transform(tile) for tile in img]
255
+ pixel_values = torch.stack(pixel_values)
256
+ num_patches_list.append(pixel_values.shape[0])
257
+ pixel_values_list.append(pixel_values)
258
+ pixel_values = torch.cat(pixel_values_list)
259
+ return pixel_values, num_patches_list
260
+
261
+ def load_audio(audio_file_name):
262
+ """
263
+ Load an audio file using Ola-style mel-spectrogram preprocessing.
264
+ This stays consistent with the original Ola load_audio function.
265
+ """
266
+ speech_wav, samplerate = librosa.load(audio_file_name, sr=16000)
267
+ if len(speech_wav.shape) > 1:
268
+ speech_wav = speech_wav[:, 0]
269
+ speech_wav = speech_wav.astype(np.float32)
270
+ CHUNK_LIM = 480000
271
+ SAMPLE_RATE = 16000
272
+ speechs = []
273
+ speech_wavs = []
274
+
275
+ if len(speech_wav) <= CHUNK_LIM:
276
+ speech = whisper.pad_or_trim(speech_wav)
277
+ speech_wav_chunk = whisper.pad_or_trim(speech_wav)
278
+ speechs.append(speech)
279
+ speech_wavs.append(torch.from_numpy(speech_wav_chunk).unsqueeze(0))
280
+ else:
281
+ for i in range(0, len(speech_wav), CHUNK_LIM):
282
+ chunk = speech_wav[i : i + CHUNK_LIM]
283
+ if len(chunk) < CHUNK_LIM:
284
+ chunk = whisper.pad_or_trim(chunk)
285
+ speechs.append(chunk)
286
+ speech_wavs.append(torch.from_numpy(chunk).unsqueeze(0))
287
+
288
+ # Generate mel spectrograms
289
+ mels = []
290
+ for chunk in speechs:
291
+ chunk = whisper.log_mel_spectrogram(chunk, n_mels=128).permute(1, 0).unsqueeze(0)
292
+ mels.append(chunk)
293
+
294
+ mels = torch.cat(mels, dim=0)
295
+ speech_wavs = torch.cat(speech_wavs, dim=0)
296
+ if mels.shape[0] > 25:
297
+ mels = mels[:25]
298
+ speech_wavs = speech_wavs[:25]
299
+
300
+ speech_length = torch.LongTensor([mels.shape[1]] * mels.shape[0])
301
+ speech_chunks = torch.LongTensor([mels.shape[0]])
302
+
303
+ return mels, speech_length, speech_chunks, speech_wavs
304
+
305
+ def extract_audio(videos_file_path):
306
+ my_clip = mp.VideoFileClip(videos_file_path)
307
+ return my_clip.audio
308
+
309
+ # Load video data for the video tests
310
+ print("\n📥 加载视频数据...")
311
+ try:
312
+ video_path = f'{resource_path}red-panda.mp4'
313
+ if os.path.exists(video_path):
314
+ video_pixel_values, video_num_patches_list = load_video(video_path, num_segments=8, max_num=1)
315
+ video_pixel_values = video_pixel_values.to(torch.bfloat16).cuda()
316
+ video_loaded = True
317
+ print(f"✅ 视频加载成功:")
318
+ print(f" - 视频帧数: {len(video_num_patches_list)}")
319
+ print(f" - 视频像素值形状: {video_pixel_values.shape}")
320
+ print(f" - 每帧patch数: {video_num_patches_list}")
321
+ else:
322
+ print(f"⚠️ 视频文件不存在: {video_path}")
323
+ video_loaded = False
324
+ video_pixel_values = None
325
+ video_num_patches_list = None
326
+ except Exception as e:
327
+ print(f"❌ 视频加载失败: {e}")
328
+ video_loaded = False
329
+ video_pixel_values = None
330
+ video_num_patches_list = None
331
+
332
+
333
+
334
+ audio_path = f'/data1/cxy/dataset/english.mp3'
335
+
336
+ # Load audio data for the following tests
337
+ print("\n📥 加载音频数据...")
338
+ try:
339
+ # Load the audio file using Ola-style mel-spectrogram preprocessing
340
+ mels, speech_lengths, speech_chunks, speech_wavs = load_audio(audio_path)
341
+ print(f"✅ 音频加载成功:")
342
+ print(f" - mel谱图形状: {mels.shape}")
343
+ print(f" - 音频长度: {speech_lengths}")
344
+ print(f" - 音频块数: {speech_chunks}")
345
+ print(f" - 原始音频波形形状: {speech_wavs.shape}")
346
+
347
+ # Cast the audio tensors to the proper dtype and move them to the GPU
348
+ mels = mels.to(torch.bfloat16).cuda()
349
+ speech_lengths = speech_lengths.cuda()
350
+ speech_chunks = speech_chunks.cuda()
351
+ speech_wavs = speech_wavs.cuda()
352
+
353
+ audio_loaded = True
354
+
355
+ except Exception as e:
356
+ print(f"❌ 音频加载失败: {e}")
357
+ audio_loaded = False
358
+ mels = None
359
+ speech_lengths = None
360
+
361
+ # Test 3: Audio only (may be garbled; the speech part is untrained)
362
+ if audio_loaded:
363
+ success3, response3 = test_inference(
364
+ test_name="Audio only (预期乱码)",
365
+ question="<speech>\nPlease transcribe and summarize what you heard in the audio.",
366
+ pixel_values_input=None,
367
+ speech_input=mels,
368
+ speech_lengths_input=speech_lengths
369
+ )
370
+ else:
371
+ print("⚠️ 跳过Audio only测试 (音频加载失败)")
372
+ success3 = False
373
+
374
+ # # Test 4: Audio + Image (may be garbled; the speech part is untrained)
375
+ # if audio_loaded:
376
+ # success4, response4 = test_inference(
377
+ # test_name="Audio + Image (预期乱码)",
378
+ # question="<image>\nUser's question in speech: <speech>\n",
379
+ # pixel_values_input=pixel_values,
380
+ # speech_input=mels,
381
+ # speech_lengths_input=speech_lengths
382
+ # )
383
+ # else:
384
+ # print("⚠️ 跳过Audio + Image测试 (音频加载失败)")
385
+ # success4 = False
386
+
387
+ # Test 5: Video + Text (should work; uses the trained InternVL)
388
+ # if video_loaded:
389
+ #     # Build the video frame prefix
390
+ # video_prefix = ''.join([f'Frame{i+1}: <image>\n' for i in range(len(video_num_patches_list))])
391
+ # video_question = video_prefix + 'What is the red panda doing in this video? Please describe the actions and movements you observe.'
392
+
393
+ # success5, response5 = test_inference(
394
+ # test_name="Video + Text",
395
+ # question=video_question,
396
+ # pixel_values_input=video_pixel_values,
397
+ # speech_input=None,
398
+ # speech_lengths_input=None,
399
+ # num_patches_list=video_num_patches_list
400
+ # )
401
+ # else:
402
+ # print("⚠️ 跳过Video + Text测试 (视频加载失败)")
403
+ # success5 = False
404
+
405
+ # Test 5: Video + Audio (may be garbled; the speech part is untrained)
406
+ # if audio_loaded:
407
+ # success5, response5 = test_inference(
408
+ # test_name="Video + Audio (预期乱码)",
409
+ # question="<speech><image>\nDescribe what you hear and see in this content.",
410
+ # pixel_values_input=pixel_values,
411
+ # speech_input=mels,
412
+ # speech_lengths_input=speech_lengths
413
+ # )
414
+ # else:
415
+ # print("⚠️ 跳过Video + Audio测试 (音频加载失败)")
416
+ # success5 = False
417
+
418
+ # Test summary
419
+ print("\n" + "="*80)
420
+ print("📊 多模态推理测试总结")
421
+ print("="*80)
422
+
423
+ test_results = [
424
+ ("Pure Text", success1, "PASS", "应该正常 (训练好的InternVL)"),
425
+ # ("Text & Image", success2, "PASS", "应该正常 (训练好的InternVL)"),
426
+ # ("Video + Text", success5 if video_loaded else False, "PASS", "应该正常 (训练好的InternVL)"),
427
+ ("Audio only", success3 if audio_loaded else False, "GARBLED", "可能乱码 (speech未训练)"),
428
+ # ("Audio + Image", success4 if audio_loaded else False, "GARBLED", "可能乱码 (speech未训练)"),
429
+ ]
430
+
431
+ for test_name, success, expected, note in test_results:
432
+ status = "✅ PASS" if success else "❌ FAIL"
433
+ print(f"{status} {test_name:<15} (预期: {expected:<8}) - {note}")
434
+
435
+ passed = sum(1 for _, success, _, _ in test_results if success)
436
+ total = len(test_results)
437
+ print(f"\n📈 测试统计: {passed}/{total} 通过")
438
+
439
+ if passed >= 2: # 至少pure text、text&image、video+text中的2个应该通过
440
+ print("🎉 基础功能正常,Speech集成架构成功!")
441
+ print("💡 Speech相关测试如果输出乱码是正常的,因为speech部分还未训练")
442
+ if passed >= 3:
443
+ print("🌟 所有基础模态测试都通过了!")
444
+ else:
445
+ print("⚠️ 基础功能可能存在问题,需要进一步检查")
446
+
447
+ print("\n=== 多模态推理测试完成 ===")
448
+
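
Aside (illustrative, not one of the uploaded files): a standalone sketch of the frame sampling done by get_index() in inference/infer_ola_internvl.py, under the assumed inputs of a 10-second clip at 30 fps and num_segments=8; it simply picks the midpoint of each of eight equal segments.

import numpy as np

# Assumed example: max_frame = 299 (10 s at 30 fps), num_segments = 8, no bound given.
fps, max_frame, num_segments = 30.0, 299, 8
start_idx, end_idx = 0, max_frame
seg_size = float(end_idx - start_idx) / num_segments
frame_indices = np.array([
    int(start_idx + (seg_size / 2) + np.round(seg_size * idx))
    for idx in range(num_segments)
])
print(frame_indices)  # [ 18  55  93 130 168 205 242 280], one index per segment midpoint
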
inference/infer_ola_internvl_audio.py ADDED
@@ -0,0 +1,244 @@
1
+ import os
2
+
3
+ os.environ['LOWRES_RESIZE'] = '384x32'
4
+ os.environ['HIGHRES_BASE'] = '0x32'
5
+ os.environ['VIDEO_RESIZE'] = "0x64"
6
+ os.environ['VIDEO_MAXRES'] = "480"
7
+ os.environ['VIDEO_MINRES'] = "288"
8
+ os.environ['MAXRES'] = '1536'
9
+ os.environ['MINRES'] = '0'
10
+ os.environ['FORCE_NO_DOWNSAMPLE'] = '1'
11
+ os.environ['LOAD_VISION_EARLY'] = '1'
12
+ os.environ['PAD2STRIDE'] = '1'
13
+ os.environ['CUDA_LAUNCH_BLOCKING'] = '1'
14
+ import os
15
+ import sys
16
+ from pathlib import Path
17
+ import math
18
+ import numpy as np
19
+ import torch
20
+ import torchvision.transforms as T
21
+ from decord import VideoReader, cpu # temporarily commented out, focusing on speech functionality testing
22
+ from PIL import Image
23
+ from torchvision.transforms.functional import InterpolationMode
24
+ from transformers import AutoModel, AutoTokenizer
25
+ from contextlib import redirect_stdout
26
+ import io
27
+ import librosa
28
+ import whisper
29
+ import moviepy as mp
30
+ import torch
31
+ from transformers import AutoTokenizer, AutoConfig, AutoModel
32
+
33
+
34
+ IMAGENET_MEAN = (0.485, 0.456, 0.406)
35
+ IMAGENET_STD = (0.229, 0.224, 0.225)
36
+ import gradio as gr
37
+ import torch
38
+ import re
39
+ from decord import VideoReader, cpu
40
+ from PIL import Image
41
+ import numpy as np
42
+ import transformers
43
+ import moviepy as mp
44
+ from typing import Dict, Optional, Sequence, List
45
+ import librosa
46
+ import whisper
47
+ from ola.conversation import conv_templates, SeparatorStyle
48
+ from ola.model.builder import load_pretrained_model
49
+ from ola.datasets.preprocess import tokenizer_image_token, tokenizer_speech_image_token, tokenizer_speech_question_image_token, tokenizer_speech_token
50
+ from ola.mm_utils import KeywordsStoppingCriteria, process_anyres_video, process_anyres_highres_image
51
+ from ola.constants import IGNORE_INDEX, DEFAULT_IMAGE_TOKEN, IMAGE_TOKEN_INDEX, DEFAULT_SPEECH_TOKEN, SPEECH_TOKEN_INDEX
52
+ import argparse
53
+
54
+ parser = argparse.ArgumentParser()
55
+ parser.add_argument('--model_path', type=str, default='/data1/cxy/plm-v/modeling/internvl3_5-2B')
56
+ parser.add_argument('--text', type=str, default="What does the speech say?")
57
+ parser.add_argument('--audio_path', type=str, default=None)
58
+ parser.add_argument('--image_path', type=str, default=None)
59
+ parser.add_argument('--video_path', type=str, default=None)
60
+ args = parser.parse_args()
61
+
62
+ model_path = args.model_path
63
+ tokenizer, model, image_processor, _ = load_pretrained_model(model_path,'ola_internvl', None)
64
+ model = model.to('cuda').eval()
65
+ model = model.bfloat16()
66
+
67
+ resource_path = "/data1/cxy/plm-v/modeling/example/"
68
+
69
+ generation_config = dict(
70
+ max_new_tokens=256,
71
+ do_sample=False, # Use greedy decoding to avoid sampling issues
72
+ temperature=0.5,
73
+ top_p=0.8,
74
+ top_k=10,
75
+ )
76
+
77
+ # Multimodal inference tests
78
+ print("\n" + "="*80)
79
+ print("🧪 开始多模态推理测试")
80
+ print("="*80)
81
+
82
+ def test_inference(test_name, question, pixel_values_input=None, speech_input=None, speech_lengths_input=None, speech_wavs_input=None, speech_chunks_input=None, num_patches_list=None):
83
+ """统一的推理测试函数"""
84
+ print(f"\n{'='*60}")
85
+ print(f"🧪 测试: {test_name}")
86
+ print(f"📝 问题: {question}")
87
+ print(f"{'='*60}")
88
+
89
+ try:
90
+ # Prepare arguments
91
+ chat_kwargs = {
92
+ 'tokenizer': tokenizer,
93
+ 'pixel_values': pixel_values_input,
94
+ 'question': question,
95
+ 'generation_config': generation_config,
96
+ 'verbose': True
97
+ }
98
+
99
+ # If video data is present, add the num_patches_list argument
100
+ if num_patches_list is not None:
101
+ chat_kwargs['num_patches_list'] = num_patches_list
102
+
103
+ # If speech data is present, add the speech arguments
104
+ if speech_input is not None:
105
+ chat_kwargs.update({
106
+ 'speech': speech_input, # mel spectrogram, consumed by Whisper
107
+ 'speech_lengths': speech_lengths_input,
108
+ 'speech_wav': speech_wavs_input, # raw audio waveform, consumed by BEATs
109
+ })
110
+
111
+ # If speech_chunks data is present, add the speech_chunks argument
112
+ if speech_chunks_input is not None:
113
+ chat_kwargs['speech_chunks'] = speech_chunks_input
114
+
115
+ # Run inference
116
+ # breakpoint()
117
+ response = model.chat(**chat_kwargs)
118
+
119
+ print(f"✅ 推理成功!")
120
+ print(f"🤖 回复: {response}")
121
+
122
+ return True, response
123
+
124
+ except Exception as e:
125
+ print(f"❌ 推理失败: {str(e)}")
126
+ import traceback
127
+ traceback.print_exc()
128
+ return False, str(e)
129
+
130
+ # success1, response1 = test_inference(
131
+ # test_name="Pure Text",
132
+ # question="What is China's capital? Please introduce the city in detail.",
133
+ # pixel_values_input=None,
134
+ # speech_input=None,
135
+ # speech_lengths_input=None
136
+ # )
137
+
138
+ print("\n" + "="*60)
139
+ print("🔄 准备Speech相关测试 (可能输出乱码,因为speech部分未训练)")
140
+ print("="*60)
141
+
142
+
143
+ def load_audio(audio_file_name):
144
+ speech_wav, samplerate = librosa.load(audio_file_name, sr=16000)
145
+ if len(speech_wav.shape) > 1:
146
+ speech_wav = speech_wav[:, 0]
147
+ speech_wav = speech_wav.astype(np.float32)
148
+ CHUNK_LIM = 480000
149
+ SAMPLE_RATE = 16000
150
+ speechs = []
151
+ speech_wavs = []
152
+
153
+ if len(speech_wav) <= CHUNK_LIM:
154
+ speech = whisper.pad_or_trim(speech_wav)
155
+ speech_wav = whisper.pad_or_trim(speech_wav)
156
+ speechs.append(speech)
157
+ speech_wavs.append(torch.from_numpy(speech_wav).unsqueeze(0))
158
+ else:
159
+ for i in range(0, len(speech_wav), CHUNK_LIM):
160
+ chunk = speech_wav[i : i + CHUNK_LIM]
161
+ if len(chunk) < CHUNK_LIM:
162
+ chunk = whisper.pad_or_trim(chunk)
163
+ speechs.append(chunk)
164
+ speech_wavs.append(torch.from_numpy(chunk).unsqueeze(0))
165
+ mels = []
166
+ for chunk in speechs:
167
+ chunk = whisper.log_mel_spectrogram(chunk, n_mels=128).permute(1, 0).unsqueeze(0)
168
+ mels.append(chunk)
169
+
170
+ mels = torch.cat(mels, dim=0)
171
+ speech_wavs = torch.cat(speech_wavs, dim=0)
172
+ if mels.shape[0] > 25:
173
+ mels = mels[:25]
174
+ speech_wavs = speech_wavs[:25]
175
+
176
+ speech_length = torch.LongTensor([mels.shape[1]] * mels.shape[0])
177
+ speech_chunks = torch.LongTensor([mels.shape[0]])
178
+ return mels, speech_length, speech_chunks, speech_wavs
179
+
180
+
181
+
182
+
183
+ audio_path = f'/data1/cxy/dataset/english.mp3'
184
+
185
+ # Load audio data for the following tests
186
+ print("\n📥 加载音频数据...")
187
+ try:
188
+ # Load the audio file using Ola-style mel-spectrogram preprocessing
189
+ speech, speech_lengths, speech_chunks, speech_wavs = load_audio(audio_path)
190
+ print(f"✅ 音频加载成功:")
191
+ print(f" - mel谱图形状: {speech.shape}")
192
+ print(f" - 音频长度: {speech_lengths}")
193
+ print(f" - 音频块数: {speech_chunks}")
194
+ print(f" - 原始音频波形形状: {speech_wavs.shape}")
195
+
196
+ # Cast the audio tensors to the proper dtype and move them to the GPU
197
+ speech = speech.to(torch.bfloat16).cuda()
198
+ speech_lengths = speech_lengths.cuda()
199
+ speech_chunks = speech_chunks.cuda()
200
+ speech_wavs = speech_wavs.cuda()
201
+
202
+ audio_loaded = True
203
+
204
+ except Exception as e:
205
+ print(f"❌ 音频加载失败: {e}")
206
+ audio_loaded = False
207
+ mels = None
208
+ speech_lengths = None
209
+
210
+ # Test 3: Audio only (may be garbled; the speech part is untrained)
211
+ if audio_loaded:
212
+ success3, response3 = test_inference(
213
+ test_name="Audio only (预期乱码)",
214
+ question="<speech>\nPlease transcribe and summarize what you heard in the audio.",
215
+ pixel_values_input=None,
216
+ speech_input=speech,
217
+ speech_lengths_input=speech_lengths,
218
+ speech_wavs_input=speech_wavs,
219
+ speech_chunks_input=speech_chunks
220
+ )
221
+ else:
222
+ print("⚠️ 跳过Audio only测试 (音频加载失败)")
223
+ success3 = False
224
+
225
+
226
+ # Test summary
227
+ print("\n" + "="*80)
228
+ print("📊 多模态推理测试总结")
229
+ print("="*80)
230
+
231
+ test_results = [
232
+ ("Audio only", success3 if audio_loaded else False, "GARBLED", "可能乱码 (speech未训练)"),
233
+ ]
234
+
235
+ for test_name, success, expected, note in test_results:
236
+ status = "✅ PASS" if success else "❌ FAIL"
237
+ print(f"{status} {test_name:<15} (预期: {expected:<8}) - {note}")
238
+
239
+ passed = sum(1 for _, success, _, _ in test_results if success)
240
+ total = len(test_results)
241
+ print(f"\n📈 测试统计: {passed}/{total} 通过")
242
+
243
+ print("\n=== 多模态推理测试完成 ===")
244
+
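
Aside on the decoding setup in this script (an assumption about the chat wrapper, not part of the uploaded files): if model.chat() forwards generation_config to a standard transformers generate() call, then with do_sample=False the temperature/top_p/top_k entries are effectively inert and decoding is greedy; a minimal equivalent config is sketched below.

# Minimal sketch under that assumption: greedy decoding only needs max_new_tokens.
generation_config = dict(
    max_new_tokens=256,
    do_sample=False,  # greedy decoding; sampling knobs are ignored unless do_sample=True
)
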
inference/infer_ola_internvl_audio_ckpt.py ADDED
@@ -0,0 +1,245 @@
1
+ import os
2
+
3
+ os.environ['LOWRES_RESIZE'] = '384x32'
4
+ os.environ['HIGHRES_BASE'] = '0x32'
5
+ os.environ['VIDEO_RESIZE'] = "0x64"
6
+ os.environ['VIDEO_MAXRES'] = "480"
7
+ os.environ['VIDEO_MINRES'] = "288"
8
+ os.environ['MAXRES'] = '1536'
9
+ os.environ['MINRES'] = '0'
10
+ os.environ['FORCE_NO_DOWNSAMPLE'] = '1'
11
+ os.environ['LOAD_VISION_EARLY'] = '1'
12
+ os.environ['PAD2STRIDE'] = '1'
13
+ os.environ['CUDA_LAUNCH_BLOCKING'] = '1'
14
+ import os
15
+ import sys
16
+ from pathlib import Path
17
+ import math
18
+ import numpy as np
19
+ import torch
20
+ import torchvision.transforms as T
21
+ from decord import VideoReader, cpu # temporarily commented out, focusing on speech functionality testing
22
+ from PIL import Image
23
+ from torchvision.transforms.functional import InterpolationMode
24
+ from transformers import AutoModel, AutoTokenizer
25
+ from contextlib import redirect_stdout
26
+ import io
27
+ import librosa
28
+ import whisper
29
+ import moviepy as mp
30
+ import torch
31
+ from transformers import AutoTokenizer, AutoConfig, AutoModel
32
+
33
+
34
+ IMAGENET_MEAN = (0.485, 0.456, 0.406)
35
+ IMAGENET_STD = (0.229, 0.224, 0.225)
36
+ import gradio as gr
37
+ import torch
38
+ import re
39
+ from decord import VideoReader, cpu
40
+ from PIL import Image
41
+ import numpy as np
42
+ import transformers
43
+ import moviepy as mp
44
+ from typing import Dict, Optional, Sequence, List
45
+ import librosa
46
+ import whisper
47
+ from ola.conversation import conv_templates, SeparatorStyle
48
+ from ola.model.builder import load_pretrained_model
49
+ from ola.datasets.preprocess import tokenizer_image_token, tokenizer_speech_image_token, tokenizer_speech_question_image_token, tokenizer_speech_token
50
+ from ola.mm_utils import KeywordsStoppingCriteria, process_anyres_video, process_anyres_highres_image
51
+ from ola.constants import IGNORE_INDEX, DEFAULT_IMAGE_TOKEN, IMAGE_TOKEN_INDEX, DEFAULT_SPEECH_TOKEN, SPEECH_TOKEN_INDEX
52
+ import argparse
53
+
54
+ parser = argparse.ArgumentParser()
55
+ parser.add_argument('--model_path', type=str, default='/data1/cxy/plm-v/modeling/internvl3_5-2B')
56
+ parser.add_argument('--text', type=str, default="Give the caption of the given audio or speech.")
57
+ parser.add_argument('--audio_path', type=str, default=None)
58
+ parser.add_argument('--image_path', type=str, default=None)
59
+ parser.add_argument('--video_path', type=str, default=None)
60
+ args = parser.parse_args()
61
+
62
+ model_path = args.model_path
63
+ tokenizer, model, image_processor, _ = load_pretrained_model(model_path,'ola_internvl', None)
64
+ model = model.to('cuda').eval()
65
+ model = model.bfloat16()
66
+
67
+ resource_path = "/data1/cxy/plm-v/modeling/example/"
68
+
69
+ generation_config = dict(
70
+ max_new_tokens=256,
71
+ do_sample=False, # Use greedy decoding to avoid sampling issues
72
+ temperature=0.5,
73
+ top_p=0.8,
74
+ top_k=10,
75
+ )
76
+
77
+ # Multimodal inference tests
78
+ print("\n" + "="*80)
79
+ print("🧪 开始多模态推理测试")
80
+ print("="*80)
81
+
82
+ def test_inference(test_name, question, pixel_values_input=None, speech_input=None, speech_lengths_input=None, speech_wavs_input=None, speech_chunks_input=None, num_patches_list=None):
83
+ """统一的推理测试函数"""
84
+ print(f"\n{'='*60}")
85
+ print(f"🧪 测试: {test_name}")
86
+ print(f"📝 问题: {question}")
87
+ print(f"{'='*60}")
88
+
89
+ try:
90
+ # Prepare arguments
91
+ chat_kwargs = {
92
+ 'tokenizer': tokenizer,
93
+ 'pixel_values': pixel_values_input,
94
+ 'question': question,
95
+ 'generation_config': generation_config,
96
+ 'verbose': True
97
+ }
98
+
99
+ # If video data is present, add the num_patches_list argument
100
+ if num_patches_list is not None:
101
+ chat_kwargs['num_patches_list'] = num_patches_list
102
+
103
+ # If speech data is present, add the speech arguments
104
+ if speech_input is not None:
105
+ chat_kwargs.update({
106
+ 'speech': speech_input, # mel spectrogram, consumed by Whisper
107
+ 'speech_lengths': speech_lengths_input,
108
+ 'speech_wav': speech_wavs_input, # raw audio waveform, consumed by BEATs
109
+ })
110
+
111
+ # If speech_chunks data is present, add the speech_chunks argument
112
+ if speech_chunks_input is not None:
113
+ chat_kwargs['speech_chunks'] = speech_chunks_input
114
+
115
+ # Run inference
116
+ # breakpoint()
117
+ response = model.chat(**chat_kwargs)
118
+
119
+ print(f"✅ 推理成功!")
120
+ print(f"🤖 回复: {response}")
121
+
122
+ return True, response
123
+
124
+ except Exception as e:
125
+ print(f"❌ 推理失败: {str(e)}")
126
+ import traceback
127
+ traceback.print_exc()
128
+ return False, str(e)
129
+
130
+ # success1, response1 = test_inference(
131
+ # test_name="Pure Text",
132
+ # question="What is China's capital? Please introduce the city in detail.",
133
+ # pixel_values_input=None,
134
+ # speech_input=None,
135
+ # speech_lengths_input=None
136
+ # )
137
+
138
+ print("\n" + "="*60)
139
+ print("🔄 准备Speech相关测试 (可能输出乱码,因为speech部分未训练)")
140
+ print("="*60)
141
+
142
+
143
+ def load_audio(audio_file_name):
144
+ speech_wav, samplerate = librosa.load(audio_file_name, sr=16000)
145
+ if len(speech_wav.shape) > 1:
146
+ speech_wav = speech_wav[:, 0]
147
+ speech_wav = speech_wav.astype(np.float32)
148
+ CHUNK_LIM = 480000
149
+ SAMPLE_RATE = 16000
150
+ speechs = []
151
+ speech_wavs = []
152
+
153
+ if len(speech_wav) <= CHUNK_LIM:
154
+ speech = whisper.pad_or_trim(speech_wav)
155
+ speech_wav = whisper.pad_or_trim(speech_wav)
156
+ speechs.append(speech)
157
+ speech_wavs.append(torch.from_numpy(speech_wav).unsqueeze(0))
158
+ else:
159
+ for i in range(0, len(speech_wav), CHUNK_LIM):
160
+ chunk = speech_wav[i : i + CHUNK_LIM]
161
+ if len(chunk) < CHUNK_LIM:
162
+ chunk = whisper.pad_or_trim(chunk)
163
+ speechs.append(chunk)
164
+ speech_wavs.append(torch.from_numpy(chunk).unsqueeze(0))
165
+ mels = []
166
+ for chunk in speechs:
167
+ chunk = whisper.log_mel_spectrogram(chunk, n_mels=128).permute(1, 0).unsqueeze(0)
168
+ mels.append(chunk)
169
+
170
+ mels = torch.cat(mels, dim=0)
171
+ speech_wavs = torch.cat(speech_wavs, dim=0)
172
+ if mels.shape[0] > 25:
173
+ mels = mels[:25]
174
+ speech_wavs = speech_wavs[:25]
175
+
176
+ speech_length = torch.LongTensor([mels.shape[1]] * mels.shape[0])
177
+ speech_chunks = torch.LongTensor([mels.shape[0]])
178
+ return mels, speech_length, speech_chunks, speech_wavs
179
+
180
+
181
+
182
+
183
+ # audio_path = f'/data1/cxy/dataset/english.mp3'
184
+ audio_path = "/data1/cxy/plm-v/modeling/data/Clotho/train/Leaves rustling.wav"
185
+
186
+ # Load audio data for the following tests
187
+ print("\n📥 加载音频数据...")
188
+ try:
189
+ # Load the audio file using Ola-style mel-spectrogram preprocessing
190
+ speech, speech_lengths, speech_chunks, speech_wavs = load_audio(audio_path)
191
+ print(f"✅ 音频加载成功:")
192
+ print(f" - mel谱图形状: {speech.shape}")
193
+ print(f" - 音频长度: {speech_lengths}")
194
+ print(f" - 音频块数: {speech_chunks}")
195
+ print(f" - 原始音频波形形状: {speech_wavs.shape}")
196
+
197
+ # Cast the audio tensors to the proper dtype and move them to the GPU
198
+ speech = speech.to(torch.bfloat16).cuda()
199
+ speech_lengths = speech_lengths.cuda()
200
+ speech_chunks = speech_chunks.cuda()
201
+ speech_wavs = speech_wavs.cuda()
202
+
203
+ audio_loaded = True
204
+
205
+ except Exception as e:
206
+ print(f"❌ 音频加载失败: {e}")
207
+ audio_loaded = False
208
+ mels = None
209
+ speech_lengths = None
210
+
211
+ # Test 3: Audio only (may be garbled; the speech part is untrained)
212
+ if audio_loaded:
213
+ success3, response3 = test_inference(
214
+ test_name="Audio only (预期乱码)",
215
+ question="<speech>\nGive the caption of the given audio or speech.",
216
+ pixel_values_input=None,
217
+ speech_input=speech,
218
+ speech_lengths_input=speech_lengths,
219
+ speech_wavs_input=speech_wavs,
220
+ speech_chunks_input=speech_chunks
221
+ )
222
+ else:
223
+ print("⚠️ 跳过Audio only测试 (音频加载失败)")
224
+ success3 = False
225
+
226
+
227
+ # Test summary
228
+ print("\n" + "="*80)
229
+ print("📊 多模态推理测试总结")
230
+ print("="*80)
231
+
232
+ test_results = [
233
+ ("Audio only", success3 if audio_loaded else False, "GARBLED", "可能乱码 (speech未训练)"),
234
+ ]
235
+
236
+ for test_name, success, expected, note in test_results:
237
+ status = "✅ PASS" if success else "❌ FAIL"
238
+ print(f"{status} {test_name:<15} (预期: {expected:<8}) - {note}")
239
+
240
+ passed = sum(1 for _, success, _, _ in test_results if success)
241
+ total = len(test_results)
242
+ print(f"\n📈 测试统计: {passed}/{total} 通过")
243
+
244
+ print("\n=== 多模态推理测试完成 ===")
245
+
inference/infer_ola_internvl_copy.py ADDED
@@ -0,0 +1,318 @@
1
+ import os
2
+
3
+ os.environ['LOWRES_RESIZE'] = '384x32'
4
+ os.environ['HIGHRES_BASE'] = '0x32'
5
+ os.environ['VIDEO_RESIZE'] = "0x64"
6
+ os.environ['VIDEO_MAXRES'] = "480"
7
+ os.environ['VIDEO_MINRES'] = "288"
8
+ os.environ['MAXRES'] = '1536'
9
+ os.environ['MINRES'] = '0'
10
+ os.environ['FORCE_NO_DOWNSAMPLE'] = '1'
11
+ os.environ['LOAD_VISION_EARLY'] = '1'
12
+ os.environ['PAD2STRIDE'] = '1'
13
+
14
+ import gradio as gr
15
+ import torch
16
+ import re
17
+ from decord import VideoReader, cpu
18
+ from PIL import Image
19
+ import numpy as np
20
+ import transformers
21
+ import moviepy as mp
22
+ from typing import Dict, Optional, Sequence, List
23
+ import librosa
24
+ import whisper
25
+ from ola.conversation import conv_templates, SeparatorStyle
26
+ from ola.model.builder import load_pretrained_model
27
+ from ola.datasets.preprocess import tokenizer_image_token, tokenizer_speech_image_token, tokenizer_speech_question_image_token, tokenizer_speech_token
28
+ from ola.mm_utils import KeywordsStoppingCriteria, process_anyres_video, process_anyres_highres_image
29
+ from ola.constants import IGNORE_INDEX, DEFAULT_IMAGE_TOKEN, IMAGE_TOKEN_INDEX, DEFAULT_SPEECH_TOKEN, SPEECH_TOKEN_INDEX
30
+ import argparse
31
+
32
+ parser = argparse.ArgumentParser()
33
+ parser.add_argument('--model_path', type=str, default='/data1/cxy/plm-v/modeling/internvl3_5-2B')
34
+ parser.add_argument('--text', type=str, default="What does the speech say?")
35
+ parser.add_argument('--audio_path', type=str, default=None)
36
+ parser.add_argument('--image_path', type=str, default=None)
37
+ parser.add_argument('--video_path', type=str, default=None)
38
+ args = parser.parse_args()
39
+
40
+ model_path = args.model_path
41
+ tokenizer, model, image_processor, _ = load_pretrained_model(model_path,'ola_internvl', None)
42
+ breakpoint()
43
+ model = model.to('cuda').eval()
44
+ model = model.bfloat16()
45
+ # breakpoint()
46
+ USE_SPEECH=False
47
+ cur_dir = os.path.dirname(os.path.abspath(__file__))
48
+
49
+ def load_audio(audio_file_name):
50
+ speech_wav, samplerate = librosa.load(audio_file_name, sr=16000)
51
+ if len(speech_wav.shape) > 1:
52
+ speech_wav = speech_wav[:, 0]
53
+ speech_wav = speech_wav.astype(np.float32)
54
+ CHUNK_LIM = 480000
55
+ SAMPLE_RATE = 16000
56
+ speechs = []
57
+ speech_wavs = []
58
+
59
+ if len(speech_wav) <= CHUNK_LIM:
60
+ speech = whisper.pad_or_trim(speech_wav)
61
+ speech_wav = whisper.pad_or_trim(speech_wav)
62
+ speechs.append(speech)
63
+ speech_wavs.append(torch.from_numpy(speech_wav).unsqueeze(0))
64
+ else:
65
+ for i in range(0, len(speech_wav), CHUNK_LIM):
66
+ chunk = speech_wav[i : i + CHUNK_LIM]
67
+ if len(chunk) < CHUNK_LIM:
68
+ chunk = whisper.pad_or_trim(chunk)
69
+ speechs.append(chunk)
70
+ speech_wavs.append(torch.from_numpy(chunk).unsqueeze(0))
71
+ mels = []
72
+ for chunk in speechs:
73
+ chunk = whisper.log_mel_spectrogram(chunk, n_mels=128).permute(1, 0).unsqueeze(0)
74
+ mels.append(chunk)
75
+
76
+ mels = torch.cat(mels, dim=0)
77
+ speech_wavs = torch.cat(speech_wavs, dim=0)
78
+ if mels.shape[0] > 25:
79
+ mels = mels[:25]
80
+ speech_wavs = speech_wavs[:25]
81
+
82
+ speech_length = torch.LongTensor([mels.shape[1]] * mels.shape[0])
83
+ speech_chunks = torch.LongTensor([mels.shape[0]])
84
+ return mels, speech_length, speech_chunks, speech_wavs
85
+
86
+ def extract_audio(videos_file_path):
87
+ my_clip = mp.VideoFileClip(videos_file_path)
88
+ return my_clip.audio
89
+
90
+ image_path = args.image_path
91
+ audio_path = args.audio_path
92
+ video_path = args.video_path
93
+ text = args.text
94
+ modality = "text"
95
+ if video_path is not None:
96
+ modality = "video"
97
+ visual = video_path
98
+ assert image_path is None
99
+
100
+ elif image_path is not None:
101
+ visual = image_path
102
+ modality = "image"
103
+ assert video_path is None
104
+
105
+ elif audio_path is not None:
106
+ modality = "text"
107
+
108
+
109
+ # input audio and video, do not parse audio in the video, else parse audio in the video
110
+ if audio_path:
111
+ USE_SPEECH = True
112
+ elif modality == "video":
113
+ USE_SPEECH = True
114
+ else:
115
+ USE_SPEECH = False
116
+
117
+ speechs = []
118
+ speech_lengths = []
119
+ speech_wavs = []
120
+ speech_chunks = []
121
+ if modality == "video":
122
+ vr = VideoReader(visual, ctx=cpu(0))
123
+ total_frame_num = len(vr)
124
+ fps = round(vr.get_avg_fps())
125
+ uniform_sampled_frames = np.linspace(0, total_frame_num - 1, 64, dtype=int)
126
+ frame_idx = uniform_sampled_frames.tolist()
127
+ spare_frames = vr.get_batch(frame_idx).asnumpy()
128
+ video = [Image.fromarray(frame) for frame in spare_frames]
129
+ elif modality == "image":
130
+ image = [Image.open(visual)]
131
+ image_sizes = [image[0].size]
132
+ else:
133
+ images = [torch.zeros(1, 3, 224, 224).to(dtype=torch.bfloat16, device='cuda', non_blocking=True)]
134
+ images_highres = [torch.zeros(1, 3, 224, 224).to(dtype=torch.bfloat16, device='cuda', non_blocking=True)]
135
+ image_sizes = [(224, 224)]
136
+
137
+
138
+ if USE_SPEECH and audio_path:
139
+ audio_path = audio_path
140
+ speech, speech_length, speech_chunk, speech_wav = load_audio(audio_path)
141
+ speechs.append(speech.bfloat16().to('cuda'))
142
+ speech_lengths.append(speech_length.to('cuda'))
143
+ speech_chunks.append(speech_chunk.to('cuda'))
144
+ speech_wavs.append(speech_wav.to('cuda'))
145
+ print('load audio')
146
+ elif USE_SPEECH and not audio_path:
147
+ # extract the audio track from the video
148
+ audio = extract_audio(visual)
149
+ audio.write_audiofile("./video_audio.wav")
150
+ video_audio_path = './video_audio.wav'
151
+ speech, speech_length, speech_chunk, speech_wav = load_audio(video_audio_path)
152
+ speechs.append(speech.bfloat16().to('cuda'))
153
+ speech_lengths.append(speech_length.to('cuda'))
154
+ speech_chunks.append(speech_chunk.to('cuda'))
155
+ speech_wavs.append(speech_wav.to('cuda'))
156
+ else:
157
+ speechs = [torch.zeros(1, 3000, 128).bfloat16().to('cuda')]
158
+ speech_lengths = [torch.LongTensor([3000]).to('cuda')]
159
+ speech_wavs = [torch.zeros([1, 480000]).to('cuda')]
160
+ speech_chunks = [torch.LongTensor([1]).to('cuda')]
161
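+ # When no real audio is used, silent placeholders are passed instead: a zero
+ # log-mel of shape (1, 3000, 128) (one 30 s Whisper window) and a matching
+ # 480000-sample zero waveform.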
+
162
+ conv_mode = "qwen_1_5"
163
+ if text:
164
+ qs = text
165
+ else:
166
+ qs = ''
167
+
168
+ if USE_SPEECH and audio_path and image_path: # image + speech instruction
169
+ qs = DEFAULT_IMAGE_TOKEN + "\n" + "User's question in speech: " + DEFAULT_SPEECH_TOKEN + '\n'
170
+ elif USE_SPEECH and video_path: # video + audio
171
+ qs = DEFAULT_SPEECH_TOKEN + DEFAULT_IMAGE_TOKEN + "\n" + qs
172
+ elif USE_SPEECH and audio_path: # audio + text
173
+ qs = DEFAULT_SPEECH_TOKEN + "\n" + qs
174
+ elif image_path or video_path: # image / video
175
+ qs = DEFAULT_IMAGE_TOKEN + "\n" + qs
176
+ elif text: # text
177
+ qs = qs
178
+
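+ # The DEFAULT_IMAGE_TOKEN / DEFAULT_SPEECH_TOKEN placeholders inserted above are
+ # expanded into the special IMAGE_TOKEN_INDEX / SPEECH_TOKEN_INDEX ids by the
+ # tokenizer_* helpers below, chosen according to the active modality mix.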
179
+ conv = conv_templates[conv_mode].copy()
180
+ conv.append_message(conv.roles[0], qs)
181
+ conv.append_message(conv.roles[1], None)
182
+ prompt = conv.get_prompt()
183
+ if USE_SPEECH and audio_path and image_path: # image + speech instruction
184
+ input_ids = tokenizer_speech_question_image_token(prompt, tokenizer, IMAGE_TOKEN_INDEX, return_tensors="pt").unsqueeze(0).to('cuda')
185
+ elif USE_SPEECH and video_path: # video + audio
186
+ input_ids = tokenizer_speech_image_token(prompt, tokenizer, IMAGE_TOKEN_INDEX, return_tensors="pt").unsqueeze(0).to('cuda')
187
+ elif USE_SPEECH and audio_path: # audio + text
188
+ # breakpoint()
189
+ input_ids = tokenizer_speech_token(prompt, tokenizer, SPEECH_TOKEN_INDEX, return_tensors="pt").unsqueeze(0).to('cuda')
190
+ else:
191
+ input_ids = tokenizer_image_token(prompt, tokenizer, IMAGE_TOKEN_INDEX, return_tensors="pt").unsqueeze(0).to('cuda')
192
+
193
+ if modality == "video":
194
+ video_processed = []
195
+ for idx, frame in enumerate(video):
196
+ image_processor.do_resize = False
197
+ image_processor.do_center_crop = False
198
+ frame = process_anyres_video(frame, image_processor)
199
+
200
+ if frame_idx is not None and idx in frame_idx:
201
+ video_processed.append(frame.unsqueeze(0))
202
+ elif frame_idx is None:
203
+ video_processed.append(frame.unsqueeze(0))
204
+
205
+ if frame_idx is None:
206
+ frame_idx = np.arange(0, len(video_processed), dtype=int).tolist()
207
+
208
+ video_processed = torch.cat(video_processed, dim=0).bfloat16().to("cuda")
209
+ video_processed = (video_processed, video_processed)
210
+
211
+ video_data = (video_processed, (384, 384), "video")
212
+ elif modality == "image":
213
+ image_processor.do_resize = False
214
+ image_processor.do_center_crop = False
215
+ image_tensor, image_highres_tensor = [], []
216
+ for visual in image:
217
+ image_tensor_, image_highres_tensor_ = process_anyres_highres_image(visual, image_processor)
218
+ image_tensor.append(image_tensor_)
219
+ image_highres_tensor.append(image_highres_tensor_)
220
+ if all(x.shape == image_tensor[0].shape for x in image_tensor):
221
+ image_tensor = torch.stack(image_tensor, dim=0)
222
+ if all(x.shape == image_highres_tensor[0].shape for x in image_highres_tensor):
223
+ image_highres_tensor = torch.stack(image_highres_tensor, dim=0)
224
+ if type(image_tensor) is list:
225
+ image_tensor = [_image.bfloat16().to("cuda") for _image in image_tensor]
226
+ else:
227
+ image_tensor = image_tensor.bfloat16().to("cuda")
228
+ if type(image_highres_tensor) is list:
229
+ image_highres_tensor = [_image.bfloat16().to("cuda") for _image in image_highres_tensor]
230
+ else:
231
+ image_highres_tensor = image_highres_tensor.bfloat16().to("cuda")
232
+
233
+ pad_token_ids = 151643
234
+
235
+ attention_masks = input_ids.ne(pad_token_ids).long().to('cuda')
236
+ stop_str = conv.sep if conv.sep_style != SeparatorStyle.TWO else conv.sep2
237
+ keywords = [stop_str]
238
+ stopping_criteria = KeywordsStoppingCriteria(keywords, tokenizer, input_ids)
239
+
240
+ gen_kwargs = {}
241
+
242
+ if "max_new_tokens" not in gen_kwargs:
243
+ gen_kwargs["max_new_tokens"] = 1024
244
+ if "temperature" not in gen_kwargs:
245
+ gen_kwargs["temperature"] = 0.2
246
+ if "top_p" not in gen_kwargs:
247
+ gen_kwargs["top_p"] = None
248
+ if "num_beams" not in gen_kwargs:
249
+ gen_kwargs["num_beams"] = 1
250
+ # breakpoint()
251
+ with torch.inference_mode():
252
+ if modality == "video":
253
+ output_ids = model.generate(
254
+ inputs=input_ids,
255
+ images=video_data[0][0],
256
+ images_highres=video_data[0][1],
257
+ modalities=video_data[2],
258
+ speech=speechs,
259
+ speech_lengths=speech_lengths,
260
+ speech_chunks=speech_chunks,
261
+ speech_wav=speech_wavs,
262
+ attention_mask=attention_masks,
263
+ use_cache=True,
264
+ stopping_criteria=[stopping_criteria],
265
+ do_sample=True if gen_kwargs["temperature"] > 0 else False,
266
+ temperature=gen_kwargs["temperature"],
267
+ top_p=gen_kwargs["top_p"],
268
+ num_beams=gen_kwargs["num_beams"],
269
+ max_new_tokens=gen_kwargs["max_new_tokens"],
270
+ )
271
+ elif modality == "image":
272
+ output_ids = model.generate(
273
+ inputs=input_ids,
274
+ images=image_tensor,
275
+ images_highres=image_highres_tensor,
276
+ image_sizes=image_sizes,
277
+ modalities=['image'],
278
+ speech=speechs,
279
+ speech_lengths=speech_lengths,
280
+ speech_chunks=speech_chunks,
281
+ speech_wav=speech_wavs,
282
+ attention_mask=attention_masks,
283
+ use_cache=True,
284
+ stopping_criteria=[stopping_criteria],
285
+ do_sample=True if gen_kwargs["temperature"] > 0 else False,
286
+ temperature=gen_kwargs["temperature"],
287
+ top_p=gen_kwargs["top_p"],
288
+ num_beams=gen_kwargs["num_beams"],
289
+ max_new_tokens=gen_kwargs["max_new_tokens"],
290
+ )
291
+ elif modality == "text":
292
+ output_ids = model.generate(
293
+ input_ids,
294
+ images=images,
295
+ images_highres=images_highres,
296
+ image_sizes=image_sizes,
297
+ modalities=['text'],
298
+ speech=speechs,
299
+ speech_lengths=speech_lengths,
300
+ speech_chunks=speech_chunks,
301
+ speech_wav=speech_wavs,
302
+ attention_mask=attention_masks,
303
+ use_cache=True,
304
+ stopping_criteria=[stopping_criteria],
305
+ do_sample=True if gen_kwargs["temperature"] > 0 else False,
306
+ temperature=gen_kwargs["temperature"],
307
+ top_p=gen_kwargs["top_p"],
308
+ num_beams=gen_kwargs["num_beams"],
309
+ max_new_tokens=gen_kwargs["max_new_tokens"],
310
+ )
311
+
312
+ outputs = tokenizer.batch_decode(output_ids, skip_special_tokens=True)[0]
313
+ outputs = outputs.strip()
314
+ if outputs.endswith(stop_str):
315
+ outputs = outputs[:-len(stop_str)]
316
+ outputs = outputs.strip()
317
+
318
+ print(outputs)
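+
+ # Example invocations (paths are illustrative; the visual/audio/text flags mirror
+ # the args parsed above, and --model_path is assumed to match the sibling scripts):
+ #   python inference/infer.py --model_path <ckpt_dir> --image_path demo.jpg --text "Describe the image."
+ #   python inference/infer.py --model_path <ckpt_dir> --video_path demo.mp4   # audio is extracted from the video
+ #   python inference/infer.py --model_path <ckpt_dir> --audio_path question.wav --text "What does the speech say?"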
inference/infer_ola_internvl_text_visual.py ADDED
@@ -0,0 +1,409 @@
1
+ import os
2
+
3
+ os.environ['LOWRES_RESIZE'] = '384x32'
4
+ os.environ['HIGHRES_BASE'] = '0x32'
5
+ os.environ['VIDEO_RESIZE'] = "0x64"
6
+ os.environ['VIDEO_MAXRES'] = "480"
7
+ os.environ['VIDEO_MINRES'] = "288"
8
+ os.environ['MAXRES'] = '1536'
9
+ os.environ['MINRES'] = '0'
10
+ os.environ['FORCE_NO_DOWNSAMPLE'] = '1'
11
+ os.environ['LOAD_VISION_EARLY'] = '1'
12
+ os.environ['PAD2STRIDE'] = '1'
13
+ os.environ['CUDA_LAUNCH_BLOCKING'] = '1'
14
+ import os
15
+ import sys
16
+ from pathlib import Path
17
+ import math
18
+ import numpy as np
19
+ import torch
20
+ import torchvision.transforms as T
21
+ from decord import VideoReader, cpu  # was temporarily commented out while focusing on speech-only testing
22
+ from PIL import Image
23
+ from torchvision.transforms.functional import InterpolationMode
24
+ from transformers import AutoModel, AutoTokenizer
25
+ from contextlib import redirect_stdout
26
+ import io
27
+ import librosa
28
+ import whisper
29
+ import moviepy as mp
30
+ import torch
31
+ from transformers import AutoTokenizer, AutoConfig, AutoModel
32
+
33
+ # pure text
34
+ # image + text
35
+ # video + text
36
+ # audio + text
37
+ # video + audio + text
38
+
39
+ IMAGENET_MEAN = (0.485, 0.456, 0.406)
40
+ IMAGENET_STD = (0.229, 0.224, 0.225)
41
+ import gradio as gr
42
+ import torch
43
+ import re
44
+ from decord import VideoReader, cpu
45
+ from PIL import Image
46
+ import numpy as np
47
+ import transformers
48
+ import moviepy as mp
49
+ from typing import Dict, Optional, Sequence, List
50
+ import librosa
51
+ import whisper
52
+ from ola.conversation import conv_templates, SeparatorStyle
53
+ from ola.model.builder import load_pretrained_model
54
+ from ola.datasets.preprocess import tokenizer_image_token, tokenizer_speech_image_token, tokenizer_speech_question_image_token, tokenizer_speech_token
55
+ from ola.mm_utils import KeywordsStoppingCriteria, process_anyres_video, process_anyres_highres_image
56
+ from ola.constants import IGNORE_INDEX, DEFAULT_IMAGE_TOKEN, IMAGE_TOKEN_INDEX, DEFAULT_SPEECH_TOKEN, SPEECH_TOKEN_INDEX
57
+ import argparse
58
+
59
+ parser = argparse.ArgumentParser()
60
+ parser.add_argument('--model_path', type=str, default='/data1/cxy/plm-v/modeling/internvl3_5-2B')
61
+ parser.add_argument('--text', type=str, default="What does the speech say?")
62
+ parser.add_argument('--audio_path', type=str, default=None)
63
+ parser.add_argument('--image_path', type=str, default=None)
64
+ parser.add_argument('--video_path', type=str, default=None)
65
+ args = parser.parse_args()
66
+
67
+ model_path = args.model_path
68
+ tokenizer, model, image_processor, _ = load_pretrained_model(model_path,'ola_internvl', None)
69
+ model = model.to('cuda').eval()
70
+ model = model.bfloat16()
71
+
72
+ resource_path = "/data1/cxy/plm-v/modeling/example/"
73
+ # set the max number of tiles in `max_num`
74
+ def find_closest_aspect_ratio(aspect_ratio, target_ratios, width, height, image_size):
75
+ best_ratio_diff = float('inf')
76
+ best_ratio = (1, 1)
77
+ area = width * height
78
+ for ratio in target_ratios:
79
+ target_aspect_ratio = ratio[0] / ratio[1]
80
+ ratio_diff = abs(aspect_ratio - target_aspect_ratio)
81
+ if ratio_diff < best_ratio_diff:
82
+ best_ratio_diff = ratio_diff
83
+ best_ratio = ratio
84
+ elif ratio_diff == best_ratio_diff:
85
+ if area > 0.5 * image_size * image_size * ratio[0] * ratio[1]:
86
+ best_ratio = ratio
87
+ return best_ratio
88
+ def build_transform(input_size):
89
+ MEAN, STD = IMAGENET_MEAN, IMAGENET_STD
90
+ transform = T.Compose([
91
+ T.Lambda(lambda img: img.convert('RGB') if img.mode != 'RGB' else img),
92
+ T.Resize((input_size, input_size), interpolation=InterpolationMode.BICUBIC),
93
+ T.ToTensor(),
94
+ T.Normalize(mean=MEAN, std=STD)
95
+ ])
96
+ return transform
97
+
98
+ def dynamic_preprocess(image, min_num=1, max_num=12, image_size=448, use_thumbnail=False):
99
+ orig_width, orig_height = image.size
100
+ aspect_ratio = orig_width / orig_height
101
+
102
+ # calculate the existing image aspect ratio
103
+ target_ratios = set(
104
+ (i, j) for n in range(min_num, max_num + 1) for i in range(1, n + 1) for j in range(1, n + 1) if
105
+ i * j <= max_num and i * j >= min_num)
106
+ target_ratios = sorted(target_ratios, key=lambda x: x[0] * x[1])
107
+
108
+ # find the closest aspect ratio to the target
109
+ target_aspect_ratio = find_closest_aspect_ratio(
110
+ aspect_ratio, target_ratios, orig_width, orig_height, image_size)
111
+
112
+ # calculate the target width and height
113
+ target_width = image_size * target_aspect_ratio[0]
114
+ target_height = image_size * target_aspect_ratio[1]
115
+ blocks = target_aspect_ratio[0] * target_aspect_ratio[1]
116
+
117
+ # resize the image
118
+ resized_img = image.resize((target_width, target_height))
119
+ processed_images = []
120
+ for i in range(blocks):
121
+ box = (
122
+ (i % (target_width // image_size)) * image_size,
123
+ (i // (target_width // image_size)) * image_size,
124
+ ((i % (target_width // image_size)) + 1) * image_size,
125
+ ((i // (target_width // image_size)) + 1) * image_size
126
+ )
127
+ # split the image
128
+ split_img = resized_img.crop(box)
129
+ processed_images.append(split_img)
130
+ assert len(processed_images) == blocks
131
+ if use_thumbnail and len(processed_images) != 1:
132
+ thumbnail_img = image.resize((image_size, image_size))
133
+ processed_images.append(thumbnail_img)
134
+ return processed_images
135
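+ # dynamic_preprocess picks the tiling grid (i x j tiles with i*j <= max_num) whose
+ # aspect ratio is closest to the input image, resizes the image to that grid of
+ # image_size-square tiles (448 px here), crops the tiles out, and optionally
+ # appends a full-image thumbnail tile.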
+
136
+
137
+ def load_image(image_file, input_size=448, max_num=12):
138
+ image = Image.open(image_file).convert('RGB')
139
+ transform = build_transform(input_size=input_size)
140
+ images = dynamic_preprocess(image, image_size=input_size, use_thumbnail=True, max_num=max_num)
141
+ pixel_values = [transform(image) for image in images]
142
+ pixel_values = torch.stack(pixel_values)
143
+ return pixel_values
144
+
145
+
146
+
147
+ pixel_values = load_image(f'{resource_path}image1.jpg', max_num=12).to(torch.bfloat16).cuda()
148
+ # breakpoint()
149
+ generation_config = dict(max_new_tokens=1024, do_sample=True)
150
+ generation_config = dict(
151
+ max_new_tokens=256,
152
+ do_sample=False, # Use greedy decoding to avoid sampling issues
153
+ temperature=0.5,
154
+ top_p=0.8,
155
+ top_k=10,
156
+ )
157
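+ # Note: this dict supersedes the sampling config defined just above, and because
+ # do_sample=False decoding is greedy, the temperature/top_p/top_k values are
+ # ignored by generate().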
+
158
+
159
+
160
+ # Multimodal inference tests
161
+ print("\n" + "="*80)
162
+ print("🧪 Starting multimodal inference tests")
163
+ print("="*80)
164
+
165
+ def test_inference(test_name, question, pixel_values_input=None, speech_input=None, speech_lengths_input=None, num_patches_list=None):
166
+ """Unified inference test helper."""
167
+ print(f"\n{'='*60}")
168
+ print(f"🧪 Test: {test_name}")
169
+ print(f"📝 Question: {question}")
170
+ print(f"{'='*60}")
171
+
172
+ try:
173
+ # Prepare the chat arguments
174
+ chat_kwargs = {
175
+ 'tokenizer': tokenizer,
176
+ 'pixel_values': pixel_values_input,
177
+ 'question': question,
178
+ 'generation_config': generation_config,
179
+ 'verbose': True
180
+ }
181
+
182
+ # If video data is present, add the num_patches_list argument
183
+ if num_patches_list is not None:
184
+ chat_kwargs['num_patches_list'] = num_patches_list
185
+
186
+ # If speech data is present, add the speech arguments
187
+ if speech_input is not None:
188
+ chat_kwargs.update({
189
+ 'speech': speech_input, # mel spectrogram, used by Whisper
190
+ 'speech_lengths': speech_lengths_input,
191
+ 'speech_wav': speech_wavs, # raw audio waveform, used by BEATs
192
+ })
193
+
194
+ # Run inference
195
+ # breakpoint()
196
+ response = model.chat(**chat_kwargs)
197
+
198
+ print(f"✅ Inference succeeded!")
199
+ print(f"🤖 Response: {response}")
200
+
201
+ return True, response
202
+
203
+ except Exception as e:
204
+ print(f"❌ Inference failed: {str(e)}")
205
+ import traceback
206
+ traceback.print_exc()
207
+ return False, str(e)
208
+
209
+ # Test 1: Pure Text (should work; uses the trained InternVL)
210
+ success1, response1 = test_inference(
211
+ test_name="Pure Text",
212
+ question="Hello, who are you? Please introduce yourself briefly.",
213
+ pixel_values_input=None,
214
+ speech_input=None,
215
+ speech_lengths_input=None
216
+ )
217
+ # breakpoint()
218
+ # Test 2: Text & Image - Visual only (should work; uses the trained InternVL)
219
+ success2, response2 = test_inference(
220
+ test_name="Text & Image (Visual only)",
221
+ question="<image>\nPlease describe this image in detail.",
222
+ pixel_values_input=pixel_values,
223
+ speech_input=None,
224
+ speech_lengths_input=None
225
+ )
226
+
227
+ print("\n" + "="*60)
228
+ print("🔄 Preparing speech-related tests (output may be garbled since the speech part is untrained)")
229
+ print("="*60)
230
+ def get_index(bound, fps, max_frame, first_idx=0, num_segments=32):
231
+ if bound:
232
+ start, end = bound[0], bound[1]
233
+ else:
234
+ start, end = -100000, 100000
235
+ start_idx = max(first_idx, round(start * fps))
236
+ end_idx = min(round(end * fps), max_frame)
237
+ seg_size = float(end_idx - start_idx) / num_segments
238
+ frame_indices = np.array([
239
+ int(start_idx + (seg_size / 2) + np.round(seg_size * idx))
240
+ for idx in range(num_segments)
241
+ ])
242
+ return frame_indices
243
+
244
+ def load_video(video_path, bound=None, input_size=448, max_num=1, num_segments=32):
245
+ vr = VideoReader(video_path, ctx=cpu(0), num_threads=1)
246
+ max_frame = len(vr) - 1
247
+ fps = float(vr.get_avg_fps())
248
+
249
+ pixel_values_list, num_patches_list = [], []
250
+ transform = build_transform(input_size=input_size)
251
+ frame_indices = get_index(bound, fps, max_frame, first_idx=0, num_segments=num_segments)
252
+ for frame_index in frame_indices:
253
+ img = Image.fromarray(vr[frame_index].asnumpy()).convert('RGB')
254
+ img = dynamic_preprocess(img, image_size=input_size, use_thumbnail=True, max_num=max_num)
255
+ pixel_values = [transform(tile) for tile in img]
256
+ pixel_values = torch.stack(pixel_values)
257
+ num_patches_list.append(pixel_values.shape[0])
258
+ pixel_values_list.append(pixel_values)
259
+ pixel_values = torch.cat(pixel_values_list)
260
+ return pixel_values, num_patches_list
261
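+ # load_video samples num_segments frame indices at uniform temporal spacing (via
+ # get_index), tiles each frame with dynamic_preprocess, and returns the
+ # concatenated tiles together with num_patches_list (tiles per frame).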
+
262
+ def load_audio(audio_file_name):
263
+ """
264
+ Load an audio file using Ola-style mel-spectrogram preprocessing.
265
+ This mirrors the original Ola load_audio function.
266
+ """
267
+ speech_wav, samplerate = librosa.load(audio_file_name, sr=16000)
268
+ if len(speech_wav.shape) > 1:
269
+ speech_wav = speech_wav[:, 0]
270
+ speech_wav = speech_wav.astype(np.float32)
271
+ CHUNK_LIM = 480000
272
+ SAMPLE_RATE = 16000
273
+ speechs = []
274
+ speech_wavs = []
275
+
276
+ if len(speech_wav) <= CHUNK_LIM:
277
+ speech = whisper.pad_or_trim(speech_wav)
278
+ speech_wav_chunk = whisper.pad_or_trim(speech_wav)
279
+ speechs.append(speech)
280
+ speech_wavs.append(torch.from_numpy(speech_wav_chunk).unsqueeze(0))
281
+ else:
282
+ for i in range(0, len(speech_wav), CHUNK_LIM):
283
+ chunk = speech_wav[i : i + CHUNK_LIM]
284
+ if len(chunk) < CHUNK_LIM:
285
+ chunk = whisper.pad_or_trim(chunk)
286
+ speechs.append(chunk)
287
+ speech_wavs.append(torch.from_numpy(chunk).unsqueeze(0))
288
+
289
+ # Generate mel spectrograms
290
+ mels = []
291
+ for chunk in speechs:
292
+ chunk = whisper.log_mel_spectrogram(chunk, n_mels=128).permute(1, 0).unsqueeze(0)
293
+ mels.append(chunk)
294
+
295
+ mels = torch.cat(mels, dim=0)
296
+ speech_wavs = torch.cat(speech_wavs, dim=0)
297
+ if mels.shape[0] > 25:
298
+ mels = mels[:25]
299
+ speech_wavs = speech_wavs[:25]
300
+
301
+ speech_length = torch.LongTensor([mels.shape[1]] * mels.shape[0])
302
+ speech_chunks = torch.LongTensor([mels.shape[0]])
303
+
304
+ return mels, speech_length, speech_chunks, speech_wavs
305
+
306
+ def extract_audio(videos_file_path):
307
+ my_clip = mp.VideoFileClip(videos_file_path)
308
+ return my_clip.audio
309
+
310
+ # Load video data for the video test
312
+ print("\n📥 Loading video data...")
312
+ try:
313
+ video_path = f'{resource_path}red-panda.mp4'
314
+ if os.path.exists(video_path):
315
+ video_pixel_values, video_num_patches_list = load_video(video_path, num_segments=8, max_num=1)
316
+ video_pixel_values = video_pixel_values.to(torch.bfloat16).cuda()
317
+ video_loaded = True
318
+ print(f"✅ Video loaded successfully:")
319
+ print(f" - number of frames: {len(video_num_patches_list)}")
320
+ print(f" - pixel tensor shape: {video_pixel_values.shape}")
321
+ print(f" - patches per frame: {video_num_patches_list}")
322
+ else:
323
+ print(f"⚠️ Video file does not exist: {video_path}")
324
+ video_loaded = False
325
+ video_pixel_values = None
326
+ video_num_patches_list = None
327
+ except Exception as e:
328
+ print(f"❌ Video loading failed: {e}")
329
+ video_loaded = False
330
+ video_pixel_values = None
331
+ video_num_patches_list = None
332
+
333
+
334
+
335
+ audio_path = f'/data1/cxy/dataset/english.mp3'
336
+
337
+ # Load audio data for the subsequent tests
338
+ print("\n📥 Loading audio data...")
339
+ try:
340
+ # Load the audio file using Ola-style mel-spectrogram preprocessing
341
+ mels, speech_lengths, speech_chunks, speech_wavs = load_audio(audio_path)
342
+ print(f"✅ Audio loaded successfully:")
343
+ print(f" - mel spectrogram shape: {mels.shape}")
344
+ print(f" - audio lengths: {speech_lengths}")
345
+ print(f" - number of audio chunks: {speech_chunks}")
346
+ print(f" - raw waveform shape: {speech_wavs.shape}")
347
+
348
+ # Convert the audio tensors to the proper dtype and move them to the GPU
349
+ mels = mels.to(torch.bfloat16).cuda()
350
+ speech_lengths = speech_lengths.cuda()
351
+ speech_chunks = speech_chunks.cuda()
352
+ speech_wavs = speech_wavs.cuda()
353
+
354
+ audio_loaded = True
355
+
356
+ except Exception as e:
357
+ print(f"❌ Audio loading failed: {e}")
358
+ audio_loaded = False
359
+ mels = None
360
+ speech_lengths = None
361
+
362
+
363
+ # Test 5: Video + Text (should work; uses the trained InternVL)
364
+ if video_loaded:
365
+ # Build the per-frame video prefix
366
+ video_prefix = ''.join([f'Frame{i+1}: <image>\n' for i in range(len(video_num_patches_list))])
367
+ video_question = video_prefix + 'What is the red panda doing in this video? Please describe the actions and movements you observe.'
368
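+ # One "<image>" placeholder is emitted per sampled frame, so the prompt lines up
+ # with the per-frame entries of video_num_patches_list passed to model.chat().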
+
369
+ success5, response5 = test_inference(
370
+ test_name="Video + Text",
371
+ question=video_question,
372
+ pixel_values_input=video_pixel_values,
373
+ speech_input=None,
374
+ speech_lengths_input=None,
375
+ num_patches_list=video_num_patches_list
376
+ )
377
+ else:
378
+ print("⚠️ 跳过Video + Text测试 (视频加载失败)")
379
+ success5 = False
380
+
381
+
382
+ print("\n" + "="*80)
383
+ print("📊 Multimodal inference test summary")
384
+ print("="*80)
385
+
386
+ test_results = [
387
+ ("Pure Text", success1, "PASS", "should work (trained InternVL)"),
388
+ ("Text & Image", success2, "PASS", "should work (trained InternVL)"),
389
+ ("Video + Text", success5 if video_loaded else False, "PASS", "should work (trained InternVL)"),
390
+ ]
391
+
392
+ for test_name, success, expected, note in test_results:
393
+ status = "✅ PASS" if success else "❌ FAIL"
394
+ print(f"{status} {test_name:<15} (expected: {expected:<8}) - {note}")
395
+
396
+ passed = sum(1 for _, success, _, _ in test_results if success)
397
+ total = len(test_results)
398
+ print(f"\n📈 Test statistics: {passed}/{total} passed")
399
+
400
+ if passed >= 2: # at least 2 of pure text, text & image, and video + text should pass
402
+ print("🎉 Basic functionality works; the speech integration architecture is in place!")
403
+ print("💡 Garbled output from the speech-related tests is expected, since the speech part has not been trained yet")
403
+ if passed >= 3:
404
+ print("🌟 All basic modality tests passed!")
405
+ else:
406
+ print("⚠️ Basic functionality may have issues and needs further investigation")
407
+
408
+ print("\n=== Multimodal inference tests complete ===")
409
+
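+ # Example run (illustrative checkpoint path; the image/video/audio files are the
+ # hard-coded paths under resource_path above):
+ #   python inference/infer_ola_internvl_text_visual.py --model_path /path/to/internvl3_5-2B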
inference/log.txt ADDED
@@ -0,0 +1,480 @@
1
+ [2025-09-15 09:15:42,098] [INFO] [real_accelerator.py:222:get_accelerator] Setting ds_accelerator to cuda (auto detect)
2
+ LOAD_VISION_EARLY is set
3
+ FORCE_NO_DOWNSAMPLE is set
4
+ VIDEO_RESIZE is set as 0x64, 0, 64
5
+ HIGHRES_BASE is set as 0x32, 0, 32
6
+ MAXRES is set as 1536
7
+ MINRES is set as 0
8
+ VIDEO_MAXRES is set as 480
9
+ VIDEO_MINRES is set as 288
10
+ PAD2STRIDE is set
11
+ LOWRES_RESIZE is set as 384x32
12
+ Loading OlaQwen3ForCausalLM model...
13
+ Loading BEATs Model
14
+ Missing keys: ['model.speech_encoder.whisper_model.positional_embedding', 'model.speech_encoder.whisper_model.conv1.weight', 'model.speech_encoder.whisper_model.conv1.bias', 'model.speech_encoder.whisper_model.conv2.weight', 'model.speech_encoder.whisper_model.conv2.bias', 'model.speech_encoder.whisper_model.blocks.0.attn.query.weight', 'model.speech_encoder.whisper_model.blocks.0.attn.query.bias', 'model.speech_encoder.whisper_model.blocks.0.attn.key.weight', 'model.speech_encoder.whisper_model.blocks.0.attn.value.weight', 'model.speech_encoder.whisper_model.blocks.0.attn.value.bias', 'model.speech_encoder.whisper_model.blocks.0.attn.out.weight', 'model.speech_encoder.whisper_model.blocks.0.attn.out.bias', 'model.speech_encoder.whisper_model.blocks.0.attn_ln.weight', 'model.speech_encoder.whisper_model.blocks.0.attn_ln.bias', 'model.speech_encoder.whisper_model.blocks.0.mlp.0.weight', 'model.speech_encoder.whisper_model.blocks.0.mlp.0.bias', 'model.speech_encoder.whisper_model.blocks.0.mlp.2.weight', 'model.speech_encoder.whisper_model.blocks.0.mlp.2.bias', 'model.speech_encoder.whisper_model.blocks.0.mlp_ln.weight', 'model.speech_encoder.whisper_model.blocks.0.mlp_ln.bias', 'model.speech_encoder.whisper_model.blocks.1.attn.query.weight', 'model.speech_encoder.whisper_model.blocks.1.attn.query.bias', 'model.speech_encoder.whisper_model.blocks.1.attn.key.weight', 'model.speech_encoder.whisper_model.blocks.1.attn.value.weight', 'model.speech_encoder.whisper_model.blocks.1.attn.value.bias', 'model.speech_encoder.whisper_model.blocks.1.attn.out.weight', 'model.speech_encoder.whisper_model.blocks.1.attn.out.bias', 'model.speech_encoder.whisper_model.blocks.1.attn_ln.weight', 'model.speech_encoder.whisper_model.blocks.1.attn_ln.bias', 'model.speech_encoder.whisper_model.blocks.1.mlp.0.weight', 'model.speech_encoder.whisper_model.blocks.1.mlp.0.bias', 'model.speech_encoder.whisper_model.blocks.1.mlp.2.weight', 'model.speech_encoder.whisper_model.blocks.1.mlp.2.bias', 'model.speech_encoder.whisper_model.blocks.1.mlp_ln.weight', 'model.speech_encoder.whisper_model.blocks.1.mlp_ln.bias', 'model.speech_encoder.whisper_model.blocks.2.attn.query.weight', 'model.speech_encoder.whisper_model.blocks.2.attn.query.bias', 'model.speech_encoder.whisper_model.blocks.2.attn.key.weight', 'model.speech_encoder.whisper_model.blocks.2.attn.value.weight', 'model.speech_encoder.whisper_model.blocks.2.attn.value.bias', 'model.speech_encoder.whisper_model.blocks.2.attn.out.weight', 'model.speech_encoder.whisper_model.blocks.2.attn.out.bias', 'model.speech_encoder.whisper_model.blocks.2.attn_ln.weight', 'model.speech_encoder.whisper_model.blocks.2.attn_ln.bias', 'model.speech_encoder.whisper_model.blocks.2.mlp.0.weight', 'model.speech_encoder.whisper_model.blocks.2.mlp.0.bias', 'model.speech_encoder.whisper_model.blocks.2.mlp.2.weight', 'model.speech_encoder.whisper_model.blocks.2.mlp.2.bias', 'model.speech_encoder.whisper_model.blocks.2.mlp_ln.weight', 'model.speech_encoder.whisper_model.blocks.2.mlp_ln.bias', 'model.speech_encoder.whisper_model.blocks.3.attn.query.weight', 'model.speech_encoder.whisper_model.blocks.3.attn.query.bias', 'model.speech_encoder.whisper_model.blocks.3.attn.key.weight', 'model.speech_encoder.whisper_model.blocks.3.attn.value.weight', 'model.speech_encoder.whisper_model.blocks.3.attn.value.bias', 'model.speech_encoder.whisper_model.blocks.3.attn.out.weight', 'model.speech_encoder.whisper_model.blocks.3.attn.out.bias', 'model.speech_encoder.whisper_model.blocks.3.attn_ln.weight', 
'model.speech_encoder.whisper_model.blocks.3.attn_ln.bias', 'model.speech_encoder.whisper_model.blocks.3.mlp.0.weight', 'model.speech_encoder.whisper_model.blocks.3.mlp.0.bias', 'model.speech_encoder.whisper_model.blocks.3.mlp.2.weight', 'model.speech_encoder.whisper_model.blocks.3.mlp.2.bias', 'model.speech_encoder.whisper_model.blocks.3.mlp_ln.weight', 'model.speech_encoder.whisper_model.blocks.3.mlp_ln.bias', 'model.speech_encoder.whisper_model.blocks.4.attn.query.weight', 'model.speech_encoder.whisper_model.blocks.4.attn.query.bias', 'model.speech_encoder.whisper_model.blocks.4.attn.key.weight', 'model.speech_encoder.whisper_model.blocks.4.attn.value.weight', 'model.speech_encoder.whisper_model.blocks.4.attn.value.bias', 'model.speech_encoder.whisper_model.blocks.4.attn.out.weight', 'model.speech_encoder.whisper_model.blocks.4.attn.out.bias', 'model.speech_encoder.whisper_model.blocks.4.attn_ln.weight', 'model.speech_encoder.whisper_model.blocks.4.attn_ln.bias', 'model.speech_encoder.whisper_model.blocks.4.mlp.0.weight', 'model.speech_encoder.whisper_model.blocks.4.mlp.0.bias', 'model.speech_encoder.whisper_model.blocks.4.mlp.2.weight', 'model.speech_encoder.whisper_model.blocks.4.mlp.2.bias', 'model.speech_encoder.whisper_model.blocks.4.mlp_ln.weight', 'model.speech_encoder.whisper_model.blocks.4.mlp_ln.bias', 'model.speech_encoder.whisper_model.blocks.5.attn.query.weight', 'model.speech_encoder.whisper_model.blocks.5.attn.query.bias', 'model.speech_encoder.whisper_model.blocks.5.attn.key.weight', 'model.speech_encoder.whisper_model.blocks.5.attn.value.weight', 'model.speech_encoder.whisper_model.blocks.5.attn.value.bias', 'model.speech_encoder.whisper_model.blocks.5.attn.out.weight', 'model.speech_encoder.whisper_model.blocks.5.attn.out.bias', 'model.speech_encoder.whisper_model.blocks.5.attn_ln.weight', 'model.speech_encoder.whisper_model.blocks.5.attn_ln.bias', 'model.speech_encoder.whisper_model.blocks.5.mlp.0.weight', 'model.speech_encoder.whisper_model.blocks.5.mlp.0.bias', 'model.speech_encoder.whisper_model.blocks.5.mlp.2.weight', 'model.speech_encoder.whisper_model.blocks.5.mlp.2.bias', 'model.speech_encoder.whisper_model.blocks.5.mlp_ln.weight', 'model.speech_encoder.whisper_model.blocks.5.mlp_ln.bias', 'model.speech_encoder.whisper_model.blocks.6.attn.query.weight', 'model.speech_encoder.whisper_model.blocks.6.attn.query.bias', 'model.speech_encoder.whisper_model.blocks.6.attn.key.weight', 'model.speech_encoder.whisper_model.blocks.6.attn.value.weight', 'model.speech_encoder.whisper_model.blocks.6.attn.value.bias', 'model.speech_encoder.whisper_model.blocks.6.attn.out.weight', 'model.speech_encoder.whisper_model.blocks.6.attn.out.bias', 'model.speech_encoder.whisper_model.blocks.6.attn_ln.weight', 'model.speech_encoder.whisper_model.blocks.6.attn_ln.bias', 'model.speech_encoder.whisper_model.blocks.6.mlp.0.weight', 'model.speech_encoder.whisper_model.blocks.6.mlp.0.bias', 'model.speech_encoder.whisper_model.blocks.6.mlp.2.weight', 'model.speech_encoder.whisper_model.blocks.6.mlp.2.bias', 'model.speech_encoder.whisper_model.blocks.6.mlp_ln.weight', 'model.speech_encoder.whisper_model.blocks.6.mlp_ln.bias', 'model.speech_encoder.whisper_model.blocks.7.attn.query.weight', 'model.speech_encoder.whisper_model.blocks.7.attn.query.bias', 'model.speech_encoder.whisper_model.blocks.7.attn.key.weight', 'model.speech_encoder.whisper_model.blocks.7.attn.value.weight', 'model.speech_encoder.whisper_model.blocks.7.attn.value.bias', 
'model.speech_encoder.whisper_model.blocks.7.attn.out.weight', 'model.speech_encoder.whisper_model.blocks.7.attn.out.bias', 'model.speech_encoder.whisper_model.blocks.7.attn_ln.weight', 'model.speech_encoder.whisper_model.blocks.7.attn_ln.bias', 'model.speech_encoder.whisper_model.blocks.7.mlp.0.weight', 'model.speech_encoder.whisper_model.blocks.7.mlp.0.bias', 'model.speech_encoder.whisper_model.blocks.7.mlp.2.weight', 'model.speech_encoder.whisper_model.blocks.7.mlp.2.bias', 'model.speech_encoder.whisper_model.blocks.7.mlp_ln.weight', 'model.speech_encoder.whisper_model.blocks.7.mlp_ln.bias', 'model.speech_encoder.whisper_model.blocks.8.attn.query.weight', 'model.speech_encoder.whisper_model.blocks.8.attn.query.bias', 'model.speech_encoder.whisper_model.blocks.8.attn.key.weight', 'model.speech_encoder.whisper_model.blocks.8.attn.value.weight', 'model.speech_encoder.whisper_model.blocks.8.attn.value.bias', 'model.speech_encoder.whisper_model.blocks.8.attn.out.weight', 'model.speech_encoder.whisper_model.blocks.8.attn.out.bias', 'model.speech_encoder.whisper_model.blocks.8.attn_ln.weight', 'model.speech_encoder.whisper_model.blocks.8.attn_ln.bias', 'model.speech_encoder.whisper_model.blocks.8.mlp.0.weight', 'model.speech_encoder.whisper_model.blocks.8.mlp.0.bias', 'model.speech_encoder.whisper_model.blocks.8.mlp.2.weight', 'model.speech_encoder.whisper_model.blocks.8.mlp.2.bias', 'model.speech_encoder.whisper_model.blocks.8.mlp_ln.weight', 'model.speech_encoder.whisper_model.blocks.8.mlp_ln.bias', 'model.speech_encoder.whisper_model.blocks.9.attn.query.weight', 'model.speech_encoder.whisper_model.blocks.9.attn.query.bias', 'model.speech_encoder.whisper_model.blocks.9.attn.key.weight', 'model.speech_encoder.whisper_model.blocks.9.attn.value.weight', 'model.speech_encoder.whisper_model.blocks.9.attn.value.bias', 'model.speech_encoder.whisper_model.blocks.9.attn.out.weight', 'model.speech_encoder.whisper_model.blocks.9.attn.out.bias', 'model.speech_encoder.whisper_model.blocks.9.attn_ln.weight', 'model.speech_encoder.whisper_model.blocks.9.attn_ln.bias', 'model.speech_encoder.whisper_model.blocks.9.mlp.0.weight', 'model.speech_encoder.whisper_model.blocks.9.mlp.0.bias', 'model.speech_encoder.whisper_model.blocks.9.mlp.2.weight', 'model.speech_encoder.whisper_model.blocks.9.mlp.2.bias', 'model.speech_encoder.whisper_model.blocks.9.mlp_ln.weight', 'model.speech_encoder.whisper_model.blocks.9.mlp_ln.bias', 'model.speech_encoder.whisper_model.blocks.10.attn.query.weight', 'model.speech_encoder.whisper_model.blocks.10.attn.query.bias', 'model.speech_encoder.whisper_model.blocks.10.attn.key.weight', 'model.speech_encoder.whisper_model.blocks.10.attn.value.weight', 'model.speech_encoder.whisper_model.blocks.10.attn.value.bias', 'model.speech_encoder.whisper_model.blocks.10.attn.out.weight', 'model.speech_encoder.whisper_model.blocks.10.attn.out.bias', 'model.speech_encoder.whisper_model.blocks.10.attn_ln.weight', 'model.speech_encoder.whisper_model.blocks.10.attn_ln.bias', 'model.speech_encoder.whisper_model.blocks.10.mlp.0.weight', 'model.speech_encoder.whisper_model.blocks.10.mlp.0.bias', 'model.speech_encoder.whisper_model.blocks.10.mlp.2.weight', 'model.speech_encoder.whisper_model.blocks.10.mlp.2.bias', 'model.speech_encoder.whisper_model.blocks.10.mlp_ln.weight', 'model.speech_encoder.whisper_model.blocks.10.mlp_ln.bias', 'model.speech_encoder.whisper_model.blocks.11.attn.query.weight', 'model.speech_encoder.whisper_model.blocks.11.attn.query.bias', 
'model.speech_encoder.whisper_model.blocks.11.attn.key.weight', 'model.speech_encoder.whisper_model.blocks.11.attn.value.weight', 'model.speech_encoder.whisper_model.blocks.11.attn.value.bias', 'model.speech_encoder.whisper_model.blocks.11.attn.out.weight', 'model.speech_encoder.whisper_model.blocks.11.attn.out.bias', 'model.speech_encoder.whisper_model.blocks.11.attn_ln.weight', 'model.speech_encoder.whisper_model.blocks.11.attn_ln.bias', 'model.speech_encoder.whisper_model.blocks.11.mlp.0.weight', 'model.speech_encoder.whisper_model.blocks.11.mlp.0.bias', 'model.speech_encoder.whisper_model.blocks.11.mlp.2.weight', 'model.speech_encoder.whisper_model.blocks.11.mlp.2.bias', 'model.speech_encoder.whisper_model.blocks.11.mlp_ln.weight', 'model.speech_encoder.whisper_model.blocks.11.mlp_ln.bias', 'model.speech_encoder.whisper_model.blocks.12.attn.query.weight', 'model.speech_encoder.whisper_model.blocks.12.attn.query.bias', 'model.speech_encoder.whisper_model.blocks.12.attn.key.weight', 'model.speech_encoder.whisper_model.blocks.12.attn.value.weight', 'model.speech_encoder.whisper_model.blocks.12.attn.value.bias', 'model.speech_encoder.whisper_model.blocks.12.attn.out.weight', 'model.speech_encoder.whisper_model.blocks.12.attn.out.bias', 'model.speech_encoder.whisper_model.blocks.12.attn_ln.weight', 'model.speech_encoder.whisper_model.blocks.12.attn_ln.bias', 'model.speech_encoder.whisper_model.blocks.12.mlp.0.weight', 'model.speech_encoder.whisper_model.blocks.12.mlp.0.bias', 'model.speech_encoder.whisper_model.blocks.12.mlp.2.weight', 'model.speech_encoder.whisper_model.blocks.12.mlp.2.bias', 'model.speech_encoder.whisper_model.blocks.12.mlp_ln.weight', 'model.speech_encoder.whisper_model.blocks.12.mlp_ln.bias', 'model.speech_encoder.whisper_model.blocks.13.attn.query.weight', 'model.speech_encoder.whisper_model.blocks.13.attn.query.bias', 'model.speech_encoder.whisper_model.blocks.13.attn.key.weight', 'model.speech_encoder.whisper_model.blocks.13.attn.value.weight', 'model.speech_encoder.whisper_model.blocks.13.attn.value.bias', 'model.speech_encoder.whisper_model.blocks.13.attn.out.weight', 'model.speech_encoder.whisper_model.blocks.13.attn.out.bias', 'model.speech_encoder.whisper_model.blocks.13.attn_ln.weight', 'model.speech_encoder.whisper_model.blocks.13.attn_ln.bias', 'model.speech_encoder.whisper_model.blocks.13.mlp.0.weight', 'model.speech_encoder.whisper_model.blocks.13.mlp.0.bias', 'model.speech_encoder.whisper_model.blocks.13.mlp.2.weight', 'model.speech_encoder.whisper_model.blocks.13.mlp.2.bias', 'model.speech_encoder.whisper_model.blocks.13.mlp_ln.weight', 'model.speech_encoder.whisper_model.blocks.13.mlp_ln.bias', 'model.speech_encoder.whisper_model.blocks.14.attn.query.weight', 'model.speech_encoder.whisper_model.blocks.14.attn.query.bias', 'model.speech_encoder.whisper_model.blocks.14.attn.key.weight', 'model.speech_encoder.whisper_model.blocks.14.attn.value.weight', 'model.speech_encoder.whisper_model.blocks.14.attn.value.bias', 'model.speech_encoder.whisper_model.blocks.14.attn.out.weight', 'model.speech_encoder.whisper_model.blocks.14.attn.out.bias', 'model.speech_encoder.whisper_model.blocks.14.attn_ln.weight', 'model.speech_encoder.whisper_model.blocks.14.attn_ln.bias', 'model.speech_encoder.whisper_model.blocks.14.mlp.0.weight', 'model.speech_encoder.whisper_model.blocks.14.mlp.0.bias', 'model.speech_encoder.whisper_model.blocks.14.mlp.2.weight', 'model.speech_encoder.whisper_model.blocks.14.mlp.2.bias', 'model.speech_encoder.whisper_model.blocks.14.mlp_ln.weight', 
'model.speech_encoder.whisper_model.blocks.14.mlp_ln.bias', 'model.speech_encoder.whisper_model.blocks.15.attn.query.weight', 'model.speech_encoder.whisper_model.blocks.15.attn.query.bias', 'model.speech_encoder.whisper_model.blocks.15.attn.key.weight', 'model.speech_encoder.whisper_model.blocks.15.attn.value.weight', 'model.speech_encoder.whisper_model.blocks.15.attn.value.bias', 'model.speech_encoder.whisper_model.blocks.15.attn.out.weight', 'model.speech_encoder.whisper_model.blocks.15.attn.out.bias', 'model.speech_encoder.whisper_model.blocks.15.attn_ln.weight', 'model.speech_encoder.whisper_model.blocks.15.attn_ln.bias', 'model.speech_encoder.whisper_model.blocks.15.mlp.0.weight', 'model.speech_encoder.whisper_model.blocks.15.mlp.0.bias', 'model.speech_encoder.whisper_model.blocks.15.mlp.2.weight', 'model.speech_encoder.whisper_model.blocks.15.mlp.2.bias', 'model.speech_encoder.whisper_model.blocks.15.mlp_ln.weight', 'model.speech_encoder.whisper_model.blocks.15.mlp_ln.bias', 'model.speech_encoder.whisper_model.blocks.16.attn.query.weight', 'model.speech_encoder.whisper_model.blocks.16.attn.query.bias', 'model.speech_encoder.whisper_model.blocks.16.attn.key.weight', 'model.speech_encoder.whisper_model.blocks.16.attn.value.weight', 'model.speech_encoder.whisper_model.blocks.16.attn.value.bias', 'model.speech_encoder.whisper_model.blocks.16.attn.out.weight', 'model.speech_encoder.whisper_model.blocks.16.attn.out.bias', 'model.speech_encoder.whisper_model.blocks.16.attn_ln.weight', 'model.speech_encoder.whisper_model.blocks.16.attn_ln.bias', 'model.speech_encoder.whisper_model.blocks.16.mlp.0.weight', 'model.speech_encoder.whisper_model.blocks.16.mlp.0.bias', 'model.speech_encoder.whisper_model.blocks.16.mlp.2.weight', 'model.speech_encoder.whisper_model.blocks.16.mlp.2.bias', 'model.speech_encoder.whisper_model.blocks.16.mlp_ln.weight', 'model.speech_encoder.whisper_model.blocks.16.mlp_ln.bias', 'model.speech_encoder.whisper_model.blocks.17.attn.query.weight', 'model.speech_encoder.whisper_model.blocks.17.attn.query.bias', 'model.speech_encoder.whisper_model.blocks.17.attn.key.weight', 'model.speech_encoder.whisper_model.blocks.17.attn.value.weight', 'model.speech_encoder.whisper_model.blocks.17.attn.value.bias', 'model.speech_encoder.whisper_model.blocks.17.attn.out.weight', 'model.speech_encoder.whisper_model.blocks.17.attn.out.bias', 'model.speech_encoder.whisper_model.blocks.17.attn_ln.weight', 'model.speech_encoder.whisper_model.blocks.17.attn_ln.bias', 'model.speech_encoder.whisper_model.blocks.17.mlp.0.weight', 'model.speech_encoder.whisper_model.blocks.17.mlp.0.bias', 'model.speech_encoder.whisper_model.blocks.17.mlp.2.weight', 'model.speech_encoder.whisper_model.blocks.17.mlp.2.bias', 'model.speech_encoder.whisper_model.blocks.17.mlp_ln.weight', 'model.speech_encoder.whisper_model.blocks.17.mlp_ln.bias', 'model.speech_encoder.whisper_model.blocks.18.attn.query.weight', 'model.speech_encoder.whisper_model.blocks.18.attn.query.bias', 'model.speech_encoder.whisper_model.blocks.18.attn.key.weight', 'model.speech_encoder.whisper_model.blocks.18.attn.value.weight', 'model.speech_encoder.whisper_model.blocks.18.attn.value.bias', 'model.speech_encoder.whisper_model.blocks.18.attn.out.weight', 'model.speech_encoder.whisper_model.blocks.18.attn.out.bias', 'model.speech_encoder.whisper_model.blocks.18.attn_ln.weight', 'model.speech_encoder.whisper_model.blocks.18.attn_ln.bias', 'model.speech_encoder.whisper_model.blocks.18.mlp.0.weight', 
'model.speech_encoder.whisper_model.blocks.18.mlp.0.bias', 'model.speech_encoder.whisper_model.blocks.18.mlp.2.weight', 'model.speech_encoder.whisper_model.blocks.18.mlp.2.bias', 'model.speech_encoder.whisper_model.blocks.18.mlp_ln.weight', 'model.speech_encoder.whisper_model.blocks.18.mlp_ln.bias', 'model.speech_encoder.whisper_model.blocks.19.attn.query.weight', 'model.speech_encoder.whisper_model.blocks.19.attn.query.bias', 'model.speech_encoder.whisper_model.blocks.19.attn.key.weight', 'model.speech_encoder.whisper_model.blocks.19.attn.value.weight', 'model.speech_encoder.whisper_model.blocks.19.attn.value.bias', 'model.speech_encoder.whisper_model.blocks.19.attn.out.weight', 'model.speech_encoder.whisper_model.blocks.19.attn.out.bias', 'model.speech_encoder.whisper_model.blocks.19.attn_ln.weight', 'model.speech_encoder.whisper_model.blocks.19.attn_ln.bias', 'model.speech_encoder.whisper_model.blocks.19.mlp.0.weight', 'model.speech_encoder.whisper_model.blocks.19.mlp.0.bias', 'model.speech_encoder.whisper_model.blocks.19.mlp.2.weight', 'model.speech_encoder.whisper_model.blocks.19.mlp.2.bias', 'model.speech_encoder.whisper_model.blocks.19.mlp_ln.weight', 'model.speech_encoder.whisper_model.blocks.19.mlp_ln.bias', 'model.speech_encoder.whisper_model.blocks.20.attn.query.weight', 'model.speech_encoder.whisper_model.blocks.20.attn.query.bias', 'model.speech_encoder.whisper_model.blocks.20.attn.key.weight', 'model.speech_encoder.whisper_model.blocks.20.attn.value.weight', 'model.speech_encoder.whisper_model.blocks.20.attn.value.bias', 'model.speech_encoder.whisper_model.blocks.20.attn.out.weight', 'model.speech_encoder.whisper_model.blocks.20.attn.out.bias', 'model.speech_encoder.whisper_model.blocks.20.attn_ln.weight', 'model.speech_encoder.whisper_model.blocks.20.attn_ln.bias', 'model.speech_encoder.whisper_model.blocks.20.mlp.0.weight', 'model.speech_encoder.whisper_model.blocks.20.mlp.0.bias', 'model.speech_encoder.whisper_model.blocks.20.mlp.2.weight', 'model.speech_encoder.whisper_model.blocks.20.mlp.2.bias', 'model.speech_encoder.whisper_model.blocks.20.mlp_ln.weight', 'model.speech_encoder.whisper_model.blocks.20.mlp_ln.bias', 'model.speech_encoder.whisper_model.blocks.21.attn.query.weight', 'model.speech_encoder.whisper_model.blocks.21.attn.query.bias', 'model.speech_encoder.whisper_model.blocks.21.attn.key.weight', 'model.speech_encoder.whisper_model.blocks.21.attn.value.weight', 'model.speech_encoder.whisper_model.blocks.21.attn.value.bias', 'model.speech_encoder.whisper_model.blocks.21.attn.out.weight', 'model.speech_encoder.whisper_model.blocks.21.attn.out.bias', 'model.speech_encoder.whisper_model.blocks.21.attn_ln.weight', 'model.speech_encoder.whisper_model.blocks.21.attn_ln.bias', 'model.speech_encoder.whisper_model.blocks.21.mlp.0.weight', 'model.speech_encoder.whisper_model.blocks.21.mlp.0.bias', 'model.speech_encoder.whisper_model.blocks.21.mlp.2.weight', 'model.speech_encoder.whisper_model.blocks.21.mlp.2.bias', 'model.speech_encoder.whisper_model.blocks.21.mlp_ln.weight', 'model.speech_encoder.whisper_model.blocks.21.mlp_ln.bias', 'model.speech_encoder.whisper_model.blocks.22.attn.query.weight', 'model.speech_encoder.whisper_model.blocks.22.attn.query.bias', 'model.speech_encoder.whisper_model.blocks.22.attn.key.weight', 'model.speech_encoder.whisper_model.blocks.22.attn.value.weight', 'model.speech_encoder.whisper_model.blocks.22.attn.value.bias', 'model.speech_encoder.whisper_model.blocks.22.attn.out.weight', 
'model.speech_encoder.whisper_model.blocks.22.attn.out.bias', 'model.speech_encoder.whisper_model.blocks.22.attn_ln.weight', 'model.speech_encoder.whisper_model.blocks.22.attn_ln.bias', 'model.speech_encoder.whisper_model.blocks.22.mlp.0.weight', 'model.speech_encoder.whisper_model.blocks.22.mlp.0.bias', 'model.speech_encoder.whisper_model.blocks.22.mlp.2.weight', 'model.speech_encoder.whisper_model.blocks.22.mlp.2.bias', 'model.speech_encoder.whisper_model.blocks.22.mlp_ln.weight', 'model.speech_encoder.whisper_model.blocks.22.mlp_ln.bias', 'model.speech_encoder.whisper_model.blocks.23.attn.query.weight', 'model.speech_encoder.whisper_model.blocks.23.attn.query.bias', 'model.speech_encoder.whisper_model.blocks.23.attn.key.weight', 'model.speech_encoder.whisper_model.blocks.23.attn.value.weight', 'model.speech_encoder.whisper_model.blocks.23.attn.value.bias', 'model.speech_encoder.whisper_model.blocks.23.attn.out.weight', 'model.speech_encoder.whisper_model.blocks.23.attn.out.bias', 'model.speech_encoder.whisper_model.blocks.23.attn_ln.weight', 'model.speech_encoder.whisper_model.blocks.23.attn_ln.bias', 'model.speech_encoder.whisper_model.blocks.23.mlp.0.weight', 'model.speech_encoder.whisper_model.blocks.23.mlp.0.bias', 'model.speech_encoder.whisper_model.blocks.23.mlp.2.weight', 'model.speech_encoder.whisper_model.blocks.23.mlp.2.bias', 'model.speech_encoder.whisper_model.blocks.23.mlp_ln.weight', 'model.speech_encoder.whisper_model.blocks.23.mlp_ln.bias', 'model.speech_encoder.whisper_model.blocks.24.attn.query.weight', 'model.speech_encoder.whisper_model.blocks.24.attn.query.bias', 'model.speech_encoder.whisper_model.blocks.24.attn.key.weight', 'model.speech_encoder.whisper_model.blocks.24.attn.value.weight', 'model.speech_encoder.whisper_model.blocks.24.attn.value.bias', 'model.speech_encoder.whisper_model.blocks.24.attn.out.weight', 'model.speech_encoder.whisper_model.blocks.24.attn.out.bias', 'model.speech_encoder.whisper_model.blocks.24.attn_ln.weight', 'model.speech_encoder.whisper_model.blocks.24.attn_ln.bias', 'model.speech_encoder.whisper_model.blocks.24.mlp.0.weight', 'model.speech_encoder.whisper_model.blocks.24.mlp.0.bias', 'model.speech_encoder.whisper_model.blocks.24.mlp.2.weight', 'model.speech_encoder.whisper_model.blocks.24.mlp.2.bias', 'model.speech_encoder.whisper_model.blocks.24.mlp_ln.weight', 'model.speech_encoder.whisper_model.blocks.24.mlp_ln.bias', 'model.speech_encoder.whisper_model.blocks.25.attn.query.weight', 'model.speech_encoder.whisper_model.blocks.25.attn.query.bias', 'model.speech_encoder.whisper_model.blocks.25.attn.key.weight', 'model.speech_encoder.whisper_model.blocks.25.attn.value.weight', 'model.speech_encoder.whisper_model.blocks.25.attn.value.bias', 'model.speech_encoder.whisper_model.blocks.25.attn.out.weight', 'model.speech_encoder.whisper_model.blocks.25.attn.out.bias', 'model.speech_encoder.whisper_model.blocks.25.attn_ln.weight', 'model.speech_encoder.whisper_model.blocks.25.attn_ln.bias', 'model.speech_encoder.whisper_model.blocks.25.mlp.0.weight', 'model.speech_encoder.whisper_model.blocks.25.mlp.0.bias', 'model.speech_encoder.whisper_model.blocks.25.mlp.2.weight', 'model.speech_encoder.whisper_model.blocks.25.mlp.2.bias', 'model.speech_encoder.whisper_model.blocks.25.mlp_ln.weight', 'model.speech_encoder.whisper_model.blocks.25.mlp_ln.bias', 'model.speech_encoder.whisper_model.blocks.26.attn.query.weight', 'model.speech_encoder.whisper_model.blocks.26.attn.query.bias', 'model.speech_encoder.whisper_model.blocks.26.attn.key.weight', 
'model.speech_encoder.whisper_model.blocks.26.attn.value.weight', 'model.speech_encoder.whisper_model.blocks.26.attn.value.bias', 'model.speech_encoder.whisper_model.blocks.26.attn.out.weight', 'model.speech_encoder.whisper_model.blocks.26.attn.out.bias', 'model.speech_encoder.whisper_model.blocks.26.attn_ln.weight', 'model.speech_encoder.whisper_model.blocks.26.attn_ln.bias', 'model.speech_encoder.whisper_model.blocks.26.mlp.0.weight', 'model.speech_encoder.whisper_model.blocks.26.mlp.0.bias', 'model.speech_encoder.whisper_model.blocks.26.mlp.2.weight', 'model.speech_encoder.whisper_model.blocks.26.mlp.2.bias', 'model.speech_encoder.whisper_model.blocks.26.mlp_ln.weight', 'model.speech_encoder.whisper_model.blocks.26.mlp_ln.bias', 'model.speech_encoder.whisper_model.blocks.27.attn.query.weight', 'model.speech_encoder.whisper_model.blocks.27.attn.query.bias', 'model.speech_encoder.whisper_model.blocks.27.attn.key.weight', 'model.speech_encoder.whisper_model.blocks.27.attn.value.weight', 'model.speech_encoder.whisper_model.blocks.27.attn.value.bias', 'model.speech_encoder.whisper_model.blocks.27.attn.out.weight', 'model.speech_encoder.whisper_model.blocks.27.attn.out.bias', 'model.speech_encoder.whisper_model.blocks.27.attn_ln.weight', 'model.speech_encoder.whisper_model.blocks.27.attn_ln.bias', 'model.speech_encoder.whisper_model.blocks.27.mlp.0.weight', 'model.speech_encoder.whisper_model.blocks.27.mlp.0.bias', 'model.speech_encoder.whisper_model.blocks.27.mlp.2.weight', 'model.speech_encoder.whisper_model.blocks.27.mlp.2.bias', 'model.speech_encoder.whisper_model.blocks.27.mlp_ln.weight', 'model.speech_encoder.whisper_model.blocks.27.mlp_ln.bias', 'model.speech_encoder.whisper_model.blocks.28.attn.query.weight', 'model.speech_encoder.whisper_model.blocks.28.attn.query.bias', 'model.speech_encoder.whisper_model.blocks.28.attn.key.weight', 'model.speech_encoder.whisper_model.blocks.28.attn.value.weight', 'model.speech_encoder.whisper_model.blocks.28.attn.value.bias', 'model.speech_encoder.whisper_model.blocks.28.attn.out.weight', 'model.speech_encoder.whisper_model.blocks.28.attn.out.bias', 'model.speech_encoder.whisper_model.blocks.28.attn_ln.weight', 'model.speech_encoder.whisper_model.blocks.28.attn_ln.bias', 'model.speech_encoder.whisper_model.blocks.28.mlp.0.weight', 'model.speech_encoder.whisper_model.blocks.28.mlp.0.bias', 'model.speech_encoder.whisper_model.blocks.28.mlp.2.weight', 'model.speech_encoder.whisper_model.blocks.28.mlp.2.bias', 'model.speech_encoder.whisper_model.blocks.28.mlp_ln.weight', 'model.speech_encoder.whisper_model.blocks.28.mlp_ln.bias', 'model.speech_encoder.whisper_model.blocks.29.attn.query.weight', 'model.speech_encoder.whisper_model.blocks.29.attn.query.bias', 'model.speech_encoder.whisper_model.blocks.29.attn.key.weight', 'model.speech_encoder.whisper_model.blocks.29.attn.value.weight', 'model.speech_encoder.whisper_model.blocks.29.attn.value.bias', 'model.speech_encoder.whisper_model.blocks.29.attn.out.weight', 'model.speech_encoder.whisper_model.blocks.29.attn.out.bias', 'model.speech_encoder.whisper_model.blocks.29.attn_ln.weight', 'model.speech_encoder.whisper_model.blocks.29.attn_ln.bias', 'model.speech_encoder.whisper_model.blocks.29.mlp.0.weight', 'model.speech_encoder.whisper_model.blocks.29.mlp.0.bias', 'model.speech_encoder.whisper_model.blocks.29.mlp.2.weight', 'model.speech_encoder.whisper_model.blocks.29.mlp.2.bias', 'model.speech_encoder.whisper_model.blocks.29.mlp_ln.weight', 'model.speech_encoder.whisper_model.blocks.29.mlp_ln.bias', 
'model.speech_encoder.whisper_model.blocks.30.attn.query.weight', 'model.speech_encoder.whisper_model.blocks.30.attn.query.bias', 'model.speech_encoder.whisper_model.blocks.30.attn.key.weight', 'model.speech_encoder.whisper_model.blocks.30.attn.value.weight', 'model.speech_encoder.whisper_model.blocks.30.attn.value.bias', 'model.speech_encoder.whisper_model.blocks.30.attn.out.weight', 'model.speech_encoder.whisper_model.blocks.30.attn.out.bias', 'model.speech_encoder.whisper_model.blocks.30.attn_ln.weight', 'model.speech_encoder.whisper_model.blocks.30.attn_ln.bias', 'model.speech_encoder.whisper_model.blocks.30.mlp.0.weight', 'model.speech_encoder.whisper_model.blocks.30.mlp.0.bias', 'model.speech_encoder.whisper_model.blocks.30.mlp.2.weight', 'model.speech_encoder.whisper_model.blocks.30.mlp.2.bias', 'model.speech_encoder.whisper_model.blocks.30.mlp_ln.weight', 'model.speech_encoder.whisper_model.blocks.30.mlp_ln.bias', 'model.speech_encoder.whisper_model.blocks.31.attn.query.weight', 'model.speech_encoder.whisper_model.blocks.31.attn.query.bias', 'model.speech_encoder.whisper_model.blocks.31.attn.key.weight', 'model.speech_encoder.whisper_model.blocks.31.attn.value.weight', 'model.speech_encoder.whisper_model.blocks.31.attn.value.bias', 'model.speech_encoder.whisper_model.blocks.31.attn.out.weight', 'model.speech_encoder.whisper_model.blocks.31.attn.out.bias', 'model.speech_encoder.whisper_model.blocks.31.attn_ln.weight', 'model.speech_encoder.whisper_model.blocks.31.attn_ln.bias', 'model.speech_encoder.whisper_model.blocks.31.mlp.0.weight', 'model.speech_encoder.whisper_model.blocks.31.mlp.0.bias', 'model.speech_encoder.whisper_model.blocks.31.mlp.2.weight', 'model.speech_encoder.whisper_model.blocks.31.mlp.2.bias', 'model.speech_encoder.whisper_model.blocks.31.mlp_ln.weight', 'model.speech_encoder.whisper_model.blocks.31.mlp_ln.bias', 'model.speech_encoder.whisper_model.ln_post.weight', 'model.speech_encoder.whisper_model.ln_post.bias', 'model.speech_encoder.beats_model.post_extract_proj.weight', 'model.speech_encoder.beats_model.post_extract_proj.bias', 'model.speech_encoder.beats_model.patch_embedding.weight', 'model.speech_encoder.beats_model.encoder.pos_conv.0.bias', 'model.speech_encoder.beats_model.encoder.pos_conv.0.weight_g', 'model.speech_encoder.beats_model.encoder.pos_conv.0.weight_v', 'model.speech_encoder.beats_model.encoder.layers.0.self_attn.grep_a', 'model.speech_encoder.beats_model.encoder.layers.0.self_attn.relative_attention_bias.weight', 'model.speech_encoder.beats_model.encoder.layers.0.self_attn.k_proj.weight', 'model.speech_encoder.beats_model.encoder.layers.0.self_attn.k_proj.bias', 'model.speech_encoder.beats_model.encoder.layers.0.self_attn.v_proj.weight', 'model.speech_encoder.beats_model.encoder.layers.0.self_attn.v_proj.bias', 'model.speech_encoder.beats_model.encoder.layers.0.self_attn.q_proj.weight', 'model.speech_encoder.beats_model.encoder.layers.0.self_attn.q_proj.bias', 'model.speech_encoder.beats_model.encoder.layers.0.self_attn.out_proj.weight', 'model.speech_encoder.beats_model.encoder.layers.0.self_attn.out_proj.bias', 'model.speech_encoder.beats_model.encoder.layers.0.self_attn.grep_linear.weight', 'model.speech_encoder.beats_model.encoder.layers.0.self_attn.grep_linear.bias', 'model.speech_encoder.beats_model.encoder.layers.0.self_attn_layer_norm.weight', 'model.speech_encoder.beats_model.encoder.layers.0.self_attn_layer_norm.bias', 'model.speech_encoder.beats_model.encoder.layers.0.fc1.weight', 
'model.speech_encoder.beats_model.encoder.layers.0.fc1.bias', 'model.speech_encoder.beats_model.encoder.layers.0.fc2.weight', 'model.speech_encoder.beats_model.encoder.layers.0.fc2.bias', 'model.speech_encoder.beats_model.encoder.layers.0.final_layer_norm.weight', 'model.speech_encoder.beats_model.encoder.layers.0.final_layer_norm.bias', 'model.speech_encoder.beats_model.encoder.layers.1.self_attn.grep_a', 'model.speech_encoder.beats_model.encoder.layers.1.self_attn.k_proj.weight', 'model.speech_encoder.beats_model.encoder.layers.1.self_attn.k_proj.bias', 'model.speech_encoder.beats_model.encoder.layers.1.self_attn.v_proj.weight', 'model.speech_encoder.beats_model.encoder.layers.1.self_attn.v_proj.bias', 'model.speech_encoder.beats_model.encoder.layers.1.self_attn.q_proj.weight', 'model.speech_encoder.beats_model.encoder.layers.1.self_attn.q_proj.bias', 'model.speech_encoder.beats_model.encoder.layers.1.self_attn.out_proj.weight', 'model.speech_encoder.beats_model.encoder.layers.1.self_attn.out_proj.bias', 'model.speech_encoder.beats_model.encoder.layers.1.self_attn.grep_linear.weight', 'model.speech_encoder.beats_model.encoder.layers.1.self_attn.grep_linear.bias', 'model.speech_encoder.beats_model.encoder.layers.1.self_attn.relative_attention_bias.weight', 'model.speech_encoder.beats_model.encoder.layers.1.self_attn_layer_norm.weight', 'model.speech_encoder.beats_model.encoder.layers.1.self_attn_layer_norm.bias', 'model.speech_encoder.beats_model.encoder.layers.1.fc1.weight', 'model.speech_encoder.beats_model.encoder.layers.1.fc1.bias', 'model.speech_encoder.beats_model.encoder.layers.1.fc2.weight', 'model.speech_encoder.beats_model.encoder.layers.1.fc2.bias', 'model.speech_encoder.beats_model.encoder.layers.1.final_layer_norm.weight', 'model.speech_encoder.beats_model.encoder.layers.1.final_layer_norm.bias', 'model.speech_encoder.beats_model.encoder.layers.2.self_attn.grep_a', 'model.speech_encoder.beats_model.encoder.layers.2.self_attn.k_proj.weight', 'model.speech_encoder.beats_model.encoder.layers.2.self_attn.k_proj.bias', 'model.speech_encoder.beats_model.encoder.layers.2.self_attn.v_proj.weight', 'model.speech_encoder.beats_model.encoder.layers.2.self_attn.v_proj.bias', 'model.speech_encoder.beats_model.encoder.layers.2.self_attn.q_proj.weight', 'model.speech_encoder.beats_model.encoder.layers.2.self_attn.q_proj.bias', 'model.speech_encoder.beats_model.encoder.layers.2.self_attn.out_proj.weight', 'model.speech_encoder.beats_model.encoder.layers.2.self_attn.out_proj.bias', 'model.speech_encoder.beats_model.encoder.layers.2.self_attn.grep_linear.weight', 'model.speech_encoder.beats_model.encoder.layers.2.self_attn.grep_linear.bias', 'model.speech_encoder.beats_model.encoder.layers.2.self_attn.relative_attention_bias.weight', 'model.speech_encoder.beats_model.encoder.layers.2.self_attn_layer_norm.weight', 'model.speech_encoder.beats_model.encoder.layers.2.self_attn_layer_norm.bias', 'model.speech_encoder.beats_model.encoder.layers.2.fc1.weight', 'model.speech_encoder.beats_model.encoder.layers.2.fc1.bias', 'model.speech_encoder.beats_model.encoder.layers.2.fc2.weight', 'model.speech_encoder.beats_model.encoder.layers.2.fc2.bias', 'model.speech_encoder.beats_model.encoder.layers.2.final_layer_norm.weight', 'model.speech_encoder.beats_model.encoder.layers.2.final_layer_norm.bias', 'model.speech_encoder.beats_model.encoder.layers.3.self_attn.grep_a', 'model.speech_encoder.beats_model.encoder.layers.3.self_attn.k_proj.weight', 
'model.speech_encoder.beats_model.encoder.layers.3.self_attn.k_proj.bias', 'model.speech_encoder.beats_model.encoder.layers.3.self_attn.v_proj.weight', 'model.speech_encoder.beats_model.encoder.layers.3.self_attn.v_proj.bias', 'model.speech_encoder.beats_model.encoder.layers.3.self_attn.q_proj.weight', 'model.speech_encoder.beats_model.encoder.layers.3.self_attn.q_proj.bias', 'model.speech_encoder.beats_model.encoder.layers.3.self_attn.out_proj.weight', 'model.speech_encoder.beats_model.encoder.layers.3.self_attn.out_proj.bias', 'model.speech_encoder.beats_model.encoder.layers.3.self_attn.grep_linear.weight', 'model.speech_encoder.beats_model.encoder.layers.3.self_attn.grep_linear.bias', 'model.speech_encoder.beats_model.encoder.layers.3.self_attn.relative_attention_bias.weight', 'model.speech_encoder.beats_model.encoder.layers.3.self_attn_layer_norm.weight', 'model.speech_encoder.beats_model.encoder.layers.3.self_attn_layer_norm.bias', 'model.speech_encoder.beats_model.encoder.layers.3.fc1.weight', 'model.speech_encoder.beats_model.encoder.layers.3.fc1.bias', 'model.speech_encoder.beats_model.encoder.layers.3.fc2.weight', 'model.speech_encoder.beats_model.encoder.layers.3.fc2.bias', 'model.speech_encoder.beats_model.encoder.layers.3.final_layer_norm.weight', 'model.speech_encoder.beats_model.encoder.layers.3.final_layer_norm.bias', 'model.speech_encoder.beats_model.encoder.layers.4.self_attn.grep_a', 'model.speech_encoder.beats_model.encoder.layers.4.self_attn.k_proj.weight', 'model.speech_encoder.beats_model.encoder.layers.4.self_attn.k_proj.bias', 'model.speech_encoder.beats_model.encoder.layers.4.self_attn.v_proj.weight', 'model.speech_encoder.beats_model.encoder.layers.4.self_attn.v_proj.bias', 'model.speech_encoder.beats_model.encoder.layers.4.self_attn.q_proj.weight', 'model.speech_encoder.beats_model.encoder.layers.4.self_attn.q_proj.bias', 'model.speech_encoder.beats_model.encoder.layers.4.self_attn.out_proj.weight', 'model.speech_encoder.beats_model.encoder.layers.4.self_attn.out_proj.bias', 'model.speech_encoder.beats_model.encoder.layers.4.self_attn.grep_linear.weight', 'model.speech_encoder.beats_model.encoder.layers.4.self_attn.grep_linear.bias', 'model.speech_encoder.beats_model.encoder.layers.4.self_attn.relative_attention_bias.weight', 'model.speech_encoder.beats_model.encoder.layers.4.self_attn_layer_norm.weight', 'model.speech_encoder.beats_model.encoder.layers.4.self_attn_layer_norm.bias', 'model.speech_encoder.beats_model.encoder.layers.4.fc1.weight', 'model.speech_encoder.beats_model.encoder.layers.4.fc1.bias', 'model.speech_encoder.beats_model.encoder.layers.4.fc2.weight', 'model.speech_encoder.beats_model.encoder.layers.4.fc2.bias', 'model.speech_encoder.beats_model.encoder.layers.4.final_layer_norm.weight', 'model.speech_encoder.beats_model.encoder.layers.4.final_layer_norm.bias', 'model.speech_encoder.beats_model.encoder.layers.5.self_attn.grep_a', 'model.speech_encoder.beats_model.encoder.layers.5.self_attn.k_proj.weight', 'model.speech_encoder.beats_model.encoder.layers.5.self_attn.k_proj.bias', 'model.speech_encoder.beats_model.encoder.layers.5.self_attn.v_proj.weight', 'model.speech_encoder.beats_model.encoder.layers.5.self_attn.v_proj.bias', 'model.speech_encoder.beats_model.encoder.layers.5.self_attn.q_proj.weight', 'model.speech_encoder.beats_model.encoder.layers.5.self_attn.q_proj.bias', 'model.speech_encoder.beats_model.encoder.layers.5.self_attn.out_proj.weight', 'model.speech_encoder.beats_model.encoder.layers.5.self_attn.out_proj.bias', 
'model.speech_encoder.beats_model.encoder.layers.5.self_attn.grep_linear.weight', 'model.speech_encoder.beats_model.encoder.layers.5.self_attn.grep_linear.bias', 'model.speech_encoder.beats_model.encoder.layers.5.self_attn.relative_attention_bias.weight', 'model.speech_encoder.beats_model.encoder.layers.5.self_attn_layer_norm.weight', 'model.speech_encoder.beats_model.encoder.layers.5.self_attn_layer_norm.bias', 'model.speech_encoder.beats_model.encoder.layers.5.fc1.weight', 'model.speech_encoder.beats_model.encoder.layers.5.fc1.bias', 'model.speech_encoder.beats_model.encoder.layers.5.fc2.weight', 'model.speech_encoder.beats_model.encoder.layers.5.fc2.bias', 'model.speech_encoder.beats_model.encoder.layers.5.final_layer_norm.weight', 'model.speech_encoder.beats_model.encoder.layers.5.final_layer_norm.bias', 'model.speech_encoder.beats_model.encoder.layers.6.self_attn.grep_a', 'model.speech_encoder.beats_model.encoder.layers.6.self_attn.k_proj.weight', 'model.speech_encoder.beats_model.encoder.layers.6.self_attn.k_proj.bias', 'model.speech_encoder.beats_model.encoder.layers.6.self_attn.v_proj.weight', 'model.speech_encoder.beats_model.encoder.layers.6.self_attn.v_proj.bias', 'model.speech_encoder.beats_model.encoder.layers.6.self_attn.q_proj.weight', 'model.speech_encoder.beats_model.encoder.layers.6.self_attn.q_proj.bias', 'model.speech_encoder.beats_model.encoder.layers.6.self_attn.out_proj.weight', 'model.speech_encoder.beats_model.encoder.layers.6.self_attn.out_proj.bias', 'model.speech_encoder.beats_model.encoder.layers.6.self_attn.grep_linear.weight', 'model.speech_encoder.beats_model.encoder.layers.6.self_attn.grep_linear.bias', 'model.speech_encoder.beats_model.encoder.layers.6.self_attn.relative_attention_bias.weight', 'model.speech_encoder.beats_model.encoder.layers.6.self_attn_layer_norm.weight', 'model.speech_encoder.beats_model.encoder.layers.6.self_attn_layer_norm.bias', 'model.speech_encoder.beats_model.encoder.layers.6.fc1.weight', 'model.speech_encoder.beats_model.encoder.layers.6.fc1.bias', 'model.speech_encoder.beats_model.encoder.layers.6.fc2.weight', 'model.speech_encoder.beats_model.encoder.layers.6.fc2.bias', 'model.speech_encoder.beats_model.encoder.layers.6.final_layer_norm.weight', 'model.speech_encoder.beats_model.encoder.layers.6.final_layer_norm.bias', 'model.speech_encoder.beats_model.encoder.layers.7.self_attn.grep_a', 'model.speech_encoder.beats_model.encoder.layers.7.self_attn.k_proj.weight', 'model.speech_encoder.beats_model.encoder.layers.7.self_attn.k_proj.bias', 'model.speech_encoder.beats_model.encoder.layers.7.self_attn.v_proj.weight', 'model.speech_encoder.beats_model.encoder.layers.7.self_attn.v_proj.bias', 'model.speech_encoder.beats_model.encoder.layers.7.self_attn.q_proj.weight', 'model.speech_encoder.beats_model.encoder.layers.7.self_attn.q_proj.bias', 'model.speech_encoder.beats_model.encoder.layers.7.self_attn.out_proj.weight', 'model.speech_encoder.beats_model.encoder.layers.7.self_attn.out_proj.bias', 'model.speech_encoder.beats_model.encoder.layers.7.self_attn.grep_linear.weight', 'model.speech_encoder.beats_model.encoder.layers.7.self_attn.grep_linear.bias', 'model.speech_encoder.beats_model.encoder.layers.7.self_attn.relative_attention_bias.weight', 'model.speech_encoder.beats_model.encoder.layers.7.self_attn_layer_norm.weight', 'model.speech_encoder.beats_model.encoder.layers.7.self_attn_layer_norm.bias', 'model.speech_encoder.beats_model.encoder.layers.7.fc1.weight', 'model.speech_encoder.beats_model.encoder.layers.7.fc1.bias', 
'model.speech_encoder.beats_model.encoder.layers.7.fc2.weight', 'model.speech_encoder.beats_model.encoder.layers.7.fc2.bias', 'model.speech_encoder.beats_model.encoder.layers.7.final_layer_norm.weight', 'model.speech_encoder.beats_model.encoder.layers.7.final_layer_norm.bias', 'model.speech_encoder.beats_model.encoder.layers.8.self_attn.grep_a', 'model.speech_encoder.beats_model.encoder.layers.8.self_attn.k_proj.weight', 'model.speech_encoder.beats_model.encoder.layers.8.self_attn.k_proj.bias', 'model.speech_encoder.beats_model.encoder.layers.8.self_attn.v_proj.weight', 'model.speech_encoder.beats_model.encoder.layers.8.self_attn.v_proj.bias', 'model.speech_encoder.beats_model.encoder.layers.8.self_attn.q_proj.weight', 'model.speech_encoder.beats_model.encoder.layers.8.self_attn.q_proj.bias', 'model.speech_encoder.beats_model.encoder.layers.8.self_attn.out_proj.weight', 'model.speech_encoder.beats_model.encoder.layers.8.self_attn.out_proj.bias', 'model.speech_encoder.beats_model.encoder.layers.8.self_attn.grep_linear.weight', 'model.speech_encoder.beats_model.encoder.layers.8.self_attn.grep_linear.bias', 'model.speech_encoder.beats_model.encoder.layers.8.self_attn.relative_attention_bias.weight', 'model.speech_encoder.beats_model.encoder.layers.8.self_attn_layer_norm.weight', 'model.speech_encoder.beats_model.encoder.layers.8.self_attn_layer_norm.bias', 'model.speech_encoder.beats_model.encoder.layers.8.fc1.weight', 'model.speech_encoder.beats_model.encoder.layers.8.fc1.bias', 'model.speech_encoder.beats_model.encoder.layers.8.fc2.weight', 'model.speech_encoder.beats_model.encoder.layers.8.fc2.bias', 'model.speech_encoder.beats_model.encoder.layers.8.final_layer_norm.weight', 'model.speech_encoder.beats_model.encoder.layers.8.final_layer_norm.bias', 'model.speech_encoder.beats_model.encoder.layers.9.self_attn.grep_a', 'model.speech_encoder.beats_model.encoder.layers.9.self_attn.k_proj.weight', 'model.speech_encoder.beats_model.encoder.layers.9.self_attn.k_proj.bias', 'model.speech_encoder.beats_model.encoder.layers.9.self_attn.v_proj.weight', 'model.speech_encoder.beats_model.encoder.layers.9.self_attn.v_proj.bias', 'model.speech_encoder.beats_model.encoder.layers.9.self_attn.q_proj.weight', 'model.speech_encoder.beats_model.encoder.layers.9.self_attn.q_proj.bias', 'model.speech_encoder.beats_model.encoder.layers.9.self_attn.out_proj.weight', 'model.speech_encoder.beats_model.encoder.layers.9.self_attn.out_proj.bias', 'model.speech_encoder.beats_model.encoder.layers.9.self_attn.grep_linear.weight', 'model.speech_encoder.beats_model.encoder.layers.9.self_attn.grep_linear.bias', 'model.speech_encoder.beats_model.encoder.layers.9.self_attn.relative_attention_bias.weight', 'model.speech_encoder.beats_model.encoder.layers.9.self_attn_layer_norm.weight', 'model.speech_encoder.beats_model.encoder.layers.9.self_attn_layer_norm.bias', 'model.speech_encoder.beats_model.encoder.layers.9.fc1.weight', 'model.speech_encoder.beats_model.encoder.layers.9.fc1.bias', 'model.speech_encoder.beats_model.encoder.layers.9.fc2.weight', 'model.speech_encoder.beats_model.encoder.layers.9.fc2.bias', 'model.speech_encoder.beats_model.encoder.layers.9.final_layer_norm.weight', 'model.speech_encoder.beats_model.encoder.layers.9.final_layer_norm.bias', 'model.speech_encoder.beats_model.encoder.layers.10.self_attn.grep_a', 'model.speech_encoder.beats_model.encoder.layers.10.self_attn.k_proj.weight', 'model.speech_encoder.beats_model.encoder.layers.10.self_attn.k_proj.bias', 
'model.speech_encoder.beats_model.encoder.layers.10.self_attn.v_proj.weight', 'model.speech_encoder.beats_model.encoder.layers.10.self_attn.v_proj.bias', 'model.speech_encoder.beats_model.encoder.layers.10.self_attn.q_proj.weight', 'model.speech_encoder.beats_model.encoder.layers.10.self_attn.q_proj.bias', 'model.speech_encoder.beats_model.encoder.layers.10.self_attn.out_proj.weight', 'model.speech_encoder.beats_model.encoder.layers.10.self_attn.out_proj.bias', 'model.speech_encoder.beats_model.encoder.layers.10.self_attn.grep_linear.weight', 'model.speech_encoder.beats_model.encoder.layers.10.self_attn.grep_linear.bias', 'model.speech_encoder.beats_model.encoder.layers.10.self_attn.relative_attention_bias.weight', 'model.speech_encoder.beats_model.encoder.layers.10.self_attn_layer_norm.weight', 'model.speech_encoder.beats_model.encoder.layers.10.self_attn_layer_norm.bias', 'model.speech_encoder.beats_model.encoder.layers.10.fc1.weight', 'model.speech_encoder.beats_model.encoder.layers.10.fc1.bias', 'model.speech_encoder.beats_model.encoder.layers.10.fc2.weight', 'model.speech_encoder.beats_model.encoder.layers.10.fc2.bias', 'model.speech_encoder.beats_model.encoder.layers.10.final_layer_norm.weight', 'model.speech_encoder.beats_model.encoder.layers.10.final_layer_norm.bias', 'model.speech_encoder.beats_model.encoder.layers.11.self_attn.grep_a', 'model.speech_encoder.beats_model.encoder.layers.11.self_attn.k_proj.weight', 'model.speech_encoder.beats_model.encoder.layers.11.self_attn.k_proj.bias', 'model.speech_encoder.beats_model.encoder.layers.11.self_attn.v_proj.weight', 'model.speech_encoder.beats_model.encoder.layers.11.self_attn.v_proj.bias', 'model.speech_encoder.beats_model.encoder.layers.11.self_attn.q_proj.weight', 'model.speech_encoder.beats_model.encoder.layers.11.self_attn.q_proj.bias', 'model.speech_encoder.beats_model.encoder.layers.11.self_attn.out_proj.weight', 'model.speech_encoder.beats_model.encoder.layers.11.self_attn.out_proj.bias', 'model.speech_encoder.beats_model.encoder.layers.11.self_attn.grep_linear.weight', 'model.speech_encoder.beats_model.encoder.layers.11.self_attn.grep_linear.bias', 'model.speech_encoder.beats_model.encoder.layers.11.self_attn.relative_attention_bias.weight', 'model.speech_encoder.beats_model.encoder.layers.11.self_attn_layer_norm.weight', 'model.speech_encoder.beats_model.encoder.layers.11.self_attn_layer_norm.bias', 'model.speech_encoder.beats_model.encoder.layers.11.fc1.weight', 'model.speech_encoder.beats_model.encoder.layers.11.fc1.bias', 'model.speech_encoder.beats_model.encoder.layers.11.fc2.weight', 'model.speech_encoder.beats_model.encoder.layers.11.fc2.bias', 'model.speech_encoder.beats_model.encoder.layers.11.final_layer_norm.weight', 'model.speech_encoder.beats_model.encoder.layers.11.final_layer_norm.bias', 'model.speech_encoder.beats_model.encoder.layer_norm.weight', 'model.speech_encoder.beats_model.encoder.layer_norm.bias', 'model.speech_encoder.beats_model.layer_norm.weight', 'model.speech_encoder.beats_model.layer_norm.bias', 'model.speech_encoder.beats_model.predictor.weight', 'model.speech_encoder.beats_model.predictor.bias', 'model.speech_projector.speech_newline', 'model.speech_projector.speech_begin', 'model.speech_projector.speech_end', 'model.speech_projector.linear1.weight', 'model.speech_projector.linear1.bias', 'model.speech_projector.linear2.weight', 'model.speech_projector.linear2.bias']
15
+ Unexpected keys: []
16
+ Loading vision tower...
17
+ Loading vision tower succeeded.
18
+ User: Hello, who are you?
19
+ Assistant: Hello! I'm PLM-V, an AI assistant created to provide information, answer questions, and help with various tasks. How can I assist you today?
20
+
21
+ ================================================================================
22
+ 🧪 开始多模态推理测试
23
+ ================================================================================
24
+
25
+ ============================================================
26
+ 🧪 测试: Pure Text
27
+ 📝 问题: Hello, who are you? Please introduce yourself briefly.
28
+ ============================================================
29
+ <|im_start|>system
30
+ You are PLM-V, a helpful assistant.<|im_end|>
31
+ <|im_start|>user
32
+ Hello, who are you? Please introduce yourself briefly.<|im_end|>
33
+ <|im_start|>assistant
34
+ Hello! I am PLM-V, an intelligent assistant designed to provide you with detailed and accurate information, answer questions, and assist with a wide range of topics. My goal is to support you in a friendly and knowledgeable manner, making your interactions with me as informative and helpful as possible. How can I assist you today?
35
+ ✅ 推理成功!
36
+ 🤖 回复: Hello! I am PLM-V, an intelligent assistant designed to provide you with detailed and accurate information, answer questions, and assist with a wide range of topics. My goal is to support you in a friendly and knowledgeable manner, making your interactions with me as informative and helpful as possible. How can I assist you today?
37
+
38
+ ============================================================
39
+ 🧪 测试: Text & Image (Visual only)
40
+ 📝 问题: <image>
41
+ Please describe this image in detail.
42
+ ============================================================
43
+ dynamic ViT batch size: 13
44
+ <|im_start|>system
45
+ You are PLM-V, a helpful assistant.<|im_end|>
46
+ <|im_start|>user
47
+ <image>
48
+ Please describe this image in detail.<|im_end|>
49
+ <|im_start|>assistant
50
+ The image shows a close-up of a red panda, characterized by its distinctive reddish-brown fur with white markings around its face and back. The red panda appears to be leaning on a wooden structure, possibly a ledge or a part of a platform. The subject has a mix of black and white fur around its ears, eyes, and muzzle, which contrasts with its redder head. The expression on the red panda's face seems calm and curious as it looks directly at the camera. The background features blurred greenery, suggesting that the animal is in an outdoor environment with trees and plants. The setting gives a natural, outdoor feel to the image.
51
+ ✅ 推理成功!
52
+ 🤖 回复: The image shows a close-up of a red panda, characterized by its distinctive reddish-brown fur with white markings around its face and back. The red panda appears to be leaning on a wooden structure, possibly a ledge or a part of a platform. The subject has a mix of black and white fur around its ears, eyes, and muzzle, which contrasts with its redder head. The expression on the red panda's face seems calm and curious as it looks directly at the camera. The background features blurred greenery, suggesting that the animal is in an outdoor environment with trees and plants. The setting gives a natural, outdoor feel to the image.
53
+
54
+ ============================================================
55
+ 🔄 准备Speech相关测试 (可能输出乱码,因为speech部分未训练)
56
+ ============================================================
57
+
58
+ 📥 加载视频数据...
59
+ ✅ 视频加载成功:
60
+ - 视频帧数: 8
61
+ - 视频像素值形状: torch.Size([8, 3, 448, 448])
62
+ - 每帧patch数: [1, 1, 1, 1, 1, 1, 1, 1]
63
+
64
+ 📥 加载音频数据...
65
+ ✅ 音频加载成功:
66
+ - mel谱图形状: torch.Size([2, 3000, 128])
67
+ - 音频长度: tensor([3000, 3000])
68
+ - 音频块数: tensor([2])
69
+ - 原始音频波形形状: torch.Size([2, 480000])
70
+
71
+ ============================================================
72
+ 🧪 测试: Audio only (预期乱码)
73
+ 📝 问题: <speech>
74
+ Please transcribe and summarize what you heard in the audio.
75
+ ============================================================
76
+ speech batch size: 2
77
+ <|im_start|>system
78
+ You are PLM-V, a helpful assistant.<|im_end|>
79
+ <|im_start|>user
80
+ <speech>
81
+ Please transcribe and summarize what you heard in the audio.<|im_end|>
82
+ <|im_start|>assistant
83
+ The end
84
+
85
+ The internet
86
+
87
+ or
88
+
89
+ or
90
+
91
+ structure
92
+
93
+ The overcorrection
94
+
95
+ As年人
96
+
97
+ pre-hel theorem is the following: suppose that the other
98
+ There is no one knows that the
99
+
100
+ We consider that the following is the
101
+ It is sufficient that the
102
+
103
+ Consider that the
104
+
105
+ As we see the answer to the question
106
+
107
+ Let us consider the following:
108
+ The initial
109
+
110
+ or, in the beginning of the text
111
+
112
+ We need to consider the following:
113
+
114
+ The answer is
115
+
116
+ Here, we need to consider the following answers
117
+
118
+ However, the question is: What is the
119
+ But the answer to the question is:
120
+ Or the sequence of the answer is:
121
+
122
+ The final answer that is:
123
+
124
+ Actually, the factor, and the issue is: The problem is:
125
+ The problem solved, and the answer is:
126
+
127
+ However, the problem that is: The problem posed is: For the final solution
128
+
129
+ The answer to the question is: For the following question
130
+
131
+ But the question that is solved by: The issue here is: The question we need to solve: The problem we are considering: The question that exists: The problem that we need to address: The question that is:
132
+
133
+ Let us consider the following: The problem that was
134
+
135
+ The answer is: The problem is:
136
+
137
+ We reconsider the question: The question now
138
+
139
+ The problem is: The question we have: The problem that follows: The question is: The final question: The problem that exists: The problem that is: The question posed
140
+
141
+ The final answer is: The problem: The question considered: The question posed: The final answer: The problem that we consider: The problem that was solved: The answer: The problem solved: The problem solved:
142
+
143
+ After a moment of reflection, the question, and the problem is: The question, and the problem is: The answer to the problem: The question and the problem is: The problem that the question is: The problem where the question is
144
+
145
+ However, the initial question, and the answer following these lines:
146
+
147
+ Alternatively, the problem is solved by the following question:
148
+ The sequence of the question and answers
149
+
150
+ The problem posed in the following
151
+ After some thought,the sequence: The problem that the system
152
+
153
+ The actual problem: The actual answer to the question: The actual issue that is: The actual thing that is: The actual fact that is: The actual solution to the problem: The actual moment: The actual thing: The actual purpose: The actual issue: The actual event: The actual time: The actual issue resolved: The actual thing that is: The actual thing that is: The actual moment that is: The actual situation
154
+
155
+ The end of the answer is: The end of the process is: The end of the procedure is
156
+ The end of the process that is:
157
+
158
+ The final answer is: The final step in the process is: The final part of the procedure is:
159
+ The final note that is: The final issue is: The final issue that is:
160
+ The final step is: The final issue solved is: The final part of the question is: The final question is: The final version of the question is: The final part of the structure is: The final structure that is: The final structure of the problem is: The final structure of the consideration is: The final consideration of the structure is: The structure that is: The structure that contains the
161
+
162
+ But the final answer is: The final answer to the question: The final answer that is: The final answer: The final answer: The final answer: The final answer: The final answer:
163
+
164
+ The final issue is: The final answer: The initial issue is:
165
+
166
+ The final answer
167
+ So, the final answer is: The final answer
168
+
169
+ Suppose we consider the final answer: The final answer is: The final answer
170
+
171
+ The final answer is: The final answer
172
+ The final answer
173
+ The final answer
174
+ The final answer
175
+
176
+ The final answer is: The final answer
177
+
178
+ The final answer is: The final answer
179
+
180
+ The final answer is: The final answer: The final answer: The final answer
181
+ But the final answer to the question is: The final answer: The final answer: The final answer:
182
+ The final answer: The final answer
183
+
184
+ The final answer
185
+
186
+ The final answer is: The final answer
187
+
188
+ The final solution is: The final answer
189
+
190
+ The final answer
191
+
192
+ The final answer is: The final answer
193
+
194
+ The final answer
195
+
196
+ The final step is: The final answer
197
+
198
+ The final answer
199
+
200
+ The final answer
201
+
202
+ The final answer is: The final answer
203
+
204
+ The final answer: The final answer
205
+
206
+ The final answer
207
+
208
+ The final answer
209
+ The final answer
210
+ The final answer
211
+ The final answer
212
+
213
+ The final answer (a)
214
+
215
+ But the
216
+
217
+ The final answer
218
+
219
+ The final answer
220
+
221
+ The final answer
222
+
223
+ The final answer
224
+
225
+ The final answer: The final answer
226
+
227
+ The final answer
228
+
229
+ The final answer
230
+
231
+ The final answer
232
+
233
+ The final answer
234
+
235
+ The final answer
236
+
237
+ The final answer
238
+
239
+ The final answer
240
+
241
+ The final answer
242
+
243
+ The final answer
244
+
245
+ The final answer
246
+ ✅ 推理成功!
247
+ 🤖 回复: The end
248
+
249
+ The internet
250
+
251
+ or
252
+
253
+ or
254
+
255
+ structure
256
+
257
+ The overcorrection
258
+
259
+ As年人
260
+
261
+ pre-hel theorem is the following: suppose that the other
262
+ There is no one knows that the
263
+
264
+ We consider that the following is the
265
+ It is sufficient that the
266
+
267
+ Consider that the
268
+
269
+ As we see the answer to the question
270
+
271
+ Let us consider the following:
272
+ The initial
273
+
274
+ or, in the beginning of the text
275
+
276
+ We need to consider the following:
277
+
278
+ The answer is
279
+
280
+ Here, we need to consider the following answers
281
+
282
+ However, the question is: What is the
283
+ But the answer to the question is:
284
+ Or the sequence of the answer is:
285
+
286
+ The final answer that is:
287
+
288
+ Actually, the factor, and the issue is: The problem is:
289
+ The problem solved, and the answer is:
290
+
291
+ However, the problem that is: The problem posed is: For the final solution
292
+
293
+ The answer to the question is: For the following question
294
+
295
+ But the question that is solved by: The issue here is: The question we need to solve: The problem we are considering: The question that exists: The problem that we need to address: The question that is:
296
+
297
+ Let us consider the following: The problem that was
298
+
299
+ The answer is: The problem is:
300
+
301
+ We reconsider the question: The question now
302
+
303
+ The problem is: The question we have: The problem that follows: The question is: The final question: The problem that exists: The problem that is: The question posed
304
+
305
+ The final answer is: The problem: The question considered: The question posed: The final answer: The problem that we consider: The problem that was solved: The answer: The problem solved: The problem solved:
306
+
307
+ After a moment of reflection, the question, and the problem is: The question, and the problem is: The answer to the problem: The question and the problem is: The problem that the question is: The problem where the question is
308
+
309
+ However, the initial question, and the answer following these lines:
310
+
311
+ Alternatively, the problem is solved by the following question:
312
+ The sequence of the question and answers
313
+
314
+ The problem posed in the following
315
+ After some thought,the sequence: The problem that the system
316
+
317
+ The actual problem: The actual answer to the question: The actual issue that is: The actual thing that is: The actual fact that is: The actual solution to the problem: The actual moment: The actual thing: The actual purpose: The actual issue: The actual event: The actual time: The actual issue resolved: The actual thing that is: The actual thing that is: The actual moment that is: The actual situation
318
+
319
+ The end of the answer is: The end of the process is: The end of the procedure is
320
+ The end of the process that is:
321
+
322
+ The final answer is: The final step in the process is: The final part of the procedure is:
323
+ The final note that is: The final issue is: The final issue that is:
324
+ The final step is: The final issue solved is: The final part of the question is: The final question is: The final version of the question is: The final part of the structure is: The final structure that is: The final structure of the problem is: The final structure of the consideration is: The final consideration of the structure is: The structure that is: The structure that contains the
325
+
326
+ But the final answer is: The final answer to the question: The final answer that is: The final answer: The final answer: The final answer: The final answer: The final answer:
327
+
328
+ The final issue is: The final answer: The initial issue is:
329
+
330
+ The final answer
331
+ So, the final answer is: The final answer
332
+
333
+ Suppose we consider the final answer: The final answer is: The final answer
334
+
335
+ The final answer is: The final answer
336
+ The final answer
337
+ The final answer
338
+ The final answer
339
+
340
+ The final answer is: The final answer
341
+
342
+ The final answer is: The final answer
343
+
344
+ The final answer is: The final answer: The final answer: The final answer
345
+ But the final answer to the question is: The final answer: The final answer: The final answer:
346
+ The final answer: The final answer
347
+
348
+ The final answer
349
+
350
+ The final answer is: The final answer
351
+
352
+ The final solution is: The final answer
353
+
354
+ The final answer
355
+
356
+ The final answer is: The final answer
357
+
358
+ The final answer
359
+
360
+ The final step is: The final answer
361
+
362
+ The final answer
363
+
364
+ The final answer
365
+
366
+ The final answer is: The final answer
367
+
368
+ The final answer: The final answer
369
+
370
+ The final answer
371
+
372
+ The final answer
373
+ The final answer
374
+ The final answer
375
+ The final answer
376
+
377
+ The final answer (a)
378
+
379
+ But the
380
+
381
+ The final answer
382
+
383
+ The final answer
384
+
385
+ The final answer
386
+
387
+ The final answer
388
+
389
+ The final answer: The final answer
390
+
391
+ The final answer
392
+
393
+ The final answer
394
+
395
+ The final answer
396
+
397
+ The final answer
398
+
399
+ The final answer
400
+
401
+ The final answer
402
+
403
+ The final answer
404
+
405
+ The final answer
406
+
407
+ The final answer
408
+
409
+ The final answer
410
+
411
+ ============================================================
412
+ 🧪 测试: Audio + Image (预期乱码)
413
+ 📝 问题: <image>
414
+ User's question in speech: <speech>
415
+
416
+ ============================================================
417
+ dynamic ViT batch size: 13
418
+ speech batch size: 2
419
+ <|im_start|>system
420
+ You are PLM-V, a helpful assistant.<|im_end|>
421
+ <|im_start|>user
422
+ <image>
423
+ User's question in speech: <speech>
424
+ <|im_end|>
425
+ <|im_start|>assistant
426
+ Bicytude: 连们在考虑了的时候 - 衋体在当前的是一场关于亚的类型、以通过的是一支集
427
+ 在使用后端使用相同的方式的人,可能不知道,或者,其是否在何时进入,而是一次使用相同错误地管理,可能不会发生,或可能是在之前,或者是不是还有其他相关的人,或者,或者,或是否,或,或呢,或是一个,或是一个或,或是
428
+
429
+ I don't understand the given content: 迭取到一张图片,该是关于一个文本文件,内容和类似于的功能,或者是一个关于,或者是一个的人,或若是关于一个关于学习和关于其他主题的上的行为,它是什么原因(或者,或者,或者,或?)还是关于其他主题呢,或在关于其他生物,比如,或者,或者,或者,或呢,或,我需要在使用了,或者,或者,或然呢? - 我需要找到文件中关于“如何改进一个文本文件,以便于更趋时,或是在其他主题,或者是一个关于如何管理一个复杂的数据文件,或是在一个应用数据文件,或者一个关于如何在浏览器的“如何利用Python的自动控制中的“或,或是一个关于一个关于如何使用标签,或是一个关于其他事情,比如,或者一个关于另一个关于如何使用的或是什么,或者,或呢?等,或者??或呢?
430
+ ✅ 推理成功!
431
+ 🤖 回复: Bicytude: 连们在考虑了的时候 - 衋体在当前的是一场关于亚的类型、以通过的是一支集
432
+ 在使用后端使用相同的方式的人,可能不知道,或者,其是否在何时进入,而是一次使用相同错误地管理,可能不会发生,或可能是在之前,或者是不是还有其他相关的人,或者,或者,或是否,或,或呢,或是一个,或是一个或,或是
433
+
434
+ I don't understand the given content: 迭取到一张图片,该是关于一个文本文件,内容和类似于的功能,或者是一个关于,或者是一个的人,或若是关于一个关于学习和关于其他主题的上的行为,它是什么原因(或者,或者,或者,或?)还是关于其他主题呢,或在关于其他生物,比如,或者,或者,或者,或呢,或,我需要在使用了,或者,或者,或然呢? - 我需要找到文件中关于“如何改进一个文本文件,以便于更趋时,或是在其他主题,或者是一个关于如何管理一个复杂的数据文件,或是在一个应用数据文件,或者一个关于如何在浏览器的“如何利用Python的自动控制中的“或,或是一个关于一个关于如何使用标签,或是一个关于其他事情,比如,或者一个关于另一个关于如何使用的或是什么,或者,或呢?等,或者??或呢?
435
+
436
+ ============================================================
437
+ 🧪 测试: Video + Text
438
+ 📝 问题: Frame1: <image>
439
+ Frame2: <image>
440
+ Frame3: <image>
441
+ Frame4: <image>
442
+ Frame5: <image>
443
+ Frame6: <image>
444
+ Frame7: <image>
445
+ Frame8: <image>
446
+ What is the red panda doing in this video? Please describe the actions and movements you observe.
447
+ ============================================================
448
+ dynamic ViT batch size: 8
449
+ <|im_start|>system
450
+ You are PLM-V, a helpful assistant.<|im_end|>
451
+ <|im_start|>user
452
+ Frame1: <image>
453
+ Frame2: <image>
454
+ Frame3: <image>
455
+ Frame4: <image>
456
+ Frame5: <image>
457
+ Frame6: <image>
458
+ Frame7: <image>
459
+ Frame8: <image>
460
+ What is the red panda doing in this video? Please describe the actions and movements you observe.<|im_end|>
461
+ <|im_start|>assistant
462
+ In this video, a red panda is climbing up a branch, perched on it while holding something in its mouth, and later sitting on the ground and reaching up towards bamboo sticks suspended from a tree. At one point, one of the red pandas chews on bamboo, and at another point, the blue creature is seen on the grassy ground, looking up towards the red panda. The scene then shows the red panda still perched on the branch, holding something in its mouth, and another red panda is perched on the ground, reaching up towards the bamboo on the tree. After a few moments, the panda on the ground finishes its activity and sits down on the grassy ground.
463
+ ✅ 推理成功!
464
+ 🤖 回复: In this video, a red panda is climbing up a branch, perched on it while holding something in its mouth, and later sitting on the ground and reaching up towards bamboo sticks suspended from a tree. At one point, one of the red pandas chews on bamboo, and at another point, the blue creature is seen on the grassy ground, looking up towards the red panda. The scene then shows the red panda still perched on the branch, holding something in its mouth, and another red panda is perched on the ground, reaching up towards the bamboo on the tree. After a few moments, the panda on the ground finishes its activity and sits down on the grassy ground.
465
+
466
+ ================================================================================
467
+ 📊 多模态推理测试总结
468
+ ================================================================================
469
+ ✅ PASS Pure Text (预期: PASS ) - 应该正常 (训练好的InternVL)
470
+ ✅ PASS Text & Image (预期: PASS ) - 应该正常 (训练好的InternVL)
471
+ ✅ PASS Video + Text (预期: PASS ) - 应该正常 (训练好的InternVL)
472
+ ✅ PASS Audio only (预期: GARBLED ) - 可能乱码 (speech未训练)
473
+ ✅ PASS Audio + Image (预期: GARBLED ) - 可能乱码 (speech未训练)
474
+
475
+ 📈 测试统计: 5/5 通过
476
+ 🎉 基础功能正常,Speech集成架构成功!
477
+ 💡 Speech相关测试如果输出乱码是正常的,因为speech部分还未训练
478
+ 🌟 所有基础模态测试都通过了!
479
+
480
+ === 多模态推理测试完成 ===
inference/log1.txt ADDED
@@ -0,0 +1,339 @@
1
+ [2025-09-15 09:26:52,568] [INFO] [real_accelerator.py:222:get_accelerator] Setting ds_accelerator to cuda (auto detect)
2
+ LOAD_VISION_EARLY is set
3
+ FORCE_NO_DOWNSAMPLE is set
4
+ VIDEO_RESIZE is set as 0x64, 0, 64
5
+ HIGHRES_BASE is set as 0x32, 0, 32
6
+ MAXRES is set as 1536
7
+ MINRES is set as 0
8
+ VIDEO_MAXRES is set as 480
9
+ VIDEO_MINRES is set as 288
10
+ PAD2STRIDE is set
11
+ LOWRES_RESIZE is set as 384x32
12
+ Loading OlaQwen3ForCausalLM model...
13
+ Loading BEATs Model
14
+ Missing keys: ['model.speech_encoder.whisper_model.positional_embedding', 'model.speech_encoder.whisper_model.conv1.weight', 'model.speech_encoder.whisper_model.conv1.bias', 'model.speech_encoder.whisper_model.conv2.weight', 'model.speech_encoder.whisper_model.conv2.bias', 'model.speech_encoder.whisper_model.blocks.0.attn.query.weight', 'model.speech_encoder.whisper_model.blocks.0.attn.query.bias', 'model.speech_encoder.whisper_model.blocks.0.attn.key.weight', 'model.speech_encoder.whisper_model.blocks.0.attn.value.weight', 'model.speech_encoder.whisper_model.blocks.0.attn.value.bias', 'model.speech_encoder.whisper_model.blocks.0.attn.out.weight', 'model.speech_encoder.whisper_model.blocks.0.attn.out.bias', 'model.speech_encoder.whisper_model.blocks.0.attn_ln.weight', 'model.speech_encoder.whisper_model.blocks.0.attn_ln.bias', 'model.speech_encoder.whisper_model.blocks.0.mlp.0.weight', 'model.speech_encoder.whisper_model.blocks.0.mlp.0.bias', 'model.speech_encoder.whisper_model.blocks.0.mlp.2.weight', 'model.speech_encoder.whisper_model.blocks.0.mlp.2.bias', 'model.speech_encoder.whisper_model.blocks.0.mlp_ln.weight', 'model.speech_encoder.whisper_model.blocks.0.mlp_ln.bias', 'model.speech_encoder.whisper_model.blocks.1.attn.query.weight', 'model.speech_encoder.whisper_model.blocks.1.attn.query.bias', 'model.speech_encoder.whisper_model.blocks.1.attn.key.weight', 'model.speech_encoder.whisper_model.blocks.1.attn.value.weight', 'model.speech_encoder.whisper_model.blocks.1.attn.value.bias', 'model.speech_encoder.whisper_model.blocks.1.attn.out.weight', 'model.speech_encoder.whisper_model.blocks.1.attn.out.bias', 'model.speech_encoder.whisper_model.blocks.1.attn_ln.weight', 'model.speech_encoder.whisper_model.blocks.1.attn_ln.bias', 'model.speech_encoder.whisper_model.blocks.1.mlp.0.weight', 'model.speech_encoder.whisper_model.blocks.1.mlp.0.bias', 'model.speech_encoder.whisper_model.blocks.1.mlp.2.weight', 'model.speech_encoder.whisper_model.blocks.1.mlp.2.bias', 'model.speech_encoder.whisper_model.blocks.1.mlp_ln.weight', 'model.speech_encoder.whisper_model.blocks.1.mlp_ln.bias', 'model.speech_encoder.whisper_model.blocks.2.attn.query.weight', 'model.speech_encoder.whisper_model.blocks.2.attn.query.bias', 'model.speech_encoder.whisper_model.blocks.2.attn.key.weight', 'model.speech_encoder.whisper_model.blocks.2.attn.value.weight', 'model.speech_encoder.whisper_model.blocks.2.attn.value.bias', 'model.speech_encoder.whisper_model.blocks.2.attn.out.weight', 'model.speech_encoder.whisper_model.blocks.2.attn.out.bias', 'model.speech_encoder.whisper_model.blocks.2.attn_ln.weight', 'model.speech_encoder.whisper_model.blocks.2.attn_ln.bias', 'model.speech_encoder.whisper_model.blocks.2.mlp.0.weight', 'model.speech_encoder.whisper_model.blocks.2.mlp.0.bias', 'model.speech_encoder.whisper_model.blocks.2.mlp.2.weight', 'model.speech_encoder.whisper_model.blocks.2.mlp.2.bias', 'model.speech_encoder.whisper_model.blocks.2.mlp_ln.weight', 'model.speech_encoder.whisper_model.blocks.2.mlp_ln.bias', 'model.speech_encoder.whisper_model.blocks.3.attn.query.weight', 'model.speech_encoder.whisper_model.blocks.3.attn.query.bias', 'model.speech_encoder.whisper_model.blocks.3.attn.key.weight', 'model.speech_encoder.whisper_model.blocks.3.attn.value.weight', 'model.speech_encoder.whisper_model.blocks.3.attn.value.bias', 'model.speech_encoder.whisper_model.blocks.3.attn.out.weight', 'model.speech_encoder.whisper_model.blocks.3.attn.out.bias', 'model.speech_encoder.whisper_model.blocks.3.attn_ln.weight', 
'model.speech_encoder.whisper_model.blocks.3.attn_ln.bias', 'model.speech_encoder.whisper_model.blocks.3.mlp.0.weight', 'model.speech_encoder.whisper_model.blocks.3.mlp.0.bias', 'model.speech_encoder.whisper_model.blocks.3.mlp.2.weight', 'model.speech_encoder.whisper_model.blocks.3.mlp.2.bias', 'model.speech_encoder.whisper_model.blocks.3.mlp_ln.weight', 'model.speech_encoder.whisper_model.blocks.3.mlp_ln.bias', 'model.speech_encoder.whisper_model.blocks.4.attn.query.weight', 'model.speech_encoder.whisper_model.blocks.4.attn.query.bias', 'model.speech_encoder.whisper_model.blocks.4.attn.key.weight', 'model.speech_encoder.whisper_model.blocks.4.attn.value.weight', 'model.speech_encoder.whisper_model.blocks.4.attn.value.bias', 'model.speech_encoder.whisper_model.blocks.4.attn.out.weight', 'model.speech_encoder.whisper_model.blocks.4.attn.out.bias', 'model.speech_encoder.whisper_model.blocks.4.attn_ln.weight', 'model.speech_encoder.whisper_model.blocks.4.attn_ln.bias', 'model.speech_encoder.whisper_model.blocks.4.mlp.0.weight', 'model.speech_encoder.whisper_model.blocks.4.mlp.0.bias', 'model.speech_encoder.whisper_model.blocks.4.mlp.2.weight', 'model.speech_encoder.whisper_model.blocks.4.mlp.2.bias', 'model.speech_encoder.whisper_model.blocks.4.mlp_ln.weight', 'model.speech_encoder.whisper_model.blocks.4.mlp_ln.bias', 'model.speech_encoder.whisper_model.blocks.5.attn.query.weight', 'model.speech_encoder.whisper_model.blocks.5.attn.query.bias', 'model.speech_encoder.whisper_model.blocks.5.attn.key.weight', 'model.speech_encoder.whisper_model.blocks.5.attn.value.weight', 'model.speech_encoder.whisper_model.blocks.5.attn.value.bias', 'model.speech_encoder.whisper_model.blocks.5.attn.out.weight', 'model.speech_encoder.whisper_model.blocks.5.attn.out.bias', 'model.speech_encoder.whisper_model.blocks.5.attn_ln.weight', 'model.speech_encoder.whisper_model.blocks.5.attn_ln.bias', 'model.speech_encoder.whisper_model.blocks.5.mlp.0.weight', 'model.speech_encoder.whisper_model.blocks.5.mlp.0.bias', 'model.speech_encoder.whisper_model.blocks.5.mlp.2.weight', 'model.speech_encoder.whisper_model.blocks.5.mlp.2.bias', 'model.speech_encoder.whisper_model.blocks.5.mlp_ln.weight', 'model.speech_encoder.whisper_model.blocks.5.mlp_ln.bias', 'model.speech_encoder.whisper_model.blocks.6.attn.query.weight', 'model.speech_encoder.whisper_model.blocks.6.attn.query.bias', 'model.speech_encoder.whisper_model.blocks.6.attn.key.weight', 'model.speech_encoder.whisper_model.blocks.6.attn.value.weight', 'model.speech_encoder.whisper_model.blocks.6.attn.value.bias', 'model.speech_encoder.whisper_model.blocks.6.attn.out.weight', 'model.speech_encoder.whisper_model.blocks.6.attn.out.bias', 'model.speech_encoder.whisper_model.blocks.6.attn_ln.weight', 'model.speech_encoder.whisper_model.blocks.6.attn_ln.bias', 'model.speech_encoder.whisper_model.blocks.6.mlp.0.weight', 'model.speech_encoder.whisper_model.blocks.6.mlp.0.bias', 'model.speech_encoder.whisper_model.blocks.6.mlp.2.weight', 'model.speech_encoder.whisper_model.blocks.6.mlp.2.bias', 'model.speech_encoder.whisper_model.blocks.6.mlp_ln.weight', 'model.speech_encoder.whisper_model.blocks.6.mlp_ln.bias', 'model.speech_encoder.whisper_model.blocks.7.attn.query.weight', 'model.speech_encoder.whisper_model.blocks.7.attn.query.bias', 'model.speech_encoder.whisper_model.blocks.7.attn.key.weight', 'model.speech_encoder.whisper_model.blocks.7.attn.value.weight', 'model.speech_encoder.whisper_model.blocks.7.attn.value.bias', 
'model.speech_encoder.whisper_model.blocks.7.attn.out.weight', 'model.speech_encoder.whisper_model.blocks.7.attn.out.bias', 'model.speech_encoder.whisper_model.blocks.7.attn_ln.weight', 'model.speech_encoder.whisper_model.blocks.7.attn_ln.bias', 'model.speech_encoder.whisper_model.blocks.7.mlp.0.weight', 'model.speech_encoder.whisper_model.blocks.7.mlp.0.bias', 'model.speech_encoder.whisper_model.blocks.7.mlp.2.weight', 'model.speech_encoder.whisper_model.blocks.7.mlp.2.bias', 'model.speech_encoder.whisper_model.blocks.7.mlp_ln.weight', 'model.speech_encoder.whisper_model.blocks.7.mlp_ln.bias', 'model.speech_encoder.whisper_model.blocks.8.attn.query.weight', 'model.speech_encoder.whisper_model.blocks.8.attn.query.bias', 'model.speech_encoder.whisper_model.blocks.8.attn.key.weight', 'model.speech_encoder.whisper_model.blocks.8.attn.value.weight', 'model.speech_encoder.whisper_model.blocks.8.attn.value.bias', 'model.speech_encoder.whisper_model.blocks.8.attn.out.weight', 'model.speech_encoder.whisper_model.blocks.8.attn.out.bias', 'model.speech_encoder.whisper_model.blocks.8.attn_ln.weight', 'model.speech_encoder.whisper_model.blocks.8.attn_ln.bias', 'model.speech_encoder.whisper_model.blocks.8.mlp.0.weight', 'model.speech_encoder.whisper_model.blocks.8.mlp.0.bias', 'model.speech_encoder.whisper_model.blocks.8.mlp.2.weight', 'model.speech_encoder.whisper_model.blocks.8.mlp.2.bias', 'model.speech_encoder.whisper_model.blocks.8.mlp_ln.weight', 'model.speech_encoder.whisper_model.blocks.8.mlp_ln.bias', 'model.speech_encoder.whisper_model.blocks.9.attn.query.weight', 'model.speech_encoder.whisper_model.blocks.9.attn.query.bias', 'model.speech_encoder.whisper_model.blocks.9.attn.key.weight', 'model.speech_encoder.whisper_model.blocks.9.attn.value.weight', 'model.speech_encoder.whisper_model.blocks.9.attn.value.bias', 'model.speech_encoder.whisper_model.blocks.9.attn.out.weight', 'model.speech_encoder.whisper_model.blocks.9.attn.out.bias', 'model.speech_encoder.whisper_model.blocks.9.attn_ln.weight', 'model.speech_encoder.whisper_model.blocks.9.attn_ln.bias', 'model.speech_encoder.whisper_model.blocks.9.mlp.0.weight', 'model.speech_encoder.whisper_model.blocks.9.mlp.0.bias', 'model.speech_encoder.whisper_model.blocks.9.mlp.2.weight', 'model.speech_encoder.whisper_model.blocks.9.mlp.2.bias', 'model.speech_encoder.whisper_model.blocks.9.mlp_ln.weight', 'model.speech_encoder.whisper_model.blocks.9.mlp_ln.bias', 'model.speech_encoder.whisper_model.blocks.10.attn.query.weight', 'model.speech_encoder.whisper_model.blocks.10.attn.query.bias', 'model.speech_encoder.whisper_model.blocks.10.attn.key.weight', 'model.speech_encoder.whisper_model.blocks.10.attn.value.weight', 'model.speech_encoder.whisper_model.blocks.10.attn.value.bias', 'model.speech_encoder.whisper_model.blocks.10.attn.out.weight', 'model.speech_encoder.whisper_model.blocks.10.attn.out.bias', 'model.speech_encoder.whisper_model.blocks.10.attn_ln.weight', 'model.speech_encoder.whisper_model.blocks.10.attn_ln.bias', 'model.speech_encoder.whisper_model.blocks.10.mlp.0.weight', 'model.speech_encoder.whisper_model.blocks.10.mlp.0.bias', 'model.speech_encoder.whisper_model.blocks.10.mlp.2.weight', 'model.speech_encoder.whisper_model.blocks.10.mlp.2.bias', 'model.speech_encoder.whisper_model.blocks.10.mlp_ln.weight', 'model.speech_encoder.whisper_model.blocks.10.mlp_ln.bias', 'model.speech_encoder.whisper_model.blocks.11.attn.query.weight', 'model.speech_encoder.whisper_model.blocks.11.attn.query.bias', 
'model.speech_encoder.whisper_model.blocks.11.attn.key.weight', 'model.speech_encoder.whisper_model.blocks.11.attn.value.weight', 'model.speech_encoder.whisper_model.blocks.11.attn.value.bias', 'model.speech_encoder.whisper_model.blocks.11.attn.out.weight', 'model.speech_encoder.whisper_model.blocks.11.attn.out.bias', 'model.speech_encoder.whisper_model.blocks.11.attn_ln.weight', 'model.speech_encoder.whisper_model.blocks.11.attn_ln.bias', 'model.speech_encoder.whisper_model.blocks.11.mlp.0.weight', 'model.speech_encoder.whisper_model.blocks.11.mlp.0.bias', 'model.speech_encoder.whisper_model.blocks.11.mlp.2.weight', 'model.speech_encoder.whisper_model.blocks.11.mlp.2.bias', 'model.speech_encoder.whisper_model.blocks.11.mlp_ln.weight', 'model.speech_encoder.whisper_model.blocks.11.mlp_ln.bias', 'model.speech_encoder.whisper_model.blocks.12.attn.query.weight', 'model.speech_encoder.whisper_model.blocks.12.attn.query.bias', 'model.speech_encoder.whisper_model.blocks.12.attn.key.weight', 'model.speech_encoder.whisper_model.blocks.12.attn.value.weight', 'model.speech_encoder.whisper_model.blocks.12.attn.value.bias', 'model.speech_encoder.whisper_model.blocks.12.attn.out.weight', 'model.speech_encoder.whisper_model.blocks.12.attn.out.bias', 'model.speech_encoder.whisper_model.blocks.12.attn_ln.weight', 'model.speech_encoder.whisper_model.blocks.12.attn_ln.bias', 'model.speech_encoder.whisper_model.blocks.12.mlp.0.weight', 'model.speech_encoder.whisper_model.blocks.12.mlp.0.bias', 'model.speech_encoder.whisper_model.blocks.12.mlp.2.weight', 'model.speech_encoder.whisper_model.blocks.12.mlp.2.bias', 'model.speech_encoder.whisper_model.blocks.12.mlp_ln.weight', 'model.speech_encoder.whisper_model.blocks.12.mlp_ln.bias', 'model.speech_encoder.whisper_model.blocks.13.attn.query.weight', 'model.speech_encoder.whisper_model.blocks.13.attn.query.bias', 'model.speech_encoder.whisper_model.blocks.13.attn.key.weight', 'model.speech_encoder.whisper_model.blocks.13.attn.value.weight', 'model.speech_encoder.whisper_model.blocks.13.attn.value.bias', 'model.speech_encoder.whisper_model.blocks.13.attn.out.weight', 'model.speech_encoder.whisper_model.blocks.13.attn.out.bias', 'model.speech_encoder.whisper_model.blocks.13.attn_ln.weight', 'model.speech_encoder.whisper_model.blocks.13.attn_ln.bias', 'model.speech_encoder.whisper_model.blocks.13.mlp.0.weight', 'model.speech_encoder.whisper_model.blocks.13.mlp.0.bias', 'model.speech_encoder.whisper_model.blocks.13.mlp.2.weight', 'model.speech_encoder.whisper_model.blocks.13.mlp.2.bias', 'model.speech_encoder.whisper_model.blocks.13.mlp_ln.weight', 'model.speech_encoder.whisper_model.blocks.13.mlp_ln.bias', 'model.speech_encoder.whisper_model.blocks.14.attn.query.weight', 'model.speech_encoder.whisper_model.blocks.14.attn.query.bias', 'model.speech_encoder.whisper_model.blocks.14.attn.key.weight', 'model.speech_encoder.whisper_model.blocks.14.attn.value.weight', 'model.speech_encoder.whisper_model.blocks.14.attn.value.bias', 'model.speech_encoder.whisper_model.blocks.14.attn.out.weight', 'model.speech_encoder.whisper_model.blocks.14.attn.out.bias', 'model.speech_encoder.whisper_model.blocks.14.attn_ln.weight', 'model.speech_encoder.whisper_model.blocks.14.attn_ln.bias', 'model.speech_encoder.whisper_model.blocks.14.mlp.0.weight', 'model.speech_encoder.whisper_model.blocks.14.mlp.0.bias', 'model.speech_encoder.whisper_model.blocks.14.mlp.2.weight', 'model.speech_encoder.whisper_model.blocks.14.mlp.2.bias', 'model.speech_encoder.whisper_model.blocks.14.mlp_ln.weight', 
'model.speech_encoder.whisper_model.blocks.14.mlp_ln.bias', 'model.speech_encoder.whisper_model.blocks.15.attn.query.weight', 'model.speech_encoder.whisper_model.blocks.15.attn.query.bias', 'model.speech_encoder.whisper_model.blocks.15.attn.key.weight', 'model.speech_encoder.whisper_model.blocks.15.attn.value.weight', 'model.speech_encoder.whisper_model.blocks.15.attn.value.bias', 'model.speech_encoder.whisper_model.blocks.15.attn.out.weight', 'model.speech_encoder.whisper_model.blocks.15.attn.out.bias', 'model.speech_encoder.whisper_model.blocks.15.attn_ln.weight', 'model.speech_encoder.whisper_model.blocks.15.attn_ln.bias', 'model.speech_encoder.whisper_model.blocks.15.mlp.0.weight', 'model.speech_encoder.whisper_model.blocks.15.mlp.0.bias', 'model.speech_encoder.whisper_model.blocks.15.mlp.2.weight', 'model.speech_encoder.whisper_model.blocks.15.mlp.2.bias', 'model.speech_encoder.whisper_model.blocks.15.mlp_ln.weight', 'model.speech_encoder.whisper_model.blocks.15.mlp_ln.bias', 'model.speech_encoder.whisper_model.blocks.16.attn.query.weight', 'model.speech_encoder.whisper_model.blocks.16.attn.query.bias', 'model.speech_encoder.whisper_model.blocks.16.attn.key.weight', 'model.speech_encoder.whisper_model.blocks.16.attn.value.weight', 'model.speech_encoder.whisper_model.blocks.16.attn.value.bias', 'model.speech_encoder.whisper_model.blocks.16.attn.out.weight', 'model.speech_encoder.whisper_model.blocks.16.attn.out.bias', 'model.speech_encoder.whisper_model.blocks.16.attn_ln.weight', 'model.speech_encoder.whisper_model.blocks.16.attn_ln.bias', 'model.speech_encoder.whisper_model.blocks.16.mlp.0.weight', 'model.speech_encoder.whisper_model.blocks.16.mlp.0.bias', 'model.speech_encoder.whisper_model.blocks.16.mlp.2.weight', 'model.speech_encoder.whisper_model.blocks.16.mlp.2.bias', 'model.speech_encoder.whisper_model.blocks.16.mlp_ln.weight', 'model.speech_encoder.whisper_model.blocks.16.mlp_ln.bias', 'model.speech_encoder.whisper_model.blocks.17.attn.query.weight', 'model.speech_encoder.whisper_model.blocks.17.attn.query.bias', 'model.speech_encoder.whisper_model.blocks.17.attn.key.weight', 'model.speech_encoder.whisper_model.blocks.17.attn.value.weight', 'model.speech_encoder.whisper_model.blocks.17.attn.value.bias', 'model.speech_encoder.whisper_model.blocks.17.attn.out.weight', 'model.speech_encoder.whisper_model.blocks.17.attn.out.bias', 'model.speech_encoder.whisper_model.blocks.17.attn_ln.weight', 'model.speech_encoder.whisper_model.blocks.17.attn_ln.bias', 'model.speech_encoder.whisper_model.blocks.17.mlp.0.weight', 'model.speech_encoder.whisper_model.blocks.17.mlp.0.bias', 'model.speech_encoder.whisper_model.blocks.17.mlp.2.weight', 'model.speech_encoder.whisper_model.blocks.17.mlp.2.bias', 'model.speech_encoder.whisper_model.blocks.17.mlp_ln.weight', 'model.speech_encoder.whisper_model.blocks.17.mlp_ln.bias', 'model.speech_encoder.whisper_model.blocks.18.attn.query.weight', 'model.speech_encoder.whisper_model.blocks.18.attn.query.bias', 'model.speech_encoder.whisper_model.blocks.18.attn.key.weight', 'model.speech_encoder.whisper_model.blocks.18.attn.value.weight', 'model.speech_encoder.whisper_model.blocks.18.attn.value.bias', 'model.speech_encoder.whisper_model.blocks.18.attn.out.weight', 'model.speech_encoder.whisper_model.blocks.18.attn.out.bias', 'model.speech_encoder.whisper_model.blocks.18.attn_ln.weight', 'model.speech_encoder.whisper_model.blocks.18.attn_ln.bias', 'model.speech_encoder.whisper_model.blocks.18.mlp.0.weight', 
'model.speech_encoder.whisper_model.blocks.18.mlp.0.bias', 'model.speech_encoder.whisper_model.blocks.18.mlp.2.weight', 'model.speech_encoder.whisper_model.blocks.18.mlp.2.bias', 'model.speech_encoder.whisper_model.blocks.18.mlp_ln.weight', 'model.speech_encoder.whisper_model.blocks.18.mlp_ln.bias', 'model.speech_encoder.whisper_model.blocks.19.attn.query.weight', 'model.speech_encoder.whisper_model.blocks.19.attn.query.bias', 'model.speech_encoder.whisper_model.blocks.19.attn.key.weight', 'model.speech_encoder.whisper_model.blocks.19.attn.value.weight', 'model.speech_encoder.whisper_model.blocks.19.attn.value.bias', 'model.speech_encoder.whisper_model.blocks.19.attn.out.weight', 'model.speech_encoder.whisper_model.blocks.19.attn.out.bias', 'model.speech_encoder.whisper_model.blocks.19.attn_ln.weight', 'model.speech_encoder.whisper_model.blocks.19.attn_ln.bias', 'model.speech_encoder.whisper_model.blocks.19.mlp.0.weight', 'model.speech_encoder.whisper_model.blocks.19.mlp.0.bias', 'model.speech_encoder.whisper_model.blocks.19.mlp.2.weight', 'model.speech_encoder.whisper_model.blocks.19.mlp.2.bias', 'model.speech_encoder.whisper_model.blocks.19.mlp_ln.weight', 'model.speech_encoder.whisper_model.blocks.19.mlp_ln.bias', 'model.speech_encoder.whisper_model.blocks.20.attn.query.weight', 'model.speech_encoder.whisper_model.blocks.20.attn.query.bias', 'model.speech_encoder.whisper_model.blocks.20.attn.key.weight', 'model.speech_encoder.whisper_model.blocks.20.attn.value.weight', 'model.speech_encoder.whisper_model.blocks.20.attn.value.bias', 'model.speech_encoder.whisper_model.blocks.20.attn.out.weight', 'model.speech_encoder.whisper_model.blocks.20.attn.out.bias', 'model.speech_encoder.whisper_model.blocks.20.attn_ln.weight', 'model.speech_encoder.whisper_model.blocks.20.attn_ln.bias', 'model.speech_encoder.whisper_model.blocks.20.mlp.0.weight', 'model.speech_encoder.whisper_model.blocks.20.mlp.0.bias', 'model.speech_encoder.whisper_model.blocks.20.mlp.2.weight', 'model.speech_encoder.whisper_model.blocks.20.mlp.2.bias', 'model.speech_encoder.whisper_model.blocks.20.mlp_ln.weight', 'model.speech_encoder.whisper_model.blocks.20.mlp_ln.bias', 'model.speech_encoder.whisper_model.blocks.21.attn.query.weight', 'model.speech_encoder.whisper_model.blocks.21.attn.query.bias', 'model.speech_encoder.whisper_model.blocks.21.attn.key.weight', 'model.speech_encoder.whisper_model.blocks.21.attn.value.weight', 'model.speech_encoder.whisper_model.blocks.21.attn.value.bias', 'model.speech_encoder.whisper_model.blocks.21.attn.out.weight', 'model.speech_encoder.whisper_model.blocks.21.attn.out.bias', 'model.speech_encoder.whisper_model.blocks.21.attn_ln.weight', 'model.speech_encoder.whisper_model.blocks.21.attn_ln.bias', 'model.speech_encoder.whisper_model.blocks.21.mlp.0.weight', 'model.speech_encoder.whisper_model.blocks.21.mlp.0.bias', 'model.speech_encoder.whisper_model.blocks.21.mlp.2.weight', 'model.speech_encoder.whisper_model.blocks.21.mlp.2.bias', 'model.speech_encoder.whisper_model.blocks.21.mlp_ln.weight', 'model.speech_encoder.whisper_model.blocks.21.mlp_ln.bias', 'model.speech_encoder.whisper_model.blocks.22.attn.query.weight', 'model.speech_encoder.whisper_model.blocks.22.attn.query.bias', 'model.speech_encoder.whisper_model.blocks.22.attn.key.weight', 'model.speech_encoder.whisper_model.blocks.22.attn.value.weight', 'model.speech_encoder.whisper_model.blocks.22.attn.value.bias', 'model.speech_encoder.whisper_model.blocks.22.attn.out.weight', 
'model.speech_encoder.whisper_model.blocks.22.attn.out.bias', 'model.speech_encoder.whisper_model.blocks.22.attn_ln.weight', 'model.speech_encoder.whisper_model.blocks.22.attn_ln.bias', 'model.speech_encoder.whisper_model.blocks.22.mlp.0.weight', 'model.speech_encoder.whisper_model.blocks.22.mlp.0.bias', 'model.speech_encoder.whisper_model.blocks.22.mlp.2.weight', 'model.speech_encoder.whisper_model.blocks.22.mlp.2.bias', 'model.speech_encoder.whisper_model.blocks.22.mlp_ln.weight', 'model.speech_encoder.whisper_model.blocks.22.mlp_ln.bias', 'model.speech_encoder.whisper_model.blocks.23.attn.query.weight', 'model.speech_encoder.whisper_model.blocks.23.attn.query.bias', 'model.speech_encoder.whisper_model.blocks.23.attn.key.weight', 'model.speech_encoder.whisper_model.blocks.23.attn.value.weight', 'model.speech_encoder.whisper_model.blocks.23.attn.value.bias', 'model.speech_encoder.whisper_model.blocks.23.attn.out.weight', 'model.speech_encoder.whisper_model.blocks.23.attn.out.bias', 'model.speech_encoder.whisper_model.blocks.23.attn_ln.weight', 'model.speech_encoder.whisper_model.blocks.23.attn_ln.bias', 'model.speech_encoder.whisper_model.blocks.23.mlp.0.weight', 'model.speech_encoder.whisper_model.blocks.23.mlp.0.bias', 'model.speech_encoder.whisper_model.blocks.23.mlp.2.weight', 'model.speech_encoder.whisper_model.blocks.23.mlp.2.bias', 'model.speech_encoder.whisper_model.blocks.23.mlp_ln.weight', 'model.speech_encoder.whisper_model.blocks.23.mlp_ln.bias', 'model.speech_encoder.whisper_model.blocks.24.attn.query.weight', 'model.speech_encoder.whisper_model.blocks.24.attn.query.bias', 'model.speech_encoder.whisper_model.blocks.24.attn.key.weight', 'model.speech_encoder.whisper_model.blocks.24.attn.value.weight', 'model.speech_encoder.whisper_model.blocks.24.attn.value.bias', 'model.speech_encoder.whisper_model.blocks.24.attn.out.weight', 'model.speech_encoder.whisper_model.blocks.24.attn.out.bias', 'model.speech_encoder.whisper_model.blocks.24.attn_ln.weight', 'model.speech_encoder.whisper_model.blocks.24.attn_ln.bias', 'model.speech_encoder.whisper_model.blocks.24.mlp.0.weight', 'model.speech_encoder.whisper_model.blocks.24.mlp.0.bias', 'model.speech_encoder.whisper_model.blocks.24.mlp.2.weight', 'model.speech_encoder.whisper_model.blocks.24.mlp.2.bias', 'model.speech_encoder.whisper_model.blocks.24.mlp_ln.weight', 'model.speech_encoder.whisper_model.blocks.24.mlp_ln.bias', 'model.speech_encoder.whisper_model.blocks.25.attn.query.weight', 'model.speech_encoder.whisper_model.blocks.25.attn.query.bias', 'model.speech_encoder.whisper_model.blocks.25.attn.key.weight', 'model.speech_encoder.whisper_model.blocks.25.attn.value.weight', 'model.speech_encoder.whisper_model.blocks.25.attn.value.bias', 'model.speech_encoder.whisper_model.blocks.25.attn.out.weight', 'model.speech_encoder.whisper_model.blocks.25.attn.out.bias', 'model.speech_encoder.whisper_model.blocks.25.attn_ln.weight', 'model.speech_encoder.whisper_model.blocks.25.attn_ln.bias', 'model.speech_encoder.whisper_model.blocks.25.mlp.0.weight', 'model.speech_encoder.whisper_model.blocks.25.mlp.0.bias', 'model.speech_encoder.whisper_model.blocks.25.mlp.2.weight', 'model.speech_encoder.whisper_model.blocks.25.mlp.2.bias', 'model.speech_encoder.whisper_model.blocks.25.mlp_ln.weight', 'model.speech_encoder.whisper_model.blocks.25.mlp_ln.bias', 'model.speech_encoder.whisper_model.blocks.26.attn.query.weight', 'model.speech_encoder.whisper_model.blocks.26.attn.query.bias', 'model.speech_encoder.whisper_model.blocks.26.attn.key.weight', 
'model.speech_encoder.whisper_model.blocks.26.attn.value.weight', 'model.speech_encoder.whisper_model.blocks.26.attn.value.bias', 'model.speech_encoder.whisper_model.blocks.26.attn.out.weight', 'model.speech_encoder.whisper_model.blocks.26.attn.out.bias', 'model.speech_encoder.whisper_model.blocks.26.attn_ln.weight', 'model.speech_encoder.whisper_model.blocks.26.attn_ln.bias', 'model.speech_encoder.whisper_model.blocks.26.mlp.0.weight', 'model.speech_encoder.whisper_model.blocks.26.mlp.0.bias', 'model.speech_encoder.whisper_model.blocks.26.mlp.2.weight', 'model.speech_encoder.whisper_model.blocks.26.mlp.2.bias', 'model.speech_encoder.whisper_model.blocks.26.mlp_ln.weight', 'model.speech_encoder.whisper_model.blocks.26.mlp_ln.bias', 'model.speech_encoder.whisper_model.blocks.27.attn.query.weight', 'model.speech_encoder.whisper_model.blocks.27.attn.query.bias', 'model.speech_encoder.whisper_model.blocks.27.attn.key.weight', 'model.speech_encoder.whisper_model.blocks.27.attn.value.weight', 'model.speech_encoder.whisper_model.blocks.27.attn.value.bias', 'model.speech_encoder.whisper_model.blocks.27.attn.out.weight', 'model.speech_encoder.whisper_model.blocks.27.attn.out.bias', 'model.speech_encoder.whisper_model.blocks.27.attn_ln.weight', 'model.speech_encoder.whisper_model.blocks.27.attn_ln.bias', 'model.speech_encoder.whisper_model.blocks.27.mlp.0.weight', 'model.speech_encoder.whisper_model.blocks.27.mlp.0.bias', 'model.speech_encoder.whisper_model.blocks.27.mlp.2.weight', 'model.speech_encoder.whisper_model.blocks.27.mlp.2.bias', 'model.speech_encoder.whisper_model.blocks.27.mlp_ln.weight', 'model.speech_encoder.whisper_model.blocks.27.mlp_ln.bias', 'model.speech_encoder.whisper_model.blocks.28.attn.query.weight', 'model.speech_encoder.whisper_model.blocks.28.attn.query.bias', 'model.speech_encoder.whisper_model.blocks.28.attn.key.weight', 'model.speech_encoder.whisper_model.blocks.28.attn.value.weight', 'model.speech_encoder.whisper_model.blocks.28.attn.value.bias', 'model.speech_encoder.whisper_model.blocks.28.attn.out.weight', 'model.speech_encoder.whisper_model.blocks.28.attn.out.bias', 'model.speech_encoder.whisper_model.blocks.28.attn_ln.weight', 'model.speech_encoder.whisper_model.blocks.28.attn_ln.bias', 'model.speech_encoder.whisper_model.blocks.28.mlp.0.weight', 'model.speech_encoder.whisper_model.blocks.28.mlp.0.bias', 'model.speech_encoder.whisper_model.blocks.28.mlp.2.weight', 'model.speech_encoder.whisper_model.blocks.28.mlp.2.bias', 'model.speech_encoder.whisper_model.blocks.28.mlp_ln.weight', 'model.speech_encoder.whisper_model.blocks.28.mlp_ln.bias', 'model.speech_encoder.whisper_model.blocks.29.attn.query.weight', 'model.speech_encoder.whisper_model.blocks.29.attn.query.bias', 'model.speech_encoder.whisper_model.blocks.29.attn.key.weight', 'model.speech_encoder.whisper_model.blocks.29.attn.value.weight', 'model.speech_encoder.whisper_model.blocks.29.attn.value.bias', 'model.speech_encoder.whisper_model.blocks.29.attn.out.weight', 'model.speech_encoder.whisper_model.blocks.29.attn.out.bias', 'model.speech_encoder.whisper_model.blocks.29.attn_ln.weight', 'model.speech_encoder.whisper_model.blocks.29.attn_ln.bias', 'model.speech_encoder.whisper_model.blocks.29.mlp.0.weight', 'model.speech_encoder.whisper_model.blocks.29.mlp.0.bias', 'model.speech_encoder.whisper_model.blocks.29.mlp.2.weight', 'model.speech_encoder.whisper_model.blocks.29.mlp.2.bias', 'model.speech_encoder.whisper_model.blocks.29.mlp_ln.weight', 'model.speech_encoder.whisper_model.blocks.29.mlp_ln.bias', 
'model.speech_encoder.whisper_model.blocks.30.attn.query.weight', 'model.speech_encoder.whisper_model.blocks.30.attn.query.bias', 'model.speech_encoder.whisper_model.blocks.30.attn.key.weight', 'model.speech_encoder.whisper_model.blocks.30.attn.value.weight', 'model.speech_encoder.whisper_model.blocks.30.attn.value.bias', 'model.speech_encoder.whisper_model.blocks.30.attn.out.weight', 'model.speech_encoder.whisper_model.blocks.30.attn.out.bias', 'model.speech_encoder.whisper_model.blocks.30.attn_ln.weight', 'model.speech_encoder.whisper_model.blocks.30.attn_ln.bias', 'model.speech_encoder.whisper_model.blocks.30.mlp.0.weight', 'model.speech_encoder.whisper_model.blocks.30.mlp.0.bias', 'model.speech_encoder.whisper_model.blocks.30.mlp.2.weight', 'model.speech_encoder.whisper_model.blocks.30.mlp.2.bias', 'model.speech_encoder.whisper_model.blocks.30.mlp_ln.weight', 'model.speech_encoder.whisper_model.blocks.30.mlp_ln.bias', 'model.speech_encoder.whisper_model.blocks.31.attn.query.weight', 'model.speech_encoder.whisper_model.blocks.31.attn.query.bias', 'model.speech_encoder.whisper_model.blocks.31.attn.key.weight', 'model.speech_encoder.whisper_model.blocks.31.attn.value.weight', 'model.speech_encoder.whisper_model.blocks.31.attn.value.bias', 'model.speech_encoder.whisper_model.blocks.31.attn.out.weight', 'model.speech_encoder.whisper_model.blocks.31.attn.out.bias', 'model.speech_encoder.whisper_model.blocks.31.attn_ln.weight', 'model.speech_encoder.whisper_model.blocks.31.attn_ln.bias', 'model.speech_encoder.whisper_model.blocks.31.mlp.0.weight', 'model.speech_encoder.whisper_model.blocks.31.mlp.0.bias', 'model.speech_encoder.whisper_model.blocks.31.mlp.2.weight', 'model.speech_encoder.whisper_model.blocks.31.mlp.2.bias', 'model.speech_encoder.whisper_model.blocks.31.mlp_ln.weight', 'model.speech_encoder.whisper_model.blocks.31.mlp_ln.bias', 'model.speech_encoder.whisper_model.ln_post.weight', 'model.speech_encoder.whisper_model.ln_post.bias', 'model.speech_encoder.beats_model.post_extract_proj.weight', 'model.speech_encoder.beats_model.post_extract_proj.bias', 'model.speech_encoder.beats_model.patch_embedding.weight', 'model.speech_encoder.beats_model.encoder.pos_conv.0.bias', 'model.speech_encoder.beats_model.encoder.pos_conv.0.weight_g', 'model.speech_encoder.beats_model.encoder.pos_conv.0.weight_v', 'model.speech_encoder.beats_model.encoder.layers.0.self_attn.grep_a', 'model.speech_encoder.beats_model.encoder.layers.0.self_attn.relative_attention_bias.weight', 'model.speech_encoder.beats_model.encoder.layers.0.self_attn.k_proj.weight', 'model.speech_encoder.beats_model.encoder.layers.0.self_attn.k_proj.bias', 'model.speech_encoder.beats_model.encoder.layers.0.self_attn.v_proj.weight', 'model.speech_encoder.beats_model.encoder.layers.0.self_attn.v_proj.bias', 'model.speech_encoder.beats_model.encoder.layers.0.self_attn.q_proj.weight', 'model.speech_encoder.beats_model.encoder.layers.0.self_attn.q_proj.bias', 'model.speech_encoder.beats_model.encoder.layers.0.self_attn.out_proj.weight', 'model.speech_encoder.beats_model.encoder.layers.0.self_attn.out_proj.bias', 'model.speech_encoder.beats_model.encoder.layers.0.self_attn.grep_linear.weight', 'model.speech_encoder.beats_model.encoder.layers.0.self_attn.grep_linear.bias', 'model.speech_encoder.beats_model.encoder.layers.0.self_attn_layer_norm.weight', 'model.speech_encoder.beats_model.encoder.layers.0.self_attn_layer_norm.bias', 'model.speech_encoder.beats_model.encoder.layers.0.fc1.weight', 
'model.speech_encoder.beats_model.encoder.layers.0.fc1.bias', 'model.speech_encoder.beats_model.encoder.layers.0.fc2.weight', 'model.speech_encoder.beats_model.encoder.layers.0.fc2.bias', 'model.speech_encoder.beats_model.encoder.layers.0.final_layer_norm.weight', 'model.speech_encoder.beats_model.encoder.layers.0.final_layer_norm.bias', 'model.speech_encoder.beats_model.encoder.layers.1.self_attn.grep_a', 'model.speech_encoder.beats_model.encoder.layers.1.self_attn.k_proj.weight', 'model.speech_encoder.beats_model.encoder.layers.1.self_attn.k_proj.bias', 'model.speech_encoder.beats_model.encoder.layers.1.self_attn.v_proj.weight', 'model.speech_encoder.beats_model.encoder.layers.1.self_attn.v_proj.bias', 'model.speech_encoder.beats_model.encoder.layers.1.self_attn.q_proj.weight', 'model.speech_encoder.beats_model.encoder.layers.1.self_attn.q_proj.bias', 'model.speech_encoder.beats_model.encoder.layers.1.self_attn.out_proj.weight', 'model.speech_encoder.beats_model.encoder.layers.1.self_attn.out_proj.bias', 'model.speech_encoder.beats_model.encoder.layers.1.self_attn.grep_linear.weight', 'model.speech_encoder.beats_model.encoder.layers.1.self_attn.grep_linear.bias', 'model.speech_encoder.beats_model.encoder.layers.1.self_attn.relative_attention_bias.weight', 'model.speech_encoder.beats_model.encoder.layers.1.self_attn_layer_norm.weight', 'model.speech_encoder.beats_model.encoder.layers.1.self_attn_layer_norm.bias', 'model.speech_encoder.beats_model.encoder.layers.1.fc1.weight', 'model.speech_encoder.beats_model.encoder.layers.1.fc1.bias', 'model.speech_encoder.beats_model.encoder.layers.1.fc2.weight', 'model.speech_encoder.beats_model.encoder.layers.1.fc2.bias', 'model.speech_encoder.beats_model.encoder.layers.1.final_layer_norm.weight', 'model.speech_encoder.beats_model.encoder.layers.1.final_layer_norm.bias', 'model.speech_encoder.beats_model.encoder.layers.2.self_attn.grep_a', 'model.speech_encoder.beats_model.encoder.layers.2.self_attn.k_proj.weight', 'model.speech_encoder.beats_model.encoder.layers.2.self_attn.k_proj.bias', 'model.speech_encoder.beats_model.encoder.layers.2.self_attn.v_proj.weight', 'model.speech_encoder.beats_model.encoder.layers.2.self_attn.v_proj.bias', 'model.speech_encoder.beats_model.encoder.layers.2.self_attn.q_proj.weight', 'model.speech_encoder.beats_model.encoder.layers.2.self_attn.q_proj.bias', 'model.speech_encoder.beats_model.encoder.layers.2.self_attn.out_proj.weight', 'model.speech_encoder.beats_model.encoder.layers.2.self_attn.out_proj.bias', 'model.speech_encoder.beats_model.encoder.layers.2.self_attn.grep_linear.weight', 'model.speech_encoder.beats_model.encoder.layers.2.self_attn.grep_linear.bias', 'model.speech_encoder.beats_model.encoder.layers.2.self_attn.relative_attention_bias.weight', 'model.speech_encoder.beats_model.encoder.layers.2.self_attn_layer_norm.weight', 'model.speech_encoder.beats_model.encoder.layers.2.self_attn_layer_norm.bias', 'model.speech_encoder.beats_model.encoder.layers.2.fc1.weight', 'model.speech_encoder.beats_model.encoder.layers.2.fc1.bias', 'model.speech_encoder.beats_model.encoder.layers.2.fc2.weight', 'model.speech_encoder.beats_model.encoder.layers.2.fc2.bias', 'model.speech_encoder.beats_model.encoder.layers.2.final_layer_norm.weight', 'model.speech_encoder.beats_model.encoder.layers.2.final_layer_norm.bias', 'model.speech_encoder.beats_model.encoder.layers.3.self_attn.grep_a', 'model.speech_encoder.beats_model.encoder.layers.3.self_attn.k_proj.weight', 
'model.speech_encoder.beats_model.encoder.layers.3.self_attn.k_proj.bias', 'model.speech_encoder.beats_model.encoder.layers.3.self_attn.v_proj.weight', 'model.speech_encoder.beats_model.encoder.layers.3.self_attn.v_proj.bias', 'model.speech_encoder.beats_model.encoder.layers.3.self_attn.q_proj.weight', 'model.speech_encoder.beats_model.encoder.layers.3.self_attn.q_proj.bias', 'model.speech_encoder.beats_model.encoder.layers.3.self_attn.out_proj.weight', 'model.speech_encoder.beats_model.encoder.layers.3.self_attn.out_proj.bias', 'model.speech_encoder.beats_model.encoder.layers.3.self_attn.grep_linear.weight', 'model.speech_encoder.beats_model.encoder.layers.3.self_attn.grep_linear.bias', 'model.speech_encoder.beats_model.encoder.layers.3.self_attn.relative_attention_bias.weight', 'model.speech_encoder.beats_model.encoder.layers.3.self_attn_layer_norm.weight', 'model.speech_encoder.beats_model.encoder.layers.3.self_attn_layer_norm.bias', 'model.speech_encoder.beats_model.encoder.layers.3.fc1.weight', 'model.speech_encoder.beats_model.encoder.layers.3.fc1.bias', 'model.speech_encoder.beats_model.encoder.layers.3.fc2.weight', 'model.speech_encoder.beats_model.encoder.layers.3.fc2.bias', 'model.speech_encoder.beats_model.encoder.layers.3.final_layer_norm.weight', 'model.speech_encoder.beats_model.encoder.layers.3.final_layer_norm.bias', 'model.speech_encoder.beats_model.encoder.layers.4.self_attn.grep_a', 'model.speech_encoder.beats_model.encoder.layers.4.self_attn.k_proj.weight', 'model.speech_encoder.beats_model.encoder.layers.4.self_attn.k_proj.bias', 'model.speech_encoder.beats_model.encoder.layers.4.self_attn.v_proj.weight', 'model.speech_encoder.beats_model.encoder.layers.4.self_attn.v_proj.bias', 'model.speech_encoder.beats_model.encoder.layers.4.self_attn.q_proj.weight', 'model.speech_encoder.beats_model.encoder.layers.4.self_attn.q_proj.bias', 'model.speech_encoder.beats_model.encoder.layers.4.self_attn.out_proj.weight', 'model.speech_encoder.beats_model.encoder.layers.4.self_attn.out_proj.bias', 'model.speech_encoder.beats_model.encoder.layers.4.self_attn.grep_linear.weight', 'model.speech_encoder.beats_model.encoder.layers.4.self_attn.grep_linear.bias', 'model.speech_encoder.beats_model.encoder.layers.4.self_attn.relative_attention_bias.weight', 'model.speech_encoder.beats_model.encoder.layers.4.self_attn_layer_norm.weight', 'model.speech_encoder.beats_model.encoder.layers.4.self_attn_layer_norm.bias', 'model.speech_encoder.beats_model.encoder.layers.4.fc1.weight', 'model.speech_encoder.beats_model.encoder.layers.4.fc1.bias', 'model.speech_encoder.beats_model.encoder.layers.4.fc2.weight', 'model.speech_encoder.beats_model.encoder.layers.4.fc2.bias', 'model.speech_encoder.beats_model.encoder.layers.4.final_layer_norm.weight', 'model.speech_encoder.beats_model.encoder.layers.4.final_layer_norm.bias', 'model.speech_encoder.beats_model.encoder.layers.5.self_attn.grep_a', 'model.speech_encoder.beats_model.encoder.layers.5.self_attn.k_proj.weight', 'model.speech_encoder.beats_model.encoder.layers.5.self_attn.k_proj.bias', 'model.speech_encoder.beats_model.encoder.layers.5.self_attn.v_proj.weight', 'model.speech_encoder.beats_model.encoder.layers.5.self_attn.v_proj.bias', 'model.speech_encoder.beats_model.encoder.layers.5.self_attn.q_proj.weight', 'model.speech_encoder.beats_model.encoder.layers.5.self_attn.q_proj.bias', 'model.speech_encoder.beats_model.encoder.layers.5.self_attn.out_proj.weight', 'model.speech_encoder.beats_model.encoder.layers.5.self_attn.out_proj.bias', 
'model.speech_encoder.beats_model.encoder.layers.5.self_attn.grep_linear.weight', 'model.speech_encoder.beats_model.encoder.layers.5.self_attn.grep_linear.bias', 'model.speech_encoder.beats_model.encoder.layers.5.self_attn.relative_attention_bias.weight', 'model.speech_encoder.beats_model.encoder.layers.5.self_attn_layer_norm.weight', 'model.speech_encoder.beats_model.encoder.layers.5.self_attn_layer_norm.bias', 'model.speech_encoder.beats_model.encoder.layers.5.fc1.weight', 'model.speech_encoder.beats_model.encoder.layers.5.fc1.bias', 'model.speech_encoder.beats_model.encoder.layers.5.fc2.weight', 'model.speech_encoder.beats_model.encoder.layers.5.fc2.bias', 'model.speech_encoder.beats_model.encoder.layers.5.final_layer_norm.weight', 'model.speech_encoder.beats_model.encoder.layers.5.final_layer_norm.bias', 'model.speech_encoder.beats_model.encoder.layers.6.self_attn.grep_a', 'model.speech_encoder.beats_model.encoder.layers.6.self_attn.k_proj.weight', 'model.speech_encoder.beats_model.encoder.layers.6.self_attn.k_proj.bias', 'model.speech_encoder.beats_model.encoder.layers.6.self_attn.v_proj.weight', 'model.speech_encoder.beats_model.encoder.layers.6.self_attn.v_proj.bias', 'model.speech_encoder.beats_model.encoder.layers.6.self_attn.q_proj.weight', 'model.speech_encoder.beats_model.encoder.layers.6.self_attn.q_proj.bias', 'model.speech_encoder.beats_model.encoder.layers.6.self_attn.out_proj.weight', 'model.speech_encoder.beats_model.encoder.layers.6.self_attn.out_proj.bias', 'model.speech_encoder.beats_model.encoder.layers.6.self_attn.grep_linear.weight', 'model.speech_encoder.beats_model.encoder.layers.6.self_attn.grep_linear.bias', 'model.speech_encoder.beats_model.encoder.layers.6.self_attn.relative_attention_bias.weight', 'model.speech_encoder.beats_model.encoder.layers.6.self_attn_layer_norm.weight', 'model.speech_encoder.beats_model.encoder.layers.6.self_attn_layer_norm.bias', 'model.speech_encoder.beats_model.encoder.layers.6.fc1.weight', 'model.speech_encoder.beats_model.encoder.layers.6.fc1.bias', 'model.speech_encoder.beats_model.encoder.layers.6.fc2.weight', 'model.speech_encoder.beats_model.encoder.layers.6.fc2.bias', 'model.speech_encoder.beats_model.encoder.layers.6.final_layer_norm.weight', 'model.speech_encoder.beats_model.encoder.layers.6.final_layer_norm.bias', 'model.speech_encoder.beats_model.encoder.layers.7.self_attn.grep_a', 'model.speech_encoder.beats_model.encoder.layers.7.self_attn.k_proj.weight', 'model.speech_encoder.beats_model.encoder.layers.7.self_attn.k_proj.bias', 'model.speech_encoder.beats_model.encoder.layers.7.self_attn.v_proj.weight', 'model.speech_encoder.beats_model.encoder.layers.7.self_attn.v_proj.bias', 'model.speech_encoder.beats_model.encoder.layers.7.self_attn.q_proj.weight', 'model.speech_encoder.beats_model.encoder.layers.7.self_attn.q_proj.bias', 'model.speech_encoder.beats_model.encoder.layers.7.self_attn.out_proj.weight', 'model.speech_encoder.beats_model.encoder.layers.7.self_attn.out_proj.bias', 'model.speech_encoder.beats_model.encoder.layers.7.self_attn.grep_linear.weight', 'model.speech_encoder.beats_model.encoder.layers.7.self_attn.grep_linear.bias', 'model.speech_encoder.beats_model.encoder.layers.7.self_attn.relative_attention_bias.weight', 'model.speech_encoder.beats_model.encoder.layers.7.self_attn_layer_norm.weight', 'model.speech_encoder.beats_model.encoder.layers.7.self_attn_layer_norm.bias', 'model.speech_encoder.beats_model.encoder.layers.7.fc1.weight', 'model.speech_encoder.beats_model.encoder.layers.7.fc1.bias', 
'model.speech_encoder.beats_model.encoder.layers.7.fc2.weight', 'model.speech_encoder.beats_model.encoder.layers.7.fc2.bias', 'model.speech_encoder.beats_model.encoder.layers.7.final_layer_norm.weight', 'model.speech_encoder.beats_model.encoder.layers.7.final_layer_norm.bias', 'model.speech_encoder.beats_model.encoder.layers.8.self_attn.grep_a', 'model.speech_encoder.beats_model.encoder.layers.8.self_attn.k_proj.weight', 'model.speech_encoder.beats_model.encoder.layers.8.self_attn.k_proj.bias', 'model.speech_encoder.beats_model.encoder.layers.8.self_attn.v_proj.weight', 'model.speech_encoder.beats_model.encoder.layers.8.self_attn.v_proj.bias', 'model.speech_encoder.beats_model.encoder.layers.8.self_attn.q_proj.weight', 'model.speech_encoder.beats_model.encoder.layers.8.self_attn.q_proj.bias', 'model.speech_encoder.beats_model.encoder.layers.8.self_attn.out_proj.weight', 'model.speech_encoder.beats_model.encoder.layers.8.self_attn.out_proj.bias', 'model.speech_encoder.beats_model.encoder.layers.8.self_attn.grep_linear.weight', 'model.speech_encoder.beats_model.encoder.layers.8.self_attn.grep_linear.bias', 'model.speech_encoder.beats_model.encoder.layers.8.self_attn.relative_attention_bias.weight', 'model.speech_encoder.beats_model.encoder.layers.8.self_attn_layer_norm.weight', 'model.speech_encoder.beats_model.encoder.layers.8.self_attn_layer_norm.bias', 'model.speech_encoder.beats_model.encoder.layers.8.fc1.weight', 'model.speech_encoder.beats_model.encoder.layers.8.fc1.bias', 'model.speech_encoder.beats_model.encoder.layers.8.fc2.weight', 'model.speech_encoder.beats_model.encoder.layers.8.fc2.bias', 'model.speech_encoder.beats_model.encoder.layers.8.final_layer_norm.weight', 'model.speech_encoder.beats_model.encoder.layers.8.final_layer_norm.bias', 'model.speech_encoder.beats_model.encoder.layers.9.self_attn.grep_a', 'model.speech_encoder.beats_model.encoder.layers.9.self_attn.k_proj.weight', 'model.speech_encoder.beats_model.encoder.layers.9.self_attn.k_proj.bias', 'model.speech_encoder.beats_model.encoder.layers.9.self_attn.v_proj.weight', 'model.speech_encoder.beats_model.encoder.layers.9.self_attn.v_proj.bias', 'model.speech_encoder.beats_model.encoder.layers.9.self_attn.q_proj.weight', 'model.speech_encoder.beats_model.encoder.layers.9.self_attn.q_proj.bias', 'model.speech_encoder.beats_model.encoder.layers.9.self_attn.out_proj.weight', 'model.speech_encoder.beats_model.encoder.layers.9.self_attn.out_proj.bias', 'model.speech_encoder.beats_model.encoder.layers.9.self_attn.grep_linear.weight', 'model.speech_encoder.beats_model.encoder.layers.9.self_attn.grep_linear.bias', 'model.speech_encoder.beats_model.encoder.layers.9.self_attn.relative_attention_bias.weight', 'model.speech_encoder.beats_model.encoder.layers.9.self_attn_layer_norm.weight', 'model.speech_encoder.beats_model.encoder.layers.9.self_attn_layer_norm.bias', 'model.speech_encoder.beats_model.encoder.layers.9.fc1.weight', 'model.speech_encoder.beats_model.encoder.layers.9.fc1.bias', 'model.speech_encoder.beats_model.encoder.layers.9.fc2.weight', 'model.speech_encoder.beats_model.encoder.layers.9.fc2.bias', 'model.speech_encoder.beats_model.encoder.layers.9.final_layer_norm.weight', 'model.speech_encoder.beats_model.encoder.layers.9.final_layer_norm.bias', 'model.speech_encoder.beats_model.encoder.layers.10.self_attn.grep_a', 'model.speech_encoder.beats_model.encoder.layers.10.self_attn.k_proj.weight', 'model.speech_encoder.beats_model.encoder.layers.10.self_attn.k_proj.bias', 
'model.speech_encoder.beats_model.encoder.layers.10.self_attn.v_proj.weight', 'model.speech_encoder.beats_model.encoder.layers.10.self_attn.v_proj.bias', 'model.speech_encoder.beats_model.encoder.layers.10.self_attn.q_proj.weight', 'model.speech_encoder.beats_model.encoder.layers.10.self_attn.q_proj.bias', 'model.speech_encoder.beats_model.encoder.layers.10.self_attn.out_proj.weight', 'model.speech_encoder.beats_model.encoder.layers.10.self_attn.out_proj.bias', 'model.speech_encoder.beats_model.encoder.layers.10.self_attn.grep_linear.weight', 'model.speech_encoder.beats_model.encoder.layers.10.self_attn.grep_linear.bias', 'model.speech_encoder.beats_model.encoder.layers.10.self_attn.relative_attention_bias.weight', 'model.speech_encoder.beats_model.encoder.layers.10.self_attn_layer_norm.weight', 'model.speech_encoder.beats_model.encoder.layers.10.self_attn_layer_norm.bias', 'model.speech_encoder.beats_model.encoder.layers.10.fc1.weight', 'model.speech_encoder.beats_model.encoder.layers.10.fc1.bias', 'model.speech_encoder.beats_model.encoder.layers.10.fc2.weight', 'model.speech_encoder.beats_model.encoder.layers.10.fc2.bias', 'model.speech_encoder.beats_model.encoder.layers.10.final_layer_norm.weight', 'model.speech_encoder.beats_model.encoder.layers.10.final_layer_norm.bias', 'model.speech_encoder.beats_model.encoder.layers.11.self_attn.grep_a', 'model.speech_encoder.beats_model.encoder.layers.11.self_attn.k_proj.weight', 'model.speech_encoder.beats_model.encoder.layers.11.self_attn.k_proj.bias', 'model.speech_encoder.beats_model.encoder.layers.11.self_attn.v_proj.weight', 'model.speech_encoder.beats_model.encoder.layers.11.self_attn.v_proj.bias', 'model.speech_encoder.beats_model.encoder.layers.11.self_attn.q_proj.weight', 'model.speech_encoder.beats_model.encoder.layers.11.self_attn.q_proj.bias', 'model.speech_encoder.beats_model.encoder.layers.11.self_attn.out_proj.weight', 'model.speech_encoder.beats_model.encoder.layers.11.self_attn.out_proj.bias', 'model.speech_encoder.beats_model.encoder.layers.11.self_attn.grep_linear.weight', 'model.speech_encoder.beats_model.encoder.layers.11.self_attn.grep_linear.bias', 'model.speech_encoder.beats_model.encoder.layers.11.self_attn.relative_attention_bias.weight', 'model.speech_encoder.beats_model.encoder.layers.11.self_attn_layer_norm.weight', 'model.speech_encoder.beats_model.encoder.layers.11.self_attn_layer_norm.bias', 'model.speech_encoder.beats_model.encoder.layers.11.fc1.weight', 'model.speech_encoder.beats_model.encoder.layers.11.fc1.bias', 'model.speech_encoder.beats_model.encoder.layers.11.fc2.weight', 'model.speech_encoder.beats_model.encoder.layers.11.fc2.bias', 'model.speech_encoder.beats_model.encoder.layers.11.final_layer_norm.weight', 'model.speech_encoder.beats_model.encoder.layers.11.final_layer_norm.bias', 'model.speech_encoder.beats_model.encoder.layer_norm.weight', 'model.speech_encoder.beats_model.encoder.layer_norm.bias', 'model.speech_encoder.beats_model.layer_norm.weight', 'model.speech_encoder.beats_model.layer_norm.bias', 'model.speech_encoder.beats_model.predictor.weight', 'model.speech_encoder.beats_model.predictor.bias', 'model.speech_projector.speech_newline', 'model.speech_projector.speech_begin', 'model.speech_projector.speech_end', 'model.speech_projector.linear1.weight', 'model.speech_projector.linear1.bias', 'model.speech_projector.linear2.weight', 'model.speech_projector.linear2.bias']
15
+ Unexpected keys: []
16
+ Loading vision tower...
17
+ Loading vision tower succeeded.
18
+ User: Hello, who are you?
19
+ Assistant: Hello! I am an AI assistant called PLM-V. How can I help you today?
20
+
21
+ ================================================================================
22
+ 🧪 开始多模态推理测试
23
+ ================================================================================
24
+
25
+ ============================================================
26
+ 🧪 测试: Pure Text
27
+ 📝 问题: Hello, who are you? Please introduce yourself briefly.
28
+ ============================================================
29
+ <|im_start|>system
30
+ You are PLM-V, a helpful assistant.<|im_end|>
31
+ <|im_start|>user
32
+ Hello, who are you? Please introduce yourself briefly.<|im_end|>
33
+ <|im_start|>assistant
34
+ Hello! I am PLM-V, a language model created by the platform OpenAI. My primary function is to assist users by providing information, answering questions, and engaging in conversation. I aim to be helpful, accurate, and respectful in all interactions. How can I assist you today?
35
+ ✅ 推理成功!
36
+ 🤖 回复: Hello! I am PLM-V, a language model created by the platform OpenAI. My primary function is to assist users by providing information, answering questions, and engaging in conversation. I aim to be helpful, accurate, and respectful in all interactions. How can I assist you today?
37
+
38
+ ============================================================
39
+ 🧪 测试: Text & Image (Visual only)
40
+ 📝 问题: <image>
41
+ Please describe this image in detail.
42
+ ============================================================
43
+ dynamic ViT batch size: 13
44
+ <|im_start|>system
45
+ You are PLM-V, a helpful assistant.<|im_end|>
46
+ <|im_start|>user
47
+ <image>
48
+ Please describe this image in detail.<|im_end|>
49
+ <|im_start|>assistant
50
+ The image shows a cute red panda sitting on a wooden platform. This reddish-brown animal has distinctive black and white markings: a white face with black stripes on its cheeks and around its nose. Its fur appears soft and fluffy. The red panda has large, dark eyes and white whiskers, adding to its endearing appearance. It is resting close to a tree trunk, with its black ears perked up on either side. The background is filled with blurred green foliage, suggesting this red panda is in a natural or outdoor setting, possibly a zoo or wildlife sanctuary. The wooden platform appears to be part of a structure designed for the animal to rest or climb on comfortably.
51
+ ✅ 推理成功!
52
+ 🤖 回复: The image shows a cute red panda sitting on a wooden platform. This reddish-brown animal has distinctive black and white markings: a white face with black stripes on its cheeks and around its nose. Its fur appears soft and fluffy. The red panda has large, dark eyes and white whiskers, adding to its endearing appearance. It is resting close to a tree trunk, with its black ears perked up on either side. The background is filled with blurred green foliage, suggesting this red panda is in a natural or outdoor setting, possibly a zoo or wildlife sanctuary. The wooden platform appears to be part of a structure designed for the animal to rest or climb on comfortably.
53
+
54
+ ============================================================
55
+ 🔄 准备Speech相关测试 (可能输出乱码,因为speech部分未训练)
56
+ ============================================================
57
+
58
+ 📥 加载视频数据...
59
+ ✅ 视频加载成功:
60
+ - 视频帧数: 8
61
+ - 视频像素值形状: torch.Size([8, 3, 448, 448])
62
+ - 每帧patch数: [1, 1, 1, 1, 1, 1, 1, 1]
63
+
64
+ 📥 加载音频数据...
65
+ ✅ 音频加载成功:
66
+ - mel谱图形状: torch.Size([2, 3000, 128])
67
+ - 音频长度: tensor([3000, 3000])
68
+ - 音频块数: tensor([2])
69
+ - 原始音频波形形状: torch.Size([2, 480000])
70
+
71
+ ============================================================
72
+ 🧪 测试: Audio only (预期乱码)
73
+ 📝 问题: <speech>
74
+ Please transcribe and summarize what you heard in the audio.
75
+ ============================================================
76
+ speech batch size: 2
77
+ <|im_start|>system
78
+ You are PLM-V, a helpful assistant.<|im_end|>
79
+ <|im_start|>user
80
+ <speech>
81
+ Please transcribe and summarize what you heard in the audio.<|im_end|>
82
+ <|im_start|>assistant
83
+ The秘密 is currently notPowered should be valid.
84
+
85
+ It's possible that's a complex combination of unsub,$
86
+
87
+ Here's an unexpected task is to assess the����是有效的。
88
+
89
+ Now, the秘密
90
+
91
+ ### 払下秘密是从加密的
92
+
93
+ Here的秘密
94
+
95
+ It seems the encrypted message has been truncated. It's a confusing combination of unworking! We're not seeing the message from a valid IP的操作未完成. We're here to assist."
96
+
97
+ It's a sequence of 10.21.48. Here we can assist to encrypting the message. This is the result of a combination of unworking. If you're trying to determine the message that the code for the current message. It seems we're not seeing the message that the network can't work. We're too broad.
98
+
99
+ Therefore, the final answer is that we can't help you. We're currently working to inform you. We're now able to help you.
100
+
101
+ What did you saw?"
102
+
103
+ It seems like the message is invalid. In fact, the message is to decrypt the message that the network can't work. Here's an encryption that's confusing. We're not really saying that the combination is not possible. We're unable to help you.
104
+
105
+ However, it seems that the message is returning to its initial message. We can assist in determining the message that the network can't help. We're unable to assist with this message.
106
+
107
+ To assist in the current message, we need to decrypt the message that the network is not working. We'll now assist with the message that the network can't help.
108
+
109
+ However, it seems that the message is not working. We can't do that.
110
+
111
+ To break the message, we need to help you with the message that the network is not working. We'll now assist with the message that the network can't assist.
112
+
113
+ Based on the message, the network seems to be having a difficulty. We're unable to do that. We can assist in determining the message that the network is not working. We're unable to assist with the message that the network is not working.
114
+
115
+ It seems like the message is not working. We can't assist with the message that the network is not allowed to help you.
116
+
117
+ Please, however, we can assist with the message that the network is not working. We're currently unable to help you.
118
+
119
+ Are you able to help with the message that the network is not working? We can't help you with the message that the network is not available to assist with the message.
120
+
121
+ But the message is not doing that. We can't assist with the message that the network can't assist you.
122
+
123
+ In this message, we need to help you with the message that the network is not working. We'll help the message that the network can't assist with the message.
124
+
125
+ It seems like the message is not working. We're unable to help you with the message that the network is not working.
126
+
127
+ Let's focus on the message that the network is not working. We cannot assist with the message that the network is not available to assist with the message.
128
+
129
+ Given that the message is not working, we'll now assist with the message that the network can't assist.
130
+
131
+ However, it seems that the message cannot be provided based on the message that the network is not working.
132
+
133
+ Let's continue to assist with the message that the network can't assist.
134
+
135
+ It seems like the message is not possible for the message to assist with the message that the network can't assist.
136
+
137
+ We are unable to assist with the message that the network is not working.
138
+
139
+ Please, however, we are unable to assist with the message that the network is not working.
140
+
141
+ The message is not working. We're currently unable to assist with the message that the network is not working.
142
+
143
+ We are unable to assist with the message that the network is not working.
144
+
145
+ The message seems to be unreadable. We're unable to help with the message that the network is not working.
146
+
147
+ Please, we are unable to help with the message that the message is not working.
148
+
149
+ The task is asking for help with the message that the network is not working.
150
+
151
+ It seems that the message is not possible for the message to assist with the message that the network is not working.
152
+
153
+ Let's assume the message is not working. We can't assist with the message that the message is not working.
154
+
155
+ The message is not possible to assist with the message that the network is not working.
156
+
157
+ We're unable to help with the message that the network is not working.
158
+
159
+ Here's the message that the network is not working.
160
+
161
+ The message is not possible to help with the message that the network is not working.
162
+
163
+ The message is a mystery. We're unable to assist with the message that the network is not working.
164
+
165
+ It seems that the message is not working. We can't assist with the message that the network is not working.
166
+
167
+ We're unable to help with the message that the
168
+ ✅ 推理成功!
169
+ 🤖 回复: The秘密 is currently notPowered should be valid.
170
+
171
+ It's possible that's a complex combination of unsub,$
172
+
173
+ Here's an unexpected task is to assess the����是有效的。
174
+
175
+ Now, the秘密
176
+
177
+ ### 払下秘密是从加密的
178
+
179
+ Here的秘密
180
+
181
+ It seems the encrypted message has been truncated. It's a confusing combination of unworking! We're not seeing the message from a valid IP的操作未完成. We're here to assist."
182
+
183
+ It's a sequence of 10.21.48. Here we can assist to encrypting the message. This is the result of a combination of unworking. If you're trying to determine the message that the code for the current message. It seems we're not seeing the message that the network can't work. We're too broad.
184
+
185
+ Therefore, the final answer is that we can't help you. We're currently working to inform you. We're now able to help you.
186
+
187
+ What did you saw?"
188
+
189
+ It seems like the message is invalid. In fact, the message is to decrypt the message that the network can't work. Here's an encryption that's confusing. We're not really saying that the combination is not possible. We're unable to help you.
190
+
191
+ However, it seems that the message is returning to its initial message. We can assist in determining the message that the network can't help. We're unable to assist with this message.
192
+
193
+ To assist in the current message, we need to decrypt the message that the network is not working. We'll now assist with the message that the network can't help.
194
+
195
+ However, it seems that the message is not working. We can't do that.
196
+
197
+ To break the message, we need to help you with the message that the network is not working. We'll now assist with the message that the network can't assist.
198
+
199
+ Based on the message, the network seems to be having a difficulty. We're unable to do that. We can assist in determining the message that the network is not working. We're unable to assist with the message that the network is not working.
200
+
201
+ It seems like the message is not working. We can't assist with the message that the network is not allowed to help you.
202
+
203
+ Please, however, we can assist with the message that the network is not working. We're currently unable to help you.
204
+
205
+ Are you able to help with the message that the network is not working? We can't help you with the message that the network is not available to assist with the message.
206
+
207
+ But the message is not doing that. We can't assist with the message that the network can't assist you.
208
+
209
+ In this message, we need to help you with the message that the network is not working. We'll help the message that the network can't assist with the message.
210
+
211
+ It seems like the message is not working. We're unable to help you with the message that the network is not working.
212
+
213
+ Let's focus on the message that the network is not working. We cannot assist with the message that the network is not available to assist with the message.
214
+
215
+ Given that the message is not working, we'll now assist with the message that the network can't assist.
216
+
217
+ However, it seems that the message cannot be provided based on the message that the network is not working.
218
+
219
+ Let's continue to assist with the message that the network can't assist.
220
+
221
+ It seems like the message is not possible for the message to assist with the message that the network can't assist.
222
+
223
+ We are unable to assist with the message that the network is not working.
224
+
225
+ Please, however, we are unable to assist with the message that the network is not working.
226
+
227
+ The message is not working. We're currently unable to assist with the message that the network is not working.
228
+
229
+ We are unable to assist with the message that the network is not working.
230
+
231
+ The message seems to be unreadable. We're unable to help with the message that the network is not working.
232
+
233
+ Please, we are unable to help with the message that the message is not working.
234
+
235
+ The task is asking for help with the message that the network is not working.
236
+
237
+ It seems that the message is not possible for the message to assist with the message that the network is not working.
238
+
239
+ Let's assume the message is not working. We can't assist with the message that the message is not working.
240
+
241
+ The message is not possible to assist with the message that the network is not working.
242
+
243
+ We're unable to help with the message that the network is not working.
244
+
245
+ Here's the message that the network is not working.
246
+
247
+ The message is not possible to help with the message that the network is not working.
248
+
249
+ The message is a mystery. We're unable to assist with the message that the network is not working.
250
+
251
+ It seems that the message is not working. We can't assist with the message that the network is not working.
252
+
253
+ We're unable to help with the message that the
254
+
255
+ ============================================================
256
+ 🧪 测试: Audio + Image (预期乱码)
257
+ 📝 问题: <image>
258
+ User's question in speech: <speech>
259
+
260
+ ============================================================
261
+ dynamic ViT batch size: 13
262
+ speech batch size: 2
263
+ <|im_start|>system
264
+ You are PLM-V, a helpful assistant.<|im_end|>
265
+ <|im_start|>user
266
+ <image>
267
+ User's question in speech: <speech>
268
+ <|im_end|>
269
+ <|im_start|>assistant
270
+ In the previous text 10个
271
+
272
+ 在是在一个建筑中,一下的人们在使用前一张“我们来学习的是该地区的植物学10月5月的浏览器:这是一个关于在是在一个大的是一张复杂的、我们无法想象的,我需要找出数组中的是否是什么,如何使用异,这个代码看起来不太的人群在该地区进行了8年,然后在使用了4个的用户,我需要将一个的是一张更大的网站,以便确保我的眼睛在没有的,我需要在进行中的人群中,我的网络里,以确保我的眼睛,然后将帮助我理解一下,以便确保我在学习中,需要考虑在使用中的使用的人群之间出现的获取的代码,以便确保,我会帮助您确认,我的代码,或者我需要确认,我没有在不使用的,因为我总是不知道,因为我们需要在的,因为我的困惑,或者因为,因为我的困惑感到不安)等等,我需要帮助控制,或者因为个人在,我需要确保我的眼睛,或者因为我求如何在进行了8年,我需要帮助我的人在或其他方面,我需要确保,我可以尝试以其他的,确保我能提供或不确定,我需要确保,或者为了确保我的眼睛,确保我的眼睛,或者我不确定,我不计算的,或因为我的眼睛,因为我经常在的用户,或者如果使用其他方法,但我需要确保,我需要帮助我,或者因为某些原因,我如何处理使用与我使用Python编程语言使用一个示的用户,我需要确认,我,我、确保,或者在什么时候要使用不同的技巧,但可能,或者我想要确认,或者因为,我需要确保,我无法确保,我需要确保,或者因为,我需要确认,或者我不会知道更多信息,或者我需要确定,或者我需要,我没有,或者可能,或者因为,或者我需要确保,我需要确认,我没有,或者我认为我是否需要,我需要确保,我需要确保,或者我需要帮助,我需要找到,或者我需要在使用了“inferred - �国我需要确保,或者我需要确认,我需要考虑,或者我需要帮助我,我需要知道,我没有,或者我需要,我用一下如何提高或我需要做到的,我需要确认,或者我需要确认,或者我需要,或者我没有,或可能通过这些代码或其他方法,我需要帮助,或者我需要知道,或我需要知道,或我需要确保,我需要,或我需要确保,我需要知道,或者我需要确保,或者其他类似任何事情,或者我需要确保,等等。我需要帮助我如何确保,或者我需要知道,我需要确保,我需要确定,或者我需要处理,或者我需要帮助,或者我不会知道,或者我需要确保,或者我需要帮助,或者我需要,我需要确认,或我需要,我需要确如何确保,我需要确保,或我需要确保,或者我需要保证,我需要确保我需要,或我需要,即使,我需要,我需要确保,或可能需要帮助,我需要确保,或者我需要确保,我需要确保我需要确保,我需要确保,或我需要确定,或我需要确保,或者我需要,我需要确保,我需要,我需要确保,或者我需要其他方式,或者我需要,或我需要,或者我需要,或我需要,或者我我需要,或我我需要,或我需要,我需要,我需要,或我需要,或我需要,或其他,我需要——,或我需要,我需要确认,或者我需要,我需要确保,我需要,或我需要,或我需要,或我在使用了,我需要,但即使,我需要,或我需要,或者我需要,或我需要,或我在通过使用于,或我需要,或者我需要,我需要,或我需要,我需要,或我需要,但即使,我需要,或者我需要,或我需要,或者我的担心,我需要,或我需要,或我需要,我们,或我需要,或者我需要,或我需要,或我我需要,我需要,或我需要,或我需要,或我在使用了,在我需要,或我需要,且我需要,或者我需要,我需要,我需要,或我需要,或者,我需要,让我们需要,或我需要,或我需要,或我需要,或我需要,那么我需要,或者我需要,或我是否需要,或我需要,或我需要,或我需要,你能够帮助我,我需要,或我需要,我需要,或我需要,或我需要是否,或我需要,我需要,让我需要,或我需要
273
+ ✅ 推理成功!
274
+ 🤖 回复: In the previous text 10个
275
+
276
+ 在是在一个建筑中,一下的人们在使用前一张“我们来学习的是该地区的植物学10月5月的浏览器:这是一个关于在是在一个大的是一张复杂的、我们无法想象的,我需要找出数组中的是否是什么,如何使用异,这个代码看起来不太的人群在该地区进行了8年,然后在使用了4个的用户,我需要将一个的是一张更大的网站,以便确保我的眼睛在没有的,我需要在进行中的人群中,我的网络里,以确保我的眼睛,然后将帮助我理解一下,以便确保我在学习中,需要考虑在使用中的使用的人群之间出现的获取的代码,以便确保,我会帮助您确认,我的代码,或者我需要确认,我没有在不使用的,因为我总是不知道,因为我们需要在的,因为我的困惑,或者因为,因为我的困惑感到不安)等等,我需要帮助控制,或者因为个人在,我需要确保我的眼睛,或者因为我求如何在进行了8年,我需要帮助我的人在或其他方面,我需要确保,我可以尝试以其他的,确保我能提供或不确定,我需要确保,或者为了确保我的眼睛,确保我的眼睛,或者我不确定,我不计算的,或因为我的眼睛,因为我经常在的用户,或者如果使用其他方法,但我需要确保,我需要帮助我,或者因为某些原因,我如何处理使用与我使用Python编程语言使用一个示的用户,我需要确认,我,我、确保,或者在什么时候要使用不同的技巧,但可能,或者我想要确认,或者因为,我需要确保,我无法确保,我需要确保,或者因为,我需要确认,或者我不会知道更多信息,或者我需要确定,或者我需要,我没有,或者可能,或者因为,或者我需要确保,我需要确认,我没有,或者我认为我是否需要,我需要确保,我需要确保,或者我需要帮助,我需要找到,或者我需要在使用了“inferred - �国我需要确保,或者我需要确认,我需要考虑,或者我需要帮助我,我需要知道,我没有,或者我需要,我用一下如何提高或我需要做到的,我需要确认,或者我需要确认,或者我需要,或者我没有,或可能通过这些代码或其他方法,我需要帮助,或者我需要知道,或我需要知道,或我需要确保,我需要,或我需要确保,我需要知道,或者我需要确保,或者其他类似任何事情,或者我需要确保,等等。我需要帮助我如何确保,或者我需要知道,我需要确保,我需要确定,或者我需要处理,或者我需要帮助,或者我不会知道,或者我需要确保,或者我需要帮助,或者我需要,我需要确认,或我需要,我需要确如何确保,我需要确保,或我需要确保,或者我需要保证,我需要确保我需要,或我需要,即使,我需要,我需要确保,或可能需要帮助��我需要确保,或者我需要确保,我需要确保我需要确保,我需要确保,或我需要确定,或我需要确保,或者我需要,我需要确保,我需要,我需要确保,或者我需要其他方式,或者我需要,或我需要,或者我需要,或我需要,或者我我需要,或我我需要,或我需要,我需要,我需要,或我需要,或我需要,或其他,我需要——,或我需要,我需要确认,或者我需要,我需要确保,我需要,或我需要,或我需要,或我在使用了,我需要,但即使,我需要,或我需要,或者我需要,或我需要,或我在通过使用于,或我需要,或者我需要,我需要,或我需要,我需要,或我需要,但即使,我需要,或者我需要,或我需要,或者我的担心,我需要,或我需要,或我需要,我们,或我需要,或者我需要,或我需要,或我我需要,我需要,或我需要,或我需要,或我在使用了,在我需要,或我需要,且我需要,或者我需要,我需要,我需要,或我需要,或者,我需要,让我们需要,或我需要,或我需要,或我需要,或我需要,那么我需要,或者我需要,或我是否需要,或我需要,或我需要,或我需要,你能够帮助我,我需要,或我需要,我需要,或我需要,或我需要是否,或我需要,我需要,让我需要,或我需要
277
+
278
+ ============================================================
279
+ 🧪 测试: Video + Text
280
+ 📝 问题: Frame1: <image>
281
+ Frame2: <image>
282
+ Frame3: <image>
283
+ Frame4: <image>
284
+ Frame5: <image>
285
+ Frame6: <image>
286
+ Frame7: <image>
287
+ Frame8: <image>
288
+ What is the red panda doing in this video? Please describe the actions and movements you observe.
289
+ ============================================================
290
+ dynamic ViT batch size: 8
291
+ <|im_start|>system
292
+ You are PLM-V, a helpful assistant.<|im_end|>
293
+ <|im_start|>user
294
+ Frame1: <image>
295
+ Frame2: <image>
296
+ Frame3: <image>
297
+ Frame4: <image>
298
+ Frame5: <image>
299
+ Frame6: <image>
300
+ Frame7: <image>
301
+ Frame8: <image>
302
+ What is the red panda doing in this video? Please describe the actions and movements you observe.<|im_end|>
303
+ <|im_start|>assistant
304
+ The red panda in this video is shown eating bamboo and holding a piece of bamboo. In the beginning, the red panda is eating bamboo from the other end of the structure while the baby in front of it is reaching up to eat. They move to the right, and the adult red panda is eating bamboo while the baby continues to reach up. Towards the end, the baby starts eating a piece of bamboo while the adult is eating bamboo from the structure above.
305
+ ✅ 推理成功!
306
+ 🤖 回复: The red panda in this video is shown eating bamboo and holding a piece of bamboo. In the beginning, the red panda is eating bamboo from the other end of the structure while the baby in front of it is reaching up to eat. They move to the right, and the adult red panda is eating bamboo while the baby continues to reach up. Towards the end, the baby starts eating a piece of bamboo while the adult is eating bamboo from the structure above.
307
+
308
+ ============================================================
309
+ 🧪 测试: Video + Audio (预期乱码)
310
+ 📝 问题: <speech><image>
311
+ Describe what you hear and see in this content.
312
+ ============================================================
313
+ dynamic ViT batch size: 13
314
+ speech batch size: 2
315
+ <|im_start|>system
316
+ You are PLM-V, a helpful assistant.<|im_end|>
317
+ <|im_start|>user
318
+ <speech><image>
319
+ Describe what you hear and see in this content.<|im_end|>
320
+ <|im_start|>assistant
321
+ The first step is to determine the 301445560526679335.928824157.5.5, 12.496688788434,49505993735.3846390994.45546936.455539779387". Which of the 201863188435.3179473.943878315546397753.3976.388571.7538856.569376.454694876.56974376.937831.63687.48876.388571.7764387938094875.3880948178093961.7831809387546.9378318878315.776383187673876.83187648780937883187525.3856839375469783188787831875831879838378763831831875469839188767383187583187876383187546983783187145935.429563977878318763831875831868781763831887583187638388093787831887831875469837831887831875831876383831875469837831878318783187546983783187831875831876383831875831876383831875831878318763831875831878318763831875831876.26.33.73.176.1235.32.14.1235.23.48.987659934.763839376.38.7966.8376.393776.23.876.37635.917.53.4687836.75.9356.58378788.7678393783188786.388876.4388876.4387.46.8839376.3937809487.438763831887831875.88317831.776.94699378831875938.783187583188783187583187638318758318763838318758318763831878318758318783187638318758318766.38876.3876.43876.4387839378.3187583187583187638318758318763838
322
+ ✅ 推理成功!
323
+ 🤖 回复: The first step is to determine the 301445560526679335.928824157.5.5, 12.496688788434,49505993735.3846390994.45546936.455539779387". Which of the 201863188435.3179473.943878315546397753.3976.388571.7538856.569376.454694876.56974376.937831.63687.48876.388571.7764387938094875.3880948178093961.7831809387546.9378318878315.776383187673876.83187648780937883187525.3856839375469783188787831875831879838378763831831875469839188767383187583187876383187546983783187145935.429563977878318763831875831868781763831887583187638388093787831887831875469837831887831875831876383831875469837831878318783187546983783187831875831876383831875831876383831875831878318763831875831878318763831875831876.26.33.73.176.1235.32.14.1235.23.48.987659934.763839376.38.7966.8376.393776.23.876.37635.917.53.4687836.75.9356.58378788.7678393783188786.388876.4388876.4387.46.8839376.3937809487.438763831887831875.88317831.776.94699378831875938.783187583188783187583187638318758318763838318758318763831878318758318783187638318758318766.38876.3876.43876.4387839378.3187583187583187638318758318763838
324
+
325
+ ================================================================================
326
+ 📊 多模态推理测试总结
327
+ ================================================================================
328
+ ✅ PASS Pure Text (预期: PASS ) - 应该正常 (训练好的InternVL)
329
+ ✅ PASS Text & Image (预期: PASS ) - 应该正常 (训练好的InternVL)
330
+ ✅ PASS Video + Text (预期: PASS ) - 应该正常 (训练好的InternVL)
331
+ ✅ PASS Audio only (预期: GARBLED ) - 可能乱码 (speech未训练)
332
+ ✅ PASS Audio + Image (预期: GARBLED ) - 可能乱码 (speech未训练)
333
+
334
+ 📈 测试统计: 5/5 通过
335
+ 🎉 基础功能正常,Speech集成架构成功!
336
+ 💡 Speech相关测试如果输出乱码是正常的,因为speech部分还未训练
337
+ 🌟 所有基础模态测试都通过了!
338
+
339
+ === 多模态推理测试完成 ===
ola.egg-info/PKG-INFO ADDED
@@ -0,0 +1,265 @@
1
+ Metadata-Version: 2.4
2
+ Name: ola
3
+ Version: 1.0.0
4
+ Summary: Omni-Modal Language Model
5
+ Classifier: Programming Language :: Python :: 3
6
+ Classifier: License :: OSI Approved :: Apache Software License
7
+ Requires-Python: >=3.10
8
+ Description-Content-Type: text/markdown
9
+ License-File: LICENSE
10
+ Requires-Dist: torch==2.1.2
11
+ Requires-Dist: torchvision==0.16.2
12
+ Requires-Dist: torchaudio==2.1.2
13
+ Requires-Dist: transformers==4.43.4
14
+ Requires-Dist: tokenizers==0.19.1
15
+ Requires-Dist: sentencepiece==0.1.99
16
+ Requires-Dist: shortuuid
17
+ Requires-Dist: accelerate==0.33.0
18
+ Requires-Dist: peft==0.11.1
19
+ Requires-Dist: bitsandbytes==0.43.1
20
+ Requires-Dist: pydantic
21
+ Requires-Dist: markdown2[all]
22
+ Requires-Dist: numpy
23
+ Requires-Dist: scikit-learn==1.2.2
24
+ Requires-Dist: gradio==4.43.0
25
+ Requires-Dist: gradio_client==1.3.0
26
+ Requires-Dist: requests
27
+ Requires-Dist: httpx==0.27.2
28
+ Requires-Dist: uvicorn
29
+ Requires-Dist: fastapi
30
+ Requires-Dist: soundfile
31
+ Requires-Dist: einops==0.6.1
32
+ Requires-Dist: einops-exts==0.0.4
33
+ Requires-Dist: timm==0.9.16
34
+ Requires-Dist: openai-whisper
35
+ Requires-Dist: setuptools==59.5.0
36
+ Requires-Dist: omegaconf==2.0.6
37
+ Requires-Dist: loguru
38
+ Requires-Dist: av
39
+ Requires-Dist: librosa
40
+ Provides-Extra: train
41
+ Requires-Dist: deepspeed==0.12.6; extra == "train"
42
+ Requires-Dist: ninja; extra == "train"
43
+ Requires-Dist: wandb; extra == "train"
44
+ Requires-Dist: tensorboardX; extra == "train"
45
+ Provides-Extra: build
46
+ Requires-Dist: build; extra == "build"
47
+ Requires-Dist: twine; extra == "build"
48
+ Dynamic: license-file
49
+
50
+ <p align="center" width="100%">
51
+ <img src="https://ola-omni.github.io/static/images/ola-icon.png" alt="967023137dff29e65b21544e7620e0f7.webp" width=60%>
52
+ </p>
53
+ <div>
54
+
55
+ ## Ola: Pushing the Frontiers of Omni-Modal Language Model
56
+
57
+ <p align="left">
58
+ <a href='https://github.com/liuzuyan' target='_blank'>Zuyan Liu<sup>*,1,2</sup></a>&emsp;
59
+ <a href='https://github.com/dongyh20/' target='_blank'>Yuhao Dong<sup>*,2,3</sup></a>&emsp;
60
+ Jiahui Wang<sup>1</sup></a>&emsp;<br>
61
+ <a href='https://liuziwei7.github.io/' target='_blank'>Ziwei Liu<sup>3</sup></a>&emsp;
62
+ Winston Hu<sup>2</sup></a>&emsp;
63
+ <a href='https://scholar.google.com/citations?user=TN8uDQoAAAAJ' target='_blank'>Jiwen Lu<sup>1,&#x2709</sup></a>&emsp;
64
+ <a href='https://raoyongming.github.io/' target='_blank'>Yongming Rao<sup>2,1,&#x2709</sup></a>&emsp;
65
+ </p>
66
+
67
+
68
+ <p align="left"><sup>1</sup>Tsinghua University &ensp; <sup>2</sup>Tencent Hunyuan Research&ensp; <sup>3</sup>S-Lab, NTU&ensp; </p>
69
+
70
+ <p align="left"><sup>*</sup> Equal Contribution<sup>&ensp; &#x2709</sup> Corresponding Author</p>
71
+
72
+ [![Ola](https://img.shields.io/badge/Rank_1-OpenCompass(<15B)-blue)](https://rank.opencompass.org.cn/leaderboard-multimodal/?m=REALTIME) [![Ola](https://img.shields.io/badge/Rank_8-VideoMME-red)](https://video-mme.github.io/home_page.html#leaderboard)
73
+
74
+ ---
75
+
76
+ **Project Page:** [![Ola](https://img.shields.io/badge/Ola-project_page-orange)](https://ola-omni.github.io)
77
+
78
+ **Weights in Huggingface:** [![hf_checkpoint](https://img.shields.io/badge/🤗-Ola_7b-green)](https://huggingface.co/THUdyh/Ola-7b) [![hf_checkpoint](https://img.shields.io/badge/🤗-Ola_Image-green)](https://huggingface.co/THUdyh/Ola-Image) [![hf_checkpoint](https://img.shields.io/badge/🤗-Ola_Video-green)](https://huggingface.co/THUdyh/Ola-Video)
79
+
80
+ **arXiv Paper:** [![arxiv](https://img.shields.io/badge/Arxiv-2502.04328-b31b1b.svg?logo=arXiv)](https://arxiv.org/abs/2502.04328)
81
+
82
+ **Demo by Gradio:** [![demo](https://img.shields.io/badge/Ola-Demo-yellow)](https://huggingface.co/spaces/THUdyh/Ola)
83
+
84
+ **Training Data:** [![data](https://img.shields.io/badge/Ola-Data-purple)](https://huggingface.co/datasets/THUdyh/Ola-Data)
85
+
86
+ **中文解读**: [![chinese](https://img.shields.io/badge/Ola-机器之心-cyan)](https://mp.weixin.qq.com/s/N4bjcHOejJudtxTFZVAXmg)
87
+
88
+ Contact: open an issue or email liuzuyan19@gmail.com. We are on call to respond.
89
+
90
+ ## 📢 News
91
+
92
+ - 🔥[28/2/2025] We release the intermediate models, Ola-Image and Ola-Video; try building your own omni-modal models!
93
+
94
+ - 🚀[19/2/2025] We release the Hugging Face demo of Ola; try the advanced omni-modal model on your own!
95
+
96
+ - 🔥[18/2/2025] The training data and training scripts for Ola-7b are released!
97
+
98
+ - 🎉[07/2/2025] Ola is released! Check our [project page](https://ola-omni.github.io), [model weights](https://huggingface.co/THUdyh/Ola-7b), and [arXiv paper](https://arxiv.org/pdf/2502.04328) for the strong omni-modal understanding model!
99
+
100
+ - 🔥[06/2/2025] [Ola-7b](https://huggingface.co/THUdyh/Ola-7b) achieves **Rank #1** on the OpenCompass Multi-modal Leaderboard among all models under 15B parameters, with an average score of **72.6**. Check the impressive results [here](https://rank.opencompass.org.cn/leaderboard-multimodal/?m=REALTIME)!
101
+
102
+ ## 🚀Coming Soon
103
+
104
+ - [x] Evaluation code on omni-modal benchmarks
105
+ - [x] Gradio Demo
106
+ - [x] Training Data (Video, Audio, Cross-Modality)
107
+
108
+ ## 🌟 Introduction
109
+
110
+ ### Roads to Ola
111
+
112
+ <p align="center" width="100%">
113
+ <img src="https://ola-omni.github.io/static/images/road.png" alt="road.png" width=100%>
114
+ </p>
115
+ <div>
116
+
117
+ **Ola** is an omni-modal language model that achieves competitive performance across image, video, and audio understanding compared with specialized counterparts. We conduct a comprehensive exploration of architectural design, data curation, and training strategies essential for building a robust omni-modal model.
118
+
119
+ <p align="center" width="100%">
120
+ <img src="https://ola-omni.github.io/static/images/teaser.png" alt="teaser.png" width=100%>
121
+ </p>
122
+ <div>
123
+
124
+ ### Architecture
125
+
126
+ <p align="center" width="100%">
127
+ <img src="https://ola-omni.github.io/static/images/method.png" alt="method.png" width=100%>
128
+ </p>
129
+ <div>
130
+
131
+ Ola supports omni-modal inputs, including text, image, video, and audio, and can process them simultaneously with competitive performance on understanding tasks across all of these modalities. Meanwhile, Ola supports user-friendly real-time streaming decoding for text and speech thanks to the text detokenizer and the speech decoder.
132
+
133
+ ### Training Strategies
134
+
135
+ <p align="center" width="100%">
136
+ <img src="https://ola-omni.github.io/static/images/training.png" alt="training.png" width=100%>
137
+ </p>
138
+ <div>
139
+
140
+ We visualize the relationships among modalities in the left part of the figure. Speech acts as the connection between language and audio knowledge, while video serves as a bridge carrying highly relevant visual and audio information. We therefore design a progressive alignment training strategy that moves from primary to peripheral modalities. Furthermore, we construct cross-modality video-audio data to better capture the relationships among modalities.
141
+
142
+ ### Performance
143
+
144
+ <p align="center" width="100%">
145
+ <img src="https://ola-omni.github.io/static/images/results.png" alt="results.png" width=100%>
146
+ </p>
147
+ <div>
148
+
149
+ Ola achieves competitive performance across major multi-modal benchmarks compared with state-of-the-art modality-specialized LLMs.
150
+
151
+ ## Installation
152
+
153
+
154
+ #### 1. Clone this repository:
155
+ ```bash
156
+ git clone https://github.com/Ola-Omni/Ola
157
+ cd Ola
158
+ ```
159
+
160
+ #### 2. Install the required package:
161
+ ```bash
162
+ conda create -n ola python=3.10 -y
163
+ conda activate ola
164
+ pip install --upgrade pip
165
+ pip install -e .
166
+ ```
167
+ #### 3. Install additional packages for training
168
+
169
+ ```bash
170
+ pip install -e ".[train]"
171
+ pip install flash-attn --no-build-isolation
172
+ ```
173
+
174
+ ## Model Zoo
175
+
176
+ We provide our checkpoints on [Huggingface](https://huggingface.co/collections/THUdyh/ola-67b8220eb93406ec87aeec37); a programmatic download sketch follows the table below.
177
+
178
+ | Model | Link | Size | Modal |
179
+ |:---:|:---:|:---:|:---:|
180
+ |Ola-7b | [Huggingface](https://huggingface.co/THUdyh/Ola-7b) | 7B | Text, Image, Video, Audio |
181
+ |Ola-Image | [Huggingface](https://huggingface.co/THUdyh/Ola-Image) | 7B | Text, Image |
182
+ |Ola-Video | [Huggingface](https://huggingface.co/THUdyh/Ola-Video) | 7B | Text, Image, Video |
183
+
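+ If you prefer scripted downloads, the snippet below is a minimal sketch using `huggingface_hub` (an extra dependency; the local directories are arbitrary choices):
+ 
+ ```python
+ # Minimal sketch: fetch Ola-7b and the speech-encoder weights programmatically.
+ # Assumes `pip install huggingface_hub`; the local_dir values are placeholders.
+ from huggingface_hub import snapshot_download
+ 
+ snapshot_download(repo_id="THUdyh/Ola-7b", local_dir="checkpoints/Ola-7b")
+ # large-v3.pt and BEATs_iter3_plus_AS2M_finetuned_on_AS2M_cpt2.pt go to the repo root,
+ # matching step 2 of the Quick Start below.
+ snapshot_download(repo_id="THUdyh/Ola_speech_encoders", local_dir=".")
+ ```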
184
+
185
+ ## Quick Start
186
+
187
+ 1. Download `Ola-7b` from [Huggingface](https://huggingface.co/THUdyh/Ola-7b), or skip this step and use the online weights directly.
188
+
189
+ 2. Download the audio encoders from [Huggingface](https://huggingface.co/THUdyh/Ola_speech_encoders/tree/main) and put the weights `large-v3.pt` and `BEATs_iter3_plus_AS2M_finetuned_on_AS2M_cpt2.pt` under the repo directory `path/to/Ola/`.
190
+
191
+ 3. Run `inference/infer.py` with one of the commands below; a minimal Python sketch of the same flow follows the command list.
192
+
193
+ - Text & Image Understanding
194
+
195
+ ```
196
+ python3 inference/infer.py --image_path *.png,jpg --text user_instruction
197
+ ```
198
+
199
+ - Text & Video Understanding
200
+
201
+ ```
202
+ python3 inference/infer.py --video_path *.mp4 --text user_instruction
203
+ ```
204
+
205
+ - Text & Audio Understanding
206
+
207
+ ```
208
+ python3 inference/infer.py --audio_path *.wav,mp3 --text user_instruction
209
+ ```
210
+
211
+ - Audio & Image Understanding
212
+
213
+ ```
214
+ python3 inference/infer.py --image_path *.png,jpg --audio_path *.wav,mp3
215
+ ```
216
+
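+ Beyond the CLI, the snippet below is a minimal sketch of the programmatic path behind `inference/infer.py`: it loads a checkpoint with the repo's `load_pretrained_model` helper and builds a chat prompt with the bundled conversation template. The checkpoint path and `model_type` value are placeholders (note that `ola/model/builder.py` in this upload currently hardcodes local development checkpoint paths), and the image/video/audio preprocessing plus the exact `model.generate(...)` keyword arguments are omitted; follow `inference/infer.py` for those details.
+ 
+ ```python
+ # Minimal sketch, not a drop-in replacement for inference/infer.py.
+ # Paths and model_type are placeholders; multimodal tensors are omitted.
+ from ola.model.builder import load_pretrained_model
+ from ola.conversation import conv_templates
+ from ola.datasets.preprocess import tokenizer_image_token
+ from ola.constants import DEFAULT_IMAGE_TOKEN, IMAGE_TOKEN_INDEX
+ 
+ tokenizer, model, image_processor, context_len = load_pretrained_model(
+     "path/to/Ola-7b", model_type="ola_qwen", model_base=None)
+ 
+ conv = conv_templates["qwen_1_5"].copy()
+ conv.append_message(conv.roles[0], DEFAULT_IMAGE_TOKEN + "\nDescribe this image.")
+ conv.append_message(conv.roles[1], None)
+ prompt = conv.get_prompt()
+ 
+ # '<image>' is spliced in as the IMAGE_TOKEN_INDEX (-300) sentinel and replaced by
+ # visual features inside the model; see inference/infer.py for the image tensors
+ # and the remaining model.generate(...) keyword arguments.
+ input_ids = tokenizer_image_token(prompt, tokenizer, IMAGE_TOKEN_INDEX,
+                                   return_tensors='pt').unsqueeze(0).to(model.device)
+ ```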
217
+ ## Evaluation
218
+
219
+ You can evaluate Ola model with [VLMEvalKit](https://github.com/open-compass/VLMEvalKit) and [lmms-eval](https://github.com/EvolvingLMMs-Lab/lmms-eval).
220
+
221
+ ## Training
222
+
223
+ ### Data Preparation
224
+
225
+ Please refer to [DATA.md](https://github.com/Ola-Omni/Ola/blob/main/DATA.md) for instructions on customized finetuning or on using the provided datasets.
226
+
227
+ ### Start Training
228
+
229
+ Please follow the scripts below to start training. Make sure you have created the correct datasets for finetuning.
230
+
231
+ 1. Finetuning Ola-7b Model:
232
+
233
+ ```
234
+ bash ./scripts/finetune_ola.sh
235
+ ```
236
+
237
+ 2. Finetuning Ola-Image Model (Ola Stage1 or Stage2):
238
+
239
+ ```
240
+ bash ./scripts/finetune_ola_image.sh
241
+ ```
242
+
243
+ 3. Finetuning Ola-Video Model (Ola Stage3):
244
+
245
+ ```
246
+ bash ./scripts/finetune_ola_video.sh
247
+ ```
248
+
249
+ ## Citation
250
+
251
+ If you find it useful for your research and applications, please cite our paper using this BibTeX:
252
+ ```bibtex
253
+ @article{liu2025ola,
254
+ title={Ola: Pushing the Frontiers of Omni-Modal Language Model with Progressive Modality Alignment},
255
+ author={Liu, Zuyan and Dong, Yuhao and Wang, Jiahui and Liu, Ziwei and Hu, Winston and Lu, Jiwen and Rao, Yongming},
256
+ journal={arXiv preprint arXiv:2502.04328},
257
+ year={2025}
258
+ }
259
+ ```
260
+
261
+ ## Acknowledgement
262
+
263
+ - Our codebase is built upon [LLaVA](https://github.com/LLaVA-VL/LLaVA-NeXT)
264
+
265
+ - Thanks to the [VLMEvalKit](https://github.com/open-compass/VLMEvalKit) and [lmms-eval](https://github.com/EvolvingLMMs-Lab/lmms-eval) teams for the evaluation systems!
ola.egg-info/SOURCES.txt ADDED
@@ -0,0 +1,44 @@
1
+ LICENSE
2
+ README.md
3
+ pyproject.toml
4
+ inference/infer.py
5
+ ola/arguments.py
6
+ ola/constants.py
7
+ ola/conversation.py
8
+ ola/mm_utils.py
9
+ ola/utils.py
10
+ ola.egg-info/PKG-INFO
11
+ ola.egg-info/SOURCES.txt
12
+ ola.egg-info/dependency_links.txt
13
+ ola.egg-info/requires.txt
14
+ ola.egg-info/top_level.txt
15
+ ola/datasets/__init__.py
16
+ ola/datasets/preprocess.py
17
+ ola/model/__init__.py
18
+ ola/model/builder.py
19
+ ola/model/ola_arch.py
20
+ ola/model/language_model/ola_qwen.py
21
+ ola/model/multimodal_encoder/builder.py
22
+ ola/model/multimodal_encoder/oryx_vit.py
23
+ ola/model/multimodal_projector/builder.py
24
+ ola/model/multimodal_projector/pooler_projector.py
25
+ ola/model/multimodal_resampler/builder.py
26
+ ola/model/speech_encoder/builder.py
27
+ ola/model/speech_encoder/speech_encoder.py
28
+ ola/model/speech_encoder/beats/BEATs.py
29
+ ola/model/speech_encoder/beats/Tokenizers.py
30
+ ola/model/speech_encoder/beats/__init__.py
31
+ ola/model/speech_encoder/beats/backbone.py
32
+ ola/model/speech_encoder/beats/kaldi.py
33
+ ola/model/speech_encoder/beats/modules.py
34
+ ola/model/speech_encoder/beats/quantizer.py
35
+ ola/model/speech_projector/builder.py
36
+ ola/model/speech_projector/speech_projector.py
37
+ ola/serve/__init__.py
38
+ ola/serve/controller.py
39
+ ola/serve/gradio_web_server.py
40
+ ola/serve/model_worker.py
41
+ ola/train/ola_trainer.py
42
+ ola/train/train.py
43
+ tools/convert_mp4_wav.py
44
+ tools/create_patch.py
ola.egg-info/dependency_links.txt ADDED
@@ -0,0 +1 @@
1
+
ola.egg-info/requires.txt ADDED
@@ -0,0 +1,40 @@
1
+ torch==2.1.2
2
+ torchvision==0.16.2
3
+ torchaudio==2.1.2
4
+ transformers==4.43.4
5
+ tokenizers==0.19.1
6
+ sentencepiece==0.1.99
7
+ shortuuid
8
+ accelerate==0.33.0
9
+ peft==0.11.1
10
+ bitsandbytes==0.43.1
11
+ pydantic
12
+ markdown2[all]
13
+ numpy
14
+ scikit-learn==1.2.2
15
+ gradio==4.43.0
16
+ gradio_client==1.3.0
17
+ requests
18
+ httpx==0.27.2
19
+ uvicorn
20
+ fastapi
21
+ soundfile
22
+ einops==0.6.1
23
+ einops-exts==0.0.4
24
+ timm==0.9.16
25
+ openai-whisper
26
+ setuptools==59.5.0
27
+ omegaconf==2.0.6
28
+ loguru
29
+ av
30
+ librosa
31
+
32
+ [build]
33
+ build
34
+ twine
35
+
36
+ [train]
37
+ deepspeed==0.12.6
38
+ ninja
39
+ wandb
40
+ tensorboardX
ola.egg-info/top_level.txt ADDED
@@ -0,0 +1,4 @@
1
+ inference
2
+ ola
3
+ scripts
4
+ tools
ola/__pycache__/arguments.cpython-312.pyc ADDED
Binary file (3.5 kB). View file
 
ola/__pycache__/constants.cpython-312.pyc ADDED
Binary file (586 Bytes). View file
 
ola/__pycache__/conversation.cpython-312.pyc ADDED
Binary file (10.3 kB). View file
 
ola/__pycache__/mm_utils.cpython-312.pyc ADDED
Binary file (11.9 kB). View file
 
ola/__pycache__/utils.cpython-312.pyc ADDED
Binary file (11.6 kB). View file
 
ola/arguments.py ADDED
@@ -0,0 +1,65 @@
1
+ import transformers
2
+
3
+ from dataclasses import dataclass, field
4
+ from typing import Optional
5
+
6
+
7
+ @dataclass
8
+ class ModelArguments:
9
+ model_name_or_path: Optional[str] = field(default="facebook/opt-125m")
10
+ version: Optional[str] = field(default="v0")
11
+ freeze_backbone: bool = field(default=False)
12
+ tune_speech_projector: bool = field(default=False)
13
+ tune_speech_encoder: bool = field(default=False)
14
+ tune_speech_generator_only: bool = field(default=False)
15
+ speech_encoder_type: Optional[str] = field(default=None)
16
+ speech_encoder: Optional[str] = field(default=None)
17
+ pretrain_speech_projector: Optional[str] = field(default=None)
18
+ speech_projector_type: Optional[str] = field(default='linear')
19
+ speech_encoder_ds_rate: int = 5
20
+ speech_encoder_hidden_size: int = 1280
21
+
22
+
23
+ @dataclass
24
+ class DataArguments:
25
+ data_path: str = field(default=None,
26
+ metadata={"help": "Path to the training data."})
27
+ is_multimodal: bool = False
28
+ input_type: str = field(default="mel")
29
+ speech_normalize: bool = False
30
+ mel_size: int = 128
31
+ has_tgt_units: bool = False
32
+
33
+
34
+ @dataclass
35
+ class TrainingArguments(transformers.TrainingArguments):
36
+ cache_dir: Optional[str] = field(default=None)
37
+ optim: str = field(default="adamw_torch")
38
+ freeze_speech_projector: bool = field(default=False)
39
+ model_max_length: int = field(
40
+ default=512,
41
+ metadata={
42
+ "help":
43
+ "Maximum sequence length. Sequences will be right padded (and possibly truncated)."
44
+ },
45
+ )
46
+ double_quant: bool = field(
47
+ default=True,
48
+ metadata={"help": "Compress the quantization statistics through double quantization."}
49
+ )
50
+ quant_type: str = field(
51
+ default="nf4",
52
+ metadata={"help": "Quantization data type to use. Should be one of `fp4` or `nf4`."}
53
+ )
54
+ bits: int = field(
55
+ default=16,
56
+ metadata={"help": "How many bits to use."}
57
+ )
58
+ lora_enable: bool = False
59
+ lora_r: int = 64
60
+ lora_alpha: int = 16
61
+ lora_dropout: float = 0.05
62
+ lora_weight_path: str = ""
63
+ lora_bias: str = "none"
64
+ speech_projector_lr: Optional[float] = None
65
+ group_by_modality_length: bool = field(default=False)
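A usage sketch for the dataclasses above (illustrative, not part of the file): they are intended to be parsed together by `transformers.HfArgumentParser` in a training entry point such as `ola/train/train.py` listed in SOURCES.txt.

```python
# Minimal sketch: parse the three argument groups from the command line.
# Run with at least `--output_dir ./out`, which transformers.TrainingArguments expects.
import transformers
from ola.arguments import ModelArguments, DataArguments, TrainingArguments

parser = transformers.HfArgumentParser((ModelArguments, DataArguments, TrainingArguments))
model_args, data_args, training_args = parser.parse_args_into_dataclasses()
print(model_args.speech_encoder_type, data_args.mel_size, training_args.model_max_length)
```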
ola/constants.py ADDED
@@ -0,0 +1,14 @@
1
+ CONTROLLER_HEART_BEAT_EXPIRATION = 30
2
+ WORKER_HEART_BEAT_INTERVAL = 15
3
+
4
+ LOGDIR = "."
5
+
6
+ # Model Constants
7
+ IGNORE_INDEX = -100
8
+ SPEECH_TOKEN_INDEX = -200
9
+ DEFAULT_SPEECH_TOKEN = "<speech>"
10
+ IMAGE_TOKEN_INDEX = -300
11
+ DEFAULT_IMAGE_TOKEN = "<image>"
12
+ DEFAULT_IMAGE_PATCH_TOKEN = "<im_patch>"
13
+ DEFAULT_IM_START_TOKEN = "<im_start>"
14
+ DEFAULT_IM_END_TOKEN = "<im_end>"
ola/conversation.py ADDED
@@ -0,0 +1,266 @@
1
+ import dataclasses
2
+ from enum import auto, Enum
3
+ from typing import List, Any, Union, Tuple
4
+ import base64
5
+ from io import BytesIO
6
+ from PIL import Image
7
+
8
+
9
+ class SeparatorStyle(Enum):
10
+ """Different separator style."""
11
+ TWO = auto()
12
+ PLAIN = auto()
13
+ CHATML = auto()
14
+ LLAMA_2 = auto()
15
+ LLAMA_3 = auto()
16
+ QWEN2 = auto()
17
+
18
+
19
+ @dataclasses.dataclass
20
+ class Conversation:
21
+ """A class that keeps all conversation history."""
22
+ system: str
23
+ roles: List[str]
24
+ messages: List[List[str]]
25
+ offset: int
26
+ sep_style: SeparatorStyle = SeparatorStyle.PLAIN
27
+ sep: str = "###"
28
+ sep2: str = None
29
+ version: str = "Unknown"
30
+
31
+ tokenizer_id: str = ""
32
+ tokenizer: Any = None
33
+ # Stop criteria (the default one is EOS token)
34
+ stop_str: Union[str, List[str]] = None
35
+ # Stops generation if meeting any token in this list
36
+ stop_token_ids: List[int] = None
37
+
38
+ skip_next: bool = False
39
+
40
+ def get_prompt(self):
41
+ messages = self.messages
42
+
43
+ if self.sep_style == SeparatorStyle.TWO:
44
+ seps = [self.sep, self.sep2]
45
+ ret = self.system + seps[0]
46
+ for i, (role, message) in enumerate(messages):
47
+ if message:
48
+ if type(message) is tuple:
49
+ message = message[0]
50
+ ret += role + ": " + message + seps[i % 2]
51
+ else:
52
+ ret += role + ":"
53
+ elif self.sep_style == SeparatorStyle.LLAMA_3:
54
+ wrap_sys = lambda msg: f"<|start_header_id|>system<|end_header_id|>\n\n{msg}<|eot_id|>" if len(msg) > 0 else msg
55
+ ret = "<|begin_of_text|>" + wrap_sys(self.system)
56
+ for i, (role, message) in enumerate(messages):
57
+ if message:
58
+ if type(message) is tuple:
59
+ message = message[0]
60
+ ret += f"<|start_header_id|>{role}<|end_header_id|>\n\n"
61
+ ret += message.strip() + self.sep2
62
+ else:
63
+ ret += f"<|start_header_id|>{role}<|end_header_id|>\n\n"
64
+ return ret
65
+ elif self.sep_style == SeparatorStyle.LLAMA_2:
66
+ wrap_sys = lambda msg: f"<<SYS>>\n{msg}\n<</SYS>>\n\n" if len(msg) > 0 else msg
67
+ wrap_inst = lambda msg: f"[INST] {msg} [/INST]"
68
+ ret = ""
69
+
70
+ for i, (role, message) in enumerate(messages):
71
+ if i == 0:
72
+ assert message, "first message should not be none"
73
+ assert role == self.roles[0], "first message should come from user"
74
+ if message:
75
+ if type(message) is tuple:
76
+ message, _, _ = message
77
+ if i == 0:
78
+ message = wrap_sys(self.system) + message
79
+ if i % 2 == 0:
80
+ message = wrap_inst(message)
81
+ ret += self.sep + message
82
+ else:
83
+ ret += " " + message + " " + self.sep2
84
+ else:
85
+ ret += ""
86
+ ret = ret.lstrip(self.sep)
87
+ elif self.sep_style == SeparatorStyle.PLAIN:
88
+ seps = [self.sep, self.sep2]
89
+ ret = self.system
90
+ for i, (role, message) in enumerate(messages):
91
+ if message:
92
+ if type(message) is tuple:
93
+ message, _, _ = message
94
+ ret += message + seps[i % 2]
95
+ else:
96
+ ret += ""
97
+
98
+ elif self.sep_style == SeparatorStyle.CHATML:
99
+ ret = "" if self.system == "" else self.system + self.sep + "\n"
100
+ for role, message in messages:
101
+ if message:
102
+ if type(message) is tuple:
103
+ raise ValueError("Tuple not supported in CHATML")
104
+ message, images = message
105
+ message = "<speech>" * len(images) + message
106
+ ret += role + "\n" + message + self.sep + "\n"
107
+ else:
108
+ ret += role + "\n"
109
+ return ret
110
+ elif self.sep_style == SeparatorStyle.QWEN2:
111
+ start = '<|im_start|>'
112
+ end = '<|im_end|>\n'
113
+ ret = start + 'system\n' + self.system + end
114
+ for i, (role, message) in enumerate(messages):
115
+ if message:
116
+ if type(message) is tuple:
117
+ message, _, _ = message
118
+
119
+ if message.endswith('<|endoftext|>'):
120
+ message = message.replace('<|endoftext|>', '')
121
+ ret += start + role + "\n" + message + end + '<|endoftext|>'
122
+ else:
123
+ assert not '<|endoftext|>' in message, f"Invalid message: {message}"
124
+ ret += start + role + "\n" + message + end
125
+ else:
126
+ ret += start + role + "\n"
127
+ else:
128
+ raise ValueError(f"Invalid style: {self.sep_style}")
129
+
130
+ return ret
131
+
132
+ def append_message(self, role, message):
133
+ self.messages.append([role, message])
134
+
135
+ def to_gradio_chatbot(self):
136
+ ret = []
137
+ for i, (role, msg) in enumerate(self.messages[self.offset:]):
138
+ if i % 2 == 0:
139
+ if type(msg) is tuple:
140
+ msg, speech = msg
141
+ ret.append([msg, None])
142
+ else:
143
+ ret.append([msg, None])
144
+ else:
145
+ ret[-1][-1] = msg
146
+ return ret
147
+
148
+ def copy(self):
149
+ return Conversation(
150
+ system=self.system,
151
+ roles=self.roles,
152
+ messages=[[x, y] for x, y in self.messages],
153
+ offset=self.offset,
154
+ sep_style=self.sep_style,
155
+ sep=self.sep,
156
+ sep2=self.sep2,
157
+ version=self.version)
158
+
159
+ def dict(self):
160
+ if len(self.get_images()) > 0:
161
+ return {
162
+ "system": self.system,
163
+ "roles": self.roles,
164
+ "messages": [[x, y[0] if type(y) is tuple else y] for x, y in self.messages],
165
+ "offset": self.offset,
166
+ "sep": self.sep,
167
+ "sep2": self.sep2,
168
+ }
169
+ return {
170
+ "system": self.system,
171
+ "roles": self.roles,
172
+ "messages": self.messages,
173
+ "offset": self.offset,
174
+ "sep": self.sep,
175
+ "sep2": self.sep2,
176
+ }
177
+
178
+ conv_vicuna_v1 = Conversation(
179
+ system="A chat between a curious user and an artificial intelligence assistant. " "The assistant gives helpful, detailed, and polite answers to the user's questions.",
180
+ roles=("USER", "ASSISTANT"),
181
+ version="v1",
182
+ messages=[],
183
+ offset=0,
184
+ sep_style=SeparatorStyle.TWO,
185
+ sep=" ",
186
+ sep2="</s>",
187
+ )
188
+
189
+ conv_llama_2 = Conversation(
190
+ system="You are a helpful language and speech assistant. " "You are able to understand the speech content that the user provides, " "and assist the user with a variety of tasks using natural language.",
191
+ roles=("USER", "ASSISTANT"),
192
+ version="llama_v2",
193
+ messages=[],
194
+ offset=0,
195
+ sep_style=SeparatorStyle.LLAMA_2,
196
+ sep="<s>",
197
+ sep2="</s>",
198
+ )
199
+
200
+ conv_llama_3 = Conversation(
201
+ system="You are a helpful language and speech assistant. " "You are able to understand the speech content that the user provides, " "and assist the user with a variety of tasks using natural language.",
202
+ roles=("user", "assistant"),
203
+ version="llama_v3",
204
+ messages=[],
205
+ offset=0,
206
+ sep_style=SeparatorStyle.LLAMA_3,
207
+ sep="",
208
+ sep2="<|eot_id|>"
209
+ )
210
+
211
+
212
+ conv_qwen_v1 = Conversation(
213
+ system="You are a helpful assistant.",
214
+ roles=("user", "assistant"),
215
+ version="v1",
216
+ messages=(),
217
+ offset=0,
218
+ sep_style=SeparatorStyle.QWEN2,
219
+ )
220
+
221
+ conv_plain = Conversation(
222
+ system="",
223
+ roles=("", ""),
224
+ messages=(
225
+ ),
226
+ offset=0,
227
+ sep_style=SeparatorStyle.PLAIN,
228
+ sep="</s>",
229
+ )
230
+
231
+ conv_qwen = Conversation(
232
+ system="""<|im_start|>system
233
+ You are a helpful assistant.""",
234
+ roles=("<|im_start|>user", "<|im_start|>assistant"),
235
+ version="qwen",
236
+ messages=[],
237
+ offset=0,
238
+ sep_style=SeparatorStyle.CHATML,
239
+ sep="<|im_end|>",
240
+ )
241
+
242
+ conv_plmv = Conversation(
243
+ system="""<|im_start|>system
244
+ You are PLM-V, developed by PLM-Team, a helpful assistant.""",
245
+ roles=("<|im_start|>user", "<|im_start|>assistant"),
246
+ version="plm_v",
247
+ messages=[],
248
+ offset=0,
249
+ sep_style=SeparatorStyle.CHATML,
250
+ sep="<|im_end|>",
251
+ )
252
+
253
+ default_conversation = conv_plmv
254
+ conv_templates = {
255
+ "v1": conv_vicuna_v1,
256
+ "plain": conv_plain,
257
+ "llama_2": conv_llama_2,
258
+ "llama_3": conv_llama_3,
259
+ 'v1_qwen2': conv_qwen_v1,
260
+ "qwen_1_5": conv_qwen,
261
+ "plm_v": conv_plmv,
262
+ }
263
+
264
+
265
+ if __name__ == "__main__":
266
+ print(default_conversation.get_prompt())
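A usage sketch for the templates above (illustrative, not part of the repository file): building a CHATML-style prompt with the assistant turn left open for generation.

```python
# Sketch: construct a two-turn prompt with the Qwen-style template defined above.
from ola.conversation import conv_templates

conv = conv_templates["qwen_1_5"].copy()
conv.append_message(conv.roles[0], "<speech>\nWhat is said in this audio clip?")
conv.append_message(conv.roles[1], None)  # leave the assistant turn open
prompt = conv.get_prompt()
# prompt begins with "<|im_start|>system\nYou are a helpful assistant.<|im_end|>\n..."
print(prompt)
```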
ola/datasets/__init__.py ADDED
File without changes
ola/datasets/__pycache__/__init__.cpython-312.pyc ADDED
Binary file (151 Bytes). View file
 
ola/datasets/__pycache__/preprocess.cpython-312.pyc ADDED
Binary file (12.1 kB). View file
 
ola/datasets/preprocess.py ADDED
@@ -0,0 +1,234 @@
1
+ import copy
2
+ import torch
3
+ import transformers
4
+ import tokenizers
5
+
6
+ from typing import Dict, Sequence
7
+
8
+ from ola.constants import IGNORE_INDEX, DEFAULT_SPEECH_TOKEN, IMAGE_TOKEN_INDEX
9
+ from ola import conversation as conversation_lib
10
+ from ola.model import *
11
+ from ola.arguments import DataArguments
12
+ from ola.constants import SPEECH_TOKEN_INDEX
13
+
14
+ from packaging import version
15
+
16
+ IS_TOKENIZER_GREATER_THAN_0_14 = version.parse(tokenizers.__version__) >= version.parse('0.14')
17
+
18
+
19
+ def tokenizer_speech_token(prompt, tokenizer, speech_token_index=SPEECH_TOKEN_INDEX, return_tensors=None):
20
+ prompt_chunks = [tokenizer(chunk).input_ids for chunk in prompt.split('<speech>')]
21
+
22
+ def insert_separator(X, sep):
23
+ return [ele for sublist in zip(X, [sep]*len(X)) for ele in sublist][:-1]
24
+
25
+ input_ids = []
26
+ offset = 0
27
+ if len(prompt_chunks) > 0 and len(prompt_chunks[0]) > 0 and prompt_chunks[0][0] == tokenizer.bos_token_id:
28
+ offset = 1
29
+ input_ids.append(prompt_chunks[0][0])
30
+
31
+ for x in insert_separator(prompt_chunks, [speech_token_index] * (offset + 1)):
32
+ input_ids.extend(x[offset:])
33
+
34
+ if return_tensors is not None:
35
+ if return_tensors == 'pt':
36
+ return torch.tensor(input_ids, dtype=torch.long)
37
+ raise ValueError(f'Unsupported tensor type: {return_tensors}')
38
+ return input_ids
39
+
40
+ def tokenizer_image_token(prompt, tokenizer, image_token_index=IMAGE_TOKEN_INDEX, return_tensors=None):
41
+ prompt_chunks = [tokenizer(chunk).input_ids for chunk in prompt.split('<image>')]
42
+
43
+ def insert_separator(X, sep):
44
+ return [ele for sublist in zip(X, [sep]*len(X)) for ele in sublist][:-1]
45
+
46
+ input_ids = []
47
+ offset = 0
48
+ if len(prompt_chunks) > 0 and len(prompt_chunks[0]) > 0 and prompt_chunks[0][0] == tokenizer.bos_token_id:
49
+ offset = 1
50
+ input_ids.append(prompt_chunks[0][0])
51
+
52
+ for x in insert_separator(prompt_chunks, [image_token_index] * (offset + 1)):
53
+ input_ids.extend(x[offset:])
54
+
55
+ if return_tensors is not None:
56
+ if return_tensors == 'pt':
57
+ return torch.tensor(input_ids, dtype=torch.long)
58
+ raise ValueError(f'Unsupported tensor type: {return_tensors}')
59
+ return input_ids
60
+
61
+ def tokenizer_speech_image_token(prompt, tokenizer, image_token_index=IMAGE_TOKEN_INDEX, speech_token_idx=SPEECH_TOKEN_INDEX, return_tensors=None):
62
+ prompt_chunks = [tokenizer(chunk).input_ids for chunk in prompt.split('<speech><image>')]
63
+
64
+ def insert_separator(X, sep):
65
+ return [ele for sublist in zip(X, [sep]*len(X)) for ele in sublist][:-1]
66
+
67
+ input_ids = []
68
+ offset = 0
69
+ if len(prompt_chunks) > 0 and len(prompt_chunks[0]) > 0 and prompt_chunks[0][0] == tokenizer.bos_token_id:
70
+ offset = 1
71
+ input_ids.append(prompt_chunks[0][0])
72
+
73
+ for x in insert_separator(prompt_chunks, [speech_token_idx, image_token_index] * (offset + 1)):
74
+ input_ids.extend(x[offset:])
75
+
76
+ if return_tensors is not None:
77
+ if return_tensors == 'pt':
78
+ return torch.tensor(input_ids, dtype=torch.long)
79
+ raise ValueError(f'Unsupported tensor type: {return_tensors}')
80
+ return input_ids
81
+
82
+ def tokenizer_speech_question_image_token(prompt, tokenizer, image_token_index=IMAGE_TOKEN_INDEX, speech_token_idx=SPEECH_TOKEN_INDEX, return_tensors=None):
83
+ prompt_chunks = [tokenizer(chunk).input_ids for chunk in prompt.split("<image>\nUser's question in speech: <speech>\n")]
84
+
85
+ def insert_separator(X, sep):
86
+ return [ele for sublist in zip(X, [sep]*len(X)) for ele in sublist][:-1]
87
+
88
+ input_ids = []
89
+ offset = 0
90
+ if len(prompt_chunks) > 0 and len(prompt_chunks[0]) > 0 and prompt_chunks[0][0] == tokenizer.bos_token_id:
91
+ offset = 1
92
+ input_ids.append(prompt_chunks[0][0])
93
+
94
+ nl_tokens = tokenizer("\n").input_ids
95
+ special_chunks = [image_token_index] + nl_tokens + tokenizer("User's question in speech: ").input_ids + [speech_token_idx] + nl_tokens  # flat list of ids so it splices like the other helpers
96
+
97
+ for x in insert_separator(prompt_chunks, [special_chunks[0]] * offset + special_chunks):
98
+ input_ids.extend(x[offset:])
99
+
100
+ if return_tensors is not None:
101
+ if return_tensors == 'pt':
102
+ return torch.tensor(input_ids, dtype=torch.long)
103
+ raise ValueError(f'Unsupported tensor type: {return_tensors}')
104
+ return input_ids
105
+
106
+ def preprocess_v1(
107
+ sources,
108
+ tokenizer: transformers.PreTrainedTokenizer,
109
+ has_speech: bool = False
110
+ ) -> Dict:
111
+ conv = conversation_lib.default_conversation.copy()
112
+ roles = {"human": conv.roles[0], "gpt": conv.roles[1]}
113
+
114
+ # Apply prompt templates
115
+ conversations = []
116
+ for i, source in enumerate(sources):
117
+ if roles[source[0]["from"]] != conv.roles[0]:
118
+ # Skip the first one if it is not from human
119
+ source = source[1:]
120
+
121
+ conv.messages = []
122
+ for j, sentence in enumerate(source):
123
+ role = roles[sentence["from"]]
124
+ assert role == conv.roles[j % 2], f"{i}"
125
+ conv.append_message(role, sentence["value"])
126
+ conversations.append(conv.get_prompt())
127
+
128
+ # Tokenize conversations
129
+
130
+ if has_speech:
131
+ input_ids = torch.stack([tokenizer_speech_token(prompt, tokenizer, return_tensors='pt') for prompt in conversations], dim=0)
132
+ else:
133
+ input_ids = tokenizer(
134
+ conversations,
135
+ return_tensors="pt",
136
+ padding="longest",
137
+ max_length=tokenizer.model_max_length,
138
+ truncation=True,
139
+ ).input_ids
140
+
141
+ targets = input_ids.clone()
142
+
143
+ assert conv.sep_style == conversation_lib.SeparatorStyle.TWO
144
+
145
+ # Mask targets
146
+ sep = conv.sep + conv.roles[1] + ": "
147
+ for conversation, target in zip(conversations, targets):
148
+ total_len = int(target.ne(tokenizer.pad_token_id).sum())
149
+
150
+ rounds = conversation.split(conv.sep2)
151
+ cur_len = 1
152
+ target[:cur_len] = IGNORE_INDEX
153
+ for i, rou in enumerate(rounds):
154
+ if rou == "":
155
+ break
156
+
157
+ parts = rou.split(sep)
158
+ if len(parts) != 2:
159
+ break
160
+ parts[0] += sep
161
+
162
+ if has_speech:
163
+ round_len = len(tokenizer_speech_token(rou, tokenizer))
164
+ instruction_len = len(tokenizer_speech_token(parts[0], tokenizer)) - 2
165
+ else:
166
+ round_len = len(tokenizer(rou).input_ids)
167
+ instruction_len = len(tokenizer(parts[0]).input_ids) - 2
168
+
169
+ # FIXME: tokenizer bug
170
+ if i != 0 and not tokenizer.legacy and IS_TOKENIZER_GREATER_THAN_0_14:
171
+ round_len -= 1
172
+ instruction_len -= 1
173
+
174
+ target[cur_len : cur_len + instruction_len] = IGNORE_INDEX
175
+
176
+ cur_len += round_len
177
+ target[cur_len:] = IGNORE_INDEX
178
+
179
+ if cur_len < tokenizer.model_max_length:
180
+ if cur_len != total_len:
181
+ target[:] = IGNORE_INDEX
182
+ print(
183
+ f"WARNING: tokenization mismatch: {cur_len} vs. {total_len}."
184
+ f" (ignored)"
185
+ )
186
+ print(f"Debug - Conversation: {conversation[:200]}...")
187
+ print(f"Debug - Target shape: {target.shape}")
188
+ print(f"Debug - All labels are IGNORE_INDEX: {(target == IGNORE_INDEX).all().item()}")
189
+
190
+ return dict(
191
+ input_ids=input_ids,
192
+ labels=targets,
193
+ )
194
+
195
+
196
+ def preprocess_plain(
197
+ sources: Sequence[str],
198
+ tokenizer: transformers.PreTrainedTokenizer,
199
+ ) -> Dict:
200
+ # add end signal and concatenate together
201
+ conversations = []
202
+ for source in sources:
203
+ assert len(source) == 2
204
+ assert DEFAULT_SPEECH_TOKEN in source[0]['value']
205
+ source[0]['value'] = DEFAULT_SPEECH_TOKEN
206
+ conversation = source[0]['value'] + source[1]['value'] + conversation_lib.default_conversation.sep
207
+ conversations.append(conversation)
208
+ # tokenize conversations
209
+ input_ids = [tokenizer_speech_token(prompt, tokenizer, return_tensors='pt') for prompt in conversations]
210
+ targets = copy.deepcopy(input_ids)
211
+ for target, source in zip(targets, sources):
212
+ tokenized_len = len(tokenizer_speech_token(source[0]['value'], tokenizer))
213
+ target[:tokenized_len] = IGNORE_INDEX
214
+
215
+ return dict(input_ids=input_ids, labels=targets)
216
+
217
+
218
+ def preprocess(
219
+ sources: Sequence[str],
220
+ tokenizer: transformers.PreTrainedTokenizer,
221
+ has_speech: bool = False
222
+ ) -> Dict:
223
+ """
224
+ Given a list of sources, each is a conversation list. This transform:
225
+ 1. Add signal '### ' at the beginning each sentence, with end signal '\n';
226
+ 2. Concatenate conversations together;
227
+ 3. Tokenize the concatenated conversation;
228
+ 4. Make a deepcopy as the target. Mask human words with IGNORE_INDEX.
229
+ """
230
+ if conversation_lib.default_conversation.sep_style == conversation_lib.SeparatorStyle.PLAIN:
231
+ return preprocess_plain(sources, tokenizer)
232
+ if conversation_lib.default_conversation.version.startswith("v1"):
233
+ return preprocess_v1(sources, tokenizer, has_speech=has_speech)
234
+ raise NotImplementedError
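A small sketch of the splice performed by `tokenizer_image_token` above (illustrative; the tokenizer path is a placeholder):

```python
# Sketch: '<image>' is never tokenized as text; it becomes the IMAGE_TOKEN_INDEX (-300)
# sentinel, which the model later swaps for visual features.
from transformers import AutoTokenizer
from ola.constants import IMAGE_TOKEN_INDEX
from ola.datasets.preprocess import tokenizer_image_token

tok = AutoTokenizer.from_pretrained("path/to/base/tokenizer")  # placeholder path
ids = tokenizer_image_token("Describe <image> briefly.", tok, IMAGE_TOKEN_INDEX)
assert IMAGE_TOKEN_INDEX in ids  # ordinary token ids surround a single -300 sentinel
```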
ola/mm_utils.py ADDED
@@ -0,0 +1,271 @@
1
+ from PIL import Image
2
+ import base64
3
+ import math
4
+ import ast
5
+
6
+ import torch
7
+ from transformers import StoppingCriteria
8
+ import os
9
+ import io
10
+
11
+ if 'VIDEO_RESIZE' in os.environ:
12
+ # highresxpatch
13
+ VIDEO_RESIZE = os.environ['VIDEO_RESIZE']
14
+ video_base, video_ps = VIDEO_RESIZE.split('x')
15
+ video_base = int(video_base)
16
+ video_ps = int(video_ps)
17
+ print(f"VIDEO_RESIZE is set as {VIDEO_RESIZE}, {video_base}, {video_ps}")
18
+ else:
19
+ VIDEO_RESIZE = None
20
+
21
+ if 'HIGHRES_BASE' in os.environ:
22
+ # highresxpatch
23
+ HIGHRES_BASE = os.environ['HIGHRES_BASE']
24
+ highres_base, highres_ps = HIGHRES_BASE.split('x')
25
+ highres_base = int(highres_base)
26
+ highres_ps = int(highres_ps)
27
+ print(f"HIGHRES_BASE is set as {HIGHRES_BASE}, {highres_base}, {highres_ps}")
28
+ else:
29
+ HIGHRES_BASE = None
30
+
31
+ if 'MAXRES' in os.environ:
32
+ # highresxpatch
33
+ MAXRES = int(os.environ['MAXRES'])
34
+ print(f"MAXRES is set as {MAXRES}")
35
+ else:
36
+ MAXRES = 1536
37
+
38
+ if 'MINRES' in os.environ:
39
+ # highresxpatch
40
+ MINRES = int(os.environ['MINRES'])
41
+ print(f"MINRES is set as {MINRES}")
42
+ else:
43
+ MINRES = 0
44
+
45
+ if 'VIDEO_MAXRES' in os.environ:
46
+ # highresxpatch
47
+ VIDEO_MAXRES = int(os.environ['VIDEO_MAXRES'])
48
+ print(f"VIDEO_MAXRES is set as {VIDEO_MAXRES}")
49
+ else:
50
+ VIDEO_MAXRES = 1536
51
+
52
+ if 'VIDEO_MINRES' in os.environ:
53
+ # highresxpatch
54
+ VIDEO_MINRES = int(os.environ['VIDEO_MINRES'])
55
+ print(f"VIDEO_MINRES is set as {VIDEO_MINRES}")
56
+ else:
57
+ VIDEO_MINRES = 0
58
+
59
+ if 'PAD2STRIDE' in os.environ:
60
+ # highresxpatch
61
+ PAD2STRIDE = True
62
+ print(f"PAD2STRIDE is set")
63
+ else:
64
+ PAD2STRIDE = False
65
+
66
+ if 'LOWRES_RESIZE' in os.environ:
67
+ LOWRES_RESIZE = os.environ['LOWRES_RESIZE']
68
+ print(f"LOWRES_RESIZE is set as {LOWRES_RESIZE}")
69
+ if 'x' in LOWRES_RESIZE:
70
+ size, ps = LOWRES_RESIZE.split('x')
71
+ size = int(size)
72
+ ps = int(ps)
73
+ LOWRES_RESIZE = (size, ps)
74
+ else:
75
+ LOWRES_RESIZE = int(LOWRES_RESIZE)
76
+ else:
77
+ LOWRES_RESIZE = None
78
+
79
+
80
+ def pad_image(image, target_resolution, value=0):
81
+ """
82
+ Resize and pad an image to a target resolution while maintaining aspect ratio.
83
+
84
+ Args:
85
+ image (PIL.Image.Image): The input image.
86
+ target_resolution (tuple): The target resolution (width, height) of the image.
87
+
88
+ Returns:
89
+ PIL.Image.Image: The resized and padded image.
90
+ """
91
+ original_width, original_height = image.size
92
+ target_width, target_height = target_resolution
93
+ # Create a new image with the target size and paste the resized image onto it
94
+ new_image = Image.new('RGB', (target_width, target_height), (value, value, value))
95
+ paste_x = (target_width - original_width) // 2
96
+ paste_y = (target_height - original_height) // 2
97
+ new_image.paste(image, (paste_x, paste_y))
98
+ return new_image
99
+
100
+ def resize_images(image, patch_size=14, base_size=896):
101
+ h, w = image.size
102
+ if base_size == 0:
103
+ if h * w > MAXRES * MAXRES:
104
+ # print(f'{h}x{w} larger than max size {MAXRES}, resize to {MAXRES}')
105
+ scale = MAXRES * MAXRES / (h * w)
106
+ scale = math.sqrt(scale)
107
+ elif h * w < MINRES * MINRES:
108
+ # print(f'{h}x{w} smaller than max size {MINRES}, resize to {MINRES}')
109
+ scale = MINRES * MINRES / (h * w)
110
+ scale = math.sqrt(scale)
111
+ else:
112
+ scale = None
113
+ else:
114
+ scale = base_size * base_size / (h * w)
115
+ scale = math.sqrt(scale)
116
+
117
+
118
+ if scale is not None:
119
+ new_h = int(h * scale / patch_size) * patch_size
120
+ new_w = int(w * scale / patch_size) * patch_size
121
+ new_h = max(new_h, patch_size)
122
+ new_w = max(new_w, patch_size)
123
+ image = image.resize((new_h, new_w))
124
+ elif PAD2STRIDE:
125
+ if h % patch_size == 0:
126
+ new_h = h
127
+ else:
128
+ new_h = (h // patch_size + 1) * patch_size
129
+
130
+ if w % patch_size == 0:
131
+ new_w = w
132
+ else:
133
+ new_w = (w // patch_size + 1) * patch_size
134
+ image = pad_image(image, (new_h, new_w), value=127)
135
+ else:
136
+ scale = 1.0
137
+ new_h = int(h * scale / patch_size) * patch_size
138
+ new_w = int(w * scale / patch_size) * patch_size
139
+ new_h = max(new_h, patch_size)
140
+ new_w = max(new_w, patch_size)
141
+ image = image.resize((new_h, new_w))
142
+
143
+ return image
144
+
145
+ def resize_video(image, patch_size=14, base_size=896):
146
+ h, w = image.size
147
+ if base_size == 0:
148
+ if h * w > VIDEO_MAXRES * VIDEO_MAXRES:
149
+ # print(f'{h}x{w} larger than max size {MAXRES}, resize to {MAXRES}')
150
+ scale = VIDEO_MAXRES * VIDEO_MAXRES / (h * w)
151
+ scale = math.sqrt(scale)
152
+ elif h * w < VIDEO_MINRES * VIDEO_MINRES:
153
+ # print(f'{h}x{w} smaller than max size {MINRES}, resize to {MINRES}')
154
+ scale = VIDEO_MINRES * VIDEO_MINRES / (h * w)
155
+ scale = math.sqrt(scale)
156
+ else:
157
+ scale = None
158
+ else:
159
+ scale = base_size * base_size / (h * w)
160
+ scale = math.sqrt(scale)
161
+
162
+ if scale is not None:
163
+ new_h = int(h * scale / patch_size) * patch_size
164
+ new_w = int(w * scale / patch_size) * patch_size
165
+ image = image.resize((new_h, new_w))
166
+ elif PAD2STRIDE:
167
+ if h % patch_size == 0:
168
+ new_h = h
169
+ else:
170
+ new_h = (h // patch_size + 1) * patch_size
171
+
172
+ if w % patch_size == 0:
173
+ new_w = w
174
+ else:
175
+ new_w = (w // patch_size + 1) * patch_size
176
+ image = pad_image(image, (new_h, new_w), value=127)
177
+ else:
178
+ scale = 1.0
179
+ new_h = int(h * scale / patch_size) * patch_size
180
+ new_w = int(w * scale / patch_size) * patch_size
181
+ image = image.resize((new_h, new_w))
182
+
183
+ return image
184
+
185
+ def process_anyres_video(image, processor):
186
+ if VIDEO_RESIZE is not None:
187
+ image = resize_video(image, patch_size=video_ps, base_size=video_base)
188
+ image = processor.preprocess(image, return_tensors='pt')['pixel_values'][0]
189
+ return image.unsqueeze(0)
190
+ else:
191
+ raise ValueError("VIDEO_RESIZE is not set")
192
+
193
+ def process_anyres_highres_image(image, processor):
194
+ processor2 = None
195
+ if type(processor) is tuple:
196
+ processor, processor2 = processor[0], processor[1]
197
+
198
+ if HIGHRES_BASE is not None:
199
+ image = resize_images(image, patch_size=highres_ps, base_size=highres_base)
200
+
201
+ if processor2 is not None:
202
+ image_original_resize = image.resize((processor2.size['shortest_edge'], processor.size['shortest_edge']))
203
+ image_patches = [image_original_resize] + [image_original_resize]
204
+ image_patches = [processor2.preprocess(image_patch, return_tensors='pt')['pixel_values'][0]
205
+ for image_patch in image_patches]
206
+ else:
207
+ if LOWRES_RESIZE is not None:
208
+ if type(LOWRES_RESIZE) is int:
209
+ image_original_resize = resize_images(image, patch_size=14, base_size=LOWRES_RESIZE)
210
+ else:
211
+ image_original_resize = resize_images(image, patch_size=LOWRES_RESIZE[1], base_size=LOWRES_RESIZE[0])
212
+ else:
213
+ image_original_resize = image.resize((336, 336))
214
+ image_patches = [image_original_resize]
215
+ image_patches = [processor.preprocess(image_patch, return_tensors='pt')['pixel_values'][0]
216
+ for image_patch in image_patches]
217
+ image_padded = processor.preprocess(image, return_tensors='pt')['pixel_values'][0]
218
+ return torch.stack(image_patches, dim=0), image_padded.unsqueeze(0)
219
+
220
+ def read_image_patch(patch_info):
221
+ if 'img_path' in patch_info.keys():
222
+ image = Image.open(patch_info['img_path']).convert('RGB')
223
+ else:
224
+ if 'image_encoing' in patch_info.keys():
225
+ patch_info['image_encoding'] = patch_info['image_encoing']
226
+ image_file_name = patch_info['patch']
227
+ start_bytes = int(patch_info['start_num'])
228
+ file_size = int(patch_info['size'])
229
+
230
+ with open(image_file_name, 'rb') as f:
231
+ f.seek(start_bytes)
232
+ if 'image_encoding' in patch_info.keys() and patch_info['image_encoding'] == 'base64':
233
+ image = Image.open(io.BytesIO(base64.b64decode(f.read(file_size).decode()))).convert("RGB")
234
+ else:
235
+ image = Image.open(io.BytesIO(f.read(file_size))).convert("RGB")
236
+ return image
237
+
238
+
239
+ def get_model_name_from_path(model_path):
240
+ model_path = model_path.strip("/")
241
+ model_paths = model_path.split("/")
242
+ if model_paths[-1].startswith('checkpoint-'):
243
+ return model_paths[-2] + "_" + model_paths[-1]
244
+ else:
245
+ return model_paths[-1]
246
+
247
+
248
+ class KeywordsStoppingCriteria(StoppingCriteria):
249
+ def __init__(self, keywords, tokenizer, input_ids):
250
+ self.keywords = keywords
251
+ self.keyword_ids = []
252
+ for keyword in keywords:
253
+ cur_keyword_ids = tokenizer(keyword).input_ids
254
+ if len(cur_keyword_ids) > 1 and cur_keyword_ids[0] == tokenizer.bos_token_id:
255
+ cur_keyword_ids = cur_keyword_ids[1:]
256
+ self.keyword_ids.append(torch.tensor(cur_keyword_ids))
257
+ self.tokenizer = tokenizer
258
+ self.start_len = input_ids.shape[1]
259
+
260
+ def __call__(self, output_ids: torch.LongTensor, scores: torch.FloatTensor, **kwargs) -> bool:
261
+ assert output_ids.shape[0] == 1, "Only support batch size 1 (yet)" # TODO
262
+ offset = min(output_ids.shape[1] - self.start_len, 3)
263
+ self.keyword_ids = [keyword_id.to(output_ids.device) for keyword_id in self.keyword_ids]
264
+ for keyword_id in self.keyword_ids:
265
+ if (output_ids[0, -keyword_id.shape[0]:] == keyword_id).all():
266
+ return True
267
+ outputs = self.tokenizer.batch_decode(output_ids[:, -offset:], skip_special_tokens=True)[0]
268
+ for keyword in self.keywords:
269
+ if keyword in outputs:
270
+ return True
271
+ return False
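A small sketch of the patch-aligned resizing above (illustrative, not part of the repository file; the 448/16 numbers are arbitrary):

```python
# Sketch: resize_images rescales an image so that width*height is roughly base_size**2,
# then snaps both sides down to multiples of patch_size.
from PIL import Image
from ola.mm_utils import resize_images

img = Image.new("RGB", (1280, 720))
out = resize_images(img, patch_size=16, base_size=448)
w, h = out.size
assert w % 16 == 0 and h % 16 == 0  # e.g. (592, 336) for this input
print(out.size)
```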
ola/model/__init__.py ADDED
@@ -0,0 +1,2 @@
 
1
+ from .language_model.ola_qwen import OlaQwenForCausalLM, OlaConfigQwen
2
+ from .language_model.ola_qwen3 import OlaQwen3ForCausalLM, OlaConfigQwen3
ola/model/__pycache__/__init__.cpython-312.pyc ADDED
Binary file (338 Bytes). View file
 
ola/model/__pycache__/builder.cpython-312.pyc ADDED
Binary file (5.67 kB). View file
 
ola/model/__pycache__/ola_arch.cpython-312.pyc ADDED
Binary file (36.1 kB). View file
 
ola/model/builder.py ADDED
@@ -0,0 +1,97 @@
1
+ import os
2
+ import warnings
3
+ import shutil
4
+
5
+ from transformers import AutoTokenizer, AutoModelForCausalLM, AutoConfig, BitsAndBytesConfig, AutoProcessor
6
+ import torch
7
+ from ola.model import *
8
+ from ola.model.speech_encoder.builder import build_speech_encoder
9
+
10
+ # Suppress PyTorch's warning about copying a non-meta parameter in the checkpoint to a meta parameter
11
+ warnings.filterwarnings("ignore", message=".*copying from a non-meta parameter in the checkpoint to a meta parameter.*")
12
+
13
+ def load_pretrained_model(model_path, model_type, model_base, is_lora=False, s2s=False, load_8bit=False, load_4bit=False, device="cuda", use_flash_attn=False, **kwargs):
14
+ device = "cuda"
15
+ if load_8bit:
16
+ kwargs['load_in_8bit'] = True
17
+ elif load_4bit:
18
+ kwargs['load_in_4bit'] = True
19
+ kwargs['quantization_config'] = BitsAndBytesConfig(
20
+ load_in_4bit=True,
21
+ bnb_4bit_compute_dtype=torch.float16,
22
+ bnb_4bit_use_double_quant=True,
23
+ bnb_4bit_quant_type='nf4'
24
+ )
25
+ else:
26
+ kwargs['torch_dtype'] = torch.bfloat16
27
+
28
+ if use_flash_attn:
29
+ kwargs['attn_implementation'] = 'flash_attention_2'
30
+
31
+ if model_type == 'ola_internvl':
32
+ model_cls = OlaQwen3ForCausalLM
33
+ print('Loading OlaQwen3ForCausalLM model...')
34
+ else:
35
+ model_cls = OlaQwenForCausalLM
36
+
37
+ # Load Ola model
38
+ if is_lora:
39
+ assert model_base is not None, "model_base is required for LoRA models."
40
+ from ola.model.language_model.ola_qwen import OlaConfigQwen
41
+ lora_cfg_pretrained = OlaConfigQwen.from_pretrained(model_path)
42
+ tokenizer = AutoTokenizer.from_pretrained(model_base, use_fast=False)
43
+ print('Loading Ola from base model...')
44
+ model = model_cls.from_pretrained(model_base, low_cpu_mem_usage=False, config=lora_cfg_pretrained, **kwargs)
45
+ print('Loading additional Ola weights...')
46
+ if os.path.exists(os.path.join(model_path, 'non_lora_trainables.bin')):
47
+ non_lora_trainables = torch.load(os.path.join(model_path, 'non_lora_trainables.bin'), map_location='cpu')
48
+ non_lora_trainables = {(k[11:] if k.startswith('base_model.') else k): v for k, v in non_lora_trainables.items()}
49
+ if any(k.startswith('model.model.') for k in non_lora_trainables):
50
+ non_lora_trainables = {(k[6:] if k.startswith('model.') else k): v for k, v in non_lora_trainables.items()}
51
+ model.load_state_dict(non_lora_trainables, strict=False, assign=True)
52
+
53
+ from peft import PeftModel
54
+ print('Loading LoRA weights...')
55
+ model = PeftModel.from_pretrained(model, model_path)
56
+ print('Merging LoRA weights...')
57
+ model = model.merge_and_unload()
58
+ print('Model is loaded...')
59
+ elif model_base is not None:
60
+ print('Loading Ola from base model...')
61
+ tokenizer = AutoTokenizer.from_pretrained(model_base, use_fast=False)
62
+ cfg_pretrained = AutoConfig.from_pretrained(model_path)
63
+ model = model_cls.from_pretrained(model_base, low_cpu_mem_usage=False, config=cfg_pretrained, **kwargs)
64
+
65
+ speech_projector_weights = torch.load(os.path.join(model_path, 'speech_projector.bin'), map_location='cpu')
66
+ speech_projector_weights = {k: v.to(torch.float16) for k, v in speech_projector_weights.items()}
67
+ model.load_state_dict(speech_projector_weights, strict=False, assign=True)
68
+ model = model.to(device=device)
69
+ else:
70
+ # model_path = "/data1/cxy/plm-v/modeling/plm_internvl3_5_ola"
71
+ model_path = "/data1/cxy/plm-v/modeling/ckpt/ola_audio_8_8gpu/checkpoint-120"
72
+ tokenizer_path = "/data1/cxy/plm-v/modeling/internvl3_5-2B"
73
+ tokenizer = AutoTokenizer.from_pretrained(tokenizer_path, use_fast=False, trust_remote_code=True)
74
+ cfg = AutoConfig.from_pretrained(model_path, trust_remote_code=True)
75
+ with torch.device("cuda"):
76
+ model = model_cls.from_pretrained(
77
+ model_path,
78
+ trust_remote_code=True,
79
+ config=cfg,
80
+ # device_map="auto",
81
+ **kwargs,
82
+ )
83
+ model = model.to(device=device)
84
+ # breakpoint()
85
+ image_processor = None
86
+ model.resize_token_embeddings(len(tokenizer))
87
+ # breakpoint()
88
+ print("Loading vision tower...")
89
+ print("Loading vision tower succeeded.")
90
+
91
+ if hasattr(model.config, "max_sequence_length"):
92
+ context_len = model.config.max_sequence_length
93
+ else:
94
+ context_len = 16384
95
+ image_processor = AutoProcessor.from_pretrained("/data1/cxy/plm-v/modeling/internvl3_5-2B-HF")
96
+
97
+ return tokenizer, model, image_processor, context_len
ola/model/builder_back.py ADDED
@@ -0,0 +1,294 @@
1
+ import os
2
+ import warnings
3
+ import shutil
4
+
5
+ from transformers import AutoTokenizer, AutoModelForCausalLM, AutoConfig, BitsAndBytesConfig, AutoProcessor
6
+ import torch
7
+ from ola.model import *
8
+ from ola.model.speech_encoder.builder import build_speech_encoder
9
+
10
+ def load_pretrained_model(model_path, model_type, model_base, is_lora=False, s2s=False, load_8bit=False, load_4bit=False, device="cuda", use_flash_attn=False, **kwargs):
11
+ device = "cuda"
12
+ if load_8bit:
13
+ kwargs['load_in_8bit'] = True
14
+ elif load_4bit:
15
+ kwargs['load_in_4bit'] = True
16
+ kwargs['quantization_config'] = BitsAndBytesConfig(
17
+ load_in_4bit=True,
18
+ bnb_4bit_compute_dtype=torch.float16,
19
+ bnb_4bit_use_double_quant=True,
20
+ bnb_4bit_quant_type='nf4'
21
+ )
22
+ else:
23
+ kwargs['torch_dtype'] = torch.bfloat16
24
+
25
+ if use_flash_attn:
26
+ kwargs['attn_implementation'] = 'flash_attention_2'
27
+
28
+ if model_type == 'ola_internvl':
29
+ model_cls = OlaQwen3ForCausalLM
30
+ print('Loading OlaQwen3ForCausalLM model...')
31
+ else:
32
+ model_cls = OlaQwenForCausalLM
33
+
34
+ # Load Ola model
35
+ if is_lora:
36
+ assert model_base is not None, "model_base is required for LoRA models."
37
+ from ola.model.language_model.ola_qwen import OlaConfigQwen
38
+ lora_cfg_pretrained = OlaConfigQwen.from_pretrained(model_path)
39
+ tokenizer = AutoTokenizer.from_pretrained(model_base, use_fast=False)
40
+ print('Loading Ola from base model...')
41
+ model = model_cls.from_pretrained(model_base, low_cpu_mem_usage=False, config=lora_cfg_pretrained, **kwargs)
42
+ print('Loading additional Ola weights...')
43
+ if os.path.exists(os.path.join(model_path, 'non_lora_trainables.bin')):
44
+ non_lora_trainables = torch.load(os.path.join(model_path, 'non_lora_trainables.bin'), map_location='cpu')
45
+ non_lora_trainables = {(k[11:] if k.startswith('base_model.') else k): v for k, v in non_lora_trainables.items()}
46
+ if any(k.startswith('model.model.') for k in non_lora_trainables):
47
+ non_lora_trainables = {(k[6:] if k.startswith('model.') else k): v for k, v in non_lora_trainables.items()}
48
+ model.load_state_dict(non_lora_trainables, strict=False, assign=True)
49
+
50
+ from peft import PeftModel
51
+ print('Loading LoRA weights...')
52
+ model = PeftModel.from_pretrained(model, model_path)
53
+ print('Merging LoRA weights...')
54
+ model = model.merge_and_unload()
55
+ print('Model is loaded...')
56
+ elif model_base is not None:
57
+ print('Loading Ola from base model...')
58
+ tokenizer = AutoTokenizer.from_pretrained(model_base, use_fast=False)
59
+ cfg_pretrained = AutoConfig.from_pretrained(model_path)
60
+ model = model_cls.from_pretrained(model_base, low_cpu_mem_usage=False, config=cfg_pretrained, **kwargs)
61
+
62
+ speech_projector_weights = torch.load(os.path.join(model_path, 'speech_projector.bin'), map_location='cpu')
63
+ speech_projector_weights = {k: v.to(torch.float16) for k, v in speech_projector_weights.items()}
64
+ model.load_state_dict(speech_projector_weights, strict=False, assign=True)
65
+ model = model.to(device=device)
66
+ elif model_type == 'ola_internvl':
67
+ cfg = AutoConfig.from_pretrained("/data1/cxy/plm-v/modeling/old_ola", trust_remote_code=True)
68
+ # breakpoint()
69
+ tokenizer = AutoTokenizer.from_pretrained("/data1/cxy/plm-v/modeling/internvl3_5-2B", use_fast=False)
70
+ with torch.device("cpu"):
71
+ # model = model_cls.from_pretrained("/data1/cxy/plm-v/modeling/internvl3_5-2B", low_cpu_mem_usage=False, attn_implementation="eager", config=cfg, **kwargs)
72
+ # model = model_cls.from_config(config=cfg)
73
+ model = model_cls(cfg)
74
+ # breakpoint()
75
+ # model.model.layers[1].self_attn.q_proj.weight
76
+ else:
77
+ tokenizer = AutoTokenizer.from_pretrained(model_path, use_fast=False)
78
+ with torch.device("cpu"):
79
+ model = model_cls.from_pretrained(
80
+ model_path,
81
+ **kwargs,
82
+ )
83
+ model = model.to(device=device)
84
+ # model.resize_token_embeddings(len(tokenizer))
85
+ from safetensors.torch import load_file
86
+ partial_state_dict = load_file(f"/data1/cxy/plm-v/modeling/internvl3_5-2B/model.safetensors") # replace with your partial checkpoint path
87
+ mapping = {
88
+ "mlp1.0.weight": "model.mm_projector.layer_norm.weight",
89
+ "mlp1.0.bias": "model.mm_projector.layer_norm.bias",
90
+ "mlp1.1.weight": "model.mm_projector.linear_1.weight",
91
+ "mlp1.1.bias": "model.mm_projector.linear_1.bias",
92
+ "mlp1.3.weight": "model.mm_projector.linear_2.weight",
93
+ "mlp1.3.bias": "model.mm_projector.linear_2.bias",
94
+ }
95
+
96
+ # 遍历 state_dict 并重命名
97
+ def remap_keys(state_dict, mapping):
98
+ new_state_dict = {}
99
+ for k, v in state_dict.items():
100
+ if k in mapping:
101
+ new_state_dict[mapping[k]] = v
102
+ else:
103
+ new_state_dict[k] = v
104
+ return new_state_dict
105
+ # merged_state_dict = {**partial_state_dict, **partial_state_dict2}
106
+ # 2. 重命名 key:multi_modal_projector -> mm_projector
107
+ # breakpoint()
108
+ rename_dict = {}
109
+ for k in list(partial_state_dict.keys()):
110
+ if k.startswith("language_model"):
111
+ new_k = k.replace("language_model.", "", 1)
112
+ rename_dict[k] = new_k
113
+ if k.startswith("vision_model"):
114
+ new_k = k.replace("vision_model", "model.vision_tower", 1)
115
+ rename_dict[k] = new_k
116
+
117
+ # 应用重命名
118
+ for old_k, new_k in rename_dict.items():
119
+ partial_state_dict[new_k] = partial_state_dict.pop(old_k)
120
+ partial_state_dict = remap_keys(partial_state_dict, mapping)
121
+
122
+ whisper_state_dict = torch.load("/data1/cxy/model/THUdyh/Ola-7b/large-v3.pt", map_location='cpu')
123
+ # breakpoint()
124
+ whisper_state_dict = whisper_state_dict["model_state_dict"]
125
+
126
+ # Filter to keep only encoder weights
127
+ whisper_encoder_dict = {}
128
+ for key, value in whisper_state_dict.items():
129
+ if key.startswith('encoder.'):
130
+ whisper_encoder_dict[key] = value
131
+
132
+ print(f"Original Whisper keys: {len(whisper_state_dict)}")
133
+ print(f"Filtered encoder keys: {len(whisper_encoder_dict)}")
134
+ print("Sample encoder keys:")
135
+ for i, key in enumerate(list(whisper_encoder_dict.keys())[:5]):
136
+ print(f" {key}")
137
+
138
+ # Create mapping for Whisper parameters to OLA format
139
+ def create_whisper_mapping():
140
+ mapping = {}
141
+
142
+ # Base encoder components
143
+ base_mappings = {
144
+ 'encoder.positional_embedding': 'model.speech_encoder.whisper_model.positional_embedding',
145
+ 'encoder.conv1.weight': 'model.speech_encoder.whisper_model.conv1.weight',
146
+ 'encoder.conv1.bias': 'model.speech_encoder.whisper_model.conv1.bias',
147
+ 'encoder.conv2.weight': 'model.speech_encoder.whisper_model.conv2.weight',
148
+ 'encoder.conv2.bias': 'model.speech_encoder.whisper_model.conv2.bias',
149
+ 'encoder.ln_post.weight': 'model.speech_encoder.whisper_model.ln_post.weight',
150
+ 'encoder.ln_post.bias': 'model.speech_encoder.whisper_model.ln_post.bias',
151
+ }
152
+ mapping.update(base_mappings)
153
+
154
+ # Encoder blocks (32 blocks: 0-31)
155
+ for block_idx in range(32):
156
+ # Attention components
157
+ attn_components = [
158
+ 'attn.query.weight', 'attn.query.bias',
159
+ 'attn.key.weight', 'attn.key.bias',
160
+ 'attn.value.weight', 'attn.value.bias',
161
+ 'attn.out.weight', 'attn.out.bias',
162
+ 'attn_ln.weight', 'attn_ln.bias'
163
+ ]
164
+
165
+ for component in attn_components:
166
+ source_key = f'encoder.blocks.{block_idx}.{component}'
167
+ target_key = f'model.speech_encoder.whisper_model.blocks.{block_idx}.{component}'
168
+ mapping[source_key] = target_key
169
+
170
+ # MLP components
171
+ mlp_components = [
172
+ 'mlp.0.weight', 'mlp.0.bias',
173
+ 'mlp.2.weight', 'mlp.2.bias',
174
+ 'mlp_ln.weight', 'mlp_ln.bias'
175
+ ]
176
+
177
+ for component in mlp_components:
178
+ source_key = f'encoder.blocks.{block_idx}.{component}'
179
+ target_key = f'model.speech_encoder.whisper_model.blocks.{block_idx}.{component}'
180
+ mapping[source_key] = target_key
181
+
182
+ return mapping
183
+
184
+ # Apply mapping to whisper_encoder_dict
185
+ whisper_mapping = create_whisper_mapping()
186
+ mapped_whisper_dict = {}
187
+ unmapped_whisper_keys = []
188
+
189
+ for key, value in whisper_encoder_dict.items():
190
+ if key in whisper_mapping:
191
+ mapped_key = whisper_mapping[key]
192
+ mapped_whisper_dict[mapped_key] = value
193
+ else:
194
+ unmapped_whisper_keys.append(key)
195
+ print(f"Warning: No mapping found for Whisper encoder key '{key}'")
196
+
197
+ if unmapped_whisper_keys:
198
+ print(f"Total unmapped Whisper encoder keys: {len(unmapped_whisper_keys)}")
199
+ print("First 10 unmapped Whisper encoder keys:")
200
+ for key in unmapped_whisper_keys[:10]:
201
+ print(f" {key}")
202
+
203
+ print(f"Successfully mapped {len(mapped_whisper_dict)} encoder parameters")
204
+
205
+ beat_state_dict = torch.load("/data1/cxy/model/THUdyh/Ola-7b//BEATs_iter3_plus_AS2M_finetuned_on_AS2M_cpt2.pt", map_location='cpu')
206
+ beat_state_dict = beat_state_dict['model']
207
+ beat_state_dict = {"model.speech_encoder.beats_model."+k: v for k, v in beat_state_dict.items()}
208
+
209
+ # 处理 BEATs 模型中的参数化权重映射 (先pop后添加)
210
+ keys_to_process = list(beat_state_dict.keys())
211
+ breakpoint()
212
+ processed_count = 0
213
+
214
+ # for key in keys_to_process:
215
+ # if 'weight_g' in key:
216
+ # # pop 原始权重并添加为 weight_g
217
+ # weight_tensor = beat_state_dict.pop(key)
218
+ # new_key = key.replace('weight_g','parametrizations.weight.original0')
219
+ # beat_state_dict[new_key] = weight_tensor
220
+ # processed_count += 1
221
+ # elif 'weight_v' in key:
222
+ # # pop 原始权重并添加为 weight_v
223
+ # weight_tensor = beat_state_dict.pop(key)
224
+ # new_key = key.replace('weight_v', 'parametrizations.weight.original1')
225
+ # beat_state_dict[new_key] = weight_tensor
226
+ # processed_count += 1
227
+
228
+ print(f"Processed {processed_count} parametrized weight keys in BEATs model (pop and add)")
229
+ breakpoint()
230
+ # breakpoint()
231
+ partial_state_dict = {**partial_state_dict, **mapped_whisper_dict, **beat_state_dict}
232
+
233
+ # Ensure all tensors in the state dict are on CPU and have proper device information
234
+ print("Moving all state dict tensors to CPU...")
235
+ for key, tensor in partial_state_dict.items():
236
+ if torch.is_tensor(tensor):
237
+ # Ensure tensor has device information and move to CPU
238
+ if not tensor.device.type:
239
+ print(f"Warning: Tensor {key} has no device, creating on CPU")
240
+ partial_state_dict[key] = torch.tensor(tensor.detach().numpy()).cpu()
241
+ else:
242
+ partial_state_dict[key] = tensor.cpu()
243
+
244
+ # Ensure model is on CPU before loading state dict to avoid device mismatches
245
+ print("Moving model to CPU before loading state dict...")
246
+ model = model.cpu()
247
+
248
+ print("Loading state dict...")
249
+ breakpoint()
250
+ missing, unexpected = model.load_state_dict(partial_state_dict, strict=False, assign=True)
251
+
252
+ print("Missing keys:", missing)
253
+ print("Unexpected keys:", unexpected)
254
+
255
+ # Convert model to bfloat16 before saving
256
+ print("Converting model to bfloat16...")
257
+ model = model.to(torch.bfloat16)
258
+ model = model.to("cpu")
259
+
260
+ # Save model in bfloat16 format
261
+ print("Saving model in bfloat16 format...")
262
+ model.save_pretrained("/data1/cxy/plm-v/modeling/plm_internvl3_ola", safe_serialization=False, torch_dtype=torch.bfloat16)
263
+ print("Model saved successfully in bfloat16 format!")
264
+ breakpoint()
265
+ # model.model.mm_projector.linear_1.weight:-0.0106 multi_modal_projector.linear_1.weight model.mm_projector.linear_2.bias
266
+ # model.vision_tower.encoder.layers.7.attn.proj.bias
267
+ # model.model.vision_tower.encoder.layers[0].attn.qkv.weight: -6.5613e-03 dui
268
+ #
269
+ # breakpoint()
270
+ # model.get_model().speech_encoder.load_model("")
271
+ # language_model.model.layers.9.mlp.up_proj.weight vision_model.encoder.layers
272
+ # model.layers.14.self_attn.q_proj.weight model.vision_tower.encoder.layers.23.attn.proj.bias
273
+ # model.get_model().speech_encoder = build_speech_encoder(model.config)
274
+ # model.get_model().speech_encoder.to(device=device, dtype=torch.float16)
275
+ image_processor = None
276
+ model.resize_token_embeddings(len(tokenizer))
277
+ vision_tower = model.get_vision_tower()
278
+ print("Loading vision tower...")
279
+ # if not vision_tower.is_loaded:
280
+ # vision_tower.load_model(device_map=device)
281
+ # if device != "auto":
282
+ # vision_tower.to(device="cuda", dtype=torch.bfloat16)
283
+ # else:
284
+ # vision_tower.to(device="cuda:0", dtype=torch.bfloat16)
285
+ # image_processor = vision_tower.image_processor
286
+ print("Loading vision tower succeeded.")
287
+
288
+ if hasattr(model.config, "max_sequence_length"):
289
+ context_len = model.config.max_sequence_length
290
+ else:
291
+ context_len = 16384
292
+ image_processor = AutoProcessor.from_pretrained("/data1/cxy/plm-v/modeling/internvl3_5-2B-HF")
293
+ # breakpoint()
294
+ return tokenizer, model, image_processor, context_len
ola/model/language_model/__pycache__/conversation.cpython-312.pyc ADDED
Binary file (14.8 kB). View file
 
ola/model/language_model/__pycache__/ola_qwen.cpython-312.pyc ADDED
Binary file (8.59 kB). View file
 
ola/model/language_model/__pycache__/ola_qwen3.cpython-312.pyc ADDED
Binary file (24.7 kB). View file
 
ola/model/language_model/conversation.py ADDED
@@ -0,0 +1,403 @@
1
+ """
2
+ Conversation prompt templates.
3
+
4
+ We kindly request that you import fastchat instead of copying this file if you wish to use it.
5
+ If you have changes in mind, please contribute back so the community can benefit collectively and continue to maintain these valuable templates.
6
+
7
+ Modified from https://github.com/lm-sys/FastChat/blob/main/fastchat/conversation.py
8
+ """
9
+
10
+ import dataclasses
11
+ from enum import IntEnum, auto
12
+ from typing import Dict, List, Tuple, Union
13
+
14
+
15
+ class SeparatorStyle(IntEnum):
16
+ """Separator styles."""
17
+
18
+ ADD_COLON_SINGLE = auto()
19
+ ADD_COLON_TWO = auto()
20
+ ADD_COLON_SPACE_SINGLE = auto()
21
+ NO_COLON_SINGLE = auto()
22
+ NO_COLON_TWO = auto()
23
+ ADD_NEW_LINE_SINGLE = auto()
24
+ LLAMA2 = auto()
25
+ CHATGLM = auto()
26
+ CHATML = auto()
27
+ CHATINTERN = auto()
28
+ DOLLY = auto()
29
+ RWKV = auto()
30
+ PHOENIX = auto()
31
+ ROBIN = auto()
32
+ FALCON_CHAT = auto()
33
+ CHATGLM3 = auto()
34
+ INTERNVL_ZH = auto()
35
+ MPT = auto()
36
+
37
+
38
+ @dataclasses.dataclass
39
+ class Conversation:
40
+ """A class that manages prompt templates and keeps all conversation history."""
41
+
42
+ # The name of this template
43
+ name: str
44
+ # The template of the system prompt
45
+ system_template: str = '{system_message}'
46
+ # The system message
47
+ system_message: str = ''
48
+ # The names of two roles
49
+ roles: Tuple[str] = ('USER', 'ASSISTANT')
50
+ # All messages. Each item is (role, message).
51
+ messages: List[List[str]] = ()
52
+ # The number of few shot examples
53
+ offset: int = 0
54
+ # The separator style and configurations
55
+ sep_style: SeparatorStyle = SeparatorStyle.ADD_COLON_SINGLE
56
+ sep: str = '\n'
57
+ sep2: str = None
58
+ # Stop criteria (the default one is EOS token)
59
+ stop_str: Union[str, List[str]] = None
60
+ # Stops generation if meeting any token in this list
61
+ stop_token_ids: List[int] = None
62
+
63
+ def get_prompt(self) -> str:
64
+ """Get the prompt for generation."""
65
+ system_prompt = self.system_template.format(system_message=self.system_message)
66
+ if self.sep_style == SeparatorStyle.ADD_COLON_SINGLE:
67
+ ret = system_prompt + self.sep
68
+ for role, message in self.messages:
69
+ if message:
70
+ ret += role + ': ' + message + self.sep
71
+ else:
72
+ ret += role + ':'
73
+ return ret
74
+ elif self.sep_style == SeparatorStyle.ADD_COLON_TWO:
75
+ seps = [self.sep, self.sep2]
76
+ ret = system_prompt + seps[0]
77
+ for i, (role, message) in enumerate(self.messages):
78
+ if message:
79
+ ret += role + ': ' + message + seps[i % 2]
80
+ else:
81
+ ret += role + ':'
82
+ return ret
83
+ elif self.sep_style == SeparatorStyle.ADD_COLON_SPACE_SINGLE:
84
+ ret = system_prompt + self.sep
85
+ for role, message in self.messages:
86
+ if message:
87
+ ret += role + ': ' + message + self.sep
88
+ else:
89
+ ret += role + ': ' # must be end with a space
90
+ return ret
91
+ elif self.sep_style == SeparatorStyle.ADD_NEW_LINE_SINGLE:
92
+ ret = '' if system_prompt == '' else system_prompt + self.sep
93
+ for role, message in self.messages:
94
+ if message:
95
+ ret += role + '\n' + message + self.sep
96
+ else:
97
+ ret += role + '\n'
98
+ return ret
99
+ elif self.sep_style == SeparatorStyle.NO_COLON_SINGLE:
100
+ ret = system_prompt
101
+ for role, message in self.messages:
102
+ if message:
103
+ ret += role + message + self.sep
104
+ else:
105
+ ret += role
106
+ return ret
107
+ elif self.sep_style == SeparatorStyle.NO_COLON_TWO:
108
+ seps = [self.sep, self.sep2]
109
+ ret = system_prompt
110
+ for i, (role, message) in enumerate(self.messages):
111
+ if message:
112
+ ret += role + message + seps[i % 2]
113
+ else:
114
+ ret += role
115
+ return ret
116
+ elif self.sep_style == SeparatorStyle.RWKV:
117
+ ret = system_prompt
118
+ for i, (role, message) in enumerate(self.messages):
119
+ if message:
120
+ ret += (
121
+ role
122
+ + ': '
123
+ + message.replace('\r\n', '\n').replace('\n\n', '\n')
124
+ )
125
+ ret += '\n\n'
126
+ else:
127
+ ret += role + ':'
128
+ return ret
129
+ elif self.sep_style == SeparatorStyle.LLAMA2:
130
+ seps = [self.sep, self.sep2]
131
+ if self.system_message:
132
+ ret = system_prompt
133
+ else:
134
+ ret = '[INST] '
135
+ for i, (role, message) in enumerate(self.messages):
136
+ tag = self.roles[i % 2]
137
+ if message:
138
+ if i == 0:
139
+ ret += message + ' '
140
+ else:
141
+ ret += tag + ' ' + message + seps[i % 2]
142
+ else:
143
+ ret += tag
144
+ return ret
145
+ elif self.sep_style == SeparatorStyle.CHATGLM:
146
+ # source: https://huggingface.co/THUDM/chatglm-6b/blob/1d240ba371910e9282298d4592532d7f0f3e9f3e/modeling_chatglm.py#L1302-L1308
147
+ # source2: https://huggingface.co/THUDM/chatglm2-6b/blob/e186c891cf64310ac66ef10a87e6635fa6c2a579/modeling_chatglm.py#L926
148
+ round_add_n = 1 if self.name == 'chatglm2' else 0
149
+ if system_prompt:
150
+ ret = system_prompt + self.sep
151
+ else:
152
+ ret = ''
153
+
154
+ for i, (role, message) in enumerate(self.messages):
155
+ if i % 2 == 0:
156
+ ret += f'[Round {i//2 + round_add_n}]{self.sep}'
157
+
158
+ if message:
159
+ ret += f'{role}:{message}{self.sep}'
160
+ else:
161
+ ret += f'{role}:'
162
+ return ret
163
+ elif self.sep_style == SeparatorStyle.CHATML:
164
+ ret = '' if system_prompt == '' else system_prompt + self.sep + '\n'
165
+ for role, message in self.messages:
166
+ if message:
167
+ ret += role + '\n' + message + self.sep + '\n'
168
+ else:
169
+ ret += role + '\n'
170
+ return ret
171
+ elif self.sep_style == SeparatorStyle.CHATGLM3:
172
+ ret = ''
173
+ if self.system_message:
174
+ ret += system_prompt
175
+ for role, message in self.messages:
176
+ if message:
177
+ ret += role + '\n' + ' ' + message
178
+ else:
179
+ ret += role
180
+ return ret
181
+ elif self.sep_style == SeparatorStyle.CHATINTERN:
182
+ # source: https://huggingface.co/internlm/internlm-chat-7b-8k/blob/bd546fa984b4b0b86958f56bf37f94aa75ab8831/modeling_internlm.py#L771
183
+ seps = [self.sep, self.sep2]
184
+ ret = system_prompt
185
+ for i, (role, message) in enumerate(self.messages):
186
+ # if i % 2 == 0:
187
+ # ret += "<s>"
188
+ if message:
189
+ ret += role + ':' + message + seps[i % 2] + '\n'
190
+ else:
191
+ ret += role + ':'
192
+ return ret
193
+ elif self.sep_style == SeparatorStyle.DOLLY:
194
+ seps = [self.sep, self.sep2]
195
+ ret = system_prompt
196
+ for i, (role, message) in enumerate(self.messages):
197
+ if message:
198
+ ret += role + ':\n' + message + seps[i % 2]
199
+ if i % 2 == 1:
200
+ ret += '\n\n'
201
+ else:
202
+ ret += role + ':\n'
203
+ return ret
204
+ elif self.sep_style == SeparatorStyle.PHOENIX:
205
+ ret = system_prompt
206
+ for role, message in self.messages:
207
+ if message:
208
+ ret += role + ': ' + '<s>' + message + '</s>'
209
+ else:
210
+ ret += role + ': ' + '<s>'
211
+ return ret
212
+ elif self.sep_style == SeparatorStyle.ROBIN:
213
+ ret = system_prompt + self.sep
214
+ for role, message in self.messages:
215
+ if message:
216
+ ret += role + ':\n' + message + self.sep
217
+ else:
218
+ ret += role + ':\n'
219
+ return ret
220
+ elif self.sep_style == SeparatorStyle.FALCON_CHAT:
221
+ ret = ''
222
+ if self.system_message:
223
+ ret += system_prompt + self.sep
224
+ for role, message in self.messages:
225
+ if message:
226
+ ret += role + ': ' + message + self.sep
227
+ else:
228
+ ret += role + ':'
229
+
230
+ return ret
231
+ elif self.sep_style == SeparatorStyle.INTERNVL_ZH:
232
+ seps = [self.sep, self.sep2]
233
+ ret = self.system_message + seps[0]
234
+ for i, (role, message) in enumerate(self.messages):
235
+ if message:
236
+ ret += role + ': ' + message + seps[i % 2]
237
+ else:
238
+ ret += role + ':'
239
+ return ret
240
+ elif self.sep_style == SeparatorStyle.MPT:
241
+ ret = system_prompt + self.sep
242
+ for role, message in self.messages:
243
+ if message:
244
+ if type(message) is tuple:
245
+ message, _, _ = message
246
+ ret += role + message + self.sep
247
+ else:
248
+ ret += role
249
+ return ret
250
+ else:
251
+ raise ValueError(f'Invalid style: {self.sep_style}')
252
+
253
+ def set_system_message(self, system_message: str):
254
+ """Set the system message."""
255
+ self.system_message = system_message
256
+
257
+ def append_message(self, role: str, message: str):
258
+ """Append a new message."""
259
+ self.messages.append([role, message])
260
+
261
+ def update_last_message(self, message: str):
262
+ """Update the last output.
263
+
264
+ The last message is typically set to be None when constructing the prompt,
265
+ so we need to update it in-place after getting the response from a model.
266
+ """
267
+ self.messages[-1][1] = message
268
+
269
+ def to_gradio_chatbot(self):
270
+ """Convert the conversation to gradio chatbot format."""
271
+ ret = []
272
+ for i, (role, msg) in enumerate(self.messages[self.offset :]):
273
+ if i % 2 == 0:
274
+ ret.append([msg, None])
275
+ else:
276
+ ret[-1][-1] = msg
277
+ return ret
278
+
279
+ def to_openai_api_messages(self):
280
+ """Convert the conversation to OpenAI chat completion format."""
281
+ ret = [{'role': 'system', 'content': self.system_message}]
282
+
283
+ for i, (_, msg) in enumerate(self.messages[self.offset :]):
284
+ if i % 2 == 0:
285
+ ret.append({'role': 'user', 'content': msg})
286
+ else:
287
+ if msg is not None:
288
+ ret.append({'role': 'assistant', 'content': msg})
289
+ return ret
290
+
291
+ def copy(self):
292
+ return Conversation(
293
+ name=self.name,
294
+ system_template=self.system_template,
295
+ system_message=self.system_message,
296
+ roles=self.roles,
297
+ messages=[[x, y] for x, y in self.messages],
298
+ offset=self.offset,
299
+ sep_style=self.sep_style,
300
+ sep=self.sep,
301
+ sep2=self.sep2,
302
+ stop_str=self.stop_str,
303
+ stop_token_ids=self.stop_token_ids,
304
+ )
305
+
306
+ def dict(self):
307
+ return {
308
+ 'template_name': self.name,
309
+ 'system_message': self.system_message,
310
+ 'roles': self.roles,
311
+ 'messages': self.messages,
312
+ 'offset': self.offset,
313
+ }
314
+
315
+
316
+ # A global registry for all conversation templates
317
+ conv_templates: Dict[str, Conversation] = {}
318
+
319
+
320
+ def register_conv_template(template: Conversation, override: bool = False):
321
+ """Register a new conversation template."""
322
+ if not override:
323
+ assert (
324
+ template.name not in conv_templates
325
+ ), f'{template.name} has been registered.'
326
+
327
+ conv_templates[template.name] = template
328
+
329
+
330
+ def get_conv_template(name: str) -> Conversation:
331
+ """Get a conversation template."""
332
+ # breakpoint()
333
+ return conv_templates[name].copy()
334
+
335
+
336
+ # Both Hermes-2 and internlm2-chat are chatml-format conversation templates. The difference
337
+ # is that during training, the preprocessing function for the Hermes-2 template doesn't add
338
+ # <s> at the beginning of the tokenized sequence, while the internlm2-chat template does.
339
+ # Therefore, they are completely equivalent during inference.
340
+ register_conv_template(
341
+ Conversation(
342
+ name='Hermes-2',
343
+ system_template='<|im_start|>system\n{system_message}',
344
+ # note: The new system prompt was not used here to avoid changes in benchmark performance.
345
+ # system_message='我是书生·万象,英文名是InternVL,是由上海人工智能实验室、清华大学及多家合作单位联合开发的多模态大语言模型。',
346
+ system_message='你是由上海人工智能实验室联合商汤科技开发的书生多模态大模型,英文名叫InternVL, 是一个有用无害的人工智能助手。',
347
+ roles=('<|im_start|>user\n', '<|im_start|>assistant\n'),
348
+ sep_style=SeparatorStyle.MPT,
349
+ sep='<|im_end|>',
350
+ stop_str='<|endoftext|>',
351
+ )
352
+ )
353
+
354
+
355
+ register_conv_template(
356
+ Conversation(
357
+ name='internlm2-chat',
358
+ system_template='<|im_start|>system\n{system_message}',
359
+ # note: The new system prompt was not used here to avoid changes in benchmark performance.
360
+ # system_message='我是书生·万象,英文名是InternVL,是由上海人工智能实验室、清华大学及多家合作单位联合开发的多模态大语言模型。',
361
+ system_message='你是由上海人工智能实验室联合商汤科技开发的书生多模态大模型,英文名叫InternVL, 是一个有用无害的人工智能助手。',
362
+ roles=('<|im_start|>user\n', '<|im_start|>assistant\n'),
363
+ sep_style=SeparatorStyle.MPT,
364
+ sep='<|im_end|>',
365
+ )
366
+ )
367
+
368
+
369
+ register_conv_template(
370
+ Conversation(
371
+ name='phi3-chat',
372
+ system_template='<|system|>\n{system_message}',
373
+ # note: The new system prompt was not used here to avoid changes in benchmark performance.
374
+ # system_message='我是书生·万象,英文名是InternVL,是由上海人工智能实验室、清华大学及多家合作单位联合开发的多模态大语言模型。',
375
+ system_message='你是由上海人工智能实验室联合商汤科技开发的书生多模态大模型,英文名叫InternVL, 是一个有用无害的人工智能助手。',
376
+ roles=('<|user|>\n', '<|assistant|>\n'),
377
+ sep_style=SeparatorStyle.MPT,
378
+ sep='<|end|>',
379
+ )
380
+ )
381
+
382
+
383
+ register_conv_template(
384
+ Conversation(
385
+ name='internvl2_5',
386
+ system_template='<|im_start|>system\n{system_message}',
387
+ system_message='你是书生·万象,英文名是InternVL,是由上海人工智能实验室、清华大学及多家合作单位联合开发的多模态大语言模型。',
388
+ roles=('<|im_start|>user\n', '<|im_start|>assistant\n'),
389
+ sep_style=SeparatorStyle.MPT,
390
+ sep='<|im_end|>\n',
391
+ )
392
+ )
393
+
394
+ register_conv_template(
395
+ Conversation(
396
+ name='plm_v',
397
+ system_template='<|im_start|>system\n{system_message}',
398
+ system_message='You are PLM-V, developed by PLM-Team, a helpful assistant.',
399
+ roles=('<|im_start|>user\n', '<|im_start|>assistant\n'),
400
+ sep_style=SeparatorStyle.MPT,
401
+ sep='<|im_end|>\n',
402
+ )
403
+ )
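
For illustration (using only the classes and the `plm_v` registration above), a single user turn renders like this under `SeparatorStyle.MPT`:

conv = get_conv_template('plm_v')
conv.append_message(conv.roles[0], 'Hello')   # '<|im_start|>user\n'
conv.append_message(conv.roles[1], None)      # open generation slot: '<|im_start|>assistant\n'
print(conv.get_prompt())
# <|im_start|>system
# You are PLM-V, developed by PLM-Team, a helpful assistant.<|im_end|>
# <|im_start|>user
# Hello<|im_end|>
# <|im_start|>assistant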
ola/model/language_model/ola_qwen.py ADDED
@@ -0,0 +1,237 @@
1
+ from typing import List, Optional, Tuple, Union
2
+
3
+ import torch
4
+ import torch.nn as nn
5
+
6
+ import transformers
7
+ from transformers import AutoConfig, AutoModelForCausalLM
8
+
9
+
10
+ from transformers.modeling_outputs import CausalLMOutputWithPast
11
+ from transformers.generation.utils import GenerateOutput
12
+
13
+ from ..ola_arch import OlaMetaModel, OlaMetaForCausalLM
14
+ from transformers import Qwen2Config, Qwen2Model, Qwen2ForCausalLM
15
+
16
+
17
+ class OlaConfigQwen(Qwen2Config):
18
+ model_type = "ola_qwen"
19
+
20
+
21
+ class OlaQwenModel(OlaMetaModel, Qwen2Model):
22
+ config_class = OlaConfigQwen
23
+
24
+ def __init__(self, config: Qwen2Config):
25
+ super(OlaQwenModel, self).__init__(config)
26
+
27
+
28
+ class OlaQwenForCausalLM(Qwen2ForCausalLM, OlaMetaForCausalLM):
29
+ config_class = OlaConfigQwen
30
+
31
+ def __init__(self, config):
32
+ super(Qwen2ForCausalLM, self).__init__(config)
33
+
34
+ config.rope_scaling = None
35
+ self.model = OlaQwenModel(config)
36
+ self.vocab_size = config.vocab_size
37
+ self.lm_head = nn.Linear(config.hidden_size, config.vocab_size, bias=False)
38
+
39
+ # Initialize weights and apply final processing
40
+ self.post_init()
41
+
42
+ def get_model(self):
43
+ return self.model
44
+
45
+ def forward(
46
+ self,
47
+ input_ids: torch.LongTensor = None,
48
+ attention_mask: Optional[torch.Tensor] = None,
49
+ position_ids: Optional[torch.LongTensor] = None,
50
+ past_key_values: Optional[List[torch.FloatTensor]] = None,
51
+ inputs_embeds: Optional[torch.FloatTensor] = None,
52
+ labels: Optional[torch.LongTensor] = None,
53
+ use_cache: Optional[bool] = None,
54
+ output_attentions: Optional[bool] = None,
55
+ output_hidden_states: Optional[bool] = None,
56
+ speech: Optional[torch.FloatTensor] = None,
57
+ speech_lengths: Optional[torch.LongTensor] = None,
58
+ speech_chunks: Optional[torch.LongTensor] = None,
59
+ speech_wav: Optional[torch.FloatTensor] = None,
60
+ images: Optional[torch.FloatTensor] = None,
61
+ images_highres: Optional[List[torch.FloatTensor]] = None,
62
+ image_sizes: Optional[List[List[int]]] = None,
63
+ modalities: Optional[List[str]] = ["image"],
64
+ return_dict: Optional[bool] = None,
65
+ cache_position: Optional[torch.LongTensor] = None,
66
+ ) -> Union[Tuple, CausalLMOutputWithPast]:
67
+
68
+ if inputs_embeds is None:
69
+ (
70
+ input_ids,
71
+ position_ids,
72
+ attention_mask,
73
+ past_key_values,
74
+ inputs_embeds,
75
+ labels
76
+ ) = self.prepare_inputs_labels_for_speech_vision_text(
77
+ input_ids,
78
+ position_ids,
79
+ attention_mask,
80
+ past_key_values,
81
+ labels,
82
+ speech,
83
+ speech_lengths,
84
+ speech_chunks,
85
+ speech_wav,
86
+ images,
87
+ modalities,
88
+ image_sizes,
89
+ images_highres
90
+ )
91
+
92
+ if labels is None:
93
+ return super().forward(
94
+ input_ids=input_ids,
95
+ attention_mask=attention_mask,
96
+ position_ids=position_ids,
97
+ past_key_values=past_key_values,
98
+ inputs_embeds=inputs_embeds,
99
+ use_cache=use_cache,
100
+ output_attentions=output_attentions,
101
+ output_hidden_states=output_hidden_states,
102
+ return_dict=return_dict
103
+ )
104
+ else:
105
+ return self.forward_llm_efficient(
106
+ input_ids=input_ids,
107
+ attention_mask=attention_mask,
108
+ position_ids=position_ids,
109
+ past_key_values=past_key_values,
110
+ inputs_embeds=inputs_embeds,
111
+ labels=labels,
112
+ use_cache=use_cache,
113
+ output_attentions=output_attentions,
114
+ output_hidden_states=output_hidden_states,
115
+ return_dict=return_dict
116
+ )
117
+
118
+
119
+ def forward_llm_efficient(self, input_ids, attention_mask, position_ids, past_key_values, inputs_embeds, labels, use_cache, output_attentions, output_hidden_states, return_dict):
120
+ output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
121
+ output_hidden_states = (
122
+ output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
123
+ )
124
+ return_dict = return_dict if return_dict is not None else self.config.use_return_dict
125
+
126
+ # decoder outputs consists of (dec_features, layer_state, dec_hidden, dec_attn)
127
+ outputs = self.model(
128
+ input_ids=input_ids,
129
+ attention_mask=attention_mask,
130
+ position_ids=position_ids,
131
+ past_key_values=past_key_values,
132
+ inputs_embeds=inputs_embeds,
133
+ use_cache=use_cache,
134
+ output_attentions=output_attentions,
135
+ output_hidden_states=output_hidden_states,
136
+ return_dict=return_dict,
137
+ )
138
+
139
+ hidden_states = outputs[0]
140
+ hidden_dim = hidden_states.size(-1)
141
+ shift_labels = labels[..., 1:].contiguous().reshape(-1)
142
+ shift_hidden_states = hidden_states[..., :-1, :].contiguous().reshape(-1, hidden_dim)
143
+ assert shift_labels.size(0) == shift_hidden_states.size(0)
144
+ mask = shift_labels > -1
145
+ assert mask.float().sum() > 0
146
+ shift_labels = shift_labels[mask]
147
+ shift_hidden_states = shift_hidden_states[mask, :]
148
+ logits = self.lm_head(shift_hidden_states)
149
+ logits = logits.float()
150
+ loss_fct = nn.CrossEntropyLoss()
151
+ loss = loss_fct(logits, shift_labels)
152
+
153
+
154
+ if not return_dict:
155
+ output = (logits,) + outputs[1:]
156
+ return (loss,) + output if loss is not None else output
157
+
158
+
159
+ return CausalLMOutputWithPast(
160
+ loss=loss,
161
+ logits=logits,
162
+ past_key_values=outputs.past_key_values,
163
+ hidden_states=outputs.hidden_states,
164
+ attentions=outputs.attentions,
165
+ )
166
+
167
+ @torch.no_grad()
168
+ def generate(
169
+ self,
170
+ inputs: Optional[torch.Tensor] = None,
171
+ speech: Optional[torch.Tensor] = None,
172
+ speech_lengths: Optional[torch.Tensor] = None,
173
+ speech_chunks: Optional[torch.Tensor] = None,
174
+ speech_wav: Optional[torch.FloatTensor] = None,
175
+ images: Optional[torch.Tensor] = None,
176
+ images_highres: Optional[List[torch.FloatTensor]] = None,
177
+ image_sizes: Optional[torch.Tensor] = None,
178
+ modalities: Optional[List[str]] = ["image"],
179
+ **kwargs,
180
+ ) -> Union[GenerateOutput, torch.LongTensor]:
181
+ position_ids = kwargs.pop("position_ids", None)
182
+ attention_mask = kwargs.pop("attention_mask", None)
183
+ if "inputs_embeds" in kwargs:
184
+ raise NotImplementedError("`inputs_embeds` is not supported")
185
+
186
+ (
187
+ inputs,
188
+ position_ids,
189
+ attention_mask,
190
+ _,
191
+ inputs_embeds,
192
+ _
193
+ ) = self.prepare_inputs_labels_for_speech_vision_text(
194
+ inputs,
195
+ position_ids,
196
+ attention_mask,
197
+ None,
198
+ None,
199
+ speech,
200
+ speech_lengths,
201
+ speech_chunks,
202
+ speech_wav,
203
+ images,
204
+ modalities,
205
+ image_sizes,
206
+ images_highres
207
+ )
208
+
209
+ return super().generate(
210
+ position_ids=position_ids,
211
+ attention_mask=attention_mask,
212
+ inputs_embeds=inputs_embeds,
213
+ **kwargs
214
+ )
215
+
216
+ def prepare_inputs_for_generation(self, input_ids, past_key_values=None,
217
+ inputs_embeds=None, **kwargs):
218
+ speech = kwargs.pop("speech", None)
219
+ speech_lengths = kwargs.pop("speech_lengths", None)
220
+ speech_chunks = kwargs.pop("speech_chunks", None)
221
+ images = kwargs.pop("images", None)
222
+ image_sizes = kwargs.pop("image_sizes", None)
223
+ inputs = super().prepare_inputs_for_generation(
224
+ input_ids, past_key_values=past_key_values, inputs_embeds=inputs_embeds, **kwargs
225
+ )
226
+ if speech is not None:
227
+ inputs['speech'] = speech
228
+ inputs['speech_lengths'] = speech_lengths
229
+ inputs['speech_chunks'] = speech_chunks
230
+ if images is not None:
231
+ inputs["images"] = images
232
+ if image_sizes is not None:
233
+ inputs["image_sizes"] = image_sizes
234
+ return inputs
235
+
236
+ AutoConfig.register("ola_qwen", OlaConfigQwen)
237
+ AutoModelForCausalLM.register(OlaConfigQwen, OlaQwenForCausalLM)
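
A brief sketch of what these two registrations enable; the checkpoint path below is a placeholder:

# After the registrations above, the Auto classes resolve model_type "ola_qwen" on their own.
from transformers import AutoConfig, AutoModelForCausalLM
cfg = AutoConfig.from_pretrained("/path/to/ola_qwen_checkpoint")  # hypothetical local path
model = AutoModelForCausalLM.from_config(cfg)                     # -> OlaQwenForCausalLM (randomly initialized)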
ola/model/language_model/ola_qwen3.py ADDED
@@ -0,0 +1,466 @@
1
+ from typing import List, Optional, Tuple, Union
2
+
3
+ import torch
4
+ import torch.nn as nn
+ import warnings  # needed by the ps_version warning inside pixel_shuffle below
5
+
6
+ import transformers
7
+ from transformers import GenerationConfig
8
+ from transformers import AutoConfig, AutoModelForCausalLM, PretrainedConfig
9
+ SPEECH_TOKEN_INDEX = -200
10
+
11
+ from transformers.modeling_outputs import CausalLMOutputWithPast
12
+ from transformers.generation.utils import GenerateOutput
13
+
14
+ from ..ola_arch import OlaMetaModel, OlaMetaForCausalLM
15
+ from transformers import Qwen3Config, Qwen3Model, Qwen3ForCausalLM
16
+ from .conversation import get_conv_template
17
+ from ola.constants import IGNORE_INDEX
18
+
19
+ def tokenizer_speech_token(prompt, tokenizer, speech_token_index=SPEECH_TOKEN_INDEX, return_tensors=None):
20
+ """Tokenize prompt with speech tokens, similar to OLA's implementation"""
21
+ prompt_chunks = [tokenizer(chunk).input_ids for chunk in prompt.split('<speech>')]
22
+
23
+ def insert_separator(X, sep):
24
+ return [ele for sublist in zip(X, [sep]*len(X)) for ele in sublist][:-1]
25
+
26
+ input_ids = []
27
+ offset = 0
28
+ if len(prompt_chunks) > 0 and len(prompt_chunks[0]) > 0 and prompt_chunks[0][0] == tokenizer.bos_token_id:
29
+ offset = 1
30
+ input_ids.append(prompt_chunks[0][0])
31
+
32
+ for x in insert_separator(prompt_chunks, [speech_token_index] * (offset + 1)):
33
+ input_ids.extend(x[offset:])
34
+
35
+ if return_tensors is not None:
36
+ if return_tensors == 'pt':
37
+ return torch.tensor(input_ids, dtype=torch.long)
38
+ raise ValueError(f'Unsupported tensor type: {return_tensors}')
39
+ return input_ids
40
+
41
+
42
+ class Qwen3Model(Qwen3Model):
43
+ def __init__(self, config: Qwen3Config, llm_config: Qwen3Config):
44
+ # breakpoint()
45
+ super(Qwen3Model, self).__init__(llm_config)
46
+
47
+ class OlaConfigQwen3(Qwen3Config, PretrainedConfig):
48
+ model_type = "ola_internvl"
49
+
50
+
51
+ class OlaQwen3Model(OlaMetaModel, Qwen3Model):
52
+ config_class = OlaConfigQwen3
53
+
54
+ def __init__(self, config: Qwen3Config):
55
+
56
+ super(OlaQwen3Model, self).__init__(config, config.llm_config)
57
+
58
+
59
+ class OlaQwen3ForCausalLM(Qwen3ForCausalLM, OlaMetaForCausalLM):
60
+ config_class = OlaConfigQwen3
61
+ # 从零初始化时不需要 checkpoint conversion mapping
62
+ # _checkpoint_conversion_mapping = {
63
+ # "^language_model.lm_head": "lm_head",
64
+ # "^language_model.model": "model.model",
65
+ # "^vision_model": "model.vision_tower",
66
+ # }
67
+ # model.model.embed_tokens:
68
+ def __init__(self, config):
69
+ super(Qwen3ForCausalLM, self).__init__(config)
70
+
71
+ config.rope_scaling = None
72
+ # breakpoint()
73
+ self.model = OlaQwen3Model(config)
74
+ self.vocab_size = config.vocab_size
75
+ # breakpoint()
76
+ self.ps_version = config.ps_version
77
+ self.lm_head = nn.Linear(config.hidden_size, config.vocab_size, bias=False)
78
+ self.template = "plm_v"
79
+ self.select_layer = config.select_layer
80
+ self.conv_template = get_conv_template(self.template)
81
+ self.system_message = self.conv_template.system_message
82
+ self.num_image_token = int((config.vision_config.image_size // config.vision_config.patch_size) ** 2 * (config.downsample_ratio ** 2))
83
+ self.downsample_ratio = config.downsample_ratio
84
+ # Initialize weights and apply final processing
85
+ self.post_init()
86
+
87
+
88
+ def get_model(self):
89
+ return self.model
90
+
91
+ def forward(
92
+ self,
93
+ input_ids: torch.LongTensor = None,
94
+ attention_mask: Optional[torch.Tensor] = None,
95
+ position_ids: Optional[torch.LongTensor] = None,
96
+ past_key_values: Optional[List[torch.FloatTensor]] = None,
97
+ inputs_embeds: Optional[torch.FloatTensor] = None,
98
+ labels: Optional[torch.LongTensor] = None,
99
+ use_cache: Optional[bool] = None,
100
+ output_attentions: Optional[bool] = None,
101
+ output_hidden_states: Optional[bool] = None,
102
+ speech: Optional[torch.FloatTensor] = None,
103
+ speech_lengths: Optional[torch.LongTensor] = None,
104
+ speech_chunks: Optional[torch.LongTensor] = None,
105
+ speech_wav: Optional[torch.FloatTensor] = None,
106
+ pixel_values: Optional[torch.FloatTensor] = None,
107
+ images_highres: Optional[List[torch.FloatTensor]] = None,
108
+ image_sizes: Optional[List[List[int]]] = None,
109
+ modalities: Optional[List[str]] = ["image"],
110
+ image_flags: Optional[torch.LongTensor] = None,
111
+ return_dict: Optional[bool] = None,
112
+ cache_position: Optional[torch.LongTensor] = None,
113
+ ) -> Union[Tuple, CausalLMOutputWithPast]:
114
+ # breakpoint()
115
+ if inputs_embeds is None:
116
+ (
117
+ input_ids,
118
+ position_ids,
119
+ attention_mask,
120
+ past_key_values,
121
+ inputs_embeds,
122
+ labels
123
+ ) = self.prepare_inputs_labels_for_speech_text_for_internvl(
124
+ input_ids,
125
+ position_ids,
126
+ attention_mask,
127
+ past_key_values,
128
+ labels,
129
+ speech,
130
+ speech_lengths,
131
+ speech_chunks,
132
+ speech_wav,
133
+ modalities,
134
+ )
135
+
136
+ if labels is None:
137
+ return super().forward(
138
+ input_ids=input_ids,
139
+ attention_mask=attention_mask,
140
+ position_ids=position_ids,
141
+ past_key_values=past_key_values,
142
+ inputs_embeds=inputs_embeds,
143
+ use_cache=use_cache,
144
+ output_attentions=output_attentions,
145
+ output_hidden_states=output_hidden_states,
146
+ return_dict=return_dict
147
+ )
148
+ else:
149
+ return self.forward_llm_efficient(
150
+ input_ids=input_ids,
151
+ attention_mask=attention_mask,
152
+ position_ids=position_ids,
153
+ past_key_values=past_key_values,
154
+ inputs_embeds=inputs_embeds,
155
+ labels=labels,
156
+ use_cache=use_cache,
157
+ output_attentions=output_attentions,
158
+ output_hidden_states=output_hidden_states,
159
+ return_dict=return_dict
160
+ )
161
+
162
+
163
+ def forward_llm_efficient(self, input_ids, attention_mask, position_ids, past_key_values, inputs_embeds, labels, use_cache, output_attentions, output_hidden_states, return_dict):
164
+ output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
165
+ output_hidden_states = (
166
+ output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
167
+ )
168
+ return_dict = return_dict if return_dict is not None else self.config.use_return_dict
169
+
170
+ # Check inputs before model forward
171
+ print(f"Debug - Input embeddings range: {inputs_embeds.min().item()} to {inputs_embeds.max().item()}")
172
+ print(f"Debug - Input embeddings has nan: {torch.isnan(inputs_embeds).any().item()}")
173
+ print(f"Debug - Input embeddings has inf: {torch.isinf(inputs_embeds).any().item()}")
174
+
175
+ # decoder outputs consists of (dec_features, layer_state, dec_hidden, dec_attn)
176
+ outputs = self.model(
177
+ input_ids=input_ids,
178
+ attention_mask=attention_mask,
179
+ position_ids=position_ids,
180
+ past_key_values=past_key_values,
181
+ inputs_embeds=inputs_embeds,
182
+ use_cache=use_cache,
183
+ output_attentions=output_attentions,
184
+ output_hidden_states=output_hidden_states,
185
+ return_dict=return_dict,
186
+ )
187
+
188
+ hidden_states = outputs[0]
189
+
190
+ # Check hidden states immediately after model forward
191
+ print(f"Debug - Raw hidden states range: {hidden_states.min().item()} to {hidden_states.max().item()}")
192
+ print(f"Debug - Raw hidden states has nan: {torch.isnan(hidden_states).any().item()}")
193
+ print(f"Debug - Raw hidden states has inf: {torch.isinf(hidden_states).any().item()}")
194
+ hidden_dim = hidden_states.size(-1)
195
+ shift_labels = labels[..., 1:].contiguous().reshape(-1)
196
+ shift_hidden_states = hidden_states[..., :-1, :].contiguous().reshape(-1, hidden_dim)
197
+ assert shift_labels.size(0) == shift_hidden_states.size(0)
198
+ mask = shift_labels != IGNORE_INDEX
199
+
200
+ # Debug logging
201
+ print(f"Debug - Total tokens: {shift_labels.size(0)}")
202
+ print(f"Debug - Valid tokens: {mask.float().sum().item()}")
203
+ print(f"Debug - Ignored tokens: {(~mask).float().sum().item()}")
204
+ print(f"Debug - Label range: {shift_labels.min().item()} to {shift_labels.max().item()}")
205
+
206
+ assert mask.float().sum() > 0, f"No valid tokens found! Total: {shift_labels.size(0)}, Valid: {mask.float().sum().item()}"
207
+ shift_labels = shift_labels[mask]
208
+ shift_hidden_states = shift_hidden_states[mask, :]
209
+
210
+ print(f"Debug - After filtering: {shift_labels.size(0)} tokens")
211
+ print(f"Debug - Hidden states shape: {shift_hidden_states.shape}")
212
+ print(f"Debug - Hidden states range: {shift_hidden_states.min().item()} to {shift_hidden_states.max().item()}")
213
+ print(f"Debug - Hidden states has nan: {torch.isnan(shift_hidden_states).any().item()}")
214
+ print(f"Debug - Hidden states has inf: {torch.isinf(shift_hidden_states).any().item()}")
215
+
216
+ # Check lm_head weights
217
+ print(f"Debug - lm_head weight shape: {self.lm_head.weight.shape}")
218
+ print(f"Debug - lm_head weight range: {self.lm_head.weight.min().item()} to {self.lm_head.weight.max().item()}")
219
+ print(f"Debug - lm_head weight has nan: {torch.isnan(self.lm_head.weight).any().item()}")
220
+ print(f"Debug - lm_head weight has inf: {torch.isinf(self.lm_head.weight).any().item()}")
221
+
222
+ logits = self.lm_head(shift_hidden_states)
223
+ logits = logits.float()
224
+
225
+ print(f"Debug - Logits shape: {logits.shape}")
226
+ print(f"Debug - Logits range: {logits.min().item()} to {logits.max().item()}")
227
+ print(f"Debug - Logits has nan: {torch.isnan(logits).any().item()}")
228
+ print(f"Debug - Logits has inf: {torch.isinf(logits).any().item()}")
229
+
230
+ # Fix nan values in logits
231
+ if torch.isnan(logits).any():
232
+ print("WARNING: Found nan values in logits, replacing with zeros")
233
+ logits = torch.where(torch.isnan(logits), torch.zeros_like(logits), logits)
234
+
235
+ # Fix inf values in logits
236
+ if torch.isinf(logits).any():
237
+ print("WARNING: Found inf values in logits, clamping to finite range")
238
+ logits = torch.clamp(logits, min=-1e4, max=1e4)
239
+
240
+ # Additional check: if logits are still problematic, use a fallback
241
+ if torch.isnan(logits).any() or torch.isinf(logits).any():
242
+ print("ERROR: Logits still contain nan/inf after fixing, using fallback")
243
+ logits = torch.zeros_like(logits)
244
+
245
+ loss_fct = nn.CrossEntropyLoss()
246
+ loss = loss_fct(logits, shift_labels)
247
+
248
+ print(f"Debug - Loss: {loss.item()}")
249
+ print(f"Debug - Loss has nan: {torch.isnan(loss).item()}")
250
+
251
+
252
+ if not return_dict:
253
+ output = (logits,) + outputs[1:]
254
+ return (loss,) + output if loss is not None else output
255
+
256
+
257
+ return CausalLMOutputWithPast(
258
+ loss=loss,
259
+ logits=logits,
260
+ past_key_values=outputs.past_key_values,
261
+ hidden_states=outputs.hidden_states,
262
+ attentions=outputs.attentions,
263
+ )
264
+ def pixel_shuffle(self, x, scale_factor=0.5):
265
+ n, w, h, c = x.size()
266
+ # N, W, H, C --> N, W, H * scale, C // scale
267
+ x = x.view(n, w, int(h * scale_factor), int(c / scale_factor))
268
+ # N, W, H * scale, C // scale --> N, H * scale, W, C // scale
269
+ x = x.permute(0, 2, 1, 3).contiguous()
270
+ # N, H * scale, W, C // scale --> N, H * scale, W * scale, C // (scale ** 2)
271
+ x = x.view(n, int(h * scale_factor), int(w * scale_factor),
272
+ int(c / (scale_factor * scale_factor)))
273
+ if self.ps_version == 'v1':
274
+ warnings.warn("In ps_version 'v1', the height and width have not been swapped back, "
275
+ 'which results in a transposed image.')
276
+ else:
277
+ x = x.permute(0, 2, 1, 3).contiguous()
278
+ return x
279
+
280
+ def extract_feature(self, pixel_values):
281
+ if self.select_layer == -1:
282
+ # breakpoint()
283
+ vit_embeds = self.get_vision_tower()(
284
+ pixel_values=pixel_values,
285
+ output_hidden_states=False,
286
+ return_dict=True).last_hidden_state
287
+ else:
288
+ vit_embeds = self.get_vision_tower()(
289
+ pixel_values=pixel_values,
290
+ output_hidden_states=True,
291
+ return_dict=True).hidden_states[self.select_layer]
292
+ vit_embeds = vit_embeds[:, 1:, :]
293
+
294
+ h = w = int(vit_embeds.shape[1] ** 0.5)
295
+ vit_embeds = vit_embeds.reshape(vit_embeds.shape[0], h, w, -1)
296
+ vit_embeds = self.pixel_shuffle(vit_embeds, scale_factor=self.downsample_ratio)
297
+ vit_embeds = vit_embeds.reshape(vit_embeds.shape[0], -1, vit_embeds.shape[-1])
298
+ # breakpoint()
299
+ vit_embeds = self.get_vision_projector()(vit_embeds)
300
+ return vit_embeds
301
+ @torch.no_grad()
302
+ def generate(
303
+ self,
304
+ pixel_values: Optional[torch.FloatTensor] = None,
305
+ input_ids: Optional[torch.FloatTensor] = None,
306
+ attention_mask: Optional[torch.LongTensor] = None,
307
+ visual_features: Optional[torch.FloatTensor] = None,
308
+ generation_config: Optional[GenerationConfig] = None,
309
+ output_hidden_states: Optional[bool] = None,
310
+ speech: Optional[torch.FloatTensor] = None,
311
+ speech_lengths: Optional[torch.LongTensor] = None,
312
+ speech_chunks: Optional[torch.LongTensor] = None,
313
+ speech_wav: Optional[torch.FloatTensor] = None,
314
+ modalities: Optional[List[str]] = ["image"],
315
+ **kwargs,
316
+ ) -> Union[GenerateOutput, torch.LongTensor]:
317
+ position_ids = kwargs.pop("position_ids", None)
318
+
319
+ if speech is not None:
320
+ (
321
+ _,
322
+ position_ids,
323
+ attention_mask,
324
+ _,
325
+ input_embeds,
326
+ _
327
+ ) = self.prepare_inputs_labels_for_speech_text_for_internvl(
328
+ input_ids,
329
+ position_ids,
330
+ attention_mask,
331
+ None,
332
+ None, # labels
333
+ speech,
334
+ speech_lengths,
335
+ speech_chunks,
336
+ speech_wav,
337
+ modalities,
338
+ )
339
+ else:
340
+ # internvl
341
+ assert self.img_context_token_id is not None
342
+ if pixel_values is not None:
343
+ if visual_features is not None:
344
+ vit_embeds = visual_features
345
+ else:
346
+ vit_embeds = self.extract_feature(pixel_values)
347
+ input_embeds = self.get_model().get_input_embeddings()(input_ids)
348
+ B, N, C = input_embeds.shape
349
+ input_embeds = input_embeds.reshape(B * N, C)
350
+ input_ids = input_ids.reshape(B * N)
351
+ selected = (input_ids == self.img_context_token_id)
352
+ assert selected.sum() != 0
353
+ input_embeds[selected] = vit_embeds.reshape(-1, C).to(input_embeds.device)
354
+ input_embeds = input_embeds.reshape(B, N, C)
355
+ else:
356
+ input_embeds = self.get_model().get_input_embeddings()(input_ids)
357
+ return super().generate(
358
+ inputs_embeds=input_embeds,
359
+ attention_mask=attention_mask,
360
+ generation_config=generation_config,
361
+ output_hidden_states=output_hidden_states,
362
+ use_cache=True,
363
+ **kwargs,
364
+ )
365
+
366
+
367
+ def chat(self, tokenizer, pixel_values, question, generation_config, history=None, return_history=False,
368
+ num_patches_list=None, IMG_START_TOKEN='<img>', IMG_END_TOKEN='</img>', IMG_CONTEXT_TOKEN='<IMG_CONTEXT>',
369
+ verbose=False, speech=None, speech_lengths=None, speech_wav=None, speech_chunks=None):
370
+ if history is None and pixel_values is not None and '<image>' not in question:
371
+ question = '<image>\n' + question
372
+
373
+ if num_patches_list is None:
374
+ num_patches_list = [pixel_values.shape[0]] if pixel_values is not None else []
375
+ assert pixel_values is None or len(pixel_values) == sum(num_patches_list)
376
+
377
+ img_context_token_id = tokenizer.convert_tokens_to_ids(IMG_CONTEXT_TOKEN)
378
+ self.img_context_token_id = img_context_token_id
379
+
380
+ template = get_conv_template(self.template)
381
+ template.system_message = self.system_message
382
+ eos_token_id = tokenizer.convert_tokens_to_ids(template.sep.strip())
383
+
384
+ history = [] if history is None else history
385
+ for (old_question, old_answer) in history:
386
+ template.append_message(template.roles[0], old_question)
387
+ template.append_message(template.roles[1], old_answer)
388
+ template.append_message(template.roles[0], question)
389
+ template.append_message(template.roles[1], None)
390
+ query = template.get_prompt()
391
+
392
+ if verbose and pixel_values is not None:
393
+ image_bs = pixel_values.shape[0]
394
+ print(f'dynamic ViT batch size: {image_bs}')
395
+
396
+
397
+ # Replace image tokens
398
+ for num_patches in num_patches_list:
399
+ image_tokens = IMG_START_TOKEN + IMG_CONTEXT_TOKEN * self.num_image_token * num_patches + IMG_END_TOKEN
400
+ query = query.replace('<image>', image_tokens, 1)
401
+ from ola.conversation import conv_templates, SeparatorStyle
402
+ from ola.mm_utils import KeywordsStoppingCriteria
403
+ conv_mode = "plm_v"
404
+ conv = conv_templates[conv_mode].copy()
405
+ stop_str = conv.sep if conv.sep_style != SeparatorStyle.TWO else conv.sep2
406
+ keywords = [stop_str]
407
+
408
+ # Use OLA-style tokenization for speech inputs
409
+ if speech is not None and '<speech>' in query:
410
+ # Use OLA-style tokenization directly with <speech> tokens
411
+ input_ids = tokenizer_speech_token(query, tokenizer, return_tensors='pt').unsqueeze(0).to(self.device)
412
+ # Handle case where pad_token_id might be None
413
+ pad_token_id = tokenizer.pad_token_id if tokenizer.pad_token_id is not None else 151643
414
+ attention_mask = input_ids.ne(pad_token_id).long().to(self.device)
415
+
416
+ else:
417
+ model_inputs = tokenizer(query, return_tensors='pt')
418
+ input_ids = model_inputs['input_ids'].to(self.device)
419
+ attention_mask = model_inputs['attention_mask'].to(self.device)
420
+ generation_config['eos_token_id'] = eos_token_id
421
+ stopping_criteria = KeywordsStoppingCriteria(keywords, tokenizer, input_ids)
422
+ # generation_config["stopping_criteria"] = stopping_criteria
423
+ generation_output = self.generate(
424
+ pixel_values=pixel_values,
425
+ input_ids=input_ids,
426
+ attention_mask=attention_mask,
427
+ speech=speech,
428
+ speech_lengths=speech_lengths,
429
+ speech_chunks=speech_chunks,
430
+ speech_wav=speech_wav,
431
+ **generation_config
432
+ )
433
+ response = tokenizer.batch_decode(generation_output, skip_special_tokens=True)[0]
434
+ response = response.split(template.sep.strip())[0].strip()
435
+ history.append((question, response))
436
+ if return_history:
437
+ return response, history
438
+ else:
439
+ query_to_print = query.replace(IMG_CONTEXT_TOKEN, '')
440
+ query_to_print = query_to_print.replace(f'{IMG_START_TOKEN}{IMG_END_TOKEN}', '<image>')
441
+ if verbose:
442
+ print(query_to_print, response)
443
+ return response
444
+
445
+ def prepare_inputs_for_generation(self, input_ids, past_key_values=None,
446
+ inputs_embeds=None, **kwargs):
447
+ speech = kwargs.pop("speech", None)
448
+ speech_lengths = kwargs.pop("speech_lengths", None)
449
+ speech_chunks = kwargs.pop("speech_chunks", None)
450
+ images = kwargs.pop("images", None)
451
+ image_sizes = kwargs.pop("image_sizes", None)
452
+ inputs = super().prepare_inputs_for_generation(
453
+ input_ids, past_key_values=past_key_values, inputs_embeds=inputs_embeds, **kwargs
454
+ )
455
+ if speech is not None:
456
+ inputs['speech'] = speech
457
+ inputs['speech_lengths'] = speech_lengths
458
+ inputs['speech_chunks'] = speech_chunks
459
+ if images is not None:
460
+ inputs["images"] = images
461
+ if image_sizes is not None:
462
+ inputs["image_sizes"] = image_sizes
463
+ return inputs
464
+
465
+ AutoConfig.register("ola_internvl", OlaConfigQwen3)
466
+ AutoModelForCausalLM.register(OlaConfigQwen3, OlaQwen3ForCausalLM)
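
For orientation, a shape walk-through of pixel_shuffle and num_image_token above, using assumed values (a 224-pixel tile, patch size 14, downsample_ratio 0.5). The standalone tensor ops below mirror the method rather than call it:

import torch
n, w, h, c = 1, 16, 16, 1024          # 224 / 14 = 16 patches per side (CLS token already dropped)
s = 0.5                               # assumed downsample_ratio
x = torch.randn(n, w, h, c)
x = x.view(n, w, int(h * s), int(c / s))                  # (1, 16, 8, 2048)
x = x.permute(0, 2, 1, 3).contiguous()                    # (1, 8, 16, 2048)
x = x.view(n, int(h * s), int(w * s), int(c / (s * s)))   # (1, 8, 8, 4096)
print(x.shape)   # 8 * 8 = 64 visual tokens per tile, matching num_image_token = int(16**2 * 0.5**2)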
ola/model/multimodal_encoder/__pycache__/builder.cpython-312.pyc ADDED
Binary file (960 Bytes).
 
ola/model/multimodal_encoder/__pycache__/configuration_intern_vit.cpython-312.pyc ADDED
Binary file (5.7 kB).
 
ola/model/multimodal_encoder/__pycache__/internvl_vit.cpython-312.pyc ADDED
Binary file (26.2 kB).
 
ola/model/multimodal_encoder/__pycache__/oryx_vit.cpython-312.pyc ADDED
Binary file (46.7 kB).
 
ola/model/multimodal_encoder/builder.py ADDED
@@ -0,0 +1,16 @@
1
+ import os
2
+ from .oryx_vit import SigLIPViTAnysizeWrapper
3
+ from .internvl_vit import InternVisionModel
4
+
5
+ def build_vision_tower(vision_tower_cfg, **kwargs):
6
+ # breakpoint()
7
+ if vision_tower_cfg.model_type == 'intern_vit_6b':
8
+ vision_tower = InternVisionModel(vision_tower_cfg)
9
+ # breakpoint()
10
+ return vision_tower
11
+ else:
12
+ vision_tower = getattr(vision_tower_cfg, 'vision_tower', getattr(vision_tower_cfg, 'mm_vision_tower', None))
13
+ is_absolute_path_exists = os.path.exists(vision_tower)
14
+ print(f"Buiding OryxViTWrapper from {vision_tower}...")
15
+ # path = vision_tower.split(":")[1]
16
+ return SigLIPViTAnysizeWrapper(vision_tower, path=vision_tower, args=vision_tower_cfg, **kwargs)
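
A small sketch of the dispatch above, assuming a deliberately reduced InternViT config so it stays cheap to instantiate; the sizes are illustrative only:

from ola.model.multimodal_encoder.configuration_intern_vit import InternVisionConfig
vit_cfg = InternVisionConfig(hidden_size=64, num_attention_heads=4, intermediate_size=128,
                             num_hidden_layers=2, image_size=224, patch_size=14)
vision_tower = build_vision_tower(vit_cfg)   # model_type 'intern_vit_6b' -> InternVisionModel(vit_cfg)
# Any other config falls through to the SigLIP/Oryx branch and loads from
# cfg.vision_tower / cfg.mm_vision_tower instead.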
ola/model/multimodal_encoder/configuration_intern_vit.py ADDED
@@ -0,0 +1,119 @@
1
+ # --------------------------------------------------------
2
+ # InternVL
3
+ # Copyright (c) 2024 OpenGVLab
4
+ # Licensed under The MIT License [see LICENSE for details]
5
+ # --------------------------------------------------------
6
+ import os
7
+ from typing import Union
8
+
9
+ from transformers.configuration_utils import PretrainedConfig
10
+ from transformers.utils import logging
11
+
12
+ logger = logging.get_logger(__name__)
13
+
14
+
15
+ class InternVisionConfig(PretrainedConfig):
16
+ r"""
17
+ This is the configuration class to store the configuration of a [`InternVisionModel`]. It is used to
18
+ instantiate a vision encoder according to the specified arguments, defining the model architecture.
19
+
20
+ Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
21
+ documentation from [`PretrainedConfig`] for more information.
22
+
23
+ Args:
24
+ num_channels (`int`, *optional*, defaults to 3):
25
+ Number of color channels in the input images (e.g., 3 for RGB).
26
+ patch_size (`int`, *optional*, defaults to 14):
27
+ The size (resolution) of each patch.
28
+ image_size (`int`, *optional*, defaults to 224):
29
+ The size (resolution) of each image.
30
+ qkv_bias (`bool`, *optional*, defaults to `False`):
31
+ Whether to add a bias to the queries and values in the self-attention layers.
32
+ hidden_size (`int`, *optional*, defaults to 3200):
33
+ Dimensionality of the encoder layers and the pooler layer.
34
+ num_attention_heads (`int`, *optional*, defaults to 25):
35
+ Number of attention heads for each attention layer in the Transformer encoder.
36
+ intermediate_size (`int`, *optional*, defaults to 12800):
37
+ Dimensionality of the "intermediate" (i.e., feed-forward) layer in the Transformer encoder.
38
+ qk_normalization (`bool`, *optional*, defaults to `True`):
39
+ Whether to normalize the queries and keys in the self-attention layers.
40
+ num_hidden_layers (`int`, *optional*, defaults to 48):
41
+ Number of hidden layers in the Transformer encoder.
42
+ use_flash_attn (`bool`, *optional*, defaults to `True`):
43
+ Whether to use flash attention mechanism.
44
+ hidden_act (`str` or `function`, *optional*, defaults to `"gelu"`):
45
+ The non-linear activation function (function or string) in the encoder and pooler. If string, `"gelu"`,
46
+ `"relu"`, `"selu"` and `"gelu_new"` ``"gelu"` are supported.
47
+ layer_norm_eps (`float`, *optional*, defaults to 1e-6):
48
+ The epsilon used by the layer normalization layers.
49
+ dropout (`float`, *optional*, defaults to 0.0):
50
+ The dropout probability for all fully connected layers in the embeddings, encoder, and pooler.
51
+ drop_path_rate (`float`, *optional*, defaults to 0.0):
52
+ Dropout rate for stochastic depth.
53
+ attention_dropout (`float`, *optional*, defaults to 0.0):
54
+ The dropout ratio for the attention probabilities.
55
+ initializer_range (`float`, *optional*, defaults to 0.02):
56
+ The standard deviation of the truncated_normal_initializer for initializing all weight matrices.
57
+ initializer_factor (`float`, *optional*, defaults to 0.1):
58
+ A factor for layer scale.
59
+ """
60
+
61
+ model_type = 'intern_vit_6b'
62
+
63
+ def __init__(
64
+ self,
65
+ num_channels=3,
66
+ patch_size=14,
67
+ image_size=224,
68
+ qkv_bias=False,
69
+ hidden_size=3200,
70
+ num_attention_heads=25,
71
+ intermediate_size=12800,
72
+ qk_normalization=True,
73
+ num_hidden_layers=48,
74
+ use_flash_attn=True,
75
+ hidden_act='gelu',
76
+ norm_type='rms_norm',
77
+ layer_norm_eps=1e-6,
78
+ dropout=0.0,
79
+ drop_path_rate=0.0,
80
+ attention_dropout=0.0,
81
+ initializer_range=0.02,
82
+ initializer_factor=0.1,
83
+ **kwargs,
84
+ ):
85
+ super().__init__(**kwargs)
86
+
87
+ self.hidden_size = hidden_size
88
+ self.intermediate_size = intermediate_size
89
+ self.dropout = dropout
90
+ self.drop_path_rate = drop_path_rate
91
+ self.num_hidden_layers = num_hidden_layers
92
+ self.num_attention_heads = num_attention_heads
93
+ self.num_channels = num_channels
94
+ self.patch_size = patch_size
95
+ self.image_size = image_size
96
+ self.initializer_range = initializer_range
97
+ self.initializer_factor = initializer_factor
98
+ self.attention_dropout = attention_dropout
99
+ self.layer_norm_eps = layer_norm_eps
100
+ self.hidden_act = hidden_act
101
+ self.norm_type = norm_type
102
+ self.qkv_bias = qkv_bias
103
+ self.qk_normalization = qk_normalization
104
+ self.use_flash_attn = use_flash_attn
105
+
106
+ @classmethod
107
+ def from_pretrained(cls, pretrained_model_name_or_path: Union[str, os.PathLike], **kwargs) -> 'PretrainedConfig':
108
+ config_dict, kwargs = cls.get_config_dict(pretrained_model_name_or_path, **kwargs)
109
+
110
+ if 'vision_config' in config_dict:
111
+ config_dict = config_dict['vision_config']
112
+
113
+ if 'model_type' in config_dict and hasattr(cls, 'model_type') and config_dict['model_type'] != cls.model_type:
114
+ logger.warning(
115
+ f"You are using a model of type {config_dict['model_type']} to instantiate a model of type "
116
+ f'{cls.model_type}. This is not supported for all configurations of models and can yield errors.'
117
+ )
118
+
119
+ return cls.from_dict(config_dict, **kwargs)
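
A short example of the loading path above; the checkpoint path is a placeholder:

# from_pretrained accepts either a standalone InternViT config or a full multimodal config;
# in the latter case it extracts the nested 'vision_config' block before building the object.
vit_cfg = InternVisionConfig.from_pretrained("/path/to/internvl_checkpoint")   # hypothetical path
print(vit_cfg.model_type, vit_cfg.image_size, vit_cfg.patch_size)              # values depend on the checkpoint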
ola/model/multimodal_encoder/internvl_vit.py ADDED
@@ -0,0 +1,435 @@
1
+ # --------------------------------------------------------
2
+ # InternVL
3
+ # Copyright (c) 2024 OpenGVLab
4
+ # Licensed under The MIT License [see LICENSE for details]
5
+ # --------------------------------------------------------
6
+
7
+ from typing import Optional, Tuple, Union
8
+
9
+ import torch
10
+ import torch.nn.functional as F
11
+ import torch.utils.checkpoint
12
+ from einops import rearrange
13
+ from timm.layers import DropPath
14
+ from torch import nn
15
+ from transformers.activations import ACT2FN
16
+ from transformers.modeling_outputs import (BaseModelOutput,
17
+ BaseModelOutputWithPooling)
18
+ from transformers.modeling_utils import PreTrainedModel
19
+ from transformers.utils import logging
20
+
21
+ from .configuration_intern_vit import InternVisionConfig
22
+
23
+ try:
24
+ from flash_attn.bert_padding import pad_input, unpad_input
25
+ from flash_attn.flash_attn_interface import \
26
+ flash_attn_varlen_qkvpacked_func
27
+ has_flash_attn = True
28
+ except:
29
+ print('FlashAttention2 is not installed.')
30
+ has_flash_attn = False
31
+
32
+ logger = logging.get_logger(__name__)
33
+
34
+
35
+ class FlashAttention(nn.Module):
36
+ """Implement the scaled dot product attention with softmax.
37
+ Arguments
38
+ ---------
39
+ softmax_scale: The temperature to use for the softmax attention.
40
+ (default: 1/sqrt(d_keys) where d_keys is computed at
41
+ runtime)
42
+ attention_dropout: The dropout rate to apply to the attention
43
+ (default: 0.0)
44
+ """
45
+
46
+ def __init__(self, softmax_scale=None, attention_dropout=0.0, device=None, dtype=None):
47
+ super().__init__()
48
+ self.softmax_scale = softmax_scale
49
+ self.dropout_p = attention_dropout
50
+
51
+ def forward(self, qkv, key_padding_mask=None, causal=False, cu_seqlens=None,
52
+ max_s=None, need_weights=False):
53
+ """Implements the multihead softmax attention.
54
+ Arguments
55
+ ---------
56
+ qkv: The tensor containing the query, key, and value. (B, S, 3, H, D) if key_padding_mask is None
57
+ if unpadded: (nnz, 3, h, d)
58
+ key_padding_mask: a bool tensor of shape (B, S)
59
+ """
60
+ assert not need_weights
61
+ assert qkv.dtype in [torch.float16, torch.bfloat16]
62
+ assert qkv.is_cuda
63
+
64
+ if cu_seqlens is None:
65
+ batch_size = qkv.shape[0]
66
+ seqlen = qkv.shape[1]
67
+ if key_padding_mask is None:
68
+ qkv = rearrange(qkv, 'b s ... -> (b s) ...')
69
+ max_s = seqlen
70
+ cu_seqlens = torch.arange(0, (batch_size + 1) * seqlen, step=seqlen, dtype=torch.int32,
71
+ device=qkv.device)
72
+ output = flash_attn_varlen_qkvpacked_func(
73
+ qkv, cu_seqlens, max_s, self.dropout_p if self.training else 0.0,
74
+ softmax_scale=self.softmax_scale, causal=causal
75
+ )
76
+ output = rearrange(output, '(b s) ... -> b s ...', b=batch_size)
77
+ else:
78
+ nheads = qkv.shape[-2]
79
+ x = rearrange(qkv, 'b s three h d -> b s (three h d)')
80
+ x_unpad, indices, cu_seqlens, max_s = unpad_input(x, key_padding_mask)
81
+ x_unpad = rearrange(x_unpad, 'nnz (three h d) -> nnz three h d', three=3, h=nheads)
82
+ output_unpad = flash_attn_varlen_qkvpacked_func(
83
+ x_unpad, cu_seqlens, max_s, self.dropout_p if self.training else 0.0,
84
+ softmax_scale=self.softmax_scale, causal=causal
85
+ )
86
+ output = rearrange(pad_input(rearrange(output_unpad, 'nnz h d -> nnz (h d)'),
87
+ indices, batch_size, seqlen),
88
+ 'b s (h d) -> b s h d', h=nheads)
89
+ else:
90
+ assert max_s is not None
91
+ output = flash_attn_varlen_qkvpacked_func(
92
+ qkv, cu_seqlens, max_s, self.dropout_p if self.training else 0.0,
93
+ softmax_scale=self.softmax_scale, causal=causal
94
+ )
95
+
96
+ return output, None
97
+
98
+
99
+ class InternRMSNorm(nn.Module):
100
+ def __init__(self, hidden_size, eps=1e-6):
101
+ super().__init__()
102
+ self.weight = nn.Parameter(torch.ones(hidden_size))
103
+ self.variance_epsilon = eps
104
+
105
+ def forward(self, hidden_states):
106
+ input_dtype = hidden_states.dtype
107
+ hidden_states = hidden_states.to(torch.float32)
108
+ variance = hidden_states.pow(2).mean(-1, keepdim=True)
109
+ hidden_states = hidden_states * torch.rsqrt(variance + self.variance_epsilon)
110
+ return self.weight * hidden_states.to(input_dtype)
111
+
112
+
113
+ try:
114
+ from apex.normalization import FusedRMSNorm
115
+
116
+ InternRMSNorm = FusedRMSNorm # noqa
117
+
118
+ logger.info('Discovered apex.normalization.FusedRMSNorm - will use it instead of InternRMSNorm')
119
+ except ImportError:
120
+ # using the normal InternRMSNorm
121
+ pass
122
+ except Exception:
123
+ logger.warning('discovered apex but it failed to load, falling back to InternRMSNorm')
124
+ pass
125
+
126
+
127
+ NORM2FN = {
128
+ 'rms_norm': InternRMSNorm,
129
+ 'layer_norm': nn.LayerNorm,
130
+ }
131
+
132
+
133
+ class InternVisionEmbeddings(nn.Module):
134
+ def __init__(self, config: InternVisionConfig):
135
+ super().__init__()
136
+ self.config = config
137
+ self.embed_dim = config.hidden_size
138
+ self.image_size = config.image_size
139
+ self.patch_size = config.patch_size
140
+
141
+ self.class_embedding = nn.Parameter(
142
+ torch.randn(1, 1, self.embed_dim),
143
+ )
144
+
145
+ self.patch_embedding = nn.Conv2d(
146
+ in_channels=3, out_channels=self.embed_dim, kernel_size=self.patch_size, stride=self.patch_size
147
+ )
148
+
149
+ self.num_patches = (self.image_size // self.patch_size) ** 2
150
+ self.num_positions = self.num_patches + 1
151
+
152
+ self.position_embedding = nn.Parameter(torch.randn(1, self.num_positions, self.embed_dim))
153
+
154
+ def _get_pos_embed(self, pos_embed, H, W):
155
+ target_dtype = pos_embed.dtype
156
+ pos_embed = pos_embed.float().reshape(
157
+ 1, self.image_size // self.patch_size, self.image_size // self.patch_size, -1).permute(0, 3, 1, 2)
158
+ pos_embed = F.interpolate(pos_embed, size=(H, W), mode='bicubic', align_corners=False). \
159
+ reshape(1, -1, H * W).permute(0, 2, 1).to(target_dtype)
160
+ return pos_embed
161
+
162
+ def forward(self, pixel_values: torch.FloatTensor) -> torch.Tensor:
163
+ target_dtype = self.patch_embedding.weight.dtype
164
+ patch_embeds = self.patch_embedding(pixel_values) # shape = [*, channel, width, height]
165
+ batch_size, _, height, width = patch_embeds.shape
166
+ patch_embeds = patch_embeds.flatten(2).transpose(1, 2)
167
+ class_embeds = self.class_embedding.expand(batch_size, 1, -1).to(target_dtype)
168
+ embeddings = torch.cat([class_embeds, patch_embeds], dim=1)
169
+ position_embedding = torch.cat([
170
+ self.position_embedding[:, :1, :],
171
+ self._get_pos_embed(self.position_embedding[:, 1:, :], height, width)
172
+ ], dim=1)
173
+ embeddings = embeddings + position_embedding.to(target_dtype)
174
+ return embeddings
175
+
176
+
177
+ class InternAttention(nn.Module):
178
+ """Multi-headed attention from 'Attention Is All You Need' paper"""
179
+
180
+ def __init__(self, config: InternVisionConfig):
181
+ super().__init__()
182
+ self.config = config
183
+ self.embed_dim = config.hidden_size
184
+ self.num_heads = config.num_attention_heads
185
+ self.use_flash_attn = config.use_flash_attn and has_flash_attn
186
+ if config.use_flash_attn and not has_flash_attn:
187
+ print('Warning: Flash Attention is not available, use_flash_attn is set to False.')
188
+ self.head_dim = self.embed_dim // self.num_heads
189
+ if self.head_dim * self.num_heads != self.embed_dim:
190
+ raise ValueError(
191
+ f'embed_dim must be divisible by num_heads (got `embed_dim`: {self.embed_dim} and `num_heads`:'
192
+ f' {self.num_heads}).'
193
+ )
194
+
195
+ self.scale = self.head_dim ** -0.5
196
+ self.qkv = nn.Linear(self.embed_dim, 3 * self.embed_dim, bias=config.qkv_bias)
197
+ self.attn_drop = nn.Dropout(config.attention_dropout)
198
+ self.proj_drop = nn.Dropout(config.dropout)
199
+
200
+ self.qk_normalization = config.qk_normalization
201
+
202
+ if self.qk_normalization:
203
+ self.q_norm = InternRMSNorm(self.embed_dim, eps=config.layer_norm_eps)
204
+ self.k_norm = InternRMSNorm(self.embed_dim, eps=config.layer_norm_eps)
205
+
206
+ if self.use_flash_attn:
207
+ self.inner_attn = FlashAttention(attention_dropout=config.attention_dropout)
208
+ self.proj = nn.Linear(self.embed_dim, self.embed_dim)
209
+
210
+ def _naive_attn(self, x):
211
+ B, N, C = x.shape
212
+ qkv = self.qkv(x).reshape(B, N, 3, self.num_heads, C // self.num_heads).permute(2, 0, 3, 1, 4)
213
+ q, k, v = qkv.unbind(0) # make torchscript happy (cannot use tensor as tuple)
214
+
215
+ if self.qk_normalization:
216
+ B_, H_, N_, D_ = q.shape
217
+ q = self.q_norm(q.transpose(1, 2).flatten(-2, -1)).view(B_, N_, H_, D_).transpose(1, 2)
218
+ k = self.k_norm(k.transpose(1, 2).flatten(-2, -1)).view(B_, N_, H_, D_).transpose(1, 2)
219
+
220
+ attn = ((q * self.scale) @ k.transpose(-2, -1))
221
+ attn = attn.softmax(dim=-1)
222
+ attn = self.attn_drop(attn)
223
+
224
+ x = (attn @ v).transpose(1, 2).reshape(B, N, C)
225
+ x = self.proj(x)
226
+ x = self.proj_drop(x)
227
+ return x
228
+
229
+ def _flash_attn(self, x, key_padding_mask=None, need_weights=False):
230
+ qkv = self.qkv(x)
231
+ qkv = rearrange(qkv, 'b s (three h d) -> b s three h d', three=3, h=self.num_heads)
232
+
233
+ if self.qk_normalization:
234
+ q, k, v = qkv.unbind(2)
235
+ q = self.q_norm(q.flatten(-2, -1)).view(q.shape)
236
+ k = self.k_norm(k.flatten(-2, -1)).view(k.shape)
237
+ qkv = torch.stack([q, k, v], dim=2)
238
+
239
+ context, _ = self.inner_attn(
240
+ qkv, key_padding_mask=key_padding_mask, need_weights=need_weights, causal=False
241
+ )
242
+ outs = self.proj(rearrange(context, 'b s h d -> b s (h d)'))
243
+ outs = self.proj_drop(outs)
244
+ return outs
245
+
246
+ def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
247
+ x = self._naive_attn(hidden_states) if not self.use_flash_attn else self._flash_attn(hidden_states)
248
+ return x
249
+
250
+
251
+ class InternMLP(nn.Module):
252
+ def __init__(self, config: InternVisionConfig):
253
+ super().__init__()
254
+ self.config = config
255
+ self.act = ACT2FN[config.hidden_act]
256
+ self.fc1 = nn.Linear(config.hidden_size, config.intermediate_size)
257
+ self.fc2 = nn.Linear(config.intermediate_size, config.hidden_size)
258
+
259
+ def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
260
+ hidden_states = self.fc1(hidden_states)
261
+ hidden_states = self.act(hidden_states)
262
+ hidden_states = self.fc2(hidden_states)
263
+ return hidden_states
264
+
265
+
266
+ class InternVisionEncoderLayer(nn.Module):
267
+ def __init__(self, config: InternVisionConfig, drop_path_rate: float):
268
+ super().__init__()
269
+ self.embed_dim = config.hidden_size
270
+ self.intermediate_size = config.intermediate_size
271
+ self.norm_type = config.norm_type
272
+
273
+ self.attn = InternAttention(config)
274
+ self.mlp = InternMLP(config)
275
+ self.norm1 = NORM2FN[self.norm_type](self.embed_dim, eps=config.layer_norm_eps)
276
+ self.norm2 = NORM2FN[self.norm_type](self.embed_dim, eps=config.layer_norm_eps)
277
+
278
+ self.ls1 = nn.Parameter(config.initializer_factor * torch.ones(self.embed_dim))
279
+ self.ls2 = nn.Parameter(config.initializer_factor * torch.ones(self.embed_dim))
280
+ self.drop_path1 = DropPath(drop_path_rate) if drop_path_rate > 0. else nn.Identity()
281
+ self.drop_path2 = DropPath(drop_path_rate) if drop_path_rate > 0. else nn.Identity()
282
+
283
+ def forward(
284
+ self,
285
+ hidden_states: torch.Tensor,
286
+ ) -> Tuple[torch.FloatTensor, Optional[torch.FloatTensor], Optional[Tuple[torch.FloatTensor]]]:
287
+ """
288
+ Args:
289
+ hidden_states (`Tuple[torch.FloatTensor, Optional[torch.FloatTensor]]`): input to the layer of shape `(batch, seq_len, embed_dim)`
290
+ """
291
+ hidden_states = hidden_states + self.drop_path1(self.attn(self.norm1(hidden_states).to(hidden_states.dtype)) * self.ls1)
292
+
293
+ hidden_states = hidden_states + self.drop_path2(self.mlp(self.norm2(hidden_states).to(hidden_states.dtype)) * self.ls2)
294
+
295
+ return hidden_states
296
+
297
+
298
+ class InternVisionEncoder(nn.Module):
299
+ """
300
+ Transformer encoder consisting of `config.num_hidden_layers` self attention layers. Each layer is a
301
+ [`InternEncoderLayer`].
302
+
303
+ Args:
304
+ config (`InternConfig`):
305
+ The corresponding vision configuration for the `InternEncoder`.
306
+ """
307
+
308
+ def __init__(self, config: InternVisionConfig):
309
+ super().__init__()
310
+ self.config = config
311
+ # stochastic depth decay rule
312
+ dpr = [x.item() for x in torch.linspace(0, config.drop_path_rate, config.num_hidden_layers)]
313
+ self.layers = nn.ModuleList([
314
+ InternVisionEncoderLayer(config, dpr[idx]) for idx in range(config.num_hidden_layers)])
315
+ self.gradient_checkpointing = True
316
+
317
+ def forward(
318
+ self,
319
+ inputs_embeds,
320
+ output_hidden_states: Optional[bool] = None,
321
+ return_dict: Optional[bool] = None,
322
+ ) -> Union[Tuple, BaseModelOutput]:
323
+ r"""
324
+ Args:
325
+ inputs_embeds (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`):
326
+ Embedded representation of the inputs. Should be float, not int tokens.
327
+ output_hidden_states (`bool`, *optional*):
328
+ Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors
329
+ for more detail.
330
+ return_dict (`bool`, *optional*):
331
+ Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple.
332
+ """
333
+ output_hidden_states = (
334
+ output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
335
+ )
336
+ return_dict = return_dict if return_dict is not None else self.config.use_return_dict
337
+
338
+ encoder_states = () if output_hidden_states else None
339
+ hidden_states = inputs_embeds
340
+
341
+ for idx, encoder_layer in enumerate(self.layers):
342
+ if output_hidden_states:
343
+ encoder_states = encoder_states + (hidden_states,)
344
+ if self.gradient_checkpointing and self.training:
345
+ layer_outputs = torch.utils.checkpoint.checkpoint(
346
+ encoder_layer,
347
+ hidden_states)
348
+ else:
349
+ layer_outputs = encoder_layer(
350
+ hidden_states,
351
+ )
352
+ hidden_states = layer_outputs
353
+
354
+ if output_hidden_states:
355
+ encoder_states = encoder_states + (hidden_states,)
356
+
357
+ if not return_dict:
358
+ return tuple(v for v in [hidden_states, encoder_states] if v is not None)
359
+ return BaseModelOutput(
360
+ last_hidden_state=hidden_states, hidden_states=encoder_states
361
+ )
362
+
363
+
364
+ class InternVisionModel(PreTrainedModel):
365
+ main_input_name = 'pixel_values'
366
+ # _supports_flash_attn_2 = True
367
+ supports_gradient_checkpointing = True
368
+ config_class = InternVisionConfig
369
+ _no_split_modules = ['InternVisionEncoderLayer']
370
+ # support transformers 4.51.+
371
+ _tp_plan = ''
372
+
373
+ def __init__(self, config: InternVisionConfig):
374
+ super().__init__(config)
375
+ self.config = config
376
+ # Force eager attention implementation to avoid scaled_dot_product_attention error
377
+ # self._attn_implementation = "eager"
378
+
379
+ self.embeddings = InternVisionEmbeddings(config)
380
+ self.encoder = InternVisionEncoder(config)
381
+
382
+ def resize_pos_embeddings(self, old_size, new_size, patch_size):
383
+ pos_emb = self.embeddings.position_embedding
384
+ _, num_positions, embed_dim = pos_emb.shape
385
+ cls_emb = pos_emb[:, :1, :]
386
+ pos_emb = pos_emb[:, 1:, :].reshape(1, old_size // patch_size, old_size // patch_size, -1).permute(0, 3, 1, 2)
387
+ pos_emb = F.interpolate(pos_emb.float(), size=new_size // patch_size, mode='bicubic', align_corners=False)
388
+ pos_emb = pos_emb.to(cls_emb.dtype).reshape(1, embed_dim, -1).permute(0, 2, 1)
389
+ pos_emb = torch.cat([cls_emb, pos_emb], dim=1)
390
+ self.embeddings.position_embedding = nn.Parameter(pos_emb)
391
+ self.embeddings.image_size = new_size
392
+ logger.info('Resized position embeddings from {} to {}'.format(old_size, new_size))
393
+
394
+ def get_input_embeddings(self):
395
+ return self.embeddings
396
+
397
+ def forward(
398
+ self,
399
+ pixel_values: Optional[torch.FloatTensor] = None,
400
+ output_hidden_states: Optional[bool] = None,
401
+ return_dict: Optional[bool] = None,
402
+ pixel_embeds: Optional[torch.FloatTensor] = None,
403
+ ) -> Union[Tuple, BaseModelOutputWithPooling]:
404
+ output_hidden_states = (
405
+ output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
406
+ )
407
+ return_dict = return_dict if return_dict is not None else self.config.use_return_dict
408
+
409
+ if pixel_values is None and pixel_embeds is None:
410
+ raise ValueError('You have to specify pixel_values or pixel_embeds')
411
+
412
+ if pixel_embeds is not None:
413
+ hidden_states = pixel_embeds
414
+ else:
415
+ if len(pixel_values.shape) == 4:
416
+ hidden_states = self.embeddings(pixel_values)
417
+ else:
418
+ raise ValueError(f'wrong pixel_values size: {pixel_values.shape}')
419
+ encoder_outputs = self.encoder(
420
+ inputs_embeds=hidden_states,
421
+ output_hidden_states=output_hidden_states,
422
+ return_dict=return_dict,
423
+ )
424
+ last_hidden_state = encoder_outputs.last_hidden_state
425
+ pooled_output = last_hidden_state[:, 0, :]
426
+
427
+ if not return_dict:
428
+ return (last_hidden_state, pooled_output) + encoder_outputs[1:]
429
+
430
+ return BaseModelOutputWithPooling(
431
+ last_hidden_state=last_hidden_state,
432
+ pooler_output=pooled_output,
433
+ hidden_states=encoder_outputs.hidden_states,
434
+ attentions=encoder_outputs.attentions,
435
+ )
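Note: InternVisionModel is a standalone ViT encoder (embeddings plus encoder stack); it returns the per-token hidden states and the CLS token as pooler_output. A minimal forward pass might look like the sketch below; the config values are illustrative assumptions and not the shipped checkpoint's settings:

    config = InternVisionConfig(image_size=448, patch_size=14, use_flash_attn=False)
    model = InternVisionModel(config).eval()
    pixels = torch.randn(1, 3, 448, 448)
    with torch.no_grad():
        out = model(pixel_values=pixels)
    # out.last_hidden_state: [1, 1 + (448 // 14) ** 2, hidden_size]
    # out.pooler_output:     [1, hidden_size]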
ola/model/multimodal_encoder/oryx_vit.py ADDED
@@ -0,0 +1,1075 @@
1
+ import math
2
+ import warnings
3
+ from dataclasses import dataclass
4
+ from functools import partial
5
+ from typing import (
6
+ Callable,
7
+ Dict,
8
+ Final,
9
+ List,
10
+ Literal,
11
+ Optional,
12
+ Sequence,
13
+ Set,
14
+ Tuple,
15
+ Type,
16
+ Union,
17
+ )
18
+
19
+ from torch.utils.checkpoint import checkpoint
20
+ import torch
21
+ import torch.nn as nn
22
+ import torch.nn.functional as F
23
+ try:
24
+ from timm.layers import (
25
+ AttentionPoolLatent,
26
+ DropPath,
27
+ LayerType,
28
+ Mlp,
29
+ PatchDropout,
30
+ PatchEmbed,
31
+ resample_abs_pos_embed,
32
+ )
33
+ from timm.models._manipulate import checkpoint_seq, named_apply
34
+ except:
35
+ print('Wrong timm version')
36
+
37
+ from flash_attn import flash_attn_func, flash_attn_varlen_func
38
+
39
+ from typing import Optional
40
+
41
+ import logging
42
+ import torch
43
+ import torch.nn as nn
44
+ import torch.nn.functional as F
45
+
46
+ import deepspeed
47
+ import os
48
+
49
+ def _no_grad_trunc_normal_(tensor, mean, std, a, b):
50
+ # Cut & paste from PyTorch official master until it's in a few official releases - RW
51
+ # Method based on https://people.sc.fsu.edu/~jburkardt/presentations/truncated_normal.pdf
52
+ def norm_cdf(x):
53
+ # Computes standard normal cumulative distribution function
54
+ return (1.0 + math.erf(x / math.sqrt(2.0))) / 2.0
55
+
56
+ if (mean < a - 2 * std) or (mean > b + 2 * std):
57
+ warnings.warn(
58
+ "mean is more than 2 std from [a, b] in nn.init.trunc_normal_. "
59
+ "The distribution of values may be incorrect.",
60
+ stacklevel=2,
61
+ )
62
+
63
+ with torch.no_grad():
64
+ # Values are generated by using a truncated uniform distribution and
65
+ # then using the inverse CDF for the normal distribution.
66
+ # Get upper and lower cdf values
67
+ l = norm_cdf((a - mean) / std) # noqa: E741
68
+ u = norm_cdf((b - mean) / std)
69
+
70
+ # Uniformly fill tensor with values from [l, u], then translate to
71
+ # [2l-1, 2u-1].
72
+ tensor.uniform_(2 * l - 1, 2 * u - 1)
73
+
74
+ # Use inverse cdf transform for normal distribution to get truncated
75
+ # standard normal
76
+ tensor.erfinv_()
77
+
78
+ # Transform to proper mean, std
79
+ tensor.mul_(std * math.sqrt(2.0))
80
+ tensor.add_(mean)
81
+
82
+ # Clamp to ensure it's in the proper range
83
+ tensor.clamp_(min=a, max=b)
84
+ return tensor
85
+
86
+
87
+ def trunc_normal_(tensor, mean=0.0, std=1.0, a=-2.0, b=2.0):
88
+ # type: (torch.Tensor, float, float, float, float) -> torch.Tensor
89
+ r"""The original timm.models.layers.weight_init.trunc_normal_ can not handle bfloat16 yet, here we first
90
+ convert the tensor to float32, apply the trunc_normal_() in float32, and then convert it back to its original dtype.
91
+ Fills the input Tensor with values drawn from a truncated normal distribution. The values are effectively drawn
92
+ from the normal distribution :math:`\mathcal{N}(\text{mean}, \text{std}^2)`
93
+ with values outside :math:`[a, b]` redrawn until they are within
94
+ the bounds. The method used for generating the random values works
95
+ best when :math:`a \leq \text{mean} \leq b`.
96
+ Args:
97
+ tensor: an n-dimensional `torch.Tensor`
98
+ mean: the mean of the normal distribution
99
+ std: the standard deviation of the normal distribution
100
+ a: the minimum cutoff value
101
+ b: the maximum cutoff value
102
+ Examples:
103
+ >>> w = torch.empty(3, 5)
104
+ >>> nn.init.trunc_normal_(w)
105
+ """
106
+
107
+ with torch.no_grad():
108
+ dtype = tensor.dtype
109
+ tensor_fp32 = tensor.float()
110
+ tensor_fp32 = _no_grad_trunc_normal_(tensor_fp32, mean, std, a, b)
111
+ tensor_dtype = tensor_fp32.to(dtype=dtype)
112
+ tensor.copy_(tensor_dtype)
113
+
114
+
115
+ def init_weights(self):
116
+ if self.pos_embed is not None:
117
+ trunc_normal_(self.pos_embed, std=self.pos_embed.shape[1] ** -0.5)
118
+ trunc_normal_(self.latent, std=self.latent_dim**-0.5)
119
+
120
+
121
+ def init_weights_vit_timm(module: nn.Module, name: str = "") -> None:
122
+ """ViT weight initialization, original timm impl (for reproducibility)"""
123
+ if isinstance(module, nn.Linear):
124
+ trunc_normal_(module.weight, std=0.02)
125
+ if module.bias is not None:
126
+ nn.init.zeros_(module.bias)
127
+ elif hasattr(module, "init_weights"):
128
+ module.init_weights()
129
+
130
+
131
+ class Attention(nn.Module):
132
+ fused_attn: Final[bool]
133
+
134
+ def __init__(
135
+ self,
136
+ dim: int,
137
+ num_heads: int = 8,
138
+ qkv_bias: bool = False,
139
+ qk_norm: bool = False,
140
+ attn_drop: float = 0.0,
141
+ proj_drop: float = 0.0,
142
+ norm_layer: nn.Module = nn.LayerNorm,
143
+ ) -> None:
144
+ super().__init__()
145
+ assert dim % num_heads == 0, "dim should be divisible by num_heads"
146
+ self.num_heads = num_heads
147
+ self.head_dim = dim // num_heads
148
+ self.scale = self.head_dim**-0.5
149
+ # self.fused_attn = use_fused_attn()
150
+ self.fused_attn = True
151
+
152
+ self.qkv = nn.Linear(dim, dim * 3, bias=qkv_bias)
153
+ self.q_norm = norm_layer(self.head_dim) if qk_norm else nn.Identity()
154
+ self.k_norm = norm_layer(self.head_dim) if qk_norm else nn.Identity()
155
+ self.attn_drop = nn.Dropout(attn_drop)
156
+ self.proj = nn.Linear(dim, dim)
157
+ self.proj_drop = nn.Dropout(proj_drop) if proj_drop > 0.0 else nn.Identity()
158
+
159
+ def forward(self, x: torch.Tensor, cu_slens=None) -> torch.Tensor:
160
+ B, N, C = x.shape
161
+ qkv = (
162
+ self.qkv(x)
163
+ .reshape(B, N, 3, self.num_heads, self.head_dim)
164
+ .permute(2, 0, 3, 1, 4)
165
+ )
166
+ q, k, v = qkv.unbind(0)
167
+ q, k = self.q_norm(q), self.k_norm(k)
168
+
169
+ if cu_slens is not None:
170
+ q = q.permute(0, 2, 1, 3) # B, num_heads, N, C -> B, N, num_heads, C
171
+ k = k.permute(0, 2, 1, 3)
172
+ v = v.permute(0, 2, 1, 3)
173
+ max_seqlen = torch.max(cu_slens[1:] - cu_slens[:-1]).item()
174
+ x = flash_attn_varlen_func(
175
+ q.squeeze(0),
176
+ k.squeeze(0),
177
+ v.squeeze(0),
178
+ cu_seqlens_q=cu_slens,
179
+ cu_seqlens_k=cu_slens,
180
+ max_seqlen_q=max_seqlen,
181
+ max_seqlen_k=max_seqlen,
182
+ softmax_scale=self.scale,
183
+ causal=False,
184
+ )
185
+
186
+ x = x.reshape(B, N, -1)
187
+ x = self.proj(x)
188
+ x = self.proj_drop(x)
189
+
190
+ else:
191
+ q = q.permute(0, 2, 1, 3) # B, num_heads, N, C -> B, N, num_heads, C
192
+ k = k.permute(0, 2, 1, 3)
193
+ v = v.permute(0, 2, 1, 3)
194
+ x = flash_attn_func(q, k, v, softmax_scale=self.scale) # -> b, n, h, c
195
+
196
+ x = x.reshape(B, N, -1)
197
+ x = self.proj(x)
198
+ x = self.proj_drop(x)
199
+ # if self.fused_attn:
200
+ # x = F.scaled_dot_product_attention(
201
+ # q,
202
+ # k,
203
+ # v,
204
+ # dropout_p=self.attn_drop.p if self.training else 0.0,
205
+ # )
206
+ # else:
207
+ # q = q * self.scale
208
+ # attn = q @ k.transpose(-2, -1)
209
+ # attn = attn.softmax(dim=-1)
210
+ # attn = self.attn_drop(attn)
211
+ # x = attn @ v
212
+
213
+ # x = x.transpose(1, 2).reshape(B, N, C)
214
+ # x = self.proj(x)
215
+ # x = self.proj_drop(x)
216
+ return x
217
+
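Note: when cu_slens is provided, the tokens of several images are packed along one sequence axis and flash_attn_varlen_func restricts attention to within each image using the cumulative-length boundaries. A sketch of how such boundaries relate to per-image token counts (illustrative values only; the actual construction is in forward_features_list below):

    seq_lens = [196, 256, 144]                                        # tokens per packed image
    cu_seqlens = torch.tensor([0, 196, 452, 596], dtype=torch.int32)  # running prefix sums of seq_lens
    max_seqlen = max(seq_lens)                                        # 256, passed as max_seqlen_q / max_seqlen_k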
218
+
219
+ class LayerScale(nn.Module):
220
+ def __init__(
221
+ self,
222
+ dim: int,
223
+ init_values: float = 1e-5,
224
+ inplace: bool = False,
225
+ ) -> None:
226
+ super().__init__()
227
+ self.inplace = inplace
228
+ self.gamma = nn.Parameter(init_values * torch.ones(dim))
229
+
230
+ def forward(self, x: torch.Tensor) -> torch.Tensor:
231
+ return x.mul_(self.gamma) if self.inplace else x * self.gamma
232
+
233
+
234
+ class Block(nn.Module):
235
+ def __init__(
236
+ self,
237
+ dim: int,
238
+ num_heads: int,
239
+ mlp_ratio: float = 4.0,
240
+ qkv_bias: bool = False,
241
+ qk_norm: bool = False,
242
+ proj_drop: float = 0.0,
243
+ attn_drop: float = 0.0,
244
+ init_values: Optional[float] = None,
245
+ drop_path: float = 0.0,
246
+ act_layer: nn.Module = nn.GELU,
247
+ norm_layer: nn.Module = nn.LayerNorm,
248
+ mlp_layer: nn.Module = Mlp,
249
+ ) -> None:
250
+ super().__init__()
251
+ self.norm1 = norm_layer(dim)
252
+ self.attn = Attention(
253
+ dim,
254
+ num_heads=num_heads,
255
+ qkv_bias=qkv_bias,
256
+ qk_norm=qk_norm,
257
+ attn_drop=attn_drop,
258
+ proj_drop=proj_drop,
259
+ norm_layer=norm_layer,
260
+ )
261
+ self.ls1 = (
262
+ LayerScale(dim, init_values=init_values) if init_values else nn.Identity()
263
+ )
264
+ self.drop_path1 = DropPath(drop_path) if drop_path > 0.0 else nn.Identity()
265
+
266
+ self.norm2 = norm_layer(dim)
267
+ self.mlp = mlp_layer(
268
+ in_features=dim,
269
+ hidden_features=int(dim * mlp_ratio),
270
+ act_layer=act_layer,
271
+ drop=proj_drop,
272
+ )
273
+ self.ls2 = (
274
+ LayerScale(dim, init_values=init_values) if init_values else nn.Identity()
275
+ )
276
+ self.drop_path2 = DropPath(drop_path) if drop_path > 0.0 else nn.Identity()
277
+
278
+ def forward(self, x: torch.Tensor, cu_slens=None) -> torch.Tensor:
279
+ x = x + self.drop_path1(self.ls1(self.attn(self.norm1(x), cu_slens=cu_slens)))
280
+ x = x + self.drop_path2(self.ls2(self.mlp(self.norm2(x))))
281
+ return x
282
+
283
+
284
+ class VisionTransformer(nn.Module):
285
+ """Vision Transformer
286
+
287
+ A PyTorch impl of : `An Image is Worth 16x16 Words: Transformers for Image Recognition at Scale`
288
+ - https://arxiv.org/abs/2010.11929
289
+ """
290
+
291
+ dynamic_img_size: Final[bool]
292
+
293
+ def __init__(
294
+ self,
295
+ img_size: Union[int, Tuple[int, int]] = 224,
296
+ patch_size: Union[int, Tuple[int, int]] = 16,
297
+ in_chans: int = 3,
298
+ num_classes: int = 1000,
299
+ global_pool: Literal["", "avg", "token", "map"] = "token",
300
+ embed_dim: int = 768,
301
+ depth: int = 12,
302
+ num_heads: int = 12,
303
+ mlp_ratio: float = 4.0,
304
+ qkv_bias: bool = True,
305
+ qk_norm: bool = False,
306
+ init_values: Optional[float] = None,
307
+ class_token: bool = True,
308
+ no_embed_class: bool = False,
309
+ reg_tokens: int = 0,
310
+ pre_norm: bool = False,
311
+ fc_norm: Optional[bool] = None,
312
+ dynamic_img_size: bool = False,
313
+ dynamic_img_pad: bool = False,
314
+ drop_rate: float = 0.0,
315
+ pos_drop_rate: float = 0.0,
316
+ patch_drop_rate: float = 0.0,
317
+ proj_drop_rate: float = 0.0,
318
+ attn_drop_rate: float = 0.0,
319
+ drop_path_rate: float = 0.0,
320
+ weight_init: Literal["skip", "jax", "jax_nlhb", "moco", ""] = "",
321
+ embed_layer: Callable = PatchEmbed,
322
+ norm_layer: Optional[LayerType] = None,
323
+ act_layer: Optional[LayerType] = None,
324
+ strict_img_size: bool = False,
325
+ block_fn: Type[nn.Module] = Block,
326
+ mlp_layer: Type[nn.Module] = Mlp,
327
+ ignore_head: bool = False,
328
+ add_patch2x2: bool = False,
329
+ ) -> None:
330
+ """
331
+ Args:
332
+ img_size: Input image size.
333
+ patch_size: Patch size.
334
+ in_chans: Number of image input channels.
335
+ num_classes: Number of classes for classification head.
336
+ global_pool: Type of global pooling for final sequence (default: 'token').
337
+ embed_dim: Transformer embedding dimension.
338
+ depth: Depth of transformer.
339
+ num_heads: Number of attention heads.
340
+ mlp_ratio: Ratio of mlp hidden dim to embedding dim.
341
+ qkv_bias: Enable bias for qkv projections if True.
342
+ init_values: Layer-scale init values (layer-scale enabled if not None).
343
+ class_token: Use class token.
344
+ no_embed_class: Don't include position embeddings for class (or reg) tokens.
345
+ reg_tokens: Number of register tokens.
346
+ fc_norm: Pre head norm after pool (instead of before), if None, enabled when global_pool == 'avg'.
347
+ drop_rate: Head dropout rate.
348
+ pos_drop_rate: Position embedding dropout rate.
349
+ attn_drop_rate: Attention dropout rate.
350
+ drop_path_rate: Stochastic depth rate.
351
+ weight_init: Weight initialization scheme.
352
+ embed_layer: Patch embedding layer.
353
+ norm_layer: Normalization layer.
354
+ act_layer: MLP activation layer.
355
+ block_fn: Transformer block layer.
356
+ """
357
+ super().__init__()
358
+ assert global_pool in ("", "avg", "token", "map")
359
+ assert class_token or global_pool != "token"
360
+ use_fc_norm = global_pool == "avg" if fc_norm is None else fc_norm
361
+ # norm_layer = get_norm_layer(norm_layer) or partial(nn.LayerNorm, eps=1e-6)
362
+ # act_layer = get_act_layer(act_layer) or nn.GELU
363
+ norm_layer = partial(nn.LayerNorm, eps=1e-6)
364
+ act_layer = nn.GELU
365
+
366
+ self.num_classes = num_classes
367
+ self.global_pool = global_pool
368
+ self.num_features = self.embed_dim = (
369
+ embed_dim # num_features for consistency with other models
370
+ )
371
+ self.num_prefix_tokens = 1 if class_token else 0
372
+ self.num_prefix_tokens += reg_tokens
373
+ self.num_reg_tokens = reg_tokens
374
+ self.has_class_token = class_token
375
+ self.no_embed_class = (
376
+ no_embed_class # don't embed prefix positions (includes reg)
377
+ )
378
+ self.dynamic_img_size = dynamic_img_size
379
+ self.grad_checkpointing = False
380
+ self.ignore_head = ignore_head
381
+
382
+ embed_args = {}
383
+ if dynamic_img_size:
384
+ # flatten deferred until after pos embed
385
+ embed_args.update(dict(strict_img_size=False, output_fmt="NHWC"))
386
+ self.patch_embed = embed_layer(
387
+ img_size=img_size,
388
+ patch_size=patch_size,
389
+ in_chans=in_chans,
390
+ embed_dim=embed_dim,
391
+ bias=not pre_norm, # disable bias if pre-norm is used (e.g. CLIP)
392
+ dynamic_img_pad=dynamic_img_pad,
393
+ strict_img_size=strict_img_size,
394
+ **embed_args,
395
+ )
396
+ num_patches = self.patch_embed.num_patches
397
+
398
+ self.cls_token = (
399
+ nn.Parameter(torch.zeros(1, 1, embed_dim)) if class_token else None
400
+ )
401
+ self.reg_token = (
402
+ nn.Parameter(torch.zeros(1, reg_tokens, embed_dim)) if reg_tokens else None
403
+ )
404
+ embed_len = (
405
+ num_patches if no_embed_class else num_patches + self.num_prefix_tokens
406
+ )
407
+ self.pos_embed = nn.Parameter(torch.randn(1, embed_len, embed_dim) * 0.02)
408
+
409
+
410
+ # deepspeed.zero.register_external_parameter(self, self.pos_embed)
411
+ # deepspeed.zero.register_external_parameter(self, self.patch_embed.proj.weight)
412
+ # deepspeed.zero.register_external_parameter(self, self.patch_embed.proj.bias)
413
+ # print(self.patch_embed.state_dict().keys())
414
+
415
+
416
+ self.pos_drop = nn.Dropout(p=pos_drop_rate)
417
+ if patch_drop_rate > 0:
418
+ self.patch_drop = PatchDropout(
419
+ patch_drop_rate,
420
+ num_prefix_tokens=self.num_prefix_tokens,
421
+ )
422
+ else:
423
+ self.patch_drop = nn.Identity()
424
+ self.norm_pre = norm_layer(embed_dim) if pre_norm else nn.Identity()
425
+
426
+ dpr = [
427
+ x.item() for x in torch.linspace(0, drop_path_rate, depth)
428
+ ] # stochastic depth decay rule
429
+ self.blocks = nn.Sequential(
430
+ *[
431
+ block_fn(
432
+ dim=embed_dim,
433
+ num_heads=num_heads,
434
+ mlp_ratio=mlp_ratio,
435
+ qkv_bias=qkv_bias,
436
+ qk_norm=qk_norm,
437
+ init_values=init_values,
438
+ proj_drop=proj_drop_rate,
439
+ attn_drop=attn_drop_rate,
440
+ drop_path=dpr[i],
441
+ norm_layer=norm_layer,
442
+ act_layer=act_layer,
443
+ mlp_layer=mlp_layer,
444
+ )
445
+ for i in range(depth)
446
+ ]
447
+ )
448
+
449
+
450
+ if add_patch2x2:
451
+ if add_patch2x2 == 'v2':
452
+ self.downsample = nn.Sequential(
453
+ nn.Conv2d(embed_dim, embed_dim*2, kernel_size=2, stride=2),
454
+ nn.GELU(),
455
+ nn.Conv2d(embed_dim*2, embed_dim*4, 1)
456
+ )
457
+ else:
458
+ mid_dim = embed_dim * 2
459
+ self.downsample = nn.Sequential(
460
+ nn.Conv2d(embed_dim, mid_dim, kernel_size=2, stride=2),
461
+ nn.GELU(),
462
+ nn.Conv2d(mid_dim, mid_dim, 1)
463
+ )
464
+
465
+ else:
466
+ self.downsample = None
467
+
468
+
469
+ # self.norm = norm_layer(embed_dim) if not use_fc_norm else nn.Identity()
470
+
471
+ # # Classifier Head
472
+ # if global_pool == "map":
473
+ # AttentionPoolLatent.init_weights = init_weights
474
+ # self.attn_pool = AttentionPoolLatent(
475
+ # self.embed_dim,
476
+ # num_heads=num_heads,
477
+ # mlp_ratio=mlp_ratio,
478
+ # norm_layer=norm_layer,
479
+ # )
480
+ # else:
481
+ # self.attn_pool = None
482
+ # self.fc_norm = norm_layer(embed_dim) if use_fc_norm else nn.Identity()
483
+ # self.head_drop = nn.Dropout(drop_rate)
484
+ # self.head = (
485
+ # nn.Linear(self.embed_dim, num_classes) if num_classes > 0 else nn.Identity()
486
+ # )
487
+
488
+ # if weight_init != "skip":
489
+ # self.init_weights(weight_init)
490
+
491
+ def init_weights(self, mode: Literal["jax", "jax_nlhb", "moco", ""] = "") -> None:
492
+ assert mode in ("jax", "jax_nlhb", "moco", "")
493
+ # head_bias = -math.log(self.num_classes) if "nlhb" in mode else 0.0
494
+ trunc_normal_(self.pos_embed, std=0.02)
495
+ if self.cls_token is not None:
496
+ nn.init.normal_(self.cls_token, std=1e-6)
497
+ named_apply(init_weights_vit_timm, self)
498
+
499
+ @torch.jit.ignore
500
+ def no_weight_decay(self) -> Set:
501
+ return {"pos_embed", "cls_token", "dist_token"}
502
+
503
+ @torch.jit.ignore
504
+ def group_matcher(self, coarse: bool = False) -> Dict:
505
+ return dict(
506
+ stem=r"^cls_token|pos_embed|patch_embed", # stem and embed
507
+ blocks=[(r"^blocks\.(\d+)", None), (r"^norm", (99999,))],
508
+ )
509
+
510
+ @torch.jit.ignore
511
+ def set_grad_checkpointing(self, enable: bool = True) -> None:
512
+ self.grad_checkpointing = enable
513
+
514
+ @torch.jit.ignore
515
+ def get_classifier(self) -> nn.Module:
516
+ return self.head
517
+
518
+ def reset_classifier(self, num_classes: int, global_pool=None) -> None:
519
+ self.num_classes = num_classes
520
+ if global_pool is not None:
521
+ assert global_pool in ("", "avg", "token", "map")
522
+ if global_pool == "map" and self.attn_pool is None:
523
+ assert (
524
+ False
525
+ ), "Cannot currently add attention pooling in reset_classifier()."
526
+ elif global_pool != "map" and self.attn_pool is not None:
527
+ self.attn_pool = None # remove attention pooling
528
+ self.global_pool = global_pool
529
+ self.head = (
530
+ nn.Linear(self.embed_dim, num_classes) if num_classes > 0 else nn.Identity()
531
+ )
532
+
533
+ def rescale_positional_embedding(self, out_size):
534
+ h, w = out_size
535
+ pos_embed_shape = int((self.pos_embed.shape[1]) ** 0.5)
536
+ if (h, w) == (pos_embed_shape, pos_embed_shape):
537
+ return self.pos_embed
538
+ rescaled_positional_embedding = \
539
+ self.pos_embed.new_zeros(1, h*w, self.pos_embed.shape[2])
540
+ pe_2d = self.pos_embed[0].T.contiguous().view(1, -1, pos_embed_shape, pos_embed_shape)
541
+ pe_2d = F.interpolate(pe_2d, out_size, mode='bilinear', align_corners=False).view(-1, h*w)
542
+ rescaled_positional_embedding[0] = pe_2d.T.contiguous()
543
+ return rescaled_positional_embedding
544
+
545
+ def _pos_embed(self, x: torch.Tensor) -> torch.Tensor:
546
+ if self.dynamic_img_size:
547
+ B, H, W, C = x.shape
548
+ pos_embed = resample_abs_pos_embed(
549
+ self.pos_embed,
550
+ (H, W),
551
+ num_prefix_tokens=0 if self.no_embed_class else self.num_prefix_tokens,
552
+ )
553
+ x = x.view(B, -1, C)
554
+ else:
555
+ pos_embed = self.pos_embed
556
+
557
+ to_cat = []
558
+ if self.cls_token is not None:
559
+ to_cat.append(self.cls_token.expand(x.shape[0], -1, -1))
560
+ if self.reg_token is not None:
561
+ to_cat.append(self.reg_token.expand(x.shape[0], -1, -1))
562
+
563
+ if self.no_embed_class:
564
+ # deit-3, updated JAX (big vision)
565
+ # position embedding does not overlap with class token, add then concat
566
+ x = x + pos_embed
567
+ if to_cat:
568
+ x = torch.cat(to_cat + [x], dim=1)
569
+ else:
570
+ # original timm, JAX, and deit vit impl
571
+ # pos_embed has entry for class token, concat then add
572
+ if to_cat:
573
+ x = torch.cat(to_cat + [x], dim=1)
574
+ x = x + pos_embed
575
+
576
+ return self.pos_drop(x)
577
+
578
+ def _intermediate_layers(
579
+ self,
580
+ x: torch.Tensor,
581
+ n: Union[int, Sequence] = 1,
582
+ ) -> List[torch.Tensor]:
583
+ outputs, num_blocks = [], len(self.blocks)
584
+ take_indices = set(
585
+ range(num_blocks - n, num_blocks) if isinstance(n, int) else n
586
+ )
587
+
588
+ # forward pass
589
+ x = self.patch_embed(x)
590
+ x = self._pos_embed(x)
591
+ x = self.patch_drop(x)
592
+ x = self.norm_pre(x)
593
+ for i, blk in enumerate(self.blocks):
594
+ x = blk(x)
595
+ if i in take_indices:
596
+ outputs.append(x)
597
+
598
+ return outputs
599
+
600
+ def get_intermediate_layers(
601
+ self,
602
+ x: torch.Tensor,
603
+ n: Union[int, Sequence] = 1,
604
+ reshape: bool = False,
605
+ return_prefix_tokens: bool = False,
606
+ norm: bool = False,
607
+ ) -> Tuple[Union[torch.Tensor, Tuple[torch.Tensor]]]:
608
+ """Intermediate layer accessor (NOTE: This is a WIP experiment).
609
+ Inspired by DINO / DINOv2 interface
610
+ """
611
+ # take last n blocks if n is an int; if n is a sequence, select by matching indices
612
+ outputs = self._intermediate_layers(x, n)
613
+ if norm:
614
+ outputs = [self.norm(out) for out in outputs]
615
+ prefix_tokens = [out[:, 0 : self.num_prefix_tokens] for out in outputs]
616
+ outputs = [out[:, self.num_prefix_tokens :] for out in outputs]
617
+
618
+ if reshape:
619
+ grid_size = self.patch_embed.grid_size
620
+ outputs = [
621
+ out.reshape(x.shape[0], grid_size[0], grid_size[1], -1)
622
+ .permute(0, 3, 1, 2)
623
+ .contiguous()
624
+ for out in outputs
625
+ ]
626
+
627
+ if return_prefix_tokens:
628
+ return tuple(zip(outputs, prefix_tokens))
629
+ return tuple(outputs)
630
+
631
+ def forward_features_list(self, x_list):
632
+ x_all = []
633
+ image_sizes = []
634
+ for x in x_list:
635
+ bs, _, h, w = x.shape
636
+
637
+ # fix patch size=14 in datasets
638
+ pad_h = (self.patch_embed.patch_size[0] - h % self.patch_embed.patch_size[0]) % self.patch_embed.patch_size[0]
639
+ pad_w = (self.patch_embed.patch_size[1] - w % self.patch_embed.patch_size[1]) % self.patch_embed.patch_size[1]
640
+ x = F.pad(x, (0, pad_w, 0, pad_h))
641
+
642
+ bs, _, h, w = x.shape
643
+
644
+ h = h // self.patch_embed.patch_size[0]
645
+ w = w // self.patch_embed.patch_size[1]
646
+
647
+ x = self.patch_embed(x)
648
+ # x = self._pos_embed(x)
649
+ x = x + self.rescale_positional_embedding(out_size=(h, w))
650
+ x = self.patch_drop(x)
651
+ x = self.norm_pre(x)
652
+ x_all.append(x)
653
+ image_sizes.append((h, w))
654
+
655
+ slen = [xi.size(1) for xi in x_all]
656
+ x = torch.cat(x_all, dim=1)
657
+
658
+ cu_indices = [0, ]
659
+ for i in slen:
660
+ cu_indices.append(cu_indices[-1] + i)
661
+
662
+ cu_slens = torch.tensor(cu_indices, dtype=torch.int32).to(x.device)
663
+ for idx, blk in enumerate(self.blocks):
664
+ if self.grad_checkpointing and not torch.jit.is_scripting():
665
+ x = checkpoint(blk, x, cu_slens, use_reentrant=True)
666
+ else:
667
+ x = blk(x, cu_slens=cu_slens)
668
+ feats = x.split(slen, dim=1) #[(1, slen, c)]
669
+
670
+ if self.downsample is not None:
671
+ new_feats = []
672
+ new_sizes = []
673
+ for f, s in zip(feats, image_sizes):
674
+ h, w = s
675
+ b, n, c = f.size()
676
+ f = f.reshape(b, h, w, c).permute(0, 3, 1, 2)
677
+ f = self.downsample(f)
678
+ b, c, h, w = f.size()
679
+ f = f.permute(0, 2, 3, 1).reshape(b, h*w, c)
680
+ new_feats.append(f)
681
+ new_sizes.append((h, w))
682
+ return new_feats, new_sizes
683
+
684
+
685
+ return feats, image_sizes
686
+
687
+ def forward_features(self, x: torch.Tensor) -> torch.Tensor:
688
+ bs, _, h, w = x.shape
689
+ h = h // self.patch_embed.patch_size[0]
690
+ w = w // self.patch_embed.patch_size[1]
691
+
692
+ x = self.patch_embed(x)
693
+ # x = self._pos_embed(x)
694
+ x = x + self.rescale_positional_embedding(out_size=(h, w))
695
+ x = self.patch_drop(x)
696
+ x = self.norm_pre(x)
697
+ if self.grad_checkpointing and not torch.jit.is_scripting():
698
+ x = checkpoint_seq(self.blocks, x)
699
+ else:
700
+ x = self.blocks(x)
701
+
702
+ if self.downsample is not None:
703
+ b, n, c = x.size()
704
+ x = x.reshape(b, h, w, c).permute(0, 3, 1, 2)
705
+ x = self.downsample(x)
706
+ b, c, h, w = x.size()
707
+ x = x.permute(0, 2, 3, 1).reshape(b, h*w, c)
708
+ new_feats = x
709
+ new_sizes = (h, w)
710
+ return new_feats, new_sizes
711
+
712
+ return x, (h, w)
713
+
714
+ def forward_head(self, x: torch.Tensor, pre_logits: bool = False) -> torch.Tensor:
715
+ x = self.norm(x)
716
+ if self.attn_pool is not None:
717
+ x = self.attn_pool(x)
718
+ elif self.global_pool == "avg":
719
+ x = x[:, self.num_prefix_tokens :].mean(dim=1)
720
+ elif self.global_pool:
721
+ x = x[:, 0] # class token
722
+ x = self.fc_norm(x)
723
+ x = self.head_drop(x)
724
+ return x if pre_logits else self.head(x)
725
+
726
+ def forward(self, x, cal_attn_pool=False):
727
+ # import pdb;pdb.set_trace()
728
+ if type(x) is list:
729
+ x, image_sizes = self.forward_features_list(x)
730
+ return x, image_sizes, None
731
+ else:
732
+ x, image_sizes = self.forward_features(x)
733
+ return x, image_sizes, None
734
+
735
+ @dataclass
736
+ class SigLIPVisionCfg:
737
+ width: int = 1152
738
+ layers: Union[Tuple[int, int, int, int], int] = 27
739
+ heads: int = 16
740
+ patch_size: int = 14
741
+ image_size: Union[Tuple[int, int], int] = 336
742
+ global_pool: str = "map"
743
+ mlp_ratio: float = 3.7362
744
+ class_token: bool = False
745
+ num_classes: int = 0
746
+ use_checkpoint: bool = False
747
+
748
+
749
+ SigLIP_MODEL_CONFIG = {
750
+ "siglip_so400m_patch14_384": {
751
+ "image_size": 384,
752
+ "patch_size": 14,
753
+ "width": 1152,
754
+ "layers": 27,
755
+ "heads": 16,
756
+ "mlp_ratio": 3.7362,
757
+ "global_pool": "map",
758
+ "use_checkpoint": False,
759
+ },
760
+ "siglip_so400m_patch16_384": {
761
+ "image_size": 384,
762
+ "patch_size": 16,
763
+ "width": 1152,
764
+ "layers": 27,
765
+ "heads": 16,
766
+ "mlp_ratio": 3.7362,
767
+ "global_pool": "map",
768
+ "use_checkpoint": False,
769
+ },
770
+ "siglip_so400m_patch14_224": {
771
+ "image_size": 224,
772
+ "patch_size": 14,
773
+ "width": 1152,
774
+ "layers": 27,
775
+ "heads": 16,
776
+ "mlp_ratio": 3.7362,
777
+ "global_pool": "map",
778
+ "use_checkpoint": False,
779
+ },
780
+ "siglip_large_patch16_384": {
781
+ "image_size": 384,
782
+ "patch_size": 16,
783
+ "width": 1024,
784
+ "layers": 24,
785
+ "heads": 16,
786
+ "mlp_ratio": 4,
787
+ "global_pool": "map",
788
+ "use_checkpoint": False,
789
+ },
790
+ }
791
+
792
+
793
+ def resize_evaclip_pos_embed(model: VisionTransformer, interpolation: str = 'bicubic'):
794
+ # interpolate position embedding
795
+ orig_size = 24
796
+ new_size = 128
797
+ pos_tokens = model.pos_embed
798
+ pos_tokens = pos_tokens.reshape(-1, orig_size, orig_size, model.embed_dim).permute(0, 3, 1, 2)
799
+ pos_tokens = torch.nn.functional.interpolate(
800
+ pos_tokens, size=(new_size, new_size), mode=interpolation, align_corners=False)
801
+ pos_tokens = pos_tokens.permute(0, 2, 3, 1).flatten(1, 2)
802
+ model.pos_embed = nn.Parameter(pos_tokens, requires_grad=True)
803
+ return model
804
+
805
+ def create_siglip_vit(
806
+ model_name: str = "siglip_so400m_patch14_384",
807
+ image_size: int = 384,
808
+ select_layer: int = -1,
809
+ path: str = "",
810
+ gradient_checkpointing: bool = False,
811
+ **kwargs,
812
+ ):
813
+ assert (
814
+ model_name in SigLIP_MODEL_CONFIG.keys()
815
+ ), f"model name should be in {SigLIP_MODEL_CONFIG.keys()}"
816
+
817
+ vision_cfg = SigLIPVisionCfg(**SigLIP_MODEL_CONFIG[model_name])
818
+
819
+ if select_layer <= 0:
820
+ layers = min(vision_cfg.layers, vision_cfg.layers + select_layer + 1)
821
+ else:
822
+ layers = min(vision_cfg.layers, select_layer)
823
+
824
+
825
+
826
+ if 'patch2x2' in path or 'patch4x4' in path:
827
+ add_patch2x2 = True
828
+ else:
829
+ add_patch2x2 = False
830
+
831
+ if 'patch4x4pool' in path or 'patch2x2from4x4' in path:
832
+ add_patch2x2 = 'v2'
833
+
834
+ if FORCE_NO_DOWNSAMPLE:
835
+ add_patch2x2 = False
836
+
837
+ model = VisionTransformer(
838
+ img_size=2048,
839
+ patch_size=16,
840
+ embed_dim=vision_cfg.width,
841
+ depth=layers,
842
+ num_heads=vision_cfg.heads,
843
+ mlp_ratio=vision_cfg.mlp_ratio,
844
+ class_token=vision_cfg.class_token,
845
+ global_pool=vision_cfg.global_pool,
846
+ dynamic_img_pad=False,
847
+ strict_img_size=False,
848
+ ignore_head=kwargs.get("ignore_head", False),
849
+ weight_init=kwargs.get("weight_init", "skip"),
850
+ num_classes=0,
851
+ add_patch2x2=add_patch2x2
852
+ )
853
+
854
+ if gradient_checkpointing:
855
+ model.set_grad_checkpointing(True)
856
+ return model
857
+
858
+ import os
859
+ if 'LOAD_VISION_EARLY' in os.environ:
860
+ print("LOAD_VISION_EARLY is set")
861
+ LOAD_VISION_EARLY = True
862
+ else:
863
+ LOAD_VISION_EARLY = False
864
+
865
+ if 'VIT_WITH_GRAD' in os.environ:
866
+ print("VIT_WITH_GRAD is set")
867
+ VIT_WITH_GRAD = True
868
+ else:
869
+ VIT_WITH_GRAD = False
870
+
871
+ if 'FIX_SIZE' in os.environ:
872
+ print("FIX_SIZE is set")
873
+ FIX_SIZE = True
874
+ else:
875
+ FIX_SIZE = False
876
+
877
+ if 'ANYRES_SPLIT' in os.environ:
878
+ ANYRES_SPLIT = int(os.environ['ANYRES_SPLIT'])
879
+ print(f"ANYRES_SPLIT is set as {ANYRES_SPLIT}")
880
+ else:
881
+ ANYRES_SPLIT = None
882
+
883
+
884
+ if 'FORCE_NO_DOWNSAMPLE' in os.environ:
885
+ print("FORCE_NO_DOWNSAMPLE is set")
886
+ FORCE_NO_DOWNSAMPLE = True
887
+ else:
888
+ FORCE_NO_DOWNSAMPLE = False
889
+
890
+ from transformers import CLIPImageProcessor
891
+ import torch.distributed as dist
892
+
893
+ class SigLIPViTAnysizeWrapper(nn.Module):
894
+ def __init__(self, vision_tower, path, args, delay_load=False):
895
+ super().__init__()
896
+
897
+ self.is_loaded = False
898
+
899
+ self.vision_tower_name = vision_tower
900
+ self.args = args
901
+ self.path = path
902
+
903
+ self.select_layer = -1
904
+ if self.select_layer < -1: self.select_layer += 1
905
+ self.select_feature = getattr(args, 'mm_vision_select_feature', 'patch')
906
+
907
+ self.output_dim = 1152
908
+ if not FORCE_NO_DOWNSAMPLE:
909
+ if 'patch2x2' in path or 'patch4x4' in path:
910
+ self.output_dim = 1152*2
911
+
912
+ if 'patch4x4pool' in path or 'patch2x2from4x4' in path:
913
+ self.output_dim = 1152*4
914
+
915
+ if not delay_load or LOAD_VISION_EARLY:
916
+ self.load_model()
917
+ elif getattr(args, "unfreeze_mm_vision_tower", False):
918
+ # TODO: better detector is needed.
919
+ print(f"The checkpoint seems to contain `vision_tower` weights: `unfreeze_mm_vision_tower`: True.")
920
+ self.load_model()
921
+
922
+ def load_model(self, device_map=None):
923
+ if self.is_loaded:
924
+ print('{} is already loaded, `load_model` called again, skipping.'.format(self.vision_tower_name))
925
+ return
926
+
927
+ self.image_processor = CLIPImageProcessor.from_pretrained("/data1/cxy/model/openai/clip-vit-large-patch14")
928
+ if self.args.mm_projector_type == "conv_mlp" or self.args.mm_projector_type == "multipath_conv_mlp" or self.args.mm_projector_type == "multipath_conv_mlp_woconv":
929
+ self.image_processor.crop_size['height'] = 384
930
+ self.image_processor.crop_size['width'] = 384
931
+ self.image_processor.size['shortest_edge'] = 384
932
+ print("Resizeing clip processor to 384...")
933
+ self.image_processor.image_mean = [0.5, 0.5, 0.5]
934
+ self.image_processor.image_std = [0.5, 0.5, 0.5]
935
+ print("Loading vision model...")
936
+ if VIT_WITH_GRAD:
937
+ self.vision_tower = create_siglip_vit(path=self.path, model_name='siglip_so400m_patch16_384',
938
+ gradient_checkpointing=True)
939
+ self.vision_tower.train()
940
+ else:
941
+ self.vision_tower = create_siglip_vit(path=self.path, model_name='siglip_so400m_patch16_384',
942
+ gradient_checkpointing=False)
943
+ for p in self.vision_tower.parameters():
944
+ p.requires_grad = False
945
+ self.vision_tower.eval()
946
+ self.is_loaded = True
947
+
948
+ def train(self, mode = True):
949
+ self.training = mode
950
+
951
+ if self.is_loaded and not VIT_WITH_GRAD:
952
+ self.vision_tower.eval()
953
+
954
+ def split_images(self, images, split_res=512, base_size=32):
955
+ split_images = []
956
+ sub_images_info = []
957
+ for image in images:
958
+ now_sub_images = []
959
+ _, c, h, w = image.shape
960
+ if h * w <= split_res * split_res:
961
+ split_images.append(image)
962
+ sub_images_info.append(
963
+ (
964
+ 1, 1, 1, h // base_size, w // base_size, [(0, h // base_size, 0, w // base_size)]
965
+ )
966
+ )
967
+ continue
968
+ nsplit_h = math.ceil(h / split_res)
969
+ nsplit_w = math.ceil(w / split_res)
970
+ sub_h = int(h / nsplit_h / base_size) * base_size
971
+ sub_w = int(w / nsplit_w / base_size) * base_size
972
+ crop_infos = []
973
+ for i in range(nsplit_h):
974
+ for j in range(nsplit_w):
975
+ begin_h = i * sub_h
976
+ begin_w = j * sub_w
977
+
978
+ if i == nsplit_h - 1:
979
+ end_h = h
980
+ else:
981
+ end_h = (i + 1) * sub_h
982
+
983
+ if j == nsplit_w - 1:
984
+ end_w = w
985
+ else:
986
+ end_w = (j + 1) * sub_w
987
+
988
+ assert (end_h - begin_h) % base_size == 0 and (end_w - begin_w) % base_size == 0
989
+
990
+ sub_image = image[:, :, begin_h:end_h, begin_w:end_w]
991
+ now_sub_images.append(sub_image)
992
+ crop_infos.append(
993
+ (begin_h // base_size, end_h // base_size, begin_w // base_size, end_w // base_size)
994
+ )
995
+
996
+ split_images += now_sub_images
997
+ sub_images_info.append(
998
+ (
999
+ len(now_sub_images), nsplit_h, nsplit_w, h // base_size, w // base_size, crop_infos
1000
+ )
1001
+ )
1002
+
1003
+ return split_images, sub_images_info
1004
+
1005
+
1006
+ def unsplit_images(self, features, sizes, sub_images_info):
1007
+ new_features = []
1008
+ for feature, size in zip(features, sizes):
1009
+ h, w = size
1010
+ new_features.append(
1011
+ feature.reshape(1, h, w, -1)
1012
+ )
1013
+
1014
+ fused_images = []
1015
+ images_sizes = []
1016
+ sub_count = 0
1017
+ for n_split, nsplit_h, nsplit_w, total_h, total_w, crop_infos in sub_images_info:
1018
+ sub_features = new_features[sub_count:sub_count+n_split]
1019
+ sub_count += n_split
1020
+
1021
+ total_feature = new_features[0].new_zeros(1, total_h, total_w, self.hidden_size)
1022
+ for feature, (begin_h, end_h, begin_w, end_w) in zip(sub_features, crop_infos):
1023
+ total_feature[:, begin_h:end_h, begin_w:end_w] += feature
1024
+
1025
+ fused_images.append(total_feature.reshape(1, total_h * total_w, self.hidden_size))
1026
+ images_sizes.append((total_h, total_w))
1027
+
1028
+ return fused_images, images_sizes
1029
+
1030
+
1031
+
1032
+ def forward_func(self, images, force_fix_size=False, cal_attn_pool=False):
1033
+ if type(images) is list:
1034
+ xs = [x.to(self.dtype) for x in images]
1035
+ image_features, img_size, cls_token = self.vision_tower(xs, cal_attn_pool=cal_attn_pool)
1036
+ image_features = [x.to(images[0].dtype) for x in image_features]
1037
+
1038
+ else:
1039
+ image_forward_outs, img_size, cls_token = self.vision_tower(images.to(self.dtype), cal_attn_pool=cal_attn_pool)
1040
+ image_features = image_forward_outs.to(images.dtype)
1041
+
1042
+ return image_features, img_size, cls_token
1043
+
1044
+ def forward(self, images, cal_attn_pool=False):
1045
+ if VIT_WITH_GRAD:
1046
+ image_features, img_size, cls_token = self.forward_func(images, cal_attn_pool=cal_attn_pool)
1047
+ return image_features, img_size
1048
+ else:
1049
+ with torch.no_grad():
1050
+ image_features, img_size, cls_token = self.forward_func(images, cal_attn_pool=cal_attn_pool)
1051
+ return image_features, img_size
1052
+
1053
+
1054
+ @property
1055
+ def dummy_feature(self):
1056
+ return torch.zeros(1, 1152, device=self.device, dtype=self.dtype)
1057
+
1058
+ @property
1059
+ def dtype(self):
1060
+ return self.vision_tower.pos_embed.dtype
1061
+
1062
+ @property
1063
+ def device(self):
1064
+ return self.vision_tower.pos_embed.device
1065
+
1066
+ @property
1067
+ def hidden_size(self):
1068
+ return self.output_dim
1069
+
1070
+ @property
1071
+ def config(self):
1072
+ return type('LLaVAConfigWrapper', (), {
1073
+ # 'image_size': 224,
1074
+ 'patch_size': 16,
1075
+ })()
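Note: the SigLIP tower's behaviour is controlled by environment variables read once at module import time (LOAD_VISION_EARLY, VIT_WITH_GRAD, FIX_SIZE, ANYRES_SPLIT, FORCE_NO_DOWNSAMPLE), so they must be set before this file is imported. A minimal sketch, with illustrative values chosen as assumptions:

    import os
    os.environ['VIT_WITH_GRAD'] = '1'          # presence of the key keeps the ViT trainable (gradient checkpointing on)
    os.environ['FORCE_NO_DOWNSAMPLE'] = '1'    # skip the 2x2 conv downsample head; output_dim stays 1152
    from ola.model.multimodal_encoder.oryx_vit import SigLIPViTAnysizeWrapper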
ola/model/multimodal_projector/__pycache__/builder.cpython-312.pyc ADDED
Binary file (10.3 kB).
 
ola/model/multimodal_projector/__pycache__/internvl_projector.cpython-312.pyc ADDED
Binary file (2.14 kB).
 
ola/model/multimodal_projector/__pycache__/pooler_projector.cpython-312.pyc ADDED
Binary file (4.68 kB).
 
ola/model/multimodal_projector/builder.py ADDED
@@ -0,0 +1,177 @@
1
+ import torch
2
+ import torch.nn as nn
3
+ import re
4
+
5
+ import math
6
+
7
+ from .pooler_projector import NormalizedDwPooler
8
+ from .internvl_projector import InternVLMultiModalProjector
9
+ import os
10
+ import math
11
+
12
+ class IdentityMap(nn.Module):
13
+ def __init__(self):
14
+ super().__init__()
15
+
16
+ def forward(self, x, *args, **kwargs):
17
+ return x
18
+
19
+ @property
20
+ def config(self):
21
+ return {"mm_projector_type": 'identity'}
22
+
23
+
24
+ class SimpleResBlock(nn.Module):
25
+ def __init__(self, channels):
26
+ super().__init__()
27
+ self.pre_norm = nn.LayerNorm(channels)
28
+
29
+ self.proj = nn.Sequential(
30
+ nn.Linear(channels, channels),
31
+ nn.GELU(),
32
+ nn.Linear(channels, channels)
33
+ )
34
+ def forward(self, x):
35
+ x = self.pre_norm(x)
36
+ return x + self.proj(x)
37
+
38
+ class OlaMLP(nn.Module):
39
+ def __init__(self, in_channels, out_channels, twoview=False):
40
+ super().__init__()
41
+
42
+ self.proj1 = nn.Linear(in_channels, out_channels)
43
+ self.proj2 = nn.Linear(out_channels, out_channels)
44
+ self.act = nn.GELU()
45
+ self.pooler = NormalizedDwPooler(out_channels)
46
+
47
+ embed_std = 1 / math.sqrt(out_channels)
48
+ self.image_newline = nn.Parameter(
49
+ torch.randn(out_channels) * embed_std
50
+ )
51
+ self.image_begin = nn.Parameter(
52
+ torch.randn(out_channels) * embed_std
53
+ )
54
+ self.image_end = nn.Parameter(
55
+ torch.randn(out_channels) * embed_std
56
+ )
57
+
58
+ if twoview:
59
+ self.image_sep = nn.Parameter(
60
+ torch.randn(out_channels) * embed_std
61
+ )
62
+
63
+ def forward(self, x, size=(16,16), x2=None, size2=(16, 16), modalities='image'):
64
+
65
+ if modalities in ['image', 'text']:
66
+ h, w = size
67
+ dtype = x.dtype
68
+ x = x.reshape(x.shape[0], h, w, -1)
69
+ x = self.proj1(x)
70
+ x = self.pooler(x, forward_type='2x')
71
+ x = self.act(x)
72
+ x = self.proj2(x)
73
+
74
+
75
+ b, h, w, c = x.shape
76
+ x = torch.cat([
77
+ x,
78
+ self.image_newline.reshape(1, 1, 1, c).expand(b, h, 1, c).to(dtype)
79
+ ], dim=2)
80
+ x = x.reshape(b, -1, c)
81
+
82
+ if x2 is not None:
83
+ h2, w2 = size2
84
+ x2 = x2.reshape(x2.shape[0], h2, w2, -1)
85
+ x2 = self.proj1(x2)
86
+ x2 = self.pooler(x2, forward_type='2x')
87
+ x2 = self.act(x2)
88
+ x2 = self.proj2(x2)
89
+
90
+ b2, h2, w2, c2 = x2.shape
91
+ x2 = torch.cat([
92
+ x2,
93
+ self.image_newline.reshape(1, 1, 1, c).expand(b, h2, 1, c).to(dtype)
94
+ ], dim=2)
95
+ x2 = x2.reshape(b, -1, c)
96
+ sep = self.image_sep.reshape(1, 1, -1).expand(b, 1, c2).to(dtype)
97
+ x = torch.cat([x, sep, x2], dim=1)
98
+
99
+ begin = self.image_begin.reshape(1, 1, -1).expand(b, 1, c).to(dtype)
100
+ end = self.image_end.reshape(1, 1, -1).expand(b, 1, c).to(dtype)
101
+ x = torch.cat([begin, x, end], dim=1)
102
+ return x
103
+ elif modalities in ['video']:
104
+ # x2 is the true feature, ignore x
105
+ h, w = size
106
+ dtype = x.dtype
107
+ x = x.reshape(x.shape[0], h, w, -1)
108
+ x1 = self.proj1(x)
109
+ x1 = self.pooler(x1, forward_type='2x')
110
+ x1 = self.proj2(x1).mean() * 0.0
111
+
112
+ h2, w2 = size2
113
+ x2 = x2.reshape(x2.shape[0], h2, w2, -1)
114
+ x2 = self.proj1(x2)
115
+ x2 = self.pooler(x2, forward_type='2x')
116
+ x2 = self.act(x2)
117
+ x2 = self.proj2(x2)
118
+
119
+ b2, h2, w2, c = x2.shape
120
+ x2 = torch.cat([
121
+ x2,
122
+ self.image_newline.reshape(1, 1, 1, c).expand(b2, h2, 1, c).to(dtype)
123
+ ], dim=2)
124
+
125
+ x2 = x2.reshape(b2, -1, c)
126
+
127
+ sep = self.image_sep.reshape(1, 1, -1).expand(b2, 1, c).to(dtype)
128
+ x2 = torch.cat([x2, sep], dim=1)
129
+
130
+ x2 = x2.flatten(0, 1)
131
+
132
+ begin = self.image_begin.reshape(1, -1).expand(1, c).to(dtype)
133
+ end = self.image_end.reshape(1, -1).expand(1, c).to(dtype)
134
+ x2 = torch.cat([begin, x2, end], dim=0)
135
+ x2 = x2.unsqueeze(0)
136
+ return x2
137
+ else:
138
+ raise ValueError(f'Unknown modalities: {modalities}')
139
+
140
+ def build_vision_projector(config, delay_load=False, **kwargs):
141
+ projector_type = getattr(config, 'mm_projector_type', 'linear')
142
+
143
+ if projector_type == 'linear':
144
+ return nn.Linear(config.mm_hidden_size, config.hidden_size)
145
+
146
+ elif projector_type == 'ola_mlp':
147
+ return OlaMLP(config.mm_hidden_size, config.hidden_size, twoview=True)
148
+
149
+ elif projector_type == 'ola_internvl':
150
+ # breakpoint()
151
+ return InternVLMultiModalProjector(config)
152
+
153
+ mlp_gelu_match = re.match(r'^mlp(\d+)x_gelu$', projector_type)
154
+ if mlp_gelu_match:
155
+ mlp_depth = int(mlp_gelu_match.group(1))
156
+ modules = [nn.Linear(config.mm_hidden_size, config.hidden_size)]
157
+ for _ in range(1, mlp_depth):
158
+ modules.append(nn.GELU())
159
+ modules.append(nn.Linear(config.hidden_size, config.hidden_size))
160
+ return nn.Sequential(*modules)
161
+
162
+ mlp_gelu_resnet_match = re.match(r'^mlp(\d+)x_res(\d+)x_gelu$', projector_type)
163
+ if mlp_gelu_resnet_match:
164
+ mlp_depth = int(mlp_gelu_resnet_match.group(1))
165
+ res_depth = int(mlp_gelu_resnet_match.group(2))
166
+ modules = [nn.Linear(config.mm_hidden_size, config.hidden_size)]
167
+ for _ in range(1, mlp_depth):
168
+ modules.append(nn.GELU())
169
+ modules.append(nn.Linear(config.hidden_size, config.hidden_size))
170
+ for _ in range(res_depth):
171
+ modules.append(SimpleResBlock(config.hidden_size))
172
+ return nn.Sequential(*modules)
173
+
174
+ if projector_type == 'identity':
175
+ return IdentityMap()
176
+
177
+ raise ValueError(f'Unknown projector type: {projector_type}')
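Note: build_vision_projector dispatches on config.mm_projector_type; the regex branch accepts names such as 'mlp2x_gelu' (N stacked Linear layers with GELUs in between). A minimal sketch with a stand-in config object (the SimpleNamespace and its field values are illustrative assumptions):

    from types import SimpleNamespace
    cfg = SimpleNamespace(mm_projector_type='mlp2x_gelu', mm_hidden_size=1152, hidden_size=3584)
    projector = build_vision_projector(cfg)
    feats = torch.randn(1, 576, cfg.mm_hidden_size)
    print(projector(feats).shape)               # torch.Size([1, 576, 3584])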