Spaces:

openbmb
/

MiniCPM-V-4_5-Demo

Running on Zero

File size: 79,508 Bytes

7997f38

import os
import json
import uuid
import time
import copy
import base64
import logging
import argparse
import math
import multiprocessing as mp
from io import BytesIO
from typing import Generator, Any, Dict, Optional

import spaces
import torch
import gradio as gr
import numpy as np
from PIL import Image
from decord import VideoReader, cpu
from scipy.spatial import cKDTree
# import modelscope_studio as mgr

# 导入模型相关模块
try:
    from models import ModelMiniCPMV4_5
except ImportError:
    print("Warning: models module not found. Please ensure models.py is available.")
    class ModelMiniCPMV4_5:
        def __init__(self, model_path):
            self.model_path = model_path
            self.model = None
            
        def __call__(self, query):
            return "Model not loaded", 0

# 全局配置
ERROR_MSG = "Error, please retry"
model_name = 'MiniCPM-V 4.5'
disable_text_only = False  # 允许纯文本消息，便于测试
DOUBLE_FRAME_DURATION = 30
MAX_NUM_FRAMES = 180
MAX_NUM_PACKING = 3
TIME_SCALE = 0.1
IMAGE_EXTENSIONS = {'.jpg', '.jpeg', '.png', '.bmp', '.tiff', '.webp'}
VIDEO_EXTENSIONS = {'.mp4', '.mkv', '.mov', '.avi', '.flv', '.wmv', '.webm', '.m4v'}

ENABLE_PARALLEL_ENCODING = True
PARALLEL_PROCESSES = None

# 全局模型实例
global_model = None

# 日志配置
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)


# 全局模型配置
model_config = {
    'model_path': None,
    'model_type': None,
    'instance_id': 0
}

# 全局模型缓存（在GPU进程中）
_gpu_model_cache = None

def _initialize_gpu_model():
    """在GPU进程中获取模型并移到GPU"""
    global _gpu_model_cache
    if _gpu_model_cache is None:
        logger.info(f"在GPU进程中初始化模型: {model_config['model_type']}")
        
        match model_config['model_type'].lower():
            case 'minicpmv4_5':
                _gpu_model_cache = ModelMiniCPMV4_5(model_config['model_path'])
            case _:
                raise ValueError(f"Unsupported model type: {model_config['model_type']}")
        
        logger.info(f"模型在CPU上初始化完成")
    
    # 每次推理时将模型移到GPU
    if hasattr(_gpu_model_cache, 'model') and hasattr(_gpu_model_cache.model, 'to'):
        logger.info("将模型移到GPU...")
        _gpu_model_cache.model.to('cuda')
    elif hasattr(_gpu_model_cache, 'model') and hasattr(_gpu_model_cache.model, 'model') and hasattr(_gpu_model_cache.model.model, 'to'):
        logger.info("将模型移到GPU（嵌套模型）...")
        _gpu_model_cache.model.model.to('cuda')
    
    return _gpu_model_cache

@spaces.GPU
def gpu_handler(query):
    """GPU推理处理器"""
    model = _initialize_gpu_model()
    
    res, output_tokens = model({
        "image": query["image"],
        "question": query["question"],
        "params": query.get("params", "{}"),
        "temporal_ids": query.get("temporal_ids", None)
    })
    return {
        "result": res,
        "usage": {"output_tokens": output_tokens}
    }

@spaces.GPU  
def gpu_stream_handler(query):
    """GPU流式推理处理器"""
    model = _initialize_gpu_model()
    
    params = json.loads(query.get("params", "{}"))
    params["stream"] = True
    query["params"] = json.dumps(params)
    
    try:
        generator = model({
            "image": query["image"],
            "question": query["question"],
            "params": query["params"],
            "temporal_ids": query.get("temporal_ids", None)
        })
        
        # 收集生成器的所有输出，避免序列化问题
        full_response = ""
        for chunk in generator:
            full_response += chunk
        
        return full_response
    except Exception as e:
        logger.error(f"GPU stream handler error: {e}")
        return f"Stream error: {str(e)}"

class Model:
    """模型封装类，不持有实际模型对象"""
    
    def __init__(self, model_path: str, model_type: str, instance_id: int = 0):
        self.instance_id = instance_id
        self.model_path = model_path
        self.model_type = model_type
        
        # 设置全局配置
        model_config['model_path'] = model_path
        model_config['model_type'] = model_type
        model_config['instance_id'] = instance_id
        
        logger.info(f"实例 {instance_id}: 配置模型类型 {model_type}")
        logger.info(f"实例 {instance_id}: 模型路径 {model_path}")

    def handler(self, query):
        """非流式推理处理器"""
        return gpu_handler(query)

    def stream_handler(self, query):
        """流式推理处理器"""
        return gpu_stream_handler(query)


def initialize_model():
    """初始化全局模型"""
    global global_model, _gpu_model_cache
    
    # 默认配置
    model_path = os.getenv('MODEL_PATH', 'openbmb/MiniCPM-V-4_5')
    model_type = os.getenv('MODEL_TYPE', 'minicpmv4_5')
    
    logger.info(f"="*50)
    logger.info(f"启动MiniCPM-V服务")
    logger.info(f"模型路径: {model_path}")
    logger.info(f"模型类型: {model_type}")
    logger.info(f"="*50)

    # 创建模型封装类
    global_model = Model(model_path, model_type, 0)
    
    # 在主进程中预加载模型到CPU（可选，为了更快的首次推理）
    try:
        logger.info("在主进程中预加载模型到CPU...")
        match model_type.lower():
            case 'minicpmv4_5':
                _gpu_model_cache = ModelMiniCPMV4_5(model_path)
            case _:
                raise ValueError(f"Unsupported model type: {model_type}")
        
        logger.info("模型在主进程CPU上预加载完成")
    except Exception as e:
        logger.warning(f"主进程预加载模型失败，将在GPU进程中加载: {e}")
        _gpu_model_cache = None
    
    return global_model


# 工具函数
def get_file_extension(filename):
    return os.path.splitext(filename)[1].lower()


def is_image(filename):
    return get_file_extension(filename) in IMAGE_EXTENSIONS


def is_video(filename):
    return get_file_extension(filename) in VIDEO_EXTENSIONS


def map_to_nearest_scale(values, scale):
    tree = cKDTree(np.asarray(scale)[:, None])
    _, indices = tree.query(np.asarray(values)[:, None])
    return np.asarray(scale)[indices]


def group_array(arr, size):
    return [arr[i:i+size] for i in range(0, len(arr), size)]


def encode_image(image):
    """编码单张图片"""
    if not isinstance(image, Image.Image):
        if hasattr(image, 'path'):
            image = Image.open(image.path)
        elif hasattr(image, 'file') and hasattr(image.file, 'path'):
            image = Image.open(image.file.path)
        elif hasattr(image, 'name'):
            image = Image.open(image.name)
        else:
            image_path = getattr(image, 'url', getattr(image, 'orig_name', str(image)))
            image = Image.open(image_path)
    
    # 调整图片大小
    max_size = 448*16
    if max(image.size) > max_size:
        w, h = image.size
        if w > h:
            new_w = max_size
            new_h = int(h * max_size / w)
        else:
            new_h = max_size
            new_w = int(w * max_size / h)
        image = image.resize((new_w, new_h), resample=Image.BICUBIC)
    
    # 转换为base64
    buffered = BytesIO()
    image.save(buffered, format="png")
    im_b64 = base64.b64encode(buffered.getvalue()).decode()
    return [{"type": "image", "pairs": im_b64}]


def encode_image_parallel(image_data):
    """并行图片编码包装函数"""
    try:
        return encode_image(image_data)
    except Exception as e:
        print(f"[Parallel encoding error] Image encoding failed: {e}")
        return None


def encode_images_parallel(frames, num_processes=None):
    """多进程并行图片编码"""
    if not ENABLE_PARALLEL_ENCODING:
        print(f"[Parallel encoding] Parallel encoding disabled, using serial processing")
        encoded_frames = []
        for frame in frames:
            encoded = encode_image(frame)
            if encoded:
                encoded_frames.extend(encoded)
        return encoded_frames
    
    if num_processes is None:
        cpu_cores = mp.cpu_count()
        if PARALLEL_PROCESSES:
            num_processes = PARALLEL_PROCESSES
        else:
            if len(frames) >= 50:
                num_processes = min(cpu_cores, len(frames), 32)
            elif len(frames) >= 20:
                num_processes = min(cpu_cores, len(frames), 16)
            else:
                num_processes = min(cpu_cores, len(frames), 8)
    
    print(f"[Parallel encoding] Starting parallel encoding of {len(frames)} frame images, using {num_processes} processes")
    
    if len(frames) <= 2:
        print(f"[Parallel encoding] Few images ({len(frames)} frames), using serial processing")
        encoded_frames = []
        for frame in frames:
            encoded = encode_image(frame)
            if encoded:
                encoded_frames.extend(encoded)
        return encoded_frames
    
    start_time = time.time()
    try:
        with mp.Pool(processes=num_processes) as pool:
            results = pool.map(encode_image_parallel, frames)
            
            encoded_frames = []
            for result in results:
                if result:
                    encoded_frames.extend(result)
            
            total_time = time.time() - start_time
            print(f"[Parallel encoding] Parallel encoding completed, total time: {total_time:.3f}s, encoded {len(encoded_frames)} images")
            
            return encoded_frames
            
    except Exception as e:
        print(f"[Parallel encoding] Parallel processing failed, falling back to serial processing: {e}")
        encoded_frames = []
        for frame in frames:
            encoded = encode_image(frame)
            if encoded:
                encoded_frames.extend(encoded)
        return encoded_frames


def encode_video(video, choose_fps=None):
    """编码视频文件"""
    def uniform_sample(l, n):
        gap = len(l) / n
        idxs = [int(i * gap + gap / 2) for i in range(n)]
        return [l[i] for i in idxs]

    if hasattr(video, 'path'):
        video_path = video.path
    elif hasattr(video, 'file') and hasattr(video.file, 'path'):
        video_path = video.file.path
    elif hasattr(video, 'name'):
        video_path = video.name
    else:
        video_path = getattr(video, 'url', getattr(video, 'orig_name', str(video)))
    
    vr = VideoReader(video_path, ctx=cpu(0))
    fps = vr.get_avg_fps()
    video_duration = len(vr) / fps

    frame_idx = [i for i in range(0, len(vr))]
    
    effective_fps = choose_fps if choose_fps else 1
    
    if video_duration < DOUBLE_FRAME_DURATION and effective_fps <= 5:
        effective_fps = effective_fps * 2
        packing_nums = 2
        choose_frames = round(min(effective_fps, round(fps)) * min(MAX_NUM_FRAMES, video_duration))
    elif effective_fps * int(video_duration) <= MAX_NUM_FRAMES:
        packing_nums = 1
        choose_frames = round(min(effective_fps, round(fps)) * min(MAX_NUM_FRAMES, video_duration))
    else:
        packing_size = math.ceil(video_duration * effective_fps / MAX_NUM_FRAMES)
        if packing_size <= MAX_NUM_PACKING:
            choose_frames = round(video_duration * effective_fps)
            packing_nums = packing_size
        else:
            choose_frames = round(MAX_NUM_FRAMES * MAX_NUM_PACKING)
            packing_nums = MAX_NUM_PACKING
    
    choose_idx = choose_frames
    
    frame_idx = np.array(uniform_sample(frame_idx, choose_idx))
    frames = vr.get_batch(frame_idx).asnumpy()

    frame_idx_ts = frame_idx / fps
    scale = np.arange(0, video_duration, TIME_SCALE)

    frame_ts_id = map_to_nearest_scale(frame_idx_ts, scale) / TIME_SCALE
    frame_ts_id = frame_ts_id.astype(np.int32)

    assert len(frames) == len(frame_ts_id)

    frames = [Image.fromarray(v.astype('uint8')).convert('RGB') for v in frames]
    frame_ts_id_group = group_array(frame_ts_id.tolist(), packing_nums)
    
    print(f"[Performance] Starting image encoding, total {len(frames)} frames")
    
    if ENABLE_PARALLEL_ENCODING:
        print(f"[Image encoding] Using multi-process parallel encoding, CPU cores: {mp.cpu_count()}")
        encoded_frames = encode_images_parallel(frames, PARALLEL_PROCESSES)
    else:
        print("[Warning] Parallel encoding disabled, using serial processing")
        encoded_frames = []
        for frame in frames:
            encoded = encode_image(frame)
            if encoded:
                encoded_frames.extend(encoded)
    
    return encoded_frames, frame_ts_id_group


# 响应处理函数
def parse_thinking_response(response_text):
    """解析包含<think>标签的响应文本，支持流式解析"""
    import re
    
    # 完整的thinking标签匹配
    complete_think_pattern = r'<think>(.*?)</think>'
    thinking_matches = re.findall(complete_think_pattern, response_text, re.DOTALL)
    
    if thinking_matches:
        # 有完整的thinking标签
        thinking_content = "\n\n".join(thinking_matches).strip()
        print("thinking_content---:", thinking_content)
        formal_answer = re.sub(complete_think_pattern, '', response_text, flags=re.DOTALL).strip()
        return thinking_content, formal_answer
    else:
        # 检查是否有未完成的thinking标签
        partial_think_match = re.search(r'<think>(.*?)$', response_text, re.DOTALL)
        if partial_think_match:
            # 有开始标签但没有结束标签，说明thinking内容正在输出中
            # 返回特殊标识，表示正在thinking过程中
            return "STREAMING", ""
        else:
            # 没有thinking标签，直接返回原文作为正式回答
            return "", response_text.strip()


def parse_thinking_response_for_final(response_text):
    """最终解析thinking响应，用于完成时的格式化"""
    import re
    
    # 首先尝试匹配完整的thinking标签
    think_pattern = r'<think>(.*?)</think>'
    thinking_matches = re.findall(think_pattern, response_text, re.DOTALL)
    
    if thinking_matches:
        thinking_content = "\n\n".join(thinking_matches).strip()
        formal_answer = re.sub(think_pattern, '', response_text, flags=re.DOTALL).strip()
        print(f"[parse_final] 找到完整thinking标签, thinking长度: {len(thinking_content)}, answer长度: {len(formal_answer)}")
    else:
        # 如果没有完整标签，检查是否有未闭合的<think>标签
        if '<think>' in response_text:
            think_start = response_text.find('<think>')
            if think_start != -1:
                # 提取thinking内容（从<think>之后到字符串结束）
                thinking_content = response_text[think_start + 7:].strip()  # 跳过<think>
                # formal_answer是<think>之前的内容
                formal_answer = response_text[:think_start].strip()
                
                # 如果formal_answer为空，说明整个响应都是thinking内容
                if not formal_answer:
                    formal_answer = ""  # 没有正式回答
                
                print(f"[parse_final] 找到未闭合thinking标签")
                print(f"[parse_final] thinking内容: '{thinking_content[:100]}...'")
                print(f"[parse_final] formal_answer: '{formal_answer[:100]}...'")
            else:
                thinking_content = ""
                formal_answer = response_text.strip()
                print(f"[parse_final] 无thinking标签, answer长度: {len(formal_answer)}")
        else:
            thinking_content = ""
            formal_answer = response_text.strip()
            print(f"[parse_final] 无thinking标签, answer长度: {len(formal_answer)}")
    
    return thinking_content, formal_answer


def normalize_text_for_html(text):
    """轻量级文本规范化"""
    import re
    
    if not text:
        return ""

    text = re.sub(r"[\u200B\u200C\u200D\uFEFF]", "", text)
    lines = [line.strip() for line in text.split("\n")]
    text = "\n".join(lines)
    text = text.strip()
    return text


def format_response_with_thinking(thinking_content, formal_answer):
    """格式化包含思考过程的响应"""
    print(f"[format_thinking] thinking_content长度: {len(thinking_content) if thinking_content else 0}")
    print(f"[format_thinking] formal_answer长度: {len(formal_answer) if formal_answer else 0}")
    print(f"[format_thinking] thinking_content前100字符: '{thinking_content[:100] if thinking_content else 'None'}...'")
    print(f"[format_thinking] formal_answer前100字符: '{formal_answer[:100] if formal_answer else 'None'}...'")
    
    # 检查内容是否为空
    if not thinking_content and not formal_answer:
        print("[format_thinking] 警告：thinking_content和formal_answer都为空！")
    elif not formal_answer:
        print("[format_thinking] 警告：formal_answer为空！")
    elif not thinking_content:
        print("[format_thinking] 注意：thinking_content为空，将使用简化格式")

    # 添加一个唯一的ID来强制前端重新渲染
    import uuid
    unique_id = uuid.uuid4().hex[:8]

    # 如果有thinking内容，显示完整的thinking格式
    if thinking_content and thinking_content.strip():
        formatted_response = f"""
<div class="response-container" id="response-{unique_id}">
<div class="thinking-section">
<div class="thinking-header">🤔 think</div>
<div class="thinking-content">{thinking_content}</div>
</div>
<div class="formal-section">
<div class="formal-header">💡 answer</div>
<div class="formal-content">{formal_answer if formal_answer else '(无正式回答)'}</div>
</div>
</div>
"""
    else:
        # 如果没有thinking内容，直接显示回答
        content_to_show = formal_answer if formal_answer and formal_answer.strip() else "(空回答)"
        formatted_response = f"""
<div class="response-container" id="response-{unique_id}">
<div class="formal-section">
<div class="formal-content">{content_to_show}</div>
</div>
</div>
"""

    return "\n" + formatted_response.strip() + "\n"


def check_mm_type(mm_file):
    """检查多媒体文件类型"""
    if hasattr(mm_file, 'path'):
        path = mm_file.path
    elif hasattr(mm_file, 'file') and hasattr(mm_file.file, 'path'):
        path = mm_file.file.path
    elif hasattr(mm_file, 'name'):
        path = mm_file.name
    else:
        path = getattr(mm_file, 'url', getattr(mm_file, 'orig_name', str(mm_file)))
    
    if is_image(path):
        return "image"
    if is_video(path):
        return "video"
    return None


def encode_mm_file(mm_file, choose_fps=None):
    """编码多媒体文件"""
    if check_mm_type(mm_file) == 'image':
        return encode_image(mm_file), None
    if check_mm_type(mm_file) == 'video':
        encoded_frames, frame_ts_id_group = encode_video(mm_file, choose_fps)
        return encoded_frames, frame_ts_id_group
    return None, None


def encode_message(_question, choose_fps=None):
    """编码消息"""
    import re
    
    files = _question.files if _question.files else []
    question = _question.text if _question.text else ""
    message = []
    temporal_ids = []
    
    # 检查是否使用旧的占位符格式
    pattern = r"\[mm_media\]\d+\[/mm_media\]"
    if re.search(pattern, question):
        # 旧格式：使用占位符
        matches = re.split(pattern, question)
        
        if len(matches) != len(files) + 1:
            gr.Warning("Number of Images not match the placeholder in text, please refresh the page to restart!")
            # 不使用 assert，而是处理不匹配的情况
            if len(matches) > len(files) + 1:
                matches = matches[:len(files) + 1]
            else:
                while len(matches) < len(files) + 1:
                    matches.append("")

        text = matches[0].strip()
        if text:
            message.append({"type": "text", "pairs": text})
        
        for i in range(len(files)):
            encoded_content, frame_ts_id_group = encode_mm_file(files[i], choose_fps)
            if encoded_content:
                message += encoded_content
            if frame_ts_id_group:
                temporal_ids.extend(frame_ts_id_group)
            
            if i + 1 < len(matches):
                text = matches[i + 1].strip()
                if text:
                    message.append({"type": "text", "pairs": text})
    else:
        # 新格式：简单的文本 + 文件列表
        if question.strip():
            message.append({"type": "text", "pairs": question.strip()})
        
        for file in files:
            encoded_content, frame_ts_id_group = encode_mm_file(file, choose_fps)
            if encoded_content:
                message += encoded_content
            if frame_ts_id_group:
                temporal_ids.extend(frame_ts_id_group)
    
    return message, temporal_ids if temporal_ids else None


def check_has_videos(_question):
    """检查是否包含视频"""
    images_cnt = 0
    videos_cnt = 0
    files = _question.files if _question.files else []
    for file in files:
        if check_mm_type(file) == "image":
            images_cnt += 1
        else:
            videos_cnt += 1
    return images_cnt, videos_cnt


def save_media_to_persistent_cache(_question, session_id):
    """将图片和视频保存到持久化缓存中，返回保存的路径信息"""
    import os
    import shutil
    import uuid
    from pathlib import Path
    
    files = _question.files if _question.files else []
    saved_media = []
    
    # 创建会话专用的媒体缓存目录
    cache_dir = Path("./media_cache") / session_id
    cache_dir.mkdir(parents=True, exist_ok=True)
    
    for file in files:
        file_type = check_mm_type(file)
        if file_type in ["image", "video"]:
            try:
                # 获取原始文件路径
                original_path = None
                if hasattr(file, 'name'):
                    original_path = file.name
                elif hasattr(file, 'path'):
                    original_path = file.path
                elif hasattr(file, 'file') and hasattr(file.file, 'path'):
                    original_path = file.file.path
                else:
                    continue
                
                if original_path and os.path.exists(original_path):
                    # 生成唯一的文件名
                    file_ext = os.path.splitext(original_path)[1]
                    prefix = "img" if file_type == "image" else "vid"
                    unique_filename = f"{prefix}_{uuid.uuid4().hex[:8]}{file_ext}"
                    cached_path = cache_dir / unique_filename
                    
                    # 复制文件到缓存目录
                    shutil.copy2(original_path, cached_path)
                    
                    saved_media.append({
                        'type': file_type,
                        'original_path': original_path,
                        'cached_path': str(cached_path),
                        'filename': unique_filename
                    })
                    print(f"[save_media_to_persistent_cache] {file_type}已保存: {cached_path}")
            except Exception as e:
                print(f"[save_media_to_persistent_cache] 保存{file_type}失败: {e}")
                continue
    
    return saved_media


def format_user_message_with_files(_question, session_id=None):
    """格式化包含文件的用户消息，支持图片和视频显示"""
    user_text = _question.text if _question.text else ""
    files = _question.files if _question.files else []
    
    if not files:
        return user_text, []
    
    # 保存媒体文件到持久化缓存
    saved_media = []
    if session_id:
        saved_media = save_media_to_persistent_cache(_question, session_id)
    
    if len(files) == 1:
        file = files[0]
        file_type = check_mm_type(file)
        
        # 如果是图片或视频且已保存到缓存
        if file_type in ["image", "video"] and saved_media:
            media_info = saved_media[0]
            if file_type == "image":
                if user_text:
                    return f"🖼️ {user_text}", saved_media
                else:
                    return "🖼️ 图片", saved_media
            elif file_type == "video":
                if user_text:
                    return f"🎬 {user_text}", saved_media
                else:
                    return "🎬 视频", saved_media
        else:
            # 其他文件类型，使用文本描述
            return f"[1 file uploaded] {user_text}", saved_media
    else:
        # 多个文件，统计不同类型
        image_count = len([m for m in saved_media if m['type'] == 'image'])
        video_count = len([m for m in saved_media if m['type'] == 'video'])
        other_count = len(files) - image_count - video_count
        
        # 构建描述文本
        parts = []
        if image_count > 0:
            parts.append(f"{image_count} image{'s' if image_count > 1 else ''}")
        if video_count > 0:
            parts.append(f"{video_count} video{'s' if video_count > 1 else ''}")
        if other_count > 0:
            parts.append(f"{other_count} other file{'s' if other_count > 1 else ''}")
        
        if parts:
            files_desc = ", ".join(parts)
            return f"[{files_desc} uploaded] {user_text}", saved_media
        else:
            return f"[{len(files)} files uploaded] {user_text}", saved_media


def update_media_gallery(app_session):
    """更新媒体画廊显示（图片和视频）"""
    import os
    media_cache = app_session.get('media_cache', [])
    
    if not media_cache:
        return gr.update(value=[], visible=False)
    
    # 获取所有缓存媒体文件的路径（图片和视频都支持）
    media_paths = [media_info['cached_path'] for media_info in media_cache if os.path.exists(media_info['cached_path'])]
    
    if media_paths:
        return gr.update(value=media_paths, visible=True)
    else:
        return gr.update(value=[], visible=False)


def format_fewshot_user_message(image_path, user_text):
    """格式化FewShot用户消息，支持图片显示"""
    if image_path and user_text:
        return (user_text, image_path)
    elif image_path:
        return ("", image_path) 
    else:
        return user_text


# 主要的聊天函数
def chat_direct(img_b64, msgs, ctx, params=None, vision_hidden_states=None, temporal_ids=None, session_id=None):
    """直接调用模型进行聊天（非流式）"""
    default_params = {"num_beams": 3, "repetition_penalty": 1.2, "max_new_tokens": 16284}
    if params is None:
        params = default_params

    use_streaming = params.get('stream', False)
    
    if use_streaming:
        return chat_stream_direct(img_b64, msgs, ctx, params, vision_hidden_states, temporal_ids, session_id)
    else:
        # 构建请求数据
        query = {
            "image": img_b64,
            "question": json.dumps(msgs, ensure_ascii=True),
            "params": json.dumps(params, ensure_ascii=True),
        }

        if temporal_ids:
            query["temporal_ids"] = json.dumps(temporal_ids, ensure_ascii=True)

        if session_id:
            query["session_id"] = session_id
        
        try:
            # 直接调用模型
            result = global_model.handler(query)
            raw_result = result['result']
            
            # 清理结果
            import re
            cleaned_result = re.sub(r'(<box>.*</box>)', '', raw_result)
            cleaned_result = cleaned_result.replace('<ref>', '')
            cleaned_result = cleaned_result.replace('</ref>', '')
            cleaned_result = cleaned_result.replace('<box>', '')
            cleaned_result = cleaned_result.replace('</box>', '')
            
            # 解析思考过程
            thinking_content_raw, formal_answer_raw = parse_thinking_response_for_final(cleaned_result)
            thinking_content_fmt = normalize_text_for_html(thinking_content_raw)
            formal_answer_fmt = normalize_text_for_html(formal_answer_raw)
            formatted_result = format_response_with_thinking(thinking_content_fmt, formal_answer_fmt)

            context_result = formal_answer_raw if formal_answer_raw else cleaned_result
            return 0, formatted_result, context_result, None
            
        except Exception as e:
            print(f"Chat error: {e}")
            import traceback
            traceback.print_exc()
            return -1, ERROR_MSG, None, None


def chat_stream_direct(img_b64, msgs, ctx, params=None, vision_hidden_states=None, temporal_ids=None, session_id=None):
    """直接调用模型进行流式聊天"""
    try:
        # 构建请求数据
        query = {
            "image": img_b64,
            "question": json.dumps(msgs, ensure_ascii=True),
            "params": json.dumps(params, ensure_ascii=True),
        }

        if temporal_ids:
            query["temporal_ids"] = json.dumps(temporal_ids, ensure_ascii=True)

        if session_id:
            query["session_id"] = session_id
        
        # 直接调用流式处理器
        generator = global_model.stream_handler(query)
        
        full_response = ""
        for chunk in generator:
            full_response += chunk
        
        if not full_response:
            return -1, ERROR_MSG, None, None
        
        # 清理结果
        import re
        cleaned_result = re.sub(r'(<box>.*</box>)', '', full_response)
        cleaned_result = cleaned_result.replace('<ref>', '')
        cleaned_result = cleaned_result.replace('</ref>', '')
        cleaned_result = cleaned_result.replace('<box>', '')
        cleaned_result = cleaned_result.replace('</box>', '')
        
        # 解析思考过程
        thinking_content_raw, formal_answer_raw = parse_thinking_response_for_final(cleaned_result)
        thinking_content_fmt = normalize_text_for_html(thinking_content_raw)
        formal_answer_fmt = normalize_text_for_html(formal_answer_raw)
        formatted_result = format_response_with_thinking(thinking_content_fmt, formal_answer_fmt)
        
        context_result = formal_answer_raw if formal_answer_raw else cleaned_result
        return 0, formatted_result, context_result, None
        
    except Exception as e:
        print(f"Stream chat error: {e}")
        import traceback
        traceback.print_exc()
        return -1, ERROR_MSG, None, None


def chat_stream_character_generator(img_b64, msgs, ctx, params=None, vision_hidden_states=None, temporal_ids=None, stop_control=None, session_id=None):
    """字符级流式生成器"""
    print(f"[chat_stream_character_generator] Starting character-level streaming")
    print(f"[chat_stream_character_generator] stop_control: {stop_control}")
    
    try:
        # 构建请求数据
        query = {
            "image": img_b64,
            "question": json.dumps(msgs, ensure_ascii=True),
            "params": json.dumps(params, ensure_ascii=True),
        }

        if temporal_ids:
            query["temporal_ids"] = json.dumps(temporal_ids, ensure_ascii=True)

        if session_id:
            query["session_id"] = session_id
        
        # 调用流式处理器 - 现在返回完整响应而不是生成器
        full_response = global_model.stream_handler(query)
        
        # 清理响应
        import re
        clean_response = re.sub(r'(<box>.*</box>)', '', full_response)
        clean_response = clean_response.replace('<ref>', '')
        clean_response = clean_response.replace('</ref>', '')
        clean_response = clean_response.replace('<box>', '')
        clean_response = clean_response.replace('</box>', '')
        
        # 逐字符yield以模拟流式输出
        char_count = 0
        for char in clean_response:
            # 检查停止标志
            if stop_control and stop_control.get('stop_streaming', False):
                print(f"[chat_stream_character_generator] *** 在第{char_count}个字符处收到停止信号 ***")
                break
                
            char_count += 1
            if char_count % 10 == 0:
                print(f"[chat_stream_character_generator] 已输出{char_count}个字符，stop_flag: {stop_control.get('stop_streaming', False) if stop_control else 'None'}")
            
            yield char
            
            # 添加小延迟以模拟流式效果
            import time
            time.sleep(0.01)
        
        print(f"[chat_stream_character_generator] 流式输出完成，总共输出{char_count}个字符")
        
    except Exception as e:
        print(f"[chat_stream_character_generator] 异常: {e}")
        error_msg = f"Stream error: {str(e)}"
        for char in error_msg:
            yield char


# UI组件创建函数
def create_component(params, comp='Slider'):
    if comp == 'Slider':
        return gr.Slider(
            minimum=params['minimum'],
            maximum=params['maximum'],
            value=params['value'],
            step=params['step'],
            interactive=params['interactive'],
            label=params['label']
        )
    elif comp == 'Radio':
        return gr.Radio(
            choices=params['choices'],
            value=params['value'],
            interactive=params['interactive'],
            label=params['label']
        )
    elif comp == 'Button':
        return gr.Button(
            value=params['value'],
            interactive=True
        )
    elif comp == 'Checkbox':
        return gr.Checkbox(
            value=params['value'],
            interactive=params['interactive'],
            label=params['label'],
            info=params.get('info', None)
        )


def create_multimodal_input(upload_image_disabled=False, upload_video_disabled=False):
    # 使用标准的 Gradio 组件替代 MultimodalInput，添加预览功能
    return gr.File(
        file_count="multiple",
        file_types=["image", "video"],
        label="Upload Images/Videos",
        interactive=not (upload_image_disabled and upload_video_disabled),
        show_label=True,
        height=200  # 设置高度以显示预览
    )


# UI控制函数
def update_streaming_mode_state(params_form):
    """根据解码类型更新流式模式状态"""
    if params_form == 'Beam Search':
        return gr.update(value=False, interactive=False, info="Beam Search mode does not support streaming output")
    else:
        return gr.update(value=True, interactive=True, info="Enable real-time streaming response")


def stop_streaming(_app_cfg):
    """停止流式输出"""
    _app_cfg['stop_streaming'] = True
    print(f"[stop_streaming] Set stop flag to True")
    return _app_cfg


def reset_stop_flag(_app_cfg):
    """重置停止标志"""
    _app_cfg['stop_streaming'] = False
    print(f"[reset_stop_flag] Reset stop flag to False")
    return _app_cfg


def check_and_handle_stop(_app_cfg, context="unknown"):
    """检查停止标志"""
    should_stop = _app_cfg.get('stop_streaming', False)
    is_streaming = _app_cfg.get('is_streaming', False)
    
    if should_stop:
        print(f"[check_and_handle_stop] *** Stop signal detected at {context} ***")
        print(f"[check_and_handle_stop] stop_streaming: {should_stop}, is_streaming: {is_streaming}")
        return True
    return False


def stop_button_clicked(_app_cfg):
    """处理停止按钮点击"""
    print("[stop_button_clicked] *** Stop button clicked ***")
    print(f"[stop_button_clicked] Current state - is_streaming: {_app_cfg.get('is_streaming', False)}")
    print(f"[stop_button_clicked] Current state - stop_streaming: {_app_cfg.get('stop_streaming', False)}")
    
    _app_cfg['stop_streaming'] = True
    _app_cfg['is_streaming'] = False
    print(f"[stop_button_clicked] Set stop_streaming = True, is_streaming = False")
    
    return _app_cfg, gr.update(visible=False)


# 主要的响应函数
def respond_stream(_question, _chat_bot, _app_cfg, params_form, thinking_mode, streaming_mode, fps_setting):
    """流式响应生成器"""
    print(f"[respond_stream] Called with streaming_mode: {streaming_mode}, fps_setting: {fps_setting}")
    
    _app_cfg['is_streaming'] = True
    _app_cfg['stop_streaming'] = False
    
    if params_form == 'Beam Search':
        streaming_mode = False
        print(f"[respond_stream] Beam Search模式，强制禁用流式模式")
        _app_cfg['is_streaming'] = False
    
    _context = _app_cfg['ctx'].copy()
    encoded_message, temporal_ids = encode_message(_question, fps_setting)
    _context.append({'role': 'user', 'contents': encoded_message})

    images_cnt = _app_cfg['images_cnt']
    videos_cnt = _app_cfg['videos_cnt']
    files_cnts = check_has_videos(_question)
    
    if files_cnts[1] + videos_cnt > 1 or (files_cnts[1] + videos_cnt == 1 and files_cnts[0] + images_cnt > 0):
        gr.Warning("Only supports single video file input right now!")
        yield create_multimodal_input(True, True), _chat_bot, _app_cfg, gr.update(visible=False)
        return
        
    if disable_text_only and files_cnts[1] + videos_cnt + files_cnts[0] + images_cnt <= 0:
        gr.Warning("Please chat with at least one image or video.")
        yield create_multimodal_input(False, False), _chat_bot, _app_cfg, gr.update(visible=False)
        return

    if params_form == 'Beam Search':
        params = {
            'sampling': False,
            'num_beams': 3,
            'repetition_penalty': 1.2,
            "max_new_tokens": 16284,
            "enable_thinking": thinking_mode,
            "stream": False
        }
    else:
        params = {
            'sampling': True,
            'top_p': 0.8,
            'top_k': 100,
            'temperature': 0.7,
            'repetition_penalty': 1.03,
            "max_new_tokens": 16284,
            "enable_thinking": thinking_mode,
            "stream": streaming_mode
        }

    if files_cnts[1] + videos_cnt > 0:
        params["max_inp_length"] = 2048 * 10
        params["use_image_id"] = False
        params["max_slice_nums"] = 1

    images_cnt += files_cnts[0]
    videos_cnt += files_cnts[1]

    # 构建用户消息显示（流式模式）
    user_message, saved_images = format_user_message_with_files(_question, _app_cfg.get('session_id'))
    
    # 将媒体信息保存到会话状态中
    if saved_images:
        if 'media_cache' not in _app_cfg:
            _app_cfg['media_cache'] = []
        _app_cfg['media_cache'].extend(saved_images)
    
    _chat_bot.append((user_message, ""))
    _context.append({"role": "assistant", "contents": [{"type": "text", "pairs": ""}]}) 

    gen = chat_stream_character_generator("", _context[:-1], None, params, None, temporal_ids, _app_cfg, _app_cfg['session_id'])
    
    upload_image_disabled = videos_cnt > 0
    upload_video_disabled = videos_cnt > 0 or images_cnt > 0
    
    yield create_multimodal_input(upload_image_disabled, upload_video_disabled), _chat_bot, _app_cfg, gr.update(visible=True)
    
    print(f"[respond_stream] 开始字符级流式输出循环")
    char_count = 0
    accumulated_content = ""
    
    for _char in gen:
        char_count += 1
        
        if check_and_handle_stop(_app_cfg, f"字符{char_count}"):
            break
            
        accumulated_content += _char
        _context[-1]["contents"][0]["pairs"] += _char
        
        # 实时显示内容（thinking模式也实时显示）
        if thinking_mode:
            # 尝试解析当前累积的内容
            thinking_content_raw, formal_answer_raw = parse_thinking_response(accumulated_content)
            
            # 如果解析出了完整的thinking内容，使用格式化显示
            if thinking_content_raw and thinking_content_raw != "STREAMING" and formal_answer_raw:
                thinking_content_fmt = normalize_text_for_html(thinking_content_raw)
                formal_answer_fmt = normalize_text_for_html(formal_answer_raw)
                formatted_display = format_response_with_thinking(thinking_content_fmt, formal_answer_fmt)
                _chat_bot[-1] = (user_message, formatted_display)
            else:
                # 正在thinking过程中或者还没有完整标签，直接显示原始内容（实时流式）
                _chat_bot[-1] = (user_message, accumulated_content)
        else:
            # 非thinking模式，直接显示累积内容
            _chat_bot[-1] = (user_message, accumulated_content)
        
        if char_count % 5 == 0:  # 更频繁的更新以提供更好的流式体验
            print(f"[respond_stream] 已处理{char_count}个字符，stop_flag: {_app_cfg.get('stop_streaming', False)}")
            yield create_multimodal_input(upload_image_disabled, upload_video_disabled), _chat_bot, _app_cfg, gr.update(visible=True)
            time.sleep(0.02)  # 稍微增加延迟以避免过于频繁的更新
        else:
            yield create_multimodal_input(upload_image_disabled, upload_video_disabled), _chat_bot, _app_cfg, gr.update(visible=True)
    
    if _app_cfg.get('stop_streaming', False):
        print("[respond_stream] 流式输出已停止")
    
    # 最终处理thinking格式化
    final_content = accumulated_content
    if thinking_mode:
        thinking_content_raw, formal_answer_raw = parse_thinking_response_for_final(final_content)
        thinking_content_fmt = normalize_text_for_html(thinking_content_raw)
        formal_answer_fmt = normalize_text_for_html(formal_answer_raw)
        formatted_result = format_response_with_thinking(thinking_content_fmt, formal_answer_fmt)
        
        _chat_bot[-1] = (user_message, formatted_result)
        _context[-1]["contents"][0]["pairs"] = formal_answer_raw if formal_answer_raw else final_content
    else:
        _chat_bot[-1] = (user_message, final_content)
        _context[-1]["contents"][0]["pairs"] = final_content
    
    _app_cfg['ctx'] = _context
    _app_cfg['images_cnt'] = images_cnt
    _app_cfg['videos_cnt'] = videos_cnt
    _app_cfg['is_streaming'] = False
    
    upload_image_disabled = videos_cnt > 0
    upload_video_disabled = videos_cnt > 0 or images_cnt > 0
    yield create_multimodal_input(upload_image_disabled, upload_video_disabled), _chat_bot, _app_cfg, gr.update(visible=False)


def respond(_question, _chat_bot, _app_cfg, params_form, thinking_mode, streaming_mode, fps_setting):
    """主响应函数"""
    if 'session_id' not in _app_cfg:
        _app_cfg['session_id'] = uuid.uuid4().hex[:16]
        print(f"[会话] 为现有会话生成session_id: {_app_cfg['session_id']}")
    
    # 记录thinking模式状态变化
    prev_thinking_mode = _app_cfg.get('last_thinking_mode', False)
    _app_cfg['last_thinking_mode'] = thinking_mode
    
    if prev_thinking_mode != thinking_mode:
        print(f"[respond] Thinking模式切换: {prev_thinking_mode} -> {thinking_mode}")
        # 强制清理可能的缓存状态
        if hasattr(_app_cfg, 'thinking_cache'):
            del _app_cfg['thinking_cache']
        # 添加额外的状态重置
        if thinking_mode and not prev_thinking_mode:
            print("[respond] 启用thinking模式，重置相关状态")
            _app_cfg['thinking_enabled'] = True
        elif not thinking_mode and prev_thinking_mode:
            print("[respond] 禁用thinking模式")
            _app_cfg['thinking_enabled'] = False
    
    if params_form == 'Beam Search':
        streaming_mode = False
        print(f"[respond] Beam Search模式，强制禁用流式模式")
    
    if streaming_mode:
        print("[respond] 选择流式模式")
        yield from respond_stream(_question, _chat_bot, _app_cfg, params_form, thinking_mode, streaming_mode, fps_setting)
        return
    
    # 非流式模式
    _context = _app_cfg['ctx'].copy()
    encoded_message, temporal_ids = encode_message(_question, fps_setting)
    _context.append({'role': 'user', 'contents': encoded_message})

    images_cnt = _app_cfg['images_cnt']
    videos_cnt = _app_cfg['videos_cnt']
    files_cnts = check_has_videos(_question)
    if files_cnts[1] + videos_cnt > 1 or (files_cnts[1] + videos_cnt == 1 and files_cnts[0] + images_cnt > 0):
        gr.Warning("Only supports single video file input right now!")
        upload_image_disabled = videos_cnt > 0
        upload_video_disabled = videos_cnt > 0 or images_cnt > 0
        yield create_multimodal_input(upload_image_disabled, upload_video_disabled), _chat_bot, _app_cfg, gr.update(visible=False)
        return
    if disable_text_only and files_cnts[1] + videos_cnt + files_cnts[0] + images_cnt <= 0:
        gr.Warning("Please chat with at least one image or video.")
        upload_image_disabled = videos_cnt > 0
        upload_video_disabled = videos_cnt > 0 or images_cnt > 0
        yield create_multimodal_input(upload_image_disabled, upload_video_disabled), _chat_bot, _app_cfg, gr.update(visible=False)
        return
        
    if params_form == 'Beam Search':
        params = {
            'sampling': False,
            'num_beams': 3,
            'repetition_penalty': 1.2,
            "max_new_tokens": 16284,
            "enable_thinking": thinking_mode,
            "stream": False
        }
    else:
        params = {
            'sampling': True,
            'top_p': 0.8,
            'top_k': 100,
            'temperature': 0.7,
            'repetition_penalty': 1.03,
            "max_new_tokens": 16284,
            "enable_thinking": thinking_mode,
            "stream": False
        }

    if files_cnts[1] + videos_cnt > 0:
        params["max_inp_length"] = 2048 * 10
        params["use_image_id"] = False
        params["max_slice_nums"] = 1

    # 调用聊天函数
    code, _answer, _context_answer, sts = chat_direct("", _context, None, params, None, temporal_ids, _app_cfg['session_id'])

    images_cnt += files_cnts[0]
    videos_cnt += files_cnts[1]
    
    if code == 0:
        context_content = _context_answer if _context_answer else _answer
        _context.append({"role": "assistant", "contents": [{"type": "text", "pairs": context_content}]})
        
        # 根据thinking_mode决定是否应用thinking格式化
        if thinking_mode:
            thinking_content_raw, formal_answer_raw = parse_thinking_response_for_final(_answer)
            thinking_content_fmt = normalize_text_for_html(thinking_content_raw)
            formal_answer_fmt = normalize_text_for_html(formal_answer_raw)
            print(f"[respond] 非流式模式 - thinking_mode: {thinking_mode}, thinking_content: '{thinking_content_raw[:50]}...'")
            formatted_result = format_response_with_thinking(thinking_content_fmt, formal_answer_fmt)
        else:
            print(f"[respond] 非流式模式 - thinking_mode: {thinking_mode}, 使用原始回答")
            formatted_result = _answer
        
        # 构建用户消息显示
        user_message, saved_images = format_user_message_with_files(_question, _app_cfg.get('session_id'))
        
        # 将媒体信息保存到会话状态中
        if saved_images:
            if 'media_cache' not in _app_cfg:
                _app_cfg['media_cache'] = []
            _app_cfg['media_cache'].extend(saved_images)
        
        _chat_bot.append((user_message, formatted_result))
        
        _app_cfg['ctx'] = _context
        _app_cfg['sts'] = sts
    else:
        _context.append({"role": "assistant", "contents": [{"type": "text", "pairs": "Error occurred during processing"}]})
        # 构建用户消息显示（错误情况）
        user_message, saved_images = format_user_message_with_files(_question, _app_cfg.get('session_id'))
        
        # 将媒体信息保存到会话状态中
        if saved_images:
            if 'media_cache' not in _app_cfg:
                _app_cfg['media_cache'] = []
            _app_cfg['media_cache'].extend(saved_images)
        
        _chat_bot.append((user_message, "Error occurred during processing"))
    
    _app_cfg['images_cnt'] = images_cnt
    _app_cfg['videos_cnt'] = videos_cnt
    _app_cfg['is_streaming'] = False

    upload_image_disabled = videos_cnt > 0
    upload_video_disabled = videos_cnt > 0 or images_cnt > 0
    
    # 统一使用yield返回结果，确保与流式模式兼容
    yield create_multimodal_input(upload_image_disabled, upload_video_disabled), _chat_bot, _app_cfg, gr.update(visible=False)


# FewShot相关函数
def fewshot_add_demonstration(_image, _user_message, _assistant_message, _chat_bot, _app_cfg):
    if 'session_id' not in _app_cfg:
        _app_cfg['session_id'] = uuid.uuid4().hex[:16]
        print(f"[会话] 为FewShot示例生成session_id: {_app_cfg['session_id']}")
    
    ctx = _app_cfg["ctx"]
    
    # 构建用户消息
    user_msg = ""
    if _image is not None:
        image = Image.open(_image).convert("RGB")
        ctx.append({"role": "user", "contents": [
            *encode_image(image),
            {"type": "text", "pairs": _user_message}
        ]})
        user_msg = f"[Image uploaded] {_user_message}"
    else:
        if _user_message:
            ctx.append({"role": "user", "contents": [{"type": "text", "pairs": _user_message}]})
            user_msg = _user_message
    
    # 构建助手消息
    if _assistant_message:
        ctx.append({"role": "assistant", "contents": [{"type": "text", "pairs": _assistant_message}]})
    
    # 只有当用户消息和助手消息都存在时才添加到聊天记录
    if user_msg and _assistant_message:
        formatted_user_msg = format_fewshot_user_message(_image, _user_message) if _image else user_msg
        _chat_bot.append([formatted_user_msg, _assistant_message])
    
    return None, "", "", _chat_bot, _app_cfg


def fewshot_respond(_image, _user_message, _chat_bot, _app_cfg, params_form, thinking_mode, streaming_mode, fps_setting):
    """FewShot响应函数"""
    print(f"[fewshot_respond] Called with streaming_mode: {streaming_mode}")
    
    if 'session_id' not in _app_cfg:
        _app_cfg['session_id'] = uuid.uuid4().hex[:16]
        print(f"[会话] 为FewShot会话生成session_id: {_app_cfg['session_id']}")
    
    if params_form == 'Beam Search':
        streaming_mode = False
        print(f"[fewshot_respond] Beam Search模式，强制禁用流式模式")
    
    user_message_contents = []
    _context = _app_cfg["ctx"].copy()
    images_cnt = _app_cfg["images_cnt"]
    temporal_ids = None
    
    if _image:
        image = Image.open(_image).convert("RGB")
        user_message_contents += encode_image(image)
        images_cnt += 1
    if _user_message:
        user_message_contents += [{"type": "text", "pairs": _user_message}]
    if user_message_contents:
        _context.append({"role": "user", "contents": user_message_contents})

    if params_form == 'Beam Search':
        params = {
            'sampling': False,
            'num_beams': 3,
            'repetition_penalty': 1.2,
            "max_new_tokens": 16284,
            "enable_thinking": thinking_mode,
            "stream": False
        }
    else:
        params = {
            'sampling': True,
            'top_p': 0.8,
            'top_k': 100,
            'temperature': 0.7,
            'repetition_penalty': 1.03,
            "max_new_tokens": 16284,
            "enable_thinking": thinking_mode,
            "stream": streaming_mode
        }

    if disable_text_only and images_cnt == 0:
        gr.Warning("Please chat with at least one image or video.")
        yield _image, _user_message, '', _chat_bot, _app_cfg
        return

    if streaming_mode:
        print(f"[fewshot_respond] Using streaming mode")
        _app_cfg['is_streaming'] = True
        _app_cfg['stop_streaming'] = False
        
        if _image:
            user_msg = format_fewshot_user_message(_image, _user_message)
            _chat_bot.append([user_msg, ""])
        else:
            _chat_bot.append([_user_message, ""])
        
        _context.append({"role": "assistant", "contents": [{"type": "text", "pairs": ""}]})
        
        _app_cfg['stop_streaming'] = False
        
        gen = chat_stream_character_generator("", _context[:-1], None, params, None, temporal_ids, _app_cfg, _app_cfg['session_id'])
        
        yield _image, _user_message, '', _chat_bot, _app_cfg
        
        accumulated_content = ""
        for _char in gen:
            if _app_cfg.get('stop_streaming', False):
                print("[fewshot_respond] 收到停止信号，中断流式响应")
                break
                
            accumulated_content += _char
            _context[-1]["contents"][0]["pairs"] += _char
            
            # 实时解析和格式化thinking内容
            if thinking_mode:
                # 尝试解析当前累积的内容
                thinking_content_raw, formal_answer_raw = parse_thinking_response(accumulated_content)
                
                # 如果解析出了完整的thinking内容，使用格式化显示
                if thinking_content_raw and thinking_content_raw != "STREAMING" and formal_answer_raw:
                    thinking_content_fmt = normalize_text_for_html(thinking_content_raw)
                    formal_answer_fmt = normalize_text_for_html(formal_answer_raw)
                    formatted_display = format_response_with_thinking(thinking_content_fmt, formal_answer_fmt)
                    _chat_bot[-1] = (_chat_bot[-1][0], formatted_display)
                else:
                    # 正在thinking过程中或者还没有完整标签，直接显示原始内容（实时流式）
                    _chat_bot[-1] = (_chat_bot[-1][0], accumulated_content)
            else:
                # 非thinking模式，直接显示累积内容
                _chat_bot[-1] = (_chat_bot[-1][0], accumulated_content)
            
            yield _image, _user_message, '', _chat_bot, _app_cfg
        
        final_content = _context[-1]["contents"][0]["pairs"]
        
        _app_cfg['ctx'] = _context
        _app_cfg['images_cnt'] = images_cnt
        _app_cfg['is_streaming'] = False
        
        yield _image, '', '', _chat_bot, _app_cfg
        
    else:
        # 非流式模式
        code, _answer, _context_answer, sts = chat_direct("", _context, None, params, None, temporal_ids, _app_cfg['session_id'])

        context_content = _context_answer if _context_answer else _answer
        _context.append({"role": "assistant", "contents": [{"type": "text", "pairs": context_content}]})

        if _image:
            user_msg = format_fewshot_user_message(_image, _user_message)
            _chat_bot.append([user_msg, _answer])
        else:
            _chat_bot.append([_user_message, _answer])
        
        if code == 0:
            _app_cfg['ctx'] = _context
            _app_cfg['sts'] = sts
            _app_cfg['images_cnt'] = images_cnt
        
        _app_cfg['is_streaming'] = False
        yield None, '', '', _chat_bot, _app_cfg


# 其他UI函数
def regenerate_button_clicked(_question, _image, _user_message, _assistant_message, _chat_bot, _app_cfg, params_form, thinking_mode, streaming_mode, fps_setting):
    print(f"[regenerate] streaming_mode: {streaming_mode}")
    print(f"[regenerate] thinking_mode: {thinking_mode}")
    print(f"[regenerate] chat_type: {_app_cfg.get('chat_type', 'unknown')}")
    
    if params_form == 'Beam Search':
        streaming_mode = False
        print(f"[regenerate] Beam Search模式，强制禁用流式模式")
    
    if len(_chat_bot) <= 1 or not _chat_bot[-1][1]:
        gr.Warning('No question for regeneration.')
        yield _question, _image, _user_message, _assistant_message, _chat_bot, _app_cfg
        return
        
    if _app_cfg["chat_type"] == "Chat":
        images_cnt = _app_cfg['images_cnt']
        videos_cnt = _app_cfg['videos_cnt']
        _question = _chat_bot[-1][0]
        _chat_bot = _chat_bot[:-1]
        _app_cfg['ctx'] = _app_cfg['ctx'][:-2]
        files_cnts = check_has_videos(_question)
        images_cnt -= files_cnts[0]
        videos_cnt -= files_cnts[1]
        _app_cfg['images_cnt'] = images_cnt
        _app_cfg['videos_cnt'] = videos_cnt
        
        print(f"[regenerate] About to call respond with streaming_mode: {streaming_mode}")
        for result in respond(_question, _chat_bot, _app_cfg, params_form, thinking_mode, streaming_mode, fps_setting):
            new_input, _chat_bot, _app_cfg, _stop_button = result
            _question = new_input
            yield _question, _image, _user_message, _assistant_message, _chat_bot, _app_cfg
    else:
        # 在 tuples 格式下，_chat_bot[-1][0] 是字符串
        last_user_message = _chat_bot[-1][0]
        last_image = None
        
        # 检查消息是否包含图片标识
        if "[Image uploaded]" in last_user_message:
            # 从消息中提取实际的用户消息
            last_user_message = last_user_message.replace("[Image uploaded] ", "")
            # 注意：在简化的 tuples 格式下，我们无法直接获取图片文件
            # 这里需要根据实际需要进行处理
        _chat_bot = _chat_bot[:-1]
        _app_cfg['ctx'] = _app_cfg['ctx'][:-2]
        
        print(f"[regenerate] About to call fewshot_respond with streaming_mode: {streaming_mode}")
        for result in fewshot_respond(last_image, last_user_message, _chat_bot, _app_cfg, params_form, thinking_mode, streaming_mode, fps_setting):
            _image, _user_message, _assistant_message, _chat_bot, _app_cfg = result
            yield _question, _image, _user_message, _assistant_message, _chat_bot, _app_cfg


def flushed():
    return gr.update(interactive=True)


def clear_media_cache(session_id):
    """清理指定会话的媒体缓存"""
    import shutil
    from pathlib import Path
    
    try:
        cache_dir = Path("./media_cache") / session_id
        if cache_dir.exists():
            shutil.rmtree(cache_dir)
            print(f"[clear_media_cache] 已清理会话 {session_id} 的媒体缓存")
    except Exception as e:
        print(f"[clear_media_cache] 清理缓存失败: {e}")


def clear(txt_input, file_upload, chat_bot, app_session):
    # 清理旧会话的媒体缓存
    if 'session_id' in app_session:
        clear_media_cache(app_session['session_id'])
    
    chat_bot = copy.deepcopy(init_conversation)
    app_session['sts'] = None
    app_session['ctx'] = []
    app_session['images_cnt'] = 0
    app_session['videos_cnt'] = 0
    app_session['stop_streaming'] = False
    app_session['is_streaming'] = False
    app_session['media_cache'] = []  # 清空媒体缓存信息
    app_session['last_thinking_mode'] = False  # 重置thinking模式状态
    app_session['session_id'] = uuid.uuid4().hex[:16]
    print(f"[会话] 生成新会话ID: {app_session['session_id']}")
    return "", None, gr.update(value=[], visible=False), gr.update(value=[], visible=False), chat_bot, app_session, None, '', ''


def select_chat_type(_tab, _app_cfg):
    _app_cfg["chat_type"] = _tab
    return _app_cfg


# UI配置
form_radio = {
    'choices': ['Beam Search', 'Sampling'],
    'value': 'Sampling',
    'interactive': True,
    'label': 'Decode Type'
}

thinking_checkbox = {
    'value': False,
    'interactive': True,
    'label': 'Enable Thinking Mode',
}

streaming_checkbox = {
    'value': True,
    'interactive': True,
    'label': 'Enable Streaming Mode',
}

fps_slider = {
    'minimum': 1,
    'maximum': 20,
    'value': 3,
    'step': 1,
    'interactive': True,
    'label': 'Custom FPS for Video Processing'
}

init_conversation = [
    ["", "You can talk to me now"]
]

css = """
video { height: auto !important; }
.example label { font-size: 16px;}

/* Current Media Gallery 滚动条样式 - 使用class选择器更安全 */
.current-media-gallery {
    overflow-y: auto !important;
    max-height: 600px !important;
    position: relative !important;
}

/* 确保只影响特定的Gallery容器内部 */
.current-media-gallery > div,
.current-media-gallery .gallery-container {
    overflow-y: auto !important;
    max-height: 580px !important;
}

.current-media-gallery .gallery-item {
    margin-bottom: 10px !important;
}

/* 只为Current Media Gallery自定义滚动条样式 */
.current-media-gallery::-webkit-scrollbar,
.current-media-gallery > div::-webkit-scrollbar,
.current-media-gallery .gallery-container::-webkit-scrollbar {
    width: 8px !important;
}

.current-media-gallery::-webkit-scrollbar-track,
.current-media-gallery > div::-webkit-scrollbar-track,
.current-media-gallery .gallery-container::-webkit-scrollbar-track {
    background: #f1f1f1 !important;
    border-radius: 4px !important;
}

.current-media-gallery::-webkit-scrollbar-thumb,
.current-media-gallery > div::-webkit-scrollbar-thumb,
.current-media-gallery .gallery-container::-webkit-scrollbar-thumb {
    background: #c1c1c1 !important;
    border-radius: 4px !important;
}

.current-media-gallery::-webkit-scrollbar-thumb:hover,
.current-media-gallery > div::-webkit-scrollbar-thumb:hover,
.current-media-gallery .gallery-container::-webkit-scrollbar-thumb:hover {
    background: #a8a8a8 !important;
}

/* 隐藏Current Media的不必要元素 */
.current-media-gallery .upload-container,
.current-media-gallery .drop-zone,
.current-media-gallery .file-upload,
.current-media-gallery .upload-text,
.current-media-gallery .drop-text {
    display: none !important;
}

.current-media-gallery .clear-button,
.current-media-gallery .delete-button,
.current-media-gallery .remove-button {
    display: none !important;
}

/* 当Gallery为空时隐藏标签和占位文本 */
.current-media-gallery:not([style*="display: none"]) .gallery-container:empty::after {
    content: "";
    display: none;
}

.current-media-gallery .empty-gallery-text,
.current-media-gallery .placeholder-text {
    display: none !important;
}

/* 确保滚动条不会影响到其他组件 */
.current-media-gallery {
    isolation: isolate !important;
}

/* 重置其他Gallery组件的滚动条样式，防止被污染 */
.gradio-gallery:not(.current-media-gallery)::-webkit-scrollbar {
    width: initial !important;
}

.gradio-gallery:not(.current-media-gallery)::-webkit-scrollbar-track {
    background: initial !important;
    border-radius: initial !important;
}

.gradio-gallery:not(.current-media-gallery)::-webkit-scrollbar-thumb {
    background: initial !important;
    border-radius: initial !important;
}

/* 确保chatbot不受影响 */
.thinking-chatbot::-webkit-scrollbar {
    width: initial !important;
}

.thinking-chatbot::-webkit-scrollbar-track {
    background: initial !important;
}

.thinking-chatbot::-webkit-scrollbar-thumb {
    background: initial !important;
}

/* 思考过程和正式回答的样式 */
.response-container {
    margin: 10px 0;
}

.thinking-section {
    background: linear-gradient(135deg, #f8f9ff 0%, #f0f4ff 100%);
    border: 1px solid #d1d9ff;
    border-radius: 12px;
    padding: 16px;
    margin-bottom: 0px;
    box-shadow: 0 2px 8px rgba(67, 90, 235, 0.1);
}

.thinking-header {
    font-weight: 600;
    color: #4c5aa3;
    font-size: 14px;
    margin-bottom: 12px;
    display: flex;
    align-items: center;
    gap: 8px;
}

.thinking-content {
    color: #5a6ba8;
    font-size: 13px;
    line-height: 1;
    font-style: italic;
    background: rgba(255, 255, 255, 0.6);
    padding: 12px;
    border-radius: 8px;
    border-left: 3px solid #4c5aa3;
    white-space: pre-wrap;
}

.formal-section {
    background: linear-gradient(135deg, #ffffff 0%, #f8f9fa 100%);
    border: 1px solid #e9ecef;
    border-radius: 12px;
    padding: 16px;
    box-shadow: 0 2px 8px rgba(0, 0, 0, 0.05);
}

.formal-header {
    font-weight: 600;
    color: #28a745;
    font-size: 14px;
    margin-bottom: 12px;
    display: flex;
    align-items: center;
    gap: 8px;
}

.formal-content {
    color: #333;
    font-size: 14px;
    line-height: 1;
    white-space: pre-wrap;
}

/* 聊天机器人容器样式 */
.thinking-chatbot .message {
    border-radius: 12px;
    overflow: visible;
    margin-top: 0 !important;
    margin-bottom: 0 !important;
}

.thinking-chatbot .message-wrap {
    margin-top: 0 !important;
    margin-bottom: 0 !important;
}

.thinking-chatbot .message.bot {
    background: transparent !important;
    border: none !important;
    padding: 8px !important;
}

.thinking-chatbot .message.bot .content {
    background: transparent !important;
}
"""

introduction = """
## Features:
1. Chat with single image
2. Chat with multiple images  
3. Chat with video
4. Streaming Mode: Real-time response streaming
5. Thinking Mode: Show model reasoning process

Click `How to use` tab to see examples.
"""


# 主应用
def create_app():
    with gr.Blocks(css=css) as demo:
        with gr.Tab(model_name):
            with gr.Row():
                with gr.Column(scale=1, min_width=300):
                    gr.Markdown(value=introduction)
                    params_form = create_component(form_radio, comp='Radio')
                    thinking_mode = create_component(thinking_checkbox, comp='Checkbox')
                    streaming_mode = create_component(streaming_checkbox, comp='Checkbox')

                    fps_setting = create_component(fps_slider, comp='Slider')
                    regenerate = create_component({'value': 'Regenerate'}, comp='Button')
                    clear_button = create_component({'value': 'Clear History'}, comp='Button')
                    
                    stop_button = gr.Button("Stop", visible=False)

                with gr.Column(scale=3, min_width=500):
                    initial_session_id = uuid.uuid4().hex[:16]
                    print(f"[会话] 初始化会话，生成session_id: {initial_session_id}")
                    app_session = gr.State({
                        'sts': None, 'ctx': [], 'images_cnt': 0, 'videos_cnt': 0, 
                        'chat_type': 'Chat', 'stop_streaming': False, 'is_streaming': False, 
                        'session_id': initial_session_id, 'media_cache': [], 'last_thinking_mode': False
                    })
                    with gr.Row():
                        with gr.Column(scale=4):
                            chat_bot = gr.Chatbot(
                                label=f"Chat with {model_name}", 
                                value=copy.deepcopy(init_conversation), 
                                height=600,
                                elem_classes="thinking-chatbot"
                            )
                        with gr.Column(scale=1, min_width=200):
                            current_images = gr.Gallery(
                                label="Current Media",
                                show_label=True,
                                elem_id="current_media",
                                elem_classes="current-media-gallery",
                                columns=1,
                                rows=1,  # 设为1行，让内容可以垂直滚动
                                height=600,
                                visible=False,
                                container=True,  # 启用容器模式
                                allow_preview=True,  # 允许预览
                                show_download_button=False,  # 隐藏下载按钮
                                interactive=False,  # 禁用交互，防止用户上传/删除
                                show_share_button=False  # 隐藏分享按钮
                            )

                    with gr.Tab("Chat") as chat_tab:
                        chat_tab_label = gr.Textbox(value="Chat", interactive=False, visible=False)
                        
                        with gr.Row():
                            with gr.Column(scale=4):
                                txt_input = gr.Textbox(
                                    placeholder="Type your message here...",
                                    label="Message",
                                    lines=2
                                )
                            with gr.Column(scale=1):
                                submit_btn = gr.Button("Submit", variant="primary")
                        
                        with gr.Row():
                            with gr.Column():
                                file_upload = create_multimodal_input()
                                # 添加图片预览组件
                                file_preview = gr.Gallery(
                                    label="Uploaded Files Preview",
                                    show_label=True,
                                    elem_id="file_preview",
                                    columns=3,
                                    rows=2,
                                    height="auto",
                                    visible=False
                                )
                        
                        # 添加文件上传时的预览更新
                        def update_file_preview(files):
                            if files:
                                # 过滤出图片文件进行预览
                                image_files = []
                                for file in files:
                                    if hasattr(file, 'name'):
                                        file_path = file.name
                                    else:
                                        file_path = str(file)
                                    
                                    # 检查是否是图片文件
                                    if any(file_path.lower().endswith(ext) for ext in ['.jpg', '.jpeg', '.png', '.bmp', '.tiff', '.webp']):
                                        image_files.append(file_path)
                                
                                if image_files:
                                    return gr.update(value=image_files, visible=True)
                            
                            return gr.update(value=[], visible=False)
                        
                        file_upload.change(
                            update_file_preview,
                            inputs=[file_upload],
                            outputs=[file_preview]
                        )

                        # 创建一个包装函数来处理新的输入格式
                        def handle_submit(message, files, chat_bot, current_images_gallery, app_session, params_form, thinking_mode, streaming_mode, fps_setting):
                            print(f"[handle_submit] 收到输入: message='{message}', files={files}, chat_bot长度={len(chat_bot)}")
                            
                            # 如果消息为空且没有文件，直接返回
                            if not message and not files:
                                print("[handle_submit] 消息和文件都为空，直接返回")
                                return message, files, chat_bot, current_images_gallery, app_session, gr.update(visible=False)
                            
                            # 模拟原来的 MultimodalInput 格式
                            class MockInput:
                                def __init__(self, text, files):
                                    self.text = text
                                    self.files = files if files else []
                            
                            mock_question = MockInput(message, files)
                            print(f"[handle_submit] 创建MockInput: text='{mock_question.text}', files={len(mock_question.files)}")
                            
                            # respond 函数返回生成器，我们需要逐步yield结果
                            result_generator = respond(mock_question, chat_bot, app_session, params_form, thinking_mode, streaming_mode, fps_setting)
                            
                            # 如果是生成器，逐步yield
                            if hasattr(result_generator, '__iter__') and not isinstance(result_generator, (str, bytes, tuple)):
                                print("[handle_submit] 使用生成器模式")
                                for result in result_generator:
                                    new_file_input, updated_chat_bot, updated_app_session, stop_btn_update = result
                                    print(f"[handle_submit] yield结果: chat_bot长度={len(updated_chat_bot)}")
                                    
                                    # 更新媒体显示
                                    media_gallery_update = update_media_gallery(updated_app_session)
                                    
                                    # 返回正确的输出格式
                                    yield "", None, updated_chat_bot, media_gallery_update, updated_app_session, stop_btn_update
                            else:
                                print("[handle_submit] 使用非生成器模式")
                                # 如果不是生成器，直接返回
                                new_file_input, updated_chat_bot, updated_app_session, stop_btn_update = result_generator
                                print(f"[handle_submit] 直接返回结果: chat_bot长度={len(updated_chat_bot)}")
                                
                                # 更新图片显示
                                image_gallery_update = update_image_gallery(updated_app_session)
                                
                                yield "", None, updated_chat_bot, image_gallery_update, updated_app_session, stop_btn_update

                        submit_btn.click(
                            handle_submit,
                            [txt_input, file_upload, chat_bot, current_images, app_session, params_form, thinking_mode, streaming_mode, fps_setting],
                            [txt_input, file_upload, chat_bot, current_images, app_session, stop_button]
                        )

                    with gr.Tab("Few Shot", visible=False) as fewshot_tab:
                        fewshot_tab_label = gr.Textbox(value="Few Shot", interactive=False, visible=False)
                        with gr.Row():
                            with gr.Column(scale=1):
                                image_input = gr.Image(type="filepath", sources=["upload"])
                            with gr.Column(scale=3):
                                user_message = gr.Textbox(label="User")
                                assistant_message = gr.Textbox(label="Assistant")
                                with gr.Row():
                                    add_demonstration_button = gr.Button("Add Example")
                                    generate_button = gr.Button(value="Generate", variant="primary")
                        
                        add_demonstration_button.click(
                            fewshot_add_demonstration,
                            [image_input, user_message, assistant_message, chat_bot, app_session],
                            [image_input, user_message, assistant_message, chat_bot, app_session]
                        )
                        generate_button.click(
                            fewshot_respond,
                            [image_input, user_message, chat_bot, app_session, params_form, thinking_mode, streaming_mode, fps_setting],
                            [image_input, user_message, assistant_message, chat_bot, app_session]
                        )

                    chat_tab.select(
                        select_chat_type,
                        [chat_tab_label, app_session],
                        [app_session]
                    )
                    chat_tab.select(
                        clear,
                        [txt_input, file_upload, chat_bot, app_session],
                        [txt_input, file_upload, file_preview, chat_bot, app_session, image_input, user_message, assistant_message]
                    )
                    fewshot_tab.select(
                        select_chat_type,
                        [fewshot_tab_label, app_session],
                        [app_session]
                    )
                    fewshot_tab.select(
                        clear,
                        [txt_input, file_upload, chat_bot, app_session],
                        [txt_input, file_upload, file_preview, chat_bot, app_session, image_input, user_message, assistant_message]
                    )
                    # chat_bot.flushed(flushed, outputs=[txt_input])  # 标准 Chatbot 可能不支持 flushed
                    
                    params_form.change(
                        update_streaming_mode_state,
                        inputs=[params_form],
                        outputs=[streaming_mode]
                    )
                    
                    regenerate.click(
                        regenerate_button_clicked,
                        [txt_input, image_input, user_message, assistant_message, chat_bot, app_session, params_form, thinking_mode, streaming_mode, fps_setting],
                        [txt_input, image_input, user_message, assistant_message, chat_bot, app_session]
                    )
                    clear_button.click(
                        clear,
                        [txt_input, file_upload, chat_bot, app_session],
                        [txt_input, file_upload, file_preview, current_images, chat_bot, app_session, image_input, user_message, assistant_message]
                    )
                    
                    stop_button.click(
                        stop_button_clicked,
                        [app_session],
                        [app_session, stop_button]
                    )

    return demo


if __name__ == "__main__":
    # 解析命令行参数
    parser = argparse.ArgumentParser(description='Web Demo for MiniCPM-V 4.5')
    parser.add_argument('--port', type=int, default=7860, help='Port to run the web demo on')
    parser.add_argument('--no-parallel-encoding', action='store_true', help='Disable parallel image encoding')
    parser.add_argument('--parallel-processes', type=int, default=None, help='Number of parallel processes for image encoding')
    args = parser.parse_args()
    
    # 配置并行编码
    if args.no_parallel_encoding:
        ENABLE_PARALLEL_ENCODING = False
        print("[性能优化] 并行图像编码已禁用")
    else:
        ENABLE_PARALLEL_ENCODING = True
        print("[性能优化] 并行图像编码已启用")
    
    if args.parallel_processes:
        PARALLEL_PROCESSES = args.parallel_processes
        print(f"[性能优化] 设置并行进程数为: {PARALLEL_PROCESSES}")
    else:
        print(f"[性能优化] 自动检测并行进程数，CPU核心数: {mp.cpu_count()}")
    
    # 初始化模型
    initialize_model()
    
    # 创建并启动应用
    demo = create_app()
    demo.launch(
        share=False, 
        debug=True, 
        show_api=False,
        server_port=args.port, 
        server_name="0.0.0.0"
    )