import gradio as gr
import torch
from transformers import AutoProcessor, AutoModelForCausalLM
from PIL import Image
import spaces

# Load model and processor
model_path = 'baidu/ERNIE-4.5-VL-28B-A3B-Thinking'
print("Loading model...")
model = AutoModelForCausalLM.from_pretrained(
    model_path,
    device_map="auto",
    torch_dtype=torch.bfloat16,
    trust_remote_code=True
)
processor = AutoProcessor.from_pretrained(model_path, trust_remote_code=True)
model.add_image_preprocess(processor)
print("Model loaded successfully!")


@spaces.GPU(duration=120)
def respond(message, history):
    """
    Process chat messages and generate responses.

    Args:
        message: dict with 'text' and optional 'files' keys
        history: list of past conversation messages in Gradio's
            "messages" format ({"role": ..., "content": ...} dicts)

    Yields:
        str: Generated response text (streamed)
    """
    # Build message history for the model. With type="messages",
    # Gradio passes history entries as role/content dicts; file
    # attachments arrive as a tuple (or list) of file paths.
    messages = []
    for msg in history:
        content = msg.get("content", "")
        if isinstance(content, str):
            # Plain text turn (user or assistant)
            messages.append({
                "role": msg["role"],
                "content": [{"type": "text", "text": content}]
            })
        elif isinstance(content, (tuple, list)):
            # File attachment turn: treat each path as an image
            messages.append({
                "role": msg["role"],
                "content": [
                    {"type": "image_url", "image_url": {"url": file_path}}
                    for file_path in content
                ]
            })

    # Add current message
    content = []
    if message.get("text"):
        content.append({"type": "text", "text": message["text"]})
    if message.get("files"):
        for file_path in message["files"]:
            content.append({
                "type": "image_url",
                "image_url": {"url": file_path}
            })
    messages.append({"role": "user", "content": content})

    # Prepare inputs
    text = processor.tokenizer.apply_chat_template(
        messages,
        tokenize=False,
        add_generation_prompt=True,
    )
    image_inputs, video_inputs = processor.process_vision_info(messages)
    inputs = processor(
        text=[text],
        images=image_inputs,
        videos=video_inputs,
        padding=True,
        return_tensors="pt",
    )
    device = next(model.parameters()).device
    inputs = inputs.to(device)

    # Generate the full response (no incremental decoding here)
    generated_ids = model.generate(
        inputs=inputs['input_ids'].to(device),
        **inputs,
        max_new_tokens=1024,
        use_cache=False
    )
    output_text = processor.decode(
        generated_ids[0][len(inputs['input_ids'][0]):],
        skip_special_tokens=True
    )

    # Pseudo-streaming: the completion is already finished, so replay
    # it character by character for a typing effect in the UI
    for i in range(len(output_text)):
        yield output_text[:i + 1]


# Create Gradio interface
with gr.Blocks(fill_height=True, theme=gr.themes.Soft()) as demo:
    gr.HTML("""
        <div style="text-align: center;">
            <h1>🤖 ERNIE-4.5-VL Chatbot</h1>
            <p>Powered by Baidu's ERNIE-4.5-VL-28B with Advanced Reasoning</p>
            <p>Built with anycoder</p>
        </div>
""") chatbot = gr.ChatInterface( fn=respond, type="messages", multimodal=True, title="", description="Upload images and ask questions! ERNIE-4.5-VL can understand and reason about visual content.", examples=[ { "text": "What can you see in this image?", "files": [] }, { "text": "Describe this image in detail", "files": [] }, { "text": "What is happening in this picture?", "files": [] } ], cache_examples=False, ) if __name__ == "__main__": demo.launch()