import gradio as gr
import torch
from transformers import AutoProcessor, AutoModelForCausalLM
from PIL import Image
import spaces

# Load model and processor
model_path = 'baidu/ERNIE-4.5-VL-28B-A3B-Thinking'
print("Loading model...")
model = AutoModelForCausalLM.from_pretrained(
    model_path,
    device_map="auto",
    torch_dtype=torch.bfloat16,
    trust_remote_code=True
)
processor = AutoProcessor.from_pretrained(model_path, trust_remote_code=True)
model.add_image_preprocess(processor)
print("Model loaded successfully!")


@spaces.GPU(duration=120)
def respond(message, history):
    """
    Process chat messages and generate responses.

    Args:
        message: dict with 'text' and optional 'files' keys
        history: list of past conversation messages in Gradio's
            "messages" format ({"role": ..., "content": ...} dicts)

    Yields:
        str: Generated response text (streamed)
    """
    # Build message history for the model. With type="messages",
    # Gradio passes history entries as role/content dicts; file
    # attachments arrive as a tuple (or list) of file paths.
    messages = []
    for msg in history:
        content = msg.get("content", "")
        if isinstance(content, str):
            # Plain text turn (user or assistant)
            messages.append({
                "role": msg["role"],
                "content": [{"type": "text", "text": content}]
            })
        elif isinstance(content, (tuple, list)):
            # File attachment turn: treat each path as an image
            messages.append({
                "role": msg["role"],
                "content": [
                    {"type": "image_url", "image_url": {"url": file_path}}
                    for file_path in content
                ]
            })

    # Add current message
    content = []
    if message.get("text"):
        content.append({"type": "text", "text": message["text"]})
    if message.get("files"):
        for file_path in message["files"]:
            content.append({
                "type": "image_url",
                "image_url": {"url": file_path}
            })
    messages.append({"role": "user", "content": content})

    # Prepare inputs
    text = processor.tokenizer.apply_chat_template(
        messages,
        tokenize=False,
        add_generation_prompt=True,
    )
    image_inputs, video_inputs = processor.process_vision_info(messages)
    inputs = processor(
        text=[text],
        images=image_inputs,
        videos=video_inputs,
        padding=True,
        return_tensors="pt",
    )
    device = next(model.parameters()).device
    inputs = inputs.to(device)

    # Generate the full response (no incremental decoding here)
    generated_ids = model.generate(
        inputs=inputs['input_ids'].to(device),
        **inputs,
        max_new_tokens=1024,
        use_cache=False
    )
    output_text = processor.decode(
        generated_ids[0][len(inputs['input_ids'][0]):],
        skip_special_tokens=True
    )

    # Pseudo-streaming: the completion is already finished, so replay
    # it character by character for a typing effect in the UI
    for i in range(len(output_text)):
        yield output_text[:i + 1]


# Create Gradio interface
with gr.Blocks(fill_height=True, theme=gr.themes.Soft()) as demo:
    gr.HTML("""
        <div style="text-align: center;">
            <h1>🤖 ERNIE-4.5-VL Chatbot</h1>
            <p>Powered by Baidu's ERNIE-4.5-VL-28B with Advanced Reasoning</p>
            <p>Built with anycoder</p>
        </div>
""") chatbot = gr.ChatInterface( fn=respond, type="messages", multimodal=True, title="", description="Upload images and ask questions! ERNIE-4.5-VL can understand and reason about visual content.", examples=[ { "text": "What can you see in this image?", "files": [] }, { "text": "Describe this image in detail", "files": [] }, { "text": "What is happening in this picture?", "files": [] } ], cache_examples=False, ) if __name__ == "__main__": demo.launch()