Inference script

#1
by Phase-Technologies - opened

import torch
import gc
from transformers import TextStreamer, pipeline, BitsAndBytesConfig

# 1. Clear GPU memory to recover from the previous OOM

# Unbind objects left over from an earlier run (if any) so nothing keeps
# referencing them; at module scope globals() is the same namespace that
# locals() reports.
for _stale in ("pipe", "outputs"):
    if _stale in globals():
        del globals()[_stale]
del _stale

# Collect unreachable objects first, then ask PyTorch to release its cached
# CUDA allocations.
gc.collect()
torch.cuda.empty_cache()

# 2. Configure 4-bit quantization properly for custom architectures

# bitsandbytes quantization settings handed to the model loader.
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,                     # load weights in 4-bit precision
    bnb_4bit_quant_type="nf4",             # 4-bit data type: NF4
    bnb_4bit_use_double_quant=True,        # nested (double) quantization enabled
    bnb_4bit_compute_dtype=torch.float16,  # compute dtype: fp16
)

# Model identifier handed to the pipeline factory.
model_id = "Xerv-AI/tarn"

# NOTE: trust_remote_code=True allows code shipped with the model repository
# to run locally; keep it enabled only for repositories you trust.
pipe = pipeline(
    task="image-text-to-text",
    model=model_id,
    trust_remote_code=True,
    device_map="auto",
    # Forwarded to the model loader: quantized weights, fp16 dtype.
    model_kwargs={"quantization_config": bnb_config, "torch_dtype": torch.float16},
)

# Streamer built on the pipeline's tokenizer; skip_prompt=True asks it to
# leave the prompt out of the streamed text.
streamer = TextStreamer(tokenizer=pipe.tokenizer, skip_prompt=True)

# Single-turn chat request: one user message whose content is an image
# (referenced by URL) followed by a text instruction.
messages = [
    {
        "role": "user",
        "content": [
            {"type": "image", "url": "https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/p-blog/candy.JPG"},
            {"type": "text", "text": "Analyze the visual artifacts present in this image and define the principles of triboelectricity."},
        ],
    },
]

# Announce the run, then invoke the pipeline on the chat messages. The
# generation options cap the reply at 512 new tokens and attach the streamer.
print("=== Initiating Real-Time Telemetry Stream (Quantized) ===")
outputs = pipe(
    text=messages,
    generate_kwargs={"max_new_tokens": 512, "streamer": streamer},
)

Sign up or log in to comment