ERDES – Ocular Ultrasound Classification All Tasks
Collection
ERDES benchmark: 8 architectures × 5 ocular ultrasound classification tasks. Paper: arXiv:2508.04735 | Dataset: pcvlab/erdes • 59 items • Updated
• 1
Trained model weights for PVD classification (normal vs. PVD) using ocular ultrasound videos.
| Property | Value |
|---|---|
| Architecture | ResNet3D (block=basic, layers=[4,4,4,4], block_inplanes=[64,128,256,512]) |
| Input modality | 3D ocular ultrasound video |
| Input shape | [1, 96, 128, 128] (C, D, H, W) |
| Pooling | Global Average Pooling |
| Output | Binary classification (sigmoid) |
| Label | Class |
|---|---|
| 0 | Normal |
| 1 | Posterior Vitreous Detachment |
pip install git+https://github.com/OSUPCVLab/ERDES.git ultralytics
import torch
import numpy as np
from huggingface_hub import hf_hub_download
from safetensors.torch import load_file
from ultralytics import YOLO
from erdes.models.components.cls_model import ResNet3DClassifier
from erdes.data.components.utils import resize
# --- 1. Load YOLO for ocular globe detection ---
# Downloads the detector checkpoint from the Hugging Face Hub (cached locally
# after the first call) and wraps it in an Ultralytics YOLO model.
yolo = YOLO(hf_hub_download("pcvlab/yolov8_ocular_ultrasound_globe_detection", "yolov8_ocular_ultrasound_globe_detection.pt"))
# --- 2. Crop your POCUS ultrasound video using YOLO (finds largest globe bbox across all frames) ---
# --- 2. Crop your POCUS ultrasound video using YOLO (finds largest globe bbox across all frames) ---
def crop_video(video_path, model, conf=0.8):
    """Crop every frame of a video to the single largest detected globe bbox.

    Two passes over the video: the first finds the largest (by normalized
    area) bounding box seen on any frame; the second crops *every* frame
    with that one box so all output frames share the same spatial size.

    Args:
        video_path: Path to the input video file.
        model: Detector exposing the Ultralytics ``predict(...)`` API; each
            result must provide ``boxes.xywhn`` (normalized center-x,
            center-y, width, height) and ``orig_img`` ([H, W, C] BGR).
        conf: Minimum detection confidence threshold.

    Returns:
        np.ndarray of shape [D, H, W, C] with the cropped frames.

    Raises:
        ValueError: If no globe is detected on any frame.
    """
    # First pass: find the largest bounding box across all frames.
    area_max, cropping_bbox = 0, None
    for frame in model.predict(video_path, stream=True, verbose=False, conf=conf):
        if len(frame.boxes.xywhn):
            bbox = frame.boxes.xywhn[0].cpu().numpy()
            area = bbox[2] * bbox[3]
            if area > area_max:
                area_max, cropping_bbox = area, bbox
    if cropping_bbox is None:
        raise ValueError("YOLO could not detect ocular globe in video.")
    # Second pass: crop every frame with the largest bbox.
    frames = []
    for frame in model.predict(video_path, stream=True, verbose=False, conf=conf):
        img = frame.orig_img  # [H, W, C] BGR
        h, w, _ = img.shape
        x_c, y_c, bw, bh = cropping_bbox
        # Clamp the corners to the image bounds: a box touching the frame
        # border would otherwise produce negative slice indices and yield
        # an empty (or wrapped-around) crop.
        x1 = max(0, int((x_c - bw / 2) * w))
        y1 = max(0, int((y_c - bh / 2) * h))
        x2 = min(w, int((x_c + bw / 2) * w))
        y2 = min(h, int((y_c + bh / 2) * h))
        frames.append(img[y1:y2, x1:x2])
    return np.stack(frames)  # [D, H, W, C]
frames = crop_video("your_video.mp4", yolo)  # [D, H, W, C]
# --- 3. Preprocess ---
# Channels-last numpy video -> channels-first float tensor: [C, D, H, W].
video = torch.from_numpy(frames).float().permute(3, 0, 1, 2)
if video.shape[0] == 3:
    # Collapse the 3 BGR channels into one grayscale channel: [1, D, H, W].
    video = video.mean(dim=0, keepdim=True)
# Pad + resize to the model's expected volume, then scale pixels to [0, 1].
video = resize((96, 128, 128))(video) / 255.0
video = video.unsqueeze(0)  # add batch dim -> [1, 1, 96, 128, 128]
# --- 4. Load model and run inference ---
model = ResNet3DClassifier(
    in_channels=1,
    num_classes=1,
    block='basic',
    layers=[4, 4, 4, 4],
    block_inplanes=[64, 128, 256, 512],
    pooling="avg",
)
weights = load_file(hf_hub_download("pcvlab/resnet3d_normal_vs_pvd", "model.safetensors"))
model.load_state_dict(weights)
model.eval()  # inference mode: freezes batch-norm statistics / disables dropout
with torch.no_grad():
    logit = model(video)
prob = torch.sigmoid(logit).item()  # P(class 1 = PVD)
pred = int(prob > 0.5)
labels = {0: 'Normal', 1: 'Posterior Vitreous Detachment'}
# Report the confidence of the *predicted* class: `prob` is P(PVD), so a
# "Normal" prediction has confidence 1 - prob (the original always printed
# P(PVD), which is misleading when pred == 0).
confidence = prob if pred == 1 else 1 - prob
print(f"Prediction: {labels[pred]} (confidence: {confidence:.3f})")
If you use this model, please cite the ERDES paper:
@misc{ozkut2026erdes,
title={ERDES: A Benchmark Video Dataset for Retinal Detachment and Macular Status Classification in Ocular Ultrasound},
author={Yasemin Ozkut and Pouyan Navard and Srikar Adhikari and Elaine Situ-LaCasse and Josie Acu{\~n}a and Adrienne Yarnish and Alper Yilmaz},
year={2026},
eprint={2508.04735},
archivePrefix={arXiv},
primaryClass={cs.CV},
url={https://arxiv.org/abs/2508.04735}
}