ezpz.data.vision⚓︎
ezpz/data/vision.py
Sam Foreman 2024-12-27
HFImageNet1K
⚓︎
Bases: Dataset
Thin wrapper to use HF imagenet-1k with torchvision transforms.
Source code in src/ezpz/data/vision.py
class HFImageNet1K(Dataset):
    """Adapter exposing a HF imagenet-1k split as a torch ``Dataset``.

    Each item is returned as an ``(image, label)`` pair, with the optional
    torchvision ``transform`` applied to the image on access.
    """

    def __init__(self, hf_dataset, transform=None):
        # Keep a handle on the underlying HF dataset; transforms are
        # applied lazily, per item, in __getitem__.
        self.ds = hf_dataset
        self.transform = transform

    def __len__(self) -> int:
        return len(self.ds)

    def __getitem__(self, idx: int):
        record = self.ds[int(idx)]
        image = record["image"]  # PIL.Image or array
        target = int(record["label"])
        if self.transform is not None:
            image = self.transform(image)
        return image, target
get_imagenet(train_batch_size=128, test_batch_size=128, outdir=None, num_workers=1, shuffle=False, pin_memory=True)
⚓︎
Return train/test ImageNet datasets, loaders, and (optional) samplers.
Expects directory layout:
<outdir>/data/imagenet/
train/
class1/
class2/
...
val/
class1/
class2/
...
where train/ and val/ are standard ImageNet-style folders.
Source code in src/ezpz/data/vision.py
def get_imagenet(
    train_batch_size: int = 128,
    test_batch_size: int = 128,
    outdir: Optional[str | Path] = None,
    num_workers: int = 1,
    shuffle: bool = False,
    pin_memory: bool = True,
) -> dict:
    """Return train/test ImageNet datasets, loaders, and (optional) samplers.

    Expects directory layout:

        <outdir>/data/imagenet/
            train/
                class1/
                class2/
                ...
            val/
                class1/
                class2/
                ...

    where `train/` and `val/` are standard ImageNet-style folders.
    """
    root = OUTPUT_DIR if outdir is None else Path(outdir)
    datadir = Path(root).joinpath("data", "imagenet")

    # Standard ImageNet channel statistics.
    normalize = transforms.Normalize(
        mean=(0.485, 0.456, 0.406),
        std=(0.229, 0.224, 0.225),
    )
    train_transform = transforms.Compose(
        [
            transforms.RandomResizedCrop(224),
            transforms.RandomHorizontalFlip(),
            transforms.ToTensor(),
            normalize,
        ]
    )
    test_transform = transforms.Compose(
        [
            transforms.Resize(256),
            transforms.CenterCrop(224),
            transforms.ToTensor(),
            normalize,
        ]
    )

    # Basic sanity check only on rank 0 (no auto-download for ImageNet).
    if ezpz.dist.get_rank() == 0:
        train_dir = datadir / "train"
        val_dir = datadir / "val"
        if not (train_dir.is_dir() and val_dir.is_dir()):
            raise FileNotFoundError(
                f"Expected ImageNet data under:\n"
                f" {train_dir}\n"
                f" {val_dir}\n"
                "with standard ImageFolder layout."
            )
    if ezpz.dist.get_world_size() > 1:
        ezpz.dist.barrier()

    train_data = datasets.ImageFolder(
        root=datadir / "train", transform=train_transform
    )
    test_data = datasets.ImageFolder(
        root=datadir / "val", transform=test_transform
    )

    train_kwargs: dict = dict(
        batch_size=train_batch_size,
        pin_memory=pin_memory,
        num_workers=num_workers,
    )
    test_kwargs: dict = dict(
        batch_size=test_batch_size,
        pin_memory=pin_memory,
        num_workers=num_workers,
    )

    train_sampler = None
    test_sampler = None
    rank = ezpz.dist.get_rank()
    world_size = ezpz.dist.get_world_size()
    if world_size > 1:
        # Shard each dataset across ranks; shuffle only training shards.
        train_sampler = DistributedSampler(
            train_data,
            rank=rank,
            num_replicas=world_size,
            shuffle=True,
        )
        test_sampler = DistributedSampler(
            test_data,
            rank=rank,
            num_replicas=world_size,
            shuffle=False,
        )
        train_kwargs["sampler"] = train_sampler
        test_kwargs["sampler"] = test_sampler
    else:
        train_kwargs["shuffle"] = shuffle

    train_loader = torch.utils.data.DataLoader(
        dataset=train_data, **train_kwargs
    )
    test_loader = torch.utils.data.DataLoader(dataset=test_data, **test_kwargs)
    return {
        "train": {
            "data": train_data,
            "loader": train_loader,
            "sampler": train_sampler,
        },
        "test": {
            "data": test_data,
            "loader": test_loader,
            "sampler": test_sampler,
        },
    }
get_imagenet1k(train_batch_size=128, test_batch_size=128, outdir=None, num_workers=1, download=True, shuffle=False, pin_memory=True)
⚓︎
ILSVRC/imagenet-1k via Hugging Face, mirroring get_mnist API/behavior.
Source code in src/ezpz/data/vision.py
def get_imagenet1k(
    train_batch_size: int = 128,
    test_batch_size: int = 128,
    outdir: Optional[str | Path] = None,
    num_workers: int = 1,
    download: bool = True,
    shuffle: bool = False,
    pin_memory: bool = True,
) -> dict:
    """ILSVRC/imagenet-1k via Hugging Face, mirroring get_mnist API/behavior.

    Args:
        train_batch_size: Batch size for the training loader.
        test_batch_size: Batch size for the validation ("test") loader.
        outdir: Root output directory; defaults to ``OUTPUT_DIR``. Data is
            cached under ``<outdir>/data/imagenet_hf``.
        num_workers: DataLoader worker count.
        download: If False, require an already-populated cache directory.
        shuffle: Shuffle the train loader (single-process only; with
            world_size > 1 a DistributedSampler handles shuffling instead).
        pin_memory: Passed through to both DataLoaders.

    Returns:
        dict with "train"/"test" entries, each holding "data", "loader",
        and "sampler" (sampler is None when not distributed).

    Raises:
        ImportError: If the `datasets` library is unavailable.
        FileNotFoundError: If ``download=False`` and the cache is empty.
    """
    try:
        from datasets import load_dataset
    except ImportError as exc:
        # BUGFIX: message previously referenced `get_imagenet_hf`, which
        # does not match this function's name.
        raise ImportError(
            "The `datasets` library is required for get_imagenet1k.\n"
            "Install via `pip install datasets`."
        ) from exc
    outdir = OUTPUT_DIR if outdir is None else Path(outdir)
    datadir = Path(outdir).joinpath("data", "imagenet_hf")
    datadir.mkdir(parents=True, exist_ok=True)
    # Optional "don't download" behavior: fail fast if the cache is empty.
    if not download and not any(datadir.iterdir()):
        raise FileNotFoundError(
            f"No cached imagenet-1k dataset found in {datadir} and download=False."
        )
    rank = ezpz.dist.get_rank()
    world_size = ezpz.dist.get_world_size()
    # Only rank 0 triggers the initial download into cache_dir; the other
    # ranks wait at the barrier below and then read from the warm cache.
    if rank == 0 and download:
        for split in ("train", "validation"):
            _ = load_dataset(
                "ILSVRC/imagenet-1k",
                split=split,
                cache_dir=datadir.as_posix(),
            )
    if world_size > 1:
        ezpz.dist.barrier()
    # Now every rank loads from the shared cache_dir.
    hf_train = load_dataset(
        "ILSVRC/imagenet-1k",
        split="train",
        cache_dir=datadir.as_posix(),
    )
    hf_val = load_dataset(
        "ILSVRC/imagenet-1k",
        split="validation",
        cache_dir=datadir.as_posix(),
    )
    # ImageNet-style normalization and transforms.
    normalize = transforms.Normalize(
        mean=(0.485, 0.456, 0.406),
        std=(0.229, 0.224, 0.225),
    )
    train_transform = transforms.Compose(
        [
            transforms.RandomResizedCrop(224),
            transforms.RandomHorizontalFlip(),
            transforms.ToTensor(),
            normalize,
        ]
    )
    test_transform = transforms.Compose(
        [
            transforms.Resize(256),
            transforms.CenterCrop(224),
            transforms.ToTensor(),
            normalize,
        ]
    )
    dataset1 = HFImageNet1K(hf_train, transform=train_transform)
    dataset2 = HFImageNet1K(hf_val, transform=test_transform)
    train_kwargs: dict = {
        "batch_size": train_batch_size,
        "pin_memory": pin_memory,
        "num_workers": num_workers,
    }
    test_kwargs: dict = {
        "batch_size": test_batch_size,
        "pin_memory": pin_memory,
        "num_workers": num_workers,
    }
    sampler1, sampler2 = None, None
    if world_size > 1:
        # Shard across ranks; shuffle only the training shards.
        sampler1 = DistributedSampler(
            dataset1,
            rank=rank,
            num_replicas=world_size,
            shuffle=True,
        )
        sampler2 = DistributedSampler(
            dataset2,
            rank=rank,
            num_replicas=world_size,
            shuffle=False,
        )
        train_kwargs["sampler"] = sampler1
        test_kwargs["sampler"] = sampler2
    else:
        train_kwargs["shuffle"] = shuffle
    train_loader = torch.utils.data.DataLoader(
        dataset=dataset1,
        **train_kwargs,
    )
    test_loader = torch.utils.data.DataLoader(
        dataset=dataset2,
        **test_kwargs,
    )
    return {
        "train": {
            "data": dataset1,
            "loader": train_loader,
            "sampler": sampler1,
        },
        "test": {
            "data": dataset2,
            "loader": test_loader,
            "sampler": sampler2,
        },
    }
get_openimages(train_batch_size=128, test_batch_size=128, outdir=None, num_workers=1, download=False, shuffle=False, pin_memory=True)
⚓︎
Return train/test OpenImages datasets, loaders, and samplers.
Expects an ImageFolder-style layout:
<outdir>/data/openimages/
train/
class_000/
class_001/
...
val/
class_000/
class_001/
...
If the train/ and val/ folders are missing, rank 0 stages a small OpenImages subset automatically; the download flag itself is unused and kept only for API parity.
Source code in src/ezpz/data/vision.py
def get_openimages(
    train_batch_size: int = 128,
    test_batch_size: int = 128,
    outdir: Optional[str | Path] = None,
    num_workers: int = 1,
    download: bool = False,  # kept for API parity; not used
    shuffle: bool = False,
    pin_memory: bool = True,
) -> dict:
    """Return train/test OpenImages datasets, loaders, and samplers.

    Expects an ImageFolder-style layout:

        <outdir>/data/openimages/
            train/
                class_000/
                class_001/
                ...
            val/
                class_000/
                class_001/
                ...

    If the `train/`/`val/` folders are missing, rank 0 stages a small
    OpenImages subset via `download_openimages_subset`. The `download`
    flag itself is unused and kept only for API parity with siblings.
    """
    outdir = OUTPUT_DIR if outdir is None else Path(outdir)
    datadir = Path(outdir).joinpath("data", "openimages")
    train_dir = datadir / "train"
    val_dir = datadir / "val"
    # Sanity check (only on rank 0): stage the data when it is absent.
    if ezpz.dist.get_rank() == 0:
        if not train_dir.is_dir() or not val_dir.is_dir():
            from ezpz.data.utils import download_openimages_subset

            # NOTE(review): only the "train" split is staged here, yet
            # val_dir is required by the ImageFolder below — confirm that
            # download_openimages_subset also populates val/.
            download_openimages_subset(
                outdir=datadir,
                split="train",
                max_classes=50,
                num_workers=num_workers,
            )
    if ezpz.dist.get_world_size() > 1:
        ezpz.dist.barrier()
    # Use standard ImageNet/OpenImages-like normalization.
    normalize = transforms.Normalize(
        mean=(0.485, 0.456, 0.406),
        std=(0.229, 0.224, 0.225),
    )
    train_transform = transforms.Compose(
        [
            transforms.RandomResizedCrop(224),
            transforms.RandomHorizontalFlip(),
            transforms.ToTensor(),
            normalize,
        ]
    )
    test_transform = transforms.Compose(
        [
            transforms.Resize(256),
            transforms.CenterCrop(224),
            transforms.ToTensor(),
            normalize,
        ]
    )
    # Datasets
    dataset1 = datasets.ImageFolder(
        root=train_dir,
        transform=train_transform,
    )
    dataset2 = datasets.ImageFolder(
        root=val_dir,
        transform=test_transform,
    )
    train_kwargs: dict = {
        "batch_size": train_batch_size,
        "pin_memory": pin_memory,
        "num_workers": num_workers,
    }
    test_kwargs: dict = {
        "batch_size": test_batch_size,
        "pin_memory": pin_memory,
        "num_workers": num_workers,
    }
    sampler1, sampler2 = None, None
    rank = ezpz.dist.get_rank()
    world_size = ezpz.dist.get_world_size()
    if world_size > 1:
        # Shard across ranks; shuffle only the training shards.
        sampler1 = DistributedSampler(
            dataset1,
            rank=rank,
            num_replicas=world_size,
            shuffle=True,
        )
        sampler2 = DistributedSampler(
            dataset2,
            rank=rank,
            num_replicas=world_size,
            shuffle=False,
        )
        train_kwargs["sampler"] = sampler1
        test_kwargs["sampler"] = sampler2
    else:
        train_kwargs["shuffle"] = shuffle
    train_loader = torch.utils.data.DataLoader(
        dataset=dataset1,
        **train_kwargs,
    )
    test_loader = torch.utils.data.DataLoader(
        dataset=dataset2,
        **test_kwargs,
    )
    return {
        "train": {
            "data": dataset1,
            "loader": train_loader,
            "sampler": sampler1,
        },
        "test": {
            "data": dataset2,
            "loader": test_loader,
            "sampler": sampler2,
        },
    }