
Resize an image corpus
Dataset: source images in S3
import io
import json
import os
from pathlib import Path
import boto3
from PIL import Image, ImageOps
from burla import remote_parallel_map
# Bucket names: where the originals are read from and where the resized
# copies go (presumably S3 buckets accessed via boto3 -- confirm in the
# worker code, which is not shown here).
SRC_BUCKET, DST_BUCKET = "my-photos", "my-photos-resized"

# Key prefixes paired with the buckets above: input keys are expected under
# SRC_PREFIX, outputs under OUT_PREFIX.
SRC_PREFIX, OUT_PREFIX = "originals/", "resized/"

# Location of the manifest file on the shared workspace filesystem; the
# .jsonl suffix suggests one JSON record per line.
MANIFEST_PATH = Path("/workspace/shared/image-resize/manifest.jsonl")

# Number of items grouped into a single chunk (presumably image keys per
# worker call -- verify against the chunking step).
CHUNK_SIZE = 1000
SIZES = [256, 512, 1024]

Step 1: Chunk the image keys
Step 2: Resize inside the worker
Step 3: Smoke test one chunk
Step 4: Stream the manifest
What's the point?
Last updated