
Run a 2M-user API backfill
Dataset: user ids to backfill
import json
import os
import time
from pathlib import Path
import httpx
from burla import remote_parallel_map
# Run configuration for the backfill script.

# Read from the API_KEY environment variable when this module is loaded;
# os.environ[...] raises KeyError if the variable is not set.
API_KEY = os.environ["API_KEY"]
# Filesystem location of a .jsonl file — presumably where backfilled user
# records are written; confirm against the code that uses it.
OUT_PATH = Path("/workspace/shared/api-backfill/users.jsonl")
# Presumably the number of user ids grouped into one chunk of work — confirm
# against the chunking code.
CHUNK = 1_000
# Presumably the upper bound on workers running at the same time — confirm
# where this value is consumed.
MAX_PARALLELISM = 1_000
SECONDS_BETWEEN_CALLS_PER_WORKER = 1.0

Step 1: Chunk the ids
Step 2: Put pacing near the HTTP call
Step 3: Smoke test the real behavior
Step 4: Cap live workers
What's the point?
Last updated