
ETL 10K S3 files to Postgres
Dataset: daily S3 file drop
import gzip
import json
import os
from pathlib import Path
import boto3
import psycopg2
from burla import remote_parallel_map
from psycopg2.extras import execute_values
# --- Pipeline configuration -------------------------------------------------

# Name of the S3 bucket the pipeline works against.
S3_BUCKET = "my-events-bucket"

# Hard-coded date string (ISO format, YYYY-MM-DD).
# NOTE(review): presumably selects which day's file drop is processed; the
# code that consumes it is not visible here -- confirm.
DATE = "2025-04-19"

# Read from the environment at import time; a missing DATABASE_URL raises
# KeyError immediately rather than failing later.
# NOTE(review): presumably a Postgres connection string for psycopg2 -- confirm.
DATABASE_URL = os.environ["DATABASE_URL"]

# NOTE(review): presumably the cap on how many loaders may write to Postgres
# concurrently (see "Step 4: Protect Postgres"); usage is not visible here.
MAX_DB_LOADERS = 25

# Location of the JSON-lines load report on the shared workspace volume.
# NOTE(review): whether this file is appended to or overwritten is not shown.
REPORT_PATH = Path("/workspace/shared/file-drop-etl/load-report.jsonl")

# Step 1: List the files
Step 2: Transform and insert one file
Step 3: Smoke test one file
Step 4: Protect Postgres
What's the point?
Last updated