
Process one giant file
Dataset: one large JSONL export
/workspace/shared/giant/events.jsonl

import json
from pathlib import Path
from burla import remote_parallel_map
# Locations on the shared workspace volume used by this walkthrough.
# The single large JSONL export that is the input dataset.
INPUT_PATH = Path("/workspace/shared/giant/events.jsonl")
# NOTE(review): presumably where the split step writes its chunk files — confirm.
CHUNK_DIR = Path("/workspace/shared/giant/chunks")
# NOTE(review): presumably holds the per-chunk reports produced by processing — confirm.
REPORT_DIR = Path("/workspace/shared/giant/reports")
# NOTE(review): presumably holds the reduced, combined output — confirm.
FINAL_DIR = Path("/workspace/shared/giant/final")
LINES_PER_CHUNK = 50_000

Step 1: Split without loading the file into memory
Step 2: Process one chunk
Step 3: Test one chunk, then run all chunks
Step 4: Reduce the chunk reports
What's the point?
Last updated