
Find NOAA's rainiest day
Dataset: NOAA GHCN-Daily by-year files
import csv
import gzip
import heapq
import io
import json
from datetime import date
from pathlib import Path
import requests
from burla import remote_parallel_map
# Root URL of the NOAA GHCN-Daily "by_year" directory on the NCEI server.
BASE = "https://www.ncei.noaa.gov/pub/data/ghcn/daily/by_year"

# Output locations under /workspace/shared.
# NOTE(review): presumably PART_DIR holds per-year partial results and
# FINAL_DIR the merged output -- confirm against the code that writes them.
PART_DIR = Path("/workspace/shared/ghcn-rain/parts")
FINAL_DIR = Path("/workspace/shared/ghcn-rain/final")

# NOTE(review): presumably the number of top records retained for each
# year -- confirm at the use site.
TOP_PER_YEAR = 100

# Inclusive-looking year bounds for the scan. END_YEAR is evaluated once,
# at import time, as the current calendar year on the machine's local clock.
START_YEAR = 1750
END_YEAR = date.today().year

# Step 1: Stream one year per worker
Step 2: Keep a heap, not the whole file
Step 3: Smoke test one year
Step 4: Reduce the years
What's the point?
Last updated