
Scan 2.76B NYC taxi trips
Dataset: NYC TLC monthly trip files
import io
from dataclasses import dataclass
import pandas as pd
import pyarrow.parquet as pq
import requests
from burla import remote_parallel_map
# Base URL that every monthly trip-file URL is built from.
BASE = "https://d37ci6vzurychx.cloudfront.net/trip-data"

# File-name prefixes of the trip datasets to scan.
TAXI_TYPES = [
    "yellow",
    "green",
    "fhv",
    "fhvhv",
]

# Years 2011 through 2024 inclusive (a range excludes its stop value).
YEARS = range(2011, 2024 + 1)
@dataclass(frozen=True)
class MonthJob:
    """Identify one monthly trip file by taxi type, year, and month.

    The dataclass is frozen, so instances are immutable; with the default
    generated ``__eq__`` they are also hashable and compare by value.
    """

    # Presumably one of the dataset prefixes such as "yellow" -- confirm
    # against the code that constructs the jobs.
    taxi_type: str
    # Calendar year of the file.
    year: int
    # Calendar month of the file (presumably 1-12; not validated here).
    month: int
def monthly_url(taxi_type: str, year: int, month: int) -> str:
    """Return the URL of the parquet file for one taxi type and month.

    Args:
        taxi_type: Dataset prefix placed at the start of the file name.
        year: Four-digit year placed in the file name.
        month: Month number; zero-padded to two digits in the file name.

    Returns:
        ``{BASE}/{taxi_type}_tripdata_{year}-{month:02d}.parquet``.
        The arguments are not validated, so the URL may not exist.
    """
    return f"{BASE}/{taxi_type}_tripdata_{year}-{month:02d}.parquet"


# Step 1: Make one task per month file
Step 2: Count pickups for one file
Step 3: Smoke test a few months
Step 4: Build the time series
What's the point?
Last updated