Process data in your database quickly.
A way to process database rows in parallel by splitting the work into ID ranges, running each range on a separate worker, and combining the per-range results.
def build_id_ranges(start_id, end_id, rows_per_range):
    """Split the inclusive ID span [start_id, end_id] into contiguous ranges.

    Each range is an inclusive ``(range_start, range_end)`` tuple covering at
    most ``rows_per_range`` IDs; the final range is truncated at ``end_id``.

    Args:
        start_id: First ID in the span (inclusive).
        end_id: Last ID in the span (inclusive).
        rows_per_range: Maximum number of IDs per range; must be >= 1.

    Returns:
        List of ``(range_start, range_end)`` tuples covering the span in
        order. Empty when ``end_id < start_id``.

    Raises:
        ValueError: If ``rows_per_range`` is less than 1.
    """
    if rows_per_range < 1:
        # A non-positive step would either raise an opaque error from
        # range() (step 0) or silently yield no ranges at all (negative
        # step); fail loudly with a clear message instead.
        raise ValueError("rows_per_range must be >= 1")
    return [
        (range_start_id, min(range_start_id + rows_per_range - 1, end_id))
        for range_start_id in range(start_id, end_id + 1, rows_per_range)
    ]
# Cover IDs 1..100,000 in ten batches of 10,000 rows each.
id_ranges = build_id_ranges(start_id=1, end_id=100_000, rows_per_range=10_000)

# psycopg2 is needed by the per-range worker defined below.
import psycopg2
def process_id_range(id_range):
    """Aggregate order amounts for one inclusive ID range.

    Opens (and closes) its own database connection so the function is
    self-contained and safe to execute on a remote worker, where no live
    connection object can be shipped in.

    Args:
        id_range: ``(start_id, end_id)`` tuple, both bounds inclusive.

    Returns:
        dict with ``row_count`` (number of matching rows) and
        ``total_amount`` (sum of their amounts, as a float).
    """
    start_id, end_id = id_range
    connection = psycopg2.connect(
        host="localhost",
        dbname="app",
        user="app",
        password="app",
    )
    try:
        # NOTE: psycopg2's `with connection:` only wraps a transaction
        # (commit/rollback) — it does NOT close the connection, so the
        # original version leaked one connection per call. Close explicitly.
        with connection:
            with connection.cursor() as cursor:
                # Parameterized query: the bounds are passed separately,
                # never interpolated into the SQL string.
                cursor.execute(
                    "SELECT amount FROM orders WHERE id BETWEEN %s AND %s",
                    (start_id, end_id),
                )
                amounts = [row[0] for row in cursor.fetchall()]
        return {"row_count": len(amounts), "total_amount": float(sum(amounts))}
    finally:
        connection.close()

from burla import remote_parallel_map
from burla import remote_parallel_map

# Fan the ID ranges out to remote workers; one result dict comes back per range.
range_results = remote_parallel_map(process_id_range, id_ranges)

# Reduce the per-range partial aggregates into overall totals.
total_rows = sum(range_result["row_count"] for range_result in range_results)
total_amount = sum(range_result["total_amount"] for range_result in range_results)
print(f"Total rows processed: {total_rows}")
print(f"Total amount: {total_amount}")
# Dry run: exercise the same worker over a 5,000-ID slice before committing
# to the full dataset.
small_test_ranges = build_id_ranges(rows_per_range=1_000, start_id=1, end_id=5_000)
remote_parallel_map(process_id_range, small_test_ranges)