
Ranking 572M Amazon reviews
Dataset: Amazon Reviews 2023
import heapq
import json
import math
from pathlib import Path
import requests
from burla import remote_parallel_map
from huggingface_hub import HfApi
# Dataset repository identifier, interpolated into the Hugging Face dataset URL below.
REPO_ID = "McAuley-Lab/Amazon-Reviews-2023"
# URL prefix pointing at the "main" revision of the dataset repo on huggingface.co.
# NOTE(review): presumably a file path inside the repo is appended to this to
# download a raw file — confirm against the code that uses it.
HF_BASE = f"https://huggingface.co/datasets/{REPO_ID}/resolve/main/"
# Directories under /workspace/shared/amazon-reviews.
# NOTE(review): judging by the names, SHARD_DIR holds per-shard intermediate
# output and FINAL_DIR holds the reduced results — confirm against the writers.
SHARD_DIR = Path("/workspace/shared/amazon-reviews/shards")
FINAL_DIR = Path("/workspace/shared/amazon-reviews/final")
TOP_K_PER_SHARD = 200
Step 1: Plan byte ranges
Step 2: Stream records safely
Step 3: Score one chunk
Step 4: Run both scoring passes
Step 5: Reduce into site artifacts
What's the point?
Last updated