
Scrape 1M web pages
Dataset: URL archive
import json
import random
import time
from pathlib import Path
import httpx
from burla import remote_parallel_map
from selectolax.parser import HTMLParser
# Path of the JSON Lines file for scraped pages (presumably the crawl's output — TODO confirm against the writer code).
OUT_PATH = Path("/workspace/shared/web-scrape/pages.jsonl")
# NOTE(review): presumably the number of URLs grouped into one chunk of work — confirm where CHUNK is used.
CHUNK = 500
MAX_PARALLELISM = 1_000

Step 1: Chunk URLs
Step 2: Fetch and parse inside the worker
Step 3: Smoke test a chunk
Step 4: Stream the crawl
What's the point?
Last updated