
Map geotagged Flickr photos
Dataset: YFCC100M metadata shards
import gzip
import json
import os
from collections import Counter, defaultdict
from pathlib import Path
import requests
from huggingface_hub import hf_hub_url
import reverse_geocoder as rg
from burla import remote_parallel_map
# Repository id of the dataset (presumably what gets handed to hf_hub_url
# when shards are fetched — confirm against the download step).
REPO_ID = "dalle-mini/YFCC100M_OpenAI_subset"

# Both working directories sit under the same shared root, so derive them
# from one place instead of repeating the prefix.
_WPI_ROOT = Path("/workspace/shared/wpi")
SHARD_DIR = _WPI_ROOT / "shards"
FINAL_DIR = _WPI_ROOT / "final"
SHARD_IDS = [f"{i:05d}" for i in range(96)]

Step 1: Process one metadata shard per worker
Step 2: Run the shard workers
Step 3: Reduce counters
What's the point?
Last updated