
Summarize 1M GitHub READMEs
Dataset: README Parquet export
import heapq
import json
from collections import Counter, defaultdict
from pathlib import Path
import pandas as pd
import pyarrow.dataset as ds
from burla import remote_parallel_map
# Filesystem locations and sharding configuration for the README summary job.
# All three paths live under the same /workspace/shared/grs root.

# Path to the README Parquet export that serves as the input dataset.
PARQUET_PATH = "/workspace/shared/grs/readmes.parquet"
# Directory for shard-level files — presumably one output per shard; the
# code that writes here is not shown, so confirm against the shard step.
SHARD_DIR = Path("/workspace/shared/grs/shards")
# Directory for the final outputs — presumably written by the reduce step;
# confirm against the code that uses it.
FINAL_DIR = Path("/workspace/shared/grs/final")
# Presumably the number of shards the dataset is split into for parallel
# processing (TODO confirm where this is consumed).
N_SHARDS = 600
CATEGORIES = {
"ml": {"tensorflow": 4, "pytorch": 4, "embedding": 2, "llm": 4},
"web": {"react": 3, "django": 2, "graphql": 3, "frontend": 2},
"devops": {"docker": 3, "kubernetes": 4, "terraform": 4},
}

Step 1: Score one README
Step 2: Summarize one shard
Step 3: Run the shards
Step 4: Reduce counters and examples
What's the point?
Last updated