
Clustering 2.7M arXiv abstracts
Dataset: arXiv metadata JSONL
import json
from pathlib import Path
import numpy as np
import pyarrow as pa
import pyarrow.parquet as pq
from burla import remote_parallel_map
from sentence_transformers import SentenceTransformer
# Input file: the arXiv metadata snapshot (JSONL, per the dataset note).
RAW_JSONL = Path("/workspace/shared/arxiv/arxiv-metadata-oai-snapshot.json")
# Working directories under the shared arXiv folder.
# NOTE(review): presumably raw/ holds the sharded metadata, vectors/ the
# per-shard embeddings, and final/ the reduced output -- confirm against
# the step code that uses them.
RAW_DIR = Path("/workspace/shared/arxiv/raw")
VEC_DIR = Path("/workspace/shared/arxiv/vectors")
FINAL_DIR = Path("/workspace/shared/arxiv/final")
# NOTE(review): by its name, the number of papers written per shard -- confirm.
PAPERS_PER_SHARD = 10_000
# NOTE(review): presumably the batch size passed to the embedding model -- confirm.
EMBED_BATCH = 128
MODEL_NAME = "BAAI/bge-small-en-v1.5"

Step 1: Shard the metadata
Step 2: Embed each shard
Step 3: Reduce the whole corpus
What's the point?
Last updated