
Embed 50K Wikipedia articles
Dataset: English Wikipedia
import json
import math
import os
from itertools import islice
from pathlib import Path
import numpy as np
from burla import remote_parallel_map
from datasets import load_dataset
# Workload sizing: the total number of articles and how many text shards
# they are divided into.
ARTICLE_COUNT: int = 50_000
TEXT_SHARDS: int = 50
# Round up so that TEXT_SHARDS shards always cover ARTICLE_COUNT articles,
# even when the count does not divide evenly.
ARTICLES_PER_SHARD: int = math.ceil(ARTICLE_COUNT / TEXT_SHARDS)

# Identifiers for the embedding model and the image used for the GPU step.
MODEL_NAME: str = "BAAI/bge-large-en-v1.5"
GPU_IMAGE: str = "jakezuliani/burla-embedder:latest"

# Root directory for this demo's files.
# NOTE(review): presumably a filesystem shared between workers - confirm.
SHARED_ROOT: Path = Path("/workspace/shared/vector_embeddings_demo")
MAX_GPU_PARALLELISM = int(os.environ.get("DEMO_MAX_GPU_PARALLELISM", 8))
Step 1: Use a CUDA image
Step 2: Prepare text shards on CPU workers
Step 3: Embed each shard on A100s
Step 4: Search the vectors
What's the point?
Last updated