Process one giant file quickly.
A guide to splitting one large file into chunk files and then processing those chunks in parallel.
Before you start, make sure the input file exists and that you have write access to the output directory.
Step 1: Split one big file into chunk files
from pathlib import Path
def create_chunk_files(
    input_file_path="/workspace/shared/giant/input.txt",
    output_directory_path="/workspace/shared/giant/chunks",
    lines_per_chunk=50000,
):
    """Split one large text file into numbered chunk files.

    Streams the input line by line instead of loading the whole file into
    memory at once (the previous `read_text().splitlines()` approach), so
    memory use is bounded by `lines_per_chunk` regardless of file size.

    Args:
        input_file_path: Path of the large text file to split.
        output_directory_path: Directory that receives the chunk files;
            created (with parents) if missing.
        lines_per_chunk: Maximum number of lines written to each chunk.
            Must be at least 1.

    Returns:
        List of chunk file paths (as strings) in creation order, named
        ``chunk-0.txt``, ``chunk-1.txt``, ... Empty list for an empty input.

    Raises:
        ValueError: If ``lines_per_chunk`` is less than 1.
    """
    if lines_per_chunk < 1:
        # The original range(..., step=0) raised ValueError for 0; make the
        # contract explicit and cover negatives too.
        raise ValueError("lines_per_chunk must be >= 1")

    output_directory = Path(output_directory_path)
    output_directory.mkdir(parents=True, exist_ok=True)

    chunk_paths = []
    buffered_lines = []

    def _flush_buffer():
        # Write the buffered lines as the next numbered chunk and record it.
        chunk_path = output_directory / f"chunk-{len(chunk_paths)}.txt"
        chunk_path.write_text("".join(buffered_lines))
        chunk_paths.append(str(chunk_path))
        buffered_lines.clear()

    # Iterating the handle yields one line at a time (universal newlines),
    # so only up to lines_per_chunk lines are held in memory.
    with Path(input_file_path).open() as source:
        for line in source:
            # Ensure every stored line (including a final line with no
            # newline) ends with "\n", matching the original
            # "\n".join(...) + "\n" chunk format byte-for-byte.
            buffered_lines.append(line if line.endswith("\n") else line + "\n")
            if len(buffered_lines) == lines_per_chunk:
                _flush_buffer()
        if buffered_lines:
            _flush_buffer()
    return chunk_paths
# Run the split with the default paths; the result is the ordered list of
# chunk files created under /workspace/shared/giant/chunks.
chunk_file_paths = create_chunk_files()