Files
Marlin/bench/dirty-vs-full.sh
2025-05-18 23:53:50 -04:00

92 lines
2.8 KiB
Bash
Executable File
Raw Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

#!/usr/bin/env bash
#
# bench/dirty-vs-full.sh
#
# Compare full-scan vs dirty-scan performance on a large corpus,
# simulating a random set of file modifications before each dirty scan,
# and reporting corpus size, number of dirty files, and speedup.
#
# Strict mode: abort on command failure, on use of an unset variable, and
# on failure of any stage of a pipeline. Word splitting is restricted to
# newline and tab.
set -o errexit -o nounset -o pipefail
IFS=$'\n\t'

# Tunables. Each one keeps a value already present in the environment and
# falls back to the default shown here when it is unset or empty.

# marlin executable to benchmark (adjust if you build elsewhere).
: "${MARLIN_BIN:=target/release/marlin}"
# Root directory of the test corpus (100k+ files).
: "${CORPUS_DIR:=bench/corpus}"
# Location of the throw-away index database.
: "${DB_PATH:=bench/index.db}"
# Number of files marked dirty ahead of each dirty-scan run.
: "${DIRTY_COUNT:=100}"
# Number of warmup runs handed to hyperfine.
: "${WARMUPS:=3}"

# Marlin reads the database location from MARLIN_DB_PATH, so publish it to
# child processes.
MARLIN_DB_PATH=$DB_PATH
export MARLIN_DB_PATH
# Pre-flight checks: fail fast, before any expensive scan starts, when a
# required tool, setting or input is missing.
#
# hyperfine drives the benchmark and sqlite3 is invoked from the --prepare
# step to seed the dirty rows, so both have to be on PATH.
for tool in hyperfine sqlite3; do
  if ! command -v "$tool" &>/dev/null; then
    echo "Error: $tool not found. Please install it and try again." >&2
    exit 1
  fi
done
# `command -v` accepts a bare command name (looked up on PATH) as well as a
# path such as target/release/marlin, which must exist and be executable.
if ! command -v "$MARLIN_BIN" &>/dev/null; then
  echo "Error: marlin binary '$MARLIN_BIN' not found or not executable." >&2
  exit 1
fi
# DIRTY_COUNT is spliced into a SQL LIMIT clause and WARMUPS is passed to
# hyperfine --warmup; reject anything that is not a plain non-negative
# integer instead of letting it reach SQL or the hyperfine command line.
for setting in DIRTY_COUNT WARMUPS; do
  if [[ ! "${!setting}" =~ ^[0-9]+$ ]]; then
    echo "Error: $setting must be a non-negative integer (got '${!setting}')." >&2
    exit 1
  fi
done
# Ensure our corpus exists
if [[ ! -d "$CORPUS_DIR" ]]; then
  echo "Error: corpus directory '$CORPUS_DIR' not found." >&2
  exit 1
fi
# Count corpus size: every regular file at any depth below CORPUS_DIR.
# `tr` strips the padding some `wc` implementations put around the number.
CORPUS_SIZE=$(find "$CORPUS_DIR" -type f | wc -l | tr -d ' ')
echo "→ Corpus size: $CORPUS_SIZE files"
echo "→ Will mark $DIRTY_COUNT files dirty per dirtyscan run"
echo
# Clean up any old database, and make sure the directory it lives in exists
# so a custom DB_PATH outside bench/ works too.
rm -f -- "$DB_PATH"
mkdir -p -- "$(dirname -- "$DB_PATH")"
# First, populate the DB once so that dirty-scan has something to do.
# The scan's own output is discarded, so report a failure explicitly;
# otherwise `set -e` would end the script without any explanation.
echo "→ Initial full scan to populate DB"
if ! "$MARLIN_BIN" scan "$CORPUS_DIR" >/dev/null 2>&1; then
  echo "Error: initial scan failed. Re-run '$MARLIN_BIN scan $CORPUS_DIR' by hand to see why." >&2
  exit 1
fi
echo
echo "→ Benchmarking full vs dirty scan with hyperfine"

# sh_quote STRING
# Print STRING as one single-quoted shell word, with every embedded single
# quote rewritten as '\'' so that a POSIX shell reads back exactly STRING.
# hyperfine receives its commands as strings that a shell parses again, so
# every path spliced into them has to survive that second round of parsing
# (spaces, quotes, $ and so on).
sh_quote() {
  local quoted=${1//\'/\'\\\'\'}
  printf "'%s'" "$quoted"
}

# Pre-quoted forms of everything interpolated into the command strings.
# MARLIN_BIN is treated as a single word, matching how the initial scan
# invokes it ("$MARLIN_BIN" scan ...).
marlin_q=$(sh_quote "$MARLIN_BIN")
corpus_q=$(sh_quote "$CORPUS_DIR")
db_q=$(sh_quote "$DB_PATH")
db_dir_q=$(sh_quote "$(dirname -- "$DB_PATH")")

# Preparation step passed to hyperfine --prepare:
#   1. wipe the database and rebuild it with a full scan,
#   2. seed DIRTY_COUNT randomly chosen rows of `files` as dirty by
#      inserting them into `file_changes`.
# `set -e` makes a failed re-population fail the prepare step instead of
# being silently followed by the sqlite3 call.
# bench/ is created as well because the markdown report is exported there.
prepare_cmd="
set -e
rm -f -- $db_q
mkdir -p -- bench $db_dir_q
export MARLIN_DB_PATH=$db_q
$marlin_q scan $corpus_q >/dev/null 2>&1
sqlite3 $db_q \"INSERT OR IGNORE INTO file_changes(file_id, marked_at)
SELECT id, strftime('%s','now') FROM files
ORDER BY RANDOM()
LIMIT $DIRTY_COUNT;\"
"

hyperfine \
  --warmup "$WARMUPS" \
  --prepare "$prepare_cmd" \
  --command-name "full-scan" "MARLIN_DB_PATH=$db_q $marlin_q scan $corpus_q >/dev/null 2>&1" \
  --command-name "dirty-scan" "MARLIN_DB_PATH=$db_q $marlin_q scan --dirty $corpus_q >/dev/null 2>&1" \
  --export-markdown bench/dirty-vs-full.md
echo
echo "Results written to bench/dirty-vs-full.md"
# hyperfine_relative COMMAND_NAME MARKDOWN_FILE
# Print the "Relative" cell of the row for COMMAND_NAME in a hyperfine
# --export-markdown table. Returns non-zero when the file cannot be read or
# no such row exists.
#
# Rows look like:
#   | `name` | mean ± sd | min | max | relative |
# Splitting on '|' leaves an empty field 1 in front of the leading pipe, so
# the command is field 2 and Relative is field 6 (field 5 is Max).
# The command cell is compared literally, backticks included, rather than
# through a regex in which a backslash-backtick would not mean a backtick.
hyperfine_relative() {
  awk -F'|' -v name="$1" '
    {
      cmd = $2
      gsub(/^[ \t]+|[ \t]+$/, "", cmd)
      if (cmd == "`" name "`") {
        rel = $6
        gsub(/^[ \t]+|[ \t]+$/, "", rel)
        print rel
        found = 1
        exit
      }
    }
    END { exit !found }
  ' "$2"
}

# Extract the speedup factor from the markdown table:
# the "Relative" column on the full-scan row tells us how many times
# slower full-scan is relative to dirty-scan (baseline = 1.00).
# NOTE: this reading assumes dirty-scan was the faster command.
# The benchmark results are already on disk at this point, so a table that
# cannot be parsed only downgrades the summary instead of failing the run.
SPEEDUP=$(hyperfine_relative "full-scan" bench/dirty-vs-full.md) || {
  echo "Warning: could not read the full-scan row from bench/dirty-vs-full.md" >&2
  SPEEDUP=""
}
echo
echo "→ Summary:"
echo " Corpus size: $CORPUS_SIZE files"
echo " Dirty files seeded: $DIRTY_COUNT"
if [[ -n "$SPEEDUP" ]]; then
  echo " Dirtyscan speedup: dirty-scan ran $SPEEDUP times faster than full-scan"
else
  echo " Dirtyscan speedup: unavailable (see bench/dirty-vs-full.md)"
fi