#!/usr/bin/env bash
#
# bench/dirty-vs-full.sh
#
# Compare full-scan vs dirty-scan performance on a large corpus,
# simulating a random set of file modifications before each dirty scan,
# and reporting corpus size, number of dirty files, and speedup.
#

# Strict mode: abort on command failure, unset variables, and failed
# pipeline stages.
set -euo pipefail
# Split unquoted expansions on newlines and tabs only, never on spaces.
IFS=$'\n\t'

#######################################
# Abort unless a setting is a non-negative decimal integer.
# Arguments: $1 - setting name (used in the error message)
#            $2 - value to check
# Outputs:   an error message on stderr when the value is rejected
# Returns:   0 when valid; exits the script with status 1 otherwise
#######################################
require_uint() {
  local name=$1 value=$2
  if [[ ! "$value" =~ ^[0-9]+$ ]]; then
    echo "Error: $name must be a non-negative integer (got '$value')." >&2
    exit 1
  fi
}

# Path to the marlin binary (adjust if you build elsewhere)
MARLIN_BIN=${MARLIN_BIN:-target/release/marlin}

# Directory containing your test corpus (100k+ files)
CORPUS_DIR=${CORPUS_DIR:-bench/corpus}

# Where to put the ephemeral DB
DB_PATH=${DB_PATH:-bench/index.db}

# How many files to mark dirty before each dirty-scan run
DIRTY_COUNT=${DIRTY_COUNT:-100}

# Number of warm-up runs
WARMUPS=${WARMUPS:-3}

# DIRTY_COUNT is spliced into an SQL LIMIT clause and WARMUPS is handed to
# hyperfine, so reject anything that is not a plain number up front.
require_uint DIRTY_COUNT "$DIRTY_COUNT"
require_uint WARMUPS "$WARMUPS"

# Tell Marlin where to write its DB
export MARLIN_DB_PATH="$DB_PATH"

# Ensure the external tools are installed: hyperfine drives the benchmark and
# sqlite3 is used by its prepare step to seed the dirty-file table.
for tool in hyperfine sqlite3; do
  if ! command -v "$tool" >/dev/null 2>&1; then
    echo "Error: $tool not found. Please install it and try again." >&2
    exit 1
  fi
done

# Ensure the marlin binary can be run. Every later invocation discards its
# output, so a missing binary would otherwise abort the script silently.
if ! command -v "$MARLIN_BIN" >/dev/null 2>&1; then
  echo "Error: marlin binary '$MARLIN_BIN' not found or not executable." >&2
  exit 1
fi

# Ensure our corpus exists
if [[ ! -d "$CORPUS_DIR" ]]; then
  echo "Error: corpus directory '$CORPUS_DIR' not found." >&2
  exit 1
fi

# Count corpus size (tr strips any space padding wc puts around the number)
CORPUS_SIZE=$(find "$CORPUS_DIR" -type f | wc -l | tr -d ' ')
echo "→ Corpus size: $CORPUS_SIZE files"
echo "→ Will mark $DIRTY_COUNT files dirty per dirty‐scan run"
echo

# Start from a clean slate: drop any old database and make sure the
# directory that will hold the new one exists.
rm -f -- "$DB_PATH"
mkdir -p -- "$(dirname -- "$DB_PATH")"

# First, populate the DB once so that dirty-scan has something to do.
# The scan's own output is discarded, so a failure is reported explicitly
# instead of letting `set -e` end the script without any message.
echo "→ Initial full scan to populate DB"
if ! "$MARLIN_BIN" scan "$CORPUS_DIR" >/dev/null 2>&1; then
  echo "Error: initial scan of '$CORPUS_DIR' with '$MARLIN_BIN' failed." >&2
  exit 1
fi

#######################################
# Wrap a string in single quotes so a POSIX shell reads it back verbatim.
# Arguments: $1 - raw string
# Outputs:   the quoted string on stdout (no trailing newline)
#######################################
sh_quote() {
  local raw=$1
  local sq="'"
  local esc="'\\''" # close the quote, emit an escaped quote, reopen
  printf "'%s'" "${raw//$sq/$esc}"
}

# hyperfine takes each command as a single string containing shell syntax
# (redirections, env assignments), so every path is quoted once here and the
# quoted form is spliced into those strings. This keeps paths with spaces or
# quotes intact.
q_bin=$(sh_quote "$MARLIN_BIN")
q_db=$(sh_quote "$DB_PATH")
q_db_dir=$(sh_quote "$(dirname -- "$DB_PATH")")
q_corpus=$(sh_quote "$CORPUS_DIR")
report_md=bench/dirty-vs-full.md

# Preparation command: wipe and re-populate the DB, then seed $DIRTY_COUNT
# random files as 'dirty'. The steps are chained with && so a failed step
# fails the preparation instead of letting a run be timed against a stale or
# half-built DB.
prepare_cmd="rm -f -- $q_db &&
  mkdir -p -- $q_db_dir &&
  MARLIN_DB_PATH=$q_db $q_bin scan $q_corpus >/dev/null 2>&1 &&
  sqlite3 $q_db \"INSERT OR IGNORE INTO file_changes(file_id, marked_at)
    SELECT id, strftime('%s','now') FROM files
    ORDER BY RANDOM()
    LIMIT $DIRTY_COUNT;\""

# The report directory must exist before hyperfine tries to write into it.
mkdir -p -- "$(dirname -- "$report_md")"

echo
echo "→ Benchmarking full vs dirty scan with hyperfine"
hyperfine \
  --warmup "$WARMUPS" \
  --prepare "$prepare_cmd" \
  --command-name "full-scan" "MARLIN_DB_PATH=$q_db $q_bin scan $q_corpus >/dev/null 2>&1" \
  --command-name "dirty-scan" "MARLIN_DB_PATH=$q_db $q_bin scan --dirty $q_corpus >/dev/null 2>&1" \
  --export-markdown "$report_md"

echo
echo "Results written to $report_md"

#######################################
# Print the "Relative" cell of one command's row in a hyperfine markdown
# report.
#
# A row looks like
#   | `name` | mean | min | max | relative |
# and splitting on '|' yields an empty first field (before the leading pipe)
# and an empty last one, so the command is field 2 and "Relative" is the
# second-to-last field.
#
# Arguments: $1 - path to the markdown report
#            $2 - command name as passed to --command-name
# Outputs:   the trimmed cell on stdout; nothing when no row matches
#######################################
relative_of() {
  # The name is matched as a fixed string with index(): in GNU regex syntax a
  # backslash-backtick is an anchor, not a literal backtick.
  awk -F'|' -v name="$2" '
    index($2, "`" name "`") {
      cell = $(NF - 1)
      gsub(/^[ \t]+|[ \t]+$/, "", cell)
      print cell
      exit
    }
  ' "$1"
}

# The "Relative" value on the full-scan row tells us how many times slower
# full-scan is than the fastest command (expected to be dirty-scan, whose own
# value is then 1.00). The benchmark has already finished at this point, so a
# report that cannot be parsed only degrades the summary instead of failing
# the whole run.
SPEEDUP=$(relative_of bench/dirty-vs-full.md full-scan) || SPEEDUP=""

echo
echo "→ Summary:"
echo "   Corpus size:        $CORPUS_SIZE files"
echo "   Dirty files seeded: $DIRTY_COUNT"
if [[ -n "$SPEEDUP" ]]; then
  echo "   Dirty‐scan speedup: dirty-scan ran $SPEEDUP times faster than full-scan"
else
  echo "   Dirty‐scan speedup: unavailable (could not parse bench/dirty-vs-full.md)"
fi