dirty scan speed testing

2025-09-08 23:28:44 +00:00 · 2025-05-18 23:53:50 -04:00
parent e9b9606c9a
commit 7b596ab17a
12 changed files with 234 additions and 24 deletions
--- a/bench/dirty-vs-full.md
+++ b/bench/dirty-vs-full.md
@@ -0,0 +1,4 @@
+| Command | Mean [ms] | Min [ms] | Max [ms] | Relative |
+|:---|---:|---:|---:|---:|
+| `full-scan` | 427.0 ± 30.5 | 402.2 | 467.4 | 6.36 ± 0.49 |
+| `dirty-scan` | 67.2 ± 2.1 | 64.7 | 71.6 | 1.00 |
--- a/bench/dirty-vs-full.sh
+++ b/bench/dirty-vs-full.sh
@@ -0,0 +1,91 @@
+#!/usr/bin/env bash
+#
+# bench/dirty-vs-full.sh
+#
+# Compare full-scan vs dirty-scan performance on a large corpus,
+# simulating a random set of file modifications before each dirty scan,
+# and reporting corpus size, number of dirty files, and speedup.
+#
+
+set -euo pipefail
+IFS=$'\n\t'
+
+# marlin executable under test (override if you build it somewhere else)
+: "${MARLIN_BIN:=target/release/marlin}"
+
+# Root directory of the test corpus (intended for 100k+ files)
+: "${CORPUS_DIR:=bench/corpus}"
+
+# Location of the throw-away DB
+: "${DB_PATH:=bench/index.db}"
+
+# Number of files flagged dirty ahead of every dirty-scan run
+: "${DIRTY_COUNT:=100}"
+
+# Number of warm-up runs handed to hyperfine
+: "${WARMUPS:=3}"
+
+# Point Marlin at the DB file chosen above
+export MARLIN_DB_PATH="$DB_PATH"
+
+# Ensure the external tools are installed: hyperfine drives the benchmark,
+# sqlite3 seeds the dirty rows inside the --prepare step
+for tool in hyperfine sqlite3; do
+  if ! command -v "$tool" &>/dev/null; then
+    echo "Error: $tool not found. Please install it and try again." >&2
+    exit 1
+  fi
+done
+
+# Ensure our corpus exists
+[[ -d "$CORPUS_DIR" ]] || { echo "Error: corpus directory '$CORPUS_DIR' not found." >&2; exit 1; }
+
+# Count corpus size (tr strips the padding some wc implementations emit)
+CORPUS_SIZE=$(find "$CORPUS_DIR" -type f | wc -l | tr -d ' ')
+echo "→ Corpus size: $CORPUS_SIZE files"
+echo "→ Will mark $DIRTY_COUNT files dirty per dirty‐scan run"
+echo
+
+# Clean up any old database
+rm -f -- "$DB_PATH"
+
+# Populate the DB once up front; output is discarded, so report a failure explicitly
+echo "→ Initial full scan to populate DB"
+"$MARLIN_BIN" scan "$CORPUS_DIR" >/dev/null 2>&1 || { rc=$?; echo "Error: initial scan failed (exit $rc): $MARLIN_BIN scan $CORPUS_DIR" >&2; exit "$rc"; }
+
+echo
+echo "→ Benchmarking full vs dirty scan with hyperfine"
+hyperfine \
+  --warmup "$WARMUPS" \
+  --prepare "
+    # runs before every timed run: wipe and re-populate, failing fast (set -e)
+    set -e
+    rm -f '$DB_PATH'
+    mkdir -p bench '$(dirname -- "$DB_PATH")'
+    export MARLIN_DB_PATH='$DB_PATH'
+    '$MARLIN_BIN' scan '$CORPUS_DIR' >/dev/null 2>&1
+    # seed $DIRTY_COUNT random files as 'dirty' in the DB
+    sqlite3 '$DB_PATH' \"INSERT OR IGNORE INTO file_changes(file_id, marked_at)
+      SELECT id, strftime('%s','now') FROM files
+      ORDER BY RANDOM()
+      LIMIT $DIRTY_COUNT;\"
+  " \
+  --command-name "full-scan"  "MARLIN_DB_PATH='$DB_PATH' '$MARLIN_BIN' scan       '$CORPUS_DIR' >/dev/null 2>&1" \
+  --command-name "dirty-scan" "MARLIN_DB_PATH='$DB_PATH' '$MARLIN_BIN' scan --dirty '$CORPUS_DIR' >/dev/null 2>&1" \
+  --export-markdown bench/dirty-vs-full.md
+
+echo
+echo "Results written to bench/dirty-vs-full.md"
+
+# Extract the speedup factor from the markdown table:
+#   splitting a row on '|' yields $1="" (before the leading pipe), $2=Command,
+#   $3=Mean, $4=Min, $5=Max, $6=Relative. Relative on the full-scan row is how
+#   many times slower full-scan ran than dirty-scan (the 1.00 baseline).
+SPEEDUP=$(awk -F'|' 'index($2, "`full-scan`") { gsub(/^ +| +$/, "", $6); print $6 }' \
+         bench/dirty-vs-full.md)
+
+echo
+echo "→ Summary:"
+echo "   Corpus size:        $CORPUS_SIZE files"
+echo "   Dirty files seeded: $DIRTY_COUNT"
+echo "   Dirty‐scan speedup: dirty-scan ran $SPEEDUP times faster than full-scan"
--- a/bench/gen-corpus.sh
+++ b/bench/gen-corpus.sh
@@ -0,0 +1,30 @@
+#!/usr/bin/env bash
+#
+# bench/gen-corpus.sh
+#
+# Generate a synthetic corpus of COUNT files in nested directories.
+# Defaults to 100 000 files; set COUNT lower for a quick, laptop-friendly run.
+#
+
+set -euo pipefail
+IFS=$'\n\t'
+
+# How many files? (default: 100 000)
+COUNT=${COUNT:-100000}
+# Where to put them
+TARGET=${TARGET:-bench/corpus}
+
+# Wipe any old corpus
+rm -rf -- "$TARGET"
+mkdir -p -- "$TARGET"
+
+echo "🚀 Generating $COUNT files under $TARGET…"
+per_dir=$(( COUNT / 100 + 1 ))  # bucket size: at most 100 sub-dirs so walkdir has some structure
+for (( i = 1; i <= COUNT; i++ )); do
+  # printf -v builds the bucket path without forking a subshell per file
+  printf -v subdir '%s/dir%03d' "$TARGET" "$(( (i - 1) / per_dir ))"
+  (( (i - 1) % per_dir )) || mkdir -p -- "$subdir"  # first file of a bucket creates it
+  echo "This is file #$i" > "$subdir/file_$i.txt"
+done
+
+echo "✅ Done: $(find "$TARGET" -type f | wc -l | tr -d ' ') files created."