mirror of
https://github.com/PR0M3TH3AN/Marlin.git
synced 2025-09-08 23:28:44 +00:00
dirty scan speed testing
This commit is contained in:
4
bench/dirty-vs-full.md
Normal file
4
bench/dirty-vs-full.md
Normal file
@@ -0,0 +1,4 @@
|
||||
| Command | Mean [ms] | Min [ms] | Max [ms] | Relative |
|
||||
|:---|---:|---:|---:|---:|
|
||||
| `full-scan` | 427.0 ± 30.5 | 402.2 | 467.4 | 6.36 ± 0.49 |
|
||||
| `dirty-scan` | 67.2 ± 2.1 | 64.7 | 71.6 | 1.00 |
|
91
bench/dirty-vs-full.sh
Executable file
91
bench/dirty-vs-full.sh
Executable file
@@ -0,0 +1,91 @@
|
||||
#!/usr/bin/env bash
|
||||
#
|
||||
# bench/dirty-vs-full.sh
|
||||
#
|
||||
# Compare full-scan vs dirty-scan performance on a large corpus,
|
||||
# simulating a random set of file modifications before each dirty scan,
|
||||
# and reporting corpus size, number of dirty files, and speedup.
|
||||
#
|
||||
|
||||
# Strict mode: abort on errors, on unset variables and on a failing pipeline
# stage; restrict word splitting to newlines and tabs.
set -o errexit -o nounset -o pipefail
IFS=$'\n\t'

# Tunables - each one may be overridden from the environment:
#   MARLIN_BIN   path to the marlin binary (adjust if you build elsewhere)
#   CORPUS_DIR   directory containing the test corpus (100k+ files)
#   DB_PATH      where to put the ephemeral DB
#   DIRTY_COUNT  how many files to mark dirty before each dirty-scan run
#   WARMUPS      number of warm-up runs
: "${MARLIN_BIN:=target/release/marlin}"
: "${CORPUS_DIR:=bench/corpus}"
: "${DB_PATH:=bench/index.db}"
: "${DIRTY_COUNT:=100}"
: "${WARMUPS:=3}"

# Marlin reads the location of its DB from MARLIN_DB_PATH.
MARLIN_DB_PATH=$DB_PATH
export MARLIN_DB_PATH
|
||||
|
||||
# Pre-flight checks: fail early, with a clear message, instead of half-way
# through the benchmark.

# hyperfine drives the benchmark; sqlite3 is invoked by the --prepare step
# to seed the rows that mark files as dirty.
for required_tool in hyperfine sqlite3; do
  if ! command -v "$required_tool" &>/dev/null; then
    echo "Error: $required_tool not found. Please install it and try again." >&2
    exit 1
  fi
done

# Every marlin invocation in this script discards its output, so a missing
# or non-executable binary would otherwise make the script exit without any
# explanation. `command -v` accepts both a path and a name found on PATH.
if ! command -v "$MARLIN_BIN" &>/dev/null; then
  echo "Error: marlin binary '$MARLIN_BIN' not found or not executable." >&2
  exit 1
fi

# Ensure our corpus exists
if [[ ! -d "$CORPUS_DIR" ]]; then
  echo "Error: corpus directory '$CORPUS_DIR' not found." >&2
  exit 1
fi
|
||||
|
||||
# Number of regular files in the corpus. Some wc implementations pad the
# count with spaces, so those are stripped before the value is printed.
CORPUS_SIZE=$(find "$CORPUS_DIR" -type f | wc -l)
CORPUS_SIZE=${CORPUS_SIZE// /}

# Banner describing the run, followed by one blank line.
printf '%s\n' \
  "→ Corpus size: $CORPUS_SIZE files" \
  "→ Will mark $DIRTY_COUNT files dirty per dirty‐scan run" \
  ""
|
||||
|
||||
# Clean up any old database and make sure the directory that will hold the
# new one exists (harmless if marlin would have created it itself).
rm -f -- "$DB_PATH"
mkdir -p -- "$(dirname "$DB_PATH")"

# First, populate the DB once so that dirty-scan has something to do.
# marlin's output is discarded, so report a failure explicitly rather than
# letting `set -e` end the script without a word.
echo "→ Initial full scan to populate DB"
if ! "$MARLIN_BIN" scan "$CORPUS_DIR" >/dev/null 2>&1; then
  echo "Error: initial scan failed: '$MARLIN_BIN' scan '$CORPUS_DIR'" >&2
  exit 1
fi
|
||||
|
||||
echo
echo "→ Benchmarking full vs dirty scan with hyperfine"

# Directory that has to exist before the DB can be re-created. Derived from
# DB_PATH so that overriding DB_PATH keeps working.
db_dir=$(dirname "$DB_PATH")

# The strings below are handed to hyperfine, which runs them through a shell.
# Every path is therefore wrapped in single quotes (paths that themselves
# contain a single quote are not supported). The binary is quoted as well,
# matching the direct "$MARLIN_BIN" invocation used for the initial scan.
#
# prepare_cmd runs before every timed run of both commands: it rebuilds the
# DB from scratch and then marks DIRTY_COUNT random files as dirty.
# `set -e` makes a failed re-population abort the benchmark instead of
# timing scans against a broken DB.
prepare_cmd="
  set -e
  # wipe and re-populate
  rm -f '$DB_PATH'
  mkdir -p '$db_dir'
  export MARLIN_DB_PATH='$DB_PATH'
  '$MARLIN_BIN' scan '$CORPUS_DIR' >/dev/null 2>&1

  # seed $DIRTY_COUNT random files as 'dirty' in the DB
  sqlite3 '$DB_PATH' \"INSERT OR IGNORE INTO file_changes(file_id, marked_at)
    SELECT id, strftime('%s','now') FROM files
    ORDER BY RANDOM()
    LIMIT $DIRTY_COUNT;\"
"

full_scan_cmd="MARLIN_DB_PATH='$DB_PATH' '$MARLIN_BIN' scan '$CORPUS_DIR' >/dev/null 2>&1"
dirty_scan_cmd="MARLIN_DB_PATH='$DB_PATH' '$MARLIN_BIN' scan --dirty '$CORPUS_DIR' >/dev/null 2>&1"

hyperfine \
  --warmup "$WARMUPS" \
  --prepare "$prepare_cmd" \
  --command-name "full-scan" "$full_scan_cmd" \
  --command-name "dirty-scan" "$dirty_scan_cmd" \
  --export-markdown bench/dirty-vs-full.md

echo
echo "Results written to bench/dirty-vs-full.md"
|
||||
|
||||
# Extract the speedup factor from the markdown table:
# the "Relative" column on the full-scan row tells us how many times
# slower full-scan is relative to dirty-scan (baseline = 1.00).
#
# A table row looks like
#   | `full-scan` | mean | min | max | relative |
# When it is split on '|', field 1 is the empty text before the leading
# pipe, so the command name is field 2 and "Relative" is field 6
# (field 5 is "Max"). The row is located with a literal substring test on
# the command cell, which avoids regex escaping of the backticks.
#
# Arguments: $1 - path to the markdown file written by hyperfine
# Outputs:   the whitespace-trimmed "Relative" cell on stdout
#            (nothing when the table has no full-scan row)
extract_speedup() {
  awk -F'|' '
    index($2, "`full-scan`") {
      cell = $6
      gsub(/^[ \t]+|[ \t]+$/, "", cell)
      print cell
      exit
    }
  ' "$1"
}

# The benchmark results are already on disk at this point, so a table that
# cannot be parsed only degrades the summary instead of aborting the script.
SPEEDUP=$(extract_speedup bench/dirty-vs-full.md) || SPEEDUP=""
if [[ -z "$SPEEDUP" ]]; then
  echo "Warning: could not read the full-scan speedup from bench/dirty-vs-full.md" >&2
  SPEEDUP="?"
fi
|
||||
|
||||
# Final report: a blank separator line, then corpus size, number of seeded
# dirty files and the measured speedup, one item per line.
printf '%s\n' \
  "" \
  "→ Summary:" \
  " Corpus size: $CORPUS_SIZE files" \
  " Dirty files seeded: $DIRTY_COUNT" \
  " Dirty‐scan speedup: dirty-scan ran $SPEEDUP times faster than full-scan"
|
30
bench/gen-corpus.sh
Executable file
30
bench/gen-corpus.sh
Executable file
@@ -0,0 +1,30 @@
|
||||
#!/usr/bin/env bash
|
||||
#
|
||||
# bench/gen-corpus.sh
|
||||
#
|
||||
# Generate a synthetic corpus of N files in nested directories.
|
||||
# Defaults to 100 000 files; set COUNT to a smaller value (e.g. COUNT=1000)
# to keep the run laptop-friendly.
|
||||
#
|
||||
|
||||
# Strict mode: abort on errors, on unset variables and on a failing pipeline
# stage; restrict word splitting to newlines and tabs.
set -o errexit -o nounset -o pipefail
IFS=$'\n\t'

# Tunables - both may be overridden from the environment:
#   COUNT   how many files to generate (default: 100 000)
#   TARGET  directory the corpus is written to
: "${COUNT:=100000}"
: "${TARGET:=bench/corpus}"
|
||||
|
||||
# COUNT feeds shell arithmetic below, so insist on a plain decimal integer
# (a leading zero would otherwise be read as octal).
count_pattern='^(0|[1-9][0-9]*)$'
if [[ ! "$COUNT" =~ $count_pattern ]]; then
  echo "Error: COUNT must be a non-negative integer, got '$COUNT'." >&2
  exit 1
fi

# Wipe any old corpus. ${TARGET:?} aborts if TARGET is empty or unset, so
# the recursive delete can never run against an empty path.
rm -rf -- "${TARGET:?}"
mkdir -p -- "$TARGET"

echo "🚀 Generating $COUNT files under $TARGET…"

# Bucket into (at most) 100 sub-dirs so walkdir has some structure.
# Consecutive file numbers share a bucket, so a directory only has to be
# created when the bucket index changes - one mkdir per directory instead
# of one per file.
bucket_size=$(( COUNT / 100 + 1 ))
current_bucket=-1
subdir=$TARGET
for (( i = 1; i <= COUNT; i++ )); do
  dir_index=$(( (i - 1) / bucket_size ))
  if (( dir_index != current_bucket )); then
    printf -v subdir '%s/dir%03d' "$TARGET" "$dir_index"
    mkdir -p -- "$subdir"
    current_bucket=$dir_index
  fi
  echo "This is file #$i" > "$subdir/file_$i.txt"
done

# Some wc implementations pad the count with spaces; strip them for output.
created_total=$(find "$TARGET" -type f | wc -l | tr -d ' ')
echo "✅ Done: $created_total files created."
|
Reference in New Issue
Block a user