#!/usr/bin/env bash
# Piper-TTS-Script/make_audiobook.sh
set -euo pipefail

# ============================================================
# make_audiobook.sh
# One-shot installer + large-text → MP3 audiobook using Piper
# Works on Linux Mint / Ubuntu (needs sudo for apt installs)
# ============================================================

# -----------------------
# CONFIG (edit as needed)
# -----------------------
# Piper voice (en_US "hfc_male", medium quality, by default).
# MODEL_NAME follows Piper's "<locale>-<voice>-<quality>" naming. The download
# URLs below are derived from it, so changing MODEL_NAME is enough to switch to
# another voice hosted in the rhasspy/piper-voices repository.
# To use a model you already downloaded, point MODEL_FILE at the local .onnx
# (or pass --model on the command line).
MODEL_NAME="en_US-hfc_male-medium"
MODEL_DIR="${HOME}/.local/share/piper/voices"
MODEL_FILE="${MODEL_DIR}/${MODEL_NAME}.onnx"
MODEL_JSON="${MODEL_FILE}.json"

# Root of the voice repository; voices live under <lang>/<locale>/<voice>/<quality>/.
PIPER_VOICES_BASE_URL="https://huggingface.co/rhasspy/piper-voices/resolve/main"

#######################################
# Print the download URL of the .onnx file for a Piper voice.
# Globals:   PIPER_VOICES_BASE_URL (read)
# Arguments: $1 - voice name in "<locale>-<voice>-<quality>" form,
#                 e.g. "en_US-hfc_male-medium"
# Outputs:   the URL on stdout
#######################################
piper_voice_url() {
  local name=$1
  local locale=${name%%-*}    # text before the first '-'  (en_US)
  local quality=${name##*-}   # text after the last '-'    (medium)
  local voice=${name#*-}      # drop the locale ...
  voice=${voice%-*}           # ... and the quality        (hfc_male)
  # The language directory is the locale without its region suffix (en_US -> en).
  printf '%s/%s/%s/%s/%s/%s.onnx\n' \
    "${PIPER_VOICES_BASE_URL}" "${locale%%_*}" "${locale}" "${voice}" "${quality}" "${name}"
}

MODEL_ONNX_URL="$(piper_voice_url "${MODEL_NAME}")"
MODEL_JSON_URL="${MODEL_ONNX_URL}.json"

# Piper synthesis knobs (tune to taste)
LENGTH_SCALE="1.0"   # >1.0 = slower, <1.0 = faster
NOISE_SCALE="0.60"   # lower = flatter; higher = more expressive
MAX_CHARS="3000"     # characters per chunk sent to Piper
SILENCE_MS="300"     # silence between chunks (ms)

# MP3 quality (VBR via -q:a: 0=best, 2~high, 5=medium)
MP3_Q="2"

# -----------------------
# END CONFIG
# -----------------------

# --- helpers ---
# say MESSAGE... — print a status line in bold green on stdout, preceded by a blank line.
say() {
  local -r green=$'\033[1;32m' reset=$'\033[0m'
  printf '\n%s%s%s\n' "${green}" "$*" "${reset}"
}
# warn MESSAGE... — print a warning in bold yellow, preceded by a blank line.
# Written to stderr (like die) so diagnostics never mix with data on stdout.
warn() {
  printf '\n\033[1;33m%s\033[0m\n' "$*" >&2
}
# die MESSAGE... — print "ERROR: MESSAGE" (label in bold red) on stderr and
# terminate the script with exit status 1.
die() {
  local -r red=$'\033[1;31m' reset=$'\033[0m'
  printf '\n%sERROR:%s %s\n' "${red}" "${reset}" "$*" >&2
  exit 1
}

# --- args ---
# With no arguments there is nothing to convert: print the usage text on
# stderr and stop with status 1.
(( $# >= 1 )) || {
  cat <<EOF >&2
Usage: $0 /path/to/mybook.txt [--model /path/to/voice.onnx] [--name OutputName.mp3]

Options:
  --model PATH    Use a specific Piper .onnx model file (overrides CONFIG)
  --name  NAME    Output mp3 name (default: input basename with .mp3)
EOF
  exit 1
}

INPUT_TXT=""
CUSTOM_MODEL=""
OUT_NAME=""

#######################################
# Parse the command line into the three globals above.
# Globals:   INPUT_TXT, CUSTOM_MODEL, OUT_NAME (written)
# Arguments: the script's own arguments ("$@")
#   --model PATH   custom Piper .onnx model
#   --name  NAME   output file name
#   --             end of options; what follows is taken as the input path
#   anything else not starting with '-' is the input text file
#   (if several are given, the last one wins)
# Returns:   0 on success; exits through die on a malformed command line
#######################################
parse_args() {
  while [[ $# -gt 0 ]]; do
    case "$1" in
      --model)
        # Without this check "shift 2" fails and set -e kills the script
        # without any explanation.
        [[ $# -ge 2 ]] || die "--model requires a path argument"
        CUSTOM_MODEL="$2"
        shift 2
        ;;
      --name)
        [[ $# -ge 2 ]] || die "--name requires a file name argument"
        OUT_NAME="$2"
        shift 2
        ;;
      --)
        shift
        while [[ $# -gt 0 ]]; do
          INPUT_TXT="$1"
          shift
        done
        ;;
      -?*)
        # A mistyped option must not be mistaken for the input file.
        die "Unknown option: $1"
        ;;
      *)
        INPUT_TXT="$1"
        shift
        ;;
    esac
  done
}

parse_args "$@"

#######################################
# Print a file's name without its directory and without its last extension.
# The extension is stripped from the base name only, so a dot in a directory
# name is never mistaken for an extension, and a dotfile such as ".notes"
# keeps its full name instead of becoming empty.
# Arguments: $1 - path
# Outputs:   the stem on stdout
#######################################
file_stem() {
  local name stem
  name="$(basename -- "$1")"
  stem="${name%.*}"
  [[ -n "${stem}" ]] || stem="${name}"
  printf '%s\n' "${stem}"
}

[[ -f "$INPUT_TXT" ]] || die "Input text not found: $INPUT_TXT"
INPUT_TXT="$(realpath -- "$INPUT_TXT")"
OUT_DIR="$(dirname -- "$INPUT_TXT")"
BASE="$(file_stem "$INPUT_TXT")"
# The MP3 is written next to the input file; OUT_NAME (from --name) replaces
# the default "<stem>.mp3" file name.
OUTPUT_MP3="${OUT_DIR}/${OUT_NAME:-${BASE}.mp3}"

# --- install prereqs ---
say "Installing prerequisites (sudo required for apt)…"
# apt-get rather than apt: apt's command-line interface is meant for
# interactive use and is not guaranteed to stay stable for scripts.
sudo apt-get update
sudo apt-get install -y python3-venv pipx ffmpeg curl wget

if ! command -v pipx >/dev/null 2>&1; then
  die "pipx not found after install. Please re-open your terminal and re-run this script."
fi

# Best effort: let pipx add its bin directory to the user's shell startup
# files so future sessions find piper. Failure here is not fatal.
pipx ensurepath >/dev/null 2>&1 || true

# Make pipx-installed apps visible in THIS session by extending PATH directly.
# (Sourcing ~/.profile instead would run arbitrary user startup code under
# "set -u", where any unset variable reference aborts the whole script.)
PIPX_APP_DIR="${PIPX_BIN_DIR:-${HOME}/.local/bin}"
case ":${PATH}:" in
  *":${PIPX_APP_DIR}:"*) ;;
  *) export PATH="${PIPX_APP_DIR}:${PATH}" ;;
esac
hash -r

# Install Piper CLI via pipx if missing
if ! command -v piper >/dev/null 2>&1; then
  say "Installing Piper CLI with pipx…"
  pipx install piper-tts || die "pipx could not install piper-tts"
  hash -r
fi

command -v piper >/dev/null 2>&1 || die "Piper CLI not found on PATH after install."

# --- model setup ---
mkdir -p "${MODEL_DIR}"

#######################################
# Download URL to DEST atomically.
# The data is written to "DEST.part" and only renamed to DEST after wget
# succeeds, so an interrupted or failed download never leaves a truncated
# file that a later run would mistake for a complete model.
# Arguments: $1 - URL, $2 - destination path
# Returns:   0 on success, 1 if the download failed (DEST is left untouched)
#######################################
fetch_file() {
  local url=$1 dest=$2
  local partial="${dest}.part"
  if wget -O "${partial}" "${url}"; then
    mv -f -- "${partial}" "${dest}"
  else
    rm -f -- "${partial}"
    return 1
  fi
}

if [[ -n "${CUSTOM_MODEL}" ]]; then
  # User-supplied model: use it as is; its metadata is expected alongside it.
  [[ -f "$CUSTOM_MODEL" ]] || die "Custom model not found: $CUSTOM_MODEL"
  MODEL_FILE="$(realpath -- "$CUSTOM_MODEL")"
  MODEL_JSON="${MODEL_FILE}.json"
  [[ -f "$MODEL_JSON" ]] || warn "Note: ${MODEL_JSON} not found. Piper usually works without it, but voice metadata will be missing."
else
  # -s instead of -f: a zero-byte file left behind by an earlier failed
  # download counts as missing and is fetched again.
  if [[ ! -s "$MODEL_FILE" ]]; then
    say "Downloading Piper model: ${MODEL_NAME}"
    fetch_file "${MODEL_ONNX_URL}" "${MODEL_FILE}" || die "Failed to download model .onnx"
  fi
  if [[ ! -s "$MODEL_JSON" ]]; then
    say "Downloading Piper model metadata (.json)…"
    fetch_file "${MODEL_JSON_URL}" "${MODEL_JSON}" || warn "Could not download ${MODEL_JSON}. Continuing without metadata."
  fi
fi

# --- temp working dir ---
# Private scratch directory for the worker script and intermediate audio.
# It is deleted again on every exit path of the script.
WORKDIR=$(mktemp -d -t piperbook-XXXXXX)
remove_workdir() {
  rm -rf "$WORKDIR"
}
trap remove_workdir EXIT

# --- python chunk + synth ---
say "Starting synthesis with Piper (this can take a while)…"
PY_SCRIPT="${WORKDIR}/run_piper.py"
# The heredoc delimiter is quoted, so the Python source below is written
# verbatim (no shell expansion).
cat > "${PY_SCRIPT}" <<'PYCODE'
"""Worker: split a text file into chunks, synthesize each chunk with Piper,
and join the resulting WAVs (with optional silence in between) into one file.

The path of the joined WAV is the only thing printed on stdout.
"""
import argparse, os, re, shutil, subprocess, sys, wave

# Sample rate used for the silence only if the first chunk's header is unreadable.
FALLBACK_RATE = 22050


def split_sentences(text: str):
    """Collapse all whitespace runs and split the text into sentences.

    A split happens after '.', '!' or '?' when the next non-space character
    is an uppercase ASCII letter, a digit or '('. Punctuation is kept.
    """
    text = re.sub(r'\s+', ' ', text.strip())
    pieces = re.split(r'(?<=[.!?])\s+(?=[A-Z0-9(])', text)
    return [p.strip() for p in pieces if p.strip()]


def chunk(sentences, max_chars=3000):
    """Greedily pack sentences into space-joined chunks of about max_chars.

    A single sentence longer than max_chars is not split; it becomes a chunk
    of its own.
    """
    buf, n = [], 0
    for s in sentences:
        if n + len(s) + 1 > max_chars and buf:
            yield ' '.join(buf)
            buf, n = [], 0
        buf.append(s)
        n += len(s) + 1
    if buf:
        yield ' '.join(buf)


def wav_format(path):
    """Return (sample_rate, channels) read from a WAV header.

    Falls back to (FALLBACK_RATE, 1) if the header cannot be parsed.
    """
    try:
        with wave.open(path, "rb") as w:
            return w.getframerate(), w.getnchannels()
    except (wave.Error, EOFError, OSError):
        return FALLBACK_RATE, 1


def concat_entry(path):
    """Format a path as a `file` line for ffmpeg's concat demuxer.

    Embedded single quotes are escaped ('\\'') so they cannot end the quoted
    path early.
    """
    return "file '" + path.replace("'", "'\\''") + "'\n"


def main():
    ap = argparse.ArgumentParser()
    ap.add_argument("--input", required=True)
    ap.add_argument("--model", required=True)
    ap.add_argument("--outdir", required=True)
    ap.add_argument("--length_scale", type=float, default=1.0)
    ap.add_argument("--noise_scale", type=float, default=0.667)
    ap.add_argument("--max_chars", type=int, default=3000)
    ap.add_argument("--silence_ms", type=int, default=300)
    args = ap.parse_args()

    if not shutil.which("piper"):
        sys.exit("piper not found in PATH")
    if not shutil.which("ffmpeg"):
        sys.exit("ffmpeg not found in PATH")

    with open(args.input, "r", encoding="utf-8") as f:
        raw = f.read()
    sents = split_sentences(raw)
    chunks = list(chunk(sents, max_chars=args.max_chars))
    if not chunks:
        sys.exit("Nothing to synthesize.")

    os.makedirs(args.outdir, exist_ok=True)
    wavs = []
    for i, text in enumerate(chunks):
        wav_path = os.path.join(args.outdir, f"part_{i:05d}.wav")
        # Piper reads the text from stdin and writes one WAV per invocation.
        p = subprocess.run(
            ["piper",
             "--model", args.model,
             "--length_scale", str(args.length_scale),
             "--noise_scale", str(args.noise_scale),
             "--output_file", wav_path],
            input=text.encode("utf-8"),
            stdout=subprocess.PIPE, stderr=subprocess.PIPE)
        if p.returncode != 0 or not os.path.exists(wav_path):
            sys.stderr.write(p.stderr.decode("utf-8", errors="ignore"))
            sys.exit(f"Piper failed on chunk {i}")
        wavs.append(wav_path)

    # One silence clip, reused between all chunks. It must have the same
    # sample rate and channel count as the synthesized audio, because the
    # parts are joined with stream copy (no resampling); voices do not all
    # use 22.05 kHz.
    silence = None
    if args.silence_ms > 0 and len(wavs) > 1:
        rate, channels = wav_format(wavs[0])
        layout = "stereo" if channels == 2 else "mono"
        silence = os.path.join(args.outdir, "silence.wav")
        subprocess.run([
            "ffmpeg", "-f", "lavfi", "-i",
            f"anullsrc=r={rate}:cl={layout}",
            "-t", str(args.silence_ms / 1000.0),
            "-y", silence
        ], stdout=subprocess.DEVNULL, stderr=subprocess.DEVNULL, check=True)

    # Build the concat list: chunk, silence, chunk, silence, ..., chunk
    list_path = os.path.join(args.outdir, "concat.txt")
    with open(list_path, "w", encoding="utf-8") as lf:
        for i, w in enumerate(wavs):
            lf.write(concat_entry(w))
            if silence and i != len(wavs) - 1:
                lf.write(concat_entry(silence))

    # Concatenate WAVs losslessly
    joined_wav = os.path.join(args.outdir, "joined.wav")
    subprocess.run([
        "ffmpeg", "-f", "concat", "-safe", "0", "-i", list_path,
        "-c", "copy", "-y", joined_wav
    ], stdout=subprocess.DEVNULL, stderr=subprocess.DEVNULL, check=True)

    # The calling shell script captures this line as the result path.
    print(joined_wav)


if __name__ == "__main__":
    main()
PYCODE

# Run the Python worker to synthesize chunks → joined.wav
# The worker prints the path of the joined WAV on stdout, which is captured
# here; the arguments are collected in an array first for readability.
worker_args=(
  --input "${INPUT_TXT}"
  --model "${MODEL_FILE}"
  --outdir "${WORKDIR}/audio"
  --length_scale "${LENGTH_SCALE}"
  --noise_scale "${NOISE_SCALE}"
  --max_chars "${MAX_CHARS}"
  --silence_ms "${SILENCE_MS}"
)
JOINED_WAV="$(python3 "${PY_SCRIPT}" "${worker_args[@]}")"

if [[ ! -f "${JOINED_WAV}" ]]; then
  die "Synthesis did not produce ${JOINED_WAV}"
fi

# Convert to MP3
say "Encoding MP3 → ${OUTPUT_MP3}"
# -loglevel error keeps ffmpeg quiet on success but lets it explain a failure
# (previously all output was discarded and set -e ended the script silently).
# -nostdin stops ffmpeg from reading the terminal or a pipe feeding the script.
ffmpeg -nostdin -hide_banner -loglevel error -y \
  -i "${JOINED_WAV}" -codec:a libmp3lame -q:a "${MP3_Q}" "${OUTPUT_MP3}" \
  || die "ffmpeg failed to encode ${OUTPUT_MP3}"

say "Done! MP3 written to:"
printf '  %s\n' "${OUTPUT_MP3}"

echo
warn "Tip: To use a different voice, edit the CONFIG section at the top or pass --model /path/to/voice.onnx"
warn "Latest voice list (copy links): https://github.com/rhasspy/piper/blob/master/VOICES.md"