VoxVera/voxvera/templates/blank/extract_form_fields.sh

#!/bin/bash
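# Extract the submission form fields from a filled-in client PDF and write
# them into config.json for this template.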
# Define the input PDF, output text file, and output JSON
pdf_file="./from_client/submission_form.pdf"
text_file="submission_form.txt"
json_file="config.json"
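# Requires the pdftotext (poppler-utils) and jq command-line tools.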
# Log the start of the script
echo "Starting script..."
echo "Looking for PDF at: $pdf_file"
echo "Config JSON: $json_file"
# Convert PDF to text
echo "Converting PDF to text..."
pdftotext -layout "$pdf_file" "$text_file"
# Extract individual fields using grep, logging each step
echo "Extracting fields from the text file..."
# The name value sits a few lines below its label in the layout text, so use a wider grep window (-A 3)
name=$(grep -A 3 'name (max 25 characters):' "$text_file" | tail -n 1 | awk '{$1=$1;print}')
echo "Extracted name: '$name'"
subdomain=$(grep -A 1 'subdomain (max 30 characters):' "$text_file" | tail -n 1 | awk '{$1=$1;print}')
echo "Extracted subdomain: '$subdomain'"
title=$(grep -A 1 'title (max 30 characters):' "$text_file" | tail -n 1 | awk '{$1=$1;print}')
echo "Extracted title: '$title'"
subtitle=$(grep -A 1 'subtitle (max 45 characters):' "$text_file" | tail -n 1 | awk '{$1=$1;print}')
echo "Extracted subtitle: '$subtitle'"
headline=$(grep -A 1 'headline (max 50 characters):' "$text_file" | tail -n 1 | awk '{$1=$1;print}')
echo "Extracted headline: '$headline'"
# Process the content to remove soft wraps while keeping hard returns
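# grep -A 100 grabs up to 100 lines after the content label; sed -n '2,/url_message/p'
# keeps everything from the first value line through the url_message label, and the
# trailing sed '$d' drops that label line before awk re-joins soft-wrapped lines.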
content=$(grep -A 100 'content (max 1,400 characters):' "$text_file" | sed -n '2,/url_message/p' | sed '$d' | awk '
{
    if (NR == 1) {
        prev = $0;
        next;
    }
    # Handle paragraph breaks explicitly
    if ($0 ~ /^[[:space:]]*$/) {
        if (prev) print prev "\n";
        prev = "";
    }
    # Handle sentences that should stay together
    else if (prev ~ /[.?!]$/ && $0 ~ /^[[:upper:]]/) {
        prev = prev " " $0;
    }
    # Join lines that are part of the same sentence (soft wrap)
    else if (prev !~ /[.?!]$/) {
        prev = prev " " $0;
    }
    # Otherwise, treat as a new sentence/paragraph
    else {
        if (prev) print prev;
        prev = $0;
    }
}
END { if (prev) print prev }' | sed -E 's/\n\n[[:space:]]+/\n\n/g' | sed -E 's/^[[:space:]]+//g')
echo "Extracted content: '$content'"
url_message=$(grep -A 1 'url_message (max 60 characters):' "$text_file" | tail -n 1 | awk '{$1=$1;print}')
echo "Extracted url_message: '$url_message'"
url=$(grep -A 1 'url (max 90 characters):' "$text_file" | tail -n 1 | awk '{$1=$1;print}')
echo "Extracted url: '$url'"
# Construct the URLs with the subdomain
onion_base="6dshf2gnj7yzxlfcaczlyi57up4mvbtd5orinuj5bjsfycnhz2w456yd.onion"
constructed_url="http://$subdomain.$onion_base"
# Use an existing tear_off_link from the config if provided; otherwise default
# to the constructed onion URL.
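# jq's `// empty` prints nothing when the key is missing or null, so the
# ${existing_tear_off_link:-...} expansion below falls back to the onion URL.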
existing_tear_off_link=$(jq -r '.tear_off_link // empty' "$json_file")
tear_off_link="${existing_tear_off_link:-http://$subdomain.$onion_base}"
echo "Constructed URL: '$constructed_url'"
echo "Using Tear-off Link: '$tear_off_link'"
# Abort if any of the extracted fields is empty
if [ -z "$name" ] || [ -z "$subdomain" ] || [ -z "$title" ] || [ -z "$subtitle" ] || [ -z "$headline" ] || [ -z "$content" ] || [ -z "$url_message" ] || [ -z "$url" ]; then
    echo "Error: One or more extracted fields are empty. Please check the PDF form and try again."
    exit 1
fi
# Update the JSON using jq and log the operation
echo "Updating config.json..."
jq --arg name "$name" \
   --arg subdomain "$subdomain" \
   --arg title "$title" \
   --arg subtitle "$subtitle" \
   --arg headline "$headline" \
   --arg content "$content" \
   --arg url_message "$url_message" \
   --arg url "$url" \
   --arg tear_off_link "$tear_off_link" \
   '.name = $name |
    .subdomain = $subdomain |
    .title = $title |
    .subtitle = $subtitle |
    .headline = $headline |
    .content = $content |
    .url_message = $url_message |
    .url = $url |
    .tear_off_link = $tear_off_link' "$json_file" > tmp.json && mv tmp.json "$json_file"
if [ $? -eq 0 ]; then
    echo "Config file updated successfully."
else
    echo "Error: Failed to update config file."
    exit 1
fi
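# Note: the relative paths above assume this script is run from the template
# directory that contains from_client/ and config.json.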