VoxVera/voxvera/templates/blank/extract_form_fields.sh

#!/bin/bash
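# Extract the submission form fields from a filled-in client PDF and write
# them into config.json for this template.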
# Define the input PDF, output text file, and output JSON
pdf_file="./from_client/submission_form.pdf"
text_file="submission_form.txt"
json_file="config.json"
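# Requires the pdftotext (poppler-utils) and jq command-line tools.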
# Log the start of the script
echo "Starting script..."
echo "Looking for PDF at: $pdf_file"
echo "Config JSON: $json_file"
# Convert PDF to text
echo "Converting PDF to text..."
pdftotext -layout "$pdf_file" "$text_file"
# Extract individual fields using grep, logging each step
echo "Extracting fields from the text file..."
# The name value sits a few lines below its label in the layout text, so use a wider grep window (-A 3)
name=$(grep -A 3 'name (max 25 characters):' "$text_file" | tail -n 1 | awk '{$1=$1;print}')
echo "Extracted name: '$name'"
subdomain=$(grep -A 1 'subdomain (max 30 characters):' "$text_file" | tail -n 1 | awk '{$1=$1;print}')
echo "Extracted subdomain: '$subdomain'"
title=$(grep -A 1 'title (max 30 characters):' "$text_file" | tail -n 1 | awk '{$1=$1;print}')
echo "Extracted title: '$title'"
subtitle=$(grep -A 1 'subtitle (max 45 characters):' "$text_file" | tail -n 1 | awk '{$1=$1;print}')
echo "Extracted subtitle: '$subtitle'"
headline=$(grep -A 1 'headline (max 50 characters):' "$text_file" | tail -n 1 | awk '{$1=$1;print}')
echo "Extracted headline: '$headline'"
# Process the content to remove soft wraps while keeping hard returns
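# grep -A 100 grabs up to 100 lines after the content label; sed -n '2,/url_message/p'
# keeps everything from the first value line through the url_message label, and the
# trailing sed '$d' drops that label line before awk re-joins soft-wrapped lines.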
content=$(grep -A 100 'content (max 1,400 characters):' "$text_file" | sed -n '2,/url_message/p' | sed '$d' | awk '
{
    if (NR == 1) {
        prev = $0;
        next;
    }
    # Handle paragraph breaks explicitly
    if ($0 ~ /^[[:space:]]*$/) {
        if (prev) print prev "\n";
        prev = "";
    }
    # Handle sentences that should stay together
    else if (prev ~ /[.?!]$/ && $0 ~ /^[[:upper:]]/) {
        prev = prev " " $0;
    }
    # Join lines that are part of the same sentence (soft wrap)
    else if (prev !~ /[.?!]$/) {
        prev = prev " " $0;
    }
    # Otherwise, treat as a new sentence/paragraph
    else {
        if (prev) print prev;
        prev = $0;
    }
}
END { if (prev) print prev }' | sed -E 's/\n\n[[:space:]]+/\n\n/g' | sed -E 's/^[[:space:]]+//g')
echo "Extracted content: '$content'"
url_message=$(grep -A 1 'url_message (max 60 characters):' "$text_file" | tail -n 1 | awk '{$1=$1;print}')
echo "Extracted url_message: '$url_message'"
url=$(grep -A 1 'url (max 90 characters):' "$text_file" | tail -n 1 | awk '{$1=$1;print}')
echo "Extracted url: '$url'"
# Construct the URLs with the subdomain
onion_base="6dshf2gnj7yzxlfcaczlyi57up4mvbtd5orinuj5bjsfycnhz2w456yd.onion"
constructed_url="http://$subdomain.$onion_base"
# Use an existing tear_off_link from the config if provided; otherwise default
# to the constructed onion URL.
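# jq's `// empty` prints nothing when the key is missing or null, so the
# ${existing_tear_off_link:-...} expansion below falls back to the onion URL.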
existing_tear_off_link=$(jq -r '.tear_off_link // empty' "$json_file")
tear_off_link="${existing_tear_off_link:-http://$subdomain.$onion_base}"
echo "Constructed URL: '$constructed_url'"
echo "Using Tear-off Link: '$tear_off_link'"
# Abort if any of the extracted fields is empty
if [ -z "$name" ] || [ -z "$subdomain" ] || [ -z "$title" ] || [ -z "$subtitle" ] || [ -z "$headline" ] || [ -z "$content" ] || [ -z "$url_message" ] || [ -z "$url" ]; then
    echo "Error: One or more extracted fields are empty. Please check the PDF form and try again."
    exit 1
fi
# Update the JSON using jq and log the operation
echo "Updating config.json..."
jq --arg name "$name" \
   --arg subdomain "$subdomain" \
   --arg title "$title" \
   --arg subtitle "$subtitle" \
   --arg headline "$headline" \
   --arg content "$content" \
   --arg url_message "$url_message" \
   --arg url "$url" \
   --arg tear_off_link "$tear_off_link" \
   '.name = $name |
    .subdomain = $subdomain |
    .title = $title |
    .subtitle = $subtitle |
    .headline = $headline |
    .content = $content |
    .url_message = $url_message |
    .url = $url |
    .tear_off_link = $tear_off_link' "$json_file" > tmp.json && mv tmp.json "$json_file"
if [ $? -eq 0 ]; then
    echo "Config file updated successfully."
else
    echo "Error: Failed to update config file."
    exit 1
fi
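# Note: the relative paths above assume this script is run from the template
# directory that contains from_client/ and config.json.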