poe2-bot/tools/training/train.sh

#!/usr/bin/env bash
# Fine-tune Tesseract 5 LSTM on Fontin / Fontin SmallCaps for POE2
#
# IMPORTANT: Run inside WSL Ubuntu (NOT docker-desktop).
#
# From PowerShell:
#   wsl -d Ubuntu-22.04 bash /mnt/c/Users/boki/repos/poe2trade/tools/training/train.sh
#
# Or from inside WSL Ubuntu:
#   cd /mnt/c/Users/boki/repos/poe2trade/tools/training
#   bash train.sh
#
# Prerequisites:
#   - WSL Ubuntu installed: wsl --install -d Ubuntu-22.04
#   - Internet access (downloads fonts + Tesseract tools if needed)

# ── Sanity checks ────────────────────────────────────────────────────────────

# Refuse to run under a non-bash shell. This test is written in plain POSIX sh
# and deliberately placed BEFORE `set -o pipefail`: a shell without pipefail
# support would otherwise abort on the `set` line with a cryptic option error
# and never print the hint below.
if [ -z "${BASH_VERSION:-}" ]; then
    echo "ERROR: This script requires bash. Run with: bash train.sh"
    echo "If you're in docker-desktop WSL, switch to Ubuntu:"
    echo "  wsl -d Ubuntu-22.04 bash /mnt/c/Users/boki/repos/poe2trade/tools/training/train.sh"
    exit 1
fi

# Strict mode: abort on command errors (-e), on unset variables (-u), and when
# any stage of a pipeline fails (pipefail).
set -euo pipefail

# Detect which WSL distro we're in: an Alpine userland is treated as the
# docker-desktop distro, which this script does not support.
if grep -qi alpine /etc/os-release 2>/dev/null || [ -f /etc/alpine-release ]; then
    echo "ERROR: You're in docker-desktop (Alpine). Use Ubuntu instead:"
    echo "  wsl -d Ubuntu-22.04 bash /mnt/c/Users/boki/repos/poe2trade/tools/training/train.sh"
    exit 1
fi

# ── Resolve paths ────────────────────────────────────────────────────────────

# Project paths are derived from this script's own location, so they resolve
# the same way wherever the Windows drive happens to be mounted
# (/mnt/c/... or /mnt/host/c/...).
SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
PROJECT_ROOT="$(cd "$SCRIPT_DIR/../.." && pwd)"
TESSDATA_DIR="$PROJECT_ROOT/tools/OcrDaemon/tessdata"
TRAINING_TEXT="$SCRIPT_DIR/poe2_training_text.txt"

# Scratch directory for training artifacts and the system font directory.
WORK_DIR="$HOME/poe2-tesseract-training"
FONT_DIR="/usr/local/share/fonts/fontin"

# Limits handed to lstmtraining in Step 5.
MAX_ITERATIONS=800
TARGET_ERROR=0.01

# Both inputs must be present before any work starts.
[[ -f "$TRAINING_TEXT" ]] || {
    echo "ERROR: Training text not found at: $TRAINING_TEXT"
    exit 1
}

[[ -f "$TESSDATA_DIR/eng.traineddata" ]] || {
    echo "ERROR: eng.traineddata not found at: $TESSDATA_DIR/"
    exit 1
}

# Startup banner summarising the resolved locations.
printf '%s\n' \
    "=== POE2 Tesseract Fine-Tuning ===" \
    "Script dir:    $SCRIPT_DIR" \
    "Work dir:      $WORK_DIR" \
    "Training text: $TRAINING_TEXT" \
    "Tessdata:      $TESSDATA_DIR" \
    ""

# ── Step 1: Install Tesseract training tools ─────────────────────────────────

# Succeeds only when every Tesseract binary the later steps invoke is on PATH.
_tesseract_tools_present() {
    local tool
    for tool in tesseract text2image lstmtraining combine_tessdata; do
        command -v "$tool" >/dev/null 2>&1 || return 1
    done
}

# Step 1: make sure the Tesseract training toolchain is available.
#
# Tries, in order: tools already on PATH, an apt install (with the
# alex-p/tesseract-ocr-devel PPA when it can be added), and finally a source
# build of the 5.3.4 tag in $HOME/tesseract-build.
#
# Globals: HOME (read)
# Outputs: progress messages on stdout
# Returns: 0 when all tools are available, 1 if they are still missing after
#          the source build
install_tesseract_tools() {
    echo "── Step 1: Installing Tesseract training tools ──"

    if _tesseract_tools_present; then
        echo "Tesseract training tools already installed."
        # Informational only; never let the banner pipeline abort the script.
        tesseract --version 2>&1 | head -1 || true
        return 0
    fi

    echo "Installing Tesseract 5.x and training tools..."

    # Update package list first
    sudo apt-get update -qq

    # Try PPA for latest Tesseract; failure to add it is not fatal.
    if sudo add-apt-repository -y ppa:alex-p/tesseract-ocr-devel 2>/dev/null; then
        sudo apt-get update -qq
    fi

    sudo apt-get install -y tesseract-ocr tesseract-ocr-eng \
        libicu-dev libpango1.0-dev libcairo2-dev \
        wget unzip fontconfig 2>&1 | tail -3

    # Same completeness check as above, so a package that ships only part of
    # the toolchain falls through to the source build.
    if _tesseract_tools_present; then
        echo "Installed via apt."
        tesseract --version 2>&1 | head -1 || true
        return 0
    fi

    # Build from source as fallback
    echo "Package didn't include training tools, building from source..."
    sudo apt-get install -y automake g++ git libtool libleptonica-dev \
        make pkg-config libicu-dev libpango1.0-dev libcairo2-dev 2>&1 | tail -3

    local BUILD_DIR="$HOME/tesseract-build"
    if [ ! -d "$BUILD_DIR" ]; then
        git clone --depth 1 --branch 5.3.4 https://github.com/tesseract-ocr/tesseract.git "$BUILD_DIR"
    fi

    pushd "$BUILD_DIR" > /dev/null
    ./autogen.sh
    ./configure
    make -j"$(nproc)"
    make training
    sudo make install
    sudo make training-install
    sudo ldconfig
    popd > /dev/null

    # Drop cached command lookups so freshly installed binaries are found.
    hash -r

    if ! _tesseract_tools_present; then
        echo "ERROR: Tesseract training tools are still missing after the source build."
        return 1
    fi

    echo "Built from source."
    tesseract --version 2>&1 | head -1 || true
}

# ── Step 2: Download and install Fontin fonts ────────────────────────────────

# Step 2: install the Fontin font family system-wide if fontconfig does not
# already list it.
#
# Downloads a zip into a private temporary directory, verifies it really is a
# zip archive, copies the .otf files into $FONT_DIR and refreshes the font
# cache. The temporary directory is removed on every exit path.
#
# Globals: FONT_DIR (read)
# Outputs: progress and error messages on stdout
# Returns: 0 if fonts were already present or the install ran to completion,
#          1 if a required tool is missing or the download is unusable
install_fonts() {
    echo ""
    echo "── Step 2: Installing Fontin fonts ──"

    # Capture the listing first and grep the copy: under `pipefail`,
    # `fc-list | grep -q` can report failure when grep exits on the first
    # match while fc-list is still writing.
    local installed
    installed=$(fc-list 2>/dev/null) || installed=""
    if grep -qi fontin <<<"$installed"; then
        echo "Fontin fonts already installed:"
        grep -i fontin <<<"$installed"
        return 0
    fi

    # Fail with an accurate message instead of a misleading "not a zip" later.
    local tool
    for tool in curl unzip; do
        if ! command -v "$tool" >/dev/null 2>&1; then
            echo "ERROR: '$tool' is required to download Fontin fonts but is not installed."
            echo "Install it, or download manually and place .otf files in $FONT_DIR"
            return 1
        fi
    done

    local font_tmp
    if ! font_tmp=$(mktemp -d); then
        echo "ERROR: Could not create a temporary download directory."
        return 1
    fi
    local zip_file="$font_tmp/fontin.zip"

    echo "Downloading Fontin fonts..."
    # wfonts.com zip includes Regular, Bold, Italic, SmallCaps (all in one).
    # -f makes HTTP errors fail the transfer instead of saving an error page.
    if ! curl -fsL "https://static.wfonts.com/download/data/2015/03/10/fontin/fontin.zip" -o "$zip_file"; then
        echo "ERROR: Failed to download Fontin fonts."
        echo "Download manually and place .otf files in $FONT_DIR"
        rm -rf -- "$font_tmp"
        return 1
    fi

    # Verify it's a real zip. unzip exits 1 for mere warnings, so only a
    # status above 1 is treated as "not an archive".
    local zip_status=0
    unzip -tq "$zip_file" >/dev/null 2>&1 || zip_status=$?
    if [ "$zip_status" -gt 1 ]; then
        echo "ERROR: Downloaded file is not a zip. Font site may be blocking downloads."
        echo "Download manually from https://www.wfonts.com/font/fontin"
        echo "Place .otf files in $FONT_DIR"
        rm -rf -- "$font_tmp"
        return 1
    fi

    # Best effort: warnings during extraction are tolerated; a real failure
    # shows up as the "Fonts not found" warning below.
    unzip -qo "$zip_file" -d "$font_tmp/fontin/" 2>/dev/null || true

    sudo mkdir -p "$FONT_DIR"
    # Copy OTF files
    find "$font_tmp/fontin/" \( -name '*.otf' -o -name '*.OTF' \) \
        -exec sudo cp {} "$FONT_DIR/" \; 2>/dev/null || true
    sudo fc-cache -fv > /dev/null 2>&1

    echo "Installed fonts:"
    fc-list | grep -i fontin || echo "WARNING: Fonts not found after install!"

    rm -rf -- "$font_tmp"
}

# ── Step 3: Generate training images ─────────────────────────────────────────

# Step 3: render the training text with every Fontin face at three exposure
# levels, producing ground-truth images under $WORK_DIR/ground-truth.
#
# Also seeds $WORK_DIR with eng.traineddata and the eng.lstm extracted from it
# (each only when not already present).
#
# Globals: WORK_DIR, TESSDATA_DIR, FONT_DIR, TRAINING_TEXT (read)
# Side effects: changes the working directory to $WORK_DIR
# Returns: 0 if at least one image set was rendered, 1 otherwise
generate_training_images() {
    printf '\n%s\n' "── Step 3: Generating training images ──"

    mkdir -p "$WORK_DIR/ground-truth" "$WORK_DIR/output"
    cd "$WORK_DIR"

    # Seed the work dir with the project's base model.
    [ -f "$WORK_DIR/eng.traineddata" ] || cp "$TESSDATA_DIR/eng.traineddata" "$WORK_DIR/"

    # The LSTM component is the starting point for fine-tuning.
    if [ ! -f "$WORK_DIR/eng.lstm" ]; then
        echo "Extracting LSTM model from eng.traineddata..."
        combine_tessdata -e eng.traineddata eng.lstm
    fi

    # Show the raw listing first, for the operator's benefit.
    echo "Available Fontin fonts:"
    text2image --list_available_fonts --fonts_dir "$FONT_DIR" 2>/dev/null | grep -i fontin || true

    # Then persist just the names, one per line. Listing lines look like
    # "  6: Fontin Bold"; keep everything after the first colon, trimmed.
    local names_file="$WORK_DIR/font_list.txt"
    text2image --list_available_fonts --fonts_dir "$FONT_DIR" 2>/dev/null \
        | grep -i fontin \
        | cut -d: -f2- \
        | sed 's/^[[:space:]]*//;s/[[:space:]]*$//' \
        > "$names_file" 2>/dev/null || true

    # Nothing detected: fall back to a fixed set of names.
    if [ ! -s "$names_file" ]; then
        echo "WARNING: No Fontin fonts auto-detected. Trying common names..."
        printf '%s\n' \
            "Fontin Bold" \
            "Fontin Medium" \
            "Fontin Medium Italic" \
            "Fontin SmallCaps, Medium" \
            > "$names_file"
    fi

    echo "Will generate images for:"
    cat "$names_file"

    local rendered=0
    local exposure face slug target
    local -a render_args
    for exposure in -1 0 1; do
        while IFS= read -r face; do
            [ -n "$face" ] || continue

            # File-name-friendly form of the face: spaces to dashes, lowercased.
            slug=$(tr '[:upper:]' '[:lower:]' <<<"${face// /-}")
            target="ground-truth/poe2.${slug}.exp${exposure}"

            echo "  Generating: $target (font='$face', exposure=$exposure)"
            render_args=(
                --text "$TRAINING_TEXT"
                --outputbase "$target"
                --font "$face"
                --fonts_dir "$FONT_DIR"
                --ptsize 16
                --xsize 3600
                --ysize 480
                --char_spacing 0.0
                --exposure "$exposure"
                --leading 32
            )
            if text2image "${render_args[@]}" 2>&1; then
                rendered=$((rendered + 1))
            else
                echo "  WARNING: Failed for font '$face' exposure=$exposure, skipping"
            fi
        done < "$names_file"
    done

    echo "Generated $rendered training image sets."

    if (( rendered == 0 )); then
        echo "ERROR: No training images generated. Check font installation."
        return 1
    fi
}

# ── Step 4: Generate LSTMF files ─────────────────────────────────────────────

# Step 4: convert every ground-truth .tif into a .lstmf training file and
# write the list of all .lstmf files to training_files.txt.
#
# An existing .lstmf is reused only while it is at least as new as its .tif
# and .box; otherwise it is regenerated, so re-rendered images (for example
# after the training text changed) are never paired with stale training data.
#
# Globals: WORK_DIR (read)
# Side effects: changes the working directory to $WORK_DIR
# Returns: 0 if at least one .lstmf file is available, 1 otherwise
generate_lstmf() {
    echo ""
    echo "── Step 4: Generating .lstmf training files ──"

    cd "$WORK_DIR"

    local tif base
    for tif in ground-truth/*.tif; do
        # An unmatched glob stays literal; skip it.
        [ -f "$tif" ] || continue
        base="${tif%.tif}"

        if [ -f "${base}.lstmf" ] \
            && [ ! "$tif" -nt "${base}.lstmf" ] \
            && [ ! "${base}.box" -nt "${base}.lstmf" ]; then
            echo "  Skipping (up to date): ${base}.lstmf"
            continue
        fi

        echo "  Processing: $tif"
        if ! tesseract "$tif" "$base" --psm 6 lstm.train 2>&1; then
            echo "  WARNING: Failed to process $tif"
        fi
    done

    # Create training file list from whatever .lstmf files now exist.
    : > training_files.txt
    local lstmf total=0
    for lstmf in ground-truth/*.lstmf; do
        [ -f "$lstmf" ] || continue
        echo "$lstmf" >> training_files.txt
        total=$((total + 1))
    done

    echo "Created $total .lstmf files."

    if [ "$total" -eq 0 ]; then
        echo "ERROR: No .lstmf files generated."
        return 1
    fi
}

# ── Step 5: Fine-tune LSTM ───────────────────────────────────────────────────

# Step 5: run lstmtraining from the extracted eng.lstm over the files listed
# in training_files.txt, writing checkpoints under output/poe2.
#
# Globals: WORK_DIR, MAX_ITERATIONS, TARGET_ERROR (read)
# Side effects: changes the working directory to $WORK_DIR
fine_tune() {
    printf '\n%s\n' "── Step 5: Fine-tuning LSTM (max $MAX_ITERATIONS iterations) ──"

    cd "$WORK_DIR"

    # All paths below are relative to $WORK_DIR.
    local -a training_args=(
        --continue_from eng.lstm
        --traineddata eng.traineddata
        --train_listfile training_files.txt
        --model_output output/poe2
        --max_iterations "$MAX_ITERATIONS"
        --target_error_rate "$TARGET_ERROR"
        --debug_interval -1
    )
    lstmtraining "${training_args[@]}"

    echo "Fine-tuning complete."
}

# ── Step 6: Package model ────────────────────────────────────────────────────

# Print the most recently modified regular file matching output/poe2*
# (relative to the current directory), ignoring packaged *.traineddata files.
# Prints nothing when there is no candidate; always succeeds.
_newest_poe2_checkpoint() {
    local candidate newest=""
    for candidate in output/poe2*; do
        [ -f "$candidate" ] || continue
        # A packaged model left over from an earlier run is not a checkpoint.
        case "$candidate" in
            *.traineddata) continue ;;
        esac
        if [ -z "$newest" ] || [ "$candidate" -nt "$newest" ]; then
            newest=$candidate
        fi
    done
    printf '%s' "$newest"
}

# Step 6: convert the training checkpoint into output/poe2.traineddata and
# copy it into the project's tessdata directory.
#
# Checkpoint lookup order: output/poe2_checkpoint, output/poe2checkpoint, then
# the newest other output/poe2* file.
#
# Globals: WORK_DIR, TESSDATA_DIR (read)
# Side effects: changes the working directory to $WORK_DIR
# Returns: 0 on success, 1 when no checkpoint can be found
package_model() {
    echo ""
    echo "── Step 6: Packaging poe2.traineddata ──"

    cd "$WORK_DIR"

    # Find the checkpoint file
    local checkpoint=""
    if [ -f "output/poe2_checkpoint" ]; then
        checkpoint="output/poe2_checkpoint"
    elif [ -f "output/poe2checkpoint" ]; then
        checkpoint="output/poe2checkpoint"
    else
        # Glob-based search: cannot fail under `set -e`/`pipefail`, so the
        # explicit error below is always reachable.
        checkpoint=$(_newest_poe2_checkpoint)
    fi

    if [ -z "$checkpoint" ] || [ ! -f "$checkpoint" ]; then
        echo "ERROR: No checkpoint found in output/"
        ls -la output/ 2>/dev/null || true
        return 1
    fi

    echo "Using checkpoint: $checkpoint"

    lstmtraining --stop_training \
        --continue_from "$checkpoint" \
        --traineddata eng.traineddata \
        --model_output output/poe2.traineddata

    # Copy to project tessdata
    cp output/poe2.traineddata "$TESSDATA_DIR/poe2.traineddata"

    echo "Model saved to: $TESSDATA_DIR/poe2.traineddata"
    ls -lh "$TESSDATA_DIR/poe2.traineddata"
}

# ── Step 7: Quick validation ─────────────────────────────────────────────────

# Step 7: render a short sample with the first available Fontin face and print
# the OCR output of the stock eng model next to that of the new poe2 model.
#
# Validation is best effort: when no Fontin face is listed or the sample image
# cannot be rendered, the step is skipped and still succeeds.
#
# Globals: WORK_DIR, FONT_DIR, TESSDATA_DIR (read)
# Side effects: changes the working directory to $WORK_DIR; works in a private
#               temporary directory that is removed before returning
# Returns: 0 (also when skipped)
validate() {
    echo ""
    echo "── Step 7: Quick validation ──"

    cd "$WORK_DIR"

    # Get first available fontin font name. Listing lines look like
    # "  6: Fontin Bold": the index before the first colon must be dropped or
    # text2image will not recognise the name. `|| true` keeps "no match" (a
    # failing grep under pipefail) from aborting the script, so the skip
    # branch below can handle it.
    local test_font
    test_font=$(text2image --list_available_fonts --fonts_dir "$FONT_DIR" 2>/dev/null \
        | grep -i fontin \
        | head -1 \
        | cut -d: -f2- \
        | sed 's/^[[:space:]]*//;s/[[:space:]]*$//') || true

    if [ -z "$test_font" ]; then
        echo "Skipping validation (no Fontin font available)"
        return 0
    fi

    # Private scratch directory instead of fixed, predictable names in /tmp.
    local tmp_dir
    if ! tmp_dir=$(mktemp -d); then
        echo "Skipping validation (could not create temporary directory)"
        return 0
    fi
    local sample="$tmp_dir/poe2_test"

    # Create a small test file
    printf 'Quality: +20%%\nAdds 12 to 24 Fire Damage\n+45%% to Fire Resistance\n' > "$sample.txt"

    if ! text2image \
        --text "$sample.txt" \
        --outputbase "$sample" \
        --font "$test_font" \
        --fonts_dir "$FONT_DIR" \
        --ptsize 16 --xsize 1200 --ysize 200 \
        --exposure 0 --leading 32 2>&1; then
        echo "Skipping validation (could not generate test image)"
        rm -rf -- "$tmp_dir"
        return 0
    fi

    echo "=== eng model ==="
    tesseract "$sample.tif" stdout -l eng --psm 6 2>/dev/null || true

    echo ""
    echo "=== poe2 model ==="
    # Try the environment variable first, then the explicit command-line flag.
    TESSDATA_PREFIX="$TESSDATA_DIR" tesseract "$sample.tif" stdout -l poe2 --psm 6 2>/dev/null || {
        tesseract "$sample.tif" stdout --tessdata-dir "$TESSDATA_DIR" -l poe2 --psm 6 2>/dev/null || true
    }

    rm -rf -- "$tmp_dir"

    echo ""
    echo "Compare the outputs above — poe2 should be more accurate on Fontin text."
}

# ── Main ─────────────────────────────────────────────────────────────────────

# Entry point: run the seven pipeline steps in order, then print where the
# model was copied and what to do next.
#
# Globals: TESSDATA_DIR (read)
main() {
    local step
    for step in \
        install_tesseract_tools \
        install_fonts \
        generate_training_images \
        generate_lstmf \
        fine_tune \
        package_model \
        validate; do
        "$step"
    done

    printf '%s\n' \
        "" \
        "=== Done! ===" \
        "poe2.traineddata has been copied to: $TESSDATA_DIR/" \
        "" \
        "Next steps:" \
        "  1. Build the daemon:  dotnet build tools/OcrDaemon -c Release" \
        "  2. Start the bot:    npx tsx src/index.ts" \
        "  3. Test OCR quality in the dashboard"
}

main "$@"