added training
This commit is contained in:
parent
528453a321
commit
cc50368d3b
7 changed files with 901 additions and 1 deletions
419
tools/training/train.sh
Normal file
419
tools/training/train.sh
Normal file
|
|
@ -0,0 +1,419 @@
|
|||
#!/usr/bin/env bash
|
||||
# Fine-tune Tesseract 5 LSTM on Fontin / Fontin SmallCaps for POE2
|
||||
#
|
||||
# IMPORTANT: Run inside WSL Ubuntu (NOT docker-desktop).
|
||||
#
|
||||
# From PowerShell:
|
||||
# wsl -d Ubuntu-22.04 bash /mnt/c/Users/boki/repos/poe2trade/tools/training/train.sh
|
||||
#
|
||||
# Or from inside WSL Ubuntu:
|
||||
# cd /mnt/c/Users/boki/repos/poe2trade/tools/training
|
||||
# bash train.sh
|
||||
#
|
||||
# Prerequisites:
|
||||
# - WSL Ubuntu installed: wsl --install -d Ubuntu-22.04
|
||||
# - Internet access (downloads fonts + Tesseract tools if needed)
|
||||
|
||||
# ── Sanity checks ────────────────────────────────────────────────────────────

# This check deliberately runs BEFORE strict mode is enabled: some POSIX
# shells (older dash/ash) reject `set -o pipefail`, which would abort the
# script with a cryptic error before this hint could ever be printed.
if [ -z "${BASH_VERSION:-}" ]; then
  echo "ERROR: This script requires bash. Run with: bash train.sh" >&2
  echo "If you're in docker-desktop WSL, switch to Ubuntu:" >&2
  echo " wsl -d Ubuntu-22.04 bash /mnt/c/Users/boki/repos/poe2trade/tools/training/train.sh" >&2
  exit 1
fi

set -euo pipefail

# Detect which WSL distro we're in: the docker-desktop distro is Alpine based
# and cannot run the apt-based installation below.
if grep -qi alpine /etc/os-release 2>/dev/null || [ -f /etc/alpine-release ]; then
  echo "ERROR: You're in docker-desktop (Alpine). Use Ubuntu instead:" >&2
  echo " wsl -d Ubuntu-22.04 bash /mnt/c/Users/boki/repos/poe2trade/tools/training/train.sh" >&2
  exit 1
fi

# ── Resolve paths ────────────────────────────────────────────────────────────

# Everything is derived from this script's own location, so the mount point of
# the repository (/mnt/c/..., /mnt/host/c/...) does not matter.
SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
PROJECT_ROOT="$(cd "$SCRIPT_DIR/../.." && pwd)"
TESSDATA_DIR="$PROJECT_ROOT/tools/OcrDaemon/tessdata"
TRAINING_TEXT="$SCRIPT_DIR/poe2_training_text.txt"
WORK_DIR="$HOME/poe2-tesseract-training"
FONT_DIR="/usr/local/share/fonts/fontin"

# Training limits; both may be overridden from the environment, e.g.
#   MAX_ITERATIONS=2000 bash train.sh
MAX_ITERATIONS="${MAX_ITERATIONS:-800}"
TARGET_ERROR="${TARGET_ERROR:-0.01}"

# Fail early when the inputs every later step depends on are missing.
if [ ! -f "$TRAINING_TEXT" ]; then
  echo "ERROR: Training text not found at: $TRAINING_TEXT" >&2
  exit 1
fi

if [ ! -f "$TESSDATA_DIR/eng.traineddata" ]; then
  echo "ERROR: eng.traineddata not found at: $TESSDATA_DIR/" >&2
  exit 1
fi

echo "=== POE2 Tesseract Fine-Tuning ==="
echo "Script dir: $SCRIPT_DIR"
echo "Work dir: $WORK_DIR"
echo "Training text: $TRAINING_TEXT"
echo "Tessdata: $TESSDATA_DIR"
echo ""
|
||||
|
||||
# ── Step 1: Install Tesseract training tools ─────────────────────────────────
|
||||
|
||||
# Succeeds only when every training binary the later steps invoke
# (text2image, lstmtraining, combine_tessdata) can be resolved.
_have_training_tools() {
  local tool
  for tool in text2image lstmtraining combine_tessdata; do
    command -v "$tool" >/dev/null 2>&1 || return 1
  done
}

# Step 1: make sure the Tesseract training tool chain is available.
#
# Tries, in order: tools that are already installed, an apt install
# (optionally through the alex-p devel PPA), and finally a source build of
# tag 5.3.4 in $HOME/tesseract-build.
#
# Globals:  HOME (read)
# Returns:  0 when all training tools are available afterwards, 1 when the
#           source checkout/build fails or the tools are still missing.
install_tesseract_tools() {
  echo "── Step 1: Installing Tesseract training tools ──"

  if _have_training_tools; then
    echo "Tesseract training tools already installed."
    # Purely informational; must never abort the run under pipefail.
    tesseract --version 2>&1 | head -1 || true
    return 0
  fi

  echo "Installing Tesseract 5.x and training tools..."

  # Update package list first
  sudo apt-get update -qq

  # Try PPA for latest Tesseract; failure is fine, the distro package is used.
  if sudo add-apt-repository -y ppa:alex-p/tesseract-ocr-devel 2>/dev/null; then
    sudo apt-get update -qq
  fi

  sudo apt-get install -y tesseract-ocr tesseract-ocr-eng \
    libicu-dev libpango1.0-dev libcairo2-dev \
    wget unzip fontconfig 2>&1 | tail -3

  # The package only counts as sufficient when ALL tools came with it;
  # checking text2image alone would let a partial install slip through.
  if _have_training_tools; then
    echo "Installed via apt."
    tesseract --version 2>&1 | head -1 || true
    return 0
  fi

  # Build from source as fallback
  echo "Package didn't include training tools, building from source..."
  sudo apt-get install -y automake g++ git libtool libleptonica-dev \
    make pkg-config libicu-dev libpango1.0-dev libcairo2-dev 2>&1 | tail -3

  local build_dir="$HOME/tesseract-build"
  if [ ! -d "$build_dir" ]; then
    if ! git clone --depth 1 --branch 5.3.4 \
      https://github.com/tesseract-ocr/tesseract.git "$build_dir"; then
      echo "ERROR: Could not clone the Tesseract sources into $build_dir" >&2
      return 1
    fi
  fi

  # Build inside a subshell so the caller's working directory is untouched
  # even when one of the build steps fails.
  if ! (
    cd "$build_dir" \
      && ./autogen.sh \
      && ./configure \
      && make -j"$(nproc)" \
      && make training \
      && sudo make install \
      && sudo make training-install \
      && sudo ldconfig
  ); then
    echo "ERROR: Building Tesseract from source failed (see output above)." >&2
    return 1
  fi

  if ! _have_training_tools; then
    echo "ERROR: Training tools are still missing after the source build." >&2
    return 1
  fi

  echo "Built from source."
  tesseract --version 2>&1 | head -1 || true
}
|
||||
|
||||
# ── Step 2: Download and install Fontin fonts ────────────────────────────────
|
||||
|
||||
# Step 2: download the Fontin family and install its .otf files system-wide.
#
# Does nothing when fontconfig already lists a Fontin font. Otherwise the zip
# is fetched into a private temporary directory (always removed again),
# verified, unpacked, and its OTF files are copied to $FONT_DIR.
#
# Globals:  FONT_DIR (read)
# Returns:  0 when fonts were already present or the install steps ran,
#           1 when the archive could not be downloaded or is not a valid zip.
install_fonts() {
  echo ""
  echo "── Step 2: Installing Fontin fonts ──"

  # Capture the listing instead of piping into `grep -q`: with pipefail an
  # early grep exit can SIGPIPE fc-list and turn a match into a false negative.
  local installed
  installed=$(fc-list 2>/dev/null | grep -i fontin || true)
  if [ -n "$installed" ]; then
    echo "Fontin fonts already installed:"
    printf '%s\n' "$installed"
    return 0
  fi

  local font_tmp
  if ! font_tmp=$(mktemp -d); then
    echo "ERROR: Could not create a temporary download directory." >&2
    return 1
  fi
  local archive="$font_tmp/fontin.zip"
  # wfonts.com zip includes Regular, Bold, Italic, SmallCaps (all in one)
  local url="https://static.wfonts.com/download/data/2015/03/10/fontin/fontin.zip"

  echo "Downloading Fontin fonts..."
  # -f makes curl fail on HTTP errors instead of saving the error page.
  # wget is the fallback because it is what step 1 installs.
  local downloaded=false
  if command -v curl >/dev/null 2>&1 && curl -fsL "$url" -o "$archive"; then
    downloaded=true
  elif command -v wget >/dev/null 2>&1 && wget -q -O "$archive" "$url"; then
    downloaded=true
  fi

  if [ "$downloaded" != true ]; then
    echo "ERROR: Failed to download Fontin fonts."
    echo "Download manually and place .otf files in $FONT_DIR"
    rm -rf -- "$font_tmp"
    return 1
  fi

  # Verify it's a real zip. `unzip -t` tests the archive itself and, unlike
  # `file`, is guaranteed to be present after step 1.
  if ! unzip -tq "$archive" >/dev/null 2>&1; then
    echo "ERROR: Downloaded file is not a zip. Font site may be blocking downloads."
    echo "Download manually from https://www.wfonts.com/font/fontin"
    echo "Place .otf files in $FONT_DIR"
    rm -rf -- "$font_tmp"
    return 1
  fi

  # Best effort from here on: a missing font is reported by the check below.
  unzip -qo "$archive" -d "$font_tmp/fontin/" 2>/dev/null || true

  sudo mkdir -p "$FONT_DIR"
  # Copy OTF files
  find "$font_tmp/fontin/" \( -name '*.otf' -o -name '*.OTF' \) \
    -exec sudo cp {} "$FONT_DIR/" \; 2>/dev/null || true
  sudo fc-cache -fv > /dev/null 2>&1

  echo "Installed fonts:"
  fc-list | grep -i fontin || echo "WARNING: Fonts not found after install!"

  rm -rf -- "$font_tmp"
}
|
||||
|
||||
# ── Step 3: Generate training images ─────────────────────────────────────────
|
||||
|
||||
# Step 3: render the training text with every detected Fontin face at three
# exposure levels (-1, 0, 1) into $WORK_DIR/ground-truth.
#
# Also seeds the work directory with eng.traineddata and the LSTM model
# extracted from it, and writes the font names to $WORK_DIR/font_list.txt.
#
# Globals:  WORK_DIR, TESSDATA_DIR, FONT_DIR, TRAINING_TEXT (read)
# Returns:  1 when not a single image set could be generated.
generate_training_images() {
  printf '\n%s\n' "── Step 3: Generating training images ──"

  mkdir -p "$WORK_DIR/ground-truth" "$WORK_DIR/output"
  cd "$WORK_DIR"

  # Seed the work dir with the project's base model (only once).
  if [ ! -f "$WORK_DIR/eng.traineddata" ]; then
    cp "$TESSDATA_DIR/eng.traineddata" "$WORK_DIR/"
  fi

  # The fine-tuning step continues from the LSTM component of that model.
  if [ ! -f "$WORK_DIR/eng.lstm" ]; then
    printf '%s\n' "Extracting LSTM model from eng.traineddata..."
    combine_tessdata -e eng.traineddata eng.lstm
  fi

  # Show the raw text2image listing for the Fontin family.
  printf '%s\n' "Available Fontin fonts:"
  text2image --list_available_fonts --fonts_dir "$FONT_DIR" 2>/dev/null | grep -i fontin || true

  # Persist the bare font names, one per line. A listing line looks like
  # " 6: Fontin Bold": keep what follows the first colon and trim it.
  local names_file="$WORK_DIR/font_list.txt"
  text2image --list_available_fonts --fonts_dir "$FONT_DIR" 2>/dev/null \
    | grep -i fontin \
    | cut -d: -f2- \
    | sed -e 's/^[[:space:]]*//' -e 's/[[:space:]]*$//' \
    > "$names_file" 2>/dev/null || true

  # Nothing detected: fall back to the names the Fontin family usually has.
  if [ ! -s "$names_file" ]; then
    printf '%s\n' "WARNING: No Fontin fonts auto-detected. Trying common names..."
    printf '%s\n' \
      'Fontin Bold' \
      'Fontin Medium' \
      'Fontin Medium Italic' \
      'Fontin SmallCaps, Medium' \
      > "$names_file"
  fi

  printf '%s\n' "Will generate images for:"
  cat "$names_file"

  local generated=0
  local exposure face slug target
  local -a render_args
  for exposure in -1 0 1; do
    while IFS= read -r face; do
      if [ -z "$face" ]; then
        continue
      fi

      # File-name friendly variant of the font name: lower case, dashes.
      slug=$(printf '%s\n' "$face" | tr '[:upper:]' '[:lower:]' | tr ' ' '-')
      target="ground-truth/poe2.${slug}.exp${exposure}"

      printf '%s\n' " Generating: $target (font='$face', exposure=$exposure)"
      render_args=(
        --text "$TRAINING_TEXT"
        --outputbase "$target"
        --font "$face"
        --fonts_dir "$FONT_DIR"
        --ptsize 16
        --xsize 3600
        --ysize 480
        --char_spacing 0.0
        --exposure "$exposure"
        --leading 32
      )
      if text2image "${render_args[@]}" 2>&1; then
        generated=$((generated + 1))
      else
        printf '%s\n' " WARNING: Failed for font '$face' exposure=$exposure, skipping"
      fi
    done < "$names_file"
  done

  printf '%s\n' "Generated $generated training image sets."

  if [ "$generated" -eq 0 ]; then
    printf '%s\n' "ERROR: No training images generated. Check font installation."
    return 1
  fi
}
|
||||
|
||||
# ── Step 4: Generate LSTMF files ─────────────────────────────────────────────
|
||||
|
||||
# Step 4: turn every ground-truth .tif into a .lstmf training file and write
# the list of usable files to $WORK_DIR/training_files.txt.
#
# An existing, non-empty .lstmf is reused. A zero-byte one (left behind by an
# interrupted run) is regenerated and never listed for training.
#
# Globals:  WORK_DIR (read)
# Returns:  1 when no usable .lstmf file exists afterwards.
generate_lstmf() {
  echo ""
  echo "── Step 4: Generating .lstmf training files ──"

  cd "$WORK_DIR"

  local tif base
  local failed=0
  for tif in ground-truth/*.tif; do
    # Guards against the unexpanded glob when there are no .tif files.
    [ -f "$tif" ] || continue
    base="${tif%.tif}"

    if [ -f "${base}.lstmf" ] && [ -s "${base}.lstmf" ]; then
      echo " Skipping (exists): ${base}.lstmf"
      continue
    fi

    echo " Processing: $tif"
    if ! tesseract "$tif" "$base" --psm 6 lstm.train 2>&1; then
      echo " WARNING: Failed to process $tif"
      failed=$((failed + 1))
    fi
  done

  # Create training file list (paths relative to $WORK_DIR).
  : > training_files.txt
  local lstmf
  local total=0
  for lstmf in ground-truth/*.lstmf; do
    if [ -f "$lstmf" ] && [ -s "$lstmf" ]; then
      echo "$lstmf" >> training_files.txt
      total=$((total + 1))
    fi
  done

  echo "Created $total .lstmf files."
  if [ "$failed" -gt 0 ]; then
    echo "WARNING: $failed image(s) could not be converted."
  fi

  if [ "$total" -eq 0 ]; then
    echo "ERROR: No .lstmf files generated."
    return 1
  fi
}
|
||||
|
||||
# ── Step 5: Fine-tune LSTM ───────────────────────────────────────────────────
|
||||
|
||||
# Step 5: continue training the extracted eng LSTM on the generated .lstmf
# files; lstmtraining writes its output under output/ with the poe2 prefix.
#
# Globals:  WORK_DIR, MAX_ITERATIONS, TARGET_ERROR (read)
fine_tune() {
  printf '\n%s\n' "── Step 5: Fine-tuning LSTM (max $MAX_ITERATIONS iterations) ──"

  cd "$WORK_DIR"

  # All paths below are relative to $WORK_DIR.
  local -a training_args=(
    --continue_from eng.lstm
    --traineddata eng.traineddata
    --train_listfile training_files.txt
    --model_output output/poe2
    --max_iterations "$MAX_ITERATIONS"
    --target_error_rate "$TARGET_ERROR"
    --debug_interval -1
  )
  lstmtraining "${training_args[@]}"

  printf '%s\n' "Fine-tuning complete."
}
|
||||
|
||||
# ── Step 6: Package model ────────────────────────────────────────────────────
|
||||
|
||||
# Step 6: convert the training checkpoint into poe2.traineddata and copy it
# into the project's tessdata directory.
#
# Checkpoint selection: output/poe2_checkpoint, then output/poe2checkpoint,
# otherwise the most recently modified output/poe2* file that is not itself a
# packaged .traineddata model.
#
# Globals:  WORK_DIR, TESSDATA_DIR (read)
# Returns:  1 when no checkpoint file exists.
package_model() {
  echo ""
  echo "── Step 6: Packaging poe2.traineddata ──"

  cd "$WORK_DIR"

  # Find the checkpoint file
  local checkpoint=""
  local candidate
  if [ -f "output/poe2_checkpoint" ]; then
    checkpoint="output/poe2_checkpoint"
  elif [ -f "output/poe2checkpoint" ]; then
    checkpoint="output/poe2checkpoint"
  else
    # Pick the newest candidate with a glob + -nt instead of `ls -t | head`:
    # under `set -e -o pipefail` a failing ls would abort the script before
    # the error message below is ever reached.
    for candidate in output/poe2*; do
      [ -f "$candidate" ] || continue
      case "$candidate" in
        # A model packaged by an earlier run is not a checkpoint.
        *.traineddata) continue ;;
      esac
      if [ -z "$checkpoint" ] || [ "$candidate" -nt "$checkpoint" ]; then
        checkpoint="$candidate"
      fi
    done
  fi

  if [ -z "$checkpoint" ] || [ ! -f "$checkpoint" ]; then
    echo "ERROR: No checkpoint found in output/"
    ls -la output/ 2>/dev/null || true
    return 1
  fi

  echo "Using checkpoint: $checkpoint"

  lstmtraining --stop_training \
    --continue_from "$checkpoint" \
    --traineddata eng.traineddata \
    --model_output output/poe2.traineddata

  # Copy to project tessdata
  cp output/poe2.traineddata "$TESSDATA_DIR/poe2.traineddata"

  echo "Model saved to: $TESSDATA_DIR/poe2.traineddata"
  ls -lh "$TESSDATA_DIR/poe2.traineddata"
}
|
||||
|
||||
# ── Step 7: Quick validation ─────────────────────────────────────────────────
|
||||
|
||||
# Step 7: render a three-line sample with the first available Fontin font and
# print the OCR result of the stock eng model next to the new poe2 model.
#
# Purely informational: every failure in here results in a "Skipping ..."
# message (or empty OCR output) and a zero return code.
#
# Globals:  WORK_DIR, FONT_DIR, TESSDATA_DIR (read)
validate() {
  echo ""
  echo "── Step 7: Quick validation ──"

  cd "$WORK_DIR"

  # Get first available fontin font name. text2image prints lines such as
  # " 6: Fontin Bold", so the index before the first colon has to be cut off
  # or --font receives an invalid name. `|| true` keeps a listing without any
  # Fontin entry from aborting the script under pipefail.
  local test_font
  test_font=$(
    text2image --list_available_fonts --fonts_dir "$FONT_DIR" 2>/dev/null \
      | grep -i fontin \
      | head -n 1 \
      | cut -d: -f2- \
      | sed 's/^[[:space:]]*//;s/[[:space:]]*$//'
  ) || true

  if [ -z "$test_font" ]; then
    echo "Skipping validation (no Fontin font available)"
    return 0
  fi

  # Private scratch directory instead of predictable names in /tmp.
  local scratch
  if ! scratch=$(mktemp -d); then
    echo "Skipping validation (could not create a temporary directory)"
    return 0
  fi
  local sample="$scratch/poe2_test"

  # Create a small test file
  printf 'Quality: +20%%\nAdds 12 to 24 Fire Damage\n+45%% to Fire Resistance\n' > "$sample.txt"

  if ! text2image \
    --text "$sample.txt" \
    --outputbase "$sample" \
    --font "$test_font" \
    --fonts_dir "$FONT_DIR" \
    --ptsize 16 --xsize 1200 --ysize 200 \
    --exposure 0 --leading 32 2>&1; then
    echo "Skipping validation (could not generate test image)"
    rm -rf -- "$scratch"
    return 0
  fi

  echo "=== eng model ==="
  tesseract "$sample.tif" stdout -l eng --psm 6 2>/dev/null || true

  echo ""
  echo "=== poe2 model ==="
  # First try via TESSDATA_PREFIX, then via the explicit --tessdata-dir flag.
  TESSDATA_PREFIX="$TESSDATA_DIR" tesseract "$sample.tif" stdout -l poe2 --psm 6 2>/dev/null || {
    tesseract "$sample.tif" stdout --tessdata-dir "$TESSDATA_DIR" -l poe2 --psm 6 2>/dev/null || true
  }

  rm -rf -- "$scratch"

  echo ""
  echo "Compare the outputs above — poe2 should be more accurate on Fontin text."
}
|
||||
|
||||
# ── Main ─────────────────────────────────────────────────────────────────────
|
||||
|
||||
# Entry point: runs the seven pipeline steps in order, then prints a short
# summary with follow-up instructions. With errexit active, the first failing
# step stops the run.
#
# Globals:  TESSDATA_DIR (read)
main() {
  local step
  for step in \
    install_tesseract_tools \
    install_fonts \
    generate_training_images \
    generate_lstmf \
    fine_tune \
    package_model \
    validate; do
    "$step"
  done

  printf '%s\n' \
    "" \
    "=== Done! ===" \
    "poe2.traineddata has been copied to: $TESSDATA_DIR/" \
    "" \
    "Next steps:" \
    " 1. Build the daemon: dotnet build tools/OcrDaemon -c Release" \
    " 2. Start the bot: npx tsx src/index.ts" \
    " 3. Test OCR quality in the dashboard"
}

main "$@"
|
||||
Loading…
Add table
Add a link
Reference in a new issue