// poe2-bot/tools/OcrDaemon/ImagePreprocessor.cs

namespace OcrDaemon;

using System.Drawing;
using OpenCvSharp;
using OpenCvSharp.Extensions;

static class ImagePreprocessor
{
    /// <summary>
    /// Reference pixels at or below this gray level are ignored when estimating the
    /// overlay dimming factor: the current/reference ratio is too noisy to be useful there.
    /// </summary>
    private const byte MinReferenceLevel = 30;

    /// <summary>
    /// Pre-process an image for OCR using morphological white top-hat filtering.
    /// Isolates bright tooltip text, suppresses dim background text visible through overlay.
    /// Pipeline: grayscale → morphological top-hat → Otsu binary → upscale
    /// </summary>
    /// <param name="src">Image to process. 1-, 3- and 4-channel bitmaps are accepted.</param>
    /// <param name="kernelSize">Side length, in pixels, of the square top-hat structuring element.</param>
    /// <param name="upscale">Integer magnification applied to the result; values of 1 or less disable resizing.</param>
    /// <returns>A new bitmap with dark text on a light background. The caller owns and must dispose it.</returns>
    public static Bitmap PreprocessForOcr(Bitmap src, int kernelSize = 41, int upscale = 2)
    {
        using var mat = BitmapConverter.ToMat(src);
        using var gray = ToGray(mat);

        // Morphological white top-hat: isolates bright text on dark background
        using var kernel = Cv2.GetStructuringElement(MorphShapes.Rect, new OpenCvSharp.Size(kernelSize, kernelSize));
        using var tophat = new Mat();
        Cv2.MorphologyEx(gray, tophat, MorphTypes.TopHat, kernel);

        // Otsu picks the threshold automatically (the explicit 0 is ignored); BinaryInv maps
        // the bright top-hat response to 0 and everything else to 255: black text on white.
        using var binary = new Mat();
        Cv2.Threshold(tophat, binary, 0, 255, ThresholdTypes.BinaryInv | ThresholdTypes.Otsu);

        return ToUpscaledBitmap(binary, upscale);
    }

    /// <summary>
    /// Background-subtraction preprocessing: uses the reference frame to remove
    /// background bleed-through from the semi-transparent tooltip overlay.
    /// Pipeline: estimate dimming factor → subtract expected background → threshold → upscale
    /// </summary>
    /// <param name="tooltipCrop">Crop of the current frame containing the tooltip.</param>
    /// <param name="referenceCrop">Crop of the reference frame; expected to cover the same region as <paramref name="tooltipCrop"/>.</param>
    /// <param name="dimPercentile">Percentile (0-100) of the current/reference ratios used as the dimming factor.</param>
    /// <param name="textThresh">Residual gray level above which a pixel is classified as text.</param>
    /// <param name="upscale">Integer magnification applied to the result; values of 1 or less disable resizing.</param>
    /// <returns>
    /// A new bitmap with dark text on a light background. Falls back to
    /// <see cref="PreprocessForOcr"/> when the two crops differ in size or the reference
    /// has no pixel bright enough to estimate the dimming factor.
    /// </returns>
    public static Bitmap PreprocessWithBackgroundSub(Bitmap tooltipCrop, Bitmap referenceCrop,
        int dimPercentile = 25, int textThresh = 30, int upscale = 2)
    {
        using var curMat = BitmapConverter.ToMat(tooltipCrop);
        using var refMat = BitmapConverter.ToMat(referenceCrop);
        using var curGray = ToGray(curMat);
        using var refGray = ToGray(refMat);

        int rows = curGray.Rows, cols = curGray.Cols;

        // The loops below walk both images through raw pointers using the tooltip's
        // dimensions; a reference of a different size would be read out of bounds.
        if (refGray.Rows != rows || refGray.Cols != cols)
        {
            return PreprocessForOcr(tooltipCrop, upscale: upscale); // fallback
        }

        double? estimatedDim = EstimateDimFactor(curGray, refGray, dimPercentile);
        if (estimatedDim is null)
        {
            return PreprocessForOcr(tooltipCrop, upscale: upscale); // fallback
        }

        double dimFactor = estimatedDim.Value;

        // Subtract expected background: text_signal = current - reference × dimFactor
        using var textSignal = new Mat(rows, cols, MatType.CV_8UC1);
        unsafe
        {
            byte* curPtr = (byte*)curGray.Data;
            byte* refPtr = (byte*)refGray.Data;
            byte* outPtr = (byte*)textSignal.Data;
            int curStep = (int)curGray.Step();
            int refStep = (int)refGray.Step();
            int outStep = (int)textSignal.Step();

            for (int y = 0; y < rows; y++)
            {
                byte* curRow = curPtr + y * curStep;
                byte* refRow = refPtr + y * refStep;
                byte* outRow = outPtr + y * outStep;

                for (int x = 0; x < cols; x++)
                {
                    double expected = refRow[x] * dimFactor;
                    double signal = curRow[x] - expected;
                    outRow[x] = (byte)Math.Clamp(signal, 0, 255);
                }
            }
        }

        // Threshold: pixels above textThresh are text (rendered black by BinaryInv)
        using var binary = new Mat();
        Cv2.Threshold(textSignal, binary, textThresh, 255, ThresholdTypes.BinaryInv);

        return ToUpscaledBitmap(binary, upscale);
    }

    /// <summary>
    /// Converts a Mat produced from a bitmap to single-channel grayscale, choosing the
    /// colour conversion from the actual channel count instead of assuming BGRA.
    /// </summary>
    /// <returns>A new Mat owned by the caller.</returns>
    private static Mat ToGray(Mat source)
    {
        var gray = new Mat();
        try
        {
            switch (source.Channels())
            {
                case 1:
                    // Already single-channel: copy so the caller can dispose it independently.
                    source.CopyTo(gray);
                    break;
                case 3:
                    Cv2.CvtColor(source, gray, ColorConversionCodes.BGR2GRAY);
                    break;
                default:
                    Cv2.CvtColor(source, gray, ColorConversionCodes.BGRA2GRAY);
                    break;
            }

            return gray;
        }
        catch
        {
            gray.Dispose();
            throw;
        }
    }

    /// <summary>
    /// Estimates the dimming factor of the tooltip overlay from two equally sized
    /// grayscale images. For non-text pixels: current ≈ reference × dim_factor.
    /// </summary>
    /// <returns>
    /// The factor clamped to [0.05, 0.95], or <c>null</c> when no reference pixel is
    /// brighter than <see cref="MinReferenceLevel"/>.
    /// </returns>
    private static double? EstimateDimFactor(Mat curGray, Mat refGray, int dimPercentile)
    {
        int rows = curGray.Rows, cols = curGray.Cols;

        // Collect ratios where reference is bright enough to be meaningful
        var ratios = new List<double>();
        unsafe
        {
            byte* curPtr = (byte*)curGray.Data;
            byte* refPtr = (byte*)refGray.Data;
            int curStep = (int)curGray.Step();
            int refStep = (int)refGray.Step();

            for (int y = 0; y < rows; y++)
            {
                byte* curRow = curPtr + y * curStep;
                byte* refRow = refPtr + y * refStep;

                for (int x = 0; x < cols; x++)
                {
                    byte r = refRow[x];
                    if (r > MinReferenceLevel) // skip very dark reference pixels (no signal)
                    {
                        ratios.Add((double)curRow[x] / r);
                    }
                }
            }
        }

        if (ratios.Count == 0)
        {
            return null;
        }

        // Use a low percentile of ratios as the dimming factor.
        // Text pixels have high ratios (bright on dark), overlay pixels have low ratios.
        // A low percentile captures the overlay dimming, ignoring text.
        ratios.Sort();

        // 64-bit arithmetic: Count × percentile can exceed int.MaxValue on large crops.
        long rawIndex = (long)ratios.Count * dimPercentile / 100;
        int idx = (int)Math.Clamp(rawIndex, 0L, (long)(ratios.Count - 1));

        // Clamp to sane range
        return Math.Clamp(ratios[idx], 0.05, 0.95);
    }

    /// <summary>
    /// Converts the binarized Mat to a bitmap, first enlarging it by <paramref name="upscale"/>
    /// (bicubic) when the factor is greater than 1. Upscaling is for better LSTM recognition.
    /// </summary>
    private static Bitmap ToUpscaledBitmap(Mat binary, int upscale)
    {
        if (upscale <= 1)
        {
            return BitmapConverter.ToBitmap(binary);
        }

        using var upscaled = new Mat();
        var targetSize = new OpenCvSharp.Size(binary.Width * upscale, binary.Height * upscale);
        Cv2.Resize(binary, upscaled, targetSize, interpolation: InterpolationFlags.Cubic);
        return BitmapConverter.ToBitmap(upscaled);
    }
}