// poe2-bot/tools/OcrDaemon/ImagePreprocessor.cs

namespace OcrDaemon;

using System.Drawing;
using OpenCvSharp;
using OpenCvSharp.Extensions;

/// <summary>
/// Image preprocessing helpers that turn screenshot crops into high-contrast,
/// dark-text-on-white images for OCR, plus a projection-based text line detector.
/// </summary>
static class ImagePreprocessor
{
    /// <summary>Top-hat kernel size used when background subtraction falls back to <see cref="TopHatBinarize"/>.</summary>
    private const int FallbackKernelSize = 41;

    /// <summary>Reference pixels at or below this gray level carry no usable signal for the dimming estimate.</summary>
    private const byte MinReferenceLevel = 30;

    /// <summary>Pixels darker than this gray level count as text in <see cref="DetectTextLines"/>.</summary>
    private const byte DarkPixelLevel = 128;

    /// <summary>
    /// Pre-process an image for OCR using morphological white top-hat filtering.
    /// Isolates bright tooltip text, suppresses dim background text visible through overlay.
    /// Pipeline: grayscale → morphological top-hat → Otsu binary (inverted) → upscale
    /// </summary>
    /// <param name="src">Source image; 1-, 3- and 4-channel bitmaps are accepted.</param>
    /// <param name="kernelSize">Side length of the square structuring element used by the top-hat.</param>
    /// <param name="upscale">Integer scale factor; values of 1 or less leave the size unchanged.</param>
    /// <returns>A new single-channel bitmap with black text on a white background (caller must dispose).</returns>
    public static Bitmap PreprocessForOcr(Bitmap src, int kernelSize = 41, int upscale = 2)
    {
        using var gray = ToGray(src);
        using var processed = TopHatBinarize(gray, kernelSize, upscale);
        return BitmapConverter.ToBitmap(processed);
    }

    /// <summary>
    /// Background-subtraction preprocessing: uses the reference frame to remove
    /// background bleed-through from the semi-transparent tooltip overlay.
    /// Pipeline: estimate dimming factor → subtract expected background → threshold → upscale
    /// Returns the upscaled Mat directly (caller must dispose).
    /// </summary>
    /// <param name="tooltipCrop">Crop of the current frame containing the tooltip.</param>
    /// <param name="referenceCrop">
    /// Crop of the reference frame covering the same region. It must be at least as large as
    /// <paramref name="tooltipCrop"/>; if it is larger, only its top-left region is compared.
    /// </param>
    /// <param name="dimPercentile">Percentile (0-100) of the current/reference ratios used as the overlay dimming factor.</param>
    /// <param name="textThresh">Signal level above the expected background that a pixel must exceed to count as text.</param>
    /// <param name="upscale">Integer scale factor; values of 1 or less leave the size unchanged.</param>
    /// <param name="softThreshold">
    /// <c>true</c> for a contrast-stretched grayscale result; <c>false</c> for a hard binary threshold.
    /// </param>
    /// <returns>A new 8-bit single-channel Mat with dark text on a white background.</returns>
    /// <exception cref="ArgumentException">
    /// <paramref name="referenceCrop"/> is smaller than <paramref name="tooltipCrop"/> in either dimension.
    /// </exception>
    public static Mat PreprocessWithBackgroundSubMat(Bitmap tooltipCrop, Bitmap referenceCrop,
        int dimPercentile = 25, int textThresh = 30, int upscale = 2, bool softThreshold = true)
    {
        using var curGray = ToGray(tooltipCrop);
        using var refGray = ToGray(referenceCrop);

        // The pixel loops below index the reference with the tooltip's dimensions through raw
        // pointers, so a smaller reference would be read out of bounds.
        if (refGray.Rows < curGray.Rows || refGray.Cols < curGray.Cols)
        {
            throw new ArgumentException(
                $"Reference crop ({refGray.Cols}x{refGray.Rows}) must be at least as large as the tooltip crop ({curGray.Cols}x{curGray.Rows}).",
                nameof(referenceCrop));
        }

        if (!TryEstimateDimFactor(curGray, refGray, dimPercentile, out double dimFactor))
        {
            // No reference pixel was bright enough to estimate the dimming: fall back to top-hat.
            return TopHatBinarize(curGray, FallbackKernelSize, upscale);
        }

        using var textSignal = SubtractBackground(curGray, refGray, dimFactor);
        using var thresholded = softThreshold
            ? SoftThreshold(textSignal, textThresh)
            : HardThreshold(textSignal, textThresh);

        return UpscaleMat(thresholded, upscale);
    }

    /// <summary>
    /// Background-subtraction preprocessing returning a Bitmap (convenience wrapper around
    /// <see cref="PreprocessWithBackgroundSubMat"/>; same parameters and exceptions).
    /// </summary>
    /// <returns>A new bitmap (caller must dispose).</returns>
    public static Bitmap PreprocessWithBackgroundSub(Bitmap tooltipCrop, Bitmap referenceCrop,
        int dimPercentile = 25, int textThresh = 30, int upscale = 2, bool softThreshold = true)
    {
        using var mat = PreprocessWithBackgroundSubMat(tooltipCrop, referenceCrop, dimPercentile, textThresh, upscale, softThreshold);
        return BitmapConverter.ToBitmap(mat);
    }

    /// <summary>
    /// Detect text lines via horizontal projection on a binary image.
    /// Binary should be inverted: text=black(0), background=white(255).
    /// </summary>
    /// <param name="binary">8-bit single-channel image (CV_8UC1).</param>
    /// <param name="minRowPixels">Minimum number of dark pixels for a row to count as part of a line.</param>
    /// <param name="gapTolerance">Maximum run of inactive rows tolerated inside a single line.</param>
    /// <returns>List of inclusive (yStart, yEnd) row ranges, one per detected text line, top to bottom.</returns>
    /// <exception cref="ArgumentNullException"><paramref name="binary"/> is null.</exception>
    /// <exception cref="ArgumentException"><paramref name="binary"/> is not CV_8UC1.</exception>
    public static List<(int yStart, int yEnd)> DetectTextLines(
        Mat binary, int minRowPixels = 2, int gapTolerance = 5)
    {
        if (binary is null)
        {
            throw new ArgumentNullException(nameof(binary));
        }

        // The scan below reads one byte per pixel; any other layout would be silently miscounted.
        if (binary.Type() != MatType.CV_8UC1)
        {
            throw new ArgumentException(
                $"Expected an 8-bit single-channel Mat (CV_8UC1) but got {binary.Type()}.", nameof(binary));
        }

        int rows = binary.Rows, cols = binary.Cols;

        // Count dark (text) pixels per row — use a mid-gray threshold since
        // cubic upscaling introduces anti-aliased intermediate values
        var rowCounts = new int[rows];
        unsafe
        {
            byte* basePtr = (byte*)binary.Data;
            long step = binary.Step();
            for (int y = 0; y < rows; y++)
            {
                byte* row = basePtr + y * step;
                int dark = 0;
                for (int x = 0; x < cols; x++)
                {
                    if (row[x] < DarkPixelLevel)
                    {
                        dark++;
                    }
                }
                rowCounts[y] = dark;
            }
        }

        // Group active rows into runs; a run is closed only once the gap since the
        // last active row exceeds gapTolerance.
        var lines = new List<(int yStart, int yEnd)>();
        int lineStart = -1, lastActive = -1;
        for (int y = 0; y < rows; y++)
        {
            if (rowCounts[y] >= minRowPixels)
            {
                if (lineStart < 0)
                {
                    lineStart = y;
                }
                lastActive = y;
            }
            else if (lineStart >= 0 && y - lastActive > gapTolerance)
            {
                lines.Add((lineStart, lastActive));
                lineStart = -1;
            }
        }

        // Flush a line that is still open at the bottom edge.
        if (lineStart >= 0)
        {
            lines.Add((lineStart, lastActive));
        }

        return lines;
    }

    /// <summary>
    /// Converts a bitmap to a new 8-bit grayscale Mat (caller must dispose).
    /// Single-channel input is copied as-is because CvtColor only accepts 3 or 4 source channels.
    /// </summary>
    private static Mat ToGray(Bitmap bitmap)
    {
        using var color = BitmapConverter.ToMat(bitmap);

        int channels = color.Channels();
        if (channels == 1)
        {
            return color.Clone();
        }

        var gray = new Mat();
        Cv2.CvtColor(color, gray,
            channels == 4 ? ColorConversionCodes.BGRA2GRAY : ColorConversionCodes.BGR2GRAY);
        return gray;
    }

    /// <summary>
    /// Top-hat → inverted Otsu threshold → upscale on a grayscale Mat.
    /// Returns a new Mat (caller must dispose). Does NOT dispose <paramref name="gray"/>.
    /// </summary>
    private static Mat TopHatBinarize(Mat gray, int kernelSize, int upscale)
    {
        // Morphological white top-hat: isolates bright text on dark background
        using var kernel = Cv2.GetStructuringElement(MorphShapes.Rect, new OpenCvSharp.Size(kernelSize, kernelSize));
        using var tophat = new Mat();
        Cv2.MorphologyEx(gray, tophat, MorphTypes.TopHat, kernel);

        // Otsu binarization: automatic threshold, black text on white
        using var binary = new Mat();
        Cv2.Threshold(tophat, binary, 0, 255, ThresholdTypes.BinaryInv | ThresholdTypes.Otsu);

        // Upscale for better LSTM recognition
        return UpscaleMat(binary, upscale);
    }

    /// <summary>
    /// Estimates the overlay dimming factor as a low percentile of the per-pixel
    /// current/reference ratios, clamped to [0.05, 0.95].
    /// Returns false when no reference pixel is bright enough to produce a ratio.
    /// </summary>
    private static bool TryEstimateDimFactor(Mat curGray, Mat refGray, int dimPercentile, out double dimFactor)
    {
        int rows = curGray.Rows, cols = curGray.Cols;

        // For non-text pixels: current ≈ reference × dim_factor.
        // Collect ratios where the reference is bright enough to be meaningful.
        var ratios = new List<double>();
        unsafe
        {
            byte* curBase = (byte*)curGray.Data;
            byte* refBase = (byte*)refGray.Data;
            long curStep = curGray.Step();
            long refStep = refGray.Step();

            for (int y = 0; y < rows; y++)
            {
                byte* curRow = curBase + y * curStep;
                byte* refRow = refBase + y * refStep;
                for (int x = 0; x < cols; x++)
                {
                    byte r = refRow[x];
                    if (r > MinReferenceLevel)
                    {
                        ratios.Add((double)curRow[x] / r);
                    }
                }
            }
        }

        if (ratios.Count == 0)
        {
            dimFactor = 0;
            return false;
        }

        // Text pixels have high ratios (bright on dark), overlay pixels have low ratios,
        // so a low percentile captures the overlay dimming while ignoring text.
        ratios.Sort();

        // 64-bit intermediate: Count * dimPercentile can exceed int.MaxValue.
        long rawIndex = (long)ratios.Count * dimPercentile / 100;
        int index = (int)Math.Clamp(rawIndex, 0L, (long)(ratios.Count - 1));

        // Clamp to sane range
        dimFactor = Math.Clamp(ratios[index], 0.05, 0.95);
        return true;
    }

    /// <summary>
    /// Computes text_signal = current − reference × dimFactor per pixel, saturated to [0, 255].
    /// Returns a new CV_8UC1 Mat sized like <paramref name="curGray"/> (caller must dispose).
    /// </summary>
    private static Mat SubtractBackground(Mat curGray, Mat refGray, double dimFactor)
    {
        int rows = curGray.Rows, cols = curGray.Cols;
        var textSignal = new Mat(rows, cols, MatType.CV_8UC1);
        unsafe
        {
            byte* curBase = (byte*)curGray.Data;
            byte* refBase = (byte*)refGray.Data;
            byte* outBase = (byte*)textSignal.Data;
            long curStep = curGray.Step();
            long refStep = refGray.Step();
            long outStep = textSignal.Step();

            for (int y = 0; y < rows; y++)
            {
                byte* curRow = curBase + y * curStep;
                byte* refRow = refBase + y * refStep;
                byte* outRow = outBase + y * outStep;
                for (int x = 0; x < cols; x++)
                {
                    double expected = refRow[x] * dimFactor;
                    double signal = curRow[x] - expected;
                    outRow[x] = (byte)Math.Clamp(signal, 0, 255);
                }
            }
        }

        return textSignal;
    }

    /// <summary>
    /// Soft threshold: clip below textThresh, contrast-stretch, invert.
    /// Produces grayscale anti-aliased text on white background,
    /// matching the training data format (text2image renders).
    /// Returns a new CV_8UC1 Mat (caller must dispose).
    /// </summary>
    private static Mat SoftThreshold(Mat textSignal, int textThresh)
    {
        int rows = textSignal.Rows, cols = textSignal.Cols;
        var result = new Mat(rows, cols, MatType.CV_8UC1);
        unsafe
        {
            byte* srcBase = (byte*)textSignal.Data;
            byte* dstBase = (byte*)result.Data;
            long srcStep = textSignal.Step();
            long dstStep = result.Step();

            // Find max signal above threshold for contrast stretch.
            // Starts at 1 so the division below can never be by zero.
            int maxClipped = 1;
            for (int y = 0; y < rows; y++)
            {
                byte* srcRow = srcBase + y * srcStep;
                for (int x = 0; x < cols; x++)
                {
                    int val = srcRow[x] - textThresh;
                    if (val > maxClipped)
                    {
                        maxClipped = val;
                    }
                }
            }

            // Clip, stretch, invert: background → 255 (white), text → dark
            for (int y = 0; y < rows; y++)
            {
                byte* srcRow = srcBase + y * srcStep;
                byte* dstRow = dstBase + y * dstStep;
                for (int x = 0; x < cols; x++)
                {
                    int clipped = srcRow[x] - textThresh;
                    if (clipped <= 0)
                    {
                        dstRow[x] = 255; // background
                    }
                    else
                    {
                        int stretched = clipped * 255 / maxClipped;
                        dstRow[x] = (byte)(255 - stretched); // invert
                    }
                }
            }
        }

        return result;
    }

    /// <summary>
    /// Hard inverted binary threshold at <paramref name="textThresh"/>.
    /// Returns a new Mat (caller must dispose).
    /// </summary>
    private static Mat HardThreshold(Mat textSignal, int textThresh)
    {
        var result = new Mat();
        Cv2.Threshold(textSignal, result, textThresh, 255, ThresholdTypes.BinaryInv);
        return result;
    }

    /// <summary>
    /// Cubic upscale by an integer factor; a factor of 1 or less yields a plain copy.
    /// Returns a new Mat (caller must dispose). Does NOT dispose src.
    /// </summary>
    private static Mat UpscaleMat(Mat src, int factor)
    {
        if (factor <= 1)
        {
            return src.Clone();
        }

        var upscaled = new Mat();
        Cv2.Resize(src, upscaled, new OpenCvSharp.Size(src.Width * factor, src.Height * factor),
            interpolation: InterpolationFlags.Cubic);
        return upscaled;
    }
}