poe2-bot/tools/OcrDaemon/ImagePreprocessor.cs
2026-02-11 17:42:28 -05:00

234 lines
9.3 KiB
C#
Raw Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

namespace OcrDaemon;
using System.Drawing;
using OpenCvSharp;
using OpenCvSharp.Extensions;
/// <summary>
/// Image preprocessing helpers that turn tooltip screenshots into
/// dark-text-on-white images suitable for OCR, plus a simple text-line detector.
/// </summary>
static class ImagePreprocessor
{
    /// <summary>Reference pixels at or below this brightness carry too little signal to estimate dimming.</summary>
    private const int MinReferenceBrightness = 30;

    /// <summary>Top-hat kernel size used when background subtraction has to fall back to top-hat filtering.</summary>
    private const int FallbackKernelSize = 41;

    /// <summary>
    /// Pre-process an image for OCR using morphological white top-hat filtering.
    /// Isolates bright tooltip text, suppresses dim background text visible through overlay.
    /// Pipeline: grayscale -> morphological top-hat -> Otsu binary (inverted) -> upscale.
    /// </summary>
    /// <param name="src">Source image.</param>
    /// <param name="kernelSize">Side length, in pixels, of the square structuring element.</param>
    /// <param name="upscale">Integer upscale factor; values of 1 or less leave the size unchanged.</param>
    /// <returns>A new bitmap with black text on a white background (caller must dispose).</returns>
    /// <exception cref="ArgumentNullException"><paramref name="src"/> is null.</exception>
    public static Bitmap PreprocessForOcr(Bitmap src, int kernelSize = 41, int upscale = 2)
    {
        ArgumentNullException.ThrowIfNull(src);

        using var processed = TopHatBinarize(src, kernelSize, upscale);
        return BitmapConverter.ToBitmap(processed);
    }

    /// <summary>
    /// Background-subtraction preprocessing: uses the reference frame to remove
    /// background bleed-through from the semi-transparent tooltip overlay.
    /// Pipeline: estimate dimming factor -> subtract expected background -> threshold -> upscale.
    /// Falls back to top-hat preprocessing when the reference has no pixel bright
    /// enough to estimate the dimming factor.
    /// </summary>
    /// <param name="tooltipCrop">Crop of the current frame containing the tooltip.</param>
    /// <param name="referenceCrop">
    /// Crop of the same region without the tooltip. Must be at least as large as
    /// <paramref name="tooltipCrop"/>; pixels are compared starting from the top-left corner.
    /// </param>
    /// <param name="dimPercentile">Percentile (0-100) of the current/reference ratios used as the dimming factor.</param>
    /// <param name="textThresh">Signal level at or below which a pixel is treated as background.</param>
    /// <param name="upscale">Integer upscale factor; values of 1 or less leave the size unchanged.</param>
    /// <param name="softThreshold">
    /// True for a contrast-stretched grayscale result, false for a hard binary threshold.
    /// </param>
    /// <returns>The processed single-channel Mat, dark text on white (caller must dispose).</returns>
    /// <exception cref="ArgumentNullException">Either bitmap is null.</exception>
    /// <exception cref="ArgumentException">
    /// <paramref name="referenceCrop"/> is smaller than <paramref name="tooltipCrop"/> in either dimension.
    /// </exception>
    public static Mat PreprocessWithBackgroundSubMat(Bitmap tooltipCrop, Bitmap referenceCrop,
        int dimPercentile = 25, int textThresh = 30, int upscale = 2, bool softThreshold = true)
    {
        ArgumentNullException.ThrowIfNull(tooltipCrop);
        ArgumentNullException.ThrowIfNull(referenceCrop);

        // The pixel loops walk the reference buffer using the tooltip's dimensions;
        // a smaller reference would be read past the end of its native buffer.
        if (referenceCrop.Width < tooltipCrop.Width || referenceCrop.Height < tooltipCrop.Height)
        {
            throw new ArgumentException(
                $"Reference crop ({referenceCrop.Width}x{referenceCrop.Height}) must be at least as large as " +
                $"the tooltip crop ({tooltipCrop.Width}x{tooltipCrop.Height}).",
                nameof(referenceCrop));
        }

        using var curMat = BitmapConverter.ToMat(tooltipCrop);
        using var refMat = BitmapConverter.ToMat(referenceCrop);
        using var curGray = ToGray(curMat);
        using var refGray = ToGray(refMat);

        double? dimFactor = EstimateDimFactor(curGray, refGray, dimPercentile);
        if (dimFactor is null)
        {
            // No usable reference pixels: fall back to top-hat preprocessing.
            // NOTE(review): the Mat is returned directly instead of round-tripping
            // through a Bitmap; assumes that round trip is lossless for 8-bit
            // single-channel data - confirm against the converter's behaviour.
            return TopHatBinarize(tooltipCrop, FallbackKernelSize, upscale);
        }

        using var textSignal = SubtractBackground(curGray, refGray, dimFactor.Value);
        using var thresholded = softThreshold
            ? SoftThreshold(textSignal, textThresh)
            : HardThreshold(textSignal, textThresh);

        return UpscaleMat(thresholded, upscale);
    }

    /// <summary>
    /// Background-subtraction preprocessing returning a Bitmap (convenience wrapper around
    /// <see cref="PreprocessWithBackgroundSubMat"/>).
    /// </summary>
    /// <returns>A new bitmap (caller must dispose).</returns>
    public static Bitmap PreprocessWithBackgroundSub(Bitmap tooltipCrop, Bitmap referenceCrop,
        int dimPercentile = 25, int textThresh = 30, int upscale = 2, bool softThreshold = true)
    {
        using var mat = PreprocessWithBackgroundSubMat(
            tooltipCrop, referenceCrop, dimPercentile, textThresh, upscale, softThreshold);
        return BitmapConverter.ToBitmap(mat);
    }

    /// <summary>
    /// Detect text lines via horizontal projection on a binary image.
    /// Binary should be inverted: text=black(0), background=white(255).
    /// </summary>
    /// <param name="binary">
    /// Image to scan. The first <c>Cols</c> bytes of each row are read as 8-bit pixels,
    /// so an 8-bit single-channel Mat is assumed (not validated here).
    /// </param>
    /// <param name="minRowPixels">Minimum number of dark pixels for a row to count as text.</param>
    /// <param name="gapTolerance">Number of consecutive non-text rows tolerated inside one line.</param>
    /// <returns>List of inclusive (yStart, yEnd) row ranges, one per detected text line, top to bottom.</returns>
    /// <exception cref="ArgumentNullException"><paramref name="binary"/> is null.</exception>
    public static List<(int yStart, int yEnd)> DetectTextLines(
        Mat binary, int minRowPixels = 2, int gapTolerance = 5)
    {
        ArgumentNullException.ThrowIfNull(binary);

        int rows = binary.Rows, cols = binary.Cols;

        // Count dark (text) pixels per row - use a < 128 threshold since
        // cubic upscaling introduces anti-aliased intermediate values.
        var rowCounts = new int[rows];
        unsafe
        {
            byte* basePtr = (byte*)binary.Data;
            long step = binary.Step();
            for (int y = 0; y < rows; y++)
            {
                // 64-bit row offset so large images cannot wrap a 32-bit index.
                byte* row = basePtr + y * step;
                int dark = 0;
                for (int x = 0; x < cols; x++)
                {
                    if (row[x] < 128)
                    {
                        dark++;
                    }
                }

                rowCounts[y] = dark;
            }
        }

        // Group active rows into runs, bridging gaps of up to gapTolerance rows.
        var lines = new List<(int yStart, int yEnd)>();
        int lineStart = -1, lastActive = -1;
        for (int y = 0; y < rows; y++)
        {
            if (rowCounts[y] >= minRowPixels)
            {
                if (lineStart < 0)
                {
                    lineStart = y;
                }

                lastActive = y;
            }
            else if (lineStart >= 0 && y - lastActive > gapTolerance)
            {
                lines.Add((lineStart, lastActive));
                lineStart = -1;
            }
        }

        // Flush a line that runs to the bottom of the image.
        if (lineStart >= 0)
        {
            lines.Add((lineStart, lastActive));
        }

        return lines;
    }

    /// <summary>
    /// Top-hat pipeline shared by <see cref="PreprocessForOcr"/> and the background-subtraction fallback.
    /// Returns a new Mat (caller must dispose).
    /// </summary>
    private static Mat TopHatBinarize(Bitmap src, int kernelSize, int upscale)
    {
        using var mat = BitmapConverter.ToMat(src);
        using var gray = ToGray(mat);

        // Morphological white top-hat: isolates bright text on dark background.
        using var kernel = Cv2.GetStructuringElement(
            MorphShapes.Rect, new OpenCvSharp.Size(kernelSize, kernelSize));
        using var tophat = new Mat();
        Cv2.MorphologyEx(gray, tophat, MorphTypes.TopHat, kernel);

        // Otsu binarization with inversion: automatic threshold, black text on white.
        using var binary = new Mat();
        Cv2.Threshold(tophat, binary, 0, 255, ThresholdTypes.BinaryInv | ThresholdTypes.Otsu);

        // Upscale for better LSTM recognition.
        return UpscaleMat(binary, upscale);
    }

    /// <summary>
    /// Converts a Mat to single-channel grayscale. Returns a new Mat (caller must dispose).
    /// A single-channel input is copied as-is; any other channel count is handed to
    /// <c>Cv2.CvtColor</c>, which throws if it cannot handle it.
    /// </summary>
    private static Mat ToGray(Mat src)
    {
        int channels = src.Channels();
        if (channels == 1)
        {
            // Already single-channel; a colour conversion would be rejected.
            return src.Clone();
        }

        var gray = new Mat();
        try
        {
            var code = channels == 3 ? ColorConversionCodes.BGR2GRAY : ColorConversionCodes.BGRA2GRAY;
            Cv2.CvtColor(src, gray, code);
            return gray;
        }
        catch
        {
            gray.Dispose();
            throw;
        }
    }

    /// <summary>
    /// Estimates the dimming factor of the tooltip overlay.
    /// For non-text pixels: current ~= reference * dimFactor. Text pixels have high ratios
    /// (bright on dark) and overlay pixels low ratios, so a low percentile captures the
    /// overlay dimming while ignoring text.
    /// </summary>
    /// <returns>
    /// The factor clamped to [0.05, 0.95], or null when no reference pixel is bright
    /// enough to give a meaningful ratio.
    /// </returns>
    private static unsafe double? EstimateDimFactor(Mat curGray, Mat refGray, int dimPercentile)
    {
        int rows = curGray.Rows, cols = curGray.Cols;
        byte* curBase = (byte*)curGray.Data;
        byte* refBase = (byte*)refGray.Data;
        long curStep = curGray.Step();
        long refStep = refGray.Step();

        var ratios = new List<double>();
        for (int y = 0; y < rows; y++)
        {
            byte* curRow = curBase + y * curStep;
            byte* refRow = refBase + y * refStep;
            for (int x = 0; x < cols; x++)
            {
                byte reference = refRow[x];

                // Skip very dark reference pixels (no signal, and avoids dividing by zero).
                if (reference > MinReferenceBrightness)
                {
                    ratios.Add((double)curRow[x] / reference);
                }
            }
        }

        if (ratios.Count == 0)
        {
            return null;
        }

        ratios.Sort();

        // 64-bit product so a large pixel count times the percentile cannot overflow.
        long scaled = (long)ratios.Count * dimPercentile / 100;
        int index = (int)Math.Clamp(scaled, 0L, ratios.Count - 1L);

        // Clamp to a sane range.
        return Math.Clamp(ratios[index], 0.05, 0.95);
    }

    /// <summary>
    /// Computes text_signal = current - reference * dimFactor per pixel, saturated to [0, 255].
    /// Returns a new CV_8UC1 Mat the size of <paramref name="curGray"/> (caller must dispose).
    /// </summary>
    private static unsafe Mat SubtractBackground(Mat curGray, Mat refGray, double dimFactor)
    {
        int rows = curGray.Rows, cols = curGray.Cols;
        var signal = new Mat(rows, cols, MatType.CV_8UC1);

        byte* curBase = (byte*)curGray.Data;
        byte* refBase = (byte*)refGray.Data;
        byte* outBase = (byte*)signal.Data;
        long curStep = curGray.Step();
        long refStep = refGray.Step();
        long outStep = signal.Step();

        for (int y = 0; y < rows; y++)
        {
            byte* curRow = curBase + y * curStep;
            byte* refRow = refBase + y * refStep;
            byte* outRow = outBase + y * outStep;
            for (int x = 0; x < cols; x++)
            {
                double expected = refRow[x] * dimFactor;
                double value = curRow[x] - expected;
                outRow[x] = (byte)Math.Clamp(value, 0.0, 255.0);
            }
        }

        return signal;
    }

    /// <summary>
    /// Soft threshold: clip below <paramref name="textThresh"/>, contrast-stretch, invert.
    /// Produces grayscale anti-aliased text on white background, matching the
    /// training data format (text2image renders).
    /// Returns a new CV_8UC1 Mat (caller must dispose).
    /// </summary>
    private static unsafe Mat SoftThreshold(Mat textSignal, int textThresh)
    {
        int rows = textSignal.Rows, cols = textSignal.Cols;
        var result = new Mat(rows, cols, MatType.CV_8UC1);

        byte* srcBase = (byte*)textSignal.Data;
        byte* dstBase = (byte*)result.Data;
        long srcStep = textSignal.Step();
        long dstStep = result.Step();

        // Find the max signal above the threshold for the contrast stretch.
        // Starts at 1 so the division below is never by zero.
        int maxClipped = 1;
        for (int y = 0; y < rows; y++)
        {
            byte* srcRow = srcBase + y * srcStep;
            for (int x = 0; x < cols; x++)
            {
                int clipped = srcRow[x] - textThresh;
                if (clipped > maxClipped)
                {
                    maxClipped = clipped;
                }
            }
        }

        // Clip, stretch, invert: background -> 255 (white), text -> dark.
        for (int y = 0; y < rows; y++)
        {
            byte* srcRow = srcBase + y * srcStep;
            byte* dstRow = dstBase + y * dstStep;
            for (int x = 0; x < cols; x++)
            {
                int clipped = srcRow[x] - textThresh;
                if (clipped <= 0)
                {
                    dstRow[x] = 255; // background
                }
                else
                {
                    int stretched = clipped * 255 / maxClipped;
                    dstRow[x] = (byte)(255 - stretched); // invert
                }
            }
        }

        return result;
    }

    /// <summary>
    /// Hard inverted binary threshold at <paramref name="textThresh"/>.
    /// Returns a new Mat (caller must dispose).
    /// </summary>
    private static Mat HardThreshold(Mat textSignal, int textThresh)
    {
        var result = new Mat();
        try
        {
            Cv2.Threshold(textSignal, result, textThresh, 255, ThresholdTypes.BinaryInv);
            return result;
        }
        catch
        {
            result.Dispose();
            throw;
        }
    }

    /// <summary>
    /// Cubic upscale by an integer factor; a factor of 1 or less yields a plain copy.
    /// Returns a new Mat (caller must dispose). Does NOT dispose src.
    /// </summary>
    private static Mat UpscaleMat(Mat src, int factor)
    {
        if (factor <= 1)
        {
            return src.Clone();
        }

        var upscaled = new Mat();
        try
        {
            Cv2.Resize(src, upscaled, new OpenCvSharp.Size(src.Width * factor, src.Height * factor),
                interpolation: InterpolationFlags.Cubic);
            return upscaled;
        }
        catch
        {
            upscaled.Dispose();
            throw;
        }
    }
}