diff --git a/tools/OcrDaemon/ImagePreprocessor.cs b/tools/OcrDaemon/ImagePreprocessor.cs index ecbce81..44605e3 100644 --- a/tools/OcrDaemon/ImagePreprocessor.cs +++ b/tools/OcrDaemon/ImagePreprocessor.cs @@ -11,7 +11,7 @@ static class ImagePreprocessor /// Isolates bright tooltip text, suppresses dim background text visible through overlay. /// Pipeline: grayscale → morphological top-hat → Otsu binary → upscale /// - public static Bitmap PreprocessForOcr(Bitmap src, int kernelSize = 25, int upscale = 2) + public static Bitmap PreprocessForOcr(Bitmap src, int kernelSize = 41, int upscale = 2) { using var mat = BitmapConverter.ToMat(src); using var gray = new Mat(); @@ -37,4 +37,121 @@ static class ImagePreprocessor return BitmapConverter.ToBitmap(binary); }
+
+    /// <summary>
+    /// Background-subtraction preprocessing: uses the reference frame to remove
+    /// background bleed-through from the semi-transparent tooltip overlay.
+    /// Pipeline: estimate dimming factor → subtract expected background → threshold → upscale
+    /// </summary>
+    /// <param name="tooltipCrop">Crop of the current frame that contains the tooltip.</param>
+    /// <param name="referenceCrop">Same region cropped from the reference frame; must match <paramref name="tooltipCrop"/> in size.</param>
+    /// <param name="dimPercentile">Percentile (0-100) of the current/reference brightness ratios taken as the dimming factor; the resulting index is clamped into range.</param>
+    /// <param name="textThresh">Residual brightness above which a pixel is treated as text.</param>
+    /// <param name="upscale">Integer scale factor for the result; values of 1 or less skip the resize.</param>
+    /// <returns>A new bitmap in which text pixels are dark on a white background (inverted binary threshold).</returns>
+    /// <exception cref="ArgumentNullException">Either bitmap is null.</exception>
+    /// <exception cref="ArgumentException">The two crops differ in width or height.</exception>
+    public static Bitmap PreprocessWithBackgroundSub(Bitmap tooltipCrop, Bitmap referenceCrop,
+        int dimPercentile = 25, int textThresh = 30, int upscale = 2)
+    {
+        ArgumentNullException.ThrowIfNull(tooltipCrop);
+        ArgumentNullException.ThrowIfNull(referenceCrop);
+
+        // The pixel loops below address the reference through raw pointers using the
+        // tooltip's dimensions, so a smaller reference would be read out of bounds.
+        if (tooltipCrop.Width != referenceCrop.Width || tooltipCrop.Height != referenceCrop.Height)
+            throw new ArgumentException(
+                $"Reference crop is {referenceCrop.Width}x{referenceCrop.Height} but tooltip crop is {tooltipCrop.Width}x{tooltipCrop.Height}.",
+                nameof(referenceCrop));
+
+        using var curMat = BitmapConverter.ToMat(tooltipCrop);
+        using var refMat = BitmapConverter.ToMat(referenceCrop);
+        using var curGray = new Mat();
+        using var refGray = new Mat();
+        Cv2.CvtColor(curMat, curGray, ColorConversionCodes.BGRA2GRAY);
+        Cv2.CvtColor(refMat, refGray, ColorConversionCodes.BGRA2GRAY);
+
+        int rows = curGray.Rows, cols = curGray.Cols;
+
+        // Reference pixels at or below this brightness carry no usable signal for the ratio.
+        const byte minRefBrightness = 30;
+
+        // Estimate the dimming factor of the tooltip overlay.
+        // For non-text pixels: current ≈ reference × dim_factor
+        // Collect ratios where reference is bright enough to be meaningful
+        var ratios = new List<double>();
+        unsafe
+        {
+            byte* curPtr = (byte*)curGray.Data;
+            byte* refPtr = (byte*)refGray.Data;
+            long curStep = curGray.Step();
+            long refStep = refGray.Step();
+
+            for (int y = 0; y < rows; y++)
+            {
+                byte* curRow = curPtr + y * curStep;
+                byte* refRow = refPtr + y * refStep;
+                for (int x = 0; x < cols; x++)
+                {
+                    byte r = refRow[x];
+                    if (r > minRefBrightness)
+                        ratios.Add((double)curRow[x] / r);
+                }
+            }
+        }
+
+        if (ratios.Count == 0)
+            return PreprocessForOcr(tooltipCrop, 41, upscale); // fallback: no reference pixel bright enough to estimate dimming
+
+        // Use a low percentile of ratios as the dimming factor.
+        // Text pixels have high ratios (bright on dark), overlay pixels have low ratios.
+        // A low percentile captures the overlay dimming, ignoring text.
+        ratios.Sort();
+        // Index computed in 64-bit so Count * dimPercentile cannot overflow on large crops.
+        int idx = (int)Math.Clamp((long)ratios.Count * dimPercentile / 100, 0, ratios.Count - 1);
+        // Clamp to sane range
+        double dimFactor = Math.Clamp(ratios[idx], 0.05, 0.95);
+
+        // Subtract expected background: text_signal = current - reference × dimFactor
+        using var textSignal = new Mat(rows, cols, MatType.CV_8UC1);
+        unsafe
+        {
+            byte* curPtr = (byte*)curGray.Data;
+            byte* refPtr = (byte*)refGray.Data;
+            byte* outPtr = (byte*)textSignal.Data;
+            long curStep = curGray.Step();
+            long refStep = refGray.Step();
+            long outStep = textSignal.Step();
+
+            for (int y = 0; y < rows; y++)
+            {
+                byte* curRow = curPtr + y * curStep;
+                byte* refRow = refPtr + y * refStep;
+                byte* outRow = outPtr + y * outStep;
+                for (int x = 0; x < cols; x++)
+                {
+                    double expected = refRow[x] * dimFactor;
+                    double signal = curRow[x] - expected;
+                    outRow[x] = (byte)Math.Clamp(signal, 0, 255);
+                }
+            }
+        }
+
+        // Threshold: residuals above textThresh are text. BinaryInv maps them to 0
+        // and every other pixel to 255.
+        using var binary = new Mat();
+        Cv2.Threshold(textSignal, binary, textThresh, 255, ThresholdTypes.BinaryInv);
+
+        // Upscale for better LSTM recognition
+        if (upscale > 1)
+        {
+            using var upscaled = new Mat();
+            Cv2.Resize(binary, upscaled,
+                new OpenCvSharp.Size(binary.Width * upscale, binary.Height * upscale),
+                interpolation: InterpolationFlags.Cubic);
+            return BitmapConverter.ToBitmap(upscaled);
+        }
+
+        return BitmapConverter.ToBitmap(binary);
+    }
 }
diff --git a/tools/OcrDaemon/Models.cs b/tools/OcrDaemon/Models.cs index ea5247b..344185a 100644 --- a/tools/OcrDaemon/Models.cs +++ b/tools/OcrDaemon/Models.cs @@ -212,10 +212,10 @@ class DetectGridResponse class DiffOcrParams { [JsonPropertyName("diffThresh")] - public int DiffThresh { get; set; } = 10; + public int DiffThresh { get; set; } = 20; [JsonPropertyName("rowThreshDiv")] - public int RowThreshDiv { get; set; } = 30; + public int RowThreshDiv { get; set; } = 40; [JsonPropertyName("colThreshDiv")] public int ColThreshDiv { get; set; } = 8; @@ -232,10 +232,21 @@ class DiffOcrParams [JsonPropertyName("upscale")] public int Upscale { get; set; } = 2; + [JsonPropertyName("useBackgroundSub")] + public bool UseBackgroundSub { get; set; } = true; + + [JsonPropertyName("dimPercentile")] + public int DimPercentile { get; set; } = 40; + + [JsonPropertyName("textThresh")] + public int TextThresh { get; set; } = 60; + public DiffOcrParams Clone() => (DiffOcrParams)MemberwiseClone(); public override string ToString() => - $"diffThresh={DiffThresh} rowThreshDiv={RowThreshDiv} colThreshDiv={ColThreshDiv} maxGap={MaxGap} trimCutoff={TrimCutoff:F2} kernelSize={KernelSize} upscale={Upscale}"; + UseBackgroundSub + ? 
$"bgSub dimPct={DimPercentile} textThresh={TextThresh} diffThresh={DiffThresh} rowThreshDiv={RowThreshDiv} colThreshDiv={ColThreshDiv} maxGap={MaxGap} trimCutoff={TrimCutoff:F2} upscale={Upscale}" + : $"topHat kernelSize={KernelSize} diffThresh={DiffThresh} rowThreshDiv={RowThreshDiv} colThreshDiv={ColThreshDiv} maxGap={MaxGap} trimCutoff={TrimCutoff:F2} upscale={Upscale}"; } class TestCase diff --git a/tools/OcrDaemon/OcrDaemon.csproj b/tools/OcrDaemon/OcrDaemon.csproj index 1f750d3..ffb6815 100644 --- a/tools/OcrDaemon/OcrDaemon.csproj +++ b/tools/OcrDaemon/OcrDaemon.csproj @@ -5,6 +5,7 @@ net8.0-windows10.0.19041.0 enable enable + true diff --git a/tools/OcrDaemon/OcrHandler.cs b/tools/OcrDaemon/OcrHandler.cs index bfda313..f5e8d76 100644 --- a/tools/OcrDaemon/OcrHandler.cs +++ b/tools/OcrDaemon/OcrHandler.cs @@ -55,10 +55,9 @@ class OcrHandler(TesseractEngine engine) return new OkResponse(); } - public object HandleDiffOcr(Request req) => HandleDiffOcr(req, new DiffOcrParams - { - DiffThresh = req.Threshold > 0 ? req.Threshold : 30, - });
+    /// <summary>Diff-OCR with default parameters; a positive req.Threshold replaces the default DiffThresh.</summary>
+    public object HandleDiffOcr(Request req) => HandleDiffOcr(req,
+        req.Threshold <= 0 ? new DiffOcrParams() : new DiffOcrParams { DiffThresh = req.Threshold });
public object HandleDiffOcr(Request req, DiffOcrParams p) { @@ -219,9 +218,9 @@ class OcrHandler(TesseractEngine engine) if (debug) Console.Error.WriteLine($" diff-ocr: tooltip region ({minX},{minY}) {rw}x{rh}"); - // Simple crop of the tooltip region from the current frame (no per-pixel masking). - // The top-hat preprocessing will handle suppressing background text. 
+ // Crop tooltip region from both current and reference frames using var cropped = current.Clone(new Rectangle(minX, minY, rw, rh), PixelFormat.Format32bppArgb); + using var refCropped = _referenceFrame.Clone(new Rectangle(minX, minY, rw, rh), PixelFormat.Format32bppArgb); // Save before/after preprocessing images if path is provided if (!string.IsNullOrEmpty(req.Path)) @@ -233,8 +232,10 @@ class OcrHandler(TesseractEngine engine) if (debug) Console.Error.WriteLine($" diff-ocr: saved raw to {req.Path}"); } - // Pre-process for OCR: top-hat + binarize + upscale - using var processed = ImagePreprocessor.PreprocessForOcr(cropped, p.KernelSize, p.Upscale); + // Pre-process for OCR + using var processed = p.UseBackgroundSub + ? ImagePreprocessor.PreprocessWithBackgroundSub(cropped, refCropped, p.DimPercentile, p.TextThresh, p.Upscale) + : ImagePreprocessor.PreprocessForOcr(cropped, p.KernelSize, p.Upscale); // Save fullscreen and preprocessed versions alongside raw if (!string.IsNullOrEmpty(req.Path)) @@ -266,35 +267,82 @@ class OcrHandler(TesseractEngine engine) public object HandleTune(Request req) { - // Coordinate descent: optimize one parameter at a time, repeat until stable. - var best = new DiffOcrParams(); - double bestScore = ScoreParams(best); - Console.Error.WriteLine($"\n=== Tuning start === baseline score={bestScore:F3} {best}\n"); + int totalEvals = 0; - // Define search ranges for each parameter - var sweeps = new (string Name, int[] Values, Action Set)[]
+        // Phase 1: tune the top-hat pipeline, starting from the class defaults.
+        Console.Error.WriteLine("\n========== Phase 1: Top-Hat ==========");
+        var topHatParams = new DiffOcrParams { UseBackgroundSub = false };
+        double topHatBest = TuneParams(topHatParams, ref totalEvals, tuneTopHat: true, tuneBgSub: false);
+
+        // Phase 2: tune background subtraction, starting from a clone of the phase-1 parameter object.
+        Console.Error.WriteLine("\n========== Phase 2: Background Subtraction ==========");
+        var bgSubParams = topHatParams.Clone();
+        bgSubParams.UseBackgroundSub = true;
+        double bgSubBest = TuneParams(bgSubParams, ref totalEvals, tuneTopHat: false, tuneBgSub: true);
+
+        // Background subtraction must score strictly higher to win; a tie keeps top-hat.
+        var winner = bgSubBest > topHatBest ? bgSubParams : topHatParams;
+        string winnerLabel = winner.UseBackgroundSub ? "BgSub" : "TopHat";
+
+        Console.Error.WriteLine($"\n========== Result ==========");
+        Console.Error.WriteLine($" Top-Hat: {topHatBest:F3} {topHatParams}");
+        Console.Error.WriteLine($" BgSub: {bgSubBest:F3} {bgSubParams}");
+        Console.Error.WriteLine($" Winner: {winnerLabel} evals={totalEvals}\n");
+
+        // Verbose run of the test cases with the winning parameters; its return value is not used.
+        RunTestCases(winner, verbose: true);
+
+        var response = new TuneResponse
+        {
+            BestScore = Math.Max(topHatBest, bgSubBest),
+            BestParams = winner,
+            Iterations = totalEvals,
+        };
+        return response;
+    }
+ + private double TuneParams(DiffOcrParams best, ref int totalEvals, bool tuneTopHat, bool tuneBgSub) + { + double bestScore = ScoreParams(best); + Console.Error.WriteLine($" baseline score={bestScore:F3} {best}\n"); + + // Detection params (shared by both approaches) + var sharedSweeps = new (string Name, int[] Values, Action Set)[] { ("diffThresh", [10, 15, 20, 25, 30, 40, 50, 60], (p, v) => p.DiffThresh = v), ("rowThreshDiv", [10, 15, 20, 25, 30, 40, 50, 60], (p, v) => p.RowThreshDiv = v), ("colThreshDiv", [5, 8, 10, 12, 15, 20, 25, 30], (p, v) => p.ColThreshDiv = v), ("maxGap", [5, 8, 10, 12, 15, 20, 25, 30], (p, v) => p.MaxGap = v), - ("kernelSize", [11, 15, 19, 21, 25, 31, 35, 41], (p, v) => p.KernelSize = v), ("upscale", [1, 2, 3], (p, v) => p.Upscale = v), }; - // trimCutoff needs double values — handle separately + // Top-hat specific + var topHatSweeps = new (string Name, int[] Values, Action Set)[] + { + ("kernelSize", [11, 15, 19, 21, 25, 31, 35, 41, 51], (p, v) => p.KernelSize = v), + }; + + // Background-subtraction specific + var bgSubSweeps = new (string Name, int[] Values, Action Set)[] + { + ("dimPercentile", [5, 10, 15, 20, 25, 30, 40, 50], (p, v) => p.DimPercentile = v), + ("textThresh", [10, 15, 20, 25, 30, 40, 50, 60, 80], (p, v) => p.TextThresh = v), + }; + double[] trimValues = [0.1, 0.15, 0.2, 0.25, 0.3, 0.4, 0.5]; - int totalEvals = 0; - const int maxRounds = 3; + var allIntSweeps = sharedSweeps + .Concat(tuneTopHat ? topHatSweeps : []) + .Concat(tuneBgSub ? 
bgSubSweeps : []) + .ToArray(); + const int maxRounds = 3; for (int round = 0; round < maxRounds; round++) { bool improved = false; Console.Error.WriteLine($"--- Round {round + 1} ---"); - // Sweep integer params - foreach (var (name, values, set) in sweeps) + foreach (var (name, values, set) in allIntSweeps) { Console.Error.Write($" {name}: "); int bestVal = 0; @@ -307,7 +355,6 @@ class OcrHandler(TesseractEngine engine) double score = ScoreParams(trial); totalEvals++; Console.Error.Write($"{v}={score:F3} "); - if (score > bestValScore) { bestValScore = score; bestVal = v; } } Console.Error.WriteLine(); @@ -334,7 +381,6 @@ class OcrHandler(TesseractEngine engine) double score = ScoreParams(trial); totalEvals++; Console.Error.Write($"{v:F2}={score:F3} "); - if (score > bestTrimScore) { bestTrimScore = score; bestTrim = v; } } Console.Error.WriteLine(); @@ -352,17 +398,7 @@ class OcrHandler(TesseractEngine engine) if (!improved) break; } - Console.Error.WriteLine($"\n=== Tuning done === evals={totalEvals} bestScore={bestScore:F3}\n {best}\n"); - - // Run verbose test with best params for final report - var finalResult = RunTestCases(best, verbose: true); - - return new TuneResponse - { - BestScore = bestScore, - BestParams = best, - Iterations = totalEvals, - }; + return bestScore; } /// Score a param set: average match ratio across all test cases (0-1).