diff --git a/tools/OcrDaemon/ImagePreprocessor.cs b/tools/OcrDaemon/ImagePreprocessor.cs
index ecbce81..44605e3 100644
--- a/tools/OcrDaemon/ImagePreprocessor.cs
+++ b/tools/OcrDaemon/ImagePreprocessor.cs
@@ -11,7 +11,7 @@ static class ImagePreprocessor
/// Isolates bright tooltip text, suppresses dim background text visible through overlay.
/// Pipeline: grayscale → morphological top-hat → Otsu binary → upscale
///
- public static Bitmap PreprocessForOcr(Bitmap src, int kernelSize = 25, int upscale = 2)
+ public static Bitmap PreprocessForOcr(Bitmap src, int kernelSize = 41, int upscale = 2)
{
using var mat = BitmapConverter.ToMat(src);
using var gray = new Mat();
@@ -37,4 +37,90 @@ static class ImagePreprocessor
return BitmapConverter.ToBitmap(binary);
}
+
+ /// <summary>
+ /// Background-subtraction preprocessing: uses the reference frame to remove
+ /// background bleed-through from the semi-transparent tooltip overlay.
+ /// Pipeline: estimate dimming factor → subtract expected background → threshold → upscale
+ /// </summary>
+ /// <param name="tooltipCrop">Crop of the current frame that contains the tooltip.</param>
+ /// <param name="referenceCrop">
+ /// The same region cropped from the reference frame. It is indexed with the same
+ /// (x, y) as <paramref name="tooltipCrop"/>, so both crops must have equal dimensions.
+ /// </param>
+ /// <param name="dimPercentile">
+ /// Percentile (0-100) of the per-pixel current/reference ratios that is taken as the
+ /// overlay dimming factor; out-of-range values are clamped to the nearest valid index.
+ /// </param>
+ /// <param name="textThresh">Residual brightness above which a pixel counts as text.</param>
+ /// <param name="upscale">Resize factor applied to the binarized image; values of 1 or less skip resizing.</param>
+ /// <returns>
+ /// A binarized bitmap in which text pixels are black (0) and everything else is white (255),
+ /// then optionally upscaled with cubic interpolation. Falls back to
+ /// <see cref="PreprocessForOcr"/> when the crops differ in size or when no reference pixel
+ /// is bright enough to estimate the dimming factor.
+ /// </returns>
+ public static Bitmap PreprocessWithBackgroundSub(Bitmap tooltipCrop, Bitmap referenceCrop,
+     int dimPercentile = 25, int textThresh = 30, int upscale = 2)
+ {
+     // Kernel size handed to the top-hat pipeline whenever background subtraction is not possible.
+     const int fallbackKernelSize = 41;
+     // Reference pixels at or below this brightness carry no usable signal for the ratio estimate.
+     const byte minReferenceBrightness = 30;
+
+     // The pixel loops below walk both frames with the same (x, y) through raw pointers.
+     // If the crops differ in size, that would read past the end of the smaller buffer,
+     // so use the reference-free pipeline instead of risking an out-of-bounds access.
+     if (tooltipCrop.Width != referenceCrop.Width || tooltipCrop.Height != referenceCrop.Height)
+         return PreprocessForOcr(tooltipCrop, fallbackKernelSize, upscale);
+
+     using var curMat = BitmapConverter.ToMat(tooltipCrop);
+     using var refMat = BitmapConverter.ToMat(referenceCrop);
+     using var curGray = new Mat();
+     using var refGray = new Mat();
+     // NOTE(review): BGRA2GRAY assumes both bitmaps convert to 4-channel Mats; the caller in
+     // OcrHandler clones both crops as Format32bppArgb — confirm for any other caller.
+     Cv2.CvtColor(curMat, curGray, ColorConversionCodes.BGRA2GRAY);
+     Cv2.CvtColor(refMat, refGray, ColorConversionCodes.BGRA2GRAY);
+
+     int rows = curGray.Rows, cols = curGray.Cols;
+
+     // Estimate the dimming factor of the tooltip overlay.
+     // For non-text pixels: current ≈ reference × dim_factor
+     // Collect ratios where reference is bright enough to be meaningful
+     var ratios = new List<double>();
+     unsafe
+     {
+         byte* curPtr = (byte*)curGray.Data;
+         byte* refPtr = (byte*)refGray.Data;
+         int curStep = (int)curGray.Step();
+         int refStep = (int)refGray.Step();
+
+         for (int y = 0; y < rows; y++)
+         {
+             // Row stride (Step) may exceed cols, so address each row through its own base pointer.
+             byte* curRow = curPtr + (long)y * curStep;
+             byte* refRow = refPtr + (long)y * refStep;
+             for (int x = 0; x < cols; x++)
+             {
+                 byte r = refRow[x];
+                 if (r > minReferenceBrightness) // skip very dark reference pixels (no signal)
+                     ratios.Add((double)curRow[x] / r);
+             }
+         }
+     }
+
+     if (ratios.Count == 0)
+         return PreprocessForOcr(tooltipCrop, fallbackKernelSize, upscale); // fallback
+
+     // Use a low percentile of ratios as the dimming factor.
+     // Text pixels have high ratios (bright on dark), overlay pixels have low ratios.
+     // A low percentile captures the overlay dimming, ignoring text.
+     ratios.Sort();
+     // Multiply in 64-bit: Count × percentile can exceed int.MaxValue on very large crops,
+     // and a wrapped (negative) product would be silently clamped to index 0.
+     long rawIdx = (long)ratios.Count * dimPercentile / 100;
+     int idx = (int)Math.Clamp(rawIdx, 0L, ratios.Count - 1L);
+     double dimFactor = ratios[idx];
+     // Clamp to sane range
+     dimFactor = Math.Clamp(dimFactor, 0.05, 0.95);
+
+     // Subtract expected background: text_signal = current - reference × dimFactor
+     using var textSignal = new Mat(rows, cols, MatType.CV_8UC1);
+     unsafe
+     {
+         byte* curPtr = (byte*)curGray.Data;
+         byte* refPtr = (byte*)refGray.Data;
+         byte* outPtr = (byte*)textSignal.Data;
+         int curStep = (int)curGray.Step();
+         int refStep = (int)refGray.Step();
+         int outStep = (int)textSignal.Step();
+
+         for (int y = 0; y < rows; y++)
+         {
+             byte* curRow = curPtr + (long)y * curStep;
+             byte* refRow = refPtr + (long)y * refStep;
+             byte* outRow = outPtr + (long)y * outStep;
+             for (int x = 0; x < cols; x++)
+             {
+                 double expected = refRow[x] * dimFactor;
+                 double signal = curRow[x] - expected;
+                 // Pixels darker than the expected background clamp to 0 (no text signal).
+                 outRow[x] = (byte)Math.Clamp(signal, 0, 255);
+             }
+         }
+     }
+
+     // Threshold: pixels above textThresh are text. BinaryInv maps them to 0 and
+     // everything else to 255, i.e. black text on a white background.
+     using var binary = new Mat();
+     Cv2.Threshold(textSignal, binary, textThresh, 255, ThresholdTypes.BinaryInv);
+
+     // Upscale for better LSTM recognition
+     if (upscale > 1)
+     {
+         using var upscaled = new Mat();
+         Cv2.Resize(binary, upscaled, new OpenCvSharp.Size(binary.Width * upscale, binary.Height * upscale),
+             interpolation: InterpolationFlags.Cubic);
+         return BitmapConverter.ToBitmap(upscaled);
+     }
+
+     return BitmapConverter.ToBitmap(binary);
+ }
}
diff --git a/tools/OcrDaemon/Models.cs b/tools/OcrDaemon/Models.cs
index ea5247b..344185a 100644
--- a/tools/OcrDaemon/Models.cs
+++ b/tools/OcrDaemon/Models.cs
@@ -212,10 +212,10 @@ class DetectGridResponse
class DiffOcrParams
{
[JsonPropertyName("diffThresh")]
- public int DiffThresh { get; set; } = 10;
+ public int DiffThresh { get; set; } = 20;
[JsonPropertyName("rowThreshDiv")]
- public int RowThreshDiv { get; set; } = 30;
+ public int RowThreshDiv { get; set; } = 40;
[JsonPropertyName("colThreshDiv")]
public int ColThreshDiv { get; set; } = 8;
@@ -232,10 +232,21 @@ class DiffOcrParams
[JsonPropertyName("upscale")]
public int Upscale { get; set; } = 2;
+ [JsonPropertyName("useBackgroundSub")]
+ public bool UseBackgroundSub { get; set; } = true;
+
+ [JsonPropertyName("dimPercentile")]
+ public int DimPercentile { get; set; } = 40;
+
+ [JsonPropertyName("textThresh")]
+ public int TextThresh { get; set; } = 60;
+
public DiffOcrParams Clone() => (DiffOcrParams)MemberwiseClone();
public override string ToString() =>
- $"diffThresh={DiffThresh} rowThreshDiv={RowThreshDiv} colThreshDiv={ColThreshDiv} maxGap={MaxGap} trimCutoff={TrimCutoff:F2} kernelSize={KernelSize} upscale={Upscale}";
+ UseBackgroundSub
+ ? $"bgSub dimPct={DimPercentile} textThresh={TextThresh} diffThresh={DiffThresh} rowThreshDiv={RowThreshDiv} colThreshDiv={ColThreshDiv} maxGap={MaxGap} trimCutoff={TrimCutoff:F2} upscale={Upscale}"
+ : $"topHat kernelSize={KernelSize} diffThresh={DiffThresh} rowThreshDiv={RowThreshDiv} colThreshDiv={ColThreshDiv} maxGap={MaxGap} trimCutoff={TrimCutoff:F2} upscale={Upscale}";
}
class TestCase
diff --git a/tools/OcrDaemon/OcrDaemon.csproj b/tools/OcrDaemon/OcrDaemon.csproj
index 1f750d3..ffb6815 100644
--- a/tools/OcrDaemon/OcrDaemon.csproj
+++ b/tools/OcrDaemon/OcrDaemon.csproj
@@ -5,6 +5,7 @@
net8.0-windows10.0.19041.0
enable
enable
+ true
diff --git a/tools/OcrDaemon/OcrHandler.cs b/tools/OcrDaemon/OcrHandler.cs
index bfda313..f5e8d76 100644
--- a/tools/OcrDaemon/OcrHandler.cs
+++ b/tools/OcrDaemon/OcrHandler.cs
@@ -55,10 +55,9 @@ class OcrHandler(TesseractEngine engine)
return new OkResponse();
}
- public object HandleDiffOcr(Request req) => HandleDiffOcr(req, new DiffOcrParams
- {
- DiffThresh = req.Threshold > 0 ? req.Threshold : 30,
- });
+ /// <summary>
+ /// Runs diff-OCR with a freshly constructed <see cref="DiffOcrParams"/>. The request's
+ /// threshold replaces the default diff threshold only when it is positive.
+ /// </summary>
+ public object HandleDiffOcr(Request req)
+ {
+     var parameters = new DiffOcrParams();
+     if (req.Threshold > 0)
+         parameters.DiffThresh = req.Threshold;
+     return HandleDiffOcr(req, parameters);
+ }
public object HandleDiffOcr(Request req, DiffOcrParams p)
{
@@ -219,9 +218,9 @@ class OcrHandler(TesseractEngine engine)
if (debug) Console.Error.WriteLine($" diff-ocr: tooltip region ({minX},{minY}) {rw}x{rh}");
- // Simple crop of the tooltip region from the current frame (no per-pixel masking).
- // The top-hat preprocessing will handle suppressing background text.
+ // Crop tooltip region from both current and reference frames
using var cropped = current.Clone(new Rectangle(minX, minY, rw, rh), PixelFormat.Format32bppArgb);
+ using var refCropped = _referenceFrame.Clone(new Rectangle(minX, minY, rw, rh), PixelFormat.Format32bppArgb);
// Save before/after preprocessing images if path is provided
if (!string.IsNullOrEmpty(req.Path))
@@ -233,8 +232,10 @@ class OcrHandler(TesseractEngine engine)
if (debug) Console.Error.WriteLine($" diff-ocr: saved raw to {req.Path}");
}
- // Pre-process for OCR: top-hat + binarize + upscale
- using var processed = ImagePreprocessor.PreprocessForOcr(cropped, p.KernelSize, p.Upscale);
+ // Pre-process for OCR
+ using var processed = p.UseBackgroundSub
+ ? ImagePreprocessor.PreprocessWithBackgroundSub(cropped, refCropped, p.DimPercentile, p.TextThresh, p.Upscale)
+ : ImagePreprocessor.PreprocessForOcr(cropped, p.KernelSize, p.Upscale);
// Save fullscreen and preprocessed versions alongside raw
if (!string.IsNullOrEmpty(req.Path))
@@ -266,35 +267,82 @@ class OcrHandler(TesseractEngine engine)
public object HandleTune(Request req)
{
- // Coordinate descent: optimize one parameter at a time, repeat until stable.
- var best = new DiffOcrParams();
- double bestScore = ScoreParams(best);
- Console.Error.WriteLine($"\n=== Tuning start === baseline score={bestScore:F3} {best}\n");
+ int totalEvals = 0;
- // Define search ranges for each parameter
- var sweeps = new (string Name, int[] Values, Action Set)[]
+ // --- Phase 1: Tune top-hat approach ---
+ Console.Error.WriteLine("\n========== Phase 1: Top-Hat ==========");
+ var topHat = new DiffOcrParams { UseBackgroundSub = false };
+ double topHatScore = TuneParams(topHat, ref totalEvals, tuneTopHat: true, tuneBgSub: false);
+
+ // --- Phase 2: Tune background-subtraction approach ---
+ Console.Error.WriteLine("\n========== Phase 2: Background Subtraction ==========");
+ // Start bgSub from the best detection params found in phase 1
+ var bgSub = topHat.Clone();
+ bgSub.UseBackgroundSub = true;
+ double bgSubScore = TuneParams(bgSub, ref totalEvals, tuneTopHat: false, tuneBgSub: true);
+
+ // Pick the winner
+ var best = bgSubScore > topHatScore ? bgSub : topHat;
+ double bestScore = Math.Max(topHatScore, bgSubScore);
+
+ Console.Error.WriteLine($"\n========== Result ==========");
+ Console.Error.WriteLine($" Top-Hat: {topHatScore:F3} {topHat}");
+ Console.Error.WriteLine($" BgSub: {bgSubScore:F3} {bgSub}");
+ Console.Error.WriteLine($" Winner: {(best.UseBackgroundSub ? "BgSub" : "TopHat")} evals={totalEvals}\n");
+
+ // Final verbose report with best params
+ RunTestCases(best, verbose: true);
+
+ return new TuneResponse
+ {
+ BestScore = bestScore,
+ BestParams = best,
+ Iterations = totalEvals,
+ };
+ }
+
+ private double TuneParams(DiffOcrParams best, ref int totalEvals, bool tuneTopHat, bool tuneBgSub)
+ {
+ double bestScore = ScoreParams(best);
+ Console.Error.WriteLine($" baseline score={bestScore:F3} {best}\n");
+
+ // Detection params (shared by both approaches)
+ var sharedSweeps = new (string Name, int[] Values, Action Set)[]
{
("diffThresh", [10, 15, 20, 25, 30, 40, 50, 60], (p, v) => p.DiffThresh = v),
("rowThreshDiv", [10, 15, 20, 25, 30, 40, 50, 60], (p, v) => p.RowThreshDiv = v),
("colThreshDiv", [5, 8, 10, 12, 15, 20, 25, 30], (p, v) => p.ColThreshDiv = v),
("maxGap", [5, 8, 10, 12, 15, 20, 25, 30], (p, v) => p.MaxGap = v),
- ("kernelSize", [11, 15, 19, 21, 25, 31, 35, 41], (p, v) => p.KernelSize = v),
("upscale", [1, 2, 3], (p, v) => p.Upscale = v),
};
- // trimCutoff needs double values — handle separately
+ // Top-hat specific
+ var topHatSweeps = new (string Name, int[] Values, Action Set)[]
+ {
+ ("kernelSize", [11, 15, 19, 21, 25, 31, 35, 41, 51], (p, v) => p.KernelSize = v),
+ };
+
+ // Background-subtraction specific
+ var bgSubSweeps = new (string Name, int[] Values, Action Set)[]
+ {
+ ("dimPercentile", [5, 10, 15, 20, 25, 30, 40, 50], (p, v) => p.DimPercentile = v),
+ ("textThresh", [10, 15, 20, 25, 30, 40, 50, 60, 80], (p, v) => p.TextThresh = v),
+ };
+
double[] trimValues = [0.1, 0.15, 0.2, 0.25, 0.3, 0.4, 0.5];
- int totalEvals = 0;
- const int maxRounds = 3;
+ var allIntSweeps = sharedSweeps
+ .Concat(tuneTopHat ? topHatSweeps : [])
+ .Concat(tuneBgSub ? bgSubSweeps : [])
+ .ToArray();
+ const int maxRounds = 3;
for (int round = 0; round < maxRounds; round++)
{
bool improved = false;
Console.Error.WriteLine($"--- Round {round + 1} ---");
- // Sweep integer params
- foreach (var (name, values, set) in sweeps)
+ foreach (var (name, values, set) in allIntSweeps)
{
Console.Error.Write($" {name}: ");
int bestVal = 0;
@@ -307,7 +355,6 @@ class OcrHandler(TesseractEngine engine)
double score = ScoreParams(trial);
totalEvals++;
Console.Error.Write($"{v}={score:F3} ");
-
if (score > bestValScore) { bestValScore = score; bestVal = v; }
}
Console.Error.WriteLine();
@@ -334,7 +381,6 @@ class OcrHandler(TesseractEngine engine)
double score = ScoreParams(trial);
totalEvals++;
Console.Error.Write($"{v:F2}={score:F3} ");
-
if (score > bestTrimScore) { bestTrimScore = score; bestTrim = v; }
}
Console.Error.WriteLine();
@@ -352,17 +398,7 @@ class OcrHandler(TesseractEngine engine)
if (!improved) break;
}
- Console.Error.WriteLine($"\n=== Tuning done === evals={totalEvals} bestScore={bestScore:F3}\n {best}\n");
-
- // Run verbose test with best params for final report
- var finalResult = RunTestCases(best, verbose: true);
-
- return new TuneResponse
- {
- BestScore = bestScore,
- BestParams = best,
- Iterations = totalEvals,
- };
+ return bestScore;
}
/// Score a param set: average match ratio across all test cases (0-1).