More tuning; switched to background subtraction

This commit is contained in:
Boki 2026-02-11 15:39:57 -05:00
parent 641c87121a
commit 6600969947
4 changed files with 171 additions and 37 deletions

View file

@ -11,7 +11,7 @@ static class ImagePreprocessor
/// Isolates bright tooltip text, suppresses dim background text visible through overlay.
/// Pipeline: grayscale → morphological top-hat → Otsu binary → upscale
/// </summary>
public static Bitmap PreprocessForOcr(Bitmap src, int kernelSize = 25, int upscale = 2)
public static Bitmap PreprocessForOcr(Bitmap src, int kernelSize = 41, int upscale = 2)
{
using var mat = BitmapConverter.ToMat(src);
using var gray = new Mat();
@ -37,4 +37,90 @@ static class ImagePreprocessor
return BitmapConverter.ToBitmap(binary);
}
/// <summary>
/// Background-subtraction preprocessing: uses the reference frame to remove
/// background bleed-through from the semi-transparent tooltip overlay.
/// Pipeline: estimate dimming factor → subtract expected background → threshold → upscale
/// </summary>
/// <param name="tooltipCrop">
/// Crop of the current frame. Converted with BGRA2GRAY, so a 4-channel (32-bpp) bitmap is expected.
/// </param>
/// <param name="referenceCrop">
/// Crop of the same region from the reference frame, also expected to be 4-channel.
/// If its pixel dimensions differ from <paramref name="tooltipCrop"/>, the method falls back
/// to <see cref="PreprocessForOcr"/> instead of reading past the end of the smaller buffer.
/// </param>
/// <param name="dimPercentile">
/// Percentile (0–100) of the per-pixel current/reference brightness ratios used as the
/// overlay dimming factor. Out-of-range values are clamped to the first/last ratio.
/// </param>
/// <param name="textThresh">Residual brightness above which a pixel is treated as text.</param>
/// <param name="upscale">Integer upscale factor applied to the result; values ≤ 1 disable upscaling.</param>
/// <returns>
/// A thresholded bitmap in which text pixels are 0 (black) and everything else is 255 (white),
/// optionally upscaled with cubic interpolation.
/// </returns>
/// <exception cref="ArgumentNullException">Either bitmap is null.</exception>
public static Bitmap PreprocessWithBackgroundSub(Bitmap tooltipCrop, Bitmap referenceCrop,
    int dimPercentile = 25, int textThresh = 30, int upscale = 2)
{
    // Kernel size handed to the top-hat pipeline whenever background subtraction cannot be used.
    const int fallbackKernelSize = 41;

    ArgumentNullException.ThrowIfNull(tooltipCrop);
    ArgumentNullException.ThrowIfNull(referenceCrop);

    // The pointer loops below index both images with the tooltip's rows/cols.
    // A reference crop of a different size would be read out of bounds, so bail out early.
    if (tooltipCrop.Width != referenceCrop.Width || tooltipCrop.Height != referenceCrop.Height)
    {
        return PreprocessForOcr(tooltipCrop, fallbackKernelSize, upscale);
    }

    using var curMat = BitmapConverter.ToMat(tooltipCrop);
    using var refMat = BitmapConverter.ToMat(referenceCrop);
    using var curGray = new Mat();
    using var refGray = new Mat();
    Cv2.CvtColor(curMat, curGray, ColorConversionCodes.BGRA2GRAY);
    Cv2.CvtColor(refMat, refGray, ColorConversionCodes.BGRA2GRAY);

    int rows = curGray.Rows, cols = curGray.Cols;

    // Estimate the dimming factor of the tooltip overlay.
    // For non-text pixels: current ≈ reference × dim_factor
    // Collect ratios where reference is bright enough to be meaningful
    var ratios = new List<double>();
    unsafe
    {
        byte* curPtr = (byte*)curGray.Data;
        byte* refPtr = (byte*)refGray.Data;
        int curStep = (int)curGray.Step();
        int refStep = (int)refGray.Step();
        for (int y = 0; y < rows; y++)
        {
            for (int x = 0; x < cols; x++)
            {
                byte r = refPtr[y * refStep + x];
                byte c = curPtr[y * curStep + x];
                if (r > 30) // skip very dark reference pixels (no signal)
                {
                    ratios.Add((double)c / r);
                }
            }
        }
    }

    if (ratios.Count == 0)
    {
        // Reference is dark everywhere: nothing to subtract, use the top-hat pipeline instead.
        return PreprocessForOcr(tooltipCrop, fallbackKernelSize, upscale);
    }

    // Use a low percentile of ratios as the dimming factor.
    // Text pixels have high ratios (bright on dark), overlay pixels have low ratios.
    // A low percentile captures the overlay dimming, ignoring text.
    ratios.Sort();
    // 64-bit arithmetic so Count × percentile cannot overflow before the division.
    int idx = (int)Math.Clamp((long)ratios.Count * dimPercentile / 100, 0L, ratios.Count - 1L);
    // Clamp to sane range
    double dimFactor = Math.Clamp(ratios[idx], 0.05, 0.95);

    // Subtract expected background: text_signal = current - reference × dimFactor
    using var textSignal = new Mat(rows, cols, MatType.CV_8UC1);
    unsafe
    {
        byte* curPtr = (byte*)curGray.Data;
        byte* refPtr = (byte*)refGray.Data;
        byte* outPtr = (byte*)textSignal.Data;
        int curStep = (int)curGray.Step();
        int refStep = (int)refGray.Step();
        int outStep = (int)textSignal.Step();
        for (int y = 0; y < rows; y++)
        {
            for (int x = 0; x < cols; x++)
            {
                double expected = refPtr[y * refStep + x] * dimFactor;
                double signal = curPtr[y * curStep + x] - expected;
                // Negative residuals (darker than expected) saturate to 0.
                outPtr[y * outStep + x] = (byte)Math.Clamp(signal, 0, 255);
            }
        }
    }

    // Threshold: pixels above textThresh are text.
    // BinaryInv maps them to 0 and the rest to 255, i.e. black text on a white background.
    using var binary = new Mat();
    Cv2.Threshold(textSignal, binary, textThresh, 255, ThresholdTypes.BinaryInv);

    // Upscale for better LSTM recognition
    if (upscale > 1)
    {
        using var upscaled = new Mat();
        Cv2.Resize(binary, upscaled, new OpenCvSharp.Size(binary.Width * upscale, binary.Height * upscale),
            interpolation: InterpolationFlags.Cubic);
        return BitmapConverter.ToBitmap(upscaled);
    }

    return BitmapConverter.ToBitmap(binary);
}
}

View file

@ -212,10 +212,10 @@ class DetectGridResponse
class DiffOcrParams
{
[JsonPropertyName("diffThresh")]
public int DiffThresh { get; set; } = 10;
public int DiffThresh { get; set; } = 20;
[JsonPropertyName("rowThreshDiv")]
public int RowThreshDiv { get; set; } = 30;
public int RowThreshDiv { get; set; } = 40;
[JsonPropertyName("colThreshDiv")]
public int ColThreshDiv { get; set; } = 8;
@ -232,10 +232,21 @@ class DiffOcrParams
[JsonPropertyName("upscale")]
public int Upscale { get; set; } = 2;
[JsonPropertyName("useBackgroundSub")]
public bool UseBackgroundSub { get; set; } = true;
[JsonPropertyName("dimPercentile")]
public int DimPercentile { get; set; } = 40;
[JsonPropertyName("textThresh")]
public int TextThresh { get; set; } = 60;
public DiffOcrParams Clone() => (DiffOcrParams)MemberwiseClone();
public override string ToString() =>
$"diffThresh={DiffThresh} rowThreshDiv={RowThreshDiv} colThreshDiv={ColThreshDiv} maxGap={MaxGap} trimCutoff={TrimCutoff:F2} kernelSize={KernelSize} upscale={Upscale}";
UseBackgroundSub
? $"bgSub dimPct={DimPercentile} textThresh={TextThresh} diffThresh={DiffThresh} rowThreshDiv={RowThreshDiv} colThreshDiv={ColThreshDiv} maxGap={MaxGap} trimCutoff={TrimCutoff:F2} upscale={Upscale}"
: $"topHat kernelSize={KernelSize} diffThresh={DiffThresh} rowThreshDiv={RowThreshDiv} colThreshDiv={ColThreshDiv} maxGap={MaxGap} trimCutoff={TrimCutoff:F2} upscale={Upscale}";
}
class TestCase

View file

@ -5,6 +5,7 @@
<TargetFramework>net8.0-windows10.0.19041.0</TargetFramework>
<Nullable>enable</Nullable>
<ImplicitUsings>enable</ImplicitUsings>
<AllowUnsafeBlocks>true</AllowUnsafeBlocks>
</PropertyGroup>
<ItemGroup>

View file

@ -55,10 +55,9 @@ class OcrHandler(TesseractEngine engine)
return new OkResponse();
}
public object HandleDiffOcr(Request req) => HandleDiffOcr(req, new DiffOcrParams
{
DiffThresh = req.Threshold > 0 ? req.Threshold : 30,
});
/// <summary>
/// Convenience overload: runs diff OCR with default <see cref="DiffOcrParams"/>,
/// replacing <c>DiffThresh</c> with <c>req.Threshold</c> only when the request carries a positive threshold.
/// </summary>
public object HandleDiffOcr(Request req)
{
    var parameters = new DiffOcrParams();
    if (req.Threshold > 0)
    {
        parameters.DiffThresh = req.Threshold;
    }

    return HandleDiffOcr(req, parameters);
}
public object HandleDiffOcr(Request req, DiffOcrParams p)
{
@ -219,9 +218,9 @@ class OcrHandler(TesseractEngine engine)
if (debug) Console.Error.WriteLine($" diff-ocr: tooltip region ({minX},{minY}) {rw}x{rh}");
// Simple crop of the tooltip region from the current frame (no per-pixel masking).
// The top-hat preprocessing will handle suppressing background text.
// Crop tooltip region from both current and reference frames
using var cropped = current.Clone(new Rectangle(minX, minY, rw, rh), PixelFormat.Format32bppArgb);
using var refCropped = _referenceFrame.Clone(new Rectangle(minX, minY, rw, rh), PixelFormat.Format32bppArgb);
// Save before/after preprocessing images if path is provided
if (!string.IsNullOrEmpty(req.Path))
@ -233,8 +232,10 @@ class OcrHandler(TesseractEngine engine)
if (debug) Console.Error.WriteLine($" diff-ocr: saved raw to {req.Path}");
}
// Pre-process for OCR: top-hat + binarize + upscale
using var processed = ImagePreprocessor.PreprocessForOcr(cropped, p.KernelSize, p.Upscale);
// Pre-process for OCR
using var processed = p.UseBackgroundSub
? ImagePreprocessor.PreprocessWithBackgroundSub(cropped, refCropped, p.DimPercentile, p.TextThresh, p.Upscale)
: ImagePreprocessor.PreprocessForOcr(cropped, p.KernelSize, p.Upscale);
// Save fullscreen and preprocessed versions alongside raw
if (!string.IsNullOrEmpty(req.Path))
@ -266,35 +267,82 @@ class OcrHandler(TesseractEngine engine)
/// <summary>
/// Tunes both preprocessing approaches and reports the better one.
/// Calls TuneParams once with UseBackgroundSub = false (top-hat) and once on a clone of those
/// parameters with UseBackgroundSub = true, compares the two returned scores, then runs the
/// test cases verbosely with the winning parameters.
/// </summary>
/// <param name="req">Incoming request; not read by the visible body.</param>
/// <returns>
/// A TuneResponse carrying the higher of the two scores, the matching parameter set, and the
/// evaluation count accumulated across both phases.
/// </returns>
public object HandleTune(Request req)
{
// Coordinate descent: optimize one parameter at a time, repeat until stable.
var best = new DiffOcrParams();
double bestScore = ScoreParams(best);
Console.Error.WriteLine($"\n=== Tuning start === baseline score={bestScore:F3} {best}\n");
// Shared evaluation counter; passed by ref so both TuneParams calls add to the same total.
int totalEvals = 0;
// Define search ranges for each parameter
var sweeps = new (string Name, int[] Values, Action<DiffOcrParams, int> Set)[]
// --- Phase 1: Tune top-hat approach ---
Console.Error.WriteLine("\n========== Phase 1: Top-Hat ==========");
var topHat = new DiffOcrParams { UseBackgroundSub = false };
double topHatScore = TuneParams(topHat, ref totalEvals, tuneTopHat: true, tuneBgSub: false);
// --- Phase 2: Tune background-subtraction approach ---
Console.Error.WriteLine("\n========== Phase 2: Background Subtraction ==========");
// Start bgSub from the best detection params found in phase 1
// NOTE(review): this relies on TuneParams updating its argument in place; the code that
// does so is not visible here — confirm.
var bgSub = topHat.Clone();
bgSub.UseBackgroundSub = true;
double bgSubScore = TuneParams(bgSub, ref totalEvals, tuneTopHat: false, tuneBgSub: true);
// Pick the winner
// The comparison is strict, so top-hat is kept when the two scores are equal.
var best = bgSubScore > topHatScore ? bgSub : topHat;
double bestScore = Math.Max(topHatScore, bgSubScore);
Console.Error.WriteLine($"\n========== Result ==========");
Console.Error.WriteLine($" Top-Hat: {topHatScore:F3} {topHat}");
Console.Error.WriteLine($" BgSub: {bgSubScore:F3} {bgSub}");
Console.Error.WriteLine($" Winner: {(best.UseBackgroundSub ? "BgSub" : "TopHat")} evals={totalEvals}\n");
// Final verbose report with best params
RunTestCases(best, verbose: true);
return new TuneResponse
{
BestScore = bestScore,
BestParams = best,
Iterations = totalEvals,
};
}
private double TuneParams(DiffOcrParams best, ref int totalEvals, bool tuneTopHat, bool tuneBgSub)
{
double bestScore = ScoreParams(best);
Console.Error.WriteLine($" baseline score={bestScore:F3} {best}\n");
// Detection params (shared by both approaches)
var sharedSweeps = new (string Name, int[] Values, Action<DiffOcrParams, int> Set)[]
{
("diffThresh", [10, 15, 20, 25, 30, 40, 50, 60], (p, v) => p.DiffThresh = v),
("rowThreshDiv", [10, 15, 20, 25, 30, 40, 50, 60], (p, v) => p.RowThreshDiv = v),
("colThreshDiv", [5, 8, 10, 12, 15, 20, 25, 30], (p, v) => p.ColThreshDiv = v),
("maxGap", [5, 8, 10, 12, 15, 20, 25, 30], (p, v) => p.MaxGap = v),
("kernelSize", [11, 15, 19, 21, 25, 31, 35, 41], (p, v) => p.KernelSize = v),
("upscale", [1, 2, 3], (p, v) => p.Upscale = v),
};
// trimCutoff needs double values — handle separately
// Top-hat specific
var topHatSweeps = new (string Name, int[] Values, Action<DiffOcrParams, int> Set)[]
{
("kernelSize", [11, 15, 19, 21, 25, 31, 35, 41, 51], (p, v) => p.KernelSize = v),
};
// Background-subtraction specific
var bgSubSweeps = new (string Name, int[] Values, Action<DiffOcrParams, int> Set)[]
{
("dimPercentile", [5, 10, 15, 20, 25, 30, 40, 50], (p, v) => p.DimPercentile = v),
("textThresh", [10, 15, 20, 25, 30, 40, 50, 60, 80], (p, v) => p.TextThresh = v),
};
double[] trimValues = [0.1, 0.15, 0.2, 0.25, 0.3, 0.4, 0.5];
int totalEvals = 0;
const int maxRounds = 3;
var allIntSweeps = sharedSweeps
.Concat(tuneTopHat ? topHatSweeps : [])
.Concat(tuneBgSub ? bgSubSweeps : [])
.ToArray();
const int maxRounds = 3;
for (int round = 0; round < maxRounds; round++)
{
bool improved = false;
Console.Error.WriteLine($"--- Round {round + 1} ---");
// Sweep integer params
foreach (var (name, values, set) in sweeps)
foreach (var (name, values, set) in allIntSweeps)
{
Console.Error.Write($" {name}: ");
int bestVal = 0;
@ -307,7 +355,6 @@ class OcrHandler(TesseractEngine engine)
double score = ScoreParams(trial);
totalEvals++;
Console.Error.Write($"{v}={score:F3} ");
if (score > bestValScore) { bestValScore = score; bestVal = v; }
}
Console.Error.WriteLine();
@ -334,7 +381,6 @@ class OcrHandler(TesseractEngine engine)
double score = ScoreParams(trial);
totalEvals++;
Console.Error.Write($"{v:F2}={score:F3} ");
if (score > bestTrimScore) { bestTrimScore = score; bestTrim = v; }
}
Console.Error.WriteLine();
@ -352,17 +398,7 @@ class OcrHandler(TesseractEngine engine)
if (!improved) break;
}
Console.Error.WriteLine($"\n=== Tuning done === evals={totalEvals} bestScore={bestScore:F3}\n {best}\n");
// Run verbose test with best params for final report
var finalResult = RunTestCases(best, verbose: true);
return new TuneResponse
{
BestScore = bestScore,
BestParams = best,
Iterations = totalEvals,
};
return bestScore;
}
/// <summary>Score a param set: average match ratio across all test cases (0-1).</summary>