More tuning; switched to background subtraction
This commit is contained in:
parent
641c87121a
commit
6600969947
4 changed files with 171 additions and 37 deletions
|
|
@ -11,7 +11,7 @@ static class ImagePreprocessor
|
|||
/// Isolates bright tooltip text, suppresses dim background text visible through overlay.
|
||||
/// Pipeline: grayscale → morphological top-hat → Otsu binary → upscale
|
||||
/// </summary>
|
||||
public static Bitmap PreprocessForOcr(Bitmap src, int kernelSize = 25, int upscale = 2)
|
||||
public static Bitmap PreprocessForOcr(Bitmap src, int kernelSize = 41, int upscale = 2)
|
||||
{
|
||||
using var mat = BitmapConverter.ToMat(src);
|
||||
using var gray = new Mat();
|
||||
|
|
@ -37,4 +37,90 @@ static class ImagePreprocessor
|
|||
|
||||
return BitmapConverter.ToBitmap(binary);
|
||||
}
|
||||
|
||||
/// <summary>
/// Background-subtraction preprocessing: uses the reference frame to remove
/// background bleed-through from the semi-transparent tooltip overlay.
/// Pipeline: grayscale → estimate dimming factor → subtract expected background → inverted threshold → upscale
/// </summary>
/// <param name="tooltipCrop">Crop of the current frame that contains the tooltip.</param>
/// <param name="referenceCrop">
/// Crop of the reference frame covering the same region. Must have the same pixel
/// dimensions as <paramref name="tooltipCrop"/>.
/// </param>
/// <param name="dimPercentile">
/// Percentile (0-100) of the per-pixel current/reference brightness ratios that is used
/// as the overlay dimming factor. Out-of-range values are clamped to the first/last ratio.
/// </param>
/// <param name="textThresh">Residual brightness (after subtraction) above which a pixel counts as text.</param>
/// <param name="upscale">Integer scale factor applied to the binary image; values of 1 or less skip the resize.</param>
/// <returns>
/// A new bitmap in which text pixels are 0 and all other pixels are 255 (before any upscale
/// interpolation). If no reference pixel is bright enough to estimate the dimming factor,
/// the result of <c>PreprocessForOcr</c> on <paramref name="tooltipCrop"/> is returned instead.
/// </returns>
/// <exception cref="ArgumentNullException">Either bitmap is null.</exception>
/// <exception cref="ArgumentException">The two crops do not have identical dimensions.</exception>
public static Bitmap PreprocessWithBackgroundSub(Bitmap tooltipCrop, Bitmap referenceCrop,
    int dimPercentile = 25, int textThresh = 30, int upscale = 2)
{
    ArgumentNullException.ThrowIfNull(tooltipCrop);
    ArgumentNullException.ThrowIfNull(referenceCrop);

    // The raw-pointer loops below walk both buffers with the current crop's row/column
    // counts; a smaller reference crop would be read out of bounds, so reject it up front.
    if (tooltipCrop.Width != referenceCrop.Width || tooltipCrop.Height != referenceCrop.Height)
    {
        throw new ArgumentException(
            $"Reference crop ({referenceCrop.Width}x{referenceCrop.Height}) must match tooltip crop ({tooltipCrop.Width}x{tooltipCrop.Height}).",
            nameof(referenceCrop));
    }

    using var curMat = BitmapConverter.ToMat(tooltipCrop);
    using var refMat = BitmapConverter.ToMat(referenceCrop);
    using var curGray = new Mat();
    using var refGray = new Mat();

    // BGRA2GRAY expects 4-channel input (the visible caller passes 32bpp ARGB crops).
    Cv2.CvtColor(curMat, curGray, ColorConversionCodes.BGRA2GRAY);
    Cv2.CvtColor(refMat, refGray, ColorConversionCodes.BGRA2GRAY);

    int rows = curGray.Rows, cols = curGray.Cols;

    // Estimate the dimming factor of the tooltip overlay.
    // For non-text pixels: current ≈ reference × dim_factor
    // Collect ratios where reference is bright enough to be meaningful.
    var ratios = new List<double>();
    unsafe
    {
        byte* curPtr = (byte*)curGray.Data;
        byte* refPtr = (byte*)refGray.Data;
        int curStep = (int)curGray.Step();
        int refStep = (int)refGray.Step();

        for (int y = 0; y < rows; y++)
        {
            byte* curRow = curPtr + (long)y * curStep;
            byte* refRow = refPtr + (long)y * refStep;
            for (int x = 0; x < cols; x++)
            {
                byte r = refRow[x];
                if (r > 30) // skip very dark reference pixels (no signal)
                {
                    ratios.Add((double)curRow[x] / r);
                }
            }
        }
    }

    if (ratios.Count == 0)
    {
        return PreprocessForOcr(tooltipCrop, 41, upscale); // fallback: nothing to estimate from
    }

    // Use a low percentile of ratios as the dimming factor.
    // Text pixels have high ratios (bright on dark), overlay pixels have low ratios.
    // A low percentile captures the overlay dimming, ignoring text.
    ratios.Sort();

    // 64-bit intermediate: Count * percentile can exceed int.MaxValue on very large inputs.
    int idx = (int)Math.Clamp((long)ratios.Count * dimPercentile / 100, 0, ratios.Count - 1);

    // Clamp to sane range
    double dimFactor = Math.Clamp(ratios[idx], 0.05, 0.95);

    // Subtract expected background: text_signal = current - reference × dimFactor
    using var textSignal = new Mat(rows, cols, MatType.CV_8UC1);
    unsafe
    {
        byte* curPtr = (byte*)curGray.Data;
        byte* refPtr = (byte*)refGray.Data;
        byte* outPtr = (byte*)textSignal.Data;
        int curStep = (int)curGray.Step();
        int refStep = (int)refGray.Step();
        int outStep = (int)textSignal.Step();

        for (int y = 0; y < rows; y++)
        {
            byte* curRow = curPtr + (long)y * curStep;
            byte* refRow = refPtr + (long)y * refStep;
            byte* outRow = outPtr + (long)y * outStep;
            for (int x = 0; x < cols; x++)
            {
                double expected = refRow[x] * dimFactor;
                double signal = curRow[x] - expected;
                outRow[x] = (byte)Math.Clamp(signal, 0, 255);
            }
        }
    }

    // Threshold: pixels above textThresh are text. BinaryInv maps those to 0 and
    // everything else to 255, i.e. the output is black text on a white background.
    using var binary = new Mat();
    Cv2.Threshold(textSignal, binary, textThresh, 255, ThresholdTypes.BinaryInv);

    // Upscale for better LSTM recognition
    if (upscale > 1)
    {
        using var upscaled = new Mat();
        Cv2.Resize(binary, upscaled, new OpenCvSharp.Size(binary.Width * upscale, binary.Height * upscale),
            interpolation: InterpolationFlags.Cubic);
        return BitmapConverter.ToBitmap(upscaled);
    }

    return BitmapConverter.ToBitmap(binary);
}
|
||||
}
|
||||
|
|
|
|||
|
|
@ -212,10 +212,10 @@ class DetectGridResponse
|
|||
class DiffOcrParams
|
||||
{
|
||||
[JsonPropertyName("diffThresh")]
|
||||
public int DiffThresh { get; set; } = 10;
|
||||
public int DiffThresh { get; set; } = 20;
|
||||
|
||||
[JsonPropertyName("rowThreshDiv")]
|
||||
public int RowThreshDiv { get; set; } = 30;
|
||||
public int RowThreshDiv { get; set; } = 40;
|
||||
|
||||
[JsonPropertyName("colThreshDiv")]
|
||||
public int ColThreshDiv { get; set; } = 8;
|
||||
|
|
@ -232,10 +232,21 @@ class DiffOcrParams
|
|||
[JsonPropertyName("upscale")]
|
||||
public int Upscale { get; set; } = 2;
|
||||
|
||||
[JsonPropertyName("useBackgroundSub")]
|
||||
public bool UseBackgroundSub { get; set; } = true;
|
||||
|
||||
[JsonPropertyName("dimPercentile")]
|
||||
public int DimPercentile { get; set; } = 40;
|
||||
|
||||
[JsonPropertyName("textThresh")]
|
||||
public int TextThresh { get; set; } = 60;
|
||||
|
||||
public DiffOcrParams Clone() => (DiffOcrParams)MemberwiseClone();
|
||||
|
||||
public override string ToString() =>
|
||||
$"diffThresh={DiffThresh} rowThreshDiv={RowThreshDiv} colThreshDiv={ColThreshDiv} maxGap={MaxGap} trimCutoff={TrimCutoff:F2} kernelSize={KernelSize} upscale={Upscale}";
|
||||
UseBackgroundSub
|
||||
? $"bgSub dimPct={DimPercentile} textThresh={TextThresh} diffThresh={DiffThresh} rowThreshDiv={RowThreshDiv} colThreshDiv={ColThreshDiv} maxGap={MaxGap} trimCutoff={TrimCutoff:F2} upscale={Upscale}"
|
||||
: $"topHat kernelSize={KernelSize} diffThresh={DiffThresh} rowThreshDiv={RowThreshDiv} colThreshDiv={ColThreshDiv} maxGap={MaxGap} trimCutoff={TrimCutoff:F2} upscale={Upscale}";
|
||||
}
|
||||
|
||||
class TestCase
|
||||
|
|
|
|||
|
|
@ -5,6 +5,7 @@
|
|||
<TargetFramework>net8.0-windows10.0.19041.0</TargetFramework>
|
||||
<Nullable>enable</Nullable>
|
||||
<ImplicitUsings>enable</ImplicitUsings>
|
||||
<AllowUnsafeBlocks>true</AllowUnsafeBlocks>
|
||||
</PropertyGroup>
|
||||
|
||||
<ItemGroup>
|
||||
|
|
|
|||
|
|
@ -55,10 +55,9 @@ class OcrHandler(TesseractEngine engine)
|
|||
return new OkResponse();
|
||||
}
|
||||
|
||||
public object HandleDiffOcr(Request req) => HandleDiffOcr(req, new DiffOcrParams
|
||||
{
|
||||
DiffThresh = req.Threshold > 0 ? req.Threshold : 30,
|
||||
});
|
||||
public object HandleDiffOcr(Request req) => HandleDiffOcr(req, req.Threshold > 0
|
||||
? new DiffOcrParams { DiffThresh = req.Threshold }
|
||||
: new DiffOcrParams());
|
||||
|
||||
public object HandleDiffOcr(Request req, DiffOcrParams p)
|
||||
{
|
||||
|
|
@ -219,9 +218,9 @@ class OcrHandler(TesseractEngine engine)
|
|||
|
||||
if (debug) Console.Error.WriteLine($" diff-ocr: tooltip region ({minX},{minY}) {rw}x{rh}");
|
||||
|
||||
// Simple crop of the tooltip region from the current frame (no per-pixel masking).
|
||||
// The top-hat preprocessing will handle suppressing background text.
|
||||
// Crop tooltip region from both current and reference frames
|
||||
using var cropped = current.Clone(new Rectangle(minX, minY, rw, rh), PixelFormat.Format32bppArgb);
|
||||
using var refCropped = _referenceFrame.Clone(new Rectangle(minX, minY, rw, rh), PixelFormat.Format32bppArgb);
|
||||
|
||||
// Save before/after preprocessing images if path is provided
|
||||
if (!string.IsNullOrEmpty(req.Path))
|
||||
|
|
@ -233,8 +232,10 @@ class OcrHandler(TesseractEngine engine)
|
|||
if (debug) Console.Error.WriteLine($" diff-ocr: saved raw to {req.Path}");
|
||||
}
|
||||
|
||||
// Pre-process for OCR: top-hat + binarize + upscale
|
||||
using var processed = ImagePreprocessor.PreprocessForOcr(cropped, p.KernelSize, p.Upscale);
|
||||
// Pre-process for OCR
|
||||
using var processed = p.UseBackgroundSub
|
||||
? ImagePreprocessor.PreprocessWithBackgroundSub(cropped, refCropped, p.DimPercentile, p.TextThresh, p.Upscale)
|
||||
: ImagePreprocessor.PreprocessForOcr(cropped, p.KernelSize, p.Upscale);
|
||||
|
||||
// Save fullscreen and preprocessed versions alongside raw
|
||||
if (!string.IsNullOrEmpty(req.Path))
|
||||
|
|
@ -266,35 +267,82 @@ class OcrHandler(TesseractEngine engine)
|
|||
|
||||
public object HandleTune(Request req)
|
||||
{
|
||||
// Coordinate descent: optimize one parameter at a time, repeat until stable.
|
||||
var best = new DiffOcrParams();
|
||||
double bestScore = ScoreParams(best);
|
||||
Console.Error.WriteLine($"\n=== Tuning start === baseline score={bestScore:F3} {best}\n");
|
||||
int totalEvals = 0;
|
||||
|
||||
// Define search ranges for each parameter
|
||||
var sweeps = new (string Name, int[] Values, Action<DiffOcrParams, int> Set)[]
|
||||
// --- Phase 1: Tune top-hat approach ---
|
||||
Console.Error.WriteLine("\n========== Phase 1: Top-Hat ==========");
|
||||
var topHat = new DiffOcrParams { UseBackgroundSub = false };
|
||||
double topHatScore = TuneParams(topHat, ref totalEvals, tuneTopHat: true, tuneBgSub: false);
|
||||
|
||||
// --- Phase 2: Tune background-subtraction approach ---
|
||||
Console.Error.WriteLine("\n========== Phase 2: Background Subtraction ==========");
|
||||
// Start bgSub from the best detection params found in phase 1
|
||||
var bgSub = topHat.Clone();
|
||||
bgSub.UseBackgroundSub = true;
|
||||
double bgSubScore = TuneParams(bgSub, ref totalEvals, tuneTopHat: false, tuneBgSub: true);
|
||||
|
||||
// Pick the winner
|
||||
var best = bgSubScore > topHatScore ? bgSub : topHat;
|
||||
double bestScore = Math.Max(topHatScore, bgSubScore);
|
||||
|
||||
Console.Error.WriteLine($"\n========== Result ==========");
|
||||
Console.Error.WriteLine($" Top-Hat: {topHatScore:F3} {topHat}");
|
||||
Console.Error.WriteLine($" BgSub: {bgSubScore:F3} {bgSub}");
|
||||
Console.Error.WriteLine($" Winner: {(best.UseBackgroundSub ? "BgSub" : "TopHat")} evals={totalEvals}\n");
|
||||
|
||||
// Final verbose report with best params
|
||||
RunTestCases(best, verbose: true);
|
||||
|
||||
return new TuneResponse
|
||||
{
|
||||
BestScore = bestScore,
|
||||
BestParams = best,
|
||||
Iterations = totalEvals,
|
||||
};
|
||||
}
|
||||
|
||||
private double TuneParams(DiffOcrParams best, ref int totalEvals, bool tuneTopHat, bool tuneBgSub)
|
||||
{
|
||||
double bestScore = ScoreParams(best);
|
||||
Console.Error.WriteLine($" baseline score={bestScore:F3} {best}\n");
|
||||
|
||||
// Detection params (shared by both approaches)
|
||||
var sharedSweeps = new (string Name, int[] Values, Action<DiffOcrParams, int> Set)[]
|
||||
{
|
||||
("diffThresh", [10, 15, 20, 25, 30, 40, 50, 60], (p, v) => p.DiffThresh = v),
|
||||
("rowThreshDiv", [10, 15, 20, 25, 30, 40, 50, 60], (p, v) => p.RowThreshDiv = v),
|
||||
("colThreshDiv", [5, 8, 10, 12, 15, 20, 25, 30], (p, v) => p.ColThreshDiv = v),
|
||||
("maxGap", [5, 8, 10, 12, 15, 20, 25, 30], (p, v) => p.MaxGap = v),
|
||||
("kernelSize", [11, 15, 19, 21, 25, 31, 35, 41], (p, v) => p.KernelSize = v),
|
||||
("upscale", [1, 2, 3], (p, v) => p.Upscale = v),
|
||||
};
|
||||
|
||||
// trimCutoff needs double values — handle separately
|
||||
// Top-hat specific
|
||||
var topHatSweeps = new (string Name, int[] Values, Action<DiffOcrParams, int> Set)[]
|
||||
{
|
||||
("kernelSize", [11, 15, 19, 21, 25, 31, 35, 41, 51], (p, v) => p.KernelSize = v),
|
||||
};
|
||||
|
||||
// Background-subtraction specific
|
||||
var bgSubSweeps = new (string Name, int[] Values, Action<DiffOcrParams, int> Set)[]
|
||||
{
|
||||
("dimPercentile", [5, 10, 15, 20, 25, 30, 40, 50], (p, v) => p.DimPercentile = v),
|
||||
("textThresh", [10, 15, 20, 25, 30, 40, 50, 60, 80], (p, v) => p.TextThresh = v),
|
||||
};
|
||||
|
||||
double[] trimValues = [0.1, 0.15, 0.2, 0.25, 0.3, 0.4, 0.5];
|
||||
|
||||
int totalEvals = 0;
|
||||
const int maxRounds = 3;
|
||||
var allIntSweeps = sharedSweeps
|
||||
.Concat(tuneTopHat ? topHatSweeps : [])
|
||||
.Concat(tuneBgSub ? bgSubSweeps : [])
|
||||
.ToArray();
|
||||
|
||||
const int maxRounds = 3;
|
||||
for (int round = 0; round < maxRounds; round++)
|
||||
{
|
||||
bool improved = false;
|
||||
Console.Error.WriteLine($"--- Round {round + 1} ---");
|
||||
|
||||
// Sweep integer params
|
||||
foreach (var (name, values, set) in sweeps)
|
||||
foreach (var (name, values, set) in allIntSweeps)
|
||||
{
|
||||
Console.Error.Write($" {name}: ");
|
||||
int bestVal = 0;
|
||||
|
|
@ -307,7 +355,6 @@ class OcrHandler(TesseractEngine engine)
|
|||
double score = ScoreParams(trial);
|
||||
totalEvals++;
|
||||
Console.Error.Write($"{v}={score:F3} ");
|
||||
|
||||
if (score > bestValScore) { bestValScore = score; bestVal = v; }
|
||||
}
|
||||
Console.Error.WriteLine();
|
||||
|
|
@ -334,7 +381,6 @@ class OcrHandler(TesseractEngine engine)
|
|||
double score = ScoreParams(trial);
|
||||
totalEvals++;
|
||||
Console.Error.Write($"{v:F2}={score:F3} ");
|
||||
|
||||
if (score > bestTrimScore) { bestTrimScore = score; bestTrim = v; }
|
||||
}
|
||||
Console.Error.WriteLine();
|
||||
|
|
@ -352,17 +398,7 @@ class OcrHandler(TesseractEngine engine)
|
|||
if (!improved) break;
|
||||
}
|
||||
|
||||
Console.Error.WriteLine($"\n=== Tuning done === evals={totalEvals} bestScore={bestScore:F3}\n {best}\n");
|
||||
|
||||
// Run verbose test with best params for final report
|
||||
var finalResult = RunTestCases(best, verbose: true);
|
||||
|
||||
return new TuneResponse
|
||||
{
|
||||
BestScore = bestScore,
|
||||
BestParams = best,
|
||||
Iterations = totalEvals,
|
||||
};
|
||||
return bestScore;
|
||||
}
|
||||
|
||||
/// <summary>Score a param set: average match ratio across all test cases (0-1).</summary>
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue