more tuning switched to background sub
This commit is contained in:
parent
641c87121a
commit
6600969947
4 changed files with 171 additions and 37 deletions
|
|
@ -11,7 +11,7 @@ static class ImagePreprocessor
|
||||||
/// Isolates bright tooltip text, suppresses dim background text visible through overlay.
|
/// Isolates bright tooltip text, suppresses dim background text visible through overlay.
|
||||||
/// Pipeline: grayscale → morphological top-hat → Otsu binary → upscale
|
/// Pipeline: grayscale → morphological top-hat → Otsu binary → upscale
|
||||||
/// </summary>
|
/// </summary>
|
||||||
public static Bitmap PreprocessForOcr(Bitmap src, int kernelSize = 25, int upscale = 2)
|
public static Bitmap PreprocessForOcr(Bitmap src, int kernelSize = 41, int upscale = 2)
|
||||||
{
|
{
|
||||||
using var mat = BitmapConverter.ToMat(src);
|
using var mat = BitmapConverter.ToMat(src);
|
||||||
using var gray = new Mat();
|
using var gray = new Mat();
|
||||||
|
|
@ -37,4 +37,90 @@ static class ImagePreprocessor
|
||||||
|
|
||||||
return BitmapConverter.ToBitmap(binary);
|
return BitmapConverter.ToBitmap(binary);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/// <summary>
/// Background-subtraction preprocessing: uses the reference frame to remove
/// background bleed-through from the semi-transparent tooltip overlay.
/// Pipeline: estimate dimming factor → subtract expected background → threshold → upscale
/// </summary>
/// <param name="tooltipCrop">Crop of the current frame that contains the tooltip.</param>
/// <param name="referenceCrop">
/// Crop of the reference frame. It is compared pixel-by-pixel with
/// <paramref name="tooltipCrop"/>, so it must have the same dimensions; when it does not,
/// the method falls back to <see cref="PreprocessForOcr"/>.
/// </param>
/// <param name="dimPercentile">
/// Percentile (0-100) of the per-pixel current/reference brightness ratios used as the
/// overlay dimming factor. Out-of-range values are clamped to the first/last sample.
/// </param>
/// <param name="textThresh">
/// Residual brightness (0-255) above which a pixel is classified as text.
/// </param>
/// <param name="upscale">Integer scale factor applied to the binary image; values ≤ 1 disable scaling.</param>
/// <returns>
/// A binarized bitmap in which text pixels are 0 and everything else is 255
/// (inverse binary threshold), optionally upscaled with cubic interpolation.
/// </returns>
/// <exception cref="ArgumentNullException">Either bitmap is <c>null</c>.</exception>
public static Bitmap PreprocessWithBackgroundSub(Bitmap tooltipCrop, Bitmap referenceCrop,
    int dimPercentile = 25, int textThresh = 30, int upscale = 2)
{
    ArgumentNullException.ThrowIfNull(tooltipCrop);
    ArgumentNullException.ThrowIfNull(referenceCrop);

    // Kernel size handed to the top-hat pipeline whenever background subtraction cannot be used.
    const int fallbackKernelSize = 41;
    // Reference pixels at or below this brightness carry too little signal for a ratio.
    const byte minReferenceBrightness = 30;

    using var curMat = BitmapConverter.ToMat(tooltipCrop);
    using var refMat = BitmapConverter.ToMat(referenceCrop);
    using var curGray = new Mat();
    using var refGray = new Mat();
    // NOTE(review): BGRA2GRAY expects 4-channel input; the visible caller passes 32bpp ARGB crops.
    Cv2.CvtColor(curMat, curGray, ColorConversionCodes.BGRA2GRAY);
    Cv2.CvtColor(refMat, refGray, ColorConversionCodes.BGRA2GRAY);

    int rows = curGray.Rows, cols = curGray.Cols;

    // The loops below index both raw buffers with the same (y, x). If the reference has
    // different dimensions, that is either an out-of-bounds native read (reference smaller)
    // or a comparison of unrelated pixels, so background subtraction is not applicable.
    if (refGray.Rows != rows || refGray.Cols != cols)
    {
        return PreprocessForOcr(tooltipCrop, fallbackKernelSize, upscale);
    }

    // Estimate the dimming factor of the tooltip overlay.
    // For non-text pixels: current ≈ reference × dim_factor
    // Collect ratios where reference is bright enough to be meaningful
    var ratios = new List<double>(rows * cols);
    unsafe
    {
        byte* curPtr = (byte*)curGray.Data;
        byte* refPtr = (byte*)refGray.Data;
        // Step() is the row stride in bytes; rows may be padded, so never assume stride == cols.
        int curStep = (int)curGray.Step();
        int refStep = (int)refGray.Step();

        for (int y = 0; y < rows; y++)
        {
            byte* curRow = curPtr + y * curStep;
            byte* refRow = refPtr + y * refStep;
            for (int x = 0; x < cols; x++)
            {
                byte r = refRow[x];
                if (r > minReferenceBrightness)
                {
                    ratios.Add((double)curRow[x] / r);
                }
            }
        }
    }

    if (ratios.Count == 0)
    {
        return PreprocessForOcr(tooltipCrop, fallbackKernelSize, upscale); // fallback
    }

    // Use a low percentile of ratios as the dimming factor.
    // Text pixels have high ratios (bright on dark), overlay pixels have low ratios.
    // A low percentile captures the overlay dimming, ignoring text.
    ratios.Sort();
    // Computed in long so Count × percentile cannot overflow int before the clamp.
    int idx = (int)Math.Clamp((long)ratios.Count * dimPercentile / 100, 0, ratios.Count - 1);
    // Clamp to sane range
    double dimFactor = Math.Clamp(ratios[idx], 0.05, 0.95);

    // Subtract expected background: text_signal = current - reference × dimFactor
    using var textSignal = new Mat(rows, cols, MatType.CV_8UC1);
    unsafe
    {
        byte* curPtr = (byte*)curGray.Data;
        byte* refPtr = (byte*)refGray.Data;
        byte* outPtr = (byte*)textSignal.Data;
        int curStep = (int)curGray.Step();
        int refStep = (int)refGray.Step();
        int outStep = (int)textSignal.Step();

        for (int y = 0; y < rows; y++)
        {
            byte* curRow = curPtr + y * curStep;
            byte* refRow = refPtr + y * refStep;
            byte* outRow = outPtr + y * outStep;
            for (int x = 0; x < cols; x++)
            {
                double expected = refRow[x] * dimFactor;
                double signal = curRow[x] - expected;
                // Negative residuals (pixel darker than predicted) carry no text signal.
                outRow[x] = (byte)Math.Clamp(signal, 0, 255);
            }
        }
    }

    // Threshold: pixels above textThresh are text.
    // BinaryInv maps them to 0 and everything else to 255.
    using var binary = new Mat();
    Cv2.Threshold(textSignal, binary, textThresh, 255, ThresholdTypes.BinaryInv);

    // Upscale for better LSTM recognition
    if (upscale > 1)
    {
        using var upscaled = new Mat();
        Cv2.Resize(binary, upscaled, new OpenCvSharp.Size(binary.Width * upscale, binary.Height * upscale),
            interpolation: InterpolationFlags.Cubic);
        return BitmapConverter.ToBitmap(upscaled);
    }

    return BitmapConverter.ToBitmap(binary);
}
|
||||||
}
|
}
|
||||||
|
|
|
||||||
|
|
@ -212,10 +212,10 @@ class DetectGridResponse
|
||||||
class DiffOcrParams
|
class DiffOcrParams
|
||||||
{
|
{
|
||||||
[JsonPropertyName("diffThresh")]
|
[JsonPropertyName("diffThresh")]
|
||||||
public int DiffThresh { get; set; } = 10;
|
public int DiffThresh { get; set; } = 20;
|
||||||
|
|
||||||
[JsonPropertyName("rowThreshDiv")]
|
[JsonPropertyName("rowThreshDiv")]
|
||||||
public int RowThreshDiv { get; set; } = 30;
|
public int RowThreshDiv { get; set; } = 40;
|
||||||
|
|
||||||
[JsonPropertyName("colThreshDiv")]
|
[JsonPropertyName("colThreshDiv")]
|
||||||
public int ColThreshDiv { get; set; } = 8;
|
public int ColThreshDiv { get; set; } = 8;
|
||||||
|
|
@ -232,10 +232,21 @@ class DiffOcrParams
|
||||||
[JsonPropertyName("upscale")]
|
[JsonPropertyName("upscale")]
|
||||||
public int Upscale { get; set; } = 2;
|
public int Upscale { get; set; } = 2;
|
||||||
|
|
||||||
|
[JsonPropertyName("useBackgroundSub")]
|
||||||
|
public bool UseBackgroundSub { get; set; } = true;
|
||||||
|
|
||||||
|
[JsonPropertyName("dimPercentile")]
|
||||||
|
public int DimPercentile { get; set; } = 40;
|
||||||
|
|
||||||
|
[JsonPropertyName("textThresh")]
|
||||||
|
public int TextThresh { get; set; } = 60;
|
||||||
|
|
||||||
public DiffOcrParams Clone() => (DiffOcrParams)MemberwiseClone();
|
public DiffOcrParams Clone() => (DiffOcrParams)MemberwiseClone();
|
||||||
|
|
||||||
public override string ToString() =>
|
public override string ToString() =>
|
||||||
$"diffThresh={DiffThresh} rowThreshDiv={RowThreshDiv} colThreshDiv={ColThreshDiv} maxGap={MaxGap} trimCutoff={TrimCutoff:F2} kernelSize={KernelSize} upscale={Upscale}";
|
UseBackgroundSub
|
||||||
|
? $"bgSub dimPct={DimPercentile} textThresh={TextThresh} diffThresh={DiffThresh} rowThreshDiv={RowThreshDiv} colThreshDiv={ColThreshDiv} maxGap={MaxGap} trimCutoff={TrimCutoff:F2} upscale={Upscale}"
|
||||||
|
: $"topHat kernelSize={KernelSize} diffThresh={DiffThresh} rowThreshDiv={RowThreshDiv} colThreshDiv={ColThreshDiv} maxGap={MaxGap} trimCutoff={TrimCutoff:F2} upscale={Upscale}";
|
||||||
}
|
}
|
||||||
|
|
||||||
class TestCase
|
class TestCase
|
||||||
|
|
|
||||||
|
|
@ -5,6 +5,7 @@
|
||||||
<TargetFramework>net8.0-windows10.0.19041.0</TargetFramework>
|
<TargetFramework>net8.0-windows10.0.19041.0</TargetFramework>
|
||||||
<Nullable>enable</Nullable>
|
<Nullable>enable</Nullable>
|
||||||
<ImplicitUsings>enable</ImplicitUsings>
|
<ImplicitUsings>enable</ImplicitUsings>
|
||||||
|
<AllowUnsafeBlocks>true</AllowUnsafeBlocks>
|
||||||
</PropertyGroup>
|
</PropertyGroup>
|
||||||
|
|
||||||
<ItemGroup>
|
<ItemGroup>
|
||||||
|
|
|
||||||
|
|
@ -55,10 +55,9 @@ class OcrHandler(TesseractEngine engine)
|
||||||
return new OkResponse();
|
return new OkResponse();
|
||||||
}
|
}
|
||||||
|
|
||||||
public object HandleDiffOcr(Request req) => HandleDiffOcr(req, new DiffOcrParams
|
public object HandleDiffOcr(Request req) => HandleDiffOcr(req, req.Threshold > 0
|
||||||
{
|
? new DiffOcrParams { DiffThresh = req.Threshold }
|
||||||
DiffThresh = req.Threshold > 0 ? req.Threshold : 30,
|
: new DiffOcrParams());
|
||||||
});
|
|
||||||
|
|
||||||
public object HandleDiffOcr(Request req, DiffOcrParams p)
|
public object HandleDiffOcr(Request req, DiffOcrParams p)
|
||||||
{
|
{
|
||||||
|
|
@ -219,9 +218,9 @@ class OcrHandler(TesseractEngine engine)
|
||||||
|
|
||||||
if (debug) Console.Error.WriteLine($" diff-ocr: tooltip region ({minX},{minY}) {rw}x{rh}");
|
if (debug) Console.Error.WriteLine($" diff-ocr: tooltip region ({minX},{minY}) {rw}x{rh}");
|
||||||
|
|
||||||
// Simple crop of the tooltip region from the current frame (no per-pixel masking).
|
// Crop tooltip region from both current and reference frames
|
||||||
// The top-hat preprocessing will handle suppressing background text.
|
|
||||||
using var cropped = current.Clone(new Rectangle(minX, minY, rw, rh), PixelFormat.Format32bppArgb);
|
using var cropped = current.Clone(new Rectangle(minX, minY, rw, rh), PixelFormat.Format32bppArgb);
|
||||||
|
using var refCropped = _referenceFrame.Clone(new Rectangle(minX, minY, rw, rh), PixelFormat.Format32bppArgb);
|
||||||
|
|
||||||
// Save before/after preprocessing images if path is provided
|
// Save before/after preprocessing images if path is provided
|
||||||
if (!string.IsNullOrEmpty(req.Path))
|
if (!string.IsNullOrEmpty(req.Path))
|
||||||
|
|
@ -233,8 +232,10 @@ class OcrHandler(TesseractEngine engine)
|
||||||
if (debug) Console.Error.WriteLine($" diff-ocr: saved raw to {req.Path}");
|
if (debug) Console.Error.WriteLine($" diff-ocr: saved raw to {req.Path}");
|
||||||
}
|
}
|
||||||
|
|
||||||
// Pre-process for OCR: top-hat + binarize + upscale
|
// Pre-process for OCR
|
||||||
using var processed = ImagePreprocessor.PreprocessForOcr(cropped, p.KernelSize, p.Upscale);
|
using var processed = p.UseBackgroundSub
|
||||||
|
? ImagePreprocessor.PreprocessWithBackgroundSub(cropped, refCropped, p.DimPercentile, p.TextThresh, p.Upscale)
|
||||||
|
: ImagePreprocessor.PreprocessForOcr(cropped, p.KernelSize, p.Upscale);
|
||||||
|
|
||||||
// Save fullscreen and preprocessed versions alongside raw
|
// Save fullscreen and preprocessed versions alongside raw
|
||||||
if (!string.IsNullOrEmpty(req.Path))
|
if (!string.IsNullOrEmpty(req.Path))
|
||||||
|
|
@ -266,35 +267,82 @@ class OcrHandler(TesseractEngine engine)
|
||||||
|
|
||||||
public object HandleTune(Request req)
|
public object HandleTune(Request req)
|
||||||
{
|
{
|
||||||
// Coordinate descent: optimize one parameter at a time, repeat until stable.
|
int totalEvals = 0;
|
||||||
var best = new DiffOcrParams();
|
|
||||||
double bestScore = ScoreParams(best);
|
|
||||||
Console.Error.WriteLine($"\n=== Tuning start === baseline score={bestScore:F3} {best}\n");
|
|
||||||
|
|
||||||
// Define search ranges for each parameter
|
// --- Phase 1: Tune top-hat approach ---
|
||||||
var sweeps = new (string Name, int[] Values, Action<DiffOcrParams, int> Set)[]
|
Console.Error.WriteLine("\n========== Phase 1: Top-Hat ==========");
|
||||||
|
var topHat = new DiffOcrParams { UseBackgroundSub = false };
|
||||||
|
double topHatScore = TuneParams(topHat, ref totalEvals, tuneTopHat: true, tuneBgSub: false);
|
||||||
|
|
||||||
|
// --- Phase 2: Tune background-subtraction approach ---
|
||||||
|
Console.Error.WriteLine("\n========== Phase 2: Background Subtraction ==========");
|
||||||
|
// Start bgSub from the best detection params found in phase 1
|
||||||
|
var bgSub = topHat.Clone();
|
||||||
|
bgSub.UseBackgroundSub = true;
|
||||||
|
double bgSubScore = TuneParams(bgSub, ref totalEvals, tuneTopHat: false, tuneBgSub: true);
|
||||||
|
|
||||||
|
// Pick the winner
|
||||||
|
var best = bgSubScore > topHatScore ? bgSub : topHat;
|
||||||
|
double bestScore = Math.Max(topHatScore, bgSubScore);
|
||||||
|
|
||||||
|
Console.Error.WriteLine($"\n========== Result ==========");
|
||||||
|
Console.Error.WriteLine($" Top-Hat: {topHatScore:F3} {topHat}");
|
||||||
|
Console.Error.WriteLine($" BgSub: {bgSubScore:F3} {bgSub}");
|
||||||
|
Console.Error.WriteLine($" Winner: {(best.UseBackgroundSub ? "BgSub" : "TopHat")} evals={totalEvals}\n");
|
||||||
|
|
||||||
|
// Final verbose report with best params
|
||||||
|
RunTestCases(best, verbose: true);
|
||||||
|
|
||||||
|
return new TuneResponse
|
||||||
|
{
|
||||||
|
BestScore = bestScore,
|
||||||
|
BestParams = best,
|
||||||
|
Iterations = totalEvals,
|
||||||
|
};
|
||||||
|
}
|
||||||
|
|
||||||
|
private double TuneParams(DiffOcrParams best, ref int totalEvals, bool tuneTopHat, bool tuneBgSub)
|
||||||
|
{
|
||||||
|
double bestScore = ScoreParams(best);
|
||||||
|
Console.Error.WriteLine($" baseline score={bestScore:F3} {best}\n");
|
||||||
|
|
||||||
|
// Detection params (shared by both approaches)
|
||||||
|
var sharedSweeps = new (string Name, int[] Values, Action<DiffOcrParams, int> Set)[]
|
||||||
{
|
{
|
||||||
("diffThresh", [10, 15, 20, 25, 30, 40, 50, 60], (p, v) => p.DiffThresh = v),
|
("diffThresh", [10, 15, 20, 25, 30, 40, 50, 60], (p, v) => p.DiffThresh = v),
|
||||||
("rowThreshDiv", [10, 15, 20, 25, 30, 40, 50, 60], (p, v) => p.RowThreshDiv = v),
|
("rowThreshDiv", [10, 15, 20, 25, 30, 40, 50, 60], (p, v) => p.RowThreshDiv = v),
|
||||||
("colThreshDiv", [5, 8, 10, 12, 15, 20, 25, 30], (p, v) => p.ColThreshDiv = v),
|
("colThreshDiv", [5, 8, 10, 12, 15, 20, 25, 30], (p, v) => p.ColThreshDiv = v),
|
||||||
("maxGap", [5, 8, 10, 12, 15, 20, 25, 30], (p, v) => p.MaxGap = v),
|
("maxGap", [5, 8, 10, 12, 15, 20, 25, 30], (p, v) => p.MaxGap = v),
|
||||||
("kernelSize", [11, 15, 19, 21, 25, 31, 35, 41], (p, v) => p.KernelSize = v),
|
|
||||||
("upscale", [1, 2, 3], (p, v) => p.Upscale = v),
|
("upscale", [1, 2, 3], (p, v) => p.Upscale = v),
|
||||||
};
|
};
|
||||||
|
|
||||||
// trimCutoff needs double values — handle separately
|
// Top-hat specific
|
||||||
|
var topHatSweeps = new (string Name, int[] Values, Action<DiffOcrParams, int> Set)[]
|
||||||
|
{
|
||||||
|
("kernelSize", [11, 15, 19, 21, 25, 31, 35, 41, 51], (p, v) => p.KernelSize = v),
|
||||||
|
};
|
||||||
|
|
||||||
|
// Background-subtraction specific
|
||||||
|
var bgSubSweeps = new (string Name, int[] Values, Action<DiffOcrParams, int> Set)[]
|
||||||
|
{
|
||||||
|
("dimPercentile", [5, 10, 15, 20, 25, 30, 40, 50], (p, v) => p.DimPercentile = v),
|
||||||
|
("textThresh", [10, 15, 20, 25, 30, 40, 50, 60, 80], (p, v) => p.TextThresh = v),
|
||||||
|
};
|
||||||
|
|
||||||
double[] trimValues = [0.1, 0.15, 0.2, 0.25, 0.3, 0.4, 0.5];
|
double[] trimValues = [0.1, 0.15, 0.2, 0.25, 0.3, 0.4, 0.5];
|
||||||
|
|
||||||
int totalEvals = 0;
|
var allIntSweeps = sharedSweeps
|
||||||
const int maxRounds = 3;
|
.Concat(tuneTopHat ? topHatSweeps : [])
|
||||||
|
.Concat(tuneBgSub ? bgSubSweeps : [])
|
||||||
|
.ToArray();
|
||||||
|
|
||||||
|
const int maxRounds = 3;
|
||||||
for (int round = 0; round < maxRounds; round++)
|
for (int round = 0; round < maxRounds; round++)
|
||||||
{
|
{
|
||||||
bool improved = false;
|
bool improved = false;
|
||||||
Console.Error.WriteLine($"--- Round {round + 1} ---");
|
Console.Error.WriteLine($"--- Round {round + 1} ---");
|
||||||
|
|
||||||
// Sweep integer params
|
foreach (var (name, values, set) in allIntSweeps)
|
||||||
foreach (var (name, values, set) in sweeps)
|
|
||||||
{
|
{
|
||||||
Console.Error.Write($" {name}: ");
|
Console.Error.Write($" {name}: ");
|
||||||
int bestVal = 0;
|
int bestVal = 0;
|
||||||
|
|
@ -307,7 +355,6 @@ class OcrHandler(TesseractEngine engine)
|
||||||
double score = ScoreParams(trial);
|
double score = ScoreParams(trial);
|
||||||
totalEvals++;
|
totalEvals++;
|
||||||
Console.Error.Write($"{v}={score:F3} ");
|
Console.Error.Write($"{v}={score:F3} ");
|
||||||
|
|
||||||
if (score > bestValScore) { bestValScore = score; bestVal = v; }
|
if (score > bestValScore) { bestValScore = score; bestVal = v; }
|
||||||
}
|
}
|
||||||
Console.Error.WriteLine();
|
Console.Error.WriteLine();
|
||||||
|
|
@ -334,7 +381,6 @@ class OcrHandler(TesseractEngine engine)
|
||||||
double score = ScoreParams(trial);
|
double score = ScoreParams(trial);
|
||||||
totalEvals++;
|
totalEvals++;
|
||||||
Console.Error.Write($"{v:F2}={score:F3} ");
|
Console.Error.Write($"{v:F2}={score:F3} ");
|
||||||
|
|
||||||
if (score > bestTrimScore) { bestTrimScore = score; bestTrim = v; }
|
if (score > bestTrimScore) { bestTrimScore = score; bestTrim = v; }
|
||||||
}
|
}
|
||||||
Console.Error.WriteLine();
|
Console.Error.WriteLine();
|
||||||
|
|
@ -352,17 +398,7 @@ class OcrHandler(TesseractEngine engine)
|
||||||
if (!improved) break;
|
if (!improved) break;
|
||||||
}
|
}
|
||||||
|
|
||||||
Console.Error.WriteLine($"\n=== Tuning done === evals={totalEvals} bestScore={bestScore:F3}\n {best}\n");
|
return bestScore;
|
||||||
|
|
||||||
// Run verbose test with best params for final report
|
|
||||||
var finalResult = RunTestCases(best, verbose: true);
|
|
||||||
|
|
||||||
return new TuneResponse
|
|
||||||
{
|
|
||||||
BestScore = bestScore,
|
|
||||||
BestParams = best,
|
|
||||||
Iterations = totalEvals,
|
|
||||||
};
|
|
||||||
}
|
}
|
||||||
|
|
||||||
/// <summary>Score a param set: average match ratio across all test cases (0-1).</summary>
|
/// <summary>Score a param set: average match ratio across all test cases (0-1).</summary>
|
||||||
|
|
|
||||||
Loading…
Add table
Add a link
Reference in a new issue