working on crop

This commit is contained in:
Boki 2026-02-12 17:48:16 -05:00
parent 93e2234c4e
commit f74e3e1c85
12 changed files with 1135 additions and 220 deletions

View file

@ -108,7 +108,7 @@ static class Daemon
var engine = request.Engine ?? "tesseract";
var preprocess = request.Preprocess ?? "none";
var kernelSize = request.Params?.KernelSize ?? 41;
var kernelSize = request.Params?.Ocr.KernelSize ?? 41;
// No preprocess + tesseract = original fast path
if (engine == "tesseract" && preprocess == "none")
@ -155,15 +155,17 @@ static class Daemon
{
var engine = request.Engine ?? "tesseract";
var isPythonEngine = engine is "easyocr" or "paddleocr";
var p = request.Params?.Clone() ?? new DiffOcrParams();
if (request.Threshold > 0) p.DiffThresh = request.Threshold;
var p = request.Params ?? new DiffOcrParams();
var cropParams = p.Crop;
var ocrParams = p.Ocr;
if (request.Threshold > 0) cropParams.DiffThresh = request.Threshold;
// Determine preprocess mode: explicit request.Preprocess > params.UseBackgroundSub > default "bgsub"
string preprocess;
if (request.Preprocess != null)
preprocess = request.Preprocess;
else if (request.Params != null)
preprocess = p.UseBackgroundSub ? "bgsub" : "tophat";
preprocess = ocrParams.UseBackgroundSub ? "bgsub" : "tophat";
else
preprocess = "bgsub";
@ -173,25 +175,25 @@ static class Daemon
var sw = System.Diagnostics.Stopwatch.StartNew();
var cropResult = ocrHandler.DiffCrop(request, p);
var cropResult = ocrHandler.DiffCrop(request, cropParams);
if (cropResult == null)
return new OcrResponse { Text = "", Lines = [] };
var (cropped, refCropped, current, region) = cropResult.Value;
using var _current = current;
// Preprocess
// Preprocess — only sees ocrParams
Bitmap processed;
if (preprocess == "bgsub")
{
int upscale = isPythonEngine ? 1 : p.Upscale;
int upscale = isPythonEngine ? 1 : ocrParams.Upscale;
processed = ImagePreprocessor.PreprocessWithBackgroundSub(
cropped, refCropped, dimPercentile: p.DimPercentile, textThresh: p.TextThresh,
upscale: upscale, softThreshold: p.SoftThreshold);
cropped, refCropped, dimPercentile: ocrParams.DimPercentile, textThresh: ocrParams.TextThresh,
upscale: upscale, softThreshold: ocrParams.SoftThreshold);
}
else if (preprocess == "tophat")
{
processed = ImagePreprocessor.PreprocessForOcr(cropped, kernelSize: p.KernelSize);
processed = ImagePreprocessor.PreprocessForOcr(cropped, kernelSize: ocrParams.KernelSize);
}
else // "none"
{
@ -228,7 +230,7 @@ static class Daemon
}
else // easyocr, paddleocr
{
var ocrResult = pythonBridge.OcrFromBitmap(processed, engine);
var ocrResult = pythonBridge.OcrFromBitmap(processed, engine, ocrParams);
var ocrMs = sw.ElapsedMilliseconds;
Console.Error.WriteLine($" diff-ocr-pipeline: engine={engine} preprocess={preprocess} diff={diffMs}ms ocr={ocrMs}ms crop={region.Width}x{region.Height}");

View file

@ -242,7 +242,7 @@ class TemplateMatchResponse
public double Confidence { get; set; }
}
class DiffOcrParams
sealed class DiffCropParams
{
[JsonPropertyName("diffThresh")]
public int DiffThresh { get; set; } = 20;
@ -259,6 +259,16 @@ class DiffOcrParams
[JsonPropertyName("trimCutoff")]
public double TrimCutoff { get; set; } = 0.4;
[JsonPropertyName("ocrPad")]
public int OcrPad { get; set; } = 10;
public override string ToString() =>
$"diffThresh={DiffThresh} maxGap={MaxGap} trimCutoff={TrimCutoff:F2} rowThreshDiv={RowThreshDiv} colThreshDiv={ColThreshDiv} ocrPad={OcrPad}";
}
sealed class OcrParams
{
// preprocessing
[JsonPropertyName("kernelSize")]
public int KernelSize { get; set; } = 41;
@ -277,9 +287,7 @@ class DiffOcrParams
[JsonPropertyName("softThreshold")]
public bool SoftThreshold { get; set; } = false;
[JsonPropertyName("ocrPad")]
public int OcrPad { get; set; } = 10;
// Tesseract-specific
[JsonPropertyName("usePerLineOcr")]
public bool UsePerLineOcr { get; set; } = false;
@ -292,12 +300,40 @@ class DiffOcrParams
[JsonPropertyName("psm")]
public int Psm { get; set; } = 6;
public DiffOcrParams Clone() => (DiffOcrParams)MemberwiseClone();
// post-merge / Python engine tuning
[JsonPropertyName("mergeGap")]
public int MergeGap { get; set; } = 0;
[JsonPropertyName("linkThreshold")]
public double? LinkThreshold { get; set; }
[JsonPropertyName("textThreshold")]
public double? TextThreshold { get; set; }
[JsonPropertyName("lowText")]
public double? LowText { get; set; }
[JsonPropertyName("widthThs")]
public double? WidthThs { get; set; }
[JsonPropertyName("paragraph")]
public bool? Paragraph { get; set; }
public override string ToString() =>
UseBackgroundSub
? $"bgSub dimPct={DimPercentile} textThresh={TextThresh} soft={SoftThreshold} ocrPad={OcrPad} perLine={UsePerLineOcr} lineGap={LineGapTolerance} linePadY={LinePadY} psm={Psm} diffThresh={DiffThresh} rowThreshDiv={RowThreshDiv} colThreshDiv={ColThreshDiv} maxGap={MaxGap} trimCutoff={TrimCutoff:F2} upscale={Upscale}"
: $"topHat kernelSize={KernelSize} ocrPad={OcrPad} perLine={UsePerLineOcr} lineGap={LineGapTolerance} linePadY={LinePadY} psm={Psm} diffThresh={DiffThresh} rowThreshDiv={RowThreshDiv} colThreshDiv={ColThreshDiv} maxGap={MaxGap} trimCutoff={TrimCutoff:F2} upscale={Upscale}";
? $"bgSub dimPct={DimPercentile} textThresh={TextThresh} soft={SoftThreshold} upscale={Upscale} mergeGap={MergeGap}"
: $"topHat kernel={KernelSize} upscale={Upscale} mergeGap={MergeGap}";
}
/// <summary>
/// Top-level parameter bundle for the diff-OCR pipeline. It groups the
/// crop-detection settings and the OCR settings into two separate objects,
/// serialized under the JSON keys "crop" and "ocr".
/// </summary>
sealed class DiffOcrParams
{
    /// <summary>Settings consumed by the diff/crop stage. Defaults to a fresh <see cref="DiffCropParams"/>.</summary>
    [JsonPropertyName("crop")]
    public DiffCropParams Crop { get; set; } = new DiffCropParams();

    /// <summary>Settings consumed by preprocessing and the OCR engines. Defaults to a fresh <see cref="OcrParams"/>.</summary>
    [JsonPropertyName("ocr")]
    public OcrParams Ocr { get; set; } = new OcrParams();

    /// <summary>
    /// Formats both parameter groups, each wrapped in square brackets,
    /// using their own <c>ToString</c> implementations.
    /// </summary>
    public override string ToString()
    {
        return "[" + Crop + "] [" + Ocr + "]";
    }
}
class TestCase

View file

@ -14,6 +14,7 @@ using SdImageFormat = System.Drawing.Imaging.ImageFormat;
class OcrHandler(TesseractEngine engine)
{
private Bitmap? _referenceFrame;
private RegionRect? _referenceRegion;
public object HandleOcr(Request req)
{
@ -56,31 +57,79 @@ class OcrHandler(TesseractEngine engine)
{
_referenceFrame?.Dispose();
_referenceFrame = ScreenCapture.CaptureOrLoad(req.File, req.Region);
_referenceRegion = req.Region == null
? null
: new RegionRect { X = req.Region.X, Y = req.Region.Y, Width = req.Region.Width, Height = req.Region.Height };
return new OkResponse();
}
public object HandleDiffOcr(Request req) => HandleDiffOcr(req, req.Threshold > 0
? new DiffOcrParams { DiffThresh = req.Threshold }
? new DiffOcrParams { Crop = new DiffCropParams { DiffThresh = req.Threshold } }
: new DiffOcrParams());
/// <summary>
/// Diff detection + crop only. Returns the raw tooltip crop bitmap and region,
/// or null if no tooltip detected. Caller is responsible for disposing the bitmap.
/// </summary>
public (Bitmap cropped, Bitmap refCropped, Bitmap current, RegionRect region)? DiffCrop(Request req, DiffOcrParams p)
public (Bitmap cropped, Bitmap refCropped, Bitmap current, RegionRect region)? DiffCrop(Request req, DiffCropParams c)
{
if (_referenceFrame == null)
return null;
var current = ScreenCapture.CaptureOrLoad(req.File, null);
var diffRegion = req.Region ?? _referenceRegion;
int baseX = diffRegion?.X ?? 0;
int baseY = diffRegion?.Y ?? 0;
var current = ScreenCapture.CaptureOrLoad(req.File, diffRegion);
int w = Math.Min(_referenceFrame.Width, current.Width);
int h = Math.Min(_referenceFrame.Height, current.Height);
Bitmap refForDiff = _referenceFrame;
bool disposeRef = false;
var refData = _referenceFrame.LockBits(new Rectangle(0, 0, w, h), ImageLockMode.ReadOnly, PixelFormat.Format32bppArgb);
if (diffRegion != null)
{
if (_referenceRegion == null)
{
var croppedRef = CropBitmap(_referenceFrame, diffRegion);
if (croppedRef == null)
{
current.Dispose();
return null;
}
refForDiff = croppedRef;
disposeRef = true;
}
else if (!RegionsEqual(diffRegion, _referenceRegion))
{
int offX = diffRegion.X - _referenceRegion.X;
int offY = diffRegion.Y - _referenceRegion.Y;
if (offX < 0 || offY < 0 || offX + diffRegion.Width > _referenceFrame.Width || offY + diffRegion.Height > _referenceFrame.Height)
{
current.Dispose();
return null;
}
var croppedRef = CropBitmap(_referenceFrame, new RegionRect
{
X = offX,
Y = offY,
Width = diffRegion.Width,
Height = diffRegion.Height,
});
if (croppedRef == null)
{
current.Dispose();
return null;
}
refForDiff = croppedRef;
disposeRef = true;
}
}
int w = Math.Min(refForDiff.Width, current.Width);
int h = Math.Min(refForDiff.Height, current.Height);
var refData = refForDiff.LockBits(new Rectangle(0, 0, w, h), ImageLockMode.ReadOnly, PixelFormat.Format32bppArgb);
byte[] refPx = new byte[refData.Stride * h];
Marshal.Copy(refData.Scan0, refPx, 0, refPx.Length);
_referenceFrame.UnlockBits(refData);
refForDiff.UnlockBits(refData);
int stride = refData.Stride;
var curData = current.LockBits(new Rectangle(0, 0, w, h), ImageLockMode.ReadOnly, PixelFormat.Format32bppArgb);
@ -88,7 +137,7 @@ class OcrHandler(TesseractEngine engine)
Marshal.Copy(curData.Scan0, curPx, 0, curPx.Length);
current.UnlockBits(curData);
int diffThresh = p.DiffThresh;
int diffThresh = c.DiffThresh;
// Pass 1: parallel row diff — compute rowCounts[] directly, no changed[] array
int[] rowCounts = new int[h];
@ -112,11 +161,12 @@ class OcrHandler(TesseractEngine engine)
if (totalChanged == 0)
{
current.Dispose();
if (disposeRef) refForDiff.Dispose();
return null;
}
int maxGap = p.MaxGap;
int rowThresh = w / p.RowThreshDiv;
int maxGap = c.MaxGap;
int rowThresh = w / c.RowThreshDiv;
int bestRowStart = 0, bestRowEnd = 0, bestRowLen = 0;
int curRowStart = -1, lastActiveRow = -1;
for (int y = 0; y < h; y++)
@ -180,7 +230,7 @@ class OcrHandler(TesseractEngine engine)
}
int tooltipHeight = bestRowEnd - bestRowStart + 1;
int colThresh = tooltipHeight / p.ColThreshDiv;
int colThresh = tooltipHeight / c.ColThreshDiv;
int bestColStart = 0, bestColEnd = 0, bestColLen = 0;
int curColStart = -1, lastActiveCol = -1;
@ -210,6 +260,7 @@ class OcrHandler(TesseractEngine engine)
{
Console.Error.WriteLine($" diff-ocr: no tooltip-sized region found (rows={bestRowLen}, cols={bestColLen})");
current.Dispose();
if (disposeRef) refForDiff.Dispose();
return null;
}
@ -218,37 +269,73 @@ class OcrHandler(TesseractEngine engine)
int maxX = Math.Min(bestColEnd, w - 1);
int maxY = Math.Min(bestRowEnd, h - 1);
// Trim low-density edges on both axes to avoid oversized crops.
int colSpan = maxX - minX + 1;
if (colSpan > 100)
if (colSpan > 50)
{
int q1 = minX + colSpan / 4;
int q3 = minX + colSpan * 3 / 4;
long midSum = 0;
int midCount = 0;
for (int x = q1; x <= q3; x++) { midSum += colCounts[x]; midCount++; }
double avgMidDensity = (double)midSum / midCount;
double cutoff = avgMidDensity * p.TrimCutoff;
while (maxX > minX + 100 && colCounts[maxX] < cutoff)
double avgMidDensity = (double)midSum / Math.Max(1, midCount);
double cutoff = avgMidDensity * c.TrimCutoff;
while (minX < maxX - 50 && colCounts[minX] < cutoff)
minX++;
while (maxX > minX + 50 && colCounts[maxX] < cutoff)
maxX--;
}
int rowSpan = maxY - minY + 1;
if (rowSpan > 50)
{
int q1 = minY + rowSpan / 4;
int q3 = minY + rowSpan * 3 / 4;
long midSum = 0;
int midCount = 0;
for (int y = q1; y <= q3; y++) { midSum += rowCounts[y]; midCount++; }
double avgMidDensity = (double)midSum / Math.Max(1, midCount);
double cutoff = avgMidDensity * c.TrimCutoff;
while (minY < maxY - 50 && rowCounts[minY] < cutoff)
minY++;
while (maxY > minY + 50 && rowCounts[maxY] < cutoff)
maxY--;
}
int rw = maxX - minX + 1;
int rh = maxY - minY + 1;
var cropped = CropFromBytes(curPx, stride, minX, minY, rw, rh);
var refCropped = CropFromBytes(refPx, stride, minX, minY, rw, rh);
var region = new RegionRect { X = minX, Y = minY, Width = rw, Height = rh };
var region = new RegionRect { X = baseX + minX, Y = baseY + minY, Width = rw, Height = rh };
Console.Error.WriteLine($" diff-ocr: tooltip region ({minX},{minY}) {rw}x{rh}");
if (disposeRef) refForDiff.Dispose();
return (cropped, refCropped, current, region);
}
/// <summary>
/// Reports whether two regions describe the same rectangle, i.e. their
/// X, Y, Width and Height values are all equal.
/// </summary>
/// <param name="a">First region.</param>
/// <param name="b">Second region.</param>
/// <returns><c>true</c> when all four values match; otherwise <c>false</c>.</returns>
private static bool RegionsEqual(RegionRect a, RegionRect b)
{
    // Different origin: no need to look at the size.
    if (a.X != b.X || a.Y != b.Y)
    {
        return false;
    }

    return a.Width == b.Width && a.Height == b.Height;
}
/// <summary>
/// Copies the part of <paramref name="src"/> described by <paramref name="region"/>
/// into a new 32bpp ARGB bitmap.
/// </summary>
/// <param name="src">Bitmap to copy from; it is not modified.</param>
/// <param name="region">Requested rectangle, in <paramref name="src"/> pixel coordinates.</param>
/// <returns>
/// A new bitmap (the caller disposes it), or <c>null</c> when the clamped
/// rectangle has no width or no height.
/// </returns>
private static Bitmap? CropBitmap(Bitmap src, RegionRect region)
{
    // A negative origin is moved to 0. The requested size is NOT reduced to
    // compensate, so such a region is shifted rather than clipped.
    // NOTE(review): confirm this matches how the live capture treats negative origins.
    int left = region.X < 0 ? 0 : region.X;
    int top = region.Y < 0 ? 0 : region.Y;

    // Never read past the right/bottom edge of the source.
    int width = Math.Min(region.Width, src.Width - left);
    int height = Math.Min(region.Height, src.Height - top);

    bool isEmpty = width <= 0 || height <= 0;
    if (isEmpty)
    {
        return null;
    }

    var area = new Rectangle(left, top, width, height);
    return src.Clone(area, PixelFormat.Format32bppArgb);
}
public object HandleDiffOcr(Request req, DiffOcrParams p)
{
if (_referenceFrame == null)
return new ErrorResponse("No reference snapshot stored. Send 'snapshot' first.");
var cropResult = DiffCrop(req, p);
var cropResult = DiffCrop(req, p.Crop);
if (cropResult == null)
return new OcrResponse { Text = "", Lines = [] };
@ -270,14 +357,15 @@ class OcrHandler(TesseractEngine engine)
}
// Pre-process for OCR — get Mat for per-line detection and padding
var ocr = p.Ocr;
Mat processedMat;
if (p.UseBackgroundSub)
if (ocr.UseBackgroundSub)
{
processedMat = ImagePreprocessor.PreprocessWithBackgroundSubMat(cropped, refCropped, p.DimPercentile, p.TextThresh, p.Upscale, p.SoftThreshold);
processedMat = ImagePreprocessor.PreprocessWithBackgroundSubMat(cropped, refCropped, ocr.DimPercentile, ocr.TextThresh, ocr.Upscale, ocr.SoftThreshold);
}
else
{
using var topHatBmp = ImagePreprocessor.PreprocessForOcr(cropped, p.KernelSize, p.Upscale);
using var topHatBmp = ImagePreprocessor.PreprocessForOcr(cropped, ocr.KernelSize, ocr.Upscale);
processedMat = BitmapConverter.ToMat(topHatBmp);
}
using var _processedMat = processedMat; // ensure disposal
@ -296,25 +384,25 @@ class OcrHandler(TesseractEngine engine)
if (debug) Console.Error.WriteLine($" diff-ocr: saved preprocessed to {prePath}");
}
int pad = p.OcrPad;
int upscale = p.Upscale > 0 ? p.Upscale : 1;
int pad = p.Crop.OcrPad;
int upscale = ocr.Upscale > 0 ? ocr.Upscale : 1;
var lines = new List<OcrLineResult>();
// Per-line OCR: detect text lines via horizontal projection, OCR each individually
if (p.UsePerLineOcr)
if (ocr.UsePerLineOcr)
{
// DetectTextLines needs binary input; if soft threshold produced grayscale, binarize a copy
int minRowPx = Math.Max(processedMat.Cols / 200, 3);
using var detectionMat = p.SoftThreshold ? new Mat() : null;
if (p.SoftThreshold)
using var detectionMat = ocr.SoftThreshold ? new Mat() : null;
if (ocr.SoftThreshold)
Cv2.Threshold(processedMat, detectionMat!, 128, 255, ThresholdTypes.Binary);
var lineDetectInput = p.SoftThreshold ? detectionMat! : processedMat;
var textLines = ImagePreprocessor.DetectTextLines(lineDetectInput, minRowPixels: minRowPx, gapTolerance: p.LineGapTolerance * upscale);
var lineDetectInput = ocr.SoftThreshold ? detectionMat! : processedMat;
var textLines = ImagePreprocessor.DetectTextLines(lineDetectInput, minRowPixels: minRowPx, gapTolerance: ocr.LineGapTolerance * upscale);
if (debug) Console.Error.WriteLine($" diff-ocr: detected {textLines.Count} text lines");
if (textLines.Count > 0)
{
int linePadY = p.LinePadY;
int linePadY = ocr.LinePadY;
foreach (var (yStart, yEnd) in textLines)
{
int y0 = Math.Max(yStart - linePadY, 0);
@ -330,7 +418,7 @@ class OcrHandler(TesseractEngine engine)
using var lineBmp = BitmapConverter.ToBitmap(padded);
using var linePix = ImageUtils.BitmapToPix(lineBmp);
using var linePage = engine.Process(linePix, (PageSegMode)p.Psm);
using var linePage = engine.Process(linePix, (PageSegMode)ocr.Psm);
// Extract words, adjusting coordinates back to screen space
// Word coords are in padded image space → subtract pad, add line offset, scale to original, add region offset
@ -386,7 +474,7 @@ class OcrHandler(TesseractEngine engine)
Cv2.CopyMakeBorder(processedMat, padded, pad, pad, pad, pad, BorderTypes.Constant, Scalar.White);
using var bmp = BitmapConverter.ToBitmap(padded);
using var pix = ImageUtils.BitmapToPix(bmp);
using var page = engine.Process(pix, (PageSegMode)p.Psm);
using var page = engine.Process(pix, (PageSegMode)ocr.Psm);
var text = page.GetText();
// Adjust word coordinates: subtract padding offset
@ -430,77 +518,161 @@ class OcrHandler(TesseractEngine engine)
/// <summary>
/// Runs the test cases with a default-constructed <see cref="DiffOcrParams"/>
/// and verbose output enabled, and returns whatever <c>RunTestCases</c> returns.
/// </summary>
/// <param name="req">The incoming request; this method does not read it.</param>
public object HandleTest(Request req)
{
    var defaults = new DiffOcrParams();
    return RunTestCases(defaults, verbose: true);
}
/// <summary>
/// Produces an independent copy of <paramref name="p"/> by serializing it to
/// JSON and deserializing the result. The nested objects are rebuilt as new
/// instances, so changing the copy does not change the original.
/// </summary>
/// <param name="p">Parameters to copy.</param>
/// <returns>A new <see cref="DiffOcrParams"/> built from the serialized form of <paramref name="p"/>.</returns>
private static DiffOcrParams CloneParams(DiffOcrParams p) =>
    JsonSerializer.Deserialize<DiffOcrParams>(JsonSerializer.Serialize(p))!;
public object HandleTune(Request req)
{
int totalEvals = 0;
// --- Phase 1: Tune top-hat approach ---
Console.Error.WriteLine("\n========== Phase 1: Top-Hat ==========");
var topHat = new DiffOcrParams { UseBackgroundSub = false };
double topHatScore = TuneParams(topHat, ref totalEvals, tuneTopHat: true, tuneBgSub: false);
// --- Phase A: Tune crop params ---
Console.Error.WriteLine("\n========== Phase A: Crop Params ==========");
var best = new DiffOcrParams();
double bestScore = TuneCropParams(best, ref totalEvals);
// --- Phase 2: Tune background-subtraction approach ---
Console.Error.WriteLine("\n========== Phase 2: Background Subtraction ==========");
// Start bgSub from the best detection params found in phase 1
var bgSub = topHat.Clone();
bgSub.UseBackgroundSub = true;
double bgSubScore = TuneParams(bgSub, ref totalEvals, tuneTopHat: false, tuneBgSub: true);
// --- Phase B: Tune OCR params (top-hat) ---
Console.Error.WriteLine("\n========== Phase B: OCR — Top-Hat ==========");
var topHat = CloneParams(best);
topHat.Ocr.UseBackgroundSub = false;
double topHatScore = TuneOcrParams(topHat, ref totalEvals, tuneTopHat: true, tuneBgSub: false);
// --- Phase C: Tune OCR params (background-subtraction) ---
Console.Error.WriteLine("\n========== Phase C: OCR — Background Subtraction ==========");
var bgSub = CloneParams(best);
bgSub.Ocr.UseBackgroundSub = true;
double bgSubScore = TuneOcrParams(bgSub, ref totalEvals, tuneTopHat: false, tuneBgSub: true);
// Pick the winner
var best = bgSubScore > topHatScore ? bgSub : topHat;
double bestScore = Math.Max(topHatScore, bgSubScore);
var winner = bgSubScore > topHatScore ? bgSub : topHat;
double winnerScore = Math.Max(topHatScore, bgSubScore);
Console.Error.WriteLine($"\n========== Result ==========");
Console.Error.WriteLine($" Top-Hat: {topHatScore:F3} {topHat}");
Console.Error.WriteLine($" BgSub: {bgSubScore:F3} {bgSub}");
Console.Error.WriteLine($" Winner: {(best.UseBackgroundSub ? "BgSub" : "TopHat")} evals={totalEvals}\n");
Console.Error.WriteLine($" Winner: {(winner.Ocr.UseBackgroundSub ? "BgSub" : "TopHat")} evals={totalEvals}\n");
// Final verbose report with best params
RunTestCases(best, verbose: true);
RunTestCases(winner, verbose: true);
return new TuneResponse
{
BestScore = bestScore,
BestParams = best,
BestScore = winnerScore,
BestParams = winner,
Iterations = totalEvals,
};
}
private double TuneParams(DiffOcrParams best, ref int totalEvals, bool tuneTopHat, bool tuneBgSub)
private double TuneCropParams(DiffOcrParams best, ref int totalEvals)
{
double bestScore = ScoreParams(best);
Console.Error.WriteLine($" baseline score={bestScore:F3} {best}\n");
// Detection params (shared by both approaches)
var sharedSweeps = new (string Name, int[] Values, Action<DiffOcrParams, int> Set)[]
var cropSweeps = new (string Name, int[] Values, Action<DiffCropParams, int> Set)[]
{
("diffThresh", [10, 15, 20, 25, 30, 40, 50, 60], (p, v) => p.DiffThresh = v),
("rowThreshDiv", [10, 15, 20, 25, 30, 40, 50, 60], (p, v) => p.RowThreshDiv = v),
("colThreshDiv", [5, 8, 10, 12, 15, 20, 25, 30], (p, v) => p.ColThreshDiv = v),
("maxGap", [5, 8, 10, 12, 15, 20, 25, 30], (p, v) => p.MaxGap = v),
("upscale", [1, 2, 3], (p, v) => p.Upscale = v),
("ocrPad", [0, 5, 10, 15, 20, 30], (p, v) => p.OcrPad = v),
("psm", [4, 6, 11, 13], (p, v) => p.Psm = v),
};
// Top-hat specific
var topHatSweeps = new (string Name, int[] Values, Action<DiffOcrParams, int> Set)[]
{
("kernelSize", [11, 15, 19, 21, 25, 31, 35, 41, 51], (p, v) => p.KernelSize = v),
};
// Background-subtraction specific
var bgSubSweeps = new (string Name, int[] Values, Action<DiffOcrParams, int> Set)[]
{
("dimPercentile", [5, 10, 15, 20, 25, 30, 40, 50], (p, v) => p.DimPercentile = v),
("textThresh", [10, 15, 20, 25, 30, 40, 50, 60, 80], (p, v) => p.TextThresh = v),
("lineGapTolerance", [3, 5, 8, 10, 15], (p, v) => p.LineGapTolerance = v),
("linePadY", [5, 10, 15, 20], (p, v) => p.LinePadY = v),
("diffThresh", [10, 15, 20, 25, 30, 40, 50, 60], (c, v) => c.DiffThresh = v),
("rowThreshDiv", [10, 15, 20, 25, 30, 40, 50, 60], (c, v) => c.RowThreshDiv = v),
("colThreshDiv", [5, 8, 10, 12, 15, 20, 25, 30], (c, v) => c.ColThreshDiv = v),
("maxGap", [5, 8, 10, 12, 15, 20, 25, 30], (c, v) => c.MaxGap = v),
("ocrPad", [0, 5, 10, 15, 20, 30], (c, v) => c.OcrPad = v),
};
double[] trimValues = [0.1, 0.15, 0.2, 0.25, 0.3, 0.4, 0.5];
var allIntSweeps = sharedSweeps
const int maxRounds = 3;
for (int round = 0; round < maxRounds; round++)
{
bool improved = false;
Console.Error.WriteLine($"--- Round {round + 1} ---");
foreach (var (name, values, set) in cropSweeps)
{
Console.Error.Write($" {name}: ");
int bestVal = 0;
double bestValScore = -1;
foreach (int v in values)
{
var trial = CloneParams(best);
set(trial.Crop, v);
double score = ScoreParams(trial);
totalEvals++;
Console.Error.Write($"{v}={score:F3} ");
if (score > bestValScore) { bestValScore = score; bestVal = v; }
}
Console.Error.WriteLine();
if (bestValScore > bestScore)
{
set(best.Crop, bestVal);
bestScore = bestValScore;
improved = true;
Console.Error.WriteLine($" → {name}={bestVal} score={bestScore:F3}");
}
}
// Sweep trimCutoff
{
Console.Error.Write($" trimCutoff: ");
double bestTrim = best.Crop.TrimCutoff;
double bestTrimScore = bestScore;
foreach (double v in trimValues)
{
var trial = CloneParams(best);
trial.Crop.TrimCutoff = v;
double score = ScoreParams(trial);
totalEvals++;
Console.Error.Write($"{v:F2}={score:F3} ");
if (score > bestTrimScore) { bestTrimScore = score; bestTrim = v; }
}
Console.Error.WriteLine();
if (bestTrimScore > bestScore)
{
best.Crop.TrimCutoff = bestTrim;
bestScore = bestTrimScore;
improved = true;
Console.Error.WriteLine($" → trimCutoff={bestTrim:F2} score={bestScore:F3}");
}
}
Console.Error.WriteLine($" End of round {round + 1}: score={bestScore:F3} {best}");
if (!improved) break;
}
return bestScore;
}
private double TuneOcrParams(DiffOcrParams best, ref int totalEvals, bool tuneTopHat, bool tuneBgSub)
{
double bestScore = ScoreParams(best);
Console.Error.WriteLine($" baseline score={bestScore:F3} {best}\n");
var sharedOcrSweeps = new (string Name, int[] Values, Action<OcrParams, int> Set)[]
{
("upscale", [1, 2, 3], (o, v) => o.Upscale = v),
("psm", [4, 6, 11, 13], (o, v) => o.Psm = v),
};
// Top-hat specific
var topHatSweeps = new (string Name, int[] Values, Action<OcrParams, int> Set)[]
{
("kernelSize", [11, 15, 19, 21, 25, 31, 35, 41, 51], (o, v) => o.KernelSize = v),
};
// Background-subtraction specific
var bgSubSweeps = new (string Name, int[] Values, Action<OcrParams, int> Set)[]
{
("dimPercentile", [5, 10, 15, 20, 25, 30, 40, 50], (o, v) => o.DimPercentile = v),
("textThresh", [10, 15, 20, 25, 30, 40, 50, 60, 80], (o, v) => o.TextThresh = v),
("lineGapTolerance", [3, 5, 8, 10, 15], (o, v) => o.LineGapTolerance = v),
("linePadY", [5, 10, 15, 20], (o, v) => o.LinePadY = v),
};
var allOcrSweeps = sharedOcrSweeps
.Concat(tuneTopHat ? topHatSweeps : [])
.Concat(tuneBgSub ? bgSubSweeps : [])
.ToArray();
@ -511,7 +683,7 @@ class OcrHandler(TesseractEngine engine)
bool improved = false;
Console.Error.WriteLine($"--- Round {round + 1} ---");
foreach (var (name, values, set) in allIntSweeps)
foreach (var (name, values, set) in allOcrSweeps)
{
Console.Error.Write($" {name}: ");
int bestVal = 0;
@ -519,8 +691,8 @@ class OcrHandler(TesseractEngine engine)
foreach (int v in values)
{
var trial = best.Clone();
set(trial, v);
var trial = CloneParams(best);
set(trial.Ocr, v);
double score = ScoreParams(trial);
totalEvals++;
Console.Error.Write($"{v}={score:F3} ");
@ -530,39 +702,13 @@ class OcrHandler(TesseractEngine engine)
if (bestValScore > bestScore)
{
set(best, bestVal);
set(best.Ocr, bestVal);
bestScore = bestValScore;
improved = true;
Console.Error.WriteLine($" → {name}={bestVal} score={bestScore:F3}");
}
}
// Sweep trimCutoff
{
Console.Error.Write($" trimCutoff: ");
double bestTrim = best.TrimCutoff;
double bestTrimScore = bestScore;
foreach (double v in trimValues)
{
var trial = best.Clone();
trial.TrimCutoff = v;
double score = ScoreParams(trial);
totalEvals++;
Console.Error.Write($"{v:F2}={score:F3} ");
if (score > bestTrimScore) { bestTrimScore = score; bestTrim = v; }
}
Console.Error.WriteLine();
if (bestTrimScore > bestScore)
{
best.TrimCutoff = bestTrim;
bestScore = bestTrimScore;
improved = true;
Console.Error.WriteLine($" → trimCutoff={bestTrim:F2} score={bestScore:F3}");
}
}
Console.Error.WriteLine($" End of round {round + 1}: score={bestScore:F3} {best}");
if (!improved) break;
}

View file

@ -60,18 +60,19 @@ class PythonOcrBridge : IDisposable
/// <summary>
/// Run OCR on an already-saved image file via the Python engine.
/// </summary>
public OcrResponse OcrFromFile(string imagePath, string engine)
public OcrResponse OcrFromFile(string imagePath, string engine, OcrParams? ocrParams = null)
{
EnsureRunning();
var pyReq = new { cmd = "ocr", engine, imagePath };
var pyReq = BuildPythonRequest(engine, ocrParams);
pyReq["imagePath"] = imagePath;
return SendPythonRequest(pyReq);
}
/// <summary>
/// Run OCR on a bitmap via the Python engine (base64 PNG over pipe, no temp file).
/// </summary>
public OcrResponse OcrFromBitmap(Bitmap bitmap, string engine)
public OcrResponse OcrFromBitmap(Bitmap bitmap, string engine, OcrParams? ocrParams = null)
{
EnsureRunning();
@ -79,10 +80,26 @@ class PythonOcrBridge : IDisposable
bitmap.Save(ms, SdImageFormat.Png);
var imageBase64 = Convert.ToBase64String(ms.ToArray());
var pyReq = new { cmd = "ocr", engine, imageBase64 };
var pyReq = BuildPythonRequest(engine, ocrParams);
pyReq["imageBase64"] = imageBase64;
return SendPythonRequest(pyReq);
}
/// <summary>
/// Builds the base request dictionary sent to the Python OCR process.
/// It always contains "cmd" = "ocr" and "engine"; callers add the image
/// payload key afterwards.
/// </summary>
/// <param name="engine">Name of the OCR engine, stored under "engine".</param>
/// <param name="ocrParams">
/// Optional tuning values. When <c>null</c>, only the two base keys are emitted.
/// </param>
/// <returns>A new mutable dictionary holding the request fields.</returns>
private static Dictionary<string, object?> BuildPythonRequest(string engine, OcrParams? ocrParams)
{
    var payload = new Dictionary<string, object?>();
    payload["cmd"] = "ocr";
    payload["engine"] = engine;

    if (ocrParams is not null)
    {
        // mergeGap is only forwarded when it is positive.
        if (ocrParams.MergeGap > 0)
        {
            payload["mergeGap"] = ocrParams.MergeGap;
        }

        // Nullable settings are forwarded only when a value was supplied.
        if (ocrParams.LinkThreshold is double linkThreshold)
        {
            payload["linkThreshold"] = linkThreshold;
        }

        if (ocrParams.TextThreshold is double textThreshold)
        {
            payload["textThreshold"] = textThreshold;
        }

        if (ocrParams.LowText is double lowText)
        {
            payload["lowText"] = lowText;
        }

        if (ocrParams.WidthThs is double widthThs)
        {
            payload["widthThs"] = widthThs;
        }

        if (ocrParams.Paragraph is bool paragraph)
        {
            payload["paragraph"] = paragraph;
        }
    }

    return payload;
}
private OcrResponse SendPythonRequest(object pyReq)
{
var json = JsonSerializer.Serialize(pyReq, JsonOptions);