working on crop

This commit is contained in:
Boki 2026-02-12 17:48:16 -05:00
parent 93e2234c4e
commit f74e3e1c85
12 changed files with 1135 additions and 220 deletions

View file

@ -108,7 +108,7 @@ static class Daemon
var engine = request.Engine ?? "tesseract";
var preprocess = request.Preprocess ?? "none";
var kernelSize = request.Params?.KernelSize ?? 41;
var kernelSize = request.Params?.Ocr.KernelSize ?? 41;
// No preprocess + tesseract = original fast path
if (engine == "tesseract" && preprocess == "none")
@ -155,15 +155,17 @@ static class Daemon
{
var engine = request.Engine ?? "tesseract";
var isPythonEngine = engine is "easyocr" or "paddleocr";
var p = request.Params?.Clone() ?? new DiffOcrParams();
if (request.Threshold > 0) p.DiffThresh = request.Threshold;
var p = request.Params ?? new DiffOcrParams();
var cropParams = p.Crop;
var ocrParams = p.Ocr;
if (request.Threshold > 0) cropParams.DiffThresh = request.Threshold;
// Determine preprocess mode: explicit request.Preprocess > params.UseBackgroundSub > default "bgsub"
string preprocess;
if (request.Preprocess != null)
preprocess = request.Preprocess;
else if (request.Params != null)
preprocess = p.UseBackgroundSub ? "bgsub" : "tophat";
preprocess = ocrParams.UseBackgroundSub ? "bgsub" : "tophat";
else
preprocess = "bgsub";
@ -173,25 +175,25 @@ static class Daemon
var sw = System.Diagnostics.Stopwatch.StartNew();
var cropResult = ocrHandler.DiffCrop(request, p);
var cropResult = ocrHandler.DiffCrop(request, cropParams);
if (cropResult == null)
return new OcrResponse { Text = "", Lines = [] };
var (cropped, refCropped, current, region) = cropResult.Value;
using var _current = current;
// Preprocess
// Preprocess — only sees ocrParams
Bitmap processed;
if (preprocess == "bgsub")
{
int upscale = isPythonEngine ? 1 : p.Upscale;
int upscale = isPythonEngine ? 1 : ocrParams.Upscale;
processed = ImagePreprocessor.PreprocessWithBackgroundSub(
cropped, refCropped, dimPercentile: p.DimPercentile, textThresh: p.TextThresh,
upscale: upscale, softThreshold: p.SoftThreshold);
cropped, refCropped, dimPercentile: ocrParams.DimPercentile, textThresh: ocrParams.TextThresh,
upscale: upscale, softThreshold: ocrParams.SoftThreshold);
}
else if (preprocess == "tophat")
{
processed = ImagePreprocessor.PreprocessForOcr(cropped, kernelSize: p.KernelSize);
processed = ImagePreprocessor.PreprocessForOcr(cropped, kernelSize: ocrParams.KernelSize);
}
else // "none"
{
@ -228,7 +230,7 @@ static class Daemon
}
else // easyocr, paddleocr
{
var ocrResult = pythonBridge.OcrFromBitmap(processed, engine);
var ocrResult = pythonBridge.OcrFromBitmap(processed, engine, ocrParams);
var ocrMs = sw.ElapsedMilliseconds;
Console.Error.WriteLine($" diff-ocr-pipeline: engine={engine} preprocess={preprocess} diff={diffMs}ms ocr={ocrMs}ms crop={region.Width}x{region.Height}");

View file

@ -242,7 +242,7 @@ class TemplateMatchResponse
public double Confidence { get; set; }
}
class DiffOcrParams
sealed class DiffCropParams
{
[JsonPropertyName("diffThresh")]
public int DiffThresh { get; set; } = 20;
@ -259,6 +259,16 @@ class DiffOcrParams
[JsonPropertyName("trimCutoff")]
public double TrimCutoff { get; set; } = 0.4;
[JsonPropertyName("ocrPad")]
public int OcrPad { get; set; } = 10;
public override string ToString() =>
$"diffThresh={DiffThresh} maxGap={MaxGap} trimCutoff={TrimCutoff:F2} rowThreshDiv={RowThreshDiv} colThreshDiv={ColThreshDiv} ocrPad={OcrPad}";
}
sealed class OcrParams
{
// preprocessing
[JsonPropertyName("kernelSize")]
public int KernelSize { get; set; } = 41;
@ -277,9 +287,7 @@ class DiffOcrParams
[JsonPropertyName("softThreshold")]
public bool SoftThreshold { get; set; } = false;
[JsonPropertyName("ocrPad")]
public int OcrPad { get; set; } = 10;
// Tesseract-specific
[JsonPropertyName("usePerLineOcr")]
public bool UsePerLineOcr { get; set; } = false;
@ -292,12 +300,40 @@ class DiffOcrParams
[JsonPropertyName("psm")]
public int Psm { get; set; } = 6;
public DiffOcrParams Clone() => (DiffOcrParams)MemberwiseClone();
// post-merge / Python engine tuning
[JsonPropertyName("mergeGap")]
public int MergeGap { get; set; } = 0;
[JsonPropertyName("linkThreshold")]
public double? LinkThreshold { get; set; }
[JsonPropertyName("textThreshold")]
public double? TextThreshold { get; set; }
[JsonPropertyName("lowText")]
public double? LowText { get; set; }
[JsonPropertyName("widthThs")]
public double? WidthThs { get; set; }
[JsonPropertyName("paragraph")]
public bool? Paragraph { get; set; }
public override string ToString() =>
UseBackgroundSub
? $"bgSub dimPct={DimPercentile} textThresh={TextThresh} soft={SoftThreshold} ocrPad={OcrPad} perLine={UsePerLineOcr} lineGap={LineGapTolerance} linePadY={LinePadY} psm={Psm} diffThresh={DiffThresh} rowThreshDiv={RowThreshDiv} colThreshDiv={ColThreshDiv} maxGap={MaxGap} trimCutoff={TrimCutoff:F2} upscale={Upscale}"
: $"topHat kernelSize={KernelSize} ocrPad={OcrPad} perLine={UsePerLineOcr} lineGap={LineGapTolerance} linePadY={LinePadY} psm={Psm} diffThresh={DiffThresh} rowThreshDiv={RowThreshDiv} colThreshDiv={ColThreshDiv} maxGap={MaxGap} trimCutoff={TrimCutoff:F2} upscale={Upscale}";
? $"bgSub dimPct={DimPercentile} textThresh={TextThresh} soft={SoftThreshold} upscale={Upscale} mergeGap={MergeGap}"
: $"topHat kernel={KernelSize} upscale={Upscale} mergeGap={MergeGap}";
}
// Aggregates the two parameter groups used by the diff-OCR pipeline:
// diff/crop detection settings and preprocessing/OCR settings.
// Deserialized from the request's "params" JSON object.
sealed class DiffOcrParams
{
// Settings for diff-based tooltip region detection and cropping.
[JsonPropertyName("crop")]
public DiffCropParams Crop { get; set; } = new();
// Settings for image preprocessing and the OCR engines.
[JsonPropertyName("ocr")]
public OcrParams Ocr { get; set; } = new();
// Compact dump of both groups; written to stderr by the tuning loops.
public override string ToString() => $"[{Crop}] [{Ocr}]";
}
class TestCase

View file

@ -14,6 +14,7 @@ using SdImageFormat = System.Drawing.Imaging.ImageFormat;
class OcrHandler(TesseractEngine engine)
{
private Bitmap? _referenceFrame;
private RegionRect? _referenceRegion;
public object HandleOcr(Request req)
{
@ -56,31 +57,79 @@ class OcrHandler(TesseractEngine engine)
{
_referenceFrame?.Dispose();
_referenceFrame = ScreenCapture.CaptureOrLoad(req.File, req.Region);
_referenceRegion = req.Region == null
? null
: new RegionRect { X = req.Region.X, Y = req.Region.Y, Width = req.Region.Width, Height = req.Region.Height };
return new OkResponse();
}
public object HandleDiffOcr(Request req) => HandleDiffOcr(req, req.Threshold > 0
? new DiffOcrParams { DiffThresh = req.Threshold }
? new DiffOcrParams { Crop = new DiffCropParams { DiffThresh = req.Threshold } }
: new DiffOcrParams());
/// <summary>
/// Diff detection + crop only. Returns the raw tooltip crop bitmap and region,
/// or null if no tooltip detected. Caller is responsible for disposing the bitmap.
/// </summary>
public (Bitmap cropped, Bitmap refCropped, Bitmap current, RegionRect region)? DiffCrop(Request req, DiffOcrParams p)
public (Bitmap cropped, Bitmap refCropped, Bitmap current, RegionRect region)? DiffCrop(Request req, DiffCropParams c)
{
if (_referenceFrame == null)
return null;
var current = ScreenCapture.CaptureOrLoad(req.File, null);
var diffRegion = req.Region ?? _referenceRegion;
int baseX = diffRegion?.X ?? 0;
int baseY = diffRegion?.Y ?? 0;
var current = ScreenCapture.CaptureOrLoad(req.File, diffRegion);
int w = Math.Min(_referenceFrame.Width, current.Width);
int h = Math.Min(_referenceFrame.Height, current.Height);
Bitmap refForDiff = _referenceFrame;
bool disposeRef = false;
var refData = _referenceFrame.LockBits(new Rectangle(0, 0, w, h), ImageLockMode.ReadOnly, PixelFormat.Format32bppArgb);
if (diffRegion != null)
{
if (_referenceRegion == null)
{
var croppedRef = CropBitmap(_referenceFrame, diffRegion);
if (croppedRef == null)
{
current.Dispose();
return null;
}
refForDiff = croppedRef;
disposeRef = true;
}
else if (!RegionsEqual(diffRegion, _referenceRegion))
{
int offX = diffRegion.X - _referenceRegion.X;
int offY = diffRegion.Y - _referenceRegion.Y;
if (offX < 0 || offY < 0 || offX + diffRegion.Width > _referenceFrame.Width || offY + diffRegion.Height > _referenceFrame.Height)
{
current.Dispose();
return null;
}
var croppedRef = CropBitmap(_referenceFrame, new RegionRect
{
X = offX,
Y = offY,
Width = diffRegion.Width,
Height = diffRegion.Height,
});
if (croppedRef == null)
{
current.Dispose();
return null;
}
refForDiff = croppedRef;
disposeRef = true;
}
}
int w = Math.Min(refForDiff.Width, current.Width);
int h = Math.Min(refForDiff.Height, current.Height);
var refData = refForDiff.LockBits(new Rectangle(0, 0, w, h), ImageLockMode.ReadOnly, PixelFormat.Format32bppArgb);
byte[] refPx = new byte[refData.Stride * h];
Marshal.Copy(refData.Scan0, refPx, 0, refPx.Length);
_referenceFrame.UnlockBits(refData);
refForDiff.UnlockBits(refData);
int stride = refData.Stride;
var curData = current.LockBits(new Rectangle(0, 0, w, h), ImageLockMode.ReadOnly, PixelFormat.Format32bppArgb);
@ -88,7 +137,7 @@ class OcrHandler(TesseractEngine engine)
Marshal.Copy(curData.Scan0, curPx, 0, curPx.Length);
current.UnlockBits(curData);
int diffThresh = p.DiffThresh;
int diffThresh = c.DiffThresh;
// Pass 1: parallel row diff — compute rowCounts[] directly, no changed[] array
int[] rowCounts = new int[h];
@ -112,11 +161,12 @@ class OcrHandler(TesseractEngine engine)
if (totalChanged == 0)
{
current.Dispose();
if (disposeRef) refForDiff.Dispose();
return null;
}
int maxGap = p.MaxGap;
int rowThresh = w / p.RowThreshDiv;
int maxGap = c.MaxGap;
int rowThresh = w / c.RowThreshDiv;
int bestRowStart = 0, bestRowEnd = 0, bestRowLen = 0;
int curRowStart = -1, lastActiveRow = -1;
for (int y = 0; y < h; y++)
@ -180,7 +230,7 @@ class OcrHandler(TesseractEngine engine)
}
int tooltipHeight = bestRowEnd - bestRowStart + 1;
int colThresh = tooltipHeight / p.ColThreshDiv;
int colThresh = tooltipHeight / c.ColThreshDiv;
int bestColStart = 0, bestColEnd = 0, bestColLen = 0;
int curColStart = -1, lastActiveCol = -1;
@ -210,6 +260,7 @@ class OcrHandler(TesseractEngine engine)
{
Console.Error.WriteLine($" diff-ocr: no tooltip-sized region found (rows={bestRowLen}, cols={bestColLen})");
current.Dispose();
if (disposeRef) refForDiff.Dispose();
return null;
}
@ -218,37 +269,73 @@ class OcrHandler(TesseractEngine engine)
int maxX = Math.Min(bestColEnd, w - 1);
int maxY = Math.Min(bestRowEnd, h - 1);
// Trim low-density edges on both axes to avoid oversized crops.
int colSpan = maxX - minX + 1;
if (colSpan > 100)
if (colSpan > 50)
{
int q1 = minX + colSpan / 4;
int q3 = minX + colSpan * 3 / 4;
long midSum = 0;
int midCount = 0;
for (int x = q1; x <= q3; x++) { midSum += colCounts[x]; midCount++; }
double avgMidDensity = (double)midSum / midCount;
double cutoff = avgMidDensity * p.TrimCutoff;
while (maxX > minX + 100 && colCounts[maxX] < cutoff)
double avgMidDensity = (double)midSum / Math.Max(1, midCount);
double cutoff = avgMidDensity * c.TrimCutoff;
while (minX < maxX - 50 && colCounts[minX] < cutoff)
minX++;
while (maxX > minX + 50 && colCounts[maxX] < cutoff)
maxX--;
}
int rowSpan = maxY - minY + 1;
if (rowSpan > 50)
{
int q1 = minY + rowSpan / 4;
int q3 = minY + rowSpan * 3 / 4;
long midSum = 0;
int midCount = 0;
for (int y = q1; y <= q3; y++) { midSum += rowCounts[y]; midCount++; }
double avgMidDensity = (double)midSum / Math.Max(1, midCount);
double cutoff = avgMidDensity * c.TrimCutoff;
while (minY < maxY - 50 && rowCounts[minY] < cutoff)
minY++;
while (maxY > minY + 50 && rowCounts[maxY] < cutoff)
maxY--;
}
int rw = maxX - minX + 1;
int rh = maxY - minY + 1;
var cropped = CropFromBytes(curPx, stride, minX, minY, rw, rh);
var refCropped = CropFromBytes(refPx, stride, minX, minY, rw, rh);
var region = new RegionRect { X = minX, Y = minY, Width = rw, Height = rh };
var region = new RegionRect { X = baseX + minX, Y = baseY + minY, Width = rw, Height = rh };
Console.Error.WriteLine($" diff-ocr: tooltip region ({minX},{minY}) {rw}x{rh}");
if (disposeRef) refForDiff.Dispose();
return (cropped, refCropped, current, region);
}
// True when two regions have identical origin and size. Used to decide
// whether the stored reference frame's region already matches the
// requested diff region (and so needs no re-cropping).
private static bool RegionsEqual(RegionRect a, RegionRect b) =>
a.X == b.X && a.Y == b.Y && a.Width == b.Width && a.Height == b.Height;
// Clones a sub-rectangle of <paramref name="src"/>, clamping the rectangle
// to the bitmap bounds. Returns null when the clamped rectangle is empty
// (i.e. the region lies entirely outside the bitmap).
// NOTE(review): a negative region.X/Y is clamped to 0 but Width/Height are
// not reduced by the clipped amount, so the crop shifts rather than shrinks
// — confirm callers never pass negative origins.
private static Bitmap? CropBitmap(Bitmap src, RegionRect region)
{
int cx = Math.Max(0, region.X);
int cy = Math.Max(0, region.Y);
int cw = Math.Min(region.Width, src.Width - cx);
int ch = Math.Min(region.Height, src.Height - cy);
if (cw <= 0 || ch <= 0)
return null;
return src.Clone(new Rectangle(cx, cy, cw, ch), PixelFormat.Format32bppArgb);
}
public object HandleDiffOcr(Request req, DiffOcrParams p)
{
if (_referenceFrame == null)
return new ErrorResponse("No reference snapshot stored. Send 'snapshot' first.");
var cropResult = DiffCrop(req, p);
var cropResult = DiffCrop(req, p.Crop);
if (cropResult == null)
return new OcrResponse { Text = "", Lines = [] };
@ -270,14 +357,15 @@ class OcrHandler(TesseractEngine engine)
}
// Pre-process for OCR — get Mat for per-line detection and padding
var ocr = p.Ocr;
Mat processedMat;
if (p.UseBackgroundSub)
if (ocr.UseBackgroundSub)
{
processedMat = ImagePreprocessor.PreprocessWithBackgroundSubMat(cropped, refCropped, p.DimPercentile, p.TextThresh, p.Upscale, p.SoftThreshold);
processedMat = ImagePreprocessor.PreprocessWithBackgroundSubMat(cropped, refCropped, ocr.DimPercentile, ocr.TextThresh, ocr.Upscale, ocr.SoftThreshold);
}
else
{
using var topHatBmp = ImagePreprocessor.PreprocessForOcr(cropped, p.KernelSize, p.Upscale);
using var topHatBmp = ImagePreprocessor.PreprocessForOcr(cropped, ocr.KernelSize, ocr.Upscale);
processedMat = BitmapConverter.ToMat(topHatBmp);
}
using var _processedMat = processedMat; // ensure disposal
@ -296,25 +384,25 @@ class OcrHandler(TesseractEngine engine)
if (debug) Console.Error.WriteLine($" diff-ocr: saved preprocessed to {prePath}");
}
int pad = p.OcrPad;
int upscale = p.Upscale > 0 ? p.Upscale : 1;
int pad = p.Crop.OcrPad;
int upscale = ocr.Upscale > 0 ? ocr.Upscale : 1;
var lines = new List<OcrLineResult>();
// Per-line OCR: detect text lines via horizontal projection, OCR each individually
if (p.UsePerLineOcr)
if (ocr.UsePerLineOcr)
{
// DetectTextLines needs binary input; if soft threshold produced grayscale, binarize a copy
int minRowPx = Math.Max(processedMat.Cols / 200, 3);
using var detectionMat = p.SoftThreshold ? new Mat() : null;
if (p.SoftThreshold)
using var detectionMat = ocr.SoftThreshold ? new Mat() : null;
if (ocr.SoftThreshold)
Cv2.Threshold(processedMat, detectionMat!, 128, 255, ThresholdTypes.Binary);
var lineDetectInput = p.SoftThreshold ? detectionMat! : processedMat;
var textLines = ImagePreprocessor.DetectTextLines(lineDetectInput, minRowPixels: minRowPx, gapTolerance: p.LineGapTolerance * upscale);
var lineDetectInput = ocr.SoftThreshold ? detectionMat! : processedMat;
var textLines = ImagePreprocessor.DetectTextLines(lineDetectInput, minRowPixels: minRowPx, gapTolerance: ocr.LineGapTolerance * upscale);
if (debug) Console.Error.WriteLine($" diff-ocr: detected {textLines.Count} text lines");
if (textLines.Count > 0)
{
int linePadY = p.LinePadY;
int linePadY = ocr.LinePadY;
foreach (var (yStart, yEnd) in textLines)
{
int y0 = Math.Max(yStart - linePadY, 0);
@ -330,7 +418,7 @@ class OcrHandler(TesseractEngine engine)
using var lineBmp = BitmapConverter.ToBitmap(padded);
using var linePix = ImageUtils.BitmapToPix(lineBmp);
using var linePage = engine.Process(linePix, (PageSegMode)p.Psm);
using var linePage = engine.Process(linePix, (PageSegMode)ocr.Psm);
// Extract words, adjusting coordinates back to screen space
// Word coords are in padded image space → subtract pad, add line offset, scale to original, add region offset
@ -386,7 +474,7 @@ class OcrHandler(TesseractEngine engine)
Cv2.CopyMakeBorder(processedMat, padded, pad, pad, pad, pad, BorderTypes.Constant, Scalar.White);
using var bmp = BitmapConverter.ToBitmap(padded);
using var pix = ImageUtils.BitmapToPix(bmp);
using var page = engine.Process(pix, (PageSegMode)p.Psm);
using var page = engine.Process(pix, (PageSegMode)ocr.Psm);
var text = page.GetText();
// Adjust word coordinates: subtract padding offset
@ -430,77 +518,161 @@ class OcrHandler(TesseractEngine engine)
public object HandleTest(Request req) => RunTestCases(new DiffOcrParams(), verbose: true);
private static DiffOcrParams CloneParams(DiffOcrParams p)
{
var json = JsonSerializer.Serialize(p);
return JsonSerializer.Deserialize<DiffOcrParams>(json)!;
}
public object HandleTune(Request req)
{
int totalEvals = 0;
// --- Phase 1: Tune top-hat approach ---
Console.Error.WriteLine("\n========== Phase 1: Top-Hat ==========");
var topHat = new DiffOcrParams { UseBackgroundSub = false };
double topHatScore = TuneParams(topHat, ref totalEvals, tuneTopHat: true, tuneBgSub: false);
// --- Phase A: Tune crop params ---
Console.Error.WriteLine("\n========== Phase A: Crop Params ==========");
var best = new DiffOcrParams();
double bestScore = TuneCropParams(best, ref totalEvals);
// --- Phase 2: Tune background-subtraction approach ---
Console.Error.WriteLine("\n========== Phase 2: Background Subtraction ==========");
// Start bgSub from the best detection params found in phase 1
var bgSub = topHat.Clone();
bgSub.UseBackgroundSub = true;
double bgSubScore = TuneParams(bgSub, ref totalEvals, tuneTopHat: false, tuneBgSub: true);
// --- Phase B: Tune OCR params (top-hat) ---
Console.Error.WriteLine("\n========== Phase B: OCR — Top-Hat ==========");
var topHat = CloneParams(best);
topHat.Ocr.UseBackgroundSub = false;
double topHatScore = TuneOcrParams(topHat, ref totalEvals, tuneTopHat: true, tuneBgSub: false);
// --- Phase C: Tune OCR params (background-subtraction) ---
Console.Error.WriteLine("\n========== Phase C: OCR — Background Subtraction ==========");
var bgSub = CloneParams(best);
bgSub.Ocr.UseBackgroundSub = true;
double bgSubScore = TuneOcrParams(bgSub, ref totalEvals, tuneTopHat: false, tuneBgSub: true);
// Pick the winner
var best = bgSubScore > topHatScore ? bgSub : topHat;
double bestScore = Math.Max(topHatScore, bgSubScore);
var winner = bgSubScore > topHatScore ? bgSub : topHat;
double winnerScore = Math.Max(topHatScore, bgSubScore);
Console.Error.WriteLine($"\n========== Result ==========");
Console.Error.WriteLine($" Top-Hat: {topHatScore:F3} {topHat}");
Console.Error.WriteLine($" BgSub: {bgSubScore:F3} {bgSub}");
Console.Error.WriteLine($" Winner: {(best.UseBackgroundSub ? "BgSub" : "TopHat")} evals={totalEvals}\n");
Console.Error.WriteLine($" Winner: {(winner.Ocr.UseBackgroundSub ? "BgSub" : "TopHat")} evals={totalEvals}\n");
// Final verbose report with best params
RunTestCases(best, verbose: true);
RunTestCases(winner, verbose: true);
return new TuneResponse
{
BestScore = bestScore,
BestParams = best,
BestScore = winnerScore,
BestParams = winner,
Iterations = totalEvals,
};
}
private double TuneParams(DiffOcrParams best, ref int totalEvals, bool tuneTopHat, bool tuneBgSub)
private double TuneCropParams(DiffOcrParams best, ref int totalEvals)
{
double bestScore = ScoreParams(best);
Console.Error.WriteLine($" baseline score={bestScore:F3} {best}\n");
// Detection params (shared by both approaches)
var sharedSweeps = new (string Name, int[] Values, Action<DiffOcrParams, int> Set)[]
var cropSweeps = new (string Name, int[] Values, Action<DiffCropParams, int> Set)[]
{
("diffThresh", [10, 15, 20, 25, 30, 40, 50, 60], (p, v) => p.DiffThresh = v),
("rowThreshDiv", [10, 15, 20, 25, 30, 40, 50, 60], (p, v) => p.RowThreshDiv = v),
("colThreshDiv", [5, 8, 10, 12, 15, 20, 25, 30], (p, v) => p.ColThreshDiv = v),
("maxGap", [5, 8, 10, 12, 15, 20, 25, 30], (p, v) => p.MaxGap = v),
("upscale", [1, 2, 3], (p, v) => p.Upscale = v),
("ocrPad", [0, 5, 10, 15, 20, 30], (p, v) => p.OcrPad = v),
("psm", [4, 6, 11, 13], (p, v) => p.Psm = v),
};
// Top-hat specific
var topHatSweeps = new (string Name, int[] Values, Action<DiffOcrParams, int> Set)[]
{
("kernelSize", [11, 15, 19, 21, 25, 31, 35, 41, 51], (p, v) => p.KernelSize = v),
};
// Background-subtraction specific
var bgSubSweeps = new (string Name, int[] Values, Action<DiffOcrParams, int> Set)[]
{
("dimPercentile", [5, 10, 15, 20, 25, 30, 40, 50], (p, v) => p.DimPercentile = v),
("textThresh", [10, 15, 20, 25, 30, 40, 50, 60, 80], (p, v) => p.TextThresh = v),
("lineGapTolerance", [3, 5, 8, 10, 15], (p, v) => p.LineGapTolerance = v),
("linePadY", [5, 10, 15, 20], (p, v) => p.LinePadY = v),
("diffThresh", [10, 15, 20, 25, 30, 40, 50, 60], (c, v) => c.DiffThresh = v),
("rowThreshDiv", [10, 15, 20, 25, 30, 40, 50, 60], (c, v) => c.RowThreshDiv = v),
("colThreshDiv", [5, 8, 10, 12, 15, 20, 25, 30], (c, v) => c.ColThreshDiv = v),
("maxGap", [5, 8, 10, 12, 15, 20, 25, 30], (c, v) => c.MaxGap = v),
("ocrPad", [0, 5, 10, 15, 20, 30], (c, v) => c.OcrPad = v),
};
double[] trimValues = [0.1, 0.15, 0.2, 0.25, 0.3, 0.4, 0.5];
var allIntSweeps = sharedSweeps
const int maxRounds = 3;
for (int round = 0; round < maxRounds; round++)
{
bool improved = false;
Console.Error.WriteLine($"--- Round {round + 1} ---");
foreach (var (name, values, set) in cropSweeps)
{
Console.Error.Write($" {name}: ");
int bestVal = 0;
double bestValScore = -1;
foreach (int v in values)
{
var trial = CloneParams(best);
set(trial.Crop, v);
double score = ScoreParams(trial);
totalEvals++;
Console.Error.Write($"{v}={score:F3} ");
if (score > bestValScore) { bestValScore = score; bestVal = v; }
}
Console.Error.WriteLine();
if (bestValScore > bestScore)
{
set(best.Crop, bestVal);
bestScore = bestValScore;
improved = true;
Console.Error.WriteLine($" → {name}={bestVal} score={bestScore:F3}");
}
}
// Sweep trimCutoff
{
Console.Error.Write($" trimCutoff: ");
double bestTrim = best.Crop.TrimCutoff;
double bestTrimScore = bestScore;
foreach (double v in trimValues)
{
var trial = CloneParams(best);
trial.Crop.TrimCutoff = v;
double score = ScoreParams(trial);
totalEvals++;
Console.Error.Write($"{v:F2}={score:F3} ");
if (score > bestTrimScore) { bestTrimScore = score; bestTrim = v; }
}
Console.Error.WriteLine();
if (bestTrimScore > bestScore)
{
best.Crop.TrimCutoff = bestTrim;
bestScore = bestTrimScore;
improved = true;
Console.Error.WriteLine($" → trimCutoff={bestTrim:F2} score={bestScore:F3}");
}
}
Console.Error.WriteLine($" End of round {round + 1}: score={bestScore:F3} {best}");
if (!improved) break;
}
return bestScore;
}
private double TuneOcrParams(DiffOcrParams best, ref int totalEvals, bool tuneTopHat, bool tuneBgSub)
{
double bestScore = ScoreParams(best);
Console.Error.WriteLine($" baseline score={bestScore:F3} {best}\n");
var sharedOcrSweeps = new (string Name, int[] Values, Action<OcrParams, int> Set)[]
{
("upscale", [1, 2, 3], (o, v) => o.Upscale = v),
("psm", [4, 6, 11, 13], (o, v) => o.Psm = v),
};
// Top-hat specific
var topHatSweeps = new (string Name, int[] Values, Action<OcrParams, int> Set)[]
{
("kernelSize", [11, 15, 19, 21, 25, 31, 35, 41, 51], (o, v) => o.KernelSize = v),
};
// Background-subtraction specific
var bgSubSweeps = new (string Name, int[] Values, Action<OcrParams, int> Set)[]
{
("dimPercentile", [5, 10, 15, 20, 25, 30, 40, 50], (o, v) => o.DimPercentile = v),
("textThresh", [10, 15, 20, 25, 30, 40, 50, 60, 80], (o, v) => o.TextThresh = v),
("lineGapTolerance", [3, 5, 8, 10, 15], (o, v) => o.LineGapTolerance = v),
("linePadY", [5, 10, 15, 20], (o, v) => o.LinePadY = v),
};
var allOcrSweeps = sharedOcrSweeps
.Concat(tuneTopHat ? topHatSweeps : [])
.Concat(tuneBgSub ? bgSubSweeps : [])
.ToArray();
@ -511,7 +683,7 @@ class OcrHandler(TesseractEngine engine)
bool improved = false;
Console.Error.WriteLine($"--- Round {round + 1} ---");
foreach (var (name, values, set) in allIntSweeps)
foreach (var (name, values, set) in allOcrSweeps)
{
Console.Error.Write($" {name}: ");
int bestVal = 0;
@ -519,8 +691,8 @@ class OcrHandler(TesseractEngine engine)
foreach (int v in values)
{
var trial = best.Clone();
set(trial, v);
var trial = CloneParams(best);
set(trial.Ocr, v);
double score = ScoreParams(trial);
totalEvals++;
Console.Error.Write($"{v}={score:F3} ");
@ -530,39 +702,13 @@ class OcrHandler(TesseractEngine engine)
if (bestValScore > bestScore)
{
set(best, bestVal);
set(best.Ocr, bestVal);
bestScore = bestValScore;
improved = true;
Console.Error.WriteLine($" → {name}={bestVal} score={bestScore:F3}");
}
}
// Sweep trimCutoff
{
Console.Error.Write($" trimCutoff: ");
double bestTrim = best.TrimCutoff;
double bestTrimScore = bestScore;
foreach (double v in trimValues)
{
var trial = best.Clone();
trial.TrimCutoff = v;
double score = ScoreParams(trial);
totalEvals++;
Console.Error.Write($"{v:F2}={score:F3} ");
if (score > bestTrimScore) { bestTrimScore = score; bestTrim = v; }
}
Console.Error.WriteLine();
if (bestTrimScore > bestScore)
{
best.TrimCutoff = bestTrim;
bestScore = bestTrimScore;
improved = true;
Console.Error.WriteLine($" → trimCutoff={bestTrim:F2} score={bestScore:F3}");
}
}
Console.Error.WriteLine($" End of round {round + 1}: score={bestScore:F3} {best}");
if (!improved) break;
}

View file

@ -60,18 +60,19 @@ class PythonOcrBridge : IDisposable
/// <summary>
/// Run OCR on an already-saved image file via the Python engine.
/// </summary>
public OcrResponse OcrFromFile(string imagePath, string engine)
public OcrResponse OcrFromFile(string imagePath, string engine, OcrParams? ocrParams = null)
{
EnsureRunning();
var pyReq = new { cmd = "ocr", engine, imagePath };
var pyReq = BuildPythonRequest(engine, ocrParams);
pyReq["imagePath"] = imagePath;
return SendPythonRequest(pyReq);
}
/// <summary>
/// Run OCR on a bitmap via the Python engine (base64 PNG over pipe, no temp file).
/// </summary>
public OcrResponse OcrFromBitmap(Bitmap bitmap, string engine)
public OcrResponse OcrFromBitmap(Bitmap bitmap, string engine, OcrParams? ocrParams = null)
{
EnsureRunning();
@ -79,10 +80,26 @@ class PythonOcrBridge : IDisposable
bitmap.Save(ms, SdImageFormat.Png);
var imageBase64 = Convert.ToBase64String(ms.ToArray());
var pyReq = new { cmd = "ocr", engine, imageBase64 };
var pyReq = BuildPythonRequest(engine, ocrParams);
pyReq["imageBase64"] = imageBase64;
return SendPythonRequest(pyReq);
}
/// <summary>
/// Builds the base request dictionary sent to the Python OCR helper.
/// Tuning values from <paramref name="ocrParams"/> are added only when
/// explicitly set (MergeGap &gt; 0, nullable fields non-null), so the
/// Python side falls back to its own defaults otherwise.
/// </summary>
private static Dictionary<string, object?> BuildPythonRequest(string engine, OcrParams? ocrParams)
{
var request = new Dictionary<string, object?>
{
["cmd"] = "ocr",
["engine"] = engine,
};
if (ocrParams is null)
return request;
if (ocrParams.MergeGap > 0)
request["mergeGap"] = ocrParams.MergeGap;
if (ocrParams.LinkThreshold is double linkThreshold)
request["linkThreshold"] = linkThreshold;
if (ocrParams.TextThreshold is double textThreshold)
request["textThreshold"] = textThreshold;
if (ocrParams.LowText is double lowText)
request["lowText"] = lowText;
if (ocrParams.WidthThs is double widthThs)
request["widthThs"] = widthThs;
if (ocrParams.Paragraph is bool paragraph)
request["paragraph"] = paragraph;
return request;
}
private OcrResponse SendPythonRequest(object pyReq)
{
var json = JsonSerializer.Serialize(pyReq, JsonOptions);

View file

@ -71,6 +71,51 @@ def split_into_words(text, x, y, width, height):
return words
def merge_nearby_detections(items, merge_gap):
    """Coalesce horizontally adjacent OCR detections sharing a baseline.

    items: list of {"text", "x", "y", "w", "h"} boxes.
    Two boxes are joined when their vertical overlap exceeds 50% of the
    shorter box's height AND the horizontal gap between them lies in
    [0, merge_gap]. Returns a new list of copied dicts; the input boxes
    are never mutated. With no items or merge_gap <= 0, the input list
    is returned unchanged.
    """
    if not items or merge_gap <= 0:
        return items
    # Sort by vertical center then left edge so same-line boxes are adjacent.
    ordered = sorted(items, key=lambda box: (box["y"] + box["h"] / 2, box["x"]))
    result = [dict(ordered[0])]
    for box in ordered[1:]:
        prev = result[-1]
        shorter = min(prev["h"], box["h"])
        y_overlap = min(prev["y"] + prev["h"], box["y"] + box["h"]) - max(prev["y"], box["y"])
        gap = box["x"] - (prev["x"] + prev["w"])
        same_line = shorter > 0 and y_overlap / shorter > 0.5
        if same_line and 0 <= gap <= merge_gap:
            # Grow the previous box to the union rectangle and join the text.
            left = min(prev["x"], box["x"])
            top = min(prev["y"], box["y"])
            right = max(prev["x"] + prev["w"], box["x"] + box["w"])
            bottom = max(prev["y"] + prev["h"], box["y"] + box["h"])
            prev["x"], prev["y"] = left, top
            prev["w"], prev["h"] = right - left, bottom - top
            prev["text"] = prev["text"] + " " + box["text"]
        else:
            result.append(dict(box))
    return result
def items_to_response(items):
    """Build an OcrResponse-shaped dict from detection items.

    items: list of {"text", "x", "y", "w", "h"}.
    Each item becomes one entry in "lines" with per-word boxes computed by
    split_into_words; "text" is every line's text joined with newlines.
    """
    line_entries = [
        {
            "text": item["text"],
            "words": split_into_words(item["text"], item["x"], item["y"], item["w"], item["h"]),
        }
        for item in items
    ]
    full_text = "\n".join(item["text"] for item in items)
    return {"ok": True, "text": full_text, "lines": line_entries}
def run_easyocr(image_path):
from PIL import Image
import numpy as np
@ -78,27 +123,28 @@ def run_easyocr(image_path):
return run_easyocr_array(img)
def run_easyocr_array(img):
def run_easyocr_array(img, merge_gap=0, **easyocr_kwargs):
reader = get_easyocr()
# Redirect stdout during inference — easyocr can print warnings
real_stdout = _redirect_stdout_to_stderr()
try:
# batch_size=32: batch GPU recognition of detected text regions
results = reader.readtext(img, batch_size=32)
results = reader.readtext(img, batch_size=32, **easyocr_kwargs)
finally:
_restore_stdout(real_stdout)
# results: [(bbox_4corners, text, conf), ...]
lines = []
all_text_parts = []
items = []
for bbox, text, conf in results:
if not text.strip():
continue
x, y, w, h = bbox_to_rect(bbox)
words = split_into_words(text, x, y, w, h)
lines.append({"text": text.strip(), "words": words})
all_text_parts.append(text.strip())
return {"ok": True, "text": "\n".join(all_text_parts), "lines": lines}
items.append({"text": text.strip(), "x": x, "y": y, "w": w, "h": h})
if merge_gap > 0:
items = merge_nearby_detections(items, merge_gap)
return items_to_response(items)
def get_paddleocr():
@ -106,10 +152,18 @@ def get_paddleocr():
if _paddle_ocr is None:
sys.stderr.write("Loading PaddleOCR model...\n")
sys.stderr.flush()
import os
os.environ.setdefault("PADDLE_PDX_DISABLE_MODEL_SOURCE_CHECK", "True")
real_stdout = _redirect_stdout_to_stderr()
try:
from paddleocr import PaddleOCR
_paddle_ocr = PaddleOCR(use_angle_cls=True, lang="en", use_gpu=True, show_log=False)
_paddle_ocr = PaddleOCR(
use_doc_orientation_classify=False,
use_doc_unwarping=False,
use_textline_orientation=False,
lang="en",
ocr_version="PP-OCRv4",
)
finally:
_restore_stdout(real_stdout)
sys.stderr.write("PaddleOCR model loaded.\n")
@ -117,28 +171,41 @@ def get_paddleocr():
return _paddle_ocr
def run_paddleocr_array(img):
def run_paddleocr_array(img, merge_gap=0):
ocr = get_paddleocr()
# Ensure RGB 3-channel
if len(img.shape) == 2:
import numpy as np
img = np.stack([img, img, img], axis=-1)
elif img.shape[2] == 4:
img = img[:, :, :3]
real_stdout = _redirect_stdout_to_stderr()
try:
results = ocr.ocr(img, cls=True)
results = ocr.predict(img)
finally:
_restore_stdout(real_stdout)
lines = []
all_text_parts = []
# PaddleOCR returns [page_results], each item is [bbox_4corners, (text, conf)]
if results and results[0]:
for item in results[0]:
bbox, (text, conf) = item
items = []
# PaddleOCR 3.4: results is list of OCRResult objects
for res in results:
texts = res.get("rec_texts", []) if hasattr(res, "get") else getattr(res, "rec_texts", [])
polys = res.get("dt_polys", []) if hasattr(res, "get") else getattr(res, "dt_polys", [])
for i, text in enumerate(texts):
if not text.strip():
continue
x, y, w, h = bbox_to_rect(bbox)
words = split_into_words(text, x, y, w, h)
lines.append({"text": text.strip(), "words": words})
all_text_parts.append(text.strip())
return {"ok": True, "text": "\n".join(all_text_parts), "lines": lines}
if i < len(polys):
bbox = polys[i]
x, y, w, h = bbox_to_rect(bbox)
else:
x, y, w, h = 0, 0, 0, 0
items.append({"text": text.strip(), "x": x, "y": y, "w": w, "h": h})
if merge_gap > 0:
items = merge_nearby_detections(items, merge_gap)
return items_to_response(items)
def load_image(req):
@ -170,10 +237,22 @@ def handle_request(req):
if img is None:
return {"ok": False, "error": "Missing imagePath or imageBase64"}
merge_gap = req.get("mergeGap", 0)
if engine == "easyocr":
return run_easyocr_array(img)
easyocr_kwargs = {}
for json_key, py_param in [
("linkThreshold", "link_threshold"),
("textThreshold", "text_threshold"),
("lowText", "low_text"),
("widthThs", "width_ths"),
("paragraph", "paragraph"),
]:
if json_key in req:
easyocr_kwargs[py_param] = req[json_key]
return run_easyocr_array(img, merge_gap=merge_gap, **easyocr_kwargs)
elif engine == "paddleocr":
return run_paddleocr_array(img)
return run_paddleocr_array(img, merge_gap=merge_gap)
else:
return {"ok": False, "error": f"Unknown engine: {engine}"}

484
tools/test-ocr.ts Normal file
View file

@ -0,0 +1,484 @@
/**
* OCR test runner + parameter tuner.
*
* Usage:
* npx tsx tools/test-ocr.ts # test all combos with defaults
* npx tsx tools/test-ocr.ts paddleocr # filter to paddleocr combos
* npx tsx tools/test-ocr.ts --tune # tune all combos (coordinate descent)
* npx tsx tools/test-ocr.ts --tune easyocr # tune only easyocr combos
*/
import { OcrDaemon, type OcrEngine, type OcrPreprocess, type DiffOcrParams, type DiffCropParams, type OcrParams } from '../src/game/OcrDaemon.js';
import { readFileSync } from 'fs';
import { join } from 'path';
// ── Types ──────────────────────────────────────────────────────────────────
/** One OCR fixture loaded from cases.json (see main()). */
interface TestCase {
/** Unique case name, used in log lines. */
id: string;
/** Path (relative to the tessdata dir) of the changed image sent to diff-ocr. */
image: string;
/** Path of the full reference frame sent first as the `snapshot`. */
fullImage: string;
/** Text lines the OCR is expected to recover; scoring matches against these. */
expected: string[];
}
/** One engine × preprocess pairing under test. */
interface Combo {
engine: OcrEngine;
preprocess: OcrPreprocess;
/** Display name, e.g. "tesseract+bgsub"; also matched by the CLI filter arg. */
label: string;
}
/** Outcome of tuning one combo. */
interface TuneResult {
label: string;
/** Best average score achieved, in [0, 1]. */
score: number;
/** Parameter set that produced `score`. */
params: DiffOcrParams;
/** Number of scoreCombo evaluations spent reaching it. */
evals: number;
}
// ── Combos ─────────────────────────────────────────────────────────────────
// Every engine × preprocess combination the runner knows about. A positional
// CLI argument filters this list by substring match on `label`; list order is
// the order combos are tested/tuned.
const ALL_COMBOS: Combo[] = [
{ engine: 'tesseract', preprocess: 'bgsub', label: 'tesseract+bgsub' },
{ engine: 'tesseract', preprocess: 'tophat', label: 'tesseract+tophat' },
{ engine: 'tesseract', preprocess: 'none', label: 'tesseract+none' },
{ engine: 'easyocr', preprocess: 'bgsub', label: 'easyocr+bgsub' },
{ engine: 'easyocr', preprocess: 'tophat', label: 'easyocr+tophat' },
{ engine: 'easyocr', preprocess: 'none', label: 'easyocr+none' },
{ engine: 'paddleocr', preprocess: 'bgsub', label: 'paddleocr+bgsub' },
{ engine: 'paddleocr', preprocess: 'tophat', label: 'paddleocr+tophat' },
{ engine: 'paddleocr', preprocess: 'none', label: 'paddleocr+none' },
];
// ── Scoring ────────────────────────────────────────────────────────────────
/**
 * Levenshtein edit distance between `a` and `b` (unit-cost insert/delete/substitute).
 *
 * Uses the Wagner–Fischer DP with two rolling rows instead of the full
 * (m+1)×(n+1) matrix — same result, O(n) memory instead of O(m·n).
 */
function levenshtein(a: string, b: string): number {
  const m = a.length, n = b.length;
  if (m === 0) return n;
  if (n === 0) return m;
  let prev = new Array<number>(n + 1);
  let curr = new Array<number>(n + 1);
  for (let j = 0; j <= n; j++) prev[j] = j;
  for (let i = 1; i <= m; i++) {
    curr[0] = i;
    for (let j = 1; j <= n; j++) {
      curr[j] = a[i - 1] === b[j - 1]
        ? prev[j - 1]
        : 1 + Math.min(prev[j], curr[j - 1], prev[j - 1]);
    }
    // Swap rows: `prev` becomes the row just computed.
    [prev, curr] = [curr, prev];
  }
  return prev[n];
}
/**
 * Case-insensitive similarity in [0, 1]: 1 means identical, 0 means nothing
 * in common. Defined as 1 - levenshtein/maxLen; two empty strings score 1.
 */
function similarity(a: string, b: string): number {
  const longest = Math.max(a.length, b.length);
  if (longest === 0) return 1;
  const distance = levenshtein(a.toLowerCase(), b.toLowerCase());
  return 1 - distance / longest;
}
/**
 * Fraction of `expected` lines that find a sufficiently similar (>= 0.75),
 * not-yet-claimed line in `actual`. Greedy 1:1 matching in expected order;
 * an empty expectation scores a perfect 1.
 */
function scoreLines(expected: string[], actual: string[]): number {
  if (expected.length === 0) return 1;
  const claimed = new Set<number>();
  let hits = 0;
  for (const want of expected) {
    let candidate = -1;
    let candidateSim = 0;
    actual.forEach((got, idx) => {
      if (claimed.has(idx)) return;
      const sim = similarity(want, got);
      if (sim > candidateSim) { candidateSim = sim; candidate = idx; }
    });
    if (candidate >= 0 && candidateSim >= 0.75) {
      hits++;
      claimed.add(candidate);
    }
  }
  return hits / expected.length;
}
/**
 * Like scoreLines, but also reports which expected lines matched, which were
 * missed, and which actual lines were left unclaimed ("extra"). Same greedy
 * 1:1 matching with a 0.75 similarity floor.
 */
function scoreLinesVerbose(expected: string[], actual: string[]): { matched: string[]; missed: string[]; extra: string[]; score: number } {
  const claimed = new Set<number>();
  const matched: string[] = [];
  const missed: string[] = [];
  for (const want of expected) {
    let candidate = -1;
    let candidateSim = 0;
    for (let i = 0; i < actual.length; i++) {
      if (claimed.has(i)) continue;
      const sim = similarity(want, actual[i]);
      if (sim > candidateSim) { candidateSim = sim; candidate = i; }
    }
    if (candidate >= 0 && candidateSim >= 0.75) {
      matched.push(want);
      claimed.add(candidate);
    } else {
      missed.push(want);
    }
  }
  const extra: string[] = [];
  actual.forEach((line, i) => { if (!claimed.has(i)) extra.push(line); });
  const score = expected.length > 0 ? matched.length / expected.length : 1;
  return { matched, missed, extra, score };
}
// ── Daemon helpers ─────────────────────────────────────────────────────────
/**
 * Run a single test case through the daemon: snapshot the full reference
 * frame, then diff-OCR the changed image with the requested engine,
 * preprocess mode, and params. Returns the non-empty recognized lines.
 */
async function runCase(
  daemon: OcrDaemon,
  tc: TestCase,
  tessdataDir: string,
  engine: OcrEngine,
  preprocess: OcrPreprocess,
  params?: DiffOcrParams,
): Promise<string[]> {
  // The daemon is a Windows process; normalize separators to backslashes.
  const toWinPath = (p: string) => p.replace(/\//g, '\\');
  const refPath = toWinPath(join(tessdataDir, tc.fullImage));
  const diffPath = toWinPath(join(tessdataDir, tc.image));

  await (daemon as any).sendWithRetry({ cmd: 'snapshot', file: refPath }, 10_000);

  const request: any = { cmd: 'diff-ocr', file: diffPath };
  if (engine !== 'tesseract') request.engine = engine;
  if (preprocess !== 'none') request.preprocess = preprocess;
  if (params && Object.keys(params).length > 0) request.params = params;

  // Python engines (easyocr/paddleocr) are far slower, especially on first model load.
  const timeoutMs = engine !== 'tesseract' ? 120_000 : 10_000;
  const response = await (daemon as any).sendWithRetry(request, timeoutMs);

  const recognized: string[] = [];
  for (const line of response.lines ?? []) {
    const text = (line.text ?? '').trim();
    if (text.length > 0) recognized.push(text);
  }
  return recognized;
}
/**
 * Average scoreLines over all cases for one engine/preprocess (and optional
 * params). A case whose run throws contributes 0 to the average.
 */
async function scoreCombo(
  daemon: OcrDaemon,
  cases: TestCase[],
  tessdataDir: string,
  engine: OcrEngine,
  preprocess: OcrPreprocess,
  params?: DiffOcrParams,
): Promise<number> {
  let sum = 0;
  for (const tc of cases) {
    try {
      const lines = await runCase(daemon, tc, tessdataDir, engine, preprocess, params);
      sum += scoreLines(tc.expected, lines);
    } catch {
      // Failed run: score 0 for this case, keep going.
    }
  }
  return sum / cases.length;
}
// ── Parameter sweep definitions ────────────────────────────────────────────
/** Sweep of an integer crop parameter (tuner Phase A). */
interface CropIntSweep {
name: keyof DiffCropParams;
values: number[];
}
/** Sweep of an integer OCR parameter (tuner Phase B). */
interface OcrIntSweep {
name: keyof OcrParams;
values: number[];
}
/** Sweep of a boolean OCR parameter (tuner Phase B). */
interface OcrBoolSweep {
name: keyof OcrParams;
values: boolean[];
}
// Phase A: integer crop parameters swept for every combo.
const CROP_SWEEPS: CropIntSweep[] = [
{ name: 'diffThresh', values: [10, 15, 20, 25, 30, 40, 50] },
{ name: 'maxGap', values: [5, 10, 15, 20, 25, 30] },
];
// trimCutoff is fractional, so it is swept separately from the integer sweeps.
const CROP_TRIM_VALUES = [0.1, 0.15, 0.2, 0.25, 0.3, 0.4, 0.5];
// Phase B: OCR parameters swept regardless of preprocess mode.
const SHARED_OCR_SWEEPS: OcrIntSweep[] = [
{ name: 'upscale', values: [1, 2, 3] },
{ name: 'mergeGap', values: [0, 20, 40, 60, 80, 100] },
];
// Phase B extras used only when preprocess === 'bgsub' (see tuneCombo).
const BGSUB_INT_SWEEPS: OcrIntSweep[] = [
{ name: 'dimPercentile', values: [5, 10, 15, 20, 25, 30, 40, 50, 60] },
{ name: 'textThresh', values: [10, 20, 30, 40, 50, 60, 80, 100] },
];
const BGSUB_BOOL_SWEEPS: OcrBoolSweep[] = [
{ name: 'softThreshold', values: [false, true] },
];
// Phase B extra used only when preprocess === 'tophat'.
const TOPHAT_SWEEPS: OcrIntSweep[] = [
{ name: 'kernelSize', values: [11, 15, 21, 25, 31, 41, 51, 61] },
];
// ── Default params per preprocess ──────────────────────────────────────────
/**
 * Baseline DiffOcrParams for a preprocess mode; the tuner starts from these
 * and the test mode runs them as-is.
 */
function defaultParams(preprocess: OcrPreprocess): DiffOcrParams {
  const crop: DiffCropParams = { diffThresh: 20, maxGap: 20, trimCutoff: 0.4 };
  switch (preprocess) {
    case 'bgsub':
      return { crop, ocr: { useBackgroundSub: true, upscale: 2, dimPercentile: 40, textThresh: 60, softThreshold: false } };
    case 'tophat':
      return { crop, ocr: { useBackgroundSub: false, upscale: 2, kernelSize: 41 } };
    default:
      // 'none': no preprocess-specific knobs, only upscaling.
      return { crop, ocr: { upscale: 2 } };
  }
}
/**
 * Deep-copy a parameter set so a trial mutation never leaks into the
 * committed params. structuredClone is faster than a JSON round-trip and,
 * unlike JSON.parse(JSON.stringify(...)), preserves `undefined`-valued keys.
 */
function cloneParams(p: DiffOcrParams): DiffOcrParams {
  return structuredClone(p);
}
// ── Coordinate descent tuner (two-phase: crop then OCR) ──────────────────
/**
 * Two-phase coordinate-descent tuner for one engine+preprocess combo.
 *
 * Phase A sweeps the crop parameters (CROP_SWEEPS plus the fractional
 * trimCutoff); once a full round yields no improvement they are locked.
 * Phase B then sweeps the OCR parameters: the shared sweeps plus the
 * preprocess-specific extras (bgsub ints/bools or tophat kernelSize).
 * Each phase runs at most MAX_ROUNDS rounds. A candidate value is committed
 * only when it STRICTLY beats the current best average score, so ties keep
 * the earlier/default value.
 *
 * Returns the best score, the params that produced it, and the eval count.
 */
async function tuneCombo(
  daemon: OcrDaemon,
  cases: TestCase[],
  tessdataDir: string,
  combo: Combo,
): Promise<TuneResult> {
  const params = defaultParams(combo.preprocess);
  let bestScore = await scoreCombo(daemon, cases, tessdataDir, combo.engine, combo.preprocess, params);
  let evals = 1;
  process.stderr.write(` baseline: ${(bestScore * 100).toFixed(1)}% ${JSON.stringify(params)}\n`);

  // Sweep one parameter (under params.crop or params.ocr) across `values`,
  // committing the best value only if it strictly improves bestScore.
  // Returns true when a value was committed. This replaces three copies of
  // the same loop (crop ints, trimCutoff, OCR ints/bools) in the original.
  const sweepOne = async (target: 'crop' | 'ocr', name: string, values: (number | boolean)[]): Promise<boolean> => {
    process.stderr.write(` ${target}.${name}: `);
    let bestVal: number | boolean | undefined;
    let bestValScore = -1;
    for (const v of values) {
      const trial = cloneParams(params);
      (trial[target] as any)[name] = v;
      const score = await scoreCombo(daemon, cases, tessdataDir, combo.engine, combo.preprocess, trial);
      evals++;
      process.stderr.write(`${v}=${(score * 100).toFixed(1)} `);
      if (score > bestValScore) { bestValScore = score; bestVal = v; }
    }
    process.stderr.write('\n');
    if (bestValScore > bestScore && bestVal !== undefined) {
      (params[target] as any)[name] = bestVal;
      bestScore = bestValScore;
      process.stderr.write(` -> ${target}.${name}=${bestVal} score=${(bestScore * 100).toFixed(1)}%\n`);
      return true;
    }
    return false;
  };

  const MAX_ROUNDS = 3;

  // ── Phase A: Tune crop params ──
  process.stderr.write(`\n === Phase A: Crop Params ===\n`);
  for (let round = 0; round < MAX_ROUNDS; round++) {
    let improved = false;
    process.stderr.write(` --- Crop Round ${round + 1} ---\n`);
    for (const { name, values } of CROP_SWEEPS) {
      if (await sweepOne('crop', name, values)) improved = true;
    }
    // trimCutoff is fractional, hence its own value list.
    if (await sweepOne('crop', 'trimCutoff', CROP_TRIM_VALUES)) improved = true;
    process.stderr.write(` End crop round ${round + 1}: ${(bestScore * 100).toFixed(1)}% (${evals} evals)\n`);
    if (!improved) break;
  }

  // ── Phase B: Tune OCR params (crop is now locked) ──
  process.stderr.write(`\n === Phase B: OCR Params (crop locked) ===\n`);
  const ocrIntSweeps: OcrIntSweep[] = [...SHARED_OCR_SWEEPS];
  const ocrBoolSweeps: OcrBoolSweep[] = [];
  if (combo.preprocess === 'bgsub') {
    ocrIntSweeps.push(...BGSUB_INT_SWEEPS);
    ocrBoolSweeps.push(...BGSUB_BOOL_SWEEPS);
  } else if (combo.preprocess === 'tophat') {
    ocrIntSweeps.push(...TOPHAT_SWEEPS);
  }
  for (let round = 0; round < MAX_ROUNDS; round++) {
    let improved = false;
    process.stderr.write(` --- OCR Round ${round + 1} ---\n`);
    for (const { name, values } of ocrIntSweeps) {
      if (await sweepOne('ocr', name, values)) improved = true;
    }
    for (const { name, values } of ocrBoolSweeps) {
      if (await sweepOne('ocr', name, values)) improved = true;
    }
    process.stderr.write(` End OCR round ${round + 1}: ${(bestScore * 100).toFixed(1)}% (${evals} evals)\n`);
    if (!improved) break;
  }

  return { label: combo.label, score: bestScore, params, evals };
}
// ── Verbose test run ───────────────────────────────────────────────────────
/**
 * Verbose test run for one combo: prints PASS/FAIL per case with missed and
 * extra lines, and returns the average score across all cases. A case that
 * throws is logged as [ERROR] and contributes 0.
 */
async function testCombo(
  daemon: OcrDaemon,
  cases: TestCase[],
  tessdataDir: string,
  combo: Combo,
  params?: DiffOcrParams,
): Promise<number> {
  let sum = 0;
  for (const tc of cases) {
    try {
      const actual = await runCase(daemon, tc, tessdataDir, combo.engine, combo.preprocess, params);
      const { matched, missed, extra, score } = scoreLinesVerbose(tc.expected, actual);
      sum += score;
      const status = missed.length === 0 ? 'PASS' : 'FAIL';
      console.log(` [${status}] ${tc.id} matched=${matched.length}/${tc.expected.length} extra=${extra.length} score=${score.toFixed(2)}`);
      missed.forEach(m => console.log(` MISS: ${m}`));
      extra.forEach(e => console.log(` EXTRA: ${e}`));
    } catch (err: any) {
      console.log(` [ERROR] ${tc.id}: ${err.message}`);
    }
  }
  return sum / cases.length;
}
// ── Main ───────────────────────────────────────────────────────────────────
// Entry point. TEST mode runs every (filtered) combo with default params;
// TUNE mode runs the coordinate-descent tuner per combo, then replays the
// best params verbosely. Both modes end with a sorted summary bar chart.
async function main() {
  const args = process.argv.slice(2);
  const tuneMode = args.includes('--tune');
  // First non-flag argument is a substring filter on combo labels (e.g. "paddleocr").
  const filterArg = args.find(a => !a.startsWith('--'))?.toLowerCase();
  const combos = filterArg
    ? ALL_COMBOS.filter(c => c.label.includes(filterArg))
    : ALL_COMBOS;
  // Fixtures (cases.json + images) live in the built daemon's tessdata folder.
  const tessdataDir = join('tools', 'OcrDaemon', 'bin', 'Release', 'net8.0-windows10.0.19041.0', 'tessdata');
  const casesPath = join(tessdataDir, 'cases.json');
  const cases: TestCase[] = JSON.parse(readFileSync(casesPath, 'utf-8'));
  console.log(`Loaded ${cases.length} test cases: ${cases.map(c => c.id).join(', ')}`);
  console.log(`Mode: ${tuneMode ? 'TUNE' : 'TEST'} Combos: ${combos.length}\n`);
  // NOTE(review): daemon.stop() below is not in a finally block — if anything
  // throws outside the per-combo try/catch, a spawned daemon process could
  // linger. Confirm OcrDaemon spawns lazily / cleans up on parent exit.
  const daemon = new OcrDaemon();
  if (tuneMode) {
    // ── Tune mode: coordinate descent for each combo ──
    const tuneResults: TuneResult[] = [];
    for (const combo of combos) {
      console.log(`\n${'='.repeat(60)}`);
      console.log(` TUNING: ${combo.label}`);
      console.log(`${'='.repeat(60)}`);
      try {
        const result = await tuneCombo(daemon, cases, tessdataDir, combo);
        tuneResults.push(result);
        console.log(`\n Best: ${(result.score * 100).toFixed(1)}% (${result.evals} evals)`);
        console.log(` Params: ${JSON.stringify(result.params)}`);
        // Verbose run with best params
        console.log('');
        await testCombo(daemon, cases, tessdataDir, combo, result.params);
      } catch (err: any) {
        // A combo that fails entirely still appears in the summary with score 0.
        console.log(` ERROR: ${err.message}`);
        tuneResults.push({ label: combo.label, score: 0, params: {}, evals: 0 });
      }
    }
    // Summary
    console.log(`\n${'='.repeat(70)}`);
    console.log(' TUNE RESULTS');
    console.log(`${'='.repeat(70)}`);
    // Best score first; sort() mutates tuneResults in place, which is fine here.
    const sorted = tuneResults.sort((a, b) => b.score - a.score);
    for (const r of sorted) {
      const bar = '#'.repeat(Math.round(r.score * 40));
      console.log(` ${r.label.padEnd(22)} ${(r.score * 100).toFixed(1).padStart(5)}% ${bar}`);
    }
    console.log(`\n BEST PARAMS PER COMBO:`);
    for (const r of sorted) {
      if (r.score > 0) {
        console.log(` ${r.label.padEnd(22)} ${JSON.stringify(r.params)}`);
      }
    }
  } else {
    // ── Test mode: defaults only ──
    const results: Record<string, number> = {};
    for (const combo of combos) {
      console.log(`\n${'='.repeat(60)}`);
      console.log(` ${combo.label}`);
      console.log(`${'='.repeat(60)}`);
      try {
        const score = await testCombo(daemon, cases, tessdataDir, combo);
        results[combo.label] = score;
        console.log(`\n Average: ${(score * 100).toFixed(1)}%`);
      } catch (err: any) {
        console.log(` ERROR: ${err.message}`);
        results[combo.label] = 0;
      }
    }
    console.log(`\n${'='.repeat(60)}`);
    console.log(' SUMMARY');
    console.log(`${'='.repeat(60)}`);
    const sorted = Object.entries(results).sort((a, b) => b[1] - a[1]);
    for (const [label, score] of sorted) {
      const bar = '#'.repeat(Math.round(score * 40));
      console.log(` ${label.padEnd(22)} ${(score * 100).toFixed(1).padStart(5)}% ${bar}`);
    }
  }
  await daemon.stop();
}
// Top-level error handler: print the failure and exit non-zero so callers/CI notice.
main().catch(err => {
  console.error(err);
  process.exit(1);
});