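Work in progress on diff crop: region-aware diffing, row/column edge trimming, and separate crop/OCR parameter tuning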

This commit is contained in:
Boki 2026-02-12 17:48:16 -05:00
parent 93e2234c4e
commit f74e3e1c85
12 changed files with 1135 additions and 220 deletions

View file

@ -14,6 +14,7 @@ using SdImageFormat = System.Drawing.Imaging.ImageFormat;
class OcrHandler(TesseractEngine engine)
{
private Bitmap? _referenceFrame;
private RegionRect? _referenceRegion;
public object HandleOcr(Request req)
{
@ -56,31 +57,79 @@ class OcrHandler(TesseractEngine engine)
{
_referenceFrame?.Dispose();
_referenceFrame = ScreenCapture.CaptureOrLoad(req.File, req.Region);
_referenceRegion = req.Region == null
? null
: new RegionRect { X = req.Region.X, Y = req.Region.Y, Width = req.Region.Width, Height = req.Region.Height };
return new OkResponse();
}
public object HandleDiffOcr(Request req) => HandleDiffOcr(req, req.Threshold > 0
? new DiffOcrParams { DiffThresh = req.Threshold }
? new DiffOcrParams { Crop = new DiffCropParams { DiffThresh = req.Threshold } }
: new DiffOcrParams());
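/// <summary>
/// Diff detection + crop only. Returns the tooltip crop of the current capture, the matching
/// crop of the reference frame, the current capture itself, and the detected region,
/// or null if no reference frame is stored or no tooltip is detected.
/// Caller is responsible for disposing all three returned bitmaps.
/// </summary>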
public (Bitmap cropped, Bitmap refCropped, Bitmap current, RegionRect region)? DiffCrop(Request req, DiffOcrParams p)
public (Bitmap cropped, Bitmap refCropped, Bitmap current, RegionRect region)? DiffCrop(Request req, DiffCropParams c)
{
if (_referenceFrame == null)
return null;
var current = ScreenCapture.CaptureOrLoad(req.File, null);
var diffRegion = req.Region ?? _referenceRegion;
int baseX = diffRegion?.X ?? 0;
int baseY = diffRegion?.Y ?? 0;
var current = ScreenCapture.CaptureOrLoad(req.File, diffRegion);
int w = Math.Min(_referenceFrame.Width, current.Width);
int h = Math.Min(_referenceFrame.Height, current.Height);
Bitmap refForDiff = _referenceFrame;
bool disposeRef = false;
var refData = _referenceFrame.LockBits(new Rectangle(0, 0, w, h), ImageLockMode.ReadOnly, PixelFormat.Format32bppArgb);
if (diffRegion != null)
{
if (_referenceRegion == null)
{
var croppedRef = CropBitmap(_referenceFrame, diffRegion);
if (croppedRef == null)
{
current.Dispose();
return null;
}
refForDiff = croppedRef;
disposeRef = true;
}
else if (!RegionsEqual(diffRegion, _referenceRegion))
{
int offX = diffRegion.X - _referenceRegion.X;
int offY = diffRegion.Y - _referenceRegion.Y;
if (offX < 0 || offY < 0 || offX + diffRegion.Width > _referenceFrame.Width || offY + diffRegion.Height > _referenceFrame.Height)
{
current.Dispose();
return null;
}
var croppedRef = CropBitmap(_referenceFrame, new RegionRect
{
X = offX,
Y = offY,
Width = diffRegion.Width,
Height = diffRegion.Height,
});
if (croppedRef == null)
{
current.Dispose();
return null;
}
refForDiff = croppedRef;
disposeRef = true;
}
}
int w = Math.Min(refForDiff.Width, current.Width);
int h = Math.Min(refForDiff.Height, current.Height);
var refData = refForDiff.LockBits(new Rectangle(0, 0, w, h), ImageLockMode.ReadOnly, PixelFormat.Format32bppArgb);
byte[] refPx = new byte[refData.Stride * h];
Marshal.Copy(refData.Scan0, refPx, 0, refPx.Length);
_referenceFrame.UnlockBits(refData);
refForDiff.UnlockBits(refData);
int stride = refData.Stride;
var curData = current.LockBits(new Rectangle(0, 0, w, h), ImageLockMode.ReadOnly, PixelFormat.Format32bppArgb);
@ -88,7 +137,7 @@ class OcrHandler(TesseractEngine engine)
Marshal.Copy(curData.Scan0, curPx, 0, curPx.Length);
current.UnlockBits(curData);
int diffThresh = p.DiffThresh;
int diffThresh = c.DiffThresh;
// Pass 1: parallel row diff — compute rowCounts[] directly, no changed[] array
int[] rowCounts = new int[h];
@ -112,11 +161,12 @@ class OcrHandler(TesseractEngine engine)
if (totalChanged == 0)
{
current.Dispose();
if (disposeRef) refForDiff.Dispose();
return null;
}
int maxGap = p.MaxGap;
int rowThresh = w / p.RowThreshDiv;
int maxGap = c.MaxGap;
int rowThresh = w / c.RowThreshDiv;
int bestRowStart = 0, bestRowEnd = 0, bestRowLen = 0;
int curRowStart = -1, lastActiveRow = -1;
for (int y = 0; y < h; y++)
@ -180,7 +230,7 @@ class OcrHandler(TesseractEngine engine)
}
int tooltipHeight = bestRowEnd - bestRowStart + 1;
int colThresh = tooltipHeight / p.ColThreshDiv;
int colThresh = tooltipHeight / c.ColThreshDiv;
int bestColStart = 0, bestColEnd = 0, bestColLen = 0;
int curColStart = -1, lastActiveCol = -1;
@ -210,6 +260,7 @@ class OcrHandler(TesseractEngine engine)
{
Console.Error.WriteLine($" diff-ocr: no tooltip-sized region found (rows={bestRowLen}, cols={bestColLen})");
current.Dispose();
if (disposeRef) refForDiff.Dispose();
return null;
}
@ -218,37 +269,73 @@ class OcrHandler(TesseractEngine engine)
int maxX = Math.Min(bestColEnd, w - 1);
int maxY = Math.Min(bestRowEnd, h - 1);
// Trim low-density edges on both axes to avoid oversized crops.
int colSpan = maxX - minX + 1;
if (colSpan > 100)
if (colSpan > 50)
{
int q1 = minX + colSpan / 4;
int q3 = minX + colSpan * 3 / 4;
long midSum = 0;
int midCount = 0;
for (int x = q1; x <= q3; x++) { midSum += colCounts[x]; midCount++; }
double avgMidDensity = (double)midSum / midCount;
double cutoff = avgMidDensity * p.TrimCutoff;
while (maxX > minX + 100 && colCounts[maxX] < cutoff)
double avgMidDensity = (double)midSum / Math.Max(1, midCount);
double cutoff = avgMidDensity * c.TrimCutoff;
while (minX < maxX - 50 && colCounts[minX] < cutoff)
minX++;
while (maxX > minX + 50 && colCounts[maxX] < cutoff)
maxX--;
}
int rowSpan = maxY - minY + 1;
if (rowSpan > 50)
{
int q1 = minY + rowSpan / 4;
int q3 = minY + rowSpan * 3 / 4;
long midSum = 0;
int midCount = 0;
for (int y = q1; y <= q3; y++) { midSum += rowCounts[y]; midCount++; }
double avgMidDensity = (double)midSum / Math.Max(1, midCount);
double cutoff = avgMidDensity * c.TrimCutoff;
while (minY < maxY - 50 && rowCounts[minY] < cutoff)
minY++;
while (maxY > minY + 50 && rowCounts[maxY] < cutoff)
maxY--;
}
int rw = maxX - minX + 1;
int rh = maxY - minY + 1;
var cropped = CropFromBytes(curPx, stride, minX, minY, rw, rh);
var refCropped = CropFromBytes(refPx, stride, minX, minY, rw, rh);
var region = new RegionRect { X = minX, Y = minY, Width = rw, Height = rh };
var region = new RegionRect { X = baseX + minX, Y = baseY + minY, Width = rw, Height = rh };
Console.Error.WriteLine($" diff-ocr: tooltip region ({minX},{minY}) {rw}x{rh}");
if (disposeRef) refForDiff.Dispose();
return (cropped, refCropped, current, region);
}
/// <summary>
/// Reports whether two regions describe the same rectangle, i.e. their
/// X, Y, Width and Height values all match.
/// </summary>
/// <param name="a">First region to compare.</param>
/// <param name="b">Second region to compare.</param>
/// <returns>true when all four values are equal; otherwise false.</returns>
private static bool RegionsEqual(RegionRect a, RegionRect b)
{
    // Different origin: no need to look at the size.
    if (a.X != b.X || a.Y != b.Y)
        return false;

    return a.Width == b.Width && a.Height == b.Height;
}
/// <summary>
/// Copies the part of <paramref name="src"/> selected by <paramref name="region"/>
/// into a new 32bpp ARGB bitmap.
/// </summary>
/// <param name="src">Bitmap to copy pixels from.</param>
/// <param name="region">Requested rectangle, in <paramref name="src"/> pixel coordinates.</param>
/// <returns>The new bitmap, or null when the clamped rectangle has no area.</returns>
private static Bitmap? CropBitmap(Bitmap src, RegionRect region)
{
    // A negative origin is pulled up to 0. The requested size is NOT reduced to
    // compensate, so the copy can extend past the originally requested far edge.
    int left = Math.Max(0, region.X);
    int top = Math.Max(0, region.Y);

    // The size is capped by what remains of the source right of / below the origin.
    int roomRight = src.Width - left;
    int roomBelow = src.Height - top;
    int width = Math.Min(region.Width, roomRight);
    int height = Math.Min(region.Height, roomBelow);

    bool hasArea = width > 0 && height > 0;
    if (!hasArea)
        return null;

    var bounds = new Rectangle(left, top, width, height);
    return src.Clone(bounds, PixelFormat.Format32bppArgb);
}
public object HandleDiffOcr(Request req, DiffOcrParams p)
{
if (_referenceFrame == null)
return new ErrorResponse("No reference snapshot stored. Send 'snapshot' first.");
var cropResult = DiffCrop(req, p);
var cropResult = DiffCrop(req, p.Crop);
if (cropResult == null)
return new OcrResponse { Text = "", Lines = [] };
@ -270,14 +357,15 @@ class OcrHandler(TesseractEngine engine)
}
// Pre-process for OCR — get Mat for per-line detection and padding
var ocr = p.Ocr;
Mat processedMat;
if (p.UseBackgroundSub)
if (ocr.UseBackgroundSub)
{
processedMat = ImagePreprocessor.PreprocessWithBackgroundSubMat(cropped, refCropped, p.DimPercentile, p.TextThresh, p.Upscale, p.SoftThreshold);
processedMat = ImagePreprocessor.PreprocessWithBackgroundSubMat(cropped, refCropped, ocr.DimPercentile, ocr.TextThresh, ocr.Upscale, ocr.SoftThreshold);
}
else
{
using var topHatBmp = ImagePreprocessor.PreprocessForOcr(cropped, p.KernelSize, p.Upscale);
using var topHatBmp = ImagePreprocessor.PreprocessForOcr(cropped, ocr.KernelSize, ocr.Upscale);
processedMat = BitmapConverter.ToMat(topHatBmp);
}
using var _processedMat = processedMat; // ensure disposal
@ -296,25 +384,25 @@ class OcrHandler(TesseractEngine engine)
if (debug) Console.Error.WriteLine($" diff-ocr: saved preprocessed to {prePath}");
}
int pad = p.OcrPad;
int upscale = p.Upscale > 0 ? p.Upscale : 1;
int pad = p.Crop.OcrPad;
int upscale = ocr.Upscale > 0 ? ocr.Upscale : 1;
var lines = new List<OcrLineResult>();
// Per-line OCR: detect text lines via horizontal projection, OCR each individually
if (p.UsePerLineOcr)
if (ocr.UsePerLineOcr)
{
// DetectTextLines needs binary input; if soft threshold produced grayscale, binarize a copy
int minRowPx = Math.Max(processedMat.Cols / 200, 3);
using var detectionMat = p.SoftThreshold ? new Mat() : null;
if (p.SoftThreshold)
using var detectionMat = ocr.SoftThreshold ? new Mat() : null;
if (ocr.SoftThreshold)
Cv2.Threshold(processedMat, detectionMat!, 128, 255, ThresholdTypes.Binary);
var lineDetectInput = p.SoftThreshold ? detectionMat! : processedMat;
var textLines = ImagePreprocessor.DetectTextLines(lineDetectInput, minRowPixels: minRowPx, gapTolerance: p.LineGapTolerance * upscale);
var lineDetectInput = ocr.SoftThreshold ? detectionMat! : processedMat;
var textLines = ImagePreprocessor.DetectTextLines(lineDetectInput, minRowPixels: minRowPx, gapTolerance: ocr.LineGapTolerance * upscale);
if (debug) Console.Error.WriteLine($" diff-ocr: detected {textLines.Count} text lines");
if (textLines.Count > 0)
{
int linePadY = p.LinePadY;
int linePadY = ocr.LinePadY;
foreach (var (yStart, yEnd) in textLines)
{
int y0 = Math.Max(yStart - linePadY, 0);
@ -330,7 +418,7 @@ class OcrHandler(TesseractEngine engine)
using var lineBmp = BitmapConverter.ToBitmap(padded);
using var linePix = ImageUtils.BitmapToPix(lineBmp);
using var linePage = engine.Process(linePix, (PageSegMode)p.Psm);
using var linePage = engine.Process(linePix, (PageSegMode)ocr.Psm);
// Extract words, adjusting coordinates back to screen space
// Word coords are in padded image space → subtract pad, add line offset, scale to original, add region offset
@ -386,7 +474,7 @@ class OcrHandler(TesseractEngine engine)
Cv2.CopyMakeBorder(processedMat, padded, pad, pad, pad, pad, BorderTypes.Constant, Scalar.White);
using var bmp = BitmapConverter.ToBitmap(padded);
using var pix = ImageUtils.BitmapToPix(bmp);
using var page = engine.Process(pix, (PageSegMode)p.Psm);
using var page = engine.Process(pix, (PageSegMode)ocr.Psm);
var text = page.GetText();
// Adjust word coordinates: subtract padding offset
@ -430,77 +518,161 @@ class OcrHandler(TesseractEngine engine)
/// <summary>
/// Runs the test cases with a default-constructed <see cref="DiffOcrParams"/> and
/// verbose output enabled. <paramref name="req"/> is not read.
/// </summary>
public object HandleTest(Request req)
{
    var defaults = new DiffOcrParams();
    return RunTestCases(defaults, verbose: true);
}
/// <summary>
/// Builds a new <see cref="DiffOcrParams"/> from a JSON round trip of
/// <paramref name="p"/>: the instance is serialized and the resulting text is
/// deserialized into a fresh object. Only state that the serializer writes and
/// reads back is carried over.
/// </summary>
/// <param name="p">Parameters to copy.</param>
/// <returns>The deserialized copy.</returns>
private static DiffOcrParams CloneParams(DiffOcrParams p) =>
    JsonSerializer.Deserialize<DiffOcrParams>(JsonSerializer.Serialize(p))!;
public object HandleTune(Request req)
{
int totalEvals = 0;
// --- Phase 1: Tune top-hat approach ---
Console.Error.WriteLine("\n========== Phase 1: Top-Hat ==========");
var topHat = new DiffOcrParams { UseBackgroundSub = false };
double topHatScore = TuneParams(topHat, ref totalEvals, tuneTopHat: true, tuneBgSub: false);
// --- Phase A: Tune crop params ---
Console.Error.WriteLine("\n========== Phase A: Crop Params ==========");
var best = new DiffOcrParams();
double bestScore = TuneCropParams(best, ref totalEvals);
// --- Phase 2: Tune background-subtraction approach ---
Console.Error.WriteLine("\n========== Phase 2: Background Subtraction ==========");
// Start bgSub from the best detection params found in phase 1
var bgSub = topHat.Clone();
bgSub.UseBackgroundSub = true;
double bgSubScore = TuneParams(bgSub, ref totalEvals, tuneTopHat: false, tuneBgSub: true);
// --- Phase B: Tune OCR params (top-hat) ---
Console.Error.WriteLine("\n========== Phase B: OCR — Top-Hat ==========");
var topHat = CloneParams(best);
topHat.Ocr.UseBackgroundSub = false;
double topHatScore = TuneOcrParams(topHat, ref totalEvals, tuneTopHat: true, tuneBgSub: false);
// --- Phase C: Tune OCR params (background-subtraction) ---
Console.Error.WriteLine("\n========== Phase C: OCR — Background Subtraction ==========");
var bgSub = CloneParams(best);
bgSub.Ocr.UseBackgroundSub = true;
double bgSubScore = TuneOcrParams(bgSub, ref totalEvals, tuneTopHat: false, tuneBgSub: true);
// Pick the winner
var best = bgSubScore > topHatScore ? bgSub : topHat;
double bestScore = Math.Max(topHatScore, bgSubScore);
var winner = bgSubScore > topHatScore ? bgSub : topHat;
double winnerScore = Math.Max(topHatScore, bgSubScore);
Console.Error.WriteLine($"\n========== Result ==========");
Console.Error.WriteLine($" Top-Hat: {topHatScore:F3} {topHat}");
Console.Error.WriteLine($" BgSub: {bgSubScore:F3} {bgSub}");
Console.Error.WriteLine($" Winner: {(best.UseBackgroundSub ? "BgSub" : "TopHat")} evals={totalEvals}\n");
Console.Error.WriteLine($" Winner: {(winner.Ocr.UseBackgroundSub ? "BgSub" : "TopHat")} evals={totalEvals}\n");
// Final verbose report with best params
RunTestCases(best, verbose: true);
RunTestCases(winner, verbose: true);
return new TuneResponse
{
BestScore = bestScore,
BestParams = best,
BestScore = winnerScore,
BestParams = winner,
Iterations = totalEvals,
};
}
private double TuneParams(DiffOcrParams best, ref int totalEvals, bool tuneTopHat, bool tuneBgSub)
private double TuneCropParams(DiffOcrParams best, ref int totalEvals)
{
double bestScore = ScoreParams(best);
Console.Error.WriteLine($" baseline score={bestScore:F3} {best}\n");
// Detection params (shared by both approaches)
var sharedSweeps = new (string Name, int[] Values, Action<DiffOcrParams, int> Set)[]
var cropSweeps = new (string Name, int[] Values, Action<DiffCropParams, int> Set)[]
{
("diffThresh", [10, 15, 20, 25, 30, 40, 50, 60], (p, v) => p.DiffThresh = v),
("rowThreshDiv", [10, 15, 20, 25, 30, 40, 50, 60], (p, v) => p.RowThreshDiv = v),
("colThreshDiv", [5, 8, 10, 12, 15, 20, 25, 30], (p, v) => p.ColThreshDiv = v),
("maxGap", [5, 8, 10, 12, 15, 20, 25, 30], (p, v) => p.MaxGap = v),
("upscale", [1, 2, 3], (p, v) => p.Upscale = v),
("ocrPad", [0, 5, 10, 15, 20, 30], (p, v) => p.OcrPad = v),
("psm", [4, 6, 11, 13], (p, v) => p.Psm = v),
};
// Top-hat specific
var topHatSweeps = new (string Name, int[] Values, Action<DiffOcrParams, int> Set)[]
{
("kernelSize", [11, 15, 19, 21, 25, 31, 35, 41, 51], (p, v) => p.KernelSize = v),
};
// Background-subtraction specific
var bgSubSweeps = new (string Name, int[] Values, Action<DiffOcrParams, int> Set)[]
{
("dimPercentile", [5, 10, 15, 20, 25, 30, 40, 50], (p, v) => p.DimPercentile = v),
("textThresh", [10, 15, 20, 25, 30, 40, 50, 60, 80], (p, v) => p.TextThresh = v),
("lineGapTolerance", [3, 5, 8, 10, 15], (p, v) => p.LineGapTolerance = v),
("linePadY", [5, 10, 15, 20], (p, v) => p.LinePadY = v),
("diffThresh", [10, 15, 20, 25, 30, 40, 50, 60], (c, v) => c.DiffThresh = v),
("rowThreshDiv", [10, 15, 20, 25, 30, 40, 50, 60], (c, v) => c.RowThreshDiv = v),
("colThreshDiv", [5, 8, 10, 12, 15, 20, 25, 30], (c, v) => c.ColThreshDiv = v),
("maxGap", [5, 8, 10, 12, 15, 20, 25, 30], (c, v) => c.MaxGap = v),
("ocrPad", [0, 5, 10, 15, 20, 30], (c, v) => c.OcrPad = v),
};
double[] trimValues = [0.1, 0.15, 0.2, 0.25, 0.3, 0.4, 0.5];
var allIntSweeps = sharedSweeps
const int maxRounds = 3;
for (int round = 0; round < maxRounds; round++)
{
bool improved = false;
Console.Error.WriteLine($"--- Round {round + 1} ---");
foreach (var (name, values, set) in cropSweeps)
{
Console.Error.Write($" {name}: ");
int bestVal = 0;
double bestValScore = -1;
foreach (int v in values)
{
var trial = CloneParams(best);
set(trial.Crop, v);
double score = ScoreParams(trial);
totalEvals++;
Console.Error.Write($"{v}={score:F3} ");
if (score > bestValScore) { bestValScore = score; bestVal = v; }
}
Console.Error.WriteLine();
if (bestValScore > bestScore)
{
set(best.Crop, bestVal);
bestScore = bestValScore;
improved = true;
Console.Error.WriteLine($" → {name}={bestVal} score={bestScore:F3}");
}
}
// Sweep trimCutoff
{
Console.Error.Write($" trimCutoff: ");
double bestTrim = best.Crop.TrimCutoff;
double bestTrimScore = bestScore;
foreach (double v in trimValues)
{
var trial = CloneParams(best);
trial.Crop.TrimCutoff = v;
double score = ScoreParams(trial);
totalEvals++;
Console.Error.Write($"{v:F2}={score:F3} ");
if (score > bestTrimScore) { bestTrimScore = score; bestTrim = v; }
}
Console.Error.WriteLine();
if (bestTrimScore > bestScore)
{
best.Crop.TrimCutoff = bestTrim;
bestScore = bestTrimScore;
improved = true;
Console.Error.WriteLine($" → trimCutoff={bestTrim:F2} score={bestScore:F3}");
}
}
Console.Error.WriteLine($" End of round {round + 1}: score={bestScore:F3} {best}");
if (!improved) break;
}
return bestScore;
}
private double TuneOcrParams(DiffOcrParams best, ref int totalEvals, bool tuneTopHat, bool tuneBgSub)
{
double bestScore = ScoreParams(best);
Console.Error.WriteLine($" baseline score={bestScore:F3} {best}\n");
var sharedOcrSweeps = new (string Name, int[] Values, Action<OcrParams, int> Set)[]
{
("upscale", [1, 2, 3], (o, v) => o.Upscale = v),
("psm", [4, 6, 11, 13], (o, v) => o.Psm = v),
};
// Top-hat specific
var topHatSweeps = new (string Name, int[] Values, Action<OcrParams, int> Set)[]
{
("kernelSize", [11, 15, 19, 21, 25, 31, 35, 41, 51], (o, v) => o.KernelSize = v),
};
// Background-subtraction specific
var bgSubSweeps = new (string Name, int[] Values, Action<OcrParams, int> Set)[]
{
("dimPercentile", [5, 10, 15, 20, 25, 30, 40, 50], (o, v) => o.DimPercentile = v),
("textThresh", [10, 15, 20, 25, 30, 40, 50, 60, 80], (o, v) => o.TextThresh = v),
("lineGapTolerance", [3, 5, 8, 10, 15], (o, v) => o.LineGapTolerance = v),
("linePadY", [5, 10, 15, 20], (o, v) => o.LinePadY = v),
};
var allOcrSweeps = sharedOcrSweeps
.Concat(tuneTopHat ? topHatSweeps : [])
.Concat(tuneBgSub ? bgSubSweeps : [])
.ToArray();
@ -511,7 +683,7 @@ class OcrHandler(TesseractEngine engine)
bool improved = false;
Console.Error.WriteLine($"--- Round {round + 1} ---");
foreach (var (name, values, set) in allIntSweeps)
foreach (var (name, values, set) in allOcrSweeps)
{
Console.Error.Write($" {name}: ");
int bestVal = 0;
@ -519,8 +691,8 @@ class OcrHandler(TesseractEngine engine)
foreach (int v in values)
{
var trial = best.Clone();
set(trial, v);
var trial = CloneParams(best);
set(trial.Ocr, v);
double score = ScoreParams(trial);
totalEvals++;
Console.Error.Write($"{v}={score:F3} ");
@ -530,39 +702,13 @@ class OcrHandler(TesseractEngine engine)
if (bestValScore > bestScore)
{
set(best, bestVal);
set(best.Ocr, bestVal);
bestScore = bestValScore;
improved = true;
Console.Error.WriteLine($" → {name}={bestVal} score={bestScore:F3}");
}
}
// Sweep trimCutoff
{
Console.Error.Write($" trimCutoff: ");
double bestTrim = best.TrimCutoff;
double bestTrimScore = bestScore;
foreach (double v in trimValues)
{
var trial = best.Clone();
trial.TrimCutoff = v;
double score = ScoreParams(trial);
totalEvals++;
Console.Error.Write($"{v:F2}={score:F3} ");
if (score > bestTrimScore) { bestTrimScore = score; bestTrim = v; }
}
Console.Error.WriteLine();
if (bestTrimScore > bestScore)
{
best.TrimCutoff = bestTrim;
bestScore = bestTrimScore;
improved = true;
Console.Error.WriteLine($" → trimCutoff={bestTrim:F2} score={bestScore:F3}");
}
}
Console.Error.WriteLine($" End of round {round + 1}: score={bestScore:F3} {best}");
if (!improved) break;
}