Revert "fixed padding tooltip for all"

This reverts commit 6242220, which broke tooltip detection.

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
This commit is contained in:
Boki 2026-02-11 14:15:28 -05:00
parent f9b8ef9158
commit b8f5637c49
4 changed files with 123 additions and 260 deletions

View file

@ -11,25 +11,15 @@ static class ImagePreprocessor
/// Isolates bright tooltip text, suppresses dim background text visible through overlay.
/// Pipeline: grayscale → morphological top-hat → Otsu binary → 2x upscale
/// </summary>
public static Bitmap PreprocessForOcr(Bitmap src, OcrOptions? options = null)
public static Bitmap PreprocessForOcr(Bitmap src)
{
if (options != null && !options.Preprocess)
return CloneArgb(src);
int kernelSize = options?.KernelSize ?? 25;
if (kernelSize < 3) kernelSize = 3;
if (kernelSize % 2 == 0) kernelSize += 1;
int scale = options?.Scale ?? 2;
if (scale < 1) scale = 1;
using var mat = BitmapConverter.ToMat(src);
using var gray = new Mat();
Cv2.CvtColor(mat, gray, ColorConversionCodes.BGRA2GRAY);
// Morphological white top-hat: isolates bright text on dark background
// Kernel size 25x25 captures text strokes, suppresses dim background text
using var kernel = Cv2.GetStructuringElement(MorphShapes.Rect, new OpenCvSharp.Size(kernelSize, kernelSize));
using var kernel = Cv2.GetStructuringElement(MorphShapes.Rect, new OpenCvSharp.Size(25, 25));
using var tophat = new Mat();
Cv2.MorphologyEx(gray, tophat, MorphTypes.TopHat, kernel);
@ -38,19 +28,10 @@ static class ImagePreprocessor
Cv2.Threshold(tophat, binary, 0, 255, ThresholdTypes.BinaryInv | ThresholdTypes.Otsu);
// 2x upscale for better LSTM recognition
if (scale == 1)
return BitmapConverter.ToBitmap(binary);
using var upscaled = new Mat();
Cv2.Resize(binary, upscaled, new OpenCvSharp.Size(binary.Width * scale, binary.Height * scale),
Cv2.Resize(binary, upscaled, new OpenCvSharp.Size(binary.Width * 2, binary.Height * 2),
interpolation: InterpolationFlags.Cubic);
return BitmapConverter.ToBitmap(upscaled);
}
/// <summary>
/// Produces a full-frame copy of <paramref name="src"/> in 32bpp ARGB pixel format.
/// </summary>
/// <param name="src">Bitmap to copy; its whole area (0,0)-(Width,Height) is cloned.</param>
/// <returns>A new bitmap returned by <c>Bitmap.Clone</c> for the full rectangle.</returns>
private static Bitmap CloneArgb(Bitmap src) =>
    src.Clone(
        new Rectangle(0, 0, src.Width, src.Height),
        System.Drawing.Imaging.PixelFormat.Format32bppArgb);
}

View file

@ -15,14 +15,12 @@ static class ImageUtils
return Pix.LoadFromMemory(ms.ToArray());
}
public static List<OcrLineResult> ExtractLinesFromPage(Page page, int offsetX, int offsetY, int minConfidence = 50)
public static List<OcrLineResult> ExtractLinesFromPage(Page page, int offsetX, int offsetY)
{
var lines = new List<OcrLineResult>();
using var iter = page.GetIterator();
if (iter == null) return lines;
int minConf = Math.Clamp(minConfidence, 0, 100);
iter.Begin();
do
@ -34,7 +32,7 @@ static class ImageUtils
if (string.IsNullOrWhiteSpace(wordText)) continue;
float conf = iter.GetConfidence(PageIteratorLevel.Word);
if (conf < minConf) continue; // reject low-confidence garbage from background bleed
if (conf < 50) continue; // reject low-confidence garbage from background bleed
if (iter.TryGetBoundingBox(PageIteratorLevel.Word, out var bounds))
{

View file

@ -34,9 +34,6 @@ class Request
[JsonPropertyName("debug")]
public bool Debug { get; set; }
[JsonPropertyName("ocr")]
public OcrOptions? Ocr { get; set; }
[JsonPropertyName("targetRow")]
public int TargetRow { get; set; } = -1;
@ -44,21 +41,6 @@ class Request
public int TargetCol { get; set; } = -1;
}
/// <summary>
/// Optional OCR tuning values carried on a request under the JSON key "ocr".
/// Every property has a default, so an empty JSON object yields the stock pipeline settings.
/// </summary>
class OcrOptions
{
    /// <summary>When false, the preprocessing pipeline is bypassed. Default: true.</summary>
    [JsonPropertyName("preprocess")] public bool Preprocess { get; set; } = true;

    /// <summary>Side length of the square top-hat structuring element. Default: 25.</summary>
    [JsonPropertyName("kernelSize")] public int KernelSize { get; set; } = 25;

    /// <summary>Integer upscale factor applied to the binarized image. Default: 2.</summary>
    [JsonPropertyName("scale")] public int Scale { get; set; } = 2;

    /// <summary>Word confidence below which OCR words are dropped. Default: 50.</summary>
    [JsonPropertyName("minConfidence")] public int MinConfidence { get; set; } = 50;
}
class RegionRect
{
[JsonPropertyName("x")]

View file

@ -12,14 +12,12 @@ class OcrHandler(TesseractEngine engine)
public object HandleOcr(Request req)
{
var options = NormalizeOptions(req.Ocr);
using var bitmap = ScreenCapture.CaptureOrLoad(req.File, req.Region);
using var processed = ImagePreprocessor.PreprocessForOcr(bitmap, options);
using var pix = ImageUtils.BitmapToPix(processed);
using var pix = ImageUtils.BitmapToPix(bitmap);
using var page = engine.Process(pix);
var text = page.GetText();
var lines = ImageUtils.ExtractLinesFromPage(page, offsetX: 0, offsetY: 0, minConfidence: options.MinConfidence);
var lines = ImageUtils.ExtractLinesFromPage(page, offsetX: 0, offsetY: 0);
return new OcrResponse { Text = text, Lines = lines };
}
@ -58,7 +56,6 @@ class OcrHandler(TesseractEngine engine)
public object HandleDiffOcr(Request req)
{
var options = NormalizeOptions(req.Ocr);
if (_referenceFrame == null)
return new ErrorResponse("No reference snapshot stored. Send 'snapshot' first.");
@ -79,91 +76,142 @@ class OcrHandler(TesseractEngine engine)
Marshal.Copy(curData.Scan0, curPx, 0, curPx.Length);
current.UnlockBits(curData);
bool debug = req.Debug;
int[] delta = new int[w * h];
long sum = 0;
long sumSq = 0;
int count = 0;
// Detect pixels that got DARKER (tooltip = dark overlay).
// This filters out item highlight glow (brighter) and cursor changes.
int diffThresh = req.Threshold > 0 ? req.Threshold : 30;
bool[] changed = new bool[w * h];
int totalChanged = 0;
for (int y = 0; y < h; y++)
{
for (int x = 0; x < w; x++)
{
int i = y * stride + x * 4;
int refB = refPx[i];
int refG = refPx[i + 1];
int refR = refPx[i + 2];
int curB = curPx[i];
int curG = curPx[i + 1];
int curR = curPx[i + 2];
int refL = (refR * 30 + refG * 59 + refB * 11) / 100;
int curL = (curR * 30 + curG * 59 + curB * 11) / 100;
int d = refL - curL;
delta[y * w + x] = d;
if (d > 0)
int darkerB = refPx[i] - curPx[i];
int darkerG = refPx[i + 1] - curPx[i + 1];
int darkerR = refPx[i + 2] - curPx[i + 2];
if (darkerB + darkerG + darkerR > diffThresh)
{
sum += d;
sumSq += (long)d * d;
count++;
changed[y * w + x] = true;
totalChanged++;
}
}
}
if (count == 0)
{
if (debug) Console.Error.WriteLine(" diff-ocr: no darkening detected");
return new OcrResponse { Text = "", Lines = [] };
}
double mean = (double)sum / count;
double variance = Math.Max(0, (double)sumSq / count - mean * mean);
double std = Math.Sqrt(variance);
int minThresh = req.Threshold > 0 ? req.Threshold : 20;
int diffThresh = (int)Math.Round(Math.Max(mean + 2.0 * std, minThresh));
int brightThresh = Math.Max(minThresh, diffThresh / 2);
bool[] changed = new bool[w * h];
int totalChanged = 0;
for (int i = 0; i < delta.Length; i++)
{
int d = delta[i];
if (d >= diffThresh || d <= -brightThresh)
{
changed[i] = true;
totalChanged++;
}
}
bool debug = req.Debug;
if (totalChanged == 0)
{
if (debug) Console.Error.WriteLine(" diff-ocr: no changes detected after threshold");
if (debug) Console.Error.WriteLine(" diff-ocr: no changes detected");
return new OcrResponse { Text = "", Lines = [] };
}
bool[] closed = ErodeMask(DilateMask(changed, w, h, radius: 3), w, h, radius: 3);
bool[] cleaned = DilateMask(ErodeMask(closed, w, h, radius: 1), w, h, radius: 1);
// Two-pass density detection:
// Pass 1: Find row range using full-width row counts
// Pass 2: Find column range using only pixels within detected row range
// This makes the column threshold relative to tooltip height, not screen height.
int maxGap = 15;
if (!TryFindBestComponent(cleaned, delta, w, h, minArea: (w * h) / 1000, out var compBounds))
// Pass 1: count changed pixels per row, find longest active run
int[] rowCounts = new int[h];
for (int y = 0; y < h; y++)
for (int x = 0; x < w; x++)
if (changed[y * w + x])
rowCounts[y]++;
int rowThresh = w / 30; // ~3% of width
int bestRowStart = 0, bestRowEnd = 0, bestRowLen = 0;
int curRowStart = -1, lastActiveRow = -1;
for (int y = 0; y < h; y++)
{
if (debug) Console.Error.WriteLine(" diff-ocr: no tooltip-sized region found");
if (rowCounts[y] >= rowThresh)
{
if (curRowStart < 0) curRowStart = y;
lastActiveRow = y;
}
else if (curRowStart >= 0 && y - lastActiveRow > maxGap)
{
int len = lastActiveRow - curRowStart + 1;
if (len > bestRowLen) { bestRowStart = curRowStart; bestRowEnd = lastActiveRow; bestRowLen = len; }
curRowStart = -1;
}
}
if (curRowStart >= 0)
{
int len = lastActiveRow - curRowStart + 1;
if (len > bestRowLen) { bestRowStart = curRowStart; bestRowEnd = lastActiveRow; bestRowLen = len; }
}
// Pass 2: count changed pixels per column, but only within the detected row range
int[] colCounts = new int[w];
for (int y = bestRowStart; y <= bestRowEnd; y++)
for (int x = 0; x < w; x++)
if (changed[y * w + x])
colCounts[x]++;
int tooltipHeight = bestRowEnd - bestRowStart + 1;
int colThresh = tooltipHeight / 15; // ~7% of tooltip height
int bestColStart = 0, bestColEnd = 0, bestColLen = 0;
int curColStart = -1, lastActiveCol = -1;
for (int x = 0; x < w; x++)
{
if (colCounts[x] >= colThresh)
{
if (curColStart < 0) curColStart = x;
lastActiveCol = x;
}
else if (curColStart >= 0 && x - lastActiveCol > maxGap)
{
int len = lastActiveCol - curColStart + 1;
if (len > bestColLen) { bestColStart = curColStart; bestColEnd = lastActiveCol; bestColLen = len; }
curColStart = -1;
}
}
if (curColStart >= 0)
{
int len = lastActiveCol - curColStart + 1;
if (len > bestColLen) { bestColStart = curColStart; bestColEnd = lastActiveCol; bestColLen = len; }
}
// Log density detection results
Console.Error.WriteLine($" diff-ocr: changed={totalChanged} rows={bestRowStart}-{bestRowEnd}({bestRowLen}) cols={bestColStart}-{bestColEnd}({bestColLen}) rowThresh={rowThresh} colThresh={colThresh}");
if (bestRowLen < 50 || bestColLen < 50)
{
Console.Error.WriteLine($" diff-ocr: no tooltip-sized region found (rows={bestRowLen}, cols={bestColLen})");
return new OcrResponse { Text = "", Lines = [] };
}
int pad = Math.Clamp(Math.Min(compBounds.Width, compBounds.Height) / 20, 6, 20);
int extraRight = Math.Clamp(compBounds.Width / 6, 12, 80);
int minX = Math.Max(compBounds.Left - pad, 0);
int minY = Math.Max(compBounds.Top - pad, 0);
int maxX = Math.Min(compBounds.Right - 1 + pad + extraRight, w - 1);
int maxY = Math.Min(compBounds.Bottom - 1 + pad, h - 1);
int pad = 0;
int minX = Math.Max(bestColStart - pad, 0);
int minY = Math.Max(bestRowStart - pad, 0);
int maxX = Math.Min(bestColEnd + pad, w - 1);
int maxY = Math.Min(bestRowEnd + pad, h - 1);
// Dynamic right-edge trim: if the rightmost columns are much sparser than
// the tooltip body, trim them. This handles the ~5% of cases where ambient
// noise extends the detected region slightly on the right.
int colSpan = maxX - minX + 1;
if (colSpan > 100)
{
// Compute mean column density in the middle 50% of the range
int q1 = minX + colSpan / 4;
int q3 = minX + colSpan * 3 / 4;
long midSum = 0;
int midCount = 0;
for (int x = q1; x <= q3; x++) { midSum += colCounts[x]; midCount++; }
double avgMidDensity = (double)midSum / midCount;
double cutoff = avgMidDensity * 0.3; // column must have >=30% of avg density
// Trim from right while below cutoff
while (maxX > minX + 100 && colCounts[maxX] < cutoff)
maxX--;
}
int rw = maxX - minX + 1;
int rh = maxY - minY + 1;
if (debug)
Console.Error.WriteLine($" diff-ocr: changed={totalChanged} thresh={diffThresh} mean={mean:F1} std={std:F1} region=({minX},{minY}) {rw}x{rh}");
if (debug) Console.Error.WriteLine($" diff-ocr: tooltip region ({minX},{minY}) {rw}x{rh}");
// Simple crop of the tooltip region from the current frame (no per-pixel masking).
// The top-hat preprocessing will handle suppressing background text.
@ -180,7 +228,7 @@ class OcrHandler(TesseractEngine engine)
}
// Pre-process for OCR: boost contrast, invert colors
using var processed = ImagePreprocessor.PreprocessForOcr(cropped, options);
using var processed = ImagePreprocessor.PreprocessForOcr(cropped);
// Save fullscreen and preprocessed versions alongside raw
if (!string.IsNullOrEmpty(req.Path))
@ -190,18 +238,15 @@ class OcrHandler(TesseractEngine engine)
current.Save(fullPath, ImageUtils.GetImageFormat(fullPath));
if (debug) Console.Error.WriteLine($" diff-ocr: saved fullscreen to {fullPath}");
if (options.Preprocess)
{
var prePath = Path.ChangeExtension(req.Path, ".pre" + ext);
processed.Save(prePath, ImageUtils.GetImageFormat(prePath));
if (debug) Console.Error.WriteLine($" diff-ocr: saved preprocessed to {prePath}");
}
var prePath = Path.ChangeExtension(req.Path, ".pre" + ext);
processed.Save(prePath, ImageUtils.GetImageFormat(prePath));
if (debug) Console.Error.WriteLine($" diff-ocr: saved preprocessed to {prePath}");
}
using var pix = ImageUtils.BitmapToPix(processed);
using var page = engine.Process(pix);
var text = page.GetText();
var lines = ImageUtils.ExtractLinesFromPage(page, offsetX: minX, offsetY: minY, minConfidence: options.MinConfidence);
var lines = ImageUtils.ExtractLinesFromPage(page, offsetX: minX, offsetY: minY);
return new DiffOcrResponse
{
@ -210,147 +255,4 @@ class OcrHandler(TesseractEngine engine)
Region = new RegionRect { X = minX, Y = minY, Width = rw, Height = rh },
};
}
/// <summary>
/// Binary dilation of a row-major <paramref name="w"/>×<paramref name="h"/> mask with a
/// (2·radius+1)-square window. The window is clipped at the image border, so out-of-image
/// pixels never contribute.
/// </summary>
/// <param name="src">Row-major mask; index is <c>y * w + x</c>.</param>
/// <param name="w">Mask width in pixels.</param>
/// <param name="h">Mask height in pixels.</param>
/// <param name="radius">Half-size of the square window.</param>
/// <returns>
/// A new mask where a pixel is set if any pixel of <paramref name="src"/> inside its window is set.
/// When <paramref name="radius"/> is zero or negative, <paramref name="src"/> itself is returned (not a copy).
/// </returns>
private static bool[] DilateMask(bool[] src, int w, int h, int radius)
{
    if (radius <= 0) return src;

    // A square window is separable: OR along each row, then OR along each column of that
    // intermediate result. The output is identical to scanning the full 2-D window, but the
    // per-pixel work is proportional to radius rather than radius squared.
    bool[] horizontal = new bool[w * h];
    for (int y = 0; y < h; y++)
    {
        int row = y * w;
        for (int x = 0; x < w; x++)
        {
            int x0 = Math.Max(0, x - radius);
            int x1 = Math.Min(w - 1, x + radius);
            bool any = false;
            for (int xx = x0; xx <= x1; xx++)
            {
                if (src[row + xx]) { any = true; break; }
            }
            horizontal[row + x] = any;
        }
    }

    bool[] dst = new bool[w * h];
    for (int y = 0; y < h; y++)
    {
        int y0 = Math.Max(0, y - radius);
        int y1 = Math.Min(h - 1, y + radius);
        for (int x = 0; x < w; x++)
        {
            bool any = false;
            for (int yy = y0; yy <= y1; yy++)
            {
                if (horizontal[yy * w + x]) { any = true; break; }
            }
            dst[y * w + x] = any;
        }
    }
    return dst;
}
/// <summary>
/// Binary erosion of a row-major <paramref name="w"/>×<paramref name="h"/> mask with a
/// (2·radius+1)-square window. The window is clipped at the image border, so pixels outside
/// the image never cause a border pixel to be cleared.
/// </summary>
/// <param name="src">Row-major mask; index is <c>y * w + x</c>.</param>
/// <param name="w">Mask width in pixels.</param>
/// <param name="h">Mask height in pixels.</param>
/// <param name="radius">Half-size of the square window.</param>
/// <returns>
/// A new mask where a pixel is set only if every in-image pixel of <paramref name="src"/> inside
/// its window is set. When <paramref name="radius"/> is zero or negative, <paramref name="src"/>
/// itself is returned (not a copy).
/// </returns>
private static bool[] ErodeMask(bool[] src, int w, int h, int radius)
{
    if (radius <= 0) return src;

    // A square window is separable: AND along each row, then AND along each column of that
    // intermediate result. The output is identical to scanning the full 2-D window, but the
    // per-pixel work is proportional to radius rather than radius squared.
    bool[] horizontal = new bool[w * h];
    for (int y = 0; y < h; y++)
    {
        int row = y * w;
        for (int x = 0; x < w; x++)
        {
            int x0 = Math.Max(0, x - radius);
            int x1 = Math.Min(w - 1, x + radius);
            bool all = true;
            for (int xx = x0; xx <= x1; xx++)
            {
                if (!src[row + xx]) { all = false; break; }
            }
            horizontal[row + x] = all;
        }
    }

    bool[] dst = new bool[w * h];
    for (int y = 0; y < h; y++)
    {
        int y0 = Math.Max(0, y - radius);
        int y1 = Math.Min(h - 1, y + radius);
        for (int x = 0; x < w; x++)
        {
            bool all = true;
            for (int yy = y0; yy <= y1; yy++)
            {
                if (!horizontal[yy * w + x]) { all = false; break; }
            }
            dst[y * w + x] = all;
        }
    }
    return dst;
}
/// <summary>
/// Scans <paramref name="mask"/> for 8-connected components and picks the highest-scoring one.
/// Score is <c>area * fillRatio * avgDelta</c>, where fillRatio is area divided by bounding-box
/// area and avgDelta is the sum of the positive <paramref name="delta"/> values in the component
/// divided by its area. Components smaller than <paramref name="minArea"/> pixels are ignored.
/// </summary>
/// <param name="mask">Row-major w×h mask of candidate pixels.</param>
/// <param name="delta">Row-major w×h per-pixel values; only positive entries add to the score.</param>
/// <param name="w">Width in pixels.</param>
/// <param name="h">Height in pixels.</param>
/// <param name="minArea">Minimum pixel count for a component to be scored.</param>
/// <param name="bounds">
/// Bounding box of the winning component (right/bottom exclusive), or <c>Rectangle.Empty</c>
/// when the method returns false.
/// </param>
/// <returns>True when some component achieved a strictly positive score.</returns>
private static bool TryFindBestComponent(bool[] mask, int[] delta, int w, int h, int minArea, out Rectangle bounds)
{
    int pixelCount = w * h;
    var seen = new bool[pixelCount];
    var pending = new int[pixelCount]; // breadth-first queue of flat pixel indices
    double topScore = 0;
    Rectangle topRect = Rectangle.Empty;

    // Raster order over flat indices; on equal scores the earlier component wins.
    for (int seed = 0; seed < pixelCount; seed++)
    {
        if (!mask[seed] || seen[seed]) continue;

        int read = 0, write = 0;
        pending[write++] = seed;
        seen[seed] = true;

        int left = seed % w, right = left;
        int top = seed / w, bottom = top;
        int area = 0;
        long positiveDelta = 0;

        while (read < write)
        {
            int p = pending[read++];
            int px = p % w;
            int py = p / w;

            area++;
            int d = delta[p];
            if (d > 0) positiveDelta += d;

            left = Math.Min(left, px);
            right = Math.Max(right, px);
            top = Math.Min(top, py);
            bottom = Math.Max(bottom, py);

            // Visit the 3x3 neighbourhood, clamped to the image.
            int nyFrom = Math.Max(py - 1, 0), nyTo = Math.Min(py + 1, h - 1);
            int nxFrom = Math.Max(px - 1, 0), nxTo = Math.Min(px + 1, w - 1);
            for (int ny = nyFrom; ny <= nyTo; ny++)
            {
                int rowBase = ny * w;
                for (int nx = nxFrom; nx <= nxTo; nx++)
                {
                    int n = rowBase + nx;
                    if (!mask[n] || seen[n]) continue;
                    seen[n] = true;
                    pending[write++] = n;
                }
            }
        }

        if (area < minArea) continue;

        int boxArea = (right - left + 1) * (bottom - top + 1);
        double fillRatio = 0;
        if (boxArea > 0) fillRatio = (double)area / boxArea;
        double avgDelta = 0;
        if (area > 0) avgDelta = (double)positiveDelta / area;

        double score = area * fillRatio * avgDelta;
        if (score > topScore)
        {
            topScore = score;
            topRect = Rectangle.FromLTRB(left, top, right + 1, bottom + 1);
        }
    }

    bool found = topScore > 0;
    bounds = found ? topRect : Rectangle.Empty;
    return found;
}
/// <summary>
/// Returns OCR options with values forced into usable ranges: kernel size at least 3 and odd,
/// scale at least 1, minimum confidence at least 0.
/// </summary>
/// <param name="options">Caller-supplied options, or null to start from the defaults.</param>
/// <returns>
/// The same instance that was passed in (adjusted in place), or a new default
/// <c>OcrOptions</c> when <paramref name="options"/> is null.
/// </returns>
private static OcrOptions NormalizeOptions(OcrOptions? options)
{
    OcrOptions result = options ?? new OcrOptions();

    // Floor at 3, then set the low bit so an even size is bumped to the next odd value.
    result.KernelSize = Math.Max(result.KernelSize, 3) | 1;
    result.Scale = Math.Max(result.Scale, 1);
    result.MinConfidence = Math.Max(result.MinConfidence, 0);

    return result;
}
}