diff --git a/tools/OcrDaemon/ImagePreprocessor.cs b/tools/OcrDaemon/ImagePreprocessor.cs index 6ab58d9..f085dbd 100644 --- a/tools/OcrDaemon/ImagePreprocessor.cs +++ b/tools/OcrDaemon/ImagePreprocessor.cs @@ -11,25 +11,15 @@ static class ImagePreprocessor /// Isolates bright tooltip text, suppresses dim background text visible through overlay. /// Pipeline: grayscale → morphological top-hat → Otsu binary → 2x upscale /// - public static Bitmap PreprocessForOcr(Bitmap src, OcrOptions? options = null) + public static Bitmap PreprocessForOcr(Bitmap src) { - if (options != null && !options.Preprocess) - return CloneArgb(src); - - int kernelSize = options?.KernelSize ?? 25; - if (kernelSize < 3) kernelSize = 3; - if (kernelSize % 2 == 0) kernelSize += 1; - - int scale = options?.Scale ?? 2; - if (scale < 1) scale = 1; - using var mat = BitmapConverter.ToMat(src); using var gray = new Mat(); Cv2.CvtColor(mat, gray, ColorConversionCodes.BGRA2GRAY); // Morphological white top-hat: isolates bright text on dark background // Kernel size 25x25 captures text strokes, suppresses dim background text - using var kernel = Cv2.GetStructuringElement(MorphShapes.Rect, new OpenCvSharp.Size(kernelSize, kernelSize)); + using var kernel = Cv2.GetStructuringElement(MorphShapes.Rect, new OpenCvSharp.Size(25, 25)); using var tophat = new Mat(); Cv2.MorphologyEx(gray, tophat, MorphTypes.TopHat, kernel); @@ -38,19 +28,10 @@ static class ImagePreprocessor Cv2.Threshold(tophat, binary, 0, 255, ThresholdTypes.BinaryInv | ThresholdTypes.Otsu); // 2x upscale for better LSTM recognition - if (scale == 1) - return BitmapConverter.ToBitmap(binary); - using var upscaled = new Mat(); - Cv2.Resize(binary, upscaled, new OpenCvSharp.Size(binary.Width * scale, binary.Height * scale), + Cv2.Resize(binary, upscaled, new OpenCvSharp.Size(binary.Width * 2, binary.Height * 2), interpolation: InterpolationFlags.Cubic); return BitmapConverter.ToBitmap(upscaled); } - - private static Bitmap CloneArgb(Bitmap src) - { - var rect = new Rectangle(0, 0, src.Width, src.Height); - return src.Clone(rect, System.Drawing.Imaging.PixelFormat.Format32bppArgb); - } } diff --git a/tools/OcrDaemon/ImageUtils.cs b/tools/OcrDaemon/ImageUtils.cs index 6be4acb..df18b9c 100644 --- a/tools/OcrDaemon/ImageUtils.cs +++ b/tools/OcrDaemon/ImageUtils.cs @@ -15,14 +15,12 @@ static class ImageUtils return Pix.LoadFromMemory(ms.ToArray()); } - public static List ExtractLinesFromPage(Page page, int offsetX, int offsetY, int minConfidence = 50) + public static List ExtractLinesFromPage(Page page, int offsetX, int offsetY) { var lines = new List(); using var iter = page.GetIterator(); if (iter == null) return lines; - int minConf = Math.Clamp(minConfidence, 0, 100); - iter.Begin(); do @@ -34,7 +32,7 @@ static class ImageUtils if (string.IsNullOrWhiteSpace(wordText)) continue; float conf = iter.GetConfidence(PageIteratorLevel.Word); - if (conf < minConf) continue; // reject low-confidence garbage from background bleed + if (conf < 50) continue; // reject low-confidence garbage from background bleed if (iter.TryGetBoundingBox(PageIteratorLevel.Word, out var bounds)) { diff --git a/tools/OcrDaemon/Models.cs b/tools/OcrDaemon/Models.cs index 9708e8a..00081eb 100644 --- a/tools/OcrDaemon/Models.cs +++ b/tools/OcrDaemon/Models.cs @@ -34,9 +34,6 @@ class Request [JsonPropertyName("debug")] public bool Debug { get; set; } - [JsonPropertyName("ocr")] - public OcrOptions? Ocr { get; set; } - [JsonPropertyName("targetRow")] public int TargetRow { get; set; } = -1; @@ -44,21 +41,6 @@ class Request public int TargetCol { get; set; } = -1; } -class OcrOptions -{ - [JsonPropertyName("preprocess")] - public bool Preprocess { get; set; } = true; - - [JsonPropertyName("kernelSize")] - public int KernelSize { get; set; } = 25; - - [JsonPropertyName("scale")] - public int Scale { get; set; } = 2; - - [JsonPropertyName("minConfidence")] - public int MinConfidence { get; set; } = 50; -} - class RegionRect { [JsonPropertyName("x")] diff --git a/tools/OcrDaemon/OcrHandler.cs b/tools/OcrDaemon/OcrHandler.cs index 1456a3d..4404336 100644 --- a/tools/OcrDaemon/OcrHandler.cs +++ b/tools/OcrDaemon/OcrHandler.cs @@ -12,14 +12,12 @@ class OcrHandler(TesseractEngine engine) public object HandleOcr(Request req) { - var options = NormalizeOptions(req.Ocr); using var bitmap = ScreenCapture.CaptureOrLoad(req.File, req.Region); - using var processed = ImagePreprocessor.PreprocessForOcr(bitmap, options); - using var pix = ImageUtils.BitmapToPix(processed); + using var pix = ImageUtils.BitmapToPix(bitmap); using var page = engine.Process(pix); var text = page.GetText(); - var lines = ImageUtils.ExtractLinesFromPage(page, offsetX: 0, offsetY: 0, minConfidence: options.MinConfidence); + var lines = ImageUtils.ExtractLinesFromPage(page, offsetX: 0, offsetY: 0); return new OcrResponse { Text = text, Lines = lines }; } @@ -58,7 +56,6 @@ class OcrHandler(TesseractEngine engine) public object HandleDiffOcr(Request req) { - var options = NormalizeOptions(req.Ocr); if (_referenceFrame == null) return new ErrorResponse("No reference snapshot stored. Send 'snapshot' first."); @@ -79,91 +76,142 @@ class OcrHandler(TesseractEngine engine) Marshal.Copy(curData.Scan0, curPx, 0, curPx.Length); current.UnlockBits(curData); - bool debug = req.Debug; - - int[] delta = new int[w * h]; - long sum = 0; - long sumSq = 0; - int count = 0; + // Detect pixels that got DARKER (tooltip = dark overlay). + // This filters out item highlight glow (brighter) and cursor changes. + int diffThresh = req.Threshold > 0 ? req.Threshold : 30; + bool[] changed = new bool[w * h]; + int totalChanged = 0; for (int y = 0; y < h; y++) { for (int x = 0; x < w; x++) { int i = y * stride + x * 4; - int refB = refPx[i]; - int refG = refPx[i + 1]; - int refR = refPx[i + 2]; - int curB = curPx[i]; - int curG = curPx[i + 1]; - int curR = curPx[i + 2]; - - int refL = (refR * 30 + refG * 59 + refB * 11) / 100; - int curL = (curR * 30 + curG * 59 + curB * 11) / 100; - int d = refL - curL; - delta[y * w + x] = d; - - if (d > 0) + int darkerB = refPx[i] - curPx[i]; + int darkerG = refPx[i + 1] - curPx[i + 1]; + int darkerR = refPx[i + 2] - curPx[i + 2]; + if (darkerB + darkerG + darkerR > diffThresh) { - sum += d; - sumSq += (long)d * d; - count++; + changed[y * w + x] = true; + totalChanged++; } } } - if (count == 0) - { - if (debug) Console.Error.WriteLine(" diff-ocr: no darkening detected"); - return new OcrResponse { Text = "", Lines = [] }; - } - - double mean = (double)sum / count; - double variance = Math.Max(0, (double)sumSq / count - mean * mean); - double std = Math.Sqrt(variance); - - int minThresh = req.Threshold > 0 ? req.Threshold : 20; - int diffThresh = (int)Math.Round(Math.Max(mean + 2.0 * std, minThresh)); - int brightThresh = Math.Max(minThresh, diffThresh / 2); - - bool[] changed = new bool[w * h]; - int totalChanged = 0; - for (int i = 0; i < delta.Length; i++) - { - int d = delta[i]; - if (d >= diffThresh || d <= -brightThresh) - { - changed[i] = true; - totalChanged++; - } - } + bool debug = req.Debug; if (totalChanged == 0) { - if (debug) Console.Error.WriteLine(" diff-ocr: no changes detected after threshold"); + if (debug) Console.Error.WriteLine(" diff-ocr: no changes detected"); return new OcrResponse { Text = "", Lines = [] }; } - bool[] closed = ErodeMask(DilateMask(changed, w, h, radius: 3), w, h, radius: 3); - bool[] cleaned = DilateMask(ErodeMask(closed, w, h, radius: 1), w, h, radius: 1); + // Two-pass density detection: + // Pass 1: Find row range using full-width row counts + // Pass 2: Find column range using only pixels within detected row range + // This makes the column threshold relative to tooltip height, not screen height. + int maxGap = 15; - if (!TryFindBestComponent(cleaned, delta, w, h, minArea: (w * h) / 1000, out var compBounds)) + // Pass 1: count changed pixels per row, find longest active run + int[] rowCounts = new int[h]; + for (int y = 0; y < h; y++) + for (int x = 0; x < w; x++) + if (changed[y * w + x]) + rowCounts[y]++; + + int rowThresh = w / 30; // ~3% of width + int bestRowStart = 0, bestRowEnd = 0, bestRowLen = 0; + int curRowStart = -1, lastActiveRow = -1; + for (int y = 0; y < h; y++) { - if (debug) Console.Error.WriteLine(" diff-ocr: no tooltip-sized region found"); + if (rowCounts[y] >= rowThresh) + { + if (curRowStart < 0) curRowStart = y; + lastActiveRow = y; + } + else if (curRowStart >= 0 && y - lastActiveRow > maxGap) + { + int len = lastActiveRow - curRowStart + 1; + if (len > bestRowLen) { bestRowStart = curRowStart; bestRowEnd = lastActiveRow; bestRowLen = len; } + curRowStart = -1; + } + } + if (curRowStart >= 0) + { + int len = lastActiveRow - curRowStart + 1; + if (len > bestRowLen) { bestRowStart = curRowStart; bestRowEnd = lastActiveRow; bestRowLen = len; } + } + + // Pass 2: count changed pixels per column, but only within the detected row range + int[] colCounts = new int[w]; + for (int y = bestRowStart; y <= bestRowEnd; y++) + for (int x = 0; x < w; x++) + if (changed[y * w + x]) + colCounts[x]++; + + int tooltipHeight = bestRowEnd - bestRowStart + 1; + int colThresh = tooltipHeight / 15; // ~7% of tooltip height + + int bestColStart = 0, bestColEnd = 0, bestColLen = 0; + int curColStart = -1, lastActiveCol = -1; + for (int x = 0; x < w; x++) + { + if (colCounts[x] >= colThresh) + { + if (curColStart < 0) curColStart = x; + lastActiveCol = x; + } + else if (curColStart >= 0 && x - lastActiveCol > maxGap) + { + int len = lastActiveCol - curColStart + 1; + if (len > bestColLen) { bestColStart = curColStart; bestColEnd = lastActiveCol; bestColLen = len; } + curColStart = -1; + } + } + if (curColStart >= 0) + { + int len = lastActiveCol - curColStart + 1; + if (len > bestColLen) { bestColStart = curColStart; bestColEnd = lastActiveCol; bestColLen = len; } + } + + // Log density detection results + Console.Error.WriteLine($" diff-ocr: changed={totalChanged} rows={bestRowStart}-{bestRowEnd}({bestRowLen}) cols={bestColStart}-{bestColEnd}({bestColLen}) rowThresh={rowThresh} colThresh={colThresh}"); + + if (bestRowLen < 50 || bestColLen < 50) + { + Console.Error.WriteLine($" diff-ocr: no tooltip-sized region found (rows={bestRowLen}, cols={bestColLen})"); return new OcrResponse { Text = "", Lines = [] }; } - int pad = Math.Clamp(Math.Min(compBounds.Width, compBounds.Height) / 20, 6, 20); - int extraRight = Math.Clamp(compBounds.Width / 6, 12, 80); - int minX = Math.Max(compBounds.Left - pad, 0); - int minY = Math.Max(compBounds.Top - pad, 0); - int maxX = Math.Min(compBounds.Right - 1 + pad + extraRight, w - 1); - int maxY = Math.Min(compBounds.Bottom - 1 + pad, h - 1); + int pad = 0; + int minX = Math.Max(bestColStart - pad, 0); + int minY = Math.Max(bestRowStart - pad, 0); + int maxX = Math.Min(bestColEnd + pad, w - 1); + int maxY = Math.Min(bestRowEnd + pad, h - 1); + + // Dynamic right-edge trim: if the rightmost columns are much sparser than + // the tooltip body, trim them. This handles the ~5% of cases where ambient + // noise extends the detected region slightly on the right. + int colSpan = maxX - minX + 1; + if (colSpan > 100) + { + // Compute median column density in the middle 50% of the range + int q1 = minX + colSpan / 4; + int q3 = minX + colSpan * 3 / 4; + long midSum = 0; + int midCount = 0; + for (int x = q1; x <= q3; x++) { midSum += colCounts[x]; midCount++; } + double avgMidDensity = (double)midSum / midCount; + double cutoff = avgMidDensity * 0.3; // column must have >=30% of avg density + + // Trim from right while below cutoff + while (maxX > minX + 100 && colCounts[maxX] < cutoff) + maxX--; + } int rw = maxX - minX + 1; int rh = maxY - minY + 1; - if (debug) - Console.Error.WriteLine($" diff-ocr: changed={totalChanged} thresh={diffThresh} mean={mean:F1} std={std:F1} region=({minX},{minY}) {rw}x{rh}"); + if (debug) Console.Error.WriteLine($" diff-ocr: tooltip region ({minX},{minY}) {rw}x{rh}"); // Simple crop of the tooltip region from the current frame (no per-pixel masking). // The top-hat preprocessing will handle suppressing background text. @@ -180,7 +228,7 @@ class OcrHandler(TesseractEngine engine) } // Pre-process for OCR: boost contrast, invert colors - using var processed = ImagePreprocessor.PreprocessForOcr(cropped, options); + using var processed = ImagePreprocessor.PreprocessForOcr(cropped); // Save fullscreen and preprocessed versions alongside raw if (!string.IsNullOrEmpty(req.Path)) @@ -190,18 +238,15 @@ class OcrHandler(TesseractEngine engine) current.Save(fullPath, ImageUtils.GetImageFormat(fullPath)); if (debug) Console.Error.WriteLine($" diff-ocr: saved fullscreen to {fullPath}"); - if (options.Preprocess) - { - var prePath = Path.ChangeExtension(req.Path, ".pre" + ext); - processed.Save(prePath, ImageUtils.GetImageFormat(prePath)); - if (debug) Console.Error.WriteLine($" diff-ocr: saved preprocessed to {prePath}"); - } + var prePath = Path.ChangeExtension(req.Path, ".pre" + ext); + processed.Save(prePath, ImageUtils.GetImageFormat(prePath)); + if (debug) Console.Error.WriteLine($" diff-ocr: saved preprocessed to {prePath}"); } using var pix = ImageUtils.BitmapToPix(processed); using var page = engine.Process(pix); var text = page.GetText(); - var lines = ImageUtils.ExtractLinesFromPage(page, offsetX: minX, offsetY: minY, minConfidence: options.MinConfidence); + var lines = ImageUtils.ExtractLinesFromPage(page, offsetX: minX, offsetY: minY); return new DiffOcrResponse { @@ -210,147 +255,4 @@ class OcrHandler(TesseractEngine engine) Region = new RegionRect { X = minX, Y = minY, Width = rw, Height = rh }, }; } - - private static bool[] DilateMask(bool[] src, int w, int h, int radius) - { - if (radius <= 0) return src; - bool[] dst = new bool[w * h]; - int r = Math.Max(1, radius); - for (int y = 0; y < h; y++) - { - int y0 = Math.Max(0, y - r); - int y1 = Math.Min(h - 1, y + r); - for (int x = 0; x < w; x++) - { - int x0 = Math.Max(0, x - r); - int x1 = Math.Min(w - 1, x + r); - bool any = false; - for (int yy = y0; yy <= y1 && !any; yy++) - { - int row = yy * w; - for (int xx = x0; xx <= x1; xx++) - { - if (src[row + xx]) { any = true; break; } - } - } - dst[y * w + x] = any; - } - } - return dst; - } - - private static bool[] ErodeMask(bool[] src, int w, int h, int radius) - { - if (radius <= 0) return src; - bool[] dst = new bool[w * h]; - int r = Math.Max(1, radius); - for (int y = 0; y < h; y++) - { - int y0 = Math.Max(0, y - r); - int y1 = Math.Min(h - 1, y + r); - for (int x = 0; x < w; x++) - { - int x0 = Math.Max(0, x - r); - int x1 = Math.Min(w - 1, x + r); - bool all = true; - for (int yy = y0; yy <= y1 && all; yy++) - { - int row = yy * w; - for (int xx = x0; xx <= x1; xx++) - { - if (!src[row + xx]) { all = false; break; } - } - } - dst[y * w + x] = all; - } - } - return dst; - } - - private static bool TryFindBestComponent(bool[] mask, int[] delta, int w, int h, int minArea, out Rectangle bounds) - { - bounds = Rectangle.Empty; - bool[] visited = new bool[w * h]; - double bestScore = 0; - Rectangle bestBounds = Rectangle.Empty; - int[] qx = new int[w * h]; - int[] qy = new int[w * h]; - - for (int y = 0; y < h; y++) - { - for (int x = 0; x < w; x++) - { - int idx = y * w + x; - if (!mask[idx] || visited[idx]) continue; - - int head = 0, tail = 0; - qx[tail] = x; qy[tail] = y; tail++; - visited[idx] = true; - - int minX = x, maxX = x, minY = y, maxY = y; - int area = 0; - long sumDelta = 0; - - while (head < tail) - { - int cx = qx[head]; - int cy = qy[head]; - head++; - area++; - int didx = cy * w + cx; - int d = delta[didx]; - if (d > 0) sumDelta += d; - - if (cx < minX) minX = cx; - if (cx > maxX) maxX = cx; - if (cy < minY) minY = cy; - if (cy > maxY) maxY = cy; - - for (int ny = cy - 1; ny <= cy + 1; ny++) - { - if (ny < 0 || ny >= h) continue; - int row = ny * w; - for (int nx = cx - 1; nx <= cx + 1; nx++) - { - if (nx < 0 || nx >= w) continue; - int nidx = row + nx; - if (!mask[nidx] || visited[nidx]) continue; - visited[nidx] = true; - qx[tail] = nx; qy[tail] = ny; tail++; - } - } - } - - if (area >= minArea) - { - int rectW = maxX - minX + 1; - int rectH = maxY - minY + 1; - int rectArea = rectW * rectH; - double fillRatio = rectArea > 0 ? (double)area / rectArea : 0; - double avgDelta = area > 0 ? (double)sumDelta / area : 0; - double score = area * fillRatio * avgDelta; - - if (score > bestScore) - { - bestScore = score; - bestBounds = Rectangle.FromLTRB(minX, minY, maxX + 1, maxY + 1); - } - } - } - } - - if (bestScore <= 0) return false; - bounds = bestBounds; - return true; - } - - private static OcrOptions NormalizeOptions(OcrOptions? options) - { - var normalized = options ?? new OcrOptions(); - if (normalized.KernelSize < 3) normalized.KernelSize = 3; - if (normalized.KernelSize % 2 == 0) normalized.KernelSize += 1; - if (normalized.Scale < 1) normalized.Scale = 1; - if (normalized.MinConfidence < 0) normalized.MinConfidence = 0; - return normalized; - } }