diff --git a/tools/OcrDaemon/ImagePreprocessor.cs b/tools/OcrDaemon/ImagePreprocessor.cs index f085dbd..6ab58d9 100644 --- a/tools/OcrDaemon/ImagePreprocessor.cs +++ b/tools/OcrDaemon/ImagePreprocessor.cs @@ -11,15 +11,25 @@ static class ImagePreprocessor /// Isolates bright tooltip text, suppresses dim background text visible through overlay. /// Pipeline: grayscale → morphological top-hat → Otsu binary → 2x upscale /// - public static Bitmap PreprocessForOcr(Bitmap src) + public static Bitmap PreprocessForOcr(Bitmap src, OcrOptions? options = null) { + if (options != null && !options.Preprocess) + return CloneArgb(src); + + int kernelSize = options?.KernelSize ?? 25; + if (kernelSize < 3) kernelSize = 3; + if (kernelSize % 2 == 0) kernelSize += 1; + + int scale = options?.Scale ?? 2; + if (scale < 1) scale = 1; + using var mat = BitmapConverter.ToMat(src); using var gray = new Mat(); Cv2.CvtColor(mat, gray, ColorConversionCodes.BGRA2GRAY); // Morphological white top-hat: isolates bright text on dark background // Kernel size 25x25 captures text strokes, suppresses dim background text - using var kernel = Cv2.GetStructuringElement(MorphShapes.Rect, new OpenCvSharp.Size(25, 25)); + using var kernel = Cv2.GetStructuringElement(MorphShapes.Rect, new OpenCvSharp.Size(kernelSize, kernelSize)); using var tophat = new Mat(); Cv2.MorphologyEx(gray, tophat, MorphTypes.TopHat, kernel); @@ -28,10 +38,19 @@ static class ImagePreprocessor Cv2.Threshold(tophat, binary, 0, 255, ThresholdTypes.BinaryInv | ThresholdTypes.Otsu); // 2x upscale for better LSTM recognition + if (scale == 1) + return BitmapConverter.ToBitmap(binary); + using var upscaled = new Mat(); - Cv2.Resize(binary, upscaled, new OpenCvSharp.Size(binary.Width * 2, binary.Height * 2), + Cv2.Resize(binary, upscaled, new OpenCvSharp.Size(binary.Width * scale, binary.Height * scale), interpolation: InterpolationFlags.Cubic); return BitmapConverter.ToBitmap(upscaled); } + + private static Bitmap CloneArgb(Bitmap src) + { + var rect = new Rectangle(0, 0, src.Width, src.Height); + return src.Clone(rect, System.Drawing.Imaging.PixelFormat.Format32bppArgb); + } } diff --git a/tools/OcrDaemon/ImageUtils.cs b/tools/OcrDaemon/ImageUtils.cs index df18b9c..6be4acb 100644 --- a/tools/OcrDaemon/ImageUtils.cs +++ b/tools/OcrDaemon/ImageUtils.cs @@ -15,12 +15,14 @@ static class ImageUtils return Pix.LoadFromMemory(ms.ToArray()); } - public static List ExtractLinesFromPage(Page page, int offsetX, int offsetY) + public static List ExtractLinesFromPage(Page page, int offsetX, int offsetY, int minConfidence = 50) { var lines = new List(); using var iter = page.GetIterator(); if (iter == null) return lines; + int minConf = Math.Clamp(minConfidence, 0, 100); + iter.Begin(); do @@ -32,7 +34,7 @@ static class ImageUtils if (string.IsNullOrWhiteSpace(wordText)) continue; float conf = iter.GetConfidence(PageIteratorLevel.Word); - if (conf < 50) continue; // reject low-confidence garbage from background bleed + if (conf < minConf) continue; // reject low-confidence garbage from background bleed if (iter.TryGetBoundingBox(PageIteratorLevel.Word, out var bounds)) { diff --git a/tools/OcrDaemon/Models.cs b/tools/OcrDaemon/Models.cs index 00081eb..9708e8a 100644 --- a/tools/OcrDaemon/Models.cs +++ b/tools/OcrDaemon/Models.cs @@ -34,6 +34,9 @@ class Request [JsonPropertyName("debug")] public bool Debug { get; set; } + [JsonPropertyName("ocr")] + public OcrOptions? Ocr { get; set; } + [JsonPropertyName("targetRow")] public int TargetRow { get; set; } = -1; @@ -41,6 +44,21 @@ class Request public int TargetCol { get; set; } = -1; } +class OcrOptions +{ + [JsonPropertyName("preprocess")] + public bool Preprocess { get; set; } = true; + + [JsonPropertyName("kernelSize")] + public int KernelSize { get; set; } = 25; + + [JsonPropertyName("scale")] + public int Scale { get; set; } = 2; + + [JsonPropertyName("minConfidence")] + public int MinConfidence { get; set; } = 50; +} + class RegionRect { [JsonPropertyName("x")] diff --git a/tools/OcrDaemon/OcrHandler.cs b/tools/OcrDaemon/OcrHandler.cs index 85513d3..4a695f0 100644 --- a/tools/OcrDaemon/OcrHandler.cs +++ b/tools/OcrDaemon/OcrHandler.cs @@ -12,12 +12,14 @@ class OcrHandler(TesseractEngine engine) public object HandleOcr(Request req) { + var options = NormalizeOptions(req.Ocr); using var bitmap = ScreenCapture.CaptureOrLoad(req.File, req.Region); - using var pix = ImageUtils.BitmapToPix(bitmap); + using var processed = ImagePreprocessor.PreprocessForOcr(bitmap, options); + using var pix = ImageUtils.BitmapToPix(processed); using var page = engine.Process(pix); var text = page.GetText(); - var lines = ImageUtils.ExtractLinesFromPage(page, offsetX: 0, offsetY: 0); + var lines = ImageUtils.ExtractLinesFromPage(page, offsetX: 0, offsetY: 0, minConfidence: options.MinConfidence); return new OcrResponse { Text = text, Lines = lines }; } @@ -56,6 +58,7 @@ class OcrHandler(TesseractEngine engine) public object HandleDiffOcr(Request req) { + var options = NormalizeOptions(req.Ocr); if (_referenceFrame == null) return new ErrorResponse("No reference snapshot stored. Send 'snapshot' first."); @@ -76,142 +79,91 @@ class OcrHandler(TesseractEngine engine) Marshal.Copy(curData.Scan0, curPx, 0, curPx.Length); current.UnlockBits(curData); - // Detect pixels that got DARKER (tooltip = dark overlay). - // This filters out item highlight glow (brighter) and cursor changes. - int diffThresh = req.Threshold > 0 ? req.Threshold : 30; - bool[] changed = new bool[w * h]; - int totalChanged = 0; + bool debug = req.Debug; + + int[] delta = new int[w * h]; + long sum = 0; + long sumSq = 0; + int count = 0; for (int y = 0; y < h; y++) { for (int x = 0; x < w; x++) { int i = y * stride + x * 4; - int darkerB = refPx[i] - curPx[i]; - int darkerG = refPx[i + 1] - curPx[i + 1]; - int darkerR = refPx[i + 2] - curPx[i + 2]; - if (darkerB + darkerG + darkerR > diffThresh) + int refB = refPx[i]; + int refG = refPx[i + 1]; + int refR = refPx[i + 2]; + int curB = curPx[i]; + int curG = curPx[i + 1]; + int curR = curPx[i + 2]; + + int refL = (refR * 30 + refG * 59 + refB * 11) / 100; + int curL = (curR * 30 + curG * 59 + curB * 11) / 100; + int d = refL - curL; + delta[y * w + x] = d; + + if (d > 0) { - changed[y * w + x] = true; - totalChanged++; + sum += d; + sumSq += (long)d * d; + count++; } } } - bool debug = req.Debug; + if (count == 0) + { + if (debug) Console.Error.WriteLine(" diff-ocr: no darkening detected"); + return new OcrResponse { Text = "", Lines = [] }; + } + + double mean = (double)sum / count; + double variance = Math.Max(0, (double)sumSq / count - mean * mean); + double std = Math.Sqrt(variance); + + int minThresh = req.Threshold > 0 ? req.Threshold : 20; + int diffThresh = (int)Math.Round(Math.Max(mean + 2.0 * std, minThresh)); + int brightThresh = Math.Max(minThresh, diffThresh / 2); + + bool[] changed = new bool[w * h]; + int totalChanged = 0; + for (int i = 0; i < delta.Length; i++) + { + int d = delta[i]; + if (d >= diffThresh || d <= -brightThresh) + { + changed[i] = true; + totalChanged++; + } + } if (totalChanged == 0) { - if (debug) Console.Error.WriteLine(" diff-ocr: no changes detected"); + if (debug) Console.Error.WriteLine(" diff-ocr: no changes detected after threshold"); return new OcrResponse { Text = "", Lines = [] }; } - // Two-pass density detection: - // Pass 1: Find row range using full-width row counts - // Pass 2: Find column range using only pixels within detected row range - // This makes the column threshold relative to tooltip height, not screen height. - int maxGap = 15; + bool[] closed = ErodeMask(DilateMask(changed, w, h, radius: 3), w, h, radius: 3); + bool[] cleaned = DilateMask(ErodeMask(closed, w, h, radius: 1), w, h, radius: 1); - // Pass 1: count changed pixels per row, find longest active run - int[] rowCounts = new int[h]; - for (int y = 0; y < h; y++) - for (int x = 0; x < w; x++) - if (changed[y * w + x]) - rowCounts[y]++; - - int rowThresh = w / 30; // ~3% of width - int bestRowStart = 0, bestRowEnd = 0, bestRowLen = 0; - int curRowStart = -1, lastActiveRow = -1; - for (int y = 0; y < h; y++) + if (!TryFindBestComponent(cleaned, delta, w, h, minArea: (w * h) / 1000, out var compBounds)) { - if (rowCounts[y] >= rowThresh) - { - if (curRowStart < 0) curRowStart = y; - lastActiveRow = y; - } - else if (curRowStart >= 0 && y - lastActiveRow > maxGap) - { - int len = lastActiveRow - curRowStart + 1; - if (len > bestRowLen) { bestRowStart = curRowStart; bestRowEnd = lastActiveRow; bestRowLen = len; } - curRowStart = -1; - } - } - if (curRowStart >= 0) - { - int len = lastActiveRow - curRowStart + 1; - if (len > bestRowLen) { bestRowStart = curRowStart; bestRowEnd = lastActiveRow; bestRowLen = len; } - } - - // Pass 2: count changed pixels per column, but only within the detected row range - int[] colCounts = new int[w]; - for (int y = bestRowStart; y <= bestRowEnd; y++) - for (int x = 0; x < w; x++) - if (changed[y * w + x]) - colCounts[x]++; - - int tooltipHeight = bestRowEnd - bestRowStart + 1; - int colThresh = tooltipHeight / 15; // ~7% of tooltip height - - int bestColStart = 0, bestColEnd = 0, bestColLen = 0; - int curColStart = -1, lastActiveCol = -1; - for (int x = 0; x < w; x++) - { - if (colCounts[x] >= colThresh) - { - if (curColStart < 0) curColStart = x; - lastActiveCol = x; - } - else if (curColStart >= 0 && x - lastActiveCol > maxGap) - { - int len = lastActiveCol - curColStart + 1; - if (len > bestColLen) { bestColStart = curColStart; bestColEnd = lastActiveCol; bestColLen = len; } - curColStart = -1; - } - } - if (curColStart >= 0) - { - int len = lastActiveCol - curColStart + 1; - if (len > bestColLen) { bestColStart = curColStart; bestColEnd = lastActiveCol; bestColLen = len; } - } - - // Log density detection results - Console.Error.WriteLine($" diff-ocr: changed={totalChanged} rows={bestRowStart}-{bestRowEnd}({bestRowLen}) cols={bestColStart}-{bestColEnd}({bestColLen}) rowThresh={rowThresh} colThresh={colThresh}"); - - if (bestRowLen < 50 || bestColLen < 50) - { - Console.Error.WriteLine($" diff-ocr: no tooltip-sized region found (rows={bestRowLen}, cols={bestColLen})"); + if (debug) Console.Error.WriteLine(" diff-ocr: no tooltip-sized region found"); return new OcrResponse { Text = "", Lines = [] }; } - int pad = 0; - int minX = Math.Max(bestColStart - pad, 0); - int minY = Math.Max(bestRowStart - pad, 0); - int maxX = Math.Min(bestColEnd + pad, w - 1); - int maxY = Math.Min(bestRowEnd + pad, h - 1); - - // Dynamic right-edge trim: if the rightmost columns are much sparser than - // the tooltip body, trim them. This handles the ~5% of cases where ambient - // noise extends the detected region slightly on the right. - int colSpan = maxX - minX + 1; - if (colSpan > 100) - { - // Compute median column density in the middle 50% of the range - int q1 = minX + colSpan / 4; - int q3 = minX + colSpan * 3 / 4; - long midSum = 0; - int midCount = 0; - for (int x = q1; x <= q3; x++) { midSum += colCounts[x]; midCount++; } - double avgMidDensity = (double)midSum / midCount; - double cutoff = avgMidDensity * 0.3; // column must have >=30% of avg density - - // Trim from right while below cutoff - while (maxX > minX + 100 && colCounts[maxX] < cutoff) - maxX--; - } + int pad = Math.Clamp(Math.Min(compBounds.Width, compBounds.Height) / 20, 6, 20); + int extraRight = Math.Clamp(compBounds.Width / 6, 12, 80); + int minX = Math.Max(compBounds.Left - pad, 0); + int minY = Math.Max(compBounds.Top - pad, 0); + int maxX = Math.Min(compBounds.Right - 1 + pad + extraRight, w - 1); + int maxY = Math.Min(compBounds.Bottom - 1 + pad, h - 1); int rw = maxX - minX + 1; int rh = maxY - minY + 1; - if (debug) Console.Error.WriteLine($" diff-ocr: tooltip region ({minX},{minY}) {rw}x{rh}"); + if (debug) + Console.Error.WriteLine($" diff-ocr: changed={totalChanged} thresh={diffThresh} mean={mean:F1} std={std:F1} region=({minX},{minY}) {rw}x{rh}"); // Simple crop of the tooltip region from the current frame (no per-pixel masking). // The top-hat preprocessing will handle suppressing background text. @@ -228,10 +180,10 @@ class OcrHandler(TesseractEngine engine) } // Pre-process for OCR: boost contrast, invert colors - using var processed = ImagePreprocessor.PreprocessForOcr(cropped); + using var processed = ImagePreprocessor.PreprocessForOcr(cropped, options); // Save preprocessed version alongside raw - if (!string.IsNullOrEmpty(req.Path)) + if (!string.IsNullOrEmpty(req.Path) && options.Preprocess) { var ext = Path.GetExtension(req.Path); var prePath = Path.ChangeExtension(req.Path, ".pre" + ext); @@ -242,7 +194,7 @@ class OcrHandler(TesseractEngine engine) using var page = engine.Process(pix); var text = page.GetText(); - var lines = ImageUtils.ExtractLinesFromPage(page, offsetX: minX, offsetY: minY); + var lines = ImageUtils.ExtractLinesFromPage(page, offsetX: minX, offsetY: minY, minConfidence: options.MinConfidence); return new DiffOcrResponse { @@ -251,4 +203,147 @@ class OcrHandler(TesseractEngine engine) Region = new RegionRect { X = minX, Y = minY, Width = rw, Height = rh }, }; } + + private static bool[] DilateMask(bool[] src, int w, int h, int radius) + { + if (radius <= 0) return src; + bool[] dst = new bool[w * h]; + int r = Math.Max(1, radius); + for (int y = 0; y < h; y++) + { + int y0 = Math.Max(0, y - r); + int y1 = Math.Min(h - 1, y + r); + for (int x = 0; x < w; x++) + { + int x0 = Math.Max(0, x - r); + int x1 = Math.Min(w - 1, x + r); + bool any = false; + for (int yy = y0; yy <= y1 && !any; yy++) + { + int row = yy * w; + for (int xx = x0; xx <= x1; xx++) + { + if (src[row + xx]) { any = true; break; } + } + } + dst[y * w + x] = any; + } + } + return dst; + } + + private static bool[] ErodeMask(bool[] src, int w, int h, int radius) + { + if (radius <= 0) return src; + bool[] dst = new bool[w * h]; + int r = Math.Max(1, radius); + for (int y = 0; y < h; y++) + { + int y0 = Math.Max(0, y - r); + int y1 = Math.Min(h - 1, y + r); + for (int x = 0; x < w; x++) + { + int x0 = Math.Max(0, x - r); + int x1 = Math.Min(w - 1, x + r); + bool all = true; + for (int yy = y0; yy <= y1 && all; yy++) + { + int row = yy * w; + for (int xx = x0; xx <= x1; xx++) + { + if (!src[row + xx]) { all = false; break; } + } + } + dst[y * w + x] = all; + } + } + return dst; + } + + private static bool TryFindBestComponent(bool[] mask, int[] delta, int w, int h, int minArea, out Rectangle bounds) + { + bounds = Rectangle.Empty; + bool[] visited = new bool[w * h]; + double bestScore = 0; + Rectangle bestBounds = Rectangle.Empty; + int[] qx = new int[w * h]; + int[] qy = new int[w * h]; + + for (int y = 0; y < h; y++) + { + for (int x = 0; x < w; x++) + { + int idx = y * w + x; + if (!mask[idx] || visited[idx]) continue; + + int head = 0, tail = 0; + qx[tail] = x; qy[tail] = y; tail++; + visited[idx] = true; + + int minX = x, maxX = x, minY = y, maxY = y; + int area = 0; + long sumDelta = 0; + + while (head < tail) + { + int cx = qx[head]; + int cy = qy[head]; + head++; + area++; + int didx = cy * w + cx; + int d = delta[didx]; + if (d > 0) sumDelta += d; + + if (cx < minX) minX = cx; + if (cx > maxX) maxX = cx; + if (cy < minY) minY = cy; + if (cy > maxY) maxY = cy; + + for (int ny = cy - 1; ny <= cy + 1; ny++) + { + if (ny < 0 || ny >= h) continue; + int row = ny * w; + for (int nx = cx - 1; nx <= cx + 1; nx++) + { + if (nx < 0 || nx >= w) continue; + int nidx = row + nx; + if (!mask[nidx] || visited[nidx]) continue; + visited[nidx] = true; + qx[tail] = nx; qy[tail] = ny; tail++; + } + } + } + + if (area >= minArea) + { + int rectW = maxX - minX + 1; + int rectH = maxY - minY + 1; + int rectArea = rectW * rectH; + double fillRatio = rectArea > 0 ? (double)area / rectArea : 0; + double avgDelta = area > 0 ? (double)sumDelta / area : 0; + double score = area * fillRatio * avgDelta; + + if (score > bestScore) + { + bestScore = score; + bestBounds = Rectangle.FromLTRB(minX, minY, maxX + 1, maxY + 1); + } + } + } + } + + if (bestScore <= 0) return false; + bounds = bestBounds; + return true; + } + + private static OcrOptions NormalizeOptions(OcrOptions? options) + { + var normalized = options ?? new OcrOptions(); + if (normalized.KernelSize < 3) normalized.KernelSize = 3; + if (normalized.KernelSize % 2 == 0) normalized.KernelSize += 1; + if (normalized.Scale < 1) normalized.Scale = 1; + if (normalized.MinConfidence < 0) normalized.MinConfidence = 0; + return normalized; + } }