diff --git a/package.json b/package.json index db7ace3..a3a0eb7 100644 --- a/package.json +++ b/package.json @@ -8,7 +8,10 @@ "dev": "dotnet build tools/OcrDaemon -c Release && tsx src/index.ts", "build": "tsc", "build:daemon": "dotnet build tools/OcrDaemon -c Release", - "start": "node dist/index.js" + "start": "node dist/index.js", + "stop:daemon": "taskkill /IM OcrDaemon.exe /F 2>nul || exit /b 0", + "test:ocr": "taskkill /IM OcrDaemon.exe /F 2>nul & dotnet build tools/OcrDaemon -c Release && echo {\"cmd\":\"test\"} | tools\\OcrDaemon\\bin\\Release\\net8.0-windows10.0.19041.0\\OcrDaemon.exe", + "tune:ocr": "taskkill /IM OcrDaemon.exe /F 2>nul & dotnet build tools/OcrDaemon -c Release && echo {\"cmd\":\"tune\"} | tools\\OcrDaemon\\bin\\Release\\net8.0-windows10.0.19041.0\\OcrDaemon.exe" }, "dependencies": { "chokidar": "^4.0.3", diff --git a/tools/OcrDaemon/Daemon.cs b/tools/OcrDaemon/Daemon.cs index fb93f26..da4c2fd 100644 --- a/tools/OcrDaemon/Daemon.cs +++ b/tools/OcrDaemon/Daemon.cs @@ -63,6 +63,8 @@ static class Daemon "capture" => ocrHandler.HandleCapture(request), "snapshot" => ocrHandler.HandleSnapshot(request), "diff-ocr" => ocrHandler.HandleDiffOcr(request), + "test" => ocrHandler.HandleTest(request), + "tune" => ocrHandler.HandleTune(request), "grid" => gridHandler.HandleGrid(request), "detect-grid" => detectGridHandler.HandleDetectGrid(request), _ => new ErrorResponse($"Unknown command: {request.Cmd}"), diff --git a/tools/OcrDaemon/ImagePreprocessor.cs b/tools/OcrDaemon/ImagePreprocessor.cs index f085dbd..ecbce81 100644 --- a/tools/OcrDaemon/ImagePreprocessor.cs +++ b/tools/OcrDaemon/ImagePreprocessor.cs @@ -9,17 +9,16 @@ static class ImagePreprocessor /// /// Pre-process an image for OCR using morphological white top-hat filtering. /// Isolates bright tooltip text, suppresses dim background text visible through overlay. - /// Pipeline: grayscale → morphological top-hat → Otsu binary → 2x upscale + /// Pipeline: grayscale → morphological top-hat → Otsu binary → upscale /// - public static Bitmap PreprocessForOcr(Bitmap src) + public static Bitmap PreprocessForOcr(Bitmap src, int kernelSize = 25, int upscale = 2) { using var mat = BitmapConverter.ToMat(src); using var gray = new Mat(); Cv2.CvtColor(mat, gray, ColorConversionCodes.BGRA2GRAY); // Morphological white top-hat: isolates bright text on dark background - // Kernel size 25x25 captures text strokes, suppresses dim background text - using var kernel = Cv2.GetStructuringElement(MorphShapes.Rect, new OpenCvSharp.Size(25, 25)); + using var kernel = Cv2.GetStructuringElement(MorphShapes.Rect, new OpenCvSharp.Size(kernelSize, kernelSize)); using var tophat = new Mat(); Cv2.MorphologyEx(gray, tophat, MorphTypes.TopHat, kernel); @@ -27,11 +26,15 @@ static class ImagePreprocessor using var binary = new Mat(); Cv2.Threshold(tophat, binary, 0, 255, ThresholdTypes.BinaryInv | ThresholdTypes.Otsu); - // 2x upscale for better LSTM recognition - using var upscaled = new Mat(); - Cv2.Resize(binary, upscaled, new OpenCvSharp.Size(binary.Width * 2, binary.Height * 2), - interpolation: InterpolationFlags.Cubic); + // Upscale for better LSTM recognition + if (upscale > 1) + { + using var upscaled = new Mat(); + Cv2.Resize(binary, upscaled, new OpenCvSharp.Size(binary.Width * upscale, binary.Height * upscale), + interpolation: InterpolationFlags.Cubic); + return BitmapConverter.ToBitmap(upscaled); + } - return BitmapConverter.ToBitmap(upscaled); + return BitmapConverter.ToBitmap(binary); } } diff --git a/tools/OcrDaemon/Models.cs b/tools/OcrDaemon/Models.cs index 00081eb..ea5247b 100644 --- a/tools/OcrDaemon/Models.cs +++ b/tools/OcrDaemon/Models.cs @@ -208,3 +208,101 @@ class DetectGridResponse [JsonPropertyName("cellHeight")] public double CellHeight { get; set; } } + +class DiffOcrParams +{ + [JsonPropertyName("diffThresh")] + public int DiffThresh { get; set; } = 10; + + [JsonPropertyName("rowThreshDiv")] + public int RowThreshDiv { get; set; } = 30; + + [JsonPropertyName("colThreshDiv")] + public int ColThreshDiv { get; set; } = 8; + + [JsonPropertyName("maxGap")] + public int MaxGap { get; set; } = 20; + + [JsonPropertyName("trimCutoff")] + public double TrimCutoff { get; set; } = 0.4; + + [JsonPropertyName("kernelSize")] + public int KernelSize { get; set; } = 41; + + [JsonPropertyName("upscale")] + public int Upscale { get; set; } = 2; + + public DiffOcrParams Clone() => (DiffOcrParams)MemberwiseClone(); + + public override string ToString() => + $"diffThresh={DiffThresh} rowThreshDiv={RowThreshDiv} colThreshDiv={ColThreshDiv} maxGap={MaxGap} trimCutoff={TrimCutoff:F2} kernelSize={KernelSize} upscale={Upscale}"; +} + +class TestCase +{ + [JsonPropertyName("id")] + public string Id { get; set; } = ""; + + [JsonPropertyName("image")] + public string Image { get; set; } = ""; + + [JsonPropertyName("fullImage")] + public string FullImage { get; set; } = ""; + + [JsonPropertyName("expected")] + public List Expected { get; set; } = []; +} + +class TestCaseResult +{ + [JsonPropertyName("id")] + public string Id { get; set; } = ""; + + [JsonPropertyName("passed")] + public bool Passed { get; set; } + + [JsonPropertyName("score")] + public double Score { get; set; } + + [JsonPropertyName("matched")] + public List Matched { get; set; } = []; + + [JsonPropertyName("missed")] + public List Missed { get; set; } = []; + + [JsonPropertyName("extra")] + public List Extra { get; set; } = []; +} + +class TestResponse +{ + [JsonPropertyName("ok")] + public bool Ok => true; + + [JsonPropertyName("passed")] + public int Passed { get; set; } + + [JsonPropertyName("failed")] + public int Failed { get; set; } + + [JsonPropertyName("total")] + public int Total { get; set; } + + [JsonPropertyName("results")] + public List Results { get; set; } = []; +} + +class TuneResponse +{ + [JsonPropertyName("ok")] + public bool Ok => true; + + [JsonPropertyName("bestScore")] + public double BestScore { get; set; } + + [JsonPropertyName("bestParams")] + public DiffOcrParams BestParams { get; set; } = new(); + + [JsonPropertyName("iterations")] + public int Iterations { get; set; } +} diff --git a/tools/OcrDaemon/OcrDaemon.csproj b/tools/OcrDaemon/OcrDaemon.csproj index 7872912..1f750d3 100644 --- a/tools/OcrDaemon/OcrDaemon.csproj +++ b/tools/OcrDaemon/OcrDaemon.csproj @@ -22,6 +22,12 @@ PreserveNewest + + PreserveNewest + + + PreserveNewest + diff --git a/tools/OcrDaemon/OcrHandler.cs b/tools/OcrDaemon/OcrHandler.cs index 4404336..bfda313 100644 --- a/tools/OcrDaemon/OcrHandler.cs +++ b/tools/OcrDaemon/OcrHandler.cs @@ -3,6 +3,7 @@ namespace OcrDaemon; using System.Drawing; using System.Drawing.Imaging; using System.Runtime.InteropServices; +using System.Text.Json; using Tesseract; using SdImageFormat = System.Drawing.Imaging.ImageFormat; @@ -54,7 +55,12 @@ class OcrHandler(TesseractEngine engine) return new OkResponse(); } - public object HandleDiffOcr(Request req) + public object HandleDiffOcr(Request req) => HandleDiffOcr(req, new DiffOcrParams + { + DiffThresh = req.Threshold > 0 ? req.Threshold : 30, + }); + + public object HandleDiffOcr(Request req, DiffOcrParams p) { if (_referenceFrame == null) return new ErrorResponse("No reference snapshot stored. Send 'snapshot' first."); @@ -78,7 +84,7 @@ class OcrHandler(TesseractEngine engine) // Detect pixels that got DARKER (tooltip = dark overlay). // This filters out item highlight glow (brighter) and cursor changes. - int diffThresh = req.Threshold > 0 ? req.Threshold : 30; + int diffThresh = p.DiffThresh; bool[] changed = new bool[w * h]; int totalChanged = 0; @@ -110,7 +116,7 @@ class OcrHandler(TesseractEngine engine) // Pass 1: Find row range using full-width row counts // Pass 2: Find column range using only pixels within detected row range // This makes the column threshold relative to tooltip height, not screen height. - int maxGap = 15; + int maxGap = p.MaxGap; // Pass 1: count changed pixels per row, find longest active run int[] rowCounts = new int[h]; @@ -119,7 +125,7 @@ class OcrHandler(TesseractEngine engine) if (changed[y * w + x]) rowCounts[y]++; - int rowThresh = w / 30; // ~3% of width + int rowThresh = w / p.RowThreshDiv; int bestRowStart = 0, bestRowEnd = 0, bestRowLen = 0; int curRowStart = -1, lastActiveRow = -1; for (int y = 0; y < h; y++) @@ -150,7 +156,7 @@ class OcrHandler(TesseractEngine engine) colCounts[x]++; int tooltipHeight = bestRowEnd - bestRowStart + 1; - int colThresh = tooltipHeight / 15; // ~7% of tooltip height + int colThresh = tooltipHeight / p.ColThreshDiv; int bestColStart = 0, bestColEnd = 0, bestColLen = 0; int curColStart = -1, lastActiveCol = -1; @@ -202,7 +208,7 @@ class OcrHandler(TesseractEngine engine) int midCount = 0; for (int x = q1; x <= q3; x++) { midSum += colCounts[x]; midCount++; } double avgMidDensity = (double)midSum / midCount; - double cutoff = avgMidDensity * 0.3; // column must have >=30% of avg density + double cutoff = avgMidDensity * p.TrimCutoff; // Trim from right while below cutoff while (maxX > minX + 100 && colCounts[maxX] < cutoff) @@ -227,8 +233,8 @@ class OcrHandler(TesseractEngine engine) if (debug) Console.Error.WriteLine($" diff-ocr: saved raw to {req.Path}"); } - // Pre-process for OCR: boost contrast, invert colors - using var processed = ImagePreprocessor.PreprocessForOcr(cropped); + // Pre-process for OCR: top-hat + binarize + upscale + using var processed = ImagePreprocessor.PreprocessForOcr(cropped, p.KernelSize, p.Upscale); // Save fullscreen and preprocessed versions alongside raw if (!string.IsNullOrEmpty(req.Path)) @@ -255,4 +261,257 @@ class OcrHandler(TesseractEngine engine) Region = new RegionRect { X = minX, Y = minY, Width = rw, Height = rh }, }; } + + public object HandleTest(Request req) => RunTestCases(new DiffOcrParams(), verbose: true); + + public object HandleTune(Request req) + { + // Coordinate descent: optimize one parameter at a time, repeat until stable. + var best = new DiffOcrParams(); + double bestScore = ScoreParams(best); + Console.Error.WriteLine($"\n=== Tuning start === baseline score={bestScore:F3} {best}\n"); + + // Define search ranges for each parameter + var sweeps = new (string Name, int[] Values, Action Set)[] + { + ("diffThresh", [10, 15, 20, 25, 30, 40, 50, 60], (p, v) => p.DiffThresh = v), + ("rowThreshDiv", [10, 15, 20, 25, 30, 40, 50, 60], (p, v) => p.RowThreshDiv = v), + ("colThreshDiv", [5, 8, 10, 12, 15, 20, 25, 30], (p, v) => p.ColThreshDiv = v), + ("maxGap", [5, 8, 10, 12, 15, 20, 25, 30], (p, v) => p.MaxGap = v), + ("kernelSize", [11, 15, 19, 21, 25, 31, 35, 41], (p, v) => p.KernelSize = v), + ("upscale", [1, 2, 3], (p, v) => p.Upscale = v), + }; + + // trimCutoff needs double values — handle separately + double[] trimValues = [0.1, 0.15, 0.2, 0.25, 0.3, 0.4, 0.5]; + + int totalEvals = 0; + const int maxRounds = 3; + + for (int round = 0; round < maxRounds; round++) + { + bool improved = false; + Console.Error.WriteLine($"--- Round {round + 1} ---"); + + // Sweep integer params + foreach (var (name, values, set) in sweeps) + { + Console.Error.Write($" {name}: "); + int bestVal = 0; + double bestValScore = -1; + + foreach (int v in values) + { + var trial = best.Clone(); + set(trial, v); + double score = ScoreParams(trial); + totalEvals++; + Console.Error.Write($"{v}={score:F3} "); + + if (score > bestValScore) { bestValScore = score; bestVal = v; } + } + Console.Error.WriteLine(); + + if (bestValScore > bestScore) + { + set(best, bestVal); + bestScore = bestValScore; + improved = true; + Console.Error.WriteLine($" → {name}={bestVal} score={bestScore:F3}"); + } + } + + // Sweep trimCutoff + { + Console.Error.Write($" trimCutoff: "); + double bestTrim = best.TrimCutoff; + double bestTrimScore = bestScore; + + foreach (double v in trimValues) + { + var trial = best.Clone(); + trial.TrimCutoff = v; + double score = ScoreParams(trial); + totalEvals++; + Console.Error.Write($"{v:F2}={score:F3} "); + + if (score > bestTrimScore) { bestTrimScore = score; bestTrim = v; } + } + Console.Error.WriteLine(); + + if (bestTrimScore > bestScore) + { + best.TrimCutoff = bestTrim; + bestScore = bestTrimScore; + improved = true; + Console.Error.WriteLine($" → trimCutoff={bestTrim:F2} score={bestScore:F3}"); + } + } + + Console.Error.WriteLine($" End of round {round + 1}: score={bestScore:F3} {best}"); + if (!improved) break; + } + + Console.Error.WriteLine($"\n=== Tuning done === evals={totalEvals} bestScore={bestScore:F3}\n {best}\n"); + + // Run verbose test with best params for final report + var finalResult = RunTestCases(best, verbose: true); + + return new TuneResponse + { + BestScore = bestScore, + BestParams = best, + Iterations = totalEvals, + }; + } + + /// Score a param set: average match ratio across all test cases (0-1). + private double ScoreParams(DiffOcrParams p) + { + var result = RunTestCases(p, verbose: false); + if (result is TestResponse tr && tr.Total > 0) + return tr.Results.Average(r => r.Score); + return 0; + } + + private object RunTestCases(DiffOcrParams p, bool verbose) + { + var tessdataDir = Path.Combine(AppContext.BaseDirectory, "tessdata"); + var casesPath = Path.Combine(tessdataDir, "cases.json"); + if (!File.Exists(casesPath)) + return new ErrorResponse($"cases.json not found at {casesPath}"); + + var json = File.ReadAllText(casesPath); + var cases = JsonSerializer.Deserialize>(json); + if (cases == null || cases.Count == 0) + return new ErrorResponse("No test cases found in cases.json"); + + var results = new List(); + int passCount = 0; + + foreach (var tc in cases) + { + if (verbose) Console.Error.WriteLine($"\n=== Test: {tc.Id} ==="); + + var fullPath = Path.Combine(tessdataDir, tc.FullImage); + var imagePath = Path.Combine(tessdataDir, tc.Image); + + if (!File.Exists(fullPath)) + { + if (verbose) Console.Error.WriteLine($" SKIP: full image not found: {fullPath}"); + results.Add(new TestCaseResult { Id = tc.Id, Passed = false, Score = 0, Missed = tc.Expected }); + continue; + } + if (!File.Exists(imagePath)) + { + if (verbose) Console.Error.WriteLine($" SKIP: tooltip image not found: {imagePath}"); + results.Add(new TestCaseResult { Id = tc.Id, Passed = false, Score = 0, Missed = tc.Expected }); + continue; + } + + // Run the same pipeline: snapshot (reference) then diff-ocr (with tooltip) + HandleSnapshot(new Request { File = fullPath }); + var diffResult = HandleDiffOcr(new Request { File = imagePath, Debug = verbose }, p); + + // Extract actual lines from the response + List actualLines; + if (diffResult is DiffOcrResponse diffResp) + actualLines = diffResp.Lines.Select(l => l.Text.Trim()).Where(l => l.Length > 0).ToList(); + else if (diffResult is OcrResponse ocrResp) + actualLines = ocrResp.Lines.Select(l => l.Text.Trim()).Where(l => l.Length > 0).ToList(); + else + { + if (verbose) Console.Error.WriteLine($" ERROR: unexpected response type"); + results.Add(new TestCaseResult { Id = tc.Id, Passed = false, Score = 0, Missed = tc.Expected }); + continue; + } + + // Fuzzy match expected vs actual + var matched = new List(); + var missed = new List(); + var usedActual = new HashSet(); + + foreach (var expected in tc.Expected) + { + int bestIdx = -1; + double bestSim = 0; + for (int i = 0; i < actualLines.Count; i++) + { + if (usedActual.Contains(i)) continue; + double sim = LevenshteinSimilarity(expected, actualLines[i]); + if (sim > bestSim) { bestSim = sim; bestIdx = i; } + } + + if (bestIdx >= 0 && bestSim >= 0.75) + { + matched.Add(expected); + usedActual.Add(bestIdx); + if (verbose && bestSim < 1.0) + Console.Error.WriteLine($" ~ {expected} → {actualLines[bestIdx]} (sim={bestSim:F2})"); + } + else + { + missed.Add(expected); + if (verbose) + Console.Error.WriteLine($" MISS: {expected}" + (bestIdx >= 0 ? $" (best: {actualLines[bestIdx]}, sim={bestSim:F2})" : "")); + } + } + + var extra = actualLines.Where((_, i) => !usedActual.Contains(i)).ToList(); + if (verbose) + foreach (var e in extra) + Console.Error.WriteLine($" EXTRA: {e}"); + + double score = tc.Expected.Count > 0 ? (double)matched.Count / tc.Expected.Count : 1.0; + bool passed = missed.Count == 0; + if (passed) passCount++; + + if (verbose) + Console.Error.WriteLine($" Result: {(passed ? "PASS" : "FAIL")} matched={matched.Count}/{tc.Expected.Count} extra={extra.Count} score={score:F2}"); + + results.Add(new TestCaseResult + { + Id = tc.Id, + Passed = passed, + Score = score, + Matched = matched, + Missed = missed, + Extra = extra, + }); + } + + if (verbose) + Console.Error.WriteLine($"\n=== Summary: {passCount}/{cases.Count} passed ===\n"); + + return new TestResponse + { + Passed = passCount, + Failed = cases.Count - passCount, + Total = cases.Count, + Results = results, + }; + } + + private static double LevenshteinSimilarity(string a, string b) + { + a = a.ToLowerInvariant(); + b = b.ToLowerInvariant(); + if (a == b) return 1.0; + + int la = a.Length, lb = b.Length; + if (la == 0 || lb == 0) return 0.0; + + var d = new int[la + 1, lb + 1]; + for (int i = 0; i <= la; i++) d[i, 0] = i; + for (int j = 0; j <= lb; j++) d[0, j] = j; + + for (int i = 1; i <= la; i++) + for (int j = 1; j <= lb; j++) + { + int cost = a[i - 1] == b[j - 1] ? 0 : 1; + d[i, j] = Math.Min(Math.Min(d[i - 1, j] + 1, d[i, j - 1] + 1), d[i - 1, j - 1] + cost); + } + + return 1.0 - (double)d[la, lb] / Math.Max(la, lb); + } } diff --git a/tools/OcrDaemon/tessdata/cases.json b/tools/OcrDaemon/tessdata/cases.json index b1fb807..0c4c99f 100644 --- a/tools/OcrDaemon/tessdata/cases.json +++ b/tools/OcrDaemon/tessdata/cases.json @@ -2,6 +2,7 @@ { "id": "vertex1", "image": "images/vertex1.png", + "fullImage": "images/vertex-snapshot.png", "expected": [ "The Vertex", "Tribal Mask", @@ -26,6 +27,7 @@ { "id": "vertex2", "image": "images/vertex2.png", + "fullImage": "images/vertex-snapshot.png", "expected": [ "The Vertex", "Tribal Mask", diff --git a/tools/OcrDaemon/tessdata/images/1770837425546_10-0.full.png b/tools/OcrDaemon/tessdata/images/1770837425546_10-0.full.png new file mode 100644 index 0000000..1310d48 Binary files /dev/null and b/tools/OcrDaemon/tessdata/images/1770837425546_10-0.full.png differ diff --git a/tools/OcrDaemon/tessdata/images/1770837425546_10-2.full.png b/tools/OcrDaemon/tessdata/images/1770837425546_10-2.full.png new file mode 100644 index 0000000..8fa0fd1 Binary files /dev/null and b/tools/OcrDaemon/tessdata/images/1770837425546_10-2.full.png differ diff --git a/tools/OcrDaemon/tessdata/images/1770837425546_10-4.full.png b/tools/OcrDaemon/tessdata/images/1770837425546_10-4.full.png new file mode 100644 index 0000000..a7394ca Binary files /dev/null and b/tools/OcrDaemon/tessdata/images/1770837425546_10-4.full.png differ diff --git a/tools/OcrDaemon/tessdata/images/1770837425546_8-0.full.png b/tools/OcrDaemon/tessdata/images/1770837425546_8-0.full.png new file mode 100644 index 0000000..78ef7be Binary files /dev/null and b/tools/OcrDaemon/tessdata/images/1770837425546_8-0.full.png differ diff --git a/tools/OcrDaemon/tessdata/images/1770837425546_8-2.full.png b/tools/OcrDaemon/tessdata/images/1770837425546_8-2.full.png new file mode 100644 index 0000000..3671bc8 Binary files /dev/null and b/tools/OcrDaemon/tessdata/images/1770837425546_8-2.full.png differ diff --git a/tools/OcrDaemon/tessdata/images/1770837425546_8-4.full.png b/tools/OcrDaemon/tessdata/images/1770837425546_8-4.full.png new file mode 100644 index 0000000..dbcaf0c Binary files /dev/null and b/tools/OcrDaemon/tessdata/images/1770837425546_8-4.full.png differ diff --git a/tools/OcrDaemon/tessdata/images/1770837425546_8-6.full.png b/tools/OcrDaemon/tessdata/images/1770837425546_8-6.full.png new file mode 100644 index 0000000..7fff9e3 Binary files /dev/null and b/tools/OcrDaemon/tessdata/images/1770837425546_8-6.full.png differ diff --git a/tools/OcrDaemon/tessdata/images/vertex-snapshot.png b/tools/OcrDaemon/tessdata/images/vertex-snapshot.png new file mode 100644 index 0000000..63c94d0 Binary files /dev/null and b/tools/OcrDaemon/tessdata/images/vertex-snapshot.png differ diff --git a/tools/OcrDaemon/tessdata/images/vertex1.png b/tools/OcrDaemon/tessdata/images/vertex1.png index c6b9f82..5da1c0a 100644 Binary files a/tools/OcrDaemon/tessdata/images/vertex1.png and b/tools/OcrDaemon/tessdata/images/vertex1.png differ diff --git a/tools/OcrDaemon/tessdata/images/vertex2.png b/tools/OcrDaemon/tessdata/images/vertex2.png index 1a3263f..d8365b6 100644 Binary files a/tools/OcrDaemon/tessdata/images/vertex2.png and b/tools/OcrDaemon/tessdata/images/vertex2.png differ