namespace OcrDaemon;

using System.Collections.Generic;
using System.Drawing;
using System.IO;
using System.Linq;
using System.Text.Json;
using Tesseract;

/// <summary>
/// Runs the OCR regression cases listed in a <c>cases.json</c> file and reports,
/// on standard error, which expected text lines were found in the OCR output.
/// </summary>
static class TestRunner
{
    private static readonly JsonSerializerOptions JsonOptions = new()
    {
        PropertyNameCaseInsensitive = true,
    };

    /// <summary>
    /// Executes every test case and prints a per-case result plus a summary.
    /// </summary>
    /// <param name="args">
    /// Optional arguments: the first positional (non <c>--</c>) argument is the path to
    /// the cases file (default: <c>tessdata/cases.json</c> under the application base
    /// directory); <c>--save-pre [dir]</c> saves intermediate images to <c>dir</c>
    /// (default <c>processed</c>, resolved relative to the cases file's directory).
    /// </param>
    /// <returns>
    /// <c>0</c> when no case failed, <c>1</c> when the cases file is missing, unparsable
    /// or empty, and <c>2</c> when at least one case failed.
    /// </returns>
    public static int Run(string[] args)
    {
        string baseDir = AppContext.BaseDirectory;
        var (casesArg, savePreDir) = ParseArgs(args);

        string casesPath = casesArg ?? Path.Combine(baseDir, "tessdata", "cases.json");
        if (!File.Exists(casesPath))
        {
            Console.Error.WriteLine($"cases.json not found: {casesPath}");
            return 1;
        }

        List<TestCase> cases;
        try
        {
            cases = JsonSerializer.Deserialize<List<TestCase>>(File.ReadAllText(casesPath), JsonOptions) ?? [];
        }
        catch (JsonException ex)
        {
            // Malformed JSON is a setup error, not a crash: report it like a missing file.
            Console.Error.WriteLine($"Failed to parse {casesPath}: {ex.Message}");
            return 1;
        }

        if (cases.Count == 0)
        {
            Console.Error.WriteLine("No test cases found.");
            return 1;
        }

        string tessdataPath = Path.Combine(baseDir, "tessdata");

        // Use the "poe2" language data when it is present, otherwise fall back to "eng".
        string tessLang = File.Exists(Path.Combine(tessdataPath, "poe2.traineddata")) ? "poe2" : "eng";

        using var engine = new TesseractEngine(tessdataPath, tessLang, EngineMode.LstmOnly);
        engine.DefaultPageSegMode = PageSegMode.SingleBlock;
        engine.SetVariable("preserve_interword_spaces", "1");
        var ocrHandler = new OcrHandler(engine);

        int totalExpected = 0;
        int totalMatched = 0;
        int caseFailures = 0;

        string casesDir = Path.GetDirectoryName(casesPath) ?? baseDir;
        if (!string.IsNullOrEmpty(savePreDir))
        {
            savePreDir = ResolvePath(casesDir, savePreDir);

            // CreateDirectory is a no-op when the directory already exists.
            Directory.CreateDirectory(savePreDir);
        }

        foreach (var tc in cases)
        {
            if (string.IsNullOrWhiteSpace(tc.Image))
            {
                Console.Error.WriteLine($"[SKIP] {tc.Id}: missing image path");
                continue;
            }

            string imagePath = ResolvePath(casesDir, tc.Image);
            if (!File.Exists(imagePath))
            {
                Console.Error.WriteLine($"[SKIP] {tc.Id}: image not found: {imagePath}");
                continue;
            }

            var options = new OcrOptions();
            List<string> actualLines;

            if (!string.IsNullOrWhiteSpace(tc.BeforeImage))
            {
                // Two-image case: hand the "before" image to the handler as a snapshot,
                // then run the diff OCR request against the main image.
                string beforePath = ResolvePath(casesDir, tc.BeforeImage);
                if (!File.Exists(beforePath))
                {
                    Console.Error.WriteLine($"[SKIP] {tc.Id}: before image not found: {beforePath}");
                    continue;
                }

                ocrHandler.HandleSnapshot(new Request { File = beforePath });

                string? savePath = string.IsNullOrEmpty(savePreDir)
                    ? null
                    : Path.Combine(savePreDir, $"{tc.Id}.raw.png");

                var response = ocrHandler.HandleDiffOcr(new Request
                {
                    File = imagePath,
                    Ocr = options,
                    Path = savePath,
                });

                if (response is ErrorResponse err)
                {
                    Console.Error.WriteLine($"[FAIL] {tc.Id}: {err.Error}");
                    caseFailures++;
                    continue;
                }

                if (response is DiffOcrResponse diff)
                {
                    actualLines = BuildActualSet(diff.Text, diff.Lines.Select(l => l.Text));
                }
                else if (response is OcrResponse ocr)
                {
                    actualLines = BuildActualSet(ocr.Text, ocr.Lines.Select(l => l.Text));
                }
                else
                {
                    actualLines = [];
                }
            }
            else
            {
                // Single-image case: preprocess and OCR the image directly. The using
                // declarations are scoped to this block so the page is disposed before
                // the next case asks the engine to process another image.
                using var bitmap = new Bitmap(imagePath);
                using var processed = ImagePreprocessor.PreprocessForOcr(bitmap, options);

                if (!string.IsNullOrEmpty(savePreDir))
                {
                    string outPath = Path.Combine(savePreDir, $"{tc.Id}.pre.png");
                    processed.Save(outPath, System.Drawing.Imaging.ImageFormat.Png);
                }

                using var pix = ImageUtils.BitmapToPix(processed);
                using var page = engine.Process(pix);

                var lines = ImageUtils.ExtractLinesFromPage(page, offsetX: 0, offsetY: 0, minConfidence: options.MinConfidence);
                actualLines = BuildActualSet(page.GetText(), lines.Select(l => l.Text));
            }

            // A JSON "expected": null would otherwise surface as a NullReferenceException.
            List<string> expectedRaw = tc.Expected ?? [];
            var expectedLines = expectedRaw
                .Select(Normalize)
                .Where(s => s.Length > 0)
                .ToList();

            // Hash lookup for membership; actualLines keeps the original order for printing.
            var actualLookup = new HashSet<string>(actualLines, StringComparer.Ordinal);
            var missing = expectedLines.Where(e => !actualLookup.Contains(e)).ToList();
            int matched = expectedLines.Count - missing.Count;

            totalExpected += expectedLines.Count;
            totalMatched += matched;

            if (missing.Count > 0)
            {
                caseFailures++;
                Console.Error.WriteLine($"[FAIL] {tc.Id}: matched {matched}/{expectedLines.Count}");
                foreach (var line in missing)
                {
                    Console.Error.WriteLine($" missing: {line}");
                }

                Console.Error.WriteLine(" actual:");
                foreach (var line in actualLines)
                {
                    Console.Error.WriteLine($" > {line}");
                }
            }
            else
            {
                Console.Error.WriteLine($"[OK] {tc.Id}: matched {matched}/{expectedLines.Count}");
            }
        }

        Console.Error.WriteLine($"Summary: matched {totalMatched}/{totalExpected} lines, failed cases: {caseFailures}");
        return caseFailures == 0 ? 0 : 2;
    }

    /// <summary>
    /// Splits the command line into the optional cases-file path and the optional
    /// <c>--save-pre</c> directory.
    /// </summary>
    /// <remarks>
    /// The cases path is the first argument that is neither a <c>--</c> flag nor the value
    /// consumed by <c>--save-pre</c>, so the flag may appear before or after the path.
    /// A blank cases path is reported as <c>null</c> so the caller falls back to its default.
    /// </remarks>
    private static (string? CasesPath, string? SavePreDir) ParseArgs(string[] args)
    {
        string? casesPath = null;
        string? savePreDir = null;
        bool sawPositional = false;

        for (int i = 0; i < args.Length; i++)
        {
            string arg = args[i];

            if (string.Equals(arg, "--save-pre", StringComparison.OrdinalIgnoreCase))
            {
                // The directory is optional: only consume the next token when it is not a flag.
                if (i + 1 < args.Length && !args[i + 1].StartsWith("--", StringComparison.Ordinal))
                {
                    savePreDir = args[++i];
                }
                else
                {
                    savePreDir = "processed";
                }

                continue;
            }

            if (!sawPositional && !arg.StartsWith("--", StringComparison.Ordinal))
            {
                sawPositional = true;
                casesPath = arg;
            }
        }

        return (string.IsNullOrWhiteSpace(casesPath) ? null : casesPath, savePreDir);
    }

    /// <summary>
    /// Returns <paramref name="path"/> unchanged when it is rooted, otherwise combines it
    /// with <paramref name="baseDirectory"/>.
    /// </summary>
    private static string ResolvePath(string baseDirectory, string path)
        => Path.IsPathRooted(path) ? path : Path.Combine(baseDirectory, path);

    /// <summary>
    /// Canonicalizes a line for comparison: trims it, lower-cases it with the invariant
    /// culture and collapses every run of whitespace into a single space.
    /// Null or blank input yields an empty string.
    /// </summary>
    private static string Normalize(string? input)
    {
        if (string.IsNullOrWhiteSpace(input))
        {
            return string.Empty;
        }

        string lowered = input.Trim().ToLowerInvariant();
        var sb = new System.Text.StringBuilder(lowered.Length);
        bool previousWasSpace = false;

        foreach (char c in lowered)
        {
            if (!char.IsWhiteSpace(c))
            {
                sb.Append(c);
                previousWasSpace = false;
            }
            else if (!previousWasSpace)
            {
                sb.Append(' ');
                previousWasSpace = true;
            }
        }

        return sb.ToString().Trim();
    }

    /// <summary>
    /// Builds the list of distinct normalized lines an OCR result produced, combining the
    /// per-line texts with the newline-split full text (line texts first, blanks dropped).
    /// </summary>
    /// <param name="text">Full OCR text; <c>null</c> is treated as empty.</param>
    /// <param name="lineTexts">Text of each recognized line.</param>
    private static List<string> BuildActualSet(string? text, IEnumerable<string?> lineTexts)
    {
        var fromLines = lineTexts
            .Select(Normalize)
            .Where(s => s.Length > 0);

        var fromText = (text ?? string.Empty)
            .Split('\n')
            .Select(Normalize)
            .Where(s => s.Length > 0);

        return fromLines.Concat(fromText).Distinct().ToList();
    }

    /// <summary>One entry of the cases file.</summary>
    private sealed class TestCase
    {
        /// <summary>Identifier used in log output and in saved image file names.</summary>
        public string Id { get; set; } = "";

        /// <summary>Image to OCR; relative paths are resolved against the cases file's directory.</summary>
        public string Image { get; set; } = "";

        /// <summary>Optional "before" image; when set, the case runs through the snapshot + diff OCR path.</summary>
        public string? BeforeImage { get; set; }

        /// <summary>Lines that must appear (after normalization) in the OCR output.</summary>
        public List<string> Expected { get; set; } = [];
    }
}