// Listing metadata from the original code viewer: 232 lines, 8 KiB, C#.
namespace OcrDaemon;
|
|
|
|
using System.Collections.Generic;
|
|
using System.Drawing;
|
|
using System.IO;
|
|
using System.Linq;
|
|
using System.Text.Json;
|
|
using Tesseract;
|
|
|
|
/// <summary>
/// Command-line regression runner for the OCR pipeline. It loads a list of test
/// cases from a JSON file, runs OCR on each case image (optionally as a diff
/// against a "before" image), and reports how many of the expected text lines
/// were recognized.
/// </summary>
static class TestRunner
{
    /// <summary>Flag that enables saving intermediate images; may be followed by a directory.</summary>
    private const string SavePreFlag = "--save-pre";

    /// <summary>Directory used when <c>--save-pre</c> is given without a value.</summary>
    private const string DefaultSavePreDir = "processed";

    private static readonly JsonSerializerOptions JsonOptions = new()
    {
        PropertyNameCaseInsensitive = true,
    };

    /// <summary>
    /// Runs every case listed in the cases file and writes a per-case and summary
    /// report to standard error.
    /// </summary>
    /// <param name="args">
    /// Command-line arguments. The first argument that is not part of a
    /// <c>--save-pre [dir]</c> option is the path of the cases file; when it is absent
    /// or blank, <c>tessdata/cases.json</c> under the application base directory is used.
    /// <c>--save-pre</c> enables saving intermediate images, into the given directory or
    /// into <c>processed</c> when no directory follows the flag. A relative directory is
    /// resolved against the directory of the cases file.
    /// </param>
    /// <returns>
    /// <c>0</c> when no case failed (skipped cases do not count as failures),
    /// <c>1</c> when the cases file is missing, unreadable as JSON, or empty,
    /// <c>2</c> when at least one case failed.
    /// </returns>
    public static int Run(string[] args)
    {
        string baseDir = AppContext.BaseDirectory;
        var (requestedCasesPath, savePreDir) = ParseArguments(args);

        string casesPath = string.IsNullOrWhiteSpace(requestedCasesPath)
            ? Path.Combine(baseDir, "tessdata", "cases.json")
            : requestedCasesPath;

        if (!File.Exists(casesPath))
        {
            Console.Error.WriteLine($"cases.json not found: {casesPath}");
            return 1;
        }

        List<TestCase> cases;
        try
        {
            string json = File.ReadAllText(casesPath);
            cases = JsonSerializer.Deserialize<List<TestCase>>(json, JsonOptions) ?? [];
        }
        catch (JsonException ex)
        {
            // A malformed cases file is a setup error, same as a missing one.
            Console.Error.WriteLine($"Invalid cases.json: {casesPath}: {ex.Message}");
            return 1;
        }

        if (cases.Count == 0)
        {
            Console.Error.WriteLine("No test cases found.");
            return 1;
        }

        string tessdataPath = Path.Combine(baseDir, "tessdata");
        // Prefer the custom "poe2" model when its traineddata file is present.
        string tessLang = File.Exists(Path.Combine(tessdataPath, "poe2.traineddata")) ? "poe2" : "eng";

        using var engine = new TesseractEngine(tessdataPath, tessLang, EngineMode.LstmOnly);
        engine.DefaultPageSegMode = PageSegMode.SingleBlock;
        engine.SetVariable("preserve_interword_spaces", "1");
        var ocrHandler = new OcrHandler(engine);

        int totalExpected = 0;
        int totalMatched = 0;
        int caseFailures = 0;

        string casesDir = Path.GetDirectoryName(casesPath) ?? baseDir;
        if (!string.IsNullOrEmpty(savePreDir))
        {
            savePreDir = ResolvePath(casesDir, savePreDir);
            // CreateDirectory is a no-op when the directory already exists.
            Directory.CreateDirectory(savePreDir);
        }

        foreach (var tc in cases)
        {
            // A JSON array may contain a literal null entry; treat it like a case without an image.
            if (tc is null || string.IsNullOrWhiteSpace(tc.Image))
            {
                Console.Error.WriteLine($"[SKIP] {tc?.Id}: missing image path");
                continue;
            }

            string imagePath = ResolvePath(casesDir, tc.Image);
            if (!File.Exists(imagePath))
            {
                Console.Error.WriteLine($"[SKIP] {tc.Id}: image not found: {imagePath}");
                continue;
            }

            var options = new OcrOptions();
            List<string>? actualSet;

            if (!string.IsNullOrWhiteSpace(tc.BeforeImage))
            {
                string beforePath = ResolvePath(casesDir, tc.BeforeImage);
                if (!File.Exists(beforePath))
                {
                    Console.Error.WriteLine($"[SKIP] {tc.Id}: before image not found: {beforePath}");
                    continue;
                }

                actualSet = RecognizeDiff(ocrHandler, tc.Id, beforePath, imagePath, options, savePreDir);
                if (actualSet is null)
                {
                    // The handler reported an error; it has already been logged.
                    caseFailures++;
                    continue;
                }
            }
            else
            {
                actualSet = RecognizeDirect(engine, tc.Id, imagePath, options, savePreDir);
            }

            // "expected": null in the JSON overrides the property initializer, so guard it.
            var expectedLines = (tc.Expected ?? [])
                .Select(Normalize)
                .Where(s => s.Length > 0)
                .ToList();

            int matched = ReportCase(tc.Id, expectedLines, actualSet);
            totalExpected += expectedLines.Count;
            totalMatched += matched;
            if (matched < expectedLines.Count)
            {
                caseFailures++;
            }
        }

        Console.Error.WriteLine($"Summary: matched {totalMatched}/{totalExpected} lines, failed cases: {caseFailures}");
        return caseFailures == 0 ? 0 : 2;
    }

    /// <summary>
    /// Splits the command line into the optional cases-file path and the optional
    /// save directory. The token following <c>--save-pre</c> is consumed as its value
    /// unless it starts with <c>--</c>; the first remaining token is the cases path.
    /// </summary>
    private static (string? CasesPath, string? SavePreDir) ParseArguments(string[] args)
    {
        string? casesPath = null;
        string? savePreDir = null;

        for (int i = 0; i < args.Length; i++)
        {
            string arg = args[i];
            if (string.Equals(arg, SavePreFlag, StringComparison.OrdinalIgnoreCase))
            {
                bool hasValue = i + 1 < args.Length
                    && !args[i + 1].StartsWith("--", StringComparison.Ordinal);
                savePreDir = hasValue ? args[++i] : DefaultSavePreDir;
                continue;
            }

            // Only the first positional argument is meaningful; later ones are ignored.
            casesPath ??= arg;
        }

        return (casesPath, savePreDir);
    }

    /// <summary>Returns <paramref name="path"/> unchanged when rooted, otherwise combined with <paramref name="baseDirectory"/>.</summary>
    private static string ResolvePath(string baseDirectory, string path)
    {
        return Path.IsPathRooted(path) ? path : Path.Combine(baseDirectory, path);
    }

    /// <summary>
    /// Runs the snapshot + diff-OCR flow through <paramref name="ocrHandler"/> and
    /// returns the normalized recognized lines, or <c>null</c> when the handler
    /// returned an error response (which is logged as a failure here).
    /// </summary>
    private static List<string>? RecognizeDiff(
        OcrHandler ocrHandler,
        string caseId,
        string beforePath,
        string imagePath,
        OcrOptions options,
        string? savePreDir)
    {
        ocrHandler.HandleSnapshot(new Request { File = beforePath });

        string? savePath = null;
        if (!string.IsNullOrEmpty(savePreDir))
        {
            savePath = Path.Combine(savePreDir, $"{caseId}.raw.png");
        }

        var response = ocrHandler.HandleDiffOcr(new Request
        {
            File = imagePath,
            Ocr = options,
            Path = savePath,
        });

        if (response is ErrorResponse err)
        {
            Console.Error.WriteLine($"[FAIL] {caseId}: {err.Error}");
            return null;
        }

        if (response is DiffOcrResponse diff)
        {
            return BuildActualSet(diff.Text, diff.Lines);
        }

        if (response is OcrResponse ocr)
        {
            return BuildActualSet(ocr.Text, ocr.Lines);
        }

        // Any other response type carries no text to compare against.
        return [];
    }

    /// <summary>
    /// Preprocesses the image, runs it through the Tesseract engine directly and
    /// returns the normalized recognized lines. When <paramref name="savePreDir"/>
    /// is set, the preprocessed image is saved there as <c>{caseId}.pre.png</c>.
    /// </summary>
    private static List<string> RecognizeDirect(
        TesseractEngine engine,
        string caseId,
        string imagePath,
        OcrOptions options,
        string? savePreDir)
    {
        using var bitmap = new Bitmap(imagePath);
        using var processed = ImagePreprocessor.PreprocessForOcr(bitmap, options);

        if (!string.IsNullOrEmpty(savePreDir))
        {
            string outPath = Path.Combine(savePreDir, $"{caseId}.pre.png");
            processed.Save(outPath, System.Drawing.Imaging.ImageFormat.Png);
        }

        using var pix = ImageUtils.BitmapToPix(processed);
        using var page = engine.Process(pix);

        var lines = ImageUtils.ExtractLinesFromPage(page, offsetX: 0, offsetY: 0, minConfidence: options.MinConfidence);
        return MergeDistinct(lines.Select(l => Normalize(l.Text)), page.GetText());
    }

    /// <summary>
    /// Compares expected and actual lines for one case, logs the outcome, and
    /// returns the number of expected lines found in <paramref name="actualSet"/>.
    /// </summary>
    private static int ReportCase(string caseId, List<string> expectedLines, List<string> actualSet)
    {
        // Hash lookup instead of List.Contains per expected line; same ordinal equality.
        var recognized = new HashSet<string>(actualSet);
        var missing = expectedLines.Where(e => !recognized.Contains(e)).ToList();
        int matched = expectedLines.Count - missing.Count;

        if (missing.Count == 0)
        {
            Console.Error.WriteLine($"[OK] {caseId}: matched {matched}/{expectedLines.Count}");
            return matched;
        }

        Console.Error.WriteLine($"[FAIL] {caseId}: matched {matched}/{expectedLines.Count}");
        foreach (var line in missing)
        {
            Console.Error.WriteLine($" missing: {line}");
        }

        Console.Error.WriteLine(" actual:");
        foreach (var line in actualSet)
        {
            Console.Error.WriteLine($" > {line}");
        }

        return matched;
    }

    /// <summary>
    /// Canonicalizes a line for comparison: trims it, lower-cases it with the
    /// invariant culture, and collapses every run of whitespace into one space.
    /// Returns an empty string for null or whitespace-only input.
    /// </summary>
    private static string Normalize(string? input)
    {
        if (string.IsNullOrWhiteSpace(input))
        {
            return string.Empty;
        }

        string trimmed = input.Trim().ToLowerInvariant();
        var sb = new System.Text.StringBuilder(trimmed.Length);
        bool inSpace = false;
        foreach (char c in trimmed)
        {
            if (char.IsWhiteSpace(c))
            {
                if (!inSpace)
                {
                    sb.Append(' ');
                    inSpace = true;
                }

                continue;
            }

            inSpace = false;
            sb.Append(c);
        }

        return sb.ToString().Trim();
    }

    /// <summary>
    /// Builds the de-duplicated list of normalized lines from a response: the
    /// per-line results first, followed by the lines of the full text.
    /// A null <paramref name="text"/> or <paramref name="lines"/> is treated as empty.
    /// </summary>
    private static List<string> BuildActualSet(string text, List<OcrLineResult> lines)
    {
        var lineTexts = (lines ?? []).Select(l => Normalize(l.Text));
        return MergeDistinct(lineTexts, text);
    }

    /// <summary>
    /// Concatenates already-normalized lines with the normalized lines of
    /// <paramref name="text"/> (split on '\n'), drops empty entries and duplicates,
    /// and keeps first-seen order.
    /// </summary>
    private static List<string> MergeDistinct(IEnumerable<string> normalizedLines, string? text)
    {
        var textLines = (text ?? string.Empty).Split('\n').Select(Normalize);
        return normalizedLines
            .Concat(textLines)
            .Where(s => s.Length > 0)
            .Distinct()
            .ToList();
    }

    /// <summary>One entry of the cases file.</summary>
    private sealed class TestCase
    {
        /// <summary>Identifier used in log output and in saved image file names.</summary>
        public string Id { get; set; } = "";

        /// <summary>Image to OCR; relative paths are resolved against the cases file directory.</summary>
        public string Image { get; set; } = "";

        /// <summary>Optional "before" image; when set, the case runs through the snapshot + diff-OCR flow.</summary>
        public string? BeforeImage { get; set; }

        /// <summary>Lines that must appear (after normalization) in the OCR output.</summary>
        public List<string> Expected { get; set; } = [];
    }
}
|