tests started and tuning

This commit is contained in:
Boki 2026-02-11 15:17:44 -05:00
parent b8f5637c49
commit 641c87121a
17 changed files with 391 additions and 18 deletions

View file

@ -8,7 +8,10 @@
"dev": "dotnet build tools/OcrDaemon -c Release && tsx src/index.ts",
"build": "tsc",
"build:daemon": "dotnet build tools/OcrDaemon -c Release",
"start": "node dist/index.js"
"start": "node dist/index.js",
"stop:daemon": "taskkill /IM OcrDaemon.exe /F 2>nul || exit /b 0",
"test:ocr": "taskkill /IM OcrDaemon.exe /F 2>nul & dotnet build tools/OcrDaemon -c Release && echo {\"cmd\":\"test\"} | tools\\OcrDaemon\\bin\\Release\\net8.0-windows10.0.19041.0\\OcrDaemon.exe",
"tune:ocr": "taskkill /IM OcrDaemon.exe /F 2>nul & dotnet build tools/OcrDaemon -c Release && echo {\"cmd\":\"tune\"} | tools\\OcrDaemon\\bin\\Release\\net8.0-windows10.0.19041.0\\OcrDaemon.exe"
},
"dependencies": {
"chokidar": "^4.0.3",

View file

@ -63,6 +63,8 @@ static class Daemon
"capture" => ocrHandler.HandleCapture(request),
"snapshot" => ocrHandler.HandleSnapshot(request),
"diff-ocr" => ocrHandler.HandleDiffOcr(request),
"test" => ocrHandler.HandleTest(request),
"tune" => ocrHandler.HandleTune(request),
"grid" => gridHandler.HandleGrid(request),
"detect-grid" => detectGridHandler.HandleDetectGrid(request),
_ => new ErrorResponse($"Unknown command: {request.Cmd}"),

View file

@ -9,17 +9,16 @@ static class ImagePreprocessor
/// <summary>
/// Pre-process an image for OCR using morphological white top-hat filtering.
/// Isolates bright tooltip text, suppresses dim background text visible through overlay.
/// Pipeline: grayscale → morphological top-hat → Otsu binary → 2x upscale
/// Pipeline: grayscale → morphological top-hat → Otsu binary → upscale
/// </summary>
public static Bitmap PreprocessForOcr(Bitmap src)
public static Bitmap PreprocessForOcr(Bitmap src, int kernelSize = 25, int upscale = 2)
{
using var mat = BitmapConverter.ToMat(src);
using var gray = new Mat();
Cv2.CvtColor(mat, gray, ColorConversionCodes.BGRA2GRAY);
// Morphological white top-hat: isolates bright text on dark background
// A kernelSize x kernelSize kernel (25x25 by default) captures text strokes, suppresses dim background text
using var kernel = Cv2.GetStructuringElement(MorphShapes.Rect, new OpenCvSharp.Size(25, 25));
using var kernel = Cv2.GetStructuringElement(MorphShapes.Rect, new OpenCvSharp.Size(kernelSize, kernelSize));
using var tophat = new Mat();
Cv2.MorphologyEx(gray, tophat, MorphTypes.TopHat, kernel);
@ -27,11 +26,15 @@ static class ImagePreprocessor
using var binary = new Mat();
Cv2.Threshold(tophat, binary, 0, 255, ThresholdTypes.BinaryInv | ThresholdTypes.Otsu);
// 2x upscale for better LSTM recognition
using var upscaled = new Mat();
Cv2.Resize(binary, upscaled, new OpenCvSharp.Size(binary.Width * 2, binary.Height * 2),
interpolation: InterpolationFlags.Cubic);
// Upscale for better LSTM recognition
if (upscale > 1)
{
using var upscaled = new Mat();
Cv2.Resize(binary, upscaled, new OpenCvSharp.Size(binary.Width * upscale, binary.Height * upscale),
interpolation: InterpolationFlags.Cubic);
return BitmapConverter.ToBitmap(upscaled);
}
return BitmapConverter.ToBitmap(upscaled);
return BitmapConverter.ToBitmap(binary);
}
}

View file

@ -208,3 +208,101 @@ class DetectGridResponse
[JsonPropertyName("cellHeight")]
public double CellHeight { get; set; }
}
/// <summary>
/// Tunable parameters for the diff-OCR pipeline. Each property's initializer is the
/// value used when a caller constructs the object without overriding it.
/// </summary>
class DiffOcrParams
{
    /// <summary>Threshold the diff-OCR handler reads as its <c>diffThresh</c>.</summary>
    [JsonPropertyName("diffThresh")]
    public int DiffThresh { get; set; } = 10;

    /// <summary>Divisor for the row threshold: the handler computes <c>width / RowThreshDiv</c>.</summary>
    [JsonPropertyName("rowThreshDiv")]
    public int RowThreshDiv { get; set; } = 30;

    /// <summary>Divisor for the column threshold: the handler computes <c>tooltipHeight / ColThreshDiv</c>.</summary>
    [JsonPropertyName("colThreshDiv")]
    public int ColThreshDiv { get; set; } = 8;

    /// <summary>Value the diff-OCR handler reads as its <c>maxGap</c>.</summary>
    [JsonPropertyName("maxGap")]
    public int MaxGap { get; set; } = 20;

    /// <summary>
    /// Fraction of the average mid-section column density used as the cutoff when
    /// trimming columns from the edge of the detected region.
    /// </summary>
    [JsonPropertyName("trimCutoff")]
    public double TrimCutoff { get; set; } = 0.4;

    /// <summary>Kernel size forwarded to <c>ImagePreprocessor.PreprocessForOcr</c>.</summary>
    [JsonPropertyName("kernelSize")]
    public int KernelSize { get; set; } = 41;

    /// <summary>Upscale factor forwarded to <c>ImagePreprocessor.PreprocessForOcr</c>.</summary>
    [JsonPropertyName("upscale")]
    public int Upscale { get; set; } = 2;

    /// <summary>Returns a shallow copy, so a trial can be modified without touching this instance.</summary>
    public DiffOcrParams Clone()
    {
        return (DiffOcrParams)MemberwiseClone();
    }

    /// <summary>Formats every parameter as space-separated <c>name=value</c> pairs for log output.</summary>
    public override string ToString()
    {
        return $"diffThresh={DiffThresh} rowThreshDiv={RowThreshDiv} colThreshDiv={ColThreshDiv} maxGap={MaxGap} trimCutoff={TrimCutoff:F2} kernelSize={KernelSize} upscale={Upscale}";
    }
}
/// <summary>One entry of <c>cases.json</c>: a pair of images plus the text lines OCR should produce.</summary>
class TestCase
{
    /// <summary>Identifier used in log output and in the per-case result.</summary>
    [JsonPropertyName("id")]
    public string Id { get; set; } = string.Empty;

    /// <summary>Path of the tooltip image, combined with the tessdata directory by the test runner.</summary>
    [JsonPropertyName("image")]
    public string Image { get; set; } = string.Empty;

    /// <summary>Path of the full image passed to the snapshot step, combined with the tessdata directory.</summary>
    [JsonPropertyName("fullImage")]
    public string FullImage { get; set; } = string.Empty;

    /// <summary>Text lines the OCR output is fuzzily matched against.</summary>
    [JsonPropertyName("expected")]
    public List<string> Expected { get; set; } = new();
}
/// <summary>Outcome of running a single test case through the diff-OCR pipeline.</summary>
class TestCaseResult
{
    /// <summary>Identifier copied from the test case.</summary>
    [JsonPropertyName("id")]
    public string Id { get; set; } = string.Empty;

    /// <summary>True when no expected line was missed.</summary>
    [JsonPropertyName("passed")]
    public bool Passed { get; set; }

    /// <summary>Matched expected lines divided by the number of expected lines.</summary>
    [JsonPropertyName("score")]
    public double Score { get; set; }

    /// <summary>Expected lines that found a sufficiently similar OCR line.</summary>
    [JsonPropertyName("matched")]
    public List<string> Matched { get; set; } = new();

    /// <summary>Expected lines without a sufficiently similar OCR line.</summary>
    [JsonPropertyName("missed")]
    public List<string> Missed { get; set; } = new();

    /// <summary>OCR lines that were not paired with any expected line.</summary>
    [JsonPropertyName("extra")]
    public List<string> Extra { get; set; } = new();
}
/// <summary>Response for a full test run: aggregate counts plus one result per case.</summary>
class TestResponse
{
    /// <summary>Always <c>true</c>; marks the response as successful.</summary>
    [JsonPropertyName("ok")]
    public bool Ok
    {
        get { return true; }
    }

    /// <summary>Number of cases that passed.</summary>
    [JsonPropertyName("passed")]
    public int Passed { get; set; }

    /// <summary>Number of cases that did not pass.</summary>
    [JsonPropertyName("failed")]
    public int Failed { get; set; }

    /// <summary>Number of cases that were loaded.</summary>
    [JsonPropertyName("total")]
    public int Total { get; set; }

    /// <summary>Per-case results.</summary>
    [JsonPropertyName("results")]
    public List<TestCaseResult> Results { get; set; } = new();
}
/// <summary>Response for a tuning run: the best parameter set found and how well it scored.</summary>
class TuneResponse
{
    /// <summary>Always <c>true</c>; marks the response as successful.</summary>
    [JsonPropertyName("ok")]
    public bool Ok
    {
        get { return true; }
    }

    /// <summary>Score achieved by <see cref="BestParams"/>.</summary>
    [JsonPropertyName("bestScore")]
    public double BestScore { get; set; }

    /// <summary>Parameter set that produced <see cref="BestScore"/>.</summary>
    [JsonPropertyName("bestParams")]
    public DiffOcrParams BestParams { get; set; } = new DiffOcrParams();

    /// <summary>Number of parameter evaluations performed by the tuner.</summary>
    [JsonPropertyName("iterations")]
    public int Iterations { get; set; }
}

View file

@ -22,6 +22,12 @@
<None Update="tessdata\poe2.traineddata" Condition="Exists('tessdata\poe2.traineddata')">
<CopyToOutputDirectory>PreserveNewest</CopyToOutputDirectory>
</None>
<None Update="tessdata\cases.json">
<CopyToOutputDirectory>PreserveNewest</CopyToOutputDirectory>
</None>
<None Include="tessdata\images\*">
<CopyToOutputDirectory>PreserveNewest</CopyToOutputDirectory>
</None>
</ItemGroup>
</Project>

View file

@ -3,6 +3,7 @@ namespace OcrDaemon;
using System.Drawing;
using System.Drawing.Imaging;
using System.Runtime.InteropServices;
using System.Text.Json;
using Tesseract;
using SdImageFormat = System.Drawing.Imaging.ImageFormat;
@ -54,7 +55,12 @@ class OcrHandler(TesseractEngine engine)
return new OkResponse();
}
public object HandleDiffOcr(Request req)
/// <summary>
/// Runs diff-OCR with default parameters, taking the diff threshold from the request
/// when it is positive and falling back to 30 otherwise.
/// </summary>
/// <param name="req">The incoming request; only <c>Threshold</c> is read here.</param>
/// <returns>Whatever the parameterized overload returns.</returns>
public object HandleDiffOcr(Request req)
{
    // NOTE(review): this fallback (30) differs from the DiffOcrParams default for
    // DiffThresh, which the test/tune handlers start from. Confirm that is intended.
    var parameters = new DiffOcrParams();
    if (req.Threshold > 0)
        parameters.DiffThresh = req.Threshold;
    else
        parameters.DiffThresh = 30;

    return HandleDiffOcr(req, parameters);
}
public object HandleDiffOcr(Request req, DiffOcrParams p)
{
if (_referenceFrame == null)
return new ErrorResponse("No reference snapshot stored. Send 'snapshot' first.");
@ -78,7 +84,7 @@ class OcrHandler(TesseractEngine engine)
// Detect pixels that got DARKER (tooltip = dark overlay).
// This filters out item highlight glow (brighter) and cursor changes.
int diffThresh = req.Threshold > 0 ? req.Threshold : 30;
int diffThresh = p.DiffThresh;
bool[] changed = new bool[w * h];
int totalChanged = 0;
@ -110,7 +116,7 @@ class OcrHandler(TesseractEngine engine)
// Pass 1: Find row range using full-width row counts
// Pass 2: Find column range using only pixels within detected row range
// This makes the column threshold relative to tooltip height, not screen height.
int maxGap = 15;
int maxGap = p.MaxGap;
// Pass 1: count changed pixels per row, find longest active run
int[] rowCounts = new int[h];
@ -119,7 +125,7 @@ class OcrHandler(TesseractEngine engine)
if (changed[y * w + x])
rowCounts[y]++;
int rowThresh = w / 30; // ~3% of width
int rowThresh = w / p.RowThreshDiv;
int bestRowStart = 0, bestRowEnd = 0, bestRowLen = 0;
int curRowStart = -1, lastActiveRow = -1;
for (int y = 0; y < h; y++)
@ -150,7 +156,7 @@ class OcrHandler(TesseractEngine engine)
colCounts[x]++;
int tooltipHeight = bestRowEnd - bestRowStart + 1;
int colThresh = tooltipHeight / 15; // ~7% of tooltip height
int colThresh = tooltipHeight / p.ColThreshDiv;
int bestColStart = 0, bestColEnd = 0, bestColLen = 0;
int curColStart = -1, lastActiveCol = -1;
@ -202,7 +208,7 @@ class OcrHandler(TesseractEngine engine)
int midCount = 0;
for (int x = q1; x <= q3; x++) { midSum += colCounts[x]; midCount++; }
double avgMidDensity = (double)midSum / midCount;
double cutoff = avgMidDensity * 0.3; // column must have >=30% of avg density
double cutoff = avgMidDensity * p.TrimCutoff;
// Trim from right while below cutoff
while (maxX > minX + 100 && colCounts[maxX] < cutoff)
@ -227,8 +233,8 @@ class OcrHandler(TesseractEngine engine)
if (debug) Console.Error.WriteLine($" diff-ocr: saved raw to {req.Path}");
}
// Pre-process for OCR: boost contrast, invert colors
using var processed = ImagePreprocessor.PreprocessForOcr(cropped);
// Pre-process for OCR: top-hat + binarize + upscale
using var processed = ImagePreprocessor.PreprocessForOcr(cropped, p.KernelSize, p.Upscale);
// Save fullscreen and preprocessed versions alongside raw
if (!string.IsNullOrEmpty(req.Path))
@ -255,4 +261,257 @@ class OcrHandler(TesseractEngine engine)
Region = new RegionRect { X = minX, Y = minY, Width = rw, Height = rh },
};
}
/// <summary>
/// Handles the "test" command: runs all test cases with a default
/// <see cref="DiffOcrParams"/> and verbose logging.
/// </summary>
/// <param name="req">The incoming request; not read by this handler.</param>
/// <returns>The object produced by <c>RunTestCases</c>.</returns>
public object HandleTest(Request req)
{
    var defaults = new DiffOcrParams();
    return RunTestCases(defaults, verbose: true);
}
/// <summary>
/// Handles the "tune" command: searches for the <see cref="DiffOcrParams"/> that maximise the
/// average test-case score. One parameter is swept at a time while the others stay at their
/// current best; rounds repeat until a round brings no improvement or three rounds have run.
/// Progress is written to standard error.
/// </summary>
/// <param name="req">The incoming request; not read by this handler.</param>
/// <returns>
/// A <see cref="TuneResponse"/> with the best score, the best parameters and the number of
/// evaluations; or, when the test cases cannot be loaded, the error object returned by
/// <c>RunTestCases</c>.
/// </returns>
public object HandleTune(Request req)
{
    // Coordinate descent: optimize one parameter at a time, repeat until stable.
    var best = new DiffOcrParams();

    // Evaluate the baseline directly so a load failure is reported to the caller instead of
    // being scored as 0 and then "tuned" against no test cases at all.
    var baseline = RunTestCases(best, verbose: false);
    if (baseline is not TestResponse baselineReport)
        return baseline;
    double bestScore = baselineReport.Total > 0 ? baselineReport.Results.Average(r => r.Score) : 0;
    Console.Error.WriteLine($"\n=== Tuning start === baseline score={bestScore:F3} {best}\n");

    // Define search ranges for each parameter
    var sweeps = new (string Name, int[] Values, Action<DiffOcrParams, int> Set)[]
    {
        ("diffThresh", [10, 15, 20, 25, 30, 40, 50, 60], (p, v) => p.DiffThresh = v),
        ("rowThreshDiv", [10, 15, 20, 25, 30, 40, 50, 60], (p, v) => p.RowThreshDiv = v),
        ("colThreshDiv", [5, 8, 10, 12, 15, 20, 25, 30], (p, v) => p.ColThreshDiv = v),
        ("maxGap", [5, 8, 10, 12, 15, 20, 25, 30], (p, v) => p.MaxGap = v),
        ("kernelSize", [11, 15, 19, 21, 25, 31, 35, 41], (p, v) => p.KernelSize = v),
        ("upscale", [1, 2, 3], (p, v) => p.Upscale = v),
    };

    // trimCutoff needs double values — handle separately
    double[] trimValues = [0.1, 0.15, 0.2, 0.25, 0.3, 0.4, 0.5];

    int totalEvals = 0;
    const int maxRounds = 3;
    for (int round = 0; round < maxRounds; round++)
    {
        bool improved = false;
        Console.Error.WriteLine($"--- Round {round + 1} ---");

        // Sweep integer params
        foreach (var (name, values, set) in sweeps)
        {
            Console.Error.Write($" {name}: ");
            int bestVal = 0;
            double bestValScore = -1;
            foreach (int v in values)
            {
                // Each candidate is tried on a copy so a rejected value never leaks into `best`.
                var trial = best.Clone();
                set(trial, v);
                double score = ScoreParams(trial);
                totalEvals++;
                Console.Error.Write($"{v}={score:F3} ");
                if (score > bestValScore) { bestValScore = score; bestVal = v; }
            }
            Console.Error.WriteLine();

            // Adopt the sweep winner only on a strict improvement over the running best.
            if (bestValScore > bestScore)
            {
                set(best, bestVal);
                bestScore = bestValScore;
                improved = true;
                Console.Error.WriteLine($" → {name}={bestVal} score={bestScore:F3}");
            }
        }

        // Sweep trimCutoff
        {
            Console.Error.Write($" trimCutoff: ");
            double bestTrim = best.TrimCutoff;
            double bestTrimScore = bestScore;
            foreach (double v in trimValues)
            {
                var trial = best.Clone();
                trial.TrimCutoff = v;
                double score = ScoreParams(trial);
                totalEvals++;
                Console.Error.Write($"{v:F2}={score:F3} ");
                if (score > bestTrimScore) { bestTrimScore = score; bestTrim = v; }
            }
            Console.Error.WriteLine();
            if (bestTrimScore > bestScore)
            {
                best.TrimCutoff = bestTrim;
                bestScore = bestTrimScore;
                improved = true;
                Console.Error.WriteLine($" → trimCutoff={bestTrim:F2} score={bestScore:F3}");
            }
        }

        Console.Error.WriteLine($" End of round {round + 1}: score={bestScore:F3} {best}");
        if (!improved) break;
    }

    Console.Error.WriteLine($"\n=== Tuning done === evals={totalEvals} bestScore={bestScore:F3}\n {best}\n");

    // Run verbose test with best params for final report; only its log output is needed.
    RunTestCases(best, verbose: true);

    return new TuneResponse
    {
        BestScore = bestScore,
        BestParams = best,
        Iterations = totalEvals,
    };
}
/// <summary>
/// Scores a parameter set as the mean per-case score over a non-verbose test run.
/// Yields 0 when the run does not produce a <see cref="TestResponse"/> or reports no cases.
/// </summary>
private double ScoreParams(DiffOcrParams p)
{
    if (RunTestCases(p, verbose: false) is not TestResponse { Total: > 0 } report)
        return 0;

    return report.Results.Average(caseResult => caseResult.Score);
}
/// <summary>
/// Runs every case from <c>tessdata/cases.json</c> (next to the executable) through the
/// snapshot + diff-OCR pipeline with the given parameters and fuzzy-matches the OCR lines
/// against each case's expected lines.
/// </summary>
/// <param name="p">Parameters passed to the diff-OCR step for every case.</param>
/// <param name="verbose">When true, per-case details and a summary are written to standard error.</param>
/// <returns>
/// A <see cref="TestResponse"/> with one result per case, or an <see cref="ErrorResponse"/>
/// when cases.json is missing, cannot be parsed, or contains no cases.
/// </returns>
/// <remarks>
/// Each case calls <c>HandleSnapshot</c> before diffing, so any reference the handler held
/// before the run is presumably replaced — confirm against <c>HandleSnapshot</c>.
/// </remarks>
private object RunTestCases(DiffOcrParams p, bool verbose)
{
    var tessdataDir = Path.Combine(AppContext.BaseDirectory, "tessdata");
    var casesPath = Path.Combine(tessdataDir, "cases.json");
    if (!File.Exists(casesPath))
        return new ErrorResponse($"cases.json not found at {casesPath}");

    List<TestCase>? cases;
    try
    {
        cases = JsonSerializer.Deserialize<List<TestCase>>(File.ReadAllText(casesPath));
    }
    catch (JsonException ex)
    {
        // Report malformed input the same way as a missing file instead of throwing.
        return new ErrorResponse($"Failed to parse cases.json: {ex.Message}");
    }

    if (cases == null || cases.Count == 0)
        return new ErrorResponse("No test cases found in cases.json");

    // A case that could not be evaluated counts as failed with every expected line missed.
    static TestCaseResult NotEvaluated(TestCase tc) =>
        new() { Id = tc.Id, Passed = false, Score = 0, Missed = tc.Expected };

    var results = new List<TestCaseResult>();
    int passCount = 0;

    foreach (var tc in cases)
    {
        if (verbose) Console.Error.WriteLine($"\n=== Test: {tc.Id} ===");

        var fullPath = Path.Combine(tessdataDir, tc.FullImage);
        var imagePath = Path.Combine(tessdataDir, tc.Image);

        if (!File.Exists(fullPath))
        {
            if (verbose) Console.Error.WriteLine($" SKIP: full image not found: {fullPath}");
            results.Add(NotEvaluated(tc));
            continue;
        }

        if (!File.Exists(imagePath))
        {
            if (verbose) Console.Error.WriteLine($" SKIP: tooltip image not found: {imagePath}");
            results.Add(NotEvaluated(tc));
            continue;
        }

        // Run the same pipeline: snapshot (reference) then diff-ocr (with tooltip)
        HandleSnapshot(new Request { File = fullPath });
        var diffResult = HandleDiffOcr(new Request { File = imagePath, Debug = verbose }, p);

        // Extract actual lines from the response
        List<string> actualLines;
        if (diffResult is DiffOcrResponse diffResp)
            actualLines = diffResp.Lines.Select(l => l.Text.Trim()).Where(l => l.Length > 0).ToList();
        else if (diffResult is OcrResponse ocrResp)
            actualLines = ocrResp.Lines.Select(l => l.Text.Trim()).Where(l => l.Length > 0).ToList();
        else
        {
            if (verbose) Console.Error.WriteLine($" ERROR: unexpected response type");
            results.Add(NotEvaluated(tc));
            continue;
        }

        var (matched, missed, extra) = MatchExpectedLines(tc.Expected, actualLines, verbose);

        // A case with no expected lines scores 1.0 rather than dividing by zero.
        double score = tc.Expected.Count > 0 ? (double)matched.Count / tc.Expected.Count : 1.0;
        bool passed = missed.Count == 0;
        if (passed) passCount++;

        if (verbose)
            Console.Error.WriteLine($" Result: {(passed ? "PASS" : "FAIL")} matched={matched.Count}/{tc.Expected.Count} extra={extra.Count} score={score:F2}");

        results.Add(new TestCaseResult
        {
            Id = tc.Id,
            Passed = passed,
            Score = score,
            Matched = matched,
            Missed = missed,
            Extra = extra,
        });
    }

    if (verbose)
        Console.Error.WriteLine($"\n=== Summary: {passCount}/{cases.Count} passed ===\n");

    return new TestResponse
    {
        Passed = passCount,
        Failed = cases.Count - passCount,
        Total = cases.Count,
        Results = results,
    };
}

/// <summary>
/// Pairs each expected line with its most similar not-yet-used actual line; a pair counts as a
/// match when the similarity is at least 0.75. Expected lines are processed in order, and each
/// actual line can be used at most once.
/// </summary>
/// <param name="expectedLines">Lines the test case expects.</param>
/// <param name="actualLines">Lines produced by OCR.</param>
/// <param name="verbose">When true, inexact matches, misses and extras are written to standard error.</param>
/// <returns>The matched and missed expected lines, and the actual lines left unpaired.</returns>
private static (List<string> Matched, List<string> Missed, List<string> Extra) MatchExpectedLines(
    List<string> expectedLines, List<string> actualLines, bool verbose)
{
    // Fuzzy match expected vs actual
    var matched = new List<string>();
    var missed = new List<string>();
    var usedActual = new HashSet<int>();

    foreach (var expected in expectedLines)
    {
        int bestIdx = -1;
        double bestSim = 0;
        for (int i = 0; i < actualLines.Count; i++)
        {
            if (usedActual.Contains(i)) continue;
            double sim = LevenshteinSimilarity(expected, actualLines[i]);
            if (sim > bestSim) { bestSim = sim; bestIdx = i; }
        }

        if (bestIdx >= 0 && bestSim >= 0.75)
        {
            matched.Add(expected);
            usedActual.Add(bestIdx);
            if (verbose && bestSim < 1.0)
                Console.Error.WriteLine($" ~ {expected} → {actualLines[bestIdx]} (sim={bestSim:F2})");
        }
        else
        {
            missed.Add(expected);
            if (verbose)
                Console.Error.WriteLine($" MISS: {expected}" + (bestIdx >= 0 ? $" (best: {actualLines[bestIdx]}, sim={bestSim:F2})" : ""));
        }
    }

    var extra = actualLines.Where((_, i) => !usedActual.Contains(i)).ToList();
    if (verbose)
        foreach (var e in extra)
            Console.Error.WriteLine($" EXTRA: {e}");

    return (matched, missed, extra);
}
/// <summary>
/// Case-insensitive similarity between two strings: 1 minus the Levenshtein edit distance
/// divided by the length of the longer string.
/// </summary>
/// <param name="a">First string.</param>
/// <param name="b">Second string.</param>
/// <returns>
/// 1.0 when the strings are equal ignoring case (including two empty strings), 0.0 when
/// exactly one of them is empty, otherwise a value between 0 and 1.
/// </returns>
private static double LevenshteinSimilarity(string a, string b)
{
    string left = a.ToLowerInvariant();
    string right = b.ToLowerInvariant();

    if (left == right)
        return 1.0;
    if (left.Length == 0 || right.Length == 0)
        return 0.0;

    // Only the previous and the current row of the distance table are ever read,
    // so two rolling rows are kept instead of the full table.
    int[] previousRow = new int[right.Length + 1];
    int[] currentRow = new int[right.Length + 1];
    for (int col = 0; col <= right.Length; col++)
        previousRow[col] = col;

    for (int row = 1; row <= left.Length; row++)
    {
        currentRow[0] = row;
        for (int col = 1; col <= right.Length; col++)
        {
            int deletion = previousRow[col] + 1;
            int insertion = currentRow[col - 1] + 1;
            int substitution = previousRow[col - 1] + (left[row - 1] == right[col - 1] ? 0 : 1);
            currentRow[col] = Math.Min(Math.Min(deletion, insertion), substitution);
        }

        // The row just filled becomes the "previous" row of the next iteration.
        (previousRow, currentRow) = (currentRow, previousRow);
    }

    int distance = previousRow[right.Length];
    return 1.0 - (double)distance / Math.Max(left.Length, right.Length);
}
}

View file

@ -2,6 +2,7 @@
{
"id": "vertex1",
"image": "images/vertex1.png",
"fullImage": "images/vertex-snapshot.png",
"expected": [
"The Vertex",
"Tribal Mask",
@ -26,6 +27,7 @@
{
"id": "vertex2",
"image": "images/vertex2.png",
"fullImage": "images/vertex-snapshot.png",
"expected": [
"The Vertex",
"Tribal Mask",

Binary file not shown.

After

Width:  |  Height:  |  Size: 5.8 MiB

Binary file not shown.

After

Width:  |  Height:  |  Size: 5.8 MiB

Binary file not shown.

After

Width:  |  Height:  |  Size: 5.8 MiB

Binary file not shown.

After

Width:  |  Height:  |  Size: 5.8 MiB

Binary file not shown.

After

Width:  |  Height:  |  Size: 5.8 MiB

Binary file not shown.

After

Width:  |  Height:  |  Size: 5.8 MiB

Binary file not shown.

After

Width:  |  Height:  |  Size: 5.9 MiB

Binary file not shown.

After

Width:  |  Height:  |  Size: 5.9 MiB

Binary file not shown.

Before

Width:  |  Height:  |  Size: 388 KiB

After

Width:  |  Height:  |  Size: 5.8 MiB

Before After
Before After

Binary file not shown.

Before

Width:  |  Height:  |  Size: 266 KiB

After

Width:  |  Height:  |  Size: 5.9 MiB

Before After
Before After