added easyOCR

This commit is contained in:
Boki 2026-02-12 01:04:19 -05:00
parent 37d6678577
commit 9f208b0606
27 changed files with 1780 additions and 112 deletions

View file

@ -53,6 +53,8 @@ static class Daemon
var ocrHandler = new OcrHandler(tessEngine);
var gridHandler = new GridHandler();
var detectGridHandler = new DetectGridHandler();
var templateMatchHandler = new TemplateMatchHandler();
var pythonBridge = new PythonOcrBridge();
// Main loop: read one JSON line, handle, write one JSON line
string? line;
@ -72,16 +74,21 @@ static class Daemon
object response = request.Cmd?.ToLowerInvariant() switch
{
"ocr" when request.Engine is "easyocr"
=> pythonBridge.HandleOcr(request, request.Engine),
"ocr" => ocrHandler.HandleOcr(request),
"screenshot" => ocrHandler.HandleScreenshot(request),
"capture" => ocrHandler.HandleCapture(request),
"snapshot" => ocrHandler.HandleSnapshot(request),
"diff-ocr" when request.Engine is "easyocr"
=> HandleDiffOcrPython(ocrHandler, pythonBridge, request),
"diff-ocr" => ocrHandler.HandleDiffOcr(request),
"test" => ocrHandler.HandleTest(request),
"tune" => ocrHandler.HandleTune(request),
"grid" => gridHandler.HandleGrid(request),
"detect-grid" => detectGridHandler.HandleDetectGrid(request),
_ => new ErrorResponse($"Unknown command: {request.Cmd}"),
"detect-grid" => detectGridHandler.HandleDetectGrid(request),
"match-template" => templateMatchHandler.HandleTemplateMatch(request),
_ => new ErrorResponse($"Unknown command: {request.Cmd}"),
};
WriteResponse(response);
}
@ -91,9 +98,59 @@ static class Daemon
}
}
pythonBridge.Dispose();
return 0;
}
/// <summary>
/// Handles <c>diff-ocr</c> for a Python-backed engine: runs the diff/crop step on
/// <paramref name="ocrHandler"/>, sends the crop to <paramref name="pythonBridge"/>,
/// and shifts the returned word boxes by the crop origin.
/// </summary>
/// <param name="ocrHandler">Handler that produces the crop via <c>DiffCrop</c>.</param>
/// <param name="pythonBridge">Bridge that runs OCR on the cropped bitmap.</param>
/// <param name="request">Incoming request; <c>Threshold</c>, <c>Path</c> and <c>Engine</c> are read here.</param>
/// <returns>
/// A <see cref="DiffOcrResponse"/> with text, lines and region, or an empty
/// <see cref="OcrResponse"/> when <c>DiffCrop</c> returns null.
/// </returns>
private static object HandleDiffOcrPython(OcrHandler ocrHandler, PythonOcrBridge pythonBridge, Request request)
{
    var sw = System.Diagnostics.Stopwatch.StartNew();
    var p = request.Threshold > 0
        ? new DiffOcrParams { DiffThresh = request.Threshold }
        : new DiffOcrParams();

    var cropResult = ocrHandler.DiffCrop(request, p);
    if (cropResult == null)
        return new OcrResponse { Text = "", Lines = [] };

    var (cropped, refCropped, current, region) = cropResult.Value;
    // Take ownership of all three bitmaps for the whole method so they are released
    // even when saving the crop or the Python round-trip throws.
    using var _current = current;
    using var _refCropped = refCropped;
    using var _cropped = cropped;
    var diffMs = sw.ElapsedMilliseconds;

    // Save crop to requested path if provided
    if (!string.IsNullOrEmpty(request.Path))
    {
        var dir = Path.GetDirectoryName(request.Path);
        if (!string.IsNullOrEmpty(dir) && !Directory.Exists(dir))
            Directory.CreateDirectory(dir);
        cropped.Save(request.Path, ImageUtils.GetImageFormat(request.Path));
    }

    // Send crop to Python via base64 over pipe (no temp file I/O)
    sw.Restart();
    var ocrResult = pythonBridge.OcrFromBitmap(cropped, request.Engine!);
    var ocrMs = sw.ElapsedMilliseconds;
    Console.Error.WriteLine($" diff-ocr-python: diff={diffMs}ms ocr={ocrMs}ms total={diffMs + ocrMs}ms crop={region.Width}x{region.Height}");

    // Offset word coordinates to screen space
    foreach (var line in ocrResult.Lines)
        foreach (var word in line.Words)
        {
            word.X += region.X;
            word.Y += region.Y;
        }

    return new DiffOcrResponse
    {
        Text = ocrResult.Text,
        Lines = ocrResult.Lines,
        Region = region,
    };
}
private static void WriteResponse(object response)
{
var json = JsonSerializer.Serialize(response, JsonOptions);

View file

@ -69,12 +69,13 @@ class GridHandler
templateSum += templateGray[ty * templateW + tx];
innerCount++;
}
double tmplMean = innerCount > 0 ? (double)templateSum / innerCount : 0;
// Threshold for mean absolute difference — default 6
double diffThreshold = req.Threshold > 0 ? req.Threshold : 2;
// Threshold for brightness-normalized MAD
double diffThreshold = req.Threshold > 0 ? req.Threshold : 5;
bool debug = req.Debug;
if (debug) Console.Error.WriteLine($"Grid: {cols}x{rows}, cellW={cellW:F1}, cellH={cellH:F1}, border={border}, threshold={diffThreshold}");
if (debug) Console.Error.WriteLine($"Grid: {cols}x{rows}, cellW={cellW:F1}, cellH={cellH:F1}, border={border}, threshold={diffThreshold}, tmplMean={tmplMean:F1}");
var cells = new List<List<bool>>();
for (int row = 0; row < rows; row++)
@ -88,21 +89,30 @@ class GridHandler
int cw = (int)Math.Min(cellW, captureW - cx0);
int ch = (int)Math.Min(cellH, bitmap.Height - cy0);
// Compare inner pixels of cell vs template
long diffSum = 0;
int compared = 0;
int innerW = Math.Min(cw, templateW) - border;
int innerH = Math.Min(ch, templateH) - border;
// First pass: compute cell region mean brightness
long cellSum = 0;
int compared = 0;
for (int py = border; py < innerH; py++)
{
for (int px = border; px < innerW; px++)
{
int cellVal = captureGray[(cy0 + py) * captureW + (cx0 + px)];
int tmplVal = templateGray[py * templateW + px];
diffSum += Math.Abs(cellVal - tmplVal);
cellSum += captureGray[(cy0 + py) * captureW + (cx0 + px)];
compared++;
}
}
double cellMean = compared > 0 ? (double)cellSum / compared : 0;
double offset = cellMean - tmplMean;
// Second pass: MAD on brightness-normalized values
long diffSum = 0;
for (int py = border; py < innerH; py++)
for (int px = border; px < innerW; px++)
{
double cellVal = captureGray[(cy0 + py) * captureW + (cx0 + px)];
double tmplVal = templateGray[py * templateW + px];
diffSum += (long)Math.Abs(cellVal - tmplVal - offset);
}
double meanDiff = compared > 0 ? (double)diffSum / compared : 0;
bool occupied = meanDiff > diffThreshold;
rowList.Add(occupied);

View file

@ -39,6 +39,9 @@ class Request
[JsonPropertyName("targetCol")]
public int TargetCol { get; set; } = -1;
[JsonPropertyName("engine")]
public string? Engine { get; set; }
}
class RegionRect
@ -209,6 +212,30 @@ class DetectGridResponse
public double CellHeight { get; set; }
}
/// <summary>
/// JSON payload returned for the <c>match-template</c> command.
/// </summary>
class TemplateMatchResponse
{
    /// <summary>Always serialized as <c>true</c>.</summary>
    [JsonPropertyName("ok")]
    public bool Ok { get; } = true;

    /// <summary>Whether a match was reported.</summary>
    [JsonPropertyName("found")]
    public bool Found { get; set; }

    /// <summary>X coordinate reported for the match.</summary>
    [JsonPropertyName("x")]
    public int X { get; set; }

    /// <summary>Y coordinate reported for the match.</summary>
    [JsonPropertyName("y")]
    public int Y { get; set; }

    /// <summary>Width reported for the match.</summary>
    [JsonPropertyName("width")]
    public int Width { get; set; }

    /// <summary>Height reported for the match.</summary>
    [JsonPropertyName("height")]
    public int Height { get; set; }

    /// <summary>Match score reported by the matcher.</summary>
    [JsonPropertyName("confidence")]
    public double Confidence { get; set; }
}
class DiffOcrParams
{
[JsonPropertyName("diffThresh")]

View file

@ -3,6 +3,8 @@ namespace OcrDaemon;
using System.Drawing;
using System.Drawing.Imaging;
using System.Runtime.InteropServices;
using System.Threading;
using System.Threading.Tasks;
using System.Text.Json;
using OpenCvSharp;
using OpenCvSharp.Extensions;
@ -61,17 +63,20 @@ class OcrHandler(TesseractEngine engine)
? new DiffOcrParams { DiffThresh = req.Threshold }
: new DiffOcrParams());
public object HandleDiffOcr(Request req, DiffOcrParams p)
/// <summary>
/// Diff detection + crop only. Returns the raw tooltip crop bitmap and region,
/// or null if no tooltip detected. Caller is responsible for disposing the bitmap.
/// </summary>
public (Bitmap cropped, Bitmap refCropped, Bitmap current, RegionRect region)? DiffCrop(Request req, DiffOcrParams p)
{
if (_referenceFrame == null)
return new ErrorResponse("No reference snapshot stored. Send 'snapshot' first.");
return null;
using var current = ScreenCapture.CaptureOrLoad(req.File, null);
var current = ScreenCapture.CaptureOrLoad(req.File, null);
int w = Math.Min(_referenceFrame.Width, current.Width);
int h = Math.Min(_referenceFrame.Height, current.Height);
// Get raw pixels for both frames
var refData = _referenceFrame.LockBits(new Rectangle(0, 0, w, h), ImageLockMode.ReadOnly, PixelFormat.Format32bppArgb);
byte[] refPx = new byte[refData.Stride * h];
Marshal.Copy(refData.Scan0, refPx, 0, refPx.Length);
@ -83,49 +88,34 @@ class OcrHandler(TesseractEngine engine)
Marshal.Copy(curData.Scan0, curPx, 0, curPx.Length);
current.UnlockBits(curData);
// Detect pixels that got DARKER (tooltip = dark overlay).
// This filters out item highlight glow (brighter) and cursor changes.
int diffThresh = p.DiffThresh;
bool[] changed = new bool[w * h];
int totalChanged = 0;
for (int y = 0; y < h; y++)
// Pass 1: parallel row diff — compute rowCounts[] directly, no changed[] array
int[] rowCounts = new int[h];
Parallel.For(0, h, y =>
{
int count = 0;
int rowOffset = y * stride;
for (int x = 0; x < w; x++)
{
int i = y * stride + x * 4;
int darkerB = refPx[i] - curPx[i];
int darkerG = refPx[i + 1] - curPx[i + 1];
int darkerR = refPx[i + 2] - curPx[i + 2];
if (darkerB + darkerG + darkerR > diffThresh)
{
changed[y * w + x] = true;
totalChanged++;
}
int i = rowOffset + x * 4;
int darker = (refPx[i] - curPx[i]) + (refPx[i + 1] - curPx[i + 1]) + (refPx[i + 2] - curPx[i + 2]);
if (darker > diffThresh)
count++;
}
}
rowCounts[y] = count;
});
bool debug = req.Debug;
int totalChanged = 0;
for (int y = 0; y < h; y++) totalChanged += rowCounts[y];
if (totalChanged == 0)
{
if (debug) Console.Error.WriteLine(" diff-ocr: no changes detected");
return new OcrResponse { Text = "", Lines = [] };
current.Dispose();
return null;
}
// Two-pass density detection:
// Pass 1: Find row range using full-width row counts
// Pass 2: Find column range using only pixels within detected row range
// This makes the column threshold relative to tooltip height, not screen height.
int maxGap = p.MaxGap;
// Pass 1: count changed pixels per row, find longest active run
int[] rowCounts = new int[h];
for (int y = 0; y < h; y++)
for (int x = 0; x < w; x++)
if (changed[y * w + x])
rowCounts[y]++;
int rowThresh = w / p.RowThreshDiv;
int bestRowStart = 0, bestRowEnd = 0, bestRowLen = 0;
int curRowStart = -1, lastActiveRow = -1;
@ -149,12 +139,46 @@ class OcrHandler(TesseractEngine engine)
if (len > bestRowLen) { bestRowStart = curRowStart; bestRowEnd = lastActiveRow; bestRowLen = len; }
}
// Pass 2: count changed pixels per column, but only within the detected row range
// Pass 2: parallel column diff — only within the row range, recompute from raw pixels
int[] colCounts = new int[w];
for (int y = bestRowStart; y <= bestRowEnd; y++)
for (int x = 0; x < w; x++)
if (changed[y * w + x])
colCounts[x]++;
int rowRangeLen = bestRowEnd - bestRowStart + 1;
if (rowRangeLen <= 200)
{
// Small range: serial is faster than Parallel overhead
for (int y = bestRowStart; y <= bestRowEnd; y++)
{
int rowOffset = y * stride;
for (int x = 0; x < w; x++)
{
int i = rowOffset + x * 4;
int darker = (refPx[i] - curPx[i]) + (refPx[i + 1] - curPx[i + 1]) + (refPx[i + 2] - curPx[i + 2]);
if (darker > diffThresh)
colCounts[x]++;
}
}
}
else
{
Parallel.For(bestRowStart, bestRowEnd + 1,
() => new int[w],
(y, _, localCols) =>
{
int rowOffset = y * stride;
for (int x = 0; x < w; x++)
{
int i = rowOffset + x * 4;
int darker = (refPx[i] - curPx[i]) + (refPx[i + 1] - curPx[i + 1]) + (refPx[i + 2] - curPx[i + 2]);
if (darker > diffThresh)
localCols[x]++;
}
return localCols;
},
localCols =>
{
for (int x = 0; x < w; x++)
Interlocked.Add(ref colCounts[x], localCols[x]);
});
}
int tooltipHeight = bestRowEnd - bestRowStart + 1;
int colThresh = tooltipHeight / p.ColThreshDiv;
@ -181,13 +205,13 @@ class OcrHandler(TesseractEngine engine)
if (len > bestColLen) { bestColStart = curColStart; bestColEnd = lastActiveCol; bestColLen = len; }
}
// Log density detection results
Console.Error.WriteLine($" diff-ocr: changed={totalChanged} rows={bestRowStart}-{bestRowEnd}({bestRowLen}) cols={bestColStart}-{bestColEnd}({bestColLen}) rowThresh={rowThresh} colThresh={colThresh}");
if (bestRowLen < 50 || bestColLen < 50)
{
Console.Error.WriteLine($" diff-ocr: no tooltip-sized region found (rows={bestRowLen}, cols={bestColLen})");
return new OcrResponse { Text = "", Lines = [] };
current.Dispose();
return null;
}
int minX = bestColStart;
@ -195,13 +219,9 @@ class OcrHandler(TesseractEngine engine)
int maxX = Math.Min(bestColEnd, w - 1);
int maxY = Math.Min(bestRowEnd, h - 1);
// Dynamic right-edge trim: if the rightmost columns are much sparser than
// the tooltip body, trim them. This handles the ~5% of cases where ambient
// noise extends the detected region slightly on the right.
int colSpan = maxX - minX + 1;
if (colSpan > 100)
{
// Compute median column density in the middle 50% of the range
int q1 = minX + colSpan / 4;
int q3 = minX + colSpan * 3 / 4;
long midSum = 0;
@ -209,21 +229,38 @@ class OcrHandler(TesseractEngine engine)
for (int x = q1; x <= q3; x++) { midSum += colCounts[x]; midCount++; }
double avgMidDensity = (double)midSum / midCount;
double cutoff = avgMidDensity * p.TrimCutoff;
// Trim from right while below cutoff
while (maxX > minX + 100 && colCounts[maxX] < cutoff)
maxX--;
}
int rw = maxX - minX + 1;
int rh = maxY - minY + 1;
if (debug) Console.Error.WriteLine($" diff-ocr: tooltip region ({minX},{minY}) {rw}x{rh}");
var cropped = CropFromBytes(curPx, stride, minX, minY, rw, rh);
var refCropped = CropFromBytes(refPx, stride, minX, minY, rw, rh);
var region = new RegionRect { X = minX, Y = minY, Width = rw, Height = rh };
// Crop tooltip region from both current and reference frames
using var cropped = current.Clone(new Rectangle(minX, minY, rw, rh), PixelFormat.Format32bppArgb);
using var refCropped = _referenceFrame.Clone(new Rectangle(minX, minY, rw, rh), PixelFormat.Format32bppArgb);
Console.Error.WriteLine($" diff-ocr: tooltip region ({minX},{minY}) {rw}x{rh}");
// Save before/after preprocessing images if path is provided
return (cropped, refCropped, current, region);
}
public object HandleDiffOcr(Request req, DiffOcrParams p)
{
if (_referenceFrame == null)
return new ErrorResponse("No reference snapshot stored. Send 'snapshot' first.");
var cropResult = DiffCrop(req, p);
if (cropResult == null)
return new OcrResponse { Text = "", Lines = [] };
var (cropped, refCropped, current, region) = cropResult.Value;
using var _current = current;
using var _cropped = cropped;
using var _refCropped = refCropped;
bool debug = req.Debug;
int minX = region.X, minY = region.Y, rw = region.Width, rh = region.Height;
// Save raw crop if path is provided
if (!string.IsNullOrEmpty(req.Path))
{
var dir = Path.GetDirectoryName(req.Path);
@ -634,6 +671,24 @@ class OcrHandler(TesseractEngine engine)
};
}
/// <summary>
/// Fast crop from raw pixel bytes — avoids slow GDI+ Bitmap.Clone().
/// Copies a <paramref name="cropW"/> x <paramref name="cropH"/> window out of a
/// 4-bytes-per-pixel buffer into a new 32bpp ARGB bitmap, one row at a time.
/// </summary>
/// <param name="px">Source pixel buffer, 4 bytes per pixel.</param>
/// <param name="srcStride">Number of bytes per source row.</param>
/// <param name="cropX">Left edge of the window, in pixels.</param>
/// <param name="cropY">Top edge of the window, in pixels.</param>
/// <param name="cropW">Window width, in pixels.</param>
/// <param name="cropH">Window height, in pixels.</param>
/// <returns>A new bitmap that the caller must dispose.</returns>
private static Bitmap CropFromBytes(byte[] px, int srcStride, int cropX, int cropY, int cropW, int cropH)
{
    var bmp = new Bitmap(cropW, cropH, PixelFormat.Format32bppArgb);
    try
    {
        var data = bmp.LockBits(new Rectangle(0, 0, cropW, cropH), ImageLockMode.WriteOnly, PixelFormat.Format32bppArgb);
        try
        {
            int dstStride = data.Stride;
            int rowBytes = cropW * 4;
            for (int y = 0; y < cropH; y++)
            {
                int srcOffset = (cropY + y) * srcStride + cropX * 4;
                Marshal.Copy(px, srcOffset, data.Scan0 + y * dstStride, rowBytes);
            }
        }
        finally
        {
            // Always release the lock, even when a row copy throws (e.g. window outside the buffer).
            bmp.UnlockBits(data);
        }
        return bmp;
    }
    catch
    {
        // The bitmap never reaches the caller on failure, so release it here.
        bmp.Dispose();
        throw;
    }
}
private static double LevenshteinSimilarity(string a, string b)
{
a = a.ToLowerInvariant();

View file

@ -0,0 +1,193 @@
namespace OcrDaemon;
using System.Diagnostics;
using System.Drawing;
using System.Text.Json;
using System.Text.Json.Serialization;
using SdImageFormat = System.Drawing.Imaging.ImageFormat;
/// <summary>
/// Manages a persistent Python subprocess for EasyOCR / PaddleOCR.
/// Lazy-starts on first request; reuses the process for subsequent calls.
/// Same stdin/stdout JSON-per-line protocol as the C# daemon itself.
/// </summary>
/// <remarks>
/// Starting the process, writing a request line and reading its response line all happen
/// under one lock, so each request is paired with exactly one response. A process that
/// fails to start or to complete the ready handshake is torn down and never cached, so
/// the next request retries with a fresh process.
/// </remarks>
class PythonOcrBridge : IDisposable
{
    /// <summary>Milliseconds to wait for the process to exit after its stdin is closed.</summary>
    private const int ExitTimeoutMs = 3000;

    private static readonly JsonSerializerOptions JsonOptions = new()
    {
        PropertyNamingPolicy = JsonNamingPolicy.CamelCase,
        DefaultIgnoreCondition = JsonIgnoreCondition.WhenWritingNull,
    };

    private readonly string _daemonScript;
    private readonly string _pythonExe;
    private readonly object _lock = new();

    // Only ever holds a process that started and completed the ready handshake.
    private Process? _proc;

    public PythonOcrBridge()
    {
        // Resolve paths relative to this exe
        var exeDir = AppContext.BaseDirectory;
        // exeDir = tools/OcrDaemon/bin/Release/net8.0-.../
        // Walk up 4 levels to tools/
        var toolsDir = Path.GetFullPath(Path.Combine(exeDir, "..", "..", "..", ".."));
        _daemonScript = Path.GetFullPath(Path.Combine(toolsDir, "python-ocr", "daemon.py"));

        // Use the venv Python if it exists, otherwise fall back to system python
        var venvPython = Path.GetFullPath(Path.Combine(toolsDir, "python-ocr", ".venv", "Scripts", "python.exe"));
        _pythonExe = File.Exists(venvPython) ? venvPython : "python";
    }

    /// <summary>
    /// Run OCR on a screen region using the specified Python engine.
    /// Captures screenshot, saves to temp file, sends to Python, returns OcrResponse.
    /// </summary>
    /// <param name="req">Request supplying the optional source file and region.</param>
    /// <param name="engine">Engine name forwarded to the Python daemon.</param>
    public object HandleOcr(Request req, string engine)
    {
        var tmpPath = Path.Combine(Path.GetTempPath(), $"ocr_{Guid.NewGuid():N}.png");
        try
        {
            using var bitmap = ScreenCapture.CaptureOrLoad(req.File, req.Region);
            bitmap.Save(tmpPath, SdImageFormat.Png);
            return OcrFromFile(tmpPath, engine);
        }
        finally
        {
            // Best-effort cleanup: a leftover temp file must not mask the real result or error.
            try { File.Delete(tmpPath); } catch { /* ignore */ }
        }
    }

    /// <summary>
    /// Run OCR on an already-saved image file via the Python engine.
    /// </summary>
    public OcrResponse OcrFromFile(string imagePath, string engine)
    {
        var pyReq = new { cmd = "ocr", engine, imagePath };
        return SendPythonRequest(pyReq);
    }

    /// <summary>
    /// Run OCR on a bitmap via the Python engine (base64 PNG over pipe, no temp file).
    /// </summary>
    public OcrResponse OcrFromBitmap(Bitmap bitmap, string engine)
    {
        using var ms = new MemoryStream();
        bitmap.Save(ms, SdImageFormat.Png);
        var imageBase64 = Convert.ToBase64String(ms.ToArray());
        var pyReq = new { cmd = "ocr", engine, imageBase64 };
        return SendPythonRequest(pyReq);
    }

    /// <summary>
    /// Sends one JSON request line to the Python daemon (starting it if necessary) and
    /// converts the single response line into an <see cref="OcrResponse"/>.
    /// </summary>
    /// <exception cref="InvalidOperationException">
    /// The daemon closed its output, the response could not be parsed, or it reported <c>ok: false</c>.
    /// </exception>
    private OcrResponse SendPythonRequest(object pyReq)
    {
        var json = JsonSerializer.Serialize(pyReq, JsonOptions);

        string responseLine;
        lock (_lock)
        {
            // Check-and-use under the same lock so the process cannot be swapped
            // between the liveness check and the write/read pair.
            var proc = EnsureRunning();
            proc.StandardInput.WriteLine(json);
            proc.StandardInput.Flush();
            responseLine = proc.StandardOutput.ReadLine()
                ?? throw new InvalidOperationException("Python daemon returned null");
        }

        var resp = JsonSerializer.Deserialize<PythonResponse>(responseLine, JsonOptions)
            ?? throw new InvalidOperationException("Failed to parse Python OCR response");
        if (!resp.Ok)
            throw new InvalidOperationException(resp.Error ?? "Python OCR failed");

        return new OcrResponse
        {
            Text = resp.Text ?? "",
            Lines = resp.Lines ?? [],
        };
    }

    /// <summary>
    /// Returns the live daemon process, spawning it and waiting for its ready line when
    /// there is none. Must be called while holding <c>_lock</c>.
    /// </summary>
    /// <exception cref="FileNotFoundException">The daemon script does not exist.</exception>
    /// <exception cref="InvalidOperationException">The daemon exited or sent something other than the ready signal.</exception>
    private Process EnsureRunning()
    {
        if (_proc is { HasExited: false } running)
            return running;

        _proc?.Dispose();
        _proc = null;

        if (!File.Exists(_daemonScript))
            throw new FileNotFoundException($"Python OCR daemon not found at {_daemonScript}", _daemonScript);

        Console.Error.WriteLine($"Spawning Python OCR daemon: {_pythonExe} {_daemonScript}");

        var proc = new Process
        {
            StartInfo = new ProcessStartInfo
            {
                FileName = _pythonExe,
                Arguments = $"\"{_daemonScript}\"",
                UseShellExecute = false,
                RedirectStandardInput = true,
                RedirectStandardOutput = true,
                RedirectStandardError = true,
                CreateNoWindow = true,
            }
        };
        proc.ErrorDataReceived += (_, e) =>
        {
            if (!string.IsNullOrEmpty(e.Data))
                Console.Error.WriteLine($"[python-ocr] {e.Data}");
        };

        try
        {
            proc.Start();
            proc.BeginErrorReadLine();

            // Block until the first stdout line, which must be the ready signal.
            // No timeout is enforced here: a Python process that never writes a line
            // keeps this call waiting.
            var readyLine = proc.StandardOutput.ReadLine()
                ?? throw new InvalidOperationException("Python OCR daemon exited before ready signal");
            var ready = JsonSerializer.Deserialize<PythonResponse>(readyLine, JsonOptions);
            if (ready?.Ready != true)
                throw new InvalidOperationException($"Python OCR daemon did not send ready signal: {readyLine}");
        }
        catch
        {
            // Never cache a process that failed to start or handshake: an unstarted
            // Process throws on HasExited, and a live but unsynchronized one would
            // pair later requests with the wrong response lines.
            Shutdown(proc, graceful: false);
            throw;
        }

        _proc = proc;
        Console.Error.WriteLine("Python OCR daemon ready");
        return proc;
    }

    /// <summary>
    /// Stops the Python process if one is running: closes its stdin, waits briefly for
    /// it to exit, then kills it if it is still alive.
    /// </summary>
    public void Dispose()
    {
        var proc = _proc;
        _proc = null;
        if (proc != null)
            Shutdown(proc, graceful: true);
    }

    /// <summary>
    /// Stops and disposes <paramref name="proc"/>. With <paramref name="graceful"/> set,
    /// stdin is closed first and the process gets <see cref="ExitTimeoutMs"/> to exit on
    /// its own before being killed.
    /// </summary>
    private static void Shutdown(Process proc, bool graceful)
    {
        try
        {
            if (!proc.HasExited)
            {
                if (graceful)
                {
                    proc.StandardInput.Close();
                    proc.WaitForExit(ExitTimeoutMs);
                }
                if (!proc.HasExited) proc.Kill();
            }
        }
        catch
        {
            // Best effort: the process may never have started or may exit concurrently.
        }
        finally
        {
            proc.Dispose();
        }
    }

    /// <summary>Shape of one JSON line written by the Python daemon.</summary>
    private class PythonResponse
    {
        [JsonPropertyName("ok")]
        public bool Ok { get; set; }

        [JsonPropertyName("ready")]
        public bool? Ready { get; set; }

        [JsonPropertyName("text")]
        public string? Text { get; set; }

        [JsonPropertyName("lines")]
        public List<OcrLineResult>? Lines { get; set; }

        [JsonPropertyName("error")]
        public string? Error { get; set; }
    }
}

View file

@ -0,0 +1,60 @@
namespace OcrDaemon;
using System.Drawing;
using System.Drawing.Imaging;
using OpenCvSharp;
using OpenCvSharp.Extensions;
/// <summary>
/// Handles the <c>match-template</c> command: searches a screenshot (or loaded file)
/// for a template image using <see cref="TemplateMatchModes.CCoeffNormed"/>.
/// </summary>
class TemplateMatchHandler
{
    /// <summary>Minimum match score used when the request carries no positive threshold.</summary>
    private const double DefaultThreshold = 0.7;

    /// <summary>
    /// Locates the template at <c>req.Path</c> inside the captured image.
    /// </summary>
    /// <param name="req">
    /// Request; <c>Path</c> is the template file, <c>File</c>/<c>Region</c> select the image to
    /// search, and a positive <c>Threshold</c> is divided by 100 to get the minimum score.
    /// </param>
    /// <returns>
    /// An <see cref="ErrorResponse"/> when the template path is missing or unreadable;
    /// otherwise a <see cref="TemplateMatchResponse"/>. On a match, X/Y are the center of
    /// the best location, offset by the region origin when a region was given.
    /// </returns>
    public object HandleTemplateMatch(Request req)
    {
        if (string.IsNullOrEmpty(req.Path))
            return new ErrorResponse("match-template command requires 'path' (template image file)");
        if (!System.IO.File.Exists(req.Path))
            return new ErrorResponse($"Template file not found: {req.Path}");

        using var screenshot = ScreenCapture.CaptureOrLoad(req.File, req.Region);
        using var screenMat = BitmapConverter.ToMat(screenshot);
        using var template = Cv2.ImRead(req.Path, ImreadModes.Color);
        if (template.Empty())
            return new ErrorResponse($"Failed to load template image: {req.Path}");

        // The template is loaded as 3-channel color, so the search image must be too.
        using var screenBgr = ToBgr(screenMat);

        // Template must fit within screenshot
        if (template.Rows > screenBgr.Rows || template.Cols > screenBgr.Cols)
            return new TemplateMatchResponse { Found = false };

        using var result = new Mat();
        Cv2.MatchTemplate(screenBgr, template, result, TemplateMatchModes.CCoeffNormed);
        Cv2.MinMaxLoc(result, out _, out double maxVal, out _, out OpenCvSharp.Point maxLoc);

        // A NaN/infinite score would slip past the "< threshold" test below and is not
        // representable as a plain JSON number, so report it as "not found".
        if (!double.IsFinite(maxVal))
            return new TemplateMatchResponse { Found = false };

        double threshold = req.Threshold > 0 ? req.Threshold / 100.0 : DefaultThreshold;
        if (maxVal < threshold)
            return new TemplateMatchResponse { Found = false, Confidence = maxVal };

        // Calculate center coordinates — offset by region origin if provided
        int offsetX = req.Region?.X ?? 0;
        int offsetY = req.Region?.Y ?? 0;
        return new TemplateMatchResponse
        {
            Found = true,
            X = offsetX + maxLoc.X + template.Cols / 2,
            Y = offsetY + maxLoc.Y + template.Rows / 2,
            Width = template.Cols,
            Height = template.Rows,
            Confidence = maxVal,
        };
    }

    /// <summary>
    /// Returns a new 3-channel copy of <paramref name="source"/>: 4-channel input drops
    /// alpha, 1-channel input is expanded, anything else is copied as-is.
    /// The caller owns and must dispose the result.
    /// </summary>
    private static Mat ToBgr(Mat source)
    {
        var bgr = new Mat();
        try
        {
            switch (source.Channels())
            {
                case 4:
                    Cv2.CvtColor(source, bgr, ColorConversionCodes.BGRA2BGR);
                    break;
                case 1:
                    Cv2.CvtColor(source, bgr, ColorConversionCodes.GRAY2BGR);
                    break;
                default:
                    source.CopyTo(bgr);
                    break;
            }
            return bgr;
        }
        catch
        {
            bgr.Dispose();
            throw;
        }
    }
}

Binary file not shown.

After

Width:  |  Height:  |  Size: 3.3 MiB

Binary file not shown.

After

Width:  |  Height:  |  Size: 397 KiB

Binary file not shown.

After

Width:  |  Height:  |  Size: 4.6 MiB

Binary file not shown.

After

Width:  |  Height:  |  Size: 157 KiB

Binary file not shown.

157
tools/python-ocr/daemon.py Normal file
View file

@ -0,0 +1,157 @@
"""
Persistent Python OCR daemon (stdin/stdout JSON-per-line protocol).
Supports EasyOCR engine, lazy-loaded on first use.
Managed as a subprocess by the C# OcrDaemon.
Request: {"cmd": "ocr", "engine": "easyocr", "imagePath": "C:\\temp\\screenshot.png"}
         or {"cmd": "ocr", "engine": "easyocr", "imageBase64": "<base64-encoded image>"}
Response: {"ok": true, "text": "...", "lines": [{"text": "...", "words": [...]}]}
"""
import sys
import json
_easyocr_reader = None
def _redirect_stdout_to_stderr():
"""Redirect stdout to stderr so library print() calls don't corrupt the JSON protocol."""
real_stdout = sys.stdout
sys.stdout = sys.stderr
return real_stdout
def _restore_stdout(real_stdout):
sys.stdout = real_stdout
def get_easyocr():
    """Return the module-wide EasyOCR reader, building it on the first call.

    The reader is created for English with ``gpu=True`` and cached in
    ``_easyocr_reader``; later calls return the cached instance.
    """
    global _easyocr_reader
    if _easyocr_reader is not None:
        return _easyocr_reader

    sys.stderr.write("Loading EasyOCR model...\n")
    sys.stderr.flush()
    # EasyOCR prints download progress to stdout, so keep stdout clean while loading.
    saved_stdout = _redirect_stdout_to_stderr()
    try:
        import easyocr
        _easyocr_reader = easyocr.Reader(["en"], gpu=True)
    finally:
        _restore_stdout(saved_stdout)
    sys.stderr.write("EasyOCR model loaded.\n")
    sys.stderr.flush()
    return _easyocr_reader
def bbox_to_rect(corners):
    """Collapse a 4-corner bbox ``[[x1, y1], ..., [x4, y4]]`` to an axis-aligned box.

    Returns a ``(x, y, width, height)`` tuple of ints; each extreme is truncated
    with ``int()`` before the width and height are taken.
    """
    left = int(min(pt[0] for pt in corners))
    top = int(min(pt[1] for pt in corners))
    right = int(max(pt[0] for pt in corners))
    bottom = int(max(pt[1] for pt in corners))
    return left, top, right - left, bottom - top
def split_into_words(text, x, y, width, height):
    """Break one detection into per-word boxes sized by character share.

    Text with at most one whitespace-separated token is returned as a single
    box covering the whole detection. Otherwise each word gets a width of
    ``int(width * len(word) / total_chars)`` (at least 1), laid out left to
    right from ``x`` with no gaps; ``y`` and ``height`` are shared.
    """
    tokens = text.split()
    char_total = sum(map(len, tokens))
    if len(tokens) <= 1 or char_total == 0:
        return [{"text": text.strip(), "x": x, "y": y, "width": width, "height": height}]

    boxes = []
    cursor = x
    for token in tokens:
        token_width = max(1, int(width * len(token) / char_total))
        boxes.append({"text": token, "x": cursor, "y": y, "width": token_width, "height": height})
        cursor += token_width
    return boxes
def run_easyocr(image_path):
    """Open the image at ``image_path`` and return the EasyOCR result dict for it."""
    from PIL import Image
    import numpy as np

    return run_easyocr_array(np.array(Image.open(image_path)))
def run_easyocr_array(img):
    """Run EasyOCR on an image array and shape the result for the JSON protocol.

    Returns ``{"ok": True, "text": ..., "lines": [...]}`` where each line holds
    the stripped detection text and its per-word boxes; detections whose text
    is blank are dropped, and ``text`` joins the kept lines with newlines.
    """
    reader = get_easyocr()

    # easyocr can print warnings during inference; keep them off the protocol stream.
    saved_stdout = _redirect_stdout_to_stderr()
    try:
        # batch_size=32: batch GPU recognition of detected text regions
        detections = reader.readtext(img, batch_size=32)
    finally:
        _restore_stdout(saved_stdout)

    # Each detection is (bbox_4corners, text, conf); conf is not used.
    lines = []
    for bbox, raw_text, _conf in detections:
        cleaned = raw_text.strip()
        if not cleaned:
            continue
        words = split_into_words(raw_text, *bbox_to_rect(bbox))
        lines.append({"text": cleaned, "words": words})

    full_text = "\n".join(entry["text"] for entry in lines)
    return {"ok": True, "text": full_text, "lines": lines}
def load_image(req):
    """Decode the request's image into a numpy array.

    ``imageBase64`` (base64-encoded image bytes) takes precedence over
    ``imagePath`` (a file path). Returns ``None`` when neither is present
    or both are empty.
    """
    from PIL import Image
    import numpy as np

    encoded = req.get("imageBase64")
    if encoded:
        import base64
        import io
        source = io.BytesIO(base64.b64decode(encoded))
    else:
        source = req.get("imagePath")
        if not source:
            return None
    return np.array(Image.open(source))
def handle_request(req):
    """Dispatch one decoded request dict and return the response dict.

    Only ``cmd == "ocr"`` is accepted. The image is loaded before the engine
    is checked, so a request lacking an image reports that first; any engine
    other than ``"easyocr"`` yields an "Unknown engine" error.
    """
    cmd = req.get("cmd")
    if cmd != "ocr":
        return {"ok": False, "error": f"Unknown command: {cmd}"}

    engine = req.get("engine", "")
    img = load_image(req)
    if img is None:
        return {"ok": False, "error": "Missing imagePath or imageBase64"}

    if engine != "easyocr":
        return {"ok": False, "error": f"Unknown engine: {engine}"}
    return run_easyocr_array(img)
def main():
    """Serve the JSON-per-line protocol: one ready line, then one reply per request line.

    Blank input lines are skipped. Any exception raised while decoding or
    handling a request is reported as ``{"ok": false, "error": ...}`` so the
    loop keeps running until stdin is exhausted.
    """

    def emit(payload):
        # One JSON document per line, flushed immediately so the reader is never left waiting.
        sys.stdout.write(json.dumps(payload) + "\n")
        sys.stdout.flush()

    # Signal ready
    emit({"ok": True, "ready": True})

    for raw in sys.stdin:
        raw = raw.strip()
        if not raw:
            continue
        try:
            reply = handle_request(json.loads(raw))
        except Exception as exc:
            reply = {"ok": False, "error": str(exc)}
        emit(reply)


if __name__ == "__main__":
    main()

View file

@ -0,0 +1,3 @@
easyocr
pillow
numpy