added easyOCR
This commit is contained in:
parent
37d6678577
commit
9f208b0606
27 changed files with 1780 additions and 112 deletions
|
|
@ -53,6 +53,8 @@ static class Daemon
|
|||
var ocrHandler = new OcrHandler(tessEngine);
|
||||
var gridHandler = new GridHandler();
|
||||
var detectGridHandler = new DetectGridHandler();
|
||||
var templateMatchHandler = new TemplateMatchHandler();
|
||||
var pythonBridge = new PythonOcrBridge();
|
||||
|
||||
// Main loop: read one JSON line, handle, write one JSON line
|
||||
string? line;
|
||||
|
|
@ -72,16 +74,21 @@ static class Daemon
|
|||
|
||||
object response = request.Cmd?.ToLowerInvariant() switch
|
||||
{
|
||||
"ocr" when request.Engine is "easyocr"
|
||||
=> pythonBridge.HandleOcr(request, request.Engine),
|
||||
"ocr" => ocrHandler.HandleOcr(request),
|
||||
"screenshot" => ocrHandler.HandleScreenshot(request),
|
||||
"capture" => ocrHandler.HandleCapture(request),
|
||||
"snapshot" => ocrHandler.HandleSnapshot(request),
|
||||
"diff-ocr" when request.Engine is "easyocr"
|
||||
=> HandleDiffOcrPython(ocrHandler, pythonBridge, request),
|
||||
"diff-ocr" => ocrHandler.HandleDiffOcr(request),
|
||||
"test" => ocrHandler.HandleTest(request),
|
||||
"tune" => ocrHandler.HandleTune(request),
|
||||
"grid" => gridHandler.HandleGrid(request),
|
||||
"detect-grid" => detectGridHandler.HandleDetectGrid(request),
|
||||
_ => new ErrorResponse($"Unknown command: {request.Cmd}"),
|
||||
"detect-grid" => detectGridHandler.HandleDetectGrid(request),
|
||||
"match-template" => templateMatchHandler.HandleTemplateMatch(request),
|
||||
_ => new ErrorResponse($"Unknown command: {request.Cmd}"),
|
||||
};
|
||||
WriteResponse(response);
|
||||
}
|
||||
|
|
@ -91,9 +98,59 @@ static class Daemon
|
|||
}
|
||||
}
|
||||
|
||||
pythonBridge.Dispose();
|
||||
return 0;
|
||||
}
|
||||
|
||||
/// <summary>
/// Diff-OCR variant that delegates text recognition to the Python bridge.
/// Crops the changed region via <c>ocrHandler.DiffCrop</c>, optionally saves the crop to
/// <c>request.Path</c>, sends the crop to the Python engine named by <c>request.Engine</c>,
/// and shifts the returned word coordinates by the crop origin.
/// </summary>
/// <param name="ocrHandler">Handler that owns the reference frame and performs the diff/crop.</param>
/// <param name="pythonBridge">Bridge to the Python OCR subprocess.</param>
/// <param name="request">Incoming request; the caller has already matched <c>Engine</c> to a Python engine.</param>
/// <returns>
/// A <c>DiffOcrResponse</c> with text, lines and region, or an empty <c>OcrResponse</c>
/// when <c>DiffCrop</c> returns null.
/// </returns>
private static object HandleDiffOcrPython(OcrHandler ocrHandler, PythonOcrBridge pythonBridge, Request request)
{
    var sw = System.Diagnostics.Stopwatch.StartNew();
    var p = request.Threshold > 0
        ? new DiffOcrParams { DiffThresh = request.Threshold }
        : new DiffOcrParams();

    var cropResult = ocrHandler.DiffCrop(request, p);
    if (cropResult == null)
        return new OcrResponse { Text = "", Lines = [] };

    var (cropped, refCropped, current, region) = cropResult.Value;

    // All three bitmaps are owned by this method. Using declarations guarantee disposal
    // even when saving the crop or the Python round-trip throws.
    using var _current = current;
    using var _cropped = cropped;
    using var _refCropped = refCropped;
    var diffMs = sw.ElapsedMilliseconds;

    // Save crop to requested path if provided
    if (!string.IsNullOrEmpty(request.Path))
    {
        var dir = Path.GetDirectoryName(request.Path);
        if (!string.IsNullOrEmpty(dir) && !Directory.Exists(dir))
            Directory.CreateDirectory(dir);
        cropped.Save(request.Path, ImageUtils.GetImageFormat(request.Path));
    }

    // Send crop to Python via base64 over pipe (no temp file I/O)
    sw.Restart();
    var ocrResult = pythonBridge.OcrFromBitmap(cropped, request.Engine!);
    var ocrMs = sw.ElapsedMilliseconds;

    Console.Error.WriteLine($"  diff-ocr-python: diff={diffMs}ms ocr={ocrMs}ms total={diffMs + ocrMs}ms crop={region.Width}x{region.Height}");

    // Offset word coordinates to screen space
    foreach (var line in ocrResult.Lines)
    {
        foreach (var word in line.Words)
        {
            word.X += region.X;
            word.Y += region.Y;
        }
    }

    return new DiffOcrResponse
    {
        Text = ocrResult.Text,
        Lines = ocrResult.Lines,
        Region = region,
    };
}
|
||||
|
||||
private static void WriteResponse(object response)
|
||||
{
|
||||
var json = JsonSerializer.Serialize(response, JsonOptions);
|
||||
|
|
|
|||
|
|
@ -69,12 +69,13 @@ class GridHandler
|
|||
templateSum += templateGray[ty * templateW + tx];
|
||||
innerCount++;
|
||||
}
|
||||
double tmplMean = innerCount > 0 ? (double)templateSum / innerCount : 0;
|
||||
|
||||
// Threshold for mean absolute difference — default 6
|
||||
double diffThreshold = req.Threshold > 0 ? req.Threshold : 2;
|
||||
// Threshold for brightness-normalized MAD
|
||||
double diffThreshold = req.Threshold > 0 ? req.Threshold : 5;
|
||||
bool debug = req.Debug;
|
||||
|
||||
if (debug) Console.Error.WriteLine($"Grid: {cols}x{rows}, cellW={cellW:F1}, cellH={cellH:F1}, border={border}, threshold={diffThreshold}");
|
||||
if (debug) Console.Error.WriteLine($"Grid: {cols}x{rows}, cellW={cellW:F1}, cellH={cellH:F1}, border={border}, threshold={diffThreshold}, tmplMean={tmplMean:F1}");
|
||||
|
||||
var cells = new List<List<bool>>();
|
||||
for (int row = 0; row < rows; row++)
|
||||
|
|
@ -88,21 +89,30 @@ class GridHandler
|
|||
int cw = (int)Math.Min(cellW, captureW - cx0);
|
||||
int ch = (int)Math.Min(cellH, bitmap.Height - cy0);
|
||||
|
||||
// Compare inner pixels of cell vs template
|
||||
long diffSum = 0;
|
||||
int compared = 0;
|
||||
int innerW = Math.Min(cw, templateW) - border;
|
||||
int innerH = Math.Min(ch, templateH) - border;
|
||||
|
||||
// First pass: compute cell region mean brightness
|
||||
long cellSum = 0;
|
||||
int compared = 0;
|
||||
for (int py = border; py < innerH; py++)
|
||||
{
|
||||
for (int px = border; px < innerW; px++)
|
||||
{
|
||||
int cellVal = captureGray[(cy0 + py) * captureW + (cx0 + px)];
|
||||
int tmplVal = templateGray[py * templateW + px];
|
||||
diffSum += Math.Abs(cellVal - tmplVal);
|
||||
cellSum += captureGray[(cy0 + py) * captureW + (cx0 + px)];
|
||||
compared++;
|
||||
}
|
||||
}
|
||||
double cellMean = compared > 0 ? (double)cellSum / compared : 0;
|
||||
double offset = cellMean - tmplMean;
|
||||
|
||||
// Second pass: MAD on brightness-normalized values
|
||||
long diffSum = 0;
|
||||
for (int py = border; py < innerH; py++)
|
||||
for (int px = border; px < innerW; px++)
|
||||
{
|
||||
double cellVal = captureGray[(cy0 + py) * captureW + (cx0 + px)];
|
||||
double tmplVal = templateGray[py * templateW + px];
|
||||
diffSum += (long)Math.Abs(cellVal - tmplVal - offset);
|
||||
}
|
||||
double meanDiff = compared > 0 ? (double)diffSum / compared : 0;
|
||||
bool occupied = meanDiff > diffThreshold;
|
||||
rowList.Add(occupied);
|
||||
|
|
|
|||
|
|
@ -39,6 +39,9 @@ class Request
|
|||
|
||||
[JsonPropertyName("targetCol")]
|
||||
public int TargetCol { get; set; } = -1;
|
||||
|
||||
[JsonPropertyName("engine")]
|
||||
public string? Engine { get; set; }
|
||||
}
|
||||
|
||||
class RegionRect
|
||||
|
|
@ -209,6 +212,30 @@ class DetectGridResponse
|
|||
public double CellHeight { get; set; }
|
||||
}
|
||||
|
||||
/// <summary>
/// JSON payload returned by the match-template command.
/// </summary>
class TemplateMatchResponse
{
    /// <summary>Always true for this response type.</summary>
    [JsonPropertyName("ok")]
    public bool Ok { get; } = true;

    /// <summary>Whether a match was reported.</summary>
    [JsonPropertyName("found")]
    public bool Found { get; set; }

    /// <summary>X coordinate of the reported match.</summary>
    [JsonPropertyName("x")]
    public int X { get; set; }

    /// <summary>Y coordinate of the reported match.</summary>
    [JsonPropertyName("y")]
    public int Y { get; set; }

    /// <summary>Width of the matched area.</summary>
    [JsonPropertyName("width")]
    public int Width { get; set; }

    /// <summary>Height of the matched area.</summary>
    [JsonPropertyName("height")]
    public int Height { get; set; }

    /// <summary>Match score reported by the matcher.</summary>
    [JsonPropertyName("confidence")]
    public double Confidence { get; set; }
}
|
||||
|
||||
class DiffOcrParams
|
||||
{
|
||||
[JsonPropertyName("diffThresh")]
|
||||
|
|
|
|||
|
|
@ -3,6 +3,8 @@ namespace OcrDaemon;
|
|||
using System.Drawing;
|
||||
using System.Drawing.Imaging;
|
||||
using System.Runtime.InteropServices;
|
||||
using System.Threading;
|
||||
using System.Threading.Tasks;
|
||||
using System.Text.Json;
|
||||
using OpenCvSharp;
|
||||
using OpenCvSharp.Extensions;
|
||||
|
|
@ -61,17 +63,20 @@ class OcrHandler(TesseractEngine engine)
|
|||
? new DiffOcrParams { DiffThresh = req.Threshold }
|
||||
: new DiffOcrParams());
|
||||
|
||||
public object HandleDiffOcr(Request req, DiffOcrParams p)
|
||||
/// <summary>
|
||||
/// Diff detection + crop only. Returns the raw tooltip crop bitmap and region,
|
||||
/// or null if no tooltip detected. Caller is responsible for disposing the bitmap.
|
||||
/// </summary>
|
||||
public (Bitmap cropped, Bitmap refCropped, Bitmap current, RegionRect region)? DiffCrop(Request req, DiffOcrParams p)
|
||||
{
|
||||
if (_referenceFrame == null)
|
||||
return new ErrorResponse("No reference snapshot stored. Send 'snapshot' first.");
|
||||
return null;
|
||||
|
||||
using var current = ScreenCapture.CaptureOrLoad(req.File, null);
|
||||
var current = ScreenCapture.CaptureOrLoad(req.File, null);
|
||||
|
||||
int w = Math.Min(_referenceFrame.Width, current.Width);
|
||||
int h = Math.Min(_referenceFrame.Height, current.Height);
|
||||
|
||||
// Get raw pixels for both frames
|
||||
var refData = _referenceFrame.LockBits(new Rectangle(0, 0, w, h), ImageLockMode.ReadOnly, PixelFormat.Format32bppArgb);
|
||||
byte[] refPx = new byte[refData.Stride * h];
|
||||
Marshal.Copy(refData.Scan0, refPx, 0, refPx.Length);
|
||||
|
|
@ -83,49 +88,34 @@ class OcrHandler(TesseractEngine engine)
|
|||
Marshal.Copy(curData.Scan0, curPx, 0, curPx.Length);
|
||||
current.UnlockBits(curData);
|
||||
|
||||
// Detect pixels that got DARKER (tooltip = dark overlay).
|
||||
// This filters out item highlight glow (brighter) and cursor changes.
|
||||
int diffThresh = p.DiffThresh;
|
||||
bool[] changed = new bool[w * h];
|
||||
int totalChanged = 0;
|
||||
|
||||
for (int y = 0; y < h; y++)
|
||||
// Pass 1: parallel row diff — compute rowCounts[] directly, no changed[] array
|
||||
int[] rowCounts = new int[h];
|
||||
Parallel.For(0, h, y =>
|
||||
{
|
||||
int count = 0;
|
||||
int rowOffset = y * stride;
|
||||
for (int x = 0; x < w; x++)
|
||||
{
|
||||
int i = y * stride + x * 4;
|
||||
int darkerB = refPx[i] - curPx[i];
|
||||
int darkerG = refPx[i + 1] - curPx[i + 1];
|
||||
int darkerR = refPx[i + 2] - curPx[i + 2];
|
||||
if (darkerB + darkerG + darkerR > diffThresh)
|
||||
{
|
||||
changed[y * w + x] = true;
|
||||
totalChanged++;
|
||||
}
|
||||
int i = rowOffset + x * 4;
|
||||
int darker = (refPx[i] - curPx[i]) + (refPx[i + 1] - curPx[i + 1]) + (refPx[i + 2] - curPx[i + 2]);
|
||||
if (darker > diffThresh)
|
||||
count++;
|
||||
}
|
||||
}
|
||||
rowCounts[y] = count;
|
||||
});
|
||||
|
||||
bool debug = req.Debug;
|
||||
int totalChanged = 0;
|
||||
for (int y = 0; y < h; y++) totalChanged += rowCounts[y];
|
||||
|
||||
if (totalChanged == 0)
|
||||
{
|
||||
if (debug) Console.Error.WriteLine(" diff-ocr: no changes detected");
|
||||
return new OcrResponse { Text = "", Lines = [] };
|
||||
current.Dispose();
|
||||
return null;
|
||||
}
|
||||
|
||||
// Two-pass density detection:
|
||||
// Pass 1: Find row range using full-width row counts
|
||||
// Pass 2: Find column range using only pixels within detected row range
|
||||
// This makes the column threshold relative to tooltip height, not screen height.
|
||||
int maxGap = p.MaxGap;
|
||||
|
||||
// Pass 1: count changed pixels per row, find longest active run
|
||||
int[] rowCounts = new int[h];
|
||||
for (int y = 0; y < h; y++)
|
||||
for (int x = 0; x < w; x++)
|
||||
if (changed[y * w + x])
|
||||
rowCounts[y]++;
|
||||
|
||||
int rowThresh = w / p.RowThreshDiv;
|
||||
int bestRowStart = 0, bestRowEnd = 0, bestRowLen = 0;
|
||||
int curRowStart = -1, lastActiveRow = -1;
|
||||
|
|
@ -149,12 +139,46 @@ class OcrHandler(TesseractEngine engine)
|
|||
if (len > bestRowLen) { bestRowStart = curRowStart; bestRowEnd = lastActiveRow; bestRowLen = len; }
|
||||
}
|
||||
|
||||
// Pass 2: count changed pixels per column, but only within the detected row range
|
||||
// Pass 2: parallel column diff — only within the row range, recompute from raw pixels
|
||||
int[] colCounts = new int[w];
|
||||
for (int y = bestRowStart; y <= bestRowEnd; y++)
|
||||
for (int x = 0; x < w; x++)
|
||||
if (changed[y * w + x])
|
||||
colCounts[x]++;
|
||||
int rowRangeLen = bestRowEnd - bestRowStart + 1;
|
||||
if (rowRangeLen <= 200)
|
||||
{
|
||||
// Small range: serial is faster than Parallel overhead
|
||||
for (int y = bestRowStart; y <= bestRowEnd; y++)
|
||||
{
|
||||
int rowOffset = y * stride;
|
||||
for (int x = 0; x < w; x++)
|
||||
{
|
||||
int i = rowOffset + x * 4;
|
||||
int darker = (refPx[i] - curPx[i]) + (refPx[i + 1] - curPx[i + 1]) + (refPx[i + 2] - curPx[i + 2]);
|
||||
if (darker > diffThresh)
|
||||
colCounts[x]++;
|
||||
}
|
||||
}
|
||||
}
|
||||
else
|
||||
{
|
||||
Parallel.For(bestRowStart, bestRowEnd + 1,
|
||||
() => new int[w],
|
||||
(y, _, localCols) =>
|
||||
{
|
||||
int rowOffset = y * stride;
|
||||
for (int x = 0; x < w; x++)
|
||||
{
|
||||
int i = rowOffset + x * 4;
|
||||
int darker = (refPx[i] - curPx[i]) + (refPx[i + 1] - curPx[i + 1]) + (refPx[i + 2] - curPx[i + 2]);
|
||||
if (darker > diffThresh)
|
||||
localCols[x]++;
|
||||
}
|
||||
return localCols;
|
||||
},
|
||||
localCols =>
|
||||
{
|
||||
for (int x = 0; x < w; x++)
|
||||
Interlocked.Add(ref colCounts[x], localCols[x]);
|
||||
});
|
||||
}
|
||||
|
||||
int tooltipHeight = bestRowEnd - bestRowStart + 1;
|
||||
int colThresh = tooltipHeight / p.ColThreshDiv;
|
||||
|
|
@ -181,13 +205,13 @@ class OcrHandler(TesseractEngine engine)
|
|||
if (len > bestColLen) { bestColStart = curColStart; bestColEnd = lastActiveCol; bestColLen = len; }
|
||||
}
|
||||
|
||||
// Log density detection results
|
||||
Console.Error.WriteLine($" diff-ocr: changed={totalChanged} rows={bestRowStart}-{bestRowEnd}({bestRowLen}) cols={bestColStart}-{bestColEnd}({bestColLen}) rowThresh={rowThresh} colThresh={colThresh}");
|
||||
|
||||
if (bestRowLen < 50 || bestColLen < 50)
|
||||
{
|
||||
Console.Error.WriteLine($" diff-ocr: no tooltip-sized region found (rows={bestRowLen}, cols={bestColLen})");
|
||||
return new OcrResponse { Text = "", Lines = [] };
|
||||
current.Dispose();
|
||||
return null;
|
||||
}
|
||||
|
||||
int minX = bestColStart;
|
||||
|
|
@ -195,13 +219,9 @@ class OcrHandler(TesseractEngine engine)
|
|||
int maxX = Math.Min(bestColEnd, w - 1);
|
||||
int maxY = Math.Min(bestRowEnd, h - 1);
|
||||
|
||||
// Dynamic right-edge trim: if the rightmost columns are much sparser than
|
||||
// the tooltip body, trim them. This handles the ~5% of cases where ambient
|
||||
// noise extends the detected region slightly on the right.
|
||||
int colSpan = maxX - minX + 1;
|
||||
if (colSpan > 100)
|
||||
{
|
||||
// Compute median column density in the middle 50% of the range
|
||||
int q1 = minX + colSpan / 4;
|
||||
int q3 = minX + colSpan * 3 / 4;
|
||||
long midSum = 0;
|
||||
|
|
@ -209,21 +229,38 @@ class OcrHandler(TesseractEngine engine)
|
|||
for (int x = q1; x <= q3; x++) { midSum += colCounts[x]; midCount++; }
|
||||
double avgMidDensity = (double)midSum / midCount;
|
||||
double cutoff = avgMidDensity * p.TrimCutoff;
|
||||
|
||||
// Trim from right while below cutoff
|
||||
while (maxX > minX + 100 && colCounts[maxX] < cutoff)
|
||||
maxX--;
|
||||
}
|
||||
int rw = maxX - minX + 1;
|
||||
int rh = maxY - minY + 1;
|
||||
|
||||
if (debug) Console.Error.WriteLine($" diff-ocr: tooltip region ({minX},{minY}) {rw}x{rh}");
|
||||
var cropped = CropFromBytes(curPx, stride, minX, minY, rw, rh);
|
||||
var refCropped = CropFromBytes(refPx, stride, minX, minY, rw, rh);
|
||||
var region = new RegionRect { X = minX, Y = minY, Width = rw, Height = rh };
|
||||
|
||||
// Crop tooltip region from both current and reference frames
|
||||
using var cropped = current.Clone(new Rectangle(minX, minY, rw, rh), PixelFormat.Format32bppArgb);
|
||||
using var refCropped = _referenceFrame.Clone(new Rectangle(minX, minY, rw, rh), PixelFormat.Format32bppArgb);
|
||||
Console.Error.WriteLine($" diff-ocr: tooltip region ({minX},{minY}) {rw}x{rh}");
|
||||
|
||||
// Save before/after preprocessing images if path is provided
|
||||
return (cropped, refCropped, current, region);
|
||||
}
|
||||
|
||||
public object HandleDiffOcr(Request req, DiffOcrParams p)
|
||||
{
|
||||
if (_referenceFrame == null)
|
||||
return new ErrorResponse("No reference snapshot stored. Send 'snapshot' first.");
|
||||
|
||||
var cropResult = DiffCrop(req, p);
|
||||
if (cropResult == null)
|
||||
return new OcrResponse { Text = "", Lines = [] };
|
||||
|
||||
var (cropped, refCropped, current, region) = cropResult.Value;
|
||||
using var _current = current;
|
||||
using var _cropped = cropped;
|
||||
using var _refCropped = refCropped;
|
||||
bool debug = req.Debug;
|
||||
int minX = region.X, minY = region.Y, rw = region.Width, rh = region.Height;
|
||||
|
||||
// Save raw crop if path is provided
|
||||
if (!string.IsNullOrEmpty(req.Path))
|
||||
{
|
||||
var dir = Path.GetDirectoryName(req.Path);
|
||||
|
|
@ -634,6 +671,24 @@ class OcrHandler(TesseractEngine engine)
|
|||
};
|
||||
}
|
||||
|
||||
/// <summary>
/// Fast crop from raw pixel bytes — avoids slow GDI+ Bitmap.Clone().
/// Copies a <paramref name="cropW"/> x <paramref name="cropH"/> window of 4-byte pixels,
/// starting at (<paramref name="cropX"/>, <paramref name="cropY"/>), out of
/// <paramref name="px"/> into a newly allocated 32bpp ARGB bitmap.
/// </summary>
/// <param name="px">Source pixel buffer, 4 bytes per pixel, rows <paramref name="srcStride"/> bytes apart.</param>
/// <param name="srcStride">Byte distance between consecutive source rows.</param>
/// <param name="cropX">Left edge of the crop window in pixels.</param>
/// <param name="cropY">Top edge of the crop window in pixels.</param>
/// <param name="cropW">Crop width in pixels.</param>
/// <param name="cropH">Crop height in pixels.</param>
/// <returns>A new bitmap owned by the caller.</returns>
private static Bitmap CropFromBytes(byte[] px, int srcStride, int cropX, int cropY, int cropW, int cropH)
{
    var bmp = new Bitmap(cropW, cropH, PixelFormat.Format32bppArgb);
    try
    {
        var data = bmp.LockBits(new Rectangle(0, 0, cropW, cropH), ImageLockMode.WriteOnly, PixelFormat.Format32bppArgb);
        try
        {
            int dstStride = data.Stride;
            int rowBytes = cropW * 4;
            for (int y = 0; y < cropH; y++)
            {
                int srcOffset = (cropY + y) * srcStride + cropX * 4;
                Marshal.Copy(px, srcOffset, data.Scan0 + y * dstStride, rowBytes);
            }
        }
        finally
        {
            // Always release the lock, even if a row copy throws (e.g. window outside the buffer).
            bmp.UnlockBits(data);
        }
        return bmp;
    }
    catch
    {
        // The bitmap never reaches the caller on failure, so release it here.
        bmp.Dispose();
        throw;
    }
}
|
||||
|
||||
private static double LevenshteinSimilarity(string a, string b)
|
||||
{
|
||||
a = a.ToLowerInvariant();
|
||||
|
|
|
|||
193
tools/OcrDaemon/PythonOcrBridge.cs
Normal file
193
tools/OcrDaemon/PythonOcrBridge.cs
Normal file
|
|
@ -0,0 +1,193 @@
|
|||
namespace OcrDaemon;
|
||||
|
||||
using System.Diagnostics;
|
||||
using System.Drawing;
|
||||
using System.Text.Json;
|
||||
using System.Text.Json.Serialization;
|
||||
using SdImageFormat = System.Drawing.Imaging.ImageFormat;
|
||||
|
||||
/// <summary>
/// Manages a persistent Python subprocess that performs OCR (the "easyocr" engine).
/// Lazy-starts on first request; reuses the process for subsequent calls and respawns it
/// when it has exited. Same stdin/stdout JSON-per-line protocol as the C# daemon itself.
/// </summary>
class PythonOcrBridge : IDisposable
{
    private static readonly JsonSerializerOptions JsonOptions = new()
    {
        PropertyNamingPolicy = JsonNamingPolicy.CamelCase,
        DefaultIgnoreCondition = JsonIgnoreCondition.WhenWritingNull,
    };

    /// <summary>Maximum time to wait for the ready line after spawning the subprocess.</summary>
    private static readonly TimeSpan ReadyTimeout = TimeSpan.FromSeconds(30);

    /// <summary>Grace period given to the subprocess to exit after stdin is closed.</summary>
    private const int ShutdownGraceMs = 3000;

    // Only ever holds a process that started successfully and completed the ready handshake.
    private Process? _proc;
    private readonly string _daemonScript;
    private readonly string _pythonExe;
    private readonly object _lock = new();

    /// <summary>
    /// Resolves the daemon script and Python interpreter paths relative to the executable.
    /// Does not start the subprocess.
    /// </summary>
    public PythonOcrBridge()
    {
        // Resolve paths relative to this exe
        var exeDir = AppContext.BaseDirectory;
        // exeDir = tools/OcrDaemon/bin/Release/net8.0-.../
        // Walk up 4 levels to tools/
        var toolsDir = Path.GetFullPath(Path.Combine(exeDir, "..", "..", "..", ".."));
        _daemonScript = Path.GetFullPath(Path.Combine(toolsDir, "python-ocr", "daemon.py"));

        // Use the venv Python if it exists, otherwise fall back to system python
        var venvPython = Path.GetFullPath(Path.Combine(toolsDir, "python-ocr", ".venv", "Scripts", "python.exe"));
        _pythonExe = File.Exists(venvPython) ? venvPython : "python";
    }

    /// <summary>
    /// Run OCR on a screen region using the specified Python engine.
    /// Captures (or loads) the image, saves it to a temp PNG, sends the path to Python and
    /// returns the resulting <c>OcrResponse</c>. The temp file is removed afterwards.
    /// </summary>
    public object HandleOcr(Request req, string engine)
    {
        var tmpPath = Path.Combine(Path.GetTempPath(), $"ocr_{Guid.NewGuid():N}.png");
        try
        {
            using var bitmap = ScreenCapture.CaptureOrLoad(req.File, req.Region);
            bitmap.Save(tmpPath, SdImageFormat.Png);
            return OcrFromFile(tmpPath, engine);
        }
        finally
        {
            // Best effort: a leftover temp file must not mask the OCR result or error.
            try { File.Delete(tmpPath); } catch { /* ignore */ }
        }
    }

    /// <summary>
    /// Run OCR on an already-saved image file via the Python engine.
    /// </summary>
    public OcrResponse OcrFromFile(string imagePath, string engine)
    {
        var pyReq = new { cmd = "ocr", engine, imagePath };
        return SendPythonRequest(pyReq);
    }

    /// <summary>
    /// Run OCR on a bitmap via the Python engine (base64 PNG over pipe, no temp file).
    /// </summary>
    public OcrResponse OcrFromBitmap(Bitmap bitmap, string engine)
    {
        using var ms = new MemoryStream();
        bitmap.Save(ms, SdImageFormat.Png);
        var imageBase64 = Convert.ToBase64String(ms.ToArray());

        var pyReq = new { cmd = "ocr", engine, imageBase64 };
        return SendPythonRequest(pyReq);
    }

    /// <summary>
    /// Serializes one request line to the subprocess and parses the single response line.
    /// Starts (or restarts) the subprocess first when needed.
    /// </summary>
    /// <exception cref="InvalidOperationException">
    /// The subprocess closed its output, the response could not be parsed, or Python reported a failure.
    /// </exception>
    private OcrResponse SendPythonRequest(object pyReq)
    {
        var json = JsonSerializer.Serialize(pyReq, JsonOptions);

        string responseLine;
        lock (_lock)
        {
            // Start-up and the request/response pair share the lock so _proc is never
            // observed half-initialized and responses cannot interleave.
            EnsureRunning();
            var proc = _proc!;

            proc.StandardInput.WriteLine(json);
            proc.StandardInput.Flush();
            responseLine = proc.StandardOutput.ReadLine()
                ?? throw new InvalidOperationException("Python daemon returned null");
        }

        var resp = JsonSerializer.Deserialize<PythonResponse>(responseLine, JsonOptions);
        if (resp == null)
            throw new InvalidOperationException("Failed to parse Python OCR response");
        if (!resp.Ok)
            throw new InvalidOperationException(resp.Error ?? "Python OCR failed");

        return new OcrResponse
        {
            Text = resp.Text ?? "",
            Lines = resp.Lines ?? [],
        };
    }

    /// <summary>
    /// Ensures a live subprocess that has completed the ready handshake is stored in
    /// <c>_proc</c>. A process that fails to start, exits early, sends something other than
    /// the ready line, or stays silent for <see cref="ReadyTimeout"/> is killed and an
    /// exception is thrown, so a later call starts from scratch.
    /// </summary>
    private void EnsureRunning()
    {
        if (_proc is { HasExited: false })
            return;

        _proc?.Dispose();
        _proc = null;

        if (!File.Exists(_daemonScript))
            throw new FileNotFoundException($"Python OCR daemon not found at {_daemonScript}", _daemonScript);

        Console.Error.WriteLine($"Spawning Python OCR daemon: {_pythonExe} {_daemonScript}");

        var proc = new Process
        {
            StartInfo = new ProcessStartInfo
            {
                FileName = _pythonExe,
                Arguments = $"\"{_daemonScript}\"",
                UseShellExecute = false,
                RedirectStandardInput = true,
                RedirectStandardOutput = true,
                RedirectStandardError = true,
                CreateNoWindow = true,
            }
        };

        // Forward the subprocess' stderr to our stderr so its logs stay visible.
        proc.ErrorDataReceived += (_, e) =>
        {
            if (!string.IsNullOrEmpty(e.Data))
                Console.Error.WriteLine($"[python-ocr] {e.Data}");
        };

        try
        {
            proc.Start();
            proc.BeginErrorReadLine();

            // Wait for ready signal, bounded so a hung interpreter cannot block the daemon forever.
            var readyTask = proc.StandardOutput.ReadLineAsync();
            if (!readyTask.Wait(ReadyTimeout))
                throw new TimeoutException($"Python OCR daemon did not send ready signal within {ReadyTimeout.TotalSeconds:F0}s");

            var readyLine = readyTask.Result
                ?? throw new InvalidOperationException("Python OCR daemon exited before ready signal");

            var ready = JsonSerializer.Deserialize<PythonResponse>(readyLine, JsonOptions);
            if (ready?.Ready != true)
                throw new InvalidOperationException($"Python OCR daemon did not send ready signal: {readyLine}");
        }
        catch
        {
            // Never keep a process in an unknown protocol state.
            Shutdown(proc, 0);
            throw;
        }

        _proc = proc;
        Console.Error.WriteLine("Python OCR daemon ready");
    }

    /// <summary>
    /// Closes the subprocess' stdin, waits briefly for it to exit, kills it if it does not,
    /// and releases the process handle.
    /// </summary>
    public void Dispose()
    {
        var proc = _proc;
        _proc = null;
        if (proc != null)
            Shutdown(proc, ShutdownGraceMs);
    }

    /// <summary>
    /// Best-effort shutdown of <paramref name="proc"/>; never throws.
    /// </summary>
    private static void Shutdown(Process proc, int graceMs)
    {
        try
        {
            if (!proc.HasExited)
            {
                // Closing stdin ends the Python read loop.
                proc.StandardInput.Close();
                if (!proc.WaitForExit(graceMs))
                    proc.Kill();
            }
        }
        catch
        {
            // Deliberately ignored: the process may never have started or may already be gone,
            // and shutdown must not throw (it runs from Dispose and from error paths).
        }
        finally
        {
            proc.Dispose();
        }
    }

    /// <summary>Wire format of one line written by the Python daemon.</summary>
    private class PythonResponse
    {
        [JsonPropertyName("ok")]
        public bool Ok { get; set; }

        [JsonPropertyName("ready")]
        public bool? Ready { get; set; }

        [JsonPropertyName("text")]
        public string? Text { get; set; }

        [JsonPropertyName("lines")]
        public List<OcrLineResult>? Lines { get; set; }

        [JsonPropertyName("error")]
        public string? Error { get; set; }
    }
}
|
||||
60
tools/OcrDaemon/TemplateMatchHandler.cs
Normal file
60
tools/OcrDaemon/TemplateMatchHandler.cs
Normal file
|
|
@ -0,0 +1,60 @@
|
|||
namespace OcrDaemon;
|
||||
|
||||
using System.Drawing;
|
||||
using System.Drawing.Imaging;
|
||||
using OpenCvSharp;
|
||||
using OpenCvSharp.Extensions;
|
||||
|
||||
/// <summary>
/// Handles the match-template command: locates a template image inside a screenshot
/// (or loaded file) using OpenCV normalized cross-correlation.
/// </summary>
class TemplateMatchHandler
{
    /// <summary>
    /// Searches for the template at <c>req.Path</c> inside the image obtained from
    /// <c>req.File</c> / <c>req.Region</c>.
    /// </summary>
    /// <param name="req">
    /// Request. <c>Path</c> is the template image (required). <c>Threshold</c>, when positive,
    /// is divided by 100 to form the minimum score; otherwise 0.7 is used.
    /// </param>
    /// <returns>
    /// An <c>ErrorResponse</c> when the template path is missing or unreadable; otherwise a
    /// <c>TemplateMatchResponse</c>. On a hit, X/Y are the center of the best match, offset by
    /// the region origin when a region was given.
    /// </returns>
    public object HandleTemplateMatch(Request req)
    {
        if (string.IsNullOrEmpty(req.Path))
            return new ErrorResponse("match-template command requires 'path' (template image file)");

        if (!System.IO.File.Exists(req.Path))
            return new ErrorResponse($"Template file not found: {req.Path}");

        using var screenshot = ScreenCapture.CaptureOrLoad(req.File, req.Region);
        using var screenMat = BitmapConverter.ToMat(screenshot);
        using var template = Cv2.ImRead(req.Path, ImreadModes.Color);

        if (template.Empty())
            return new ErrorResponse($"Failed to load template image: {req.Path}");

        // The template is always loaded as 3-channel BGR, and MatchTemplate needs both inputs
        // to have the same channel count, so normalize the screenshot to BGR.
        using var screenBgr = new Mat();
        switch (screenMat.Channels())
        {
            case 4:
                Cv2.CvtColor(screenMat, screenBgr, ColorConversionCodes.BGRA2BGR);
                break;
            case 1:
                // Grayscale source (e.g. an 8-bit image loaded from file).
                Cv2.CvtColor(screenMat, screenBgr, ColorConversionCodes.GRAY2BGR);
                break;
            default:
                screenMat.CopyTo(screenBgr);
                break;
        }

        // Template must fit within screenshot
        if (template.Rows > screenBgr.Rows || template.Cols > screenBgr.Cols)
            return new TemplateMatchResponse { Found = false };

        using var result = new Mat();
        Cv2.MatchTemplate(screenBgr, template, result, TemplateMatchModes.CCoeffNormed);

        Cv2.MinMaxLoc(result, out _, out double maxVal, out _, out OpenCvSharp.Point maxLoc);

        double threshold = req.Threshold > 0 ? req.Threshold / 100.0 : 0.7;

        if (maxVal < threshold)
            return new TemplateMatchResponse { Found = false, Confidence = maxVal };

        // Calculate center coordinates — offset by region origin if provided
        int offsetX = req.Region?.X ?? 0;
        int offsetY = req.Region?.Y ?? 0;

        return new TemplateMatchResponse
        {
            Found = true,
            X = offsetX + maxLoc.X + template.Cols / 2,
            Y = offsetY + maxLoc.Y + template.Rows / 2,
            Width = template.Cols,
            Height = template.Rows,
            Confidence = maxVal,
        };
    }
}
|
||||
BIN
tools/OcrDaemon/tessdata/images/vertex1_crop.png
Normal file
BIN
tools/OcrDaemon/tessdata/images/vertex1_crop.png
Normal file
Binary file not shown.
|
After Width: | Height: | Size: 3.3 MiB |
BIN
tools/OcrDaemon/tessdata/images/vertex1_tight.png
Normal file
BIN
tools/OcrDaemon/tessdata/images/vertex1_tight.png
Normal file
Binary file not shown.
|
After Width: | Height: | Size: 397 KiB |
BIN
tools/OcrDaemon/tessdata/images/vertex2_crop.png
Normal file
BIN
tools/OcrDaemon/tessdata/images/vertex2_crop.png
Normal file
Binary file not shown.
|
After Width: | Height: | Size: 4.6 MiB |
BIN
tools/OcrDaemon/tessdata/images/vertex2_tight.png
Normal file
BIN
tools/OcrDaemon/tessdata/images/vertex2_tight.png
Normal file
Binary file not shown.
|
After Width: | Height: | Size: 157 KiB |
BIN
tools/python-ocr/__pycache__/daemon.cpython-313.pyc
Normal file
BIN
tools/python-ocr/__pycache__/daemon.cpython-313.pyc
Normal file
Binary file not shown.
157
tools/python-ocr/daemon.py
Normal file
157
tools/python-ocr/daemon.py
Normal file
|
|
@ -0,0 +1,157 @@
|
|||
"""
|
||||
Persistent Python OCR daemon (stdin/stdout JSON-per-line protocol).
|
||||
|
||||
Supports EasyOCR engine, lazy-loaded on first use.
|
||||
Managed as a subprocess by the C# OcrDaemon.
|
||||
|
||||
Request:  {"cmd": "ocr", "engine": "easyocr", "imagePath": "C:\\temp\\screenshot.png"}
          (alternatively pass "imageBase64": "<base64-encoded image>" instead of "imagePath")
|
||||
Response: {"ok": true, "text": "...", "lines": [{"text": "...", "words": [...]}]}
|
||||
"""
|
||||
|
||||
import sys
|
||||
import json
|
||||
|
||||
_easyocr_reader = None
|
||||
|
||||
|
||||
def _redirect_stdout_to_stderr():
|
||||
"""Redirect stdout to stderr so library print() calls don't corrupt the JSON protocol."""
|
||||
real_stdout = sys.stdout
|
||||
sys.stdout = sys.stderr
|
||||
return real_stdout
|
||||
|
||||
|
||||
def _restore_stdout(real_stdout):
|
||||
sys.stdout = real_stdout
|
||||
|
||||
|
||||
def get_easyocr():
    """Return the shared EasyOCR reader, creating it on first use.

    The reader is cached in the module-level ``_easyocr_reader``. While ``easyocr`` is
    imported and the reader is constructed, stdout is redirected to stderr so download
    progress output does not reach the protocol stream.
    """
    global _easyocr_reader
    if _easyocr_reader is not None:
        return _easyocr_reader

    sys.stderr.write("Loading EasyOCR model...\n")
    sys.stderr.flush()

    saved_stdout = _redirect_stdout_to_stderr()
    try:
        import easyocr
        _easyocr_reader = easyocr.Reader(["en"], gpu=True)
    finally:
        _restore_stdout(saved_stdout)

    sys.stderr.write("EasyOCR model loaded.\n")
    sys.stderr.flush()
    return _easyocr_reader
|
||||
|
||||
|
||||
def bbox_to_rect(corners):
    """Convert a corner-point bbox ``[[x1, y1], [x2, y2], ...]`` to ``(x, y, width, height)``.

    The result is the axis-aligned box around all corners; every edge is truncated to
    ``int`` before the width and height are computed.
    """
    left = int(min(pt[0] for pt in corners))
    top = int(min(pt[1] for pt in corners))
    right = int(max(pt[0] for pt in corners))
    bottom = int(max(pt[1] for pt in corners))
    return left, top, right - left, bottom - top
|
||||
|
||||
|
||||
def split_into_words(text, x, y, width, height):
    """Split one detection into per-word boxes laid out left to right.

    Each word receives a share of ``width`` proportional to its character count
    (truncated, at least 1); all words keep the detection's ``y`` and ``height``.
    Text with zero or one whitespace-separated token yields a single box that covers
    the whole detection and carries the stripped text.
    """
    tokens = text.split()
    if len(tokens) < 2:
        return [{"text": text.strip(), "x": x, "y": y, "width": width, "height": height}]

    # str.split() never yields empty tokens, so the total below is always positive.
    char_total = sum(map(len, tokens))

    boxes = []
    left = x
    for token in tokens:
        share = max(1, int(width * len(token) / char_total))
        boxes.append({"text": token, "x": left, "y": y, "width": share, "height": height})
        left += share
    return boxes
|
||||
|
||||
|
||||
def run_easyocr(image_path):
    """Open the image at ``image_path`` and run EasyOCR on its pixel array."""
    import numpy as np
    from PIL import Image

    pixels = np.array(Image.open(image_path))
    return run_easyocr_array(pixels)
|
||||
|
||||
|
||||
def run_easyocr_array(img):
    """Run EasyOCR on an image array and build the protocol response.

    Returns ``{"ok": True, "text": ..., "lines": [...]}`` where each line holds the
    stripped detection text and its word boxes. Detections whose text is blank are
    skipped; ``text`` joins the remaining line texts with newlines.
    """
    reader = get_easyocr()

    # Keep anything easyocr prints during inference away from the protocol stream.
    saved_stdout = _redirect_stdout_to_stderr()
    try:
        # batch_size=32: batch recognition of the detected text regions
        detections = reader.readtext(img, batch_size=32)
    finally:
        _restore_stdout(saved_stdout)

    # Each detection is (bbox_4corners, text, conf); conf is not used.
    lines = []
    for bbox, raw_text, _conf in detections:
        cleaned = raw_text.strip()
        if not cleaned:
            continue
        x, y, w, h = bbox_to_rect(bbox)
        lines.append({"text": cleaned, "words": split_into_words(raw_text, x, y, w, h)})

    full_text = "\n".join(entry["text"] for entry in lines)
    return {"ok": True, "text": full_text, "lines": lines}
|
||||
|
||||
|
||||
def load_image(req):
    """Load the request's image as a numpy array.

    ``imageBase64`` (a base64-encoded image) takes precedence over ``imagePath`` (a file
    path). Returns ``None`` when neither key holds a truthy value.
    """
    from PIL import Image
    import numpy as np

    encoded = req.get("imageBase64")
    if encoded:
        import base64
        import io
        raw_bytes = base64.b64decode(encoded)
        return np.array(Image.open(io.BytesIO(raw_bytes)))

    path = req.get("imagePath")
    return np.array(Image.open(path)) if path else None
|
||||
|
||||
|
||||
def handle_request(req):
    """Dispatch one decoded request dict and return the response dict.

    Only ``cmd == "ocr"`` is accepted. The image is loaded first, then the engine is
    checked; ``"easyocr"`` is the only engine handled here. Every failure is reported
    as ``{"ok": False, "error": ...}``.
    """
    cmd = req.get("cmd")
    if cmd != "ocr":
        return {"ok": False, "error": f"Unknown command: {cmd}"}

    engine = req.get("engine", "")
    img = load_image(req)
    if img is None:
        return {"ok": False, "error": "Missing imagePath or imageBase64"}

    if engine != "easyocr":
        return {"ok": False, "error": f"Unknown engine: {engine}"}
    return run_easyocr_array(img)
|
||||
|
||||
|
||||
def main():
    """Serve requests: one JSON object per stdin line, one JSON response per stdout line.

    Writes a ready line first. Blank input lines are skipped. Any exception raised while
    decoding or handling a line is reported as ``{"ok": False, "error": ...}`` and the
    loop continues until stdin is exhausted.
    """

    def emit(payload):
        # One response per line, flushed immediately so the reader on the other end sees it.
        sys.stdout.write(json.dumps(payload) + "\n")
        sys.stdout.flush()

    # Signal ready
    emit({"ok": True, "ready": True})

    for raw in sys.stdin:
        line = raw.strip()
        if not line:
            continue
        try:
            resp = handle_request(json.loads(line))
        except Exception as e:
            resp = {"ok": False, "error": str(e)}
        emit(resp)


if __name__ == "__main__":
    main()
|
||||
3
tools/python-ocr/requirements.txt
Normal file
3
tools/python-ocr/requirements.txt
Normal file
|
|
@ -0,0 +1,3 @@
|
|||
easyocr
|
||||
pillow
|
||||
numpy
|
||||
Loading…
Add table
Add a link
Reference in a new issue