much better bot and ocr

This commit is contained in:
Boki 2026-02-22 14:21:32 -05:00
parent bb8f50116a
commit 6257bcf122
25 changed files with 583 additions and 101 deletions

View file

@ -104,25 +104,6 @@ public sealed class OcrParams
[JsonPropertyName("softThreshold")]
public bool SoftThreshold { get; set; } = false;
// EasyOCR tuning
[JsonPropertyName("mergeGap")]
public int MergeGap { get; set; } = 0;
[JsonPropertyName("linkThreshold")]
public double? LinkThreshold { get; set; }
[JsonPropertyName("textThreshold")]
public double? TextThreshold { get; set; }
[JsonPropertyName("lowText")]
public double? LowText { get; set; }
[JsonPropertyName("widthThs")]
public double? WidthThs { get; set; }
[JsonPropertyName("paragraph")]
public bool? Paragraph { get; set; }
}
public sealed class DiffOcrParams

View file

@ -0,0 +1,9 @@
using System.Drawing;
namespace Poe2Trade.Screen;
/// <summary>
/// Common abstraction over an OCR backend. Implementations turn a bitmap into an
/// <see cref="OcrResponse"/> and are disposable so they can release whatever they hold.
/// </summary>
public interface IOcrEngine : IDisposable
{
/// <summary>Short identifier of the backend (for example "EasyOCR", "OneOCR" or "WinOCR").</summary>
string Name { get; }
/// <summary>Runs text recognition on <paramref name="bitmap"/> and returns the recognized text.</summary>
/// <param name="bitmap">Image to recognize. NOTE(review): callers appear to keep ownership of the bitmap — confirm implementations never dispose it.</param>
/// <returns>The recognition result produced by the backend.</returns>
OcrResponse Recognize(Bitmap bitmap);
}

View file

@ -0,0 +1,35 @@
using System.Drawing;
namespace Poe2Trade.Screen.Ocr;
/// <summary>
/// <see cref="IOcrEngine"/> backed by the Python EasyOCR daemon, reached through a
/// <see cref="PythonOcrBridge"/>. The EasyOCR-only tuning values are properties of this
/// class instead of living in the shared OcrParams.
/// </summary>
public sealed class EasyOcrEngine : IOcrEngine
{
    // Bridge to the Python process; owned by this engine and released in Dispose.
    private readonly PythonOcrBridge _bridge = new();

    /// <summary>Identifier of this backend.</summary>
    public string Name
    {
        get { return "EasyOCR"; }
    }

    // EasyOCR-specific tuning (formerly in OcrParams). Each value is handed to the
    // bridge unchanged on every Recognize call.

    /// <summary>Forwarded as the bridge's <c>mergeGap</c> argument.</summary>
    public int MergeGap { get; set; }

    /// <summary>Forwarded as the bridge's <c>linkThreshold</c> argument; may be null.</summary>
    public double? LinkThreshold { get; set; }

    /// <summary>Forwarded as the bridge's <c>textThreshold</c> argument; may be null.</summary>
    public double? TextThreshold { get; set; }

    /// <summary>Forwarded as the bridge's <c>lowText</c> argument; may be null.</summary>
    public double? LowText { get; set; }

    /// <summary>Forwarded as the bridge's <c>widthThs</c> argument; may be null.</summary>
    public double? WidthThs { get; set; }

    /// <summary>Forwarded as the bridge's <c>paragraph</c> argument; may be null.</summary>
    public bool? Paragraph { get; set; }

    /// <summary>
    /// Sends <paramref name="bitmap"/> to the EasyOCR daemon together with the current
    /// tuning values and returns the daemon's response.
    /// </summary>
    public OcrResponse Recognize(Bitmap bitmap) =>
        _bridge.OcrFromBitmap(
            bitmap,
            mergeGap: MergeGap,
            linkThreshold: LinkThreshold,
            textThreshold: TextThreshold,
            lowText: LowText,
            widthThs: WidthThs,
            paragraph: Paragraph);

    /// <summary>Disposes the underlying Python bridge.</summary>
    public void Dispose()
    {
        _bridge.Dispose();
    }
}

View file

@ -0,0 +1,18 @@
using Serilog;
namespace Poe2Trade.Screen.Ocr;
/// <summary>
/// Creates <see cref="IOcrEngine"/> instances from a configured engine name.
/// </summary>
public static class OcrEngineFactory
{
    /// <summary>
    /// Builds the engine selected by <paramref name="engineName"/>.
    /// "OneOCR" and "EasyOCR" are matched exactly (case-sensitive); every other value,
    /// including null, yields a <see cref="WinOcrEngine"/>.
    /// </summary>
    /// <param name="engineName">Name of the requested OCR backend.</param>
    /// <returns>A newly constructed engine; the caller is responsible for disposing it.</returns>
    public static IOcrEngine Create(string engineName)
    {
        Log.Information("Creating OCR engine: {Engine}", engineName);

        if (engineName == "OneOCR")
        {
            // Model directory is resolved against the current working directory.
            var oneOcrDir = Path.GetFullPath(Path.Combine("tools", "oneocr"));
            return new OneOcrEngine(oneOcrDir);
        }

        if (engineName == "EasyOCR")
        {
            return new EasyOcrEngine();
        }

        // Anything unrecognized falls back to the Windows built-in engine.
        return new WinOcrEngine();
    }
}

View file

@ -0,0 +1,268 @@
using System.Drawing;
using System.Drawing.Imaging;
using System.Runtime.InteropServices;
using Serilog;
namespace Poe2Trade.Screen.Ocr;
/// <summary>
/// OCR engine using OneOCR (Windows 11 Snipping Tool's built-in engine).
/// Requires oneocr.dll, oneocr.onemodel, and onnxruntime.dll in the model directory.
/// </summary>
/// <remarks>
/// The native library is loaded and registered with the P/Invoke resolver once per process;
/// later instances reuse the library loaded by the first one, even if they are given a
/// different model directory.
/// </remarks>
public sealed class OneOcrEngine : IOcrEngine
{
    /// <summary>Identifier of this backend.</summary>
    public string Name => "OneOCR";

    // Native handles (int64) — created once, reused per call. 0 means "not created" or "released".
    private long _pipeline;
    private long _initOptions;
    private long _processOptions;

    // Key passed to CreateOcrPipeline together with the model path.
    private static readonly byte[] ModelKey = "kj)TGtrK>f]b[Piow.gU+nC@s\"\"\"\"\"\"4"u8.ToArray();

    // Serializes the one-time native library load and resolver registration.
    private static readonly object ResolverGate = new();
    private static bool _resolverInstalled;

    /// <summary>
    /// Validates the model directory, loads oneocr.dll from it and creates the native
    /// init options, pipeline and process options.
    /// </summary>
    /// <param name="modelDir">Directory containing oneocr.dll, oneocr.onemodel and onnxruntime.dll.</param>
    /// <exception cref="DirectoryNotFoundException"><paramref name="modelDir"/> does not exist.</exception>
    /// <exception cref="FileNotFoundException">The model file or oneocr.dll is missing.</exception>
    /// <exception cref="InvalidOperationException">A native initialization call returned a non-zero code.</exception>
    public OneOcrEngine(string modelDir)
    {
        if (!Directory.Exists(modelDir))
            throw new DirectoryNotFoundException($"OneOCR model directory not found: {modelDir}");

        var modelPath = Path.Combine(modelDir, "oneocr.onemodel");
        if (!File.Exists(modelPath))
            throw new FileNotFoundException($"OneOCR model not found: {modelPath}");

        var dllPath = Path.Combine(modelDir, "oneocr.dll");
        if (!File.Exists(dllPath))
            throw new FileNotFoundException($"oneocr.dll not found: {dllPath}");

        // Set DLL search directory so oneocr.dll can find onnxruntime.dll
        if (!SetDllDirectoryW(modelDir))
            Log.Warning("OneOCR: SetDllDirectoryW failed (error {Error})", Marshal.GetLastWin32Error());

        // Load the DLL explicitly from modelDir (first instance only).
        EnsureNativeLibraryLoaded(dllPath);

        try
        {
            // Init options
            CheckResult(Native.CreateOcrInitOptions(out _initOptions), "CreateOcrInitOptions");
            CheckResult(Native.OcrInitOptionsSetUseModelDelayLoad(_initOptions, 0), "SetUseModelDelayLoad");

            // Pipeline (pass full model path as byte string)
            CheckResult(Native.CreateOcrPipeline(modelPath, ModelKey, _initOptions, out _pipeline), "CreateOcrPipeline");

            // Process options
            CheckResult(Native.CreateOcrProcessOptions(out _processOptions), "CreateOcrProcessOptions");
            CheckResult(Native.OcrProcessOptionsSetMaxRecognitionLineCount(_processOptions, 1000), "SetMaxLineCount");
        }
        catch
        {
            // The caller never receives this instance when the constructor throws, so the
            // handles created so far must be released here or they would leak.
            Dispose();
            throw;
        }

        Log.Information("OneOcrEngine initialized (modelDir: {Dir})", modelDir);
    }

    /// <summary>
    /// Loads oneocr.dll and registers the DllImport resolver exactly once.
    /// NativeLibrary.SetDllImportResolver accepts only one resolver per assembly and throws
    /// InvalidOperationException on a second registration, so constructing a second engine
    /// must not register again.
    /// </summary>
    private static void EnsureNativeLibraryLoaded(string dllPath)
    {
        lock (ResolverGate)
        {
            if (_resolverInstalled) return;

            var hDll = NativeLibrary.Load(dllPath);
            NativeLibrary.SetDllImportResolver(typeof(OneOcrEngine).Assembly, (name, _, _) =>
                name == Dll ? hDll : IntPtr.Zero);
            _resolverInstalled = true;
        }
    }

    /// <summary>
    /// Runs the OneOCR pipeline on <paramref name="bitmap"/>.
    /// </summary>
    /// <param name="bitmap">Image to recognize; it is locked read-only as 32bpp ARGB for the duration of the call.</param>
    /// <returns>
    /// The recognized lines and words, or an empty response when the native pipeline reports an error.
    /// </returns>
    /// <exception cref="ObjectDisposedException">The engine has already been disposed.</exception>
    public OcrResponse Recognize(Bitmap bitmap)
    {
        // A released pipeline handle must never reach native code.
        if (_pipeline == 0)
            throw new ObjectDisposedException(nameof(OneOcrEngine));

        // Convert bitmap to BGRA pixel data
        var rect = new Rectangle(0, 0, bitmap.Width, bitmap.Height);
        var bmpData = bitmap.LockBits(rect, ImageLockMode.ReadOnly, PixelFormat.Format32bppArgb);
        try
        {
            var imageStruct = new ImageStructure
            {
                Type = 3, // CV_8UC4 / BGRA
                Width = bitmap.Width,
                Height = bitmap.Height,
                Reserved = 0,
                Step = bmpData.Stride,
                Data = bmpData.Scan0,
            };

            long rc = Native.RunOcrPipeline(_pipeline, ref imageStruct, _processOptions, out long result);
            if (rc != 0)
            {
                Log.Warning("OneOCR: RunOcrPipeline failed (code {Code})", rc);
                return new OcrResponse { Text = "", Lines = [] };
            }

            try
            {
                return ParseResult(result);
            }
            finally
            {
                // The result handle is only valid until released; ParseResult copies everything out.
                Native.ReleaseOcrResult(result);
            }
        }
        finally
        {
            bitmap.UnlockBits(bmpData);
        }
    }

    /// <summary>
    /// Copies lines, words and word bounding boxes out of a native result handle.
    /// Lines or words whose native accessors fail, and words with empty text, are skipped.
    /// </summary>
    private static OcrResponse ParseResult(long result)
    {
        if (Native.GetOcrLineCount(result, out long lineCount) != 0)
            return new OcrResponse { Text = "", Lines = [] };

        var lines = new List<OcrLine>();
        for (long i = 0; i < lineCount; i++)
        {
            if (Native.GetOcrLine(result, i, out long line) != 0 || line == 0) continue;
            if (Native.GetOcrLineWordCount(line, out long wordCount) != 0) continue;

            var words = new List<OcrWord>();
            for (long j = 0; j < wordCount; j++)
            {
                if (Native.GetOcrWord(line, j, out long word) != 0 || word == 0) continue;
                if (Native.GetOcrWordContent(word, out IntPtr contentPtr) != 0) continue;

                var text = Marshal.PtrToStringUTF8(contentPtr);
                if (string.IsNullOrEmpty(text)) continue;

                // BoundingBox: 4 corners as floats → axis-aligned rect.
                // When the box is unavailable the word keeps an all-zero rectangle.
                int x = 0, y = 0, w = 0, h = 0;
                if (Native.GetOcrWordBoundingBox(word, out IntPtr bboxPtr) == 0 && bboxPtr != IntPtr.Zero)
                {
                    var bbox = Marshal.PtrToStructure<BoundingBox>(bboxPtr);
                    int x1 = (int)MathF.Min(MathF.Min(bbox.X1, bbox.X2), MathF.Min(bbox.X3, bbox.X4));
                    int y1 = (int)MathF.Min(MathF.Min(bbox.Y1, bbox.Y2), MathF.Min(bbox.Y3, bbox.Y4));
                    int x2 = (int)MathF.Max(MathF.Max(bbox.X1, bbox.X2), MathF.Max(bbox.X3, bbox.X4));
                    int y2 = (int)MathF.Max(MathF.Max(bbox.Y1, bbox.Y2), MathF.Max(bbox.Y3, bbox.Y4));
                    x = x1; y = y1; w = x2 - x1; h = y2 - y1;
                }

                words.Add(new OcrWord { Text = text, X = x, Y = y, Width = w, Height = h });
            }

            if (words.Count > 0)
            {
                lines.Add(new OcrLine
                {
                    Text = string.Join(" ", words.Select(wd => wd.Text)),
                    Words = words,
                });
            }
        }

        var fullText = string.Join("\n", lines.Select(l => l.Text));
        return new OcrResponse { Text = fullText, Lines = lines };
    }

    /// <summary>
    /// Releases the native process options, pipeline and init options. Safe to call more
    /// than once: each handle is zeroed after release and zero handles are skipped.
    /// </summary>
    public void Dispose()
    {
        if (_processOptions != 0) { Native.ReleaseOcrProcessOptions(_processOptions); _processOptions = 0; }
        if (_pipeline != 0) { Native.ReleaseOcrPipeline(_pipeline); _pipeline = 0; }
        if (_initOptions != 0) { Native.ReleaseOcrInitOptions(_initOptions); _initOptions = 0; }
    }

    /// <summary>Throws <see cref="InvalidOperationException"/> when a native call returned a non-zero code.</summary>
    private static void CheckResult(long rc, string func)
    {
        if (rc != 0)
            throw new InvalidOperationException($"OneOCR {func} failed (code {rc})");
    }

    // -- Native structs --

    // Matches C struct: { int32 t, int32 col, int32 row, int32 _unk, int64 step, int64 data_ptr } = 0x20 bytes
    [StructLayout(LayoutKind.Sequential)]
    private struct ImageStructure
    {
        public int Type; // 3 = CV_8UC4 (BGRA)
        public int Width;
        public int Height;
        public int Reserved;
        public long Step; // stride in bytes per row
        public IntPtr Data; // pointer to BGRA pixel data
    }

    // Four corner points of a word box, read from the pointer returned by GetOcrWordBoundingBox.
    [StructLayout(LayoutKind.Sequential)]
    private struct BoundingBox
    {
        public float X1, Y1;
        public float X2, Y2;
        public float X3, Y3;
        public float X4, Y4;
    }

    // -- P/Invoke --

    private const string Dll = "oneocr.dll";

    [DllImport("kernel32.dll", CharSet = CharSet.Unicode, SetLastError = true)]
    [return: MarshalAs(UnmanagedType.Bool)]
    private static extern bool SetDllDirectoryW(string lpPathName);

    // All OneOCR functions return int64 error code (0 = success) and use out-pointer params for handles.
    private static class Native
    {
        [DllImport(Dll, CallingConvention = CallingConvention.Cdecl)]
        public static extern long CreateOcrInitOptions(out long options);

        [DllImport(Dll, CallingConvention = CallingConvention.Cdecl)]
        public static extern long OcrInitOptionsSetUseModelDelayLoad(long options, byte flag);

        [DllImport(Dll, CallingConvention = CallingConvention.Cdecl)]
        public static extern long CreateOcrPipeline(
            [MarshalAs(UnmanagedType.LPUTF8Str)] string modelPath,
            byte[] key,
            long initOptions,
            out long pipeline);

        [DllImport(Dll, CallingConvention = CallingConvention.Cdecl)]
        public static extern long CreateOcrProcessOptions(out long options);

        [DllImport(Dll, CallingConvention = CallingConvention.Cdecl)]
        public static extern long OcrProcessOptionsSetMaxRecognitionLineCount(long options, long maxLines);

        [DllImport(Dll, CallingConvention = CallingConvention.Cdecl)]
        public static extern long RunOcrPipeline(long pipeline, ref ImageStructure image, long processOptions, out long result);

        [DllImport(Dll, CallingConvention = CallingConvention.Cdecl)]
        public static extern long GetImageAngle(long result, out float angle);

        [DllImport(Dll, CallingConvention = CallingConvention.Cdecl)]
        public static extern long GetOcrLineCount(long result, out long count);

        [DllImport(Dll, CallingConvention = CallingConvention.Cdecl)]
        public static extern long GetOcrLine(long result, long index, out long line);

        [DllImport(Dll, CallingConvention = CallingConvention.Cdecl)]
        public static extern long GetOcrLineContent(long line, out IntPtr content);

        [DllImport(Dll, CallingConvention = CallingConvention.Cdecl)]
        public static extern long GetOcrLineBoundingBox(long line, out IntPtr bbox);

        [DllImport(Dll, CallingConvention = CallingConvention.Cdecl)]
        public static extern long GetOcrLineWordCount(long line, out long count);

        [DllImport(Dll, CallingConvention = CallingConvention.Cdecl)]
        public static extern long GetOcrWord(long line, long index, out long word);

        [DllImport(Dll, CallingConvention = CallingConvention.Cdecl)]
        public static extern long GetOcrWordContent(long word, out IntPtr content);

        [DllImport(Dll, CallingConvention = CallingConvention.Cdecl)]
        public static extern long GetOcrWordBoundingBox(long word, out IntPtr bbox);

        [DllImport(Dll, CallingConvention = CallingConvention.Cdecl)]
        public static extern long GetOcrWordConfidence(long word, out float confidence);

        [DllImport(Dll, CallingConvention = CallingConvention.Cdecl)]
        public static extern void ReleaseOcrResult(long result);

        [DllImport(Dll, CallingConvention = CallingConvention.Cdecl)]
        public static extern void ReleaseOcrPipeline(long pipeline);

        [DllImport(Dll, CallingConvention = CallingConvention.Cdecl)]
        public static extern void ReleaseOcrInitOptions(long options);

        [DllImport(Dll, CallingConvention = CallingConvention.Cdecl)]
        public static extern void ReleaseOcrProcessOptions(long options);
    }
}

View file

@ -0,0 +1,67 @@
using System.Drawing;
using System.Drawing.Imaging;
using Serilog;
using Windows.Graphics.Imaging;
using Windows.Media.Ocr;
using Windows.Storage.Streams;
using BitmapDecoder = Windows.Graphics.Imaging.BitmapDecoder;
using SdImageFormat = System.Drawing.Imaging.ImageFormat;
namespace Poe2Trade.Screen.Ocr;
/// <summary>
/// <see cref="IOcrEngine"/> backed by the Windows built-in OCR API
/// (<c>Windows.Media.Ocr.OcrEngine</c>), using the user's profile languages.
/// </summary>
public sealed class WinOcrEngine : IOcrEngine
{
    private readonly OcrEngine _engine;

    /// <summary>Identifier of this backend.</summary>
    public string Name => "WinOCR";

    /// <summary>Creates the Windows OCR engine from the user profile languages.</summary>
    /// <exception cref="InvalidOperationException">No Windows OCR engine could be created.</exception>
    public WinOcrEngine()
    {
        _engine = OcrEngine.TryCreateFromUserProfileLanguages()
            ?? throw new InvalidOperationException("Windows OCR engine not available");
        Log.Information("WinOcrEngine initialized (language: {Lang})", _engine.RecognizerLanguage.DisplayName);
    }

    /// <summary>
    /// Recognizes text in <paramref name="bitmap"/> by re-encoding it as PNG, decoding it into
    /// a WinRT <c>SoftwareBitmap</c> and running the Windows OCR engine on it. The WinRT async
    /// calls are awaited synchronously because the interface is synchronous.
    /// </summary>
    /// <param name="bitmap">Image to recognize; it is only read, never disposed.</param>
    /// <returns>Lines and words with their bounding rectangles truncated to integers.</returns>
    public OcrResponse Recognize(Bitmap bitmap)
    {
        // Convert System.Drawing.Bitmap → PNG stream → WinRT SoftwareBitmap
        using var ms = new MemoryStream();
        bitmap.Save(ms, SdImageFormat.Png);
        ms.Position = 0;

        // Both the WinRT stream wrapper and the decoded SoftwareBitmap are IDisposable;
        // dispose them deterministically instead of waiting for finalization, since this
        // method can run for every captured frame.
        using var stream = ms.AsRandomAccessStream();
        var decoder = BitmapDecoder.CreateAsync(stream).AsTask().GetAwaiter().GetResult();
        using var softwareBitmap = decoder.GetSoftwareBitmapAsync().AsTask().GetAwaiter().GetResult();
        var ocrResult = _engine.RecognizeAsync(softwareBitmap).AsTask().GetAwaiter().GetResult();

        // Everything needed is copied into managed objects before the usings above run.
        var lines = new List<OcrLine>();
        foreach (var winLine in ocrResult.Lines)
        {
            var words = new List<OcrWord>();
            foreach (var winWord in winLine.Words)
            {
                var r = winWord.BoundingRect;
                words.Add(new OcrWord
                {
                    Text = winWord.Text,
                    X = (int)r.X,
                    Y = (int)r.Y,
                    Width = (int)r.Width,
                    Height = (int)r.Height,
                });
            }

            lines.Add(new OcrLine
            {
                Text = winLine.Text,
                Words = words,
            });
        }

        var fullText = string.Join("\n", lines.Select(l => l.Text));
        return new OcrResponse { Text = fullText, Lines = lines };
    }

    /// <summary>Nothing to release: the engine holds no disposable state of its own.</summary>
    public void Dispose() { }
}

View file

@ -37,7 +37,14 @@ class PythonOcrBridge : IDisposable
/// <summary>
/// Run OCR on a bitmap via the Python EasyOCR engine (base64 PNG over pipe).
/// </summary>
public OcrResponse OcrFromBitmap(Bitmap bitmap, OcrParams? ocrParams = null)
public OcrResponse OcrFromBitmap(
Bitmap bitmap,
int mergeGap = 0,
double? linkThreshold = null,
double? textThreshold = null,
double? lowText = null,
double? widthThs = null,
bool? paragraph = null)
{
EnsureRunning();
@ -45,26 +52,18 @@ class PythonOcrBridge : IDisposable
bitmap.Save(ms, SdImageFormat.Png);
var imageBase64 = Convert.ToBase64String(ms.ToArray());
var pyReq = BuildPythonRequest(ocrParams);
var pyReq = new Dictionary<string, object?> { ["cmd"] = "ocr", ["engine"] = "easyocr" };
if (mergeGap > 0) pyReq["mergeGap"] = mergeGap;
if (linkThreshold.HasValue) pyReq["linkThreshold"] = linkThreshold.Value;
if (textThreshold.HasValue) pyReq["textThreshold"] = textThreshold.Value;
if (lowText.HasValue) pyReq["lowText"] = lowText.Value;
if (widthThs.HasValue) pyReq["widthThs"] = widthThs.Value;
if (paragraph.HasValue) pyReq["paragraph"] = paragraph.Value;
pyReq["imageBase64"] = imageBase64;
return SendPythonRequest(pyReq);
}
private static Dictionary<string, object?> BuildPythonRequest(OcrParams? ocrParams)
{
var req = new Dictionary<string, object?> { ["cmd"] = "ocr", ["engine"] = "easyocr" };
if (ocrParams == null) return req;
if (ocrParams.MergeGap > 0) req["mergeGap"] = ocrParams.MergeGap;
if (ocrParams.LinkThreshold.HasValue) req["linkThreshold"] = ocrParams.LinkThreshold.Value;
if (ocrParams.TextThreshold.HasValue) req["textThreshold"] = ocrParams.TextThreshold.Value;
if (ocrParams.LowText.HasValue) req["lowText"] = ocrParams.LowText.Value;
if (ocrParams.WidthThs.HasValue) req["widthThs"] = ocrParams.WidthThs.Value;
if (ocrParams.Paragraph.HasValue) req["paragraph"] = ocrParams.Paragraph.Value;
return req;
}
private OcrResponse SendPythonRequest(object pyReq)
{
var json = JsonSerializer.Serialize(pyReq, JsonOptions);

View file

@ -16,13 +16,14 @@ public class ScreenReader : IScreenReader
private readonly GridHandler _gridHandler = new();
private readonly TemplateMatchHandler _templateMatch = new();
private readonly EdgeCropHandler _edgeCrop = new();
private readonly PythonOcrBridge _pythonBridge = new();
private readonly IOcrEngine _ocrEngine;
private bool _initialized;
public GridReader Grid { get; }
public ScreenReader()
public ScreenReader(IOcrEngine ocrEngine)
{
_ocrEngine = ocrEngine;
Grid = new GridReader(_gridHandler);
}
@ -59,16 +60,16 @@ public class ScreenReader : IScreenReader
if (preprocess == "tophat")
{
using var processed = ImagePreprocessor.PreprocessForOcr(bitmap);
result = _pythonBridge.OcrFromBitmap(processed);
result = _ocrEngine.Recognize(processed);
}
else if (preprocess == "clahe")
{
using var processed = ImagePreprocessor.PreprocessClahe(bitmap);
result = _pythonBridge.OcrFromBitmap(processed);
result = _ocrEngine.Recognize(processed);
}
else
{
result = _pythonBridge.OcrFromBitmap(bitmap);
result = _ocrEngine.Recognize(bitmap);
}
var allText = string.Join(" | ", result.Lines.Select(l => l.Text));
@ -149,7 +150,7 @@ public class ScreenReader : IScreenReader
? ImagePreprocessor.PreprocessWithBackgroundSub(cropped, refCropped, ocr.DimPercentile, ocr.TextThresh, 1, ocr.SoftThreshold)
: ImagePreprocessor.PreprocessForOcr(cropped, ocr.KernelSize, 1);
var ocrResult = _pythonBridge.OcrFromBitmap(processedBmp, ocr);
var ocrResult = _ocrEngine.Recognize(processedBmp);
// Offset coordinates to screen space
foreach (var line in ocrResult.Lines)
@ -299,7 +300,7 @@ public class ScreenReader : IScreenReader
var ocrSw2 = System.Diagnostics.Stopwatch.StartNew();
OcrResponse ocrResult2;
try { ocrResult2 = _pythonBridge.OcrFromBitmap(crop); }
try { ocrResult2 = _ocrEngine.Recognize(crop); }
catch (TimeoutException)
{
Log.Warning("NameplateDiffOcr: crop OCR timed out");
@ -386,7 +387,7 @@ public class ScreenReader : IScreenReader
OcrResponse ocrResult;
try
{
ocrResult = _pythonBridge.OcrFromBitmap(stitched);
ocrResult = _ocrEngine.Recognize(stitched);
}
catch (TimeoutException)
{
@ -975,7 +976,7 @@ public class ScreenReader : IScreenReader
return keep;
}
public void Dispose() => _pythonBridge.Dispose();
public void Dispose() => _ocrEngine.Dispose();
// -- OCR text matching --