// poe2-bot/tools/OcrDaemon/OcrHandler.cs (349 lines, 13 KiB, C#)

namespace OcrDaemon;
using System.Drawing;
using System.Drawing.Imaging;
using System.Runtime.InteropServices;
using Tesseract;
using SdImageFormat = System.Drawing.Imaging.ImageFormat;
class OcrHandler(TesseractEngine engine)
{
private Bitmap? _referenceFrame;
/// <summary>
/// Runs OCR over the image obtained from <c>ScreenCapture.CaptureOrLoad</c> for the
/// request's <c>File</c>/<c>Region</c>, after preprocessing it with the normalized OCR options.
/// </summary>
/// <param name="req">Request carrying the image source and the optional OCR options.</param>
/// <returns>An <c>OcrResponse</c> holding the page text and the extracted lines.</returns>
public object HandleOcr(Request req)
{
    var ocrOptions = NormalizeOptions(req.Ocr);

    // Every intermediate image is disposable; using-declarations release them
    // in reverse order when the method returns.
    using var source = ScreenCapture.CaptureOrLoad(req.File, req.Region);
    using var prepared = ImagePreprocessor.PreprocessForOcr(source, ocrOptions);
    using var pixImage = ImageUtils.BitmapToPix(prepared);
    using var ocrPage = engine.Process(pixImage);

    // No offset is passed for the extracted lines: they stay relative to the processed image.
    return new OcrResponse
    {
        Text = ocrPage.GetText(),
        Lines = ImageUtils.ExtractLinesFromPage(
            ocrPage,
            offsetX: 0,
            offsetY: 0,
            minConfidence: ocrOptions.MinConfidence),
    };
}
/// <summary>
/// Saves an image to <c>req.Path</c>. When a reference frame is stored, that frame is
/// saved (the same image used for diff-ocr); otherwise a new frame is captured or loaded.
/// </summary>
/// <param name="req">Request carrying the target <c>Path</c> and the capture source.</param>
/// <returns>
/// An <c>ErrorResponse</c> when <c>Path</c> is missing, otherwise an <c>OkResponse</c>.
/// </returns>
public object HandleScreenshot(Request req)
{
    if (string.IsNullOrEmpty(req.Path))
        return new ErrorResponse("screenshot command requires 'path'");

    // If a reference frame exists, save that (same image used for diff-ocr).
    // Otherwise capture a new frame, which this method then owns.
    var reference = _referenceFrame;
    var bitmap = reference ?? ScreenCapture.CaptureOrLoad(req.File, req.Region);
    try
    {
        var format = ImageUtils.GetImageFormat(req.Path);
        var dir = Path.GetDirectoryName(req.Path);
        if (!string.IsNullOrEmpty(dir) && !Directory.Exists(dir))
            Directory.CreateDirectory(dir);
        bitmap.Save(req.Path, format);
    }
    finally
    {
        // Dispose only a frame captured here, and do so even when saving throws;
        // the stored reference frame must stay alive for later diff-ocr calls.
        if (!ReferenceEquals(bitmap, reference))
            bitmap.Dispose();
    }

    return new OkResponse();
}
/// <summary>
/// Captures (or loads) an image and returns it PNG-encoded as a Base64 string.
/// </summary>
/// <param name="req">Request carrying the capture source (<c>File</c>/<c>Region</c>).</param>
/// <returns>A <c>CaptureResponse</c> whose <c>Image</c> is the Base64 PNG payload.</returns>
public object HandleCapture(Request req)
{
    using var frame = ScreenCapture.CaptureOrLoad(req.File, req.Region);
    using var pngBuffer = new MemoryStream();

    // Encode into memory so nothing touches the file system.
    frame.Save(pngBuffer, SdImageFormat.Png);

    return new CaptureResponse
    {
        Image = Convert.ToBase64String(pngBuffer.ToArray()),
    };
}
/// <summary>
/// Captures (or loads) a frame and stores it as the reference frame used by
/// <c>HandleDiffOcr</c> and <c>HandleScreenshot</c>, disposing the frame it replaces.
/// </summary>
/// <param name="req">Request carrying the capture source (<c>File</c>/<c>Region</c>).</param>
/// <returns>An <c>OkResponse</c> once the new reference frame is stored.</returns>
public object HandleSnapshot(Request req)
{
    // Capture first: if this throws, the previous reference frame is still intact
    // instead of being left in the field as an already-disposed bitmap.
    var fresh = ScreenCapture.CaptureOrLoad(req.File, req.Region);

    var previous = _referenceFrame;
    _referenceFrame = fresh;
    previous?.Dispose();

    return new OkResponse();
}
/// <summary>
/// Compares the stored reference frame with a newly captured frame, locates the
/// best-scoring changed region, crops it from the new frame and runs OCR on the crop.
/// </summary>
/// <remarks>
/// Steps: per-pixel luminance delta (reference minus current) → adaptive threshold from
/// the mean/std of the positive (darkened) deltas → morphological close (radius 3) and
/// open (radius 1) of the change mask → best connected component → padded crop → OCR.
/// When <c>req.Path</c> is set, the raw crop is saved there and, if preprocessing is
/// enabled, the preprocessed image is saved next to it with a <c>.pre</c> infix.
/// </remarks>
/// <param name="req">
/// Request carrying the capture source, OCR options, optional minimum threshold,
/// debug flag and optional output path.
/// </param>
/// <returns>
/// An <c>ErrorResponse</c> when no reference frame is stored; an empty <c>OcrResponse</c>
/// when no usable change is found; otherwise a <c>DiffOcrResponse</c> with the text, the
/// lines (offset by the crop origin) and the crop rectangle.
/// </returns>
public object HandleDiffOcr(Request req)
{
    var options = NormalizeOptions(req.Ocr);
    if (_referenceFrame == null)
        return new ErrorResponse("No reference snapshot stored. Send 'snapshot' first.");

    using var current = ScreenCapture.CaptureOrLoad(req.File, null);

    // Only the area common to both frames can be compared.
    int w = Math.Min(_referenceFrame.Width, current.Width);
    int h = Math.Min(_referenceFrame.Height, current.Height);

    // Get raw pixels for both frames as tightly packed rows (w * 4 bytes each), so a
    // single index is valid for both buffers even when the frames have different widths
    // and therefore different native strides.
    byte[] refPx = CopyPackedPixels(_referenceFrame, w, h);
    byte[] curPx = CopyPackedPixels(current, w, h);

    bool debug = req.Debug;
    int[] delta = new int[w * h];
    long sum = 0;
    long sumSq = 0;
    int count = 0;
    for (int y = 0; y < h; y++)
    {
        for (int x = 0; x < w; x++)
        {
            int p = y * w + x;
            int i = p * 4;
            int refB = refPx[i];
            int refG = refPx[i + 1];
            int refR = refPx[i + 2];
            int curB = curPx[i];
            int curG = curPx[i + 1];
            int curR = curPx[i + 2];

            // Integer luminance approximation (30% R, 59% G, 11% B).
            int refL = (refR * 30 + refG * 59 + refB * 11) / 100;
            int curL = (curR * 30 + curG * 59 + curB * 11) / 100;

            // Positive delta = the pixel got darker than in the reference frame.
            int d = refL - curL;
            delta[p] = d;

            // Statistics are gathered over darkened pixels only.
            if (d > 0)
            {
                sum += d;
                sumSq += (long)d * d;
                count++;
            }
        }
    }

    if (count == 0)
    {
        if (debug) Console.Error.WriteLine(" diff-ocr: no darkening detected");
        return new OcrResponse { Text = "", Lines = [] };
    }

    double mean = (double)sum / count;
    // Clamp at zero: rounding can push the computed variance slightly negative.
    double variance = Math.Max(0, (double)sumSq / count - mean * mean);
    double std = Math.Sqrt(variance);

    // Darkening threshold is mean + 2 sigma, but never below the minimum (default 20);
    // the brightening threshold is half of it, also floored at the minimum.
    int minThresh = req.Threshold > 0 ? req.Threshold : 20;
    int diffThresh = (int)Math.Round(Math.Max(mean + 2.0 * std, minThresh));
    int brightThresh = Math.Max(minThresh, diffThresh / 2);

    bool[] changed = new bool[w * h];
    int totalChanged = 0;
    for (int i = 0; i < delta.Length; i++)
    {
        int d = delta[i];
        if (d >= diffThresh || d <= -brightThresh)
        {
            changed[i] = true;
            totalChanged++;
        }
    }

    if (totalChanged == 0)
    {
        if (debug) Console.Error.WriteLine(" diff-ocr: no changes detected after threshold");
        return new OcrResponse { Text = "", Lines = [] };
    }

    // Close (dilate then erode, radius 3), then open (erode then dilate, radius 1).
    bool[] closed = ErodeMask(DilateMask(changed, w, h, radius: 3), w, h, radius: 3);
    bool[] cleaned = DilateMask(ErodeMask(closed, w, h, radius: 1), w, h, radius: 1);

    // Components smaller than 0.1% of the compared area are ignored.
    if (!TryFindBestComponent(cleaned, delta, w, h, minArea: (w * h) / 1000, out var compBounds))
    {
        if (debug) Console.Error.WriteLine(" diff-ocr: no tooltip-sized region found");
        return new OcrResponse { Text = "", Lines = [] };
    }

    // Grow the component bounds by a clamped padding (plus extra on the right edge),
    // keeping the result inside the compared area.
    int pad = Math.Clamp(Math.Min(compBounds.Width, compBounds.Height) / 20, 6, 20);
    int extraRight = Math.Clamp(compBounds.Width / 6, 12, 80);
    int minX = Math.Max(compBounds.Left - pad, 0);
    int minY = Math.Max(compBounds.Top - pad, 0);
    int maxX = Math.Min(compBounds.Right - 1 + pad + extraRight, w - 1);
    int maxY = Math.Min(compBounds.Bottom - 1 + pad, h - 1);
    int rw = maxX - minX + 1;
    int rh = maxY - minY + 1;

    if (debug)
        Console.Error.WriteLine($" diff-ocr: changed={totalChanged} thresh={diffThresh} mean={mean:F1} std={std:F1} region=({minX},{minY}) {rw}x{rh}");

    // Simple crop of the region from the current frame (no per-pixel masking).
    using var cropped = current.Clone(new Rectangle(minX, minY, rw, rh), PixelFormat.Format32bppArgb);

    // Save the raw crop if a path is provided.
    if (!string.IsNullOrEmpty(req.Path))
    {
        var dir = Path.GetDirectoryName(req.Path);
        if (!string.IsNullOrEmpty(dir) && !Directory.Exists(dir))
            Directory.CreateDirectory(dir);
        cropped.Save(req.Path, ImageUtils.GetImageFormat(req.Path));
        if (debug) Console.Error.WriteLine($" diff-ocr: saved raw to {req.Path}");
    }

    // Preprocess the crop for OCR according to the normalized options.
    using var processed = ImagePreprocessor.PreprocessForOcr(cropped, options);

    // Save the preprocessed version alongside the raw one ("name.ext" -> "name.pre.ext").
    if (!string.IsNullOrEmpty(req.Path) && options.Preprocess)
    {
        var ext = Path.GetExtension(req.Path);
        var prePath = Path.ChangeExtension(req.Path, ".pre" + ext);
        processed.Save(prePath, ImageUtils.GetImageFormat(prePath));
        if (debug) Console.Error.WriteLine($" diff-ocr: saved preprocessed to {prePath}");
    }

    using var pix = ImageUtils.BitmapToPix(processed);
    using var page = engine.Process(pix);
    var text = page.GetText();

    // The crop origin is passed as the offset for the extracted lines.
    var lines = ImageUtils.ExtractLinesFromPage(page, offsetX: minX, offsetY: minY, minConfidence: options.MinConfidence);

    return new DiffOcrResponse
    {
        Text = text,
        Lines = lines,
        Region = new RegionRect { X = minX, Y = minY, Width = rw, Height = rh },
    };
}

/// <summary>
/// Copies the top-left <paramref name="w"/> x <paramref name="h"/> area of
/// <paramref name="source"/> as 32bpp ARGB data into a tightly packed buffer
/// (exactly <c>w * 4</c> bytes per row, no stride padding).
/// </summary>
private static byte[] CopyPackedPixels(Bitmap source, int w, int h)
{
    int rowBytes = w * 4;
    byte[] packed = new byte[rowBytes * h];

    var data = source.LockBits(new Rectangle(0, 0, w, h), ImageLockMode.ReadOnly, PixelFormat.Format32bppArgb);
    try
    {
        // Copy row by row: the locked stride may exceed w * 4 (wider bitmap) or be
        // negative (bottom-up bitmap), so one bulk copy would not be safe.
        for (int y = 0; y < h; y++)
        {
            Marshal.Copy(IntPtr.Add(data.Scan0, y * data.Stride), packed, y * rowBytes, rowBytes);
        }
    }
    finally
    {
        // Always release the lock, even if the copy fails.
        source.UnlockBits(data);
    }

    return packed;
}
/// <summary>
/// Dilates a row-major boolean mask with a square window of side <c>2 * radius + 1</c>:
/// an output cell is set when any source cell inside its window (clipped to the mask
/// bounds) is set.
/// </summary>
/// <param name="src">Row-major mask of <paramref name="w"/> x <paramref name="h"/> cells.</param>
/// <param name="w">Mask width.</param>
/// <param name="h">Mask height.</param>
/// <param name="radius">Window radius; for values &lt;= 0 the input array itself is returned.</param>
/// <returns>A new dilated mask, or <paramref name="src"/> itself when no dilation is requested.</returns>
private static bool[] DilateMask(bool[] src, int w, int h, int radius)
{
    if (radius <= 0)
    {
        return src;
    }

    var result = new bool[w * h];
    for (int cy = 0; cy < h; cy++)
    {
        int top = Math.Max(cy - radius, 0);
        int bottom = Math.Min(cy + radius, h - 1);
        for (int cx = 0; cx < w; cx++)
        {
            int left = Math.Max(cx - radius, 0);
            int right = Math.Min(cx + radius, w - 1);
            result[cy * w + cx] = WindowHasSetCell(left, right, top, bottom);
        }
    }

    return result;

    // True as soon as one set cell is found inside the inclusive window.
    bool WindowHasSetCell(int left, int right, int top, int bottom)
    {
        for (int wy = top; wy <= bottom; wy++)
        {
            int rowStart = wy * w;
            for (int wx = left; wx <= right; wx++)
            {
                if (src[rowStart + wx])
                {
                    return true;
                }
            }
        }

        return false;
    }
}
/// <summary>
/// Erodes a row-major boolean mask with a square window of side <c>2 * radius + 1</c>:
/// an output cell is set only when every source cell inside its window is set. The
/// window is clipped to the mask bounds, so cells outside the mask never clear a cell.
/// </summary>
/// <param name="src">Row-major mask of <paramref name="w"/> x <paramref name="h"/> cells.</param>
/// <param name="w">Mask width.</param>
/// <param name="h">Mask height.</param>
/// <param name="radius">Window radius; for values &lt;= 0 the input array itself is returned.</param>
/// <returns>A new eroded mask, or <paramref name="src"/> itself when no erosion is requested.</returns>
private static bool[] ErodeMask(bool[] src, int w, int h, int radius)
{
    if (radius <= 0)
    {
        return src;
    }

    var result = new bool[w * h];
    for (int cy = 0; cy < h; cy++)
    {
        int top = Math.Max(cy - radius, 0);
        int bottom = Math.Min(cy + radius, h - 1);
        for (int cx = 0; cx < w; cx++)
        {
            int left = Math.Max(cx - radius, 0);
            int right = Math.Min(cx + radius, w - 1);
            result[cy * w + cx] = WindowFullySet(left, right, top, bottom);
        }
    }

    return result;

    // False as soon as one clear cell is found inside the inclusive window.
    bool WindowFullySet(int left, int right, int top, int bottom)
    {
        for (int wy = top; wy <= bottom; wy++)
        {
            int rowStart = wy * w;
            for (int wx = left; wx <= right; wx++)
            {
                if (!src[rowStart + wx])
                {
                    return false;
                }
            }
        }

        return true;
    }
}
/// <summary>
/// Labels the 8-connected components of <paramref name="mask"/> and picks the one with
/// the highest score, where score = area * (area / bounding-box area) * (average positive
/// delta over the component). Components with fewer than <paramref name="minArea"/> cells
/// are ignored, and ties keep the component met first in row-major scan order.
/// </summary>
/// <param name="mask">Row-major mask of <paramref name="w"/> x <paramref name="h"/> cells.</param>
/// <param name="delta">Per-cell delta values, indexed like <paramref name="mask"/>; only positive values add to the score.</param>
/// <param name="w">Mask width.</param>
/// <param name="h">Mask height.</param>
/// <param name="minArea">Minimum number of cells a component needs to be considered.</param>
/// <param name="bounds">Bounding rectangle of the winning component, or <c>Rectangle.Empty</c> when none qualifies.</param>
/// <returns><c>true</c> when a component with a strictly positive score was found.</returns>
private static bool TryFindBestComponent(bool[] mask, int[] delta, int w, int h, int minArea, out Rectangle bounds)
{
    int cellCount = w * h;
    var seen = new bool[cellCount];
    // Flat FIFO of cell indices; each cell is enqueued at most once, so cellCount suffices.
    var queue = new int[cellCount];

    double topScore = 0;
    Rectangle topBounds = Rectangle.Empty;

    for (int seed = 0; seed < cellCount; seed++)
    {
        if (seen[seed] || !mask[seed])
        {
            continue;
        }

        int read = 0;
        int write = 0;
        queue[write++] = seed;
        seen[seed] = true;

        int left = seed % w;
        int right = left;
        int top = seed / w;
        int bottom = top;
        int area = 0;
        long positiveDeltaSum = 0;

        // Breadth-first flood fill over the 8-neighbourhood.
        while (read < write)
        {
            int cell = queue[read++];
            int cx = cell % w;
            int cy = cell / w;

            area++;
            int cellDelta = delta[cell];
            if (cellDelta > 0)
            {
                positiveDeltaSum += cellDelta;
            }

            left = Math.Min(left, cx);
            right = Math.Max(right, cx);
            top = Math.Min(top, cy);
            bottom = Math.Max(bottom, cy);

            int firstRow = Math.Max(cy - 1, 0);
            int lastRow = Math.Min(cy + 1, h - 1);
            int firstCol = Math.Max(cx - 1, 0);
            int lastCol = Math.Min(cx + 1, w - 1);
            for (int ny = firstRow; ny <= lastRow; ny++)
            {
                for (int nx = firstCol; nx <= lastCol; nx++)
                {
                    int neighbour = ny * w + nx;
                    if (seen[neighbour] || !mask[neighbour])
                    {
                        continue;
                    }

                    seen[neighbour] = true;
                    queue[write++] = neighbour;
                }
            }
        }

        if (area < minArea)
        {
            continue;
        }

        int boxArea = (right - left + 1) * (bottom - top + 1);
        double fillRatio = (double)area / boxArea;
        double averageDelta = (double)positiveDeltaSum / area;
        double score = area * fillRatio * averageDelta;

        // Strict comparison: an equal score never replaces an earlier component.
        if (score > topScore)
        {
            topScore = score;
            topBounds = Rectangle.FromLTRB(left, top, right + 1, bottom + 1);
        }
    }

    // A score of zero means no component had any positive delta.
    if (topScore <= 0)
    {
        bounds = Rectangle.Empty;
        return false;
    }

    bounds = topBounds;
    return true;
}
/// <summary>
/// Returns OCR options that are safe to use: a kernel size that is odd and at least 3,
/// a scale of at least 1 and a non-negative minimum confidence.
/// </summary>
/// <param name="options">Options from the request, or <c>null</c> to start from a new <c>OcrOptions</c>.</param>
/// <returns>
/// The adjusted options. The adjustments are written to the instance obtained from
/// <paramref name="options"/> when it is not <c>null</c>; no copy is made here.
/// </returns>
private static OcrOptions NormalizeOptions(OcrOptions? options)
{
    OcrOptions result = options ?? new OcrOptions();

    if (result.KernelSize < 3)
    {
        // 3 is already odd, so no parity fix-up is needed on this path.
        result.KernelSize = 3;
    }
    else if (result.KernelSize % 2 == 0)
    {
        // Even sizes are bumped to the next odd value.
        result.KernelSize += 1;
    }

    if (result.Scale < 1)
    {
        result.Scale = 1;
    }

    if (result.MinConfidence < 0)
    {
        result.MinConfidence = 0;
    }

    return result;
}
}