tooltip bounds

This commit is contained in:
Boki 2026-02-10 21:21:07 -05:00
parent 930e00c9cc
commit bb2b9cf507
7 changed files with 474 additions and 56 deletions

View file

@ -9,6 +9,13 @@
<ItemGroup>
<PackageReference Include="System.Drawing.Common" Version="8.0.12" />
<PackageReference Include="Tesseract" Version="5.2.0" />
</ItemGroup>
<ItemGroup>
<None Update="tessdata\eng.traineddata">
<CopyToOutputDirectory>PreserveNewest</CopyToOutputDirectory>
</None>
</ItemGroup>
</Project>

View file

@ -4,18 +4,23 @@ using System.Runtime.InteropServices;
using System.Text;
using System.Text.Json;
using System.Text.Json.Serialization;
using Windows.Graphics.Imaging;
using Windows.Media.Ocr;
using Windows.Storage.Streams;
using Tesseract;
using SdImageFormat = System.Drawing.Imaging.ImageFormat;
// Make GDI capture DPI-aware so coordinates match physical pixels
SetProcessDPIAware();
// Pre-create the OCR engine (reused across all requests)
var ocrEngine = OcrEngine.TryCreateFromUserProfileLanguages();
if (ocrEngine == null)
// Pre-create the Tesseract OCR engine (reused across all requests)
var tessdataPath = Path.Combine(AppContext.BaseDirectory, "tessdata");
TesseractEngine tessEngine;
try
{
WriteResponse(new ErrorResponse("Failed to create OCR engine. Ensure a language pack is installed."));
tessEngine = new TesseractEngine(tessdataPath, "eng", EngineMode.LstmOnly);
tessEngine.DefaultPageSegMode = PageSegMode.Auto;
}
catch (Exception ex)
{
WriteResponse(new ErrorResponse($"Failed to create Tesseract engine: {ex.Message}. Ensure tessdata/eng.traineddata exists."));
return 1;
}
@ -49,7 +54,7 @@ while ((line = stdin.ReadLine()) != null)
switch (request.Cmd?.ToLowerInvariant())
{
case "ocr":
HandleOcr(request, ocrEngine);
HandleOcr(request, tessEngine);
break;
case "screenshot":
HandleScreenshot(request);
@ -63,6 +68,12 @@ while ((line = stdin.ReadLine()) != null)
case "detect-grid":
HandleDetectGrid(request);
break;
case "snapshot":
HandleSnapshot(request);
break;
case "diff-ocr":
HandleDiffOcr(request, tessEngine);
break;
default:
WriteResponse(new ErrorResponse($"Unknown command: {request.Cmd}"));
break;
@ -78,31 +89,17 @@ return 0;
// ── Handlers ────────────────────────────────────────────────────────────────
void HandleOcr(Request req, OcrEngine engine)
Bitmap? referenceFrame = null;
void HandleOcr(Request req, TesseractEngine engine)
{
using var bitmap = CaptureOrLoad(req.File, req.Region);
var softwareBitmap = BitmapToSoftwareBitmap(bitmap);
var result = engine.RecognizeAsync(softwareBitmap).AsTask().GetAwaiter().GetResult();
using var pix = BitmapToPix(bitmap);
using var page = engine.Process(pix);
var lines = new List<OcrLineResult>();
foreach (var ocrLine in result.Lines)
{
var words = new List<OcrWordResult>();
foreach (var word in ocrLine.Words)
{
words.Add(new OcrWordResult
{
Text = word.Text,
X = (int)Math.Round(word.BoundingRect.X),
Y = (int)Math.Round(word.BoundingRect.Y),
Width = (int)Math.Round(word.BoundingRect.Width),
Height = (int)Math.Round(word.BoundingRect.Height),
});
}
lines.Add(new OcrLineResult { Text = ocrLine.Text, Words = words });
}
WriteResponse(new OcrResponse { Text = result.Text, Lines = lines });
var text = page.GetText();
var lines = ExtractLinesFromPage(page, offsetX: 0, offsetY: 0);
WriteResponse(new OcrResponse { Text = text, Lines = lines });
}
void HandleScreenshot(Request req)
@ -113,9 +110,15 @@ void HandleScreenshot(Request req)
return;
}
using var bitmap = CaptureOrLoad(req.File, req.Region);
// If a reference frame exists, save that (same image used for diff-ocr).
// Otherwise capture a new frame.
var bitmap = referenceFrame ?? CaptureOrLoad(req.File, req.Region);
var format = GetImageFormat(req.Path);
var dir = System.IO.Path.GetDirectoryName(req.Path);
if (!string.IsNullOrEmpty(dir) && !System.IO.Directory.Exists(dir))
System.IO.Directory.CreateDirectory(dir);
bitmap.Save(req.Path, format);
if (bitmap != referenceFrame) bitmap.Dispose();
WriteResponse(new OkResponse());
}
@ -123,11 +126,253 @@ void HandleCapture(Request req)
{
using var bitmap = CaptureOrLoad(req.File, req.Region);
using var ms = new MemoryStream();
bitmap.Save(ms, ImageFormat.Png);
bitmap.Save(ms, SdImageFormat.Png);
var base64 = Convert.ToBase64String(ms.ToArray());
WriteResponse(new CaptureResponse { Image = base64 });
}
// ── Snapshot / Diff-OCR ─────────────────────────────────────────────────────
// Captures (or loads) a frame and stores it as the reference image used by
// later 'diff-ocr' and 'screenshot' requests, replacing any previous reference.
void HandleSnapshot(Request req)
{
    // Acquire the new frame BEFORE releasing the old one: if CaptureOrLoad throws,
    // referenceFrame must not be left pointing at an already-disposed bitmap.
    var fresh = CaptureOrLoad(req.File, req.Region);
    var previous = referenceFrame;
    referenceFrame = fresh;
    previous?.Dispose();
    WriteResponse(new OkResponse());
}
// Finds the area that became darker between the stored reference snapshot and a
// newly captured frame (a tooltip is treated as a dark overlay), crops the
// current frame to that area and runs OCR on the crop.
//
// Request fields read here:
//   File      - passed to CaptureOrLoad to obtain the current frame
//   Threshold - summed B+G+R darkening a pixel needs to count as changed
//               (values <= 0 fall back to 30)
//   Debug     - enables extra diagnostics on stderr
//   Path      - optional; when set, the raw (un-preprocessed) crop is saved there
//
// Responses: ErrorResponse when no snapshot is stored; an empty OcrResponse when
// nothing changed or the densest changed run is under 50 rows / 50 columns;
// otherwise a DiffOcrResponse whose word boxes are offset by the crop origin.
void HandleDiffOcr(Request req, TesseractEngine engine)
{
    var reference = referenceFrame;
    if (reference == null)
    {
        WriteResponse(new ErrorResponse("No reference snapshot stored. Send 'snapshot' first."));
        return;
    }

    // NOTE(review): the current frame is requested with a null region while
    // 'snapshot' passes req.Region - confirm both frames share the same origin.
    using var current = CaptureOrLoad(req.File, null);
    int w = Math.Min(reference.Width, current.Width);
    int h = Math.Min(reference.Height, current.Height);

    // Get raw pixels for both frames. Each buffer keeps its OWN stride: the two
    // bitmaps may differ in width, so their row pitches are not interchangeable.
    byte[] refPx = CopyArgbPixels(reference, w, h, out int refStride);
    byte[] curPx = CopyArgbPixels(current, w, h, out int curStride);

    // Detect pixels that got DARKER (tooltip = dark overlay).
    // This filters out item highlight glow (brighter) and cursor changes.
    int diffThresh = req.Threshold > 0 ? req.Threshold : 30;
    bool[] changed = new bool[w * h];
    int totalChanged = 0;
    for (int y = 0; y < h; y++)
    {
        int refRow = y * refStride;
        int curRow = y * curStride;
        for (int x = 0; x < w; x++)
        {
            int ri = refRow + x * 4;
            int ci = curRow + x * 4;
            int darkerB = refPx[ri] - curPx[ci];
            int darkerG = refPx[ri + 1] - curPx[ci + 1];
            int darkerR = refPx[ri + 2] - curPx[ci + 2];
            if (darkerB + darkerG + darkerR > diffThresh)
            {
                changed[y * w + x] = true;
                totalChanged++;
            }
        }
    }

    bool debug = req.Debug;
    if (totalChanged == 0)
    {
        if (debug) Console.Error.WriteLine(" diff-ocr: no changes detected");
        WriteResponse(new OcrResponse { Text = "", Lines = [] });
        return;
    }

    // Two-pass density detection:
    //   Pass 1: find the row range using full-width row counts.
    //   Pass 2: find the column range using only pixels within that row range.
    // This makes the column threshold relative to tooltip height, not screen height.
    int maxGap = 15;

    // Pass 1: count changed pixels per row, find the longest active run.
    int[] rowCounts = new int[h];
    for (int y = 0; y < h; y++)
        for (int x = 0; x < w; x++)
            if (changed[y * w + x])
                rowCounts[y]++;

    int rowThresh = w / 30; // ~3% of width
    var (bestRowStart, bestRowEnd, bestRowLen) = FindLongestActiveRun(rowCounts, rowThresh, maxGap);

    // Pass 2: count changed pixels per column, but only within the detected row range.
    int[] colCounts = new int[w];
    for (int y = bestRowStart; y <= bestRowEnd; y++)
        for (int x = 0; x < w; x++)
            if (changed[y * w + x])
                colCounts[x]++;

    int tooltipHeight = bestRowEnd - bestRowStart + 1;
    int colThresh = tooltipHeight / 15; // ~7% of tooltip height
    var (bestColStart, bestColEnd, bestColLen) = FindLongestActiveRun(colCounts, colThresh, maxGap);

    // Log density detection results
    Console.Error.WriteLine($" diff-ocr: changed={totalChanged} rows={bestRowStart}-{bestRowEnd}({bestRowLen}) cols={bestColStart}-{bestColEnd}({bestColLen}) rowThresh={rowThresh} colThresh={colThresh}");

    if (bestRowLen < 50 || bestColLen < 50)
    {
        Console.Error.WriteLine($" diff-ocr: no tooltip-sized region found (rows={bestRowLen}, cols={bestColLen})");
        WriteResponse(new OcrResponse { Text = "", Lines = [] });
        return;
    }

    int pad = 0;
    int minX = Math.Max(bestColStart - pad, 0);
    int minY = Math.Max(bestRowStart - pad, 0);
    int maxX = Math.Min(bestColEnd + pad, w - 1);
    int maxY = Math.Min(bestRowEnd + pad, h - 1);

    // Trim 5px from left/right/bottom to remove tooltip border/shadow artifacts.
    // The Min/Max clamps keep the rectangle at least 1px in each dimension.
    int trim = 5;
    minX = Math.Min(minX + trim, maxX);
    maxX = Math.Max(maxX - trim, minX);
    maxY = Math.Max(maxY - trim, minY);
    int rw = maxX - minX + 1;
    int rh = maxY - minY + 1;

    if (debug) Console.Error.WriteLine($" diff-ocr: tooltip region ({minX},{minY}) {rw}x{rh}");

    // Crop the current frame to the diff bounding box
    using var cropped = current.Clone(new Rectangle(minX, minY, rw, rh), PixelFormat.Format32bppArgb);

    // Save raw tooltip image if path is provided
    if (!string.IsNullOrEmpty(req.Path))
    {
        var dir = System.IO.Path.GetDirectoryName(req.Path);
        if (!string.IsNullOrEmpty(dir) && !System.IO.Directory.Exists(dir))
            System.IO.Directory.CreateDirectory(dir);
        cropped.Save(req.Path, GetImageFormat(req.Path));
        if (debug) Console.Error.WriteLine($" diff-ocr: saved tooltip to {req.Path}");
    }

    // Pre-process for OCR: boost contrast and invert colors. The image is not
    // rescaled, so OCR coordinates only need the crop origin added back.
    using var processed = PreprocessForOcr(cropped);
    using var pix = BitmapToPix(processed);
    using var page = engine.Process(pix);

    var text = page.GetText();
    var lines = ExtractLinesFromPage(page, offsetX: minX, offsetY: minY);

    WriteResponse(new DiffOcrResponse
    {
        Text = text,
        Lines = lines,
        Region = new RegionRect { X = minX, Y = minY, Width = rw, Height = rh },
    });

    // Copies the top-left width x height area of bmp into a managed buffer as
    // 32bpp ARGB and reports the row pitch of that buffer. The bitmap is always
    // unlocked, even if the copy throws.
    static byte[] CopyArgbPixels(Bitmap bmp, int width, int height, out int stride)
    {
        var data = bmp.LockBits(new Rectangle(0, 0, width, height), ImageLockMode.ReadOnly, PixelFormat.Format32bppArgb);
        try
        {
            stride = data.Stride;
            var buffer = new byte[stride * height];
            Marshal.Copy(data.Scan0, buffer, 0, buffer.Length);
            return buffer;
        }
        finally
        {
            bmp.UnlockBits(data);
        }
    }

    // Returns the longest run of indices whose count is >= threshold, where a run
    // tolerates up to maxGap consecutive below-threshold entries. Start/End are the
    // first and last active indices; all three values are 0 when nothing is active.
    static (int Start, int End, int Length) FindLongestActiveRun(int[] counts, int threshold, int maxGap)
    {
        int bestStart = 0, bestEnd = 0, bestLen = 0;
        int runStart = -1, lastActive = -1;

        for (int i = 0; i < counts.Length; i++)
        {
            if (counts[i] >= threshold)
            {
                if (runStart < 0) runStart = i;
                lastActive = i;
            }
            else if (runStart >= 0 && i - lastActive > maxGap)
            {
                Consider(runStart, lastActive);
                runStart = -1;
            }
        }
        // A run still open at the end of the array is a candidate too.
        if (runStart >= 0) Consider(runStart, lastActive);

        return (bestStart, bestEnd, bestLen);

        void Consider(int start, int end)
        {
            int len = end - start + 1;
            if (len > bestLen) { bestStart = start; bestEnd = end; bestLen = len; }
        }
    }
}
/// Pre-process an image for better OCR: boost contrast and invert colors.
/// No upscaling — tooltip text is large enough at native resolution.
/// Returns a new bitmap of the same size that the caller must dispose; src is left untouched.
Bitmap PreprocessForOcr(Bitmap src)
{
    int dw = src.Width, dh = src.Height;
    var result = (Bitmap)src.Clone();
    try
    {
        var data = result.LockBits(new Rectangle(0, 0, dw, dh), ImageLockMode.ReadWrite, PixelFormat.Format32bppArgb);
        try
        {
            byte[] px = new byte[data.Stride * dh];
            Marshal.Copy(data.Scan0, px, 0, px.Length);
            int stride = data.Stride;

            // Histogram of per-pixel brightness, taken as the max of the B, G, R channels.
            int[] histogram = new int[256];
            for (int y = 0; y < dh; y++)
                for (int x = 0; x < dw; x++)
                {
                    int i = y * stride + x * 4;
                    int bright = Math.Max(px[i], Math.Max(px[i + 1], px[i + 2]));
                    histogram[bright]++;
                }

            // Use the 5th and 95th percentile brightness (rather than the absolute
            // min/max) as the stretch endpoints so a few outlier pixels don't dominate.
            int totalPixels = dw * dh;
            int lo = 0, hi = 255;
            int cumLo = 0, cumHi = 0;
            for (int b = 0; b < 256; b++)
            {
                cumLo += histogram[b];
                if (cumLo >= totalPixels * 0.05) { lo = b; break; }
            }
            for (int b = 255; b >= 0; b--)
            {
                cumHi += histogram[b];
                if (cumHi >= totalPixels * 0.05) { hi = b; break; }
            }
            // Guard against a zero-width range (flat image) to avoid dividing by zero.
            if (hi <= lo) hi = lo + 1;
            double scale = 255.0 / (hi - lo);

            // Stretch contrast and invert colors (light text on dark → dark text on light for Tesseract).
            // The alpha byte (offset +3) is left as-is.
            for (int y = 0; y < dh; y++)
                for (int x = 0; x < dw; x++)
                {
                    int i = y * stride + x * 4;
                    px[i] = (byte)(255 - Math.Clamp((int)((px[i] - lo) * scale), 0, 255));
                    px[i + 1] = (byte)(255 - Math.Clamp((int)((px[i + 1] - lo) * scale), 0, 255));
                    px[i + 2] = (byte)(255 - Math.Clamp((int)((px[i + 2] - lo) * scale), 0, 255));
                }

            Marshal.Copy(px, 0, data.Scan0, px.Length);
        }
        finally
        {
            // Always release the lock, even if processing failed part-way.
            result.UnlockBits(data);
        }
        return result;
    }
    catch
    {
        // Ownership never reached the caller, so the clone must be released here.
        result.Dispose();
        throw;
    }
}
// Pre-loaded empty cell templates (loaded lazily on first grid scan)
// Stored as both grayscale (for occupied detection) and ARGB (for item border detection)
byte[]? emptyTemplate70Gray = null;
@ -929,18 +1174,54 @@ Bitmap CaptureScreen(RegionRect? region)
return bitmap;
}
// ── Bitmap → SoftwareBitmap conversion (in-memory) ─────────────────────────
// ── Bitmap → Tesseract Pix conversion (in-memory) ──────────────────────────
SoftwareBitmap BitmapToSoftwareBitmap(Bitmap bitmap)
Pix BitmapToPix(Bitmap bitmap)
{
using var ms = new MemoryStream();
bitmap.Save(ms, ImageFormat.Bmp);
ms.Position = 0;
bitmap.Save(ms, SdImageFormat.Png);
return Pix.LoadFromMemory(ms.ToArray());
}
var stream = ms.AsRandomAccessStream();
var decoder = BitmapDecoder.CreateAsync(stream).AsTask().GetAwaiter().GetResult();
var softwareBitmap = decoder.GetSoftwareBitmapAsync().AsTask().GetAwaiter().GetResult();
return softwareBitmap;
// ── Extract lines/words from Tesseract page result ──────────────────────────
// Walks the Tesseract result iterator and returns every recognized word grouped
// by text line. offsetX/offsetY are added to each word's top-left corner so a
// caller that OCR'd a crop can report coordinates relative to the full frame.
// Blank words and words without a bounding box are skipped; lines left with no
// words are omitted. Line text is the kept words joined by single spaces.
List<OcrLineResult> ExtractLinesFromPage(Page page, int offsetX, int offsetY)
{
    var lines = new List<OcrLineResult>();
    using var iter = page.GetIterator();
    if (iter == null) return lines;

    iter.Begin();
    do // every block on the page
    {
        do // every text line in the current block
        {
            var words = new List<OcrWordResult>();
            do // every word in the current line
            {
                var wordText = iter.GetText(PageIteratorLevel.Word);
                if (!string.IsNullOrWhiteSpace(wordText)
                    && iter.TryGetBoundingBox(PageIteratorLevel.Word, out var bounds))
                {
                    words.Add(new OcrWordResult
                    {
                        Text = wordText.Trim(),
                        X = bounds.X1 + offsetX,
                        Y = bounds.Y1 + offsetY,
                        Width = bounds.Width,
                        Height = bounds.Height,
                    });
                }
            } while (iter.Next(PageIteratorLevel.TextLine, PageIteratorLevel.Word));

            if (words.Count > 0)
            {
                var lineText = string.Join(" ", words.Select(w => w.Text));
                lines.Add(new OcrLineResult { Text = lineText, Words = words });
            }
        } while (iter.Next(PageIteratorLevel.Block, PageIteratorLevel.TextLine));
        // The two-argument Next only advances inside the enclosing element, so it
        // returns false on the last line of a block. An explicit block-level step is
        // required to reach the lines of any following blocks.
    } while (iter.Next(PageIteratorLevel.Block));

    return lines;
}
// ── Response writing ────────────────────────────────────────────────────────
@ -952,14 +1233,14 @@ void WriteResponse(object response)
Console.Out.Flush();
}
ImageFormat GetImageFormat(string path)
SdImageFormat GetImageFormat(string path)
{
var ext = Path.GetExtension(path).ToLowerInvariant();
return ext switch
{
".jpg" or ".jpeg" => ImageFormat.Jpeg,
".bmp" => ImageFormat.Bmp,
_ => ImageFormat.Png,
".jpg" or ".jpeg" => SdImageFormat.Jpeg,
".bmp" => SdImageFormat.Bmp,
_ => SdImageFormat.Png,
};
}
@ -1063,6 +1344,21 @@ class OcrResponse
public List<OcrLineResult> Lines { get; set; } = [];
}
// Success payload for the 'diff-ocr' command: the recognized text, its
// line/word breakdown, and the rectangle that was cropped and OCR'd.
class DiffOcrResponse
{
    // Always true; failures are reported with a different response type.
    [JsonPropertyName("ok")] public bool Ok => true;

    [JsonPropertyName("text")] public string Text { get; set; } = string.Empty;

    [JsonPropertyName("lines")] public List<OcrLineResult> Lines { get; set; } = new();

    // Null unless the handler fills it in.
    [JsonPropertyName("region")] public RegionRect? Region { get; set; }
}
class OcrLineResult
{
[JsonPropertyName("text")]