// poe2-bot/src/Poe2Trade.Screen/ScreenReader.cs
//
// 407 lines
// 14 KiB
// C#

using System.Drawing;
using System.Drawing.Imaging;
using System.Runtime.InteropServices;
using Poe2Trade.Core;
using OpenCvSharp.Extensions;
using Serilog;
using Region = Poe2Trade.Core.Region;
namespace Poe2Trade.Screen;
public class ScreenReader : IScreenReader
{
// Used by this class for captures, snapshots, screenshots and diff cropping.
private readonly DiffCropHandler _diffCrop = new();
// Handed to the GridReader that is exposed through the Grid property.
private readonly GridHandler _gridHandler = new();
// Performs the lookup behind TemplateMatch().
private readonly TemplateMatchHandler _templateMatch = new();
// NOTE(review): not referenced by any member of this class as shown — confirm it is still needed.
private readonly EdgeCropHandler _edgeCrop = new();
// OCR backend used by every OCR entry point; released in Dispose().
private readonly PythonOcrBridge _pythonBridge = new();
// Set to true once Warmup() has called ScreenCapture.InitDpiAwareness().
private bool _initialized;
/// <summary>Grid reader constructed over this instance's grid handler.</summary>
public GridReader Grid { get; }
/// <summary>
/// Creates the reader and wires <c>Grid</c> to the internal grid handler.
/// </summary>
public ScreenReader() => Grid = new GridReader(_gridHandler);
/// <summary>
/// Performs one-time initialization: calls <c>ScreenCapture.InitDpiAwareness()</c>
/// on the first invocation and does nothing on later ones.
/// </summary>
/// <returns>An already-completed task.</returns>
public Task Warmup()
{
    if (_initialized)
        return Task.CompletedTask;

    ScreenCapture.InitDpiAwareness();
    // Only flagged after the call returns, so a failed init is retried next time.
    _initialized = true;
    return Task.CompletedTask;
}
// -- Capture --
/// <summary>
/// Captures without a region restriction by delegating to the diff-crop handler's <c>HandleCapture()</c>.
/// </summary>
/// <returns>A completed task holding the bytes returned by the handler.</returns>
public Task<byte[]> CaptureScreen() =>
    Task.FromResult(_diffCrop.HandleCapture());
/// <summary>
/// Captures the given region by delegating to the diff-crop handler's <c>HandleCapture(region)</c>.
/// </summary>
/// <param name="region">Region forwarded to the handler.</param>
/// <returns>A completed task holding the bytes returned by the handler.</returns>
public Task<byte[]> CaptureRegion(Region region) =>
    Task.FromResult(_diffCrop.HandleCapture(region));
// -- OCR --
/// <summary>
/// Captures a bitmap (optionally limited to <paramref name="region"/>), optionally preprocesses it,
/// and runs it through the OCR bridge.
/// </summary>
/// <param name="region">Region passed to <c>ScreenCapture.CaptureOrLoad</c>; <c>null</c> is passed through as-is.</param>
/// <param name="preprocess">
/// <c>"tophat"</c> selects <c>ImagePreprocessor.PreprocessForOcr</c>, <c>"clahe"</c> selects
/// <c>ImagePreprocessor.PreprocessClahe</c>; any other value (including <c>null</c>) sends the raw capture to OCR.
/// </param>
/// <returns>A completed task with the OCR bridge's response.</returns>
public Task<OcrResponse> Ocr(Region? region = null, string? preprocess = null)
{
    using var bitmap = ScreenCapture.CaptureOrLoad(null, region);

    switch (preprocess)
    {
        case "tophat":
        {
            using var processed = ImagePreprocessor.PreprocessForOcr(bitmap);
            return Task.FromResult(_pythonBridge.OcrFromBitmap(processed));
        }
        case "clahe":
        {
            using var processed = ImagePreprocessor.PreprocessClahe(bitmap);
            return Task.FromResult(_pythonBridge.OcrFromBitmap(processed));
        }
        default:
            return Task.FromResult(_pythonBridge.OcrFromBitmap(bitmap));
    }
}
/// <summary>
/// Runs OCR with default arguments and looks for <paramref name="searchText"/> in the result,
/// logging whether it was found.
/// </summary>
/// <param name="searchText">Text to look for.</param>
/// <param name="fuzzy">Forwarded to the matcher to enable approximate matching.</param>
/// <returns>The position reported by the matcher, or <c>null</c> when nothing matched.</returns>
public async Task<(int X, int Y)?> FindTextOnScreen(string searchText, bool fuzzy = false)
{
    var ocr = await Ocr();
    var location = FindWordInOcrResult(ocr, searchText, fuzzy);

    if (!location.HasValue)
    {
        Log.Information("Text '{Text}' not found on screen", searchText);
    }
    else
    {
        Log.Information("Found text '{Text}' at ({X},{Y})", searchText, location.Value.X, location.Value.Y);
    }

    return location;
}
/// <summary>
/// Runs OCR with default arguments and returns the response's <c>Text</c>.
/// </summary>
public async Task<string> ReadFullScreen() => (await Ocr()).Text;
/// <summary>
/// Runs OCR on <paramref name="region"/> and looks for <paramref name="searchText"/> (non-fuzzy).
/// </summary>
/// <param name="region">Region to OCR; its <c>X</c>/<c>Y</c> are added to the match position.</param>
/// <param name="searchText">Text to look for.</param>
/// <returns>The match position offset by the region origin, or <c>null</c> when nothing matched.</returns>
public async Task<(int X, int Y)?> FindTextInRegion(Region region, string searchText)
{
    var ocr = await Ocr(region);
    var local = FindWordInOcrResult(ocr, searchText);
    if (!local.HasValue)
        return null;

    var (dx, dy) = local.Value;
    return (region.X + dx, region.Y + dy);
}
/// <summary>
/// Runs OCR on <paramref name="region"/> and returns the response's <c>Text</c>.
/// </summary>
/// <param name="region">Region to OCR.</param>
public async Task<string> ReadRegionText(Region region) => (await Ocr(region)).Text;
/// <summary>
/// Reports whether <see cref="FindTextInRegion"/> finds <paramref name="searchText"/> in <paramref name="region"/>.
/// </summary>
/// <param name="region">Region to OCR.</param>
/// <param name="searchText">Text to look for.</param>
/// <returns><c>true</c> when a position was found; otherwise <c>false</c>.</returns>
public async Task<bool> CheckForText(Region region, string searchText) =>
    (await FindTextInRegion(region, searchText)).HasValue;
// -- Snapshot / Diff OCR --
/// <summary>
/// Asks the diff-crop handler to take a snapshot via <c>HandleSnapshot()</c>.
/// The call is synchronous; the returned task is already completed.
/// </summary>
public Task Snapshot()
{
_diffCrop.HandleSnapshot();
return Task.CompletedTask;
}
/// <summary>
/// Obtains a diff crop from the diff-crop handler, optionally saves the raw crop, preprocesses it,
/// runs OCR on it and returns the result with word coordinates shifted by the crop region's origin.
/// </summary>
/// <param name="savePath">When non-empty, the raw crop is saved here (the directory is created if missing).</param>
/// <param name="region">Forwarded to the handler's <c>DiffCrop</c> call.</param>
/// <returns>
/// A completed task with the OCR text, lines and crop region; an empty response (no region set)
/// when the handler returns <c>null</c>.
/// </returns>
public Task<DiffOcrResponse> DiffOcr(string? savePath = null, Region? region = null)
{
    var parameters = new DiffOcrParams();
    var diff = _diffCrop.DiffCrop(parameters.Crop, region: region);
    if (diff is null)
        return Task.FromResult(new DiffOcrResponse { Text = "", Lines = [] });

    var (changed, baseline, fullFrame, bounds) = diff.Value;
    // Take ownership of all three bitmaps so they are released when this method exits.
    using var fullFrameOwner = fullFrame;
    using var changedOwner = changed;
    using var baselineOwner = baseline;

    // Persist the unprocessed crop when the caller asked for it.
    if (!string.IsNullOrEmpty(savePath))
    {
        var folder = Path.GetDirectoryName(savePath);
        if (!string.IsNullOrEmpty(folder) && !Directory.Exists(folder))
            Directory.CreateDirectory(folder);
        changed.Save(savePath, ImageUtils.GetImageFormat(savePath));
    }

    // Choose the preprocessing path from the OCR settings.
    var settings = parameters.Ocr;
    using var prepared = settings.UseBackgroundSub
        ? ImagePreprocessor.PreprocessWithBackgroundSub(changed, baseline, settings.DimPercentile, settings.TextThresh, 1, settings.SoftThreshold)
        : ImagePreprocessor.PreprocessForOcr(changed, settings.KernelSize, 1);

    var recognized = _pythonBridge.OcrFromBitmap(prepared, settings);

    // Shift every word by the crop origin.
    foreach (var word in recognized.Lines.SelectMany(line => line.Words))
    {
        word.X += bounds.X;
        word.Y += bounds.Y;
    }

    var response = new DiffOcrResponse
    {
        Text = recognized.Text,
        Lines = recognized.Lines,
        Region = bounds,
    };
    return Task.FromResult(response);
}
// -- Template matching --
/// <summary>
/// Delegates to the template-match handler and logs the hit when one is returned.
/// </summary>
/// <param name="templatePath">Template path forwarded to the handler.</param>
/// <param name="region">Optional region forwarded to the handler.</param>
/// <returns>A completed task with the handler's result, which may be <c>null</c>.</returns>
public Task<TemplateMatchResult?> TemplateMatch(string templatePath, Region? region = null)
{
    var match = _templateMatch.Match(templatePath, region);
    if (match is not null)
    {
        Log.Information("Template match found: ({X},{Y}) confidence={Conf:F3}", match.X, match.Y, match.Confidence);
    }

    return Task.FromResult(match);
}
// -- Save --
/// <summary>
/// Forwards <paramref name="path"/> to the diff-crop handler's <c>HandleScreenshot</c> (no region).
/// The call is synchronous; the returned task is already completed.
/// </summary>
/// <param name="path">Destination path passed to the handler.</param>
public Task SaveScreenshot(string path)
{
_diffCrop.HandleScreenshot(path);
return Task.CompletedTask;
}
/// <summary>
/// Forwards <paramref name="path"/> and <paramref name="region"/> to the diff-crop handler's
/// <c>HandleScreenshot</c>. The call is synchronous; the returned task is already completed.
/// </summary>
/// <param name="region">Region passed to the handler.</param>
/// <param name="path">Destination path passed to the handler.</param>
public Task SaveRegion(Region region, string path)
{
_diffCrop.HandleScreenshot(path, region);
return Task.CompletedTask;
}
// -- Nameplate diff OCR --
/// <summary>
/// Returns the bitmap produced by <c>ScreenCapture.CaptureOrLoad(null, null)</c>.
/// </summary>
/// <remarks>
/// Nothing in this class disposes the returned bitmap.
/// NOTE(review): presumably the caller owns it — confirm against callers.
/// </remarks>
public Bitmap CaptureRawBitmap()
{
    return ScreenCapture.CaptureOrLoad(null, null);
}
/// <summary>
/// Finds areas that became brighter between <paramref name="reference"/> and
/// <paramref name="current"/>, then OCRs each such area cropped from <paramref name="current"/>.
/// </summary>
/// <remarks>
/// Only the overlapping top-left area (minimum width and height of the two bitmaps) is compared.
/// A pixel is marked when the summed difference of its three colour bytes (the fourth byte of
/// each 32-bit pixel is ignored) exceeds a fixed threshold. Neither bitmap is disposed here.
/// </remarks>
/// <param name="reference">Earlier frame used as the baseline.</param>
/// <param name="current">Later frame; OCR crops are taken from this bitmap.</param>
/// <returns>
/// A completed task with the concatenated OCR lines, word coordinates offset by each crop's
/// origin; an empty response when no bright cluster is found.
/// </returns>
/// <exception cref="ArgumentNullException">Either bitmap is <c>null</c>.</exception>
public Task<OcrResponse> NameplateDiffOcr(Bitmap reference, Bitmap current)
{
    ArgumentNullException.ThrowIfNull(reference);
    ArgumentNullException.ThrowIfNull(current);

    int w = Math.Min(reference.Width, current.Width);
    int h = Math.Min(reference.Height, current.Height);

    // Each buffer is tightly packed (w * 4 bytes per row), so both can be indexed with the
    // same offsets even when the two bitmaps have different native strides.
    byte[] refPx = CopyArgbRows(reference, w, h);
    byte[] curPx = CopyArgbRows(current, w, h);
    int rowBytes = w * 4;

    // Build a binary mask of pixels that got significantly brighter (nameplates are bright text)
    const int brightThresh = 30;
    bool[] mask = new bool[w * h];
    Parallel.For(0, h, y =>
    {
        int rowOff = y * rowBytes;
        for (int x = 0; x < w; x++)
        {
            int i = rowOff + x * 4;
            int brighter = (curPx[i] - refPx[i]) + (curPx[i + 1] - refPx[i + 1]) + (curPx[i + 2] - refPx[i + 2]);
            if (brighter > brightThresh)
                mask[y * w + x] = true;
        }
    });

    // Collect bounding boxes of bright regions
    var boxes = FindBrightClusters(mask, w, h, minWidth: 40, minHeight: 10, maxGap: 8);
    Log.Information("NameplateDiff: found {Count} bright clusters", boxes.Count);
    if (boxes.Count == 0)
        return Task.FromResult(new OcrResponse { Text = "", Lines = [] });

    // OCR each cluster crop, accumulate results with screen-space coordinates
    var allLines = new List<OcrLine>();
    var allText = new List<string>();
    foreach (var box in boxes)
    {
        // Pad the crop slightly, clamped to the compared area
        const int pad = 4;
        int cx = Math.Max(0, box.X - pad);
        int cy = Math.Max(0, box.Y - pad);
        int cw = Math.Min(w - cx, box.Width + pad * 2);
        int ch = Math.Min(h - cy, box.Height + pad * 2);

        using var crop = current.Clone(new Rectangle(cx, cy, cw, ch), PixelFormat.Format32bppArgb);
        var ocrResult = _pythonBridge.OcrFromBitmap(crop);

        // Offset word coordinates from crop space to screen space
        foreach (var line in ocrResult.Lines)
        {
            foreach (var word in line.Words)
            {
                word.X += cx;
                word.Y += cy;
            }
            allLines.Add(line);
            allText.Add(line.Text);
        }
    }

    return Task.FromResult(new OcrResponse
    {
        Text = string.Join("\n", allText),
        Lines = allLines,
    });
}

/// <summary>
/// Copies the top-left <paramref name="width"/> x <paramref name="height"/> area of
/// <paramref name="bitmap"/> as 32bpp ARGB into a tightly packed array (width * 4 bytes per row).
/// </summary>
private static byte[] CopyArgbRows(Bitmap bitmap, int width, int height)
{
    var data = bitmap.LockBits(new Rectangle(0, 0, width, height), ImageLockMode.ReadOnly, PixelFormat.Format32bppArgb);
    try
    {
        int rowBytes = width * 4;
        var pixels = new byte[rowBytes * height];
        // Copy row by row: the locked stride may be wider than the requested rows, or negative
        // for bottom-up bitmaps, so a single block copy of Stride * height bytes is not safe.
        for (int y = 0; y < height; y++)
            Marshal.Copy(IntPtr.Add(data.Scan0, y * data.Stride), pixels, y * rowBytes, rowBytes);
        return pixels;
    }
    finally
    {
        // Always release the lock, even if allocation or copying throws.
        bitmap.UnlockBits(data);
    }
}
/// <summary>
/// Locates rectangular clusters of set pixels in a row-major boolean mask.
/// </summary>
/// <remarks>
/// Rows with at least 3 set pixels are grouped into vertical bands; inside each band, columns
/// with at least 1 set pixel are grouped into horizontal runs. Inactive stretches of up to
/// <paramref name="maxGap"/> rows/columns are bridged. Every emitted rectangle spans the full
/// height of its band.
/// </remarks>
/// <param name="mask">Row-major mask indexed as <c>y * w + x</c>.</param>
/// <param name="w">Mask width in pixels.</param>
/// <param name="h">Mask height in pixels.</param>
/// <param name="minWidth">Minimum run width for a rectangle to be emitted.</param>
/// <param name="minHeight">Minimum band height for a band to be considered.</param>
/// <param name="maxGap">Largest inactive stretch that does not end a band or run.</param>
/// <returns>Rectangles ordered by band (top to bottom), then by column (left to right).</returns>
private static List<Rectangle> FindBrightClusters(bool[] mask, int w, int h, int minWidth, int minHeight, int maxGap)
{
    const int rowThresh = 3;
    const int colThresh = 1;

    // Groups indices whose count meets the threshold into [Start, End] runs, tolerating
    // inactive stretches of up to gapLimit entries and dropping runs shorter than minLength.
    static List<(int Start, int End)> CollectRuns(int[] counts, int threshold, int gapLimit, int minLength)
    {
        var runs = new List<(int Start, int End)>();
        int runStart = -1;
        int lastHit = -1;

        for (int i = 0; i < counts.Length; i++)
        {
            if (counts[i] >= threshold)
            {
                if (runStart < 0)
                    runStart = i;
                lastHit = i;
                continue;
            }

            if (runStart < 0 || i - lastHit <= gapLimit)
                continue;

            if (lastHit - runStart + 1 >= minLength)
                runs.Add((runStart, lastHit));
            runStart = -1;
        }

        if (runStart >= 0 && lastHit - runStart + 1 >= minLength)
            runs.Add((runStart, lastHit));

        return runs;
    }

    // Number of set pixels per row.
    var perRow = new int[h];
    for (int y = 0; y < h; y++)
    {
        int offset = y * w;
        for (int x = 0; x < w; x++)
        {
            if (mask[offset + x])
                perRow[y]++;
        }
    }

    var clusters = new List<Rectangle>();
    foreach (var (top, bottom) in CollectRuns(perRow, rowThresh, maxGap, minHeight))
    {
        // Number of set pixels per column, restricted to this band.
        var perColumn = new int[w];
        for (int y = top; y <= bottom; y++)
        {
            int offset = y * w;
            for (int x = 0; x < w; x++)
            {
                if (mask[offset + x])
                    perColumn[x]++;
            }
        }

        int bandHeight = bottom - top + 1;
        foreach (var (left, right) in CollectRuns(perColumn, colThresh, maxGap, minWidth))
            clusters.Add(new Rectangle(left, top, right - left + 1, bandHeight));
    }

    return clusters;
}
/// <summary>
/// Disposes the OCR bridge. No other field is disposed here.
/// </summary>
public void Dispose()
{
    _pythonBridge.Dispose();
}
// -- OCR text matching --
/// <summary>
/// Searches an OCR result for <paramref name="needle"/> and returns the center of the match.
/// </summary>
/// <remarks>
/// A needle containing a space is matched against whole lines (case-insensitive substring, then
/// optionally a sliding-window bigram comparison on normalized text) and yields the line's center.
/// Otherwise each word is tested (case-insensitive substring, then optionally bigram similarity)
/// and the word's center is returned. Fuzzy matching is skipped when the needle normalizes to an
/// empty string, since an empty needle would otherwise be "similar" to any empty window or word.
/// </remarks>
/// <param name="result">OCR result whose lines and words are searched in order.</param>
/// <param name="needle">Text to look for.</param>
/// <param name="fuzzy">Enables the bigram-similarity fallback (threshold 0.55).</param>
/// <returns>Center of the first match, or <c>null</c> when nothing matches.</returns>
private static (int X, int Y)? FindWordInOcrResult(OcrResponse result, string needle, bool fuzzy = false)
{
    const double fuzzyThreshold = 0.55;
    // Extra characters allowed in each fuzzy window beyond the needle's own length.
    const int windowSlack = 2;

    var lower = needle.ToLowerInvariant();
    var needleNorm = Normalize(needle);
    var useFuzzy = fuzzy && needleNorm.Length > 0;

    if (lower.Contains(' '))
    {
        foreach (var line in result.Lines)
        {
            if (line.Words.Count == 0) continue;

            if (line.Text.ToLowerInvariant().Contains(lower))
                return LineBoundsCenter(line);

            if (!useFuzzy) continue;

            var lineNorm = Normalize(line.Text);
            var windowLen = needleNorm.Length + windowSlack;
            // Never start a window at or past the end of the line: for needles that normalize
            // to fewer than two characters the unclamped bound ran past lineNorm.Length and
            // the range slice threw ArgumentOutOfRangeException.
            var lastStart = Math.Min(lineNorm.Length - needleNorm.Length + windowSlack, lineNorm.Length - 1);
            for (var i = 0; i <= lastStart; i++)
            {
                var end = Math.Min(i + windowLen, lineNorm.Length);
                if (BigramSimilarity(needleNorm, lineNorm[i..end]) >= fuzzyThreshold)
                    return LineBoundsCenter(line);
            }
        }
        return null;
    }

    foreach (var line in result.Lines)
    {
        foreach (var word in line.Words)
        {
            if (word.Text.ToLowerInvariant().Contains(lower))
                return (word.X + word.Width / 2, word.Y + word.Height / 2);
            if (useFuzzy && BigramSimilarity(needleNorm, Normalize(word.Text)) >= fuzzyThreshold)
                return (word.X + word.Width / 2, word.Y + word.Height / 2);
        }
    }
    return null;
}
/// <summary>
/// Returns the center of the bounding box that encloses every word of <paramref name="line"/>.
/// </summary>
/// <remarks>
/// All four edges are taken as min/max over all words, so the result does not depend on the
/// order of the words or on the first word being the topmost one.
/// </remarks>
/// <param name="line">Line with at least one word; callers must check for empty lines first.</param>
/// <returns>Integer center point of the words' combined bounds.</returns>
private static (int X, int Y) LineBoundsCenter(OcrLine line)
{
    var words = line.Words;
    var left = words.Min(w => w.X);
    var top = words.Min(w => w.Y);
    var right = words.Max(w => w.X + w.Width);
    var bottom = words.Max(w => w.Y + w.Height);
    return ((left + right) / 2, (top + bottom) / 2);
}
/// <summary>
/// Lower-cases <paramref name="s"/> (invariant culture) and keeps only letters and digits.
/// </summary>
/// <param name="s">Text to normalize.</param>
/// <returns>The filtered, lower-cased text; empty when no letter or digit is present.</returns>
private static string Normalize(string s)
{
    var lowered = s.ToLowerInvariant();
    var kept = new char[lowered.Length];
    var count = 0;

    foreach (var ch in lowered)
    {
        if (char.IsLetterOrDigit(ch))
            kept[count++] = ch;
    }

    return new string(kept, 0, count);
}
/// <summary>
/// Computes a bigram-overlap score between two strings: twice the number of shared character
/// bigrams (counted with multiplicity) divided by the total number of bigrams in both strings.
/// </summary>
/// <param name="a">First string.</param>
/// <param name="b">Second string.</param>
/// <returns>
/// A value from 0 to 1. When either string has fewer than two characters, 1 if the strings are
/// equal and 0 otherwise.
/// </returns>
private static double BigramSimilarity(string a, string b)
{
    if (a.Length < 2 || b.Length < 2)
        return a == b ? 1 : 0;

    // How many times each bigram of `a` is still available for matching.
    var available = new Dictionary<(char, char), int>();
    for (var pos = 1; pos < a.Length; pos++)
    {
        var key = (a[pos - 1], a[pos]);
        available.TryGetValue(key, out var seen);
        available[key] = seen + 1;
    }

    // Consume one available bigram for every bigram of `b` that matches.
    var shared = 0;
    for (var pos = 1; pos < b.Length; pos++)
    {
        var key = (b[pos - 1], b[pos]);
        if (!available.TryGetValue(key, out var left) || left <= 0)
            continue;

        available[key] = left - 1;
        shared++;
    }

    var totalBigrams = a.Length - 1 + b.Length - 1;
    return 2.0 * shared / totalBigrams;
}
}