using System.Runtime.InteropServices;
using Microsoft.ML.OnnxRuntime;
using Microsoft.ML.OnnxRuntime.Tensors;
using OpenCvSharp;
using OpenCvSharp.Dnn;
using Serilog;
namespace Automata.Screen;
/// <summary>
/// YOLO11 object detection via ONNX Runtime, running on the CPU execution provider
/// (deliberately — the GPU is left free for the game and DXGI screen capture).
/// Handles letterbox preprocessing, inference, and NMS postprocessing.
/// Buffers are pooled to avoid LOH allocations that trigger Gen2 GC pauses.
/// </summary>
public class OnnxYoloDetector : IDisposable
{
    private readonly InferenceSession _session;
    private readonly string[] _classNames;     // class names indexed by model class id
    private readonly int _imgSize;             // square model input side (e.g. 640), read from model metadata
    private readonly float _confThreshold;     // minimum per-class confidence to keep a detection
    private readonly float _iouThreshold;      // IoU threshold for NMS suppression
    private readonly string _inputName;
    private bool _warmedUp;

    // Pooled buffers — allocated once, reused every inference (avoids LOH/GC pressure)
    private readonly float[] _tensorBuffer;    // 3 * imgSize * imgSize (~1.2MB for 640)
    private float[]? _outputBuffer;            // rowSize * numDetections, sized on first use

    // Pre-allocated Mats for preprocessing (reused every inference — avoids alloc/GC per frame)
    private readonly Mat _resized = new();
    private readonly Mat _padded;
    private readonly Mat _rgb = new();
    private readonly Mat _floatMat = new();

    /// <summary>
    /// Loads the ONNX model and configures a CPU-only inference session.
    /// </summary>
    /// <param name="modelPath">Path to the YOLO .onnx model file.</param>
    /// <param name="classNames">Class names indexed by the model's class ids.</param>
    /// <param name="confThreshold">Minimum confidence for a detection to be kept.</param>
    /// <param name="iouThreshold">IoU threshold used by NMS to suppress overlapping boxes.</param>
    public OnnxYoloDetector(string modelPath, string[] classNames,
        float confThreshold = 0.40f, float iouThreshold = 0.45f)
    {
        _classNames = classNames;
        _confThreshold = confThreshold;
        _iouThreshold = iouThreshold;

        var opts = new SessionOptions();
        opts.GraphOptimizationLevel = GraphOptimizationLevel.ORT_ENABLE_ALL;
        opts.InterOpNumThreads = 1; // single model, no inter-op parallelism needed
        // Use half the cores (leave room for game + pipeline); clamp to >= 1 so a
        // single-core host doesn't pass 0, which ORT interprets as "use all cores".
        opts.IntraOpNumThreads = Math.Max(1, Environment.ProcessorCount / 2);
        opts.ExecutionMode = ExecutionMode.ORT_SEQUENTIAL; // sequential is faster for single inference

        // CPU EP — avoids GPU contention with DXGI screen capture
        Log.Information("OnnxYolo: using CPU EP, intra threads={Threads}", opts.IntraOpNumThreads);
        _session = new InferenceSession(modelPath, opts);
        _inputName = _session.InputNames[0];

        // Read imgSize from the model's input shape (NCHW: [1, 3, H, W])
        var inputMeta = _session.InputMetadata[_inputName];
        _imgSize = inputMeta.Dimensions[2]; // H == W for square YOLO input
        _tensorBuffer = new float[3 * _imgSize * _imgSize];
        _padded = new Mat(_imgSize, _imgSize, MatType.CV_8UC3, new Scalar(114, 114, 114));

        Log.Information("OnnxYolo: loaded {Path} (input: {Input}, imgSize: {ImgSize})",
            modelPath, _inputName, _imgSize);
    }

    /// <summary>
    /// Runs detection on a BGR Mat. Returns detected bosses in original image
    /// coordinates, plus per-stage timings in milliseconds.
    /// </summary>
    /// <param name="bgrMat">Input frame in 8-bit BGR format.</param>
    public (List<DetectedBoss> Detections, float TotalMs, float PreMs, float InfMs) Detect(Mat bgrMat)
    {
        var swTotal = System.Diagnostics.Stopwatch.StartNew();

        // 1. Letterbox preprocess (reuses _tensorBuffer)
        var swPre = System.Diagnostics.Stopwatch.StartNew();
        var (tensor, scale, padX, padY) = Preprocess(bgrMat);
        swPre.Stop();

        // 2. Run inference
        var swInf = System.Diagnostics.Stopwatch.StartNew();
        var inputs = new List<NamedOnnxValue>
        {
            NamedOnnxValue.CreateFromTensor(_inputName, tensor)
        };
        using var results = _session.Run(inputs);
        swInf.Stop();

        // 3. Parse output (reuses _outputBuffer)
        var outputTensor = results.First().AsTensor<float>();
        var detections = Postprocess(outputTensor, scale, padX, padY, bgrMat.Width, bgrMat.Height);

        swTotal.Stop();
        var totalMs = (float)swTotal.Elapsed.TotalMilliseconds;

        // First call includes one-time session/graph initialization cost — log it once.
        if (!_warmedUp)
        {
            _warmedUp = true;
            Log.Information("OnnxYolo warmup: pre={Pre:F0}ms inf={Inf:F0}ms total={Total:F0}ms",
                swPre.Elapsed.TotalMilliseconds, swInf.Elapsed.TotalMilliseconds, totalMs);
        }

        return (detections, totalMs, (float)swPre.Elapsed.TotalMilliseconds, (float)swInf.Elapsed.TotalMilliseconds);
    }

    /// <summary>
    /// Letterbox-resizes the BGR frame into the model's square input, converts to
    /// normalized RGB NCHW floats in the pooled <see cref="_tensorBuffer"/>.
    /// </summary>
    /// <returns>The input tensor (wrapping the pooled buffer — valid until the next call),
    /// the resize scale, and the letterbox padding offsets.</returns>
    private (DenseTensor<float> tensor, float scale, int padX, int padY) Preprocess(Mat bgrMat)
    {
        int origW = bgrMat.Width, origH = bgrMat.Height;
        float scale = Math.Min((float)_imgSize / origW, (float)_imgSize / origH);
        int newW = (int)Math.Round(origW * scale);
        int newH = (int)Math.Round(origH * scale);
        int padX = (_imgSize - newW) / 2;
        int padY = (_imgSize - newH) / 2;

        Cv2.Resize(bgrMat, _resized, new Size(newW, newH), interpolation: InterpolationFlags.Linear);
        _padded.SetTo(new Scalar(114, 114, 114)); // YOLO letterbox gray fill
        _resized.CopyTo(_padded[new Rect(padX, padY, newW, newH)]);
        Cv2.CvtColor(_padded, _rgb, ColorConversionCodes.BGR2RGB);
        _rgb.ConvertTo(_floatMat, MatType.CV_32FC3, 1.0 / 255.0); // normalize to [0, 1]

        // HWC → NCHW via channel split + Marshal.Copy into pooled buffer
        int pixels = _imgSize * _imgSize;
        Cv2.Split(_floatMat, out Mat[] channels);
        try
        {
            for (int c = 0; c < 3; c++)
                Marshal.Copy(channels[c].Data, _tensorBuffer, c * pixels, pixels);
        }
        finally
        {
            foreach (var ch in channels) ch.Dispose();
        }

        // Wrap pooled buffer in tensor (no copy — DenseTensor references the array)
        var tensor = new DenseTensor<float>(_tensorBuffer, [1, 3, _imgSize, _imgSize]);
        return (tensor, scale, padX, padY);
    }

    /// <summary>
    /// Parses the raw YOLO output (shape [1, 4 + numClasses, numDetections]),
    /// applies the confidence threshold, maps boxes back from letterboxed to
    /// original image coordinates, and runs NMS.
    /// </summary>
    private List<DetectedBoss> Postprocess(Tensor<float> output, float scale,
        int padX, int padY, int origW, int origH)
    {
        int numClasses = _classNames.Length;
        int numDetections = output.Dimensions[2];
        int rowSize = output.Dimensions[1]; // 4 + nc
        int flatSize = rowSize * numDetections;

        // Reuse output buffer (resize only if model output shape changed)
        if (_outputBuffer == null || _outputBuffer.Length < flatSize)
            _outputBuffer = new float[flatSize];
        if (output is DenseTensor<float> dense)
            dense.Buffer.Span.CopyTo(_outputBuffer); // fast path: bulk copy
        else
            for (int r = 0; r < rowSize; r++)
                for (int i = 0; i < numDetections; i++)
                    _outputBuffer[r * numDetections + i] = output[0, r, i];

        var boxes = new List<Rect>();
        var confidences = new List<float>();
        var classIds = new List<int>();

        for (int i = 0; i < numDetections; i++)
        {
            // Best class score for this candidate (layout is row-major per attribute:
            // rows 0-3 are cx/cy/w/h, rows 4.. are per-class confidences).
            float bestConf = 0;
            int bestClass = 0;
            for (int c = 0; c < numClasses; c++)
            {
                float conf = _outputBuffer[(4 + c) * numDetections + i];
                if (conf > bestConf)
                {
                    bestConf = conf;
                    bestClass = c;
                }
            }
            if (bestConf < _confThreshold) continue;

            float cx = _outputBuffer[0 * numDetections + i];
            float cy = _outputBuffer[1 * numDetections + i];
            float w = _outputBuffer[2 * numDetections + i];
            float h = _outputBuffer[3 * numDetections + i];

            // Undo letterbox: remove padding, then divide out the resize scale.
            float x1 = (cx - w / 2 - padX) / scale;
            float y1 = (cy - h / 2 - padY) / scale;
            float bw = w / scale;
            float bh = h / scale;

            // Clamp to original image bounds.
            x1 = Math.Max(0, x1);
            y1 = Math.Max(0, y1);
            bw = Math.Min(bw, origW - x1);
            bh = Math.Min(bh, origH - y1);

            boxes.Add(new Rect((int)x1, (int)y1, (int)bw, (int)bh));
            confidences.Add(bestConf);
            classIds.Add(bestClass);
        }

        if (boxes.Count == 0)
            return [];

        CvDnn.NMSBoxes(boxes, confidences, _confThreshold, _iouThreshold, out int[] indices);

        var detections = new List<DetectedBoss>(indices.Length);
        foreach (var idx in indices)
        {
            var box = boxes[idx];
            detections.Add(new DetectedBoss(
                _classNames[classIds[idx]],
                confidences[idx],
                box.X, box.Y, box.Width, box.Height,
                box.X + box.Width / 2,   // center X (integer division, matches box coords)
                box.Y + box.Height / 2)); // center Y
        }
        return detections;
    }

    /// <summary>Releases the ONNX session and all pooled OpenCV Mats.</summary>
    public void Dispose()
    {
        _session.Dispose();
        _resized.Dispose();
        _padded.Dispose();
        _rgb.Dispose();
        _floatMat.Dispose();
    }
}