using System.Runtime.InteropServices;

using Microsoft.ML.OnnxRuntime;
using Microsoft.ML.OnnxRuntime.Tensors;
using OpenCvSharp;
using OpenCvSharp.Dnn;
using Serilog;

namespace Automata.Screen;

/// <summary>
/// YOLO11 object detection via ONNX Runtime on the CPU execution provider
/// (the GPU is deliberately left free for DXGI screen capture and the game).
/// Handles letterbox preprocessing, inference, and NMS postprocessing.
/// Buffers are pooled to avoid LOH allocations that trigger Gen2 GC pauses.
/// </summary>
public class OnnxYoloDetector : IDisposable
{
    private readonly InferenceSession _session;
    private readonly string[] _classNames;
    private readonly int _imgSize;
    private readonly float _confThreshold;
    private readonly float _iouThreshold;
    private readonly string _inputName;
    private bool _warmedUp;
    private bool _disposed;

    // Pooled buffers — allocated once, reused every inference (avoids LOH/GC pressure)
    private readonly float[] _tensorBuffer; // 3 * imgSize * imgSize (~1.2MB for 640)
    private float[]? _outputBuffer;         // rowSize * numDetections, sized on first use

    // Pre-allocated Mats for preprocessing (reused every inference — avoids alloc/GC per frame)
    private readonly Mat _resized = new();
    private readonly Mat _padded;
    private readonly Mat _rgb = new();
    private readonly Mat _floatMat = new();

    /// <summary>
    /// Loads the ONNX model and configures a CPU-only session tuned for a single
    /// sequential inference stream.
    /// </summary>
    /// <param name="modelPath">Path to the YOLO .onnx model file.</param>
    /// <param name="classNames">Class labels indexed by class id; must be non-empty.</param>
    /// <param name="confThreshold">Minimum class confidence required to keep a detection.</param>
    /// <param name="iouThreshold">IoU threshold used by NMS to suppress overlapping boxes.</param>
    /// <exception cref="ArgumentException">Model path is null/blank or class list is empty.</exception>
    /// <exception cref="ArgumentNullException"><paramref name="classNames"/> is null.</exception>
    public OnnxYoloDetector(string modelPath, string[] classNames,
        float confThreshold = 0.40f, float iouThreshold = 0.45f)
    {
        ArgumentException.ThrowIfNullOrWhiteSpace(modelPath);
        ArgumentNullException.ThrowIfNull(classNames);
        if (classNames.Length == 0)
            throw new ArgumentException("At least one class name is required.", nameof(classNames));

        _classNames = classNames;
        _confThreshold = confThreshold;
        _iouThreshold = iouThreshold;

        // SessionOptions is IDisposable; it is safe to dispose after the session is
        // constructed (the native session copies the configuration). Previously leaked.
        using var opts = new SessionOptions
        {
            GraphOptimizationLevel = GraphOptimizationLevel.ORT_ENABLE_ALL,
            InterOpNumThreads = 1, // single model, no inter-op parallelism needed
            // Use half the cores (leave room for game + pipeline); clamp to >= 1 so a
            // single-core host doesn't pass 0 (which means "ORT default" = all cores).
            IntraOpNumThreads = Math.Max(1, Environment.ProcessorCount / 2),
            ExecutionMode = ExecutionMode.ORT_SEQUENTIAL, // sequential is faster for single inference
        };
        // CPU EP — avoids GPU contention with DXGI screen capture
        Log.Information("OnnxYolo: using CPU EP, intra threads={Threads}", opts.IntraOpNumThreads);

        _session = new InferenceSession(modelPath, opts);
        _inputName = _session.InputNames[0];

        // Read imgSize from the model's input shape (NCHW: [1, 3, H, W]).
        // NOTE(review): assumes a fixed square input; a dynamic axis would report -1 here — confirm model export.
        var inputMeta = _session.InputMetadata[_inputName];
        _imgSize = inputMeta.Dimensions[2]; // H == W for square YOLO input

        _tensorBuffer = new float[3 * _imgSize * _imgSize];
        _padded = new Mat(_imgSize, _imgSize, MatType.CV_8UC3, new Scalar(114, 114, 114));

        Log.Information("OnnxYolo: loaded {Path} (input: {Input}, imgSize: {ImgSize})",
            modelPath, _inputName, _imgSize);
    }

    /// <summary>
    /// Run detection on a BGR Mat. Returns detected bosses in original image coordinates
    /// plus a timing breakdown (total / preprocess / inference) in milliseconds.
    /// </summary>
    /// <param name="bgrMat">Non-empty 8-bit 3-channel BGR frame.</param>
    /// <exception cref="ObjectDisposedException">The detector has been disposed.</exception>
    public (List<DetectedBoss> Detections, float TotalMs, float PreMs, float InfMs) Detect(Mat bgrMat)
    {
        ArgumentNullException.ThrowIfNull(bgrMat);
        ObjectDisposedException.ThrowIf(_disposed, this);

        var swTotal = System.Diagnostics.Stopwatch.StartNew();

        // 1. Letterbox preprocess (reuses _tensorBuffer)
        var swPre = System.Diagnostics.Stopwatch.StartNew();
        var (tensor, scale, padX, padY) = Preprocess(bgrMat);
        swPre.Stop();

        // 2. Run inference
        var swInf = System.Diagnostics.Stopwatch.StartNew();
        var inputs = new List<NamedOnnxValue>
        {
            NamedOnnxValue.CreateFromTensor(_inputName, tensor)
        };
        using var results = _session.Run(inputs);
        swInf.Stop();

        // 3. Parse output (reuses _outputBuffer)
        var outputTensor = results.First().AsTensor<float>();
        var detections = Postprocess(outputTensor, scale, padX, padY, bgrMat.Width, bgrMat.Height);

        swTotal.Stop();
        var totalMs = (float)swTotal.Elapsed.TotalMilliseconds;

        // Log the first call once — it includes one-time graph/thread warmup cost.
        if (!_warmedUp)
        {
            _warmedUp = true;
            Log.Information("OnnxYolo warmup: pre={Pre:F0}ms inf={Inf:F0}ms total={Total:F0}ms",
                swPre.Elapsed.TotalMilliseconds, swInf.Elapsed.TotalMilliseconds, totalMs);
        }

        return (detections, totalMs, (float)swPre.Elapsed.TotalMilliseconds, (float)swInf.Elapsed.TotalMilliseconds);
    }

    /// <summary>
    /// Letterbox a BGR frame into the model's square input: aspect-preserving resize,
    /// 114-gray padding, BGR→RGB, [0,1] normalization, then HWC→NCHW into the pooled buffer.
    /// </summary>
    /// <returns>
    /// The input tensor (wrapping <see cref="_tensorBuffer"/> — no copy) plus the scale
    /// and padding needed to map detections back to original-image coordinates.
    /// </returns>
    private (DenseTensor<float> tensor, float scale, int padX, int padY) Preprocess(Mat bgrMat)
    {
        int origW = bgrMat.Width, origH = bgrMat.Height;
        if (origW <= 0 || origH <= 0)
            throw new ArgumentException("Input Mat is empty.", nameof(bgrMat));

        float scale = Math.Min((float)_imgSize / origW, (float)_imgSize / origH);
        int newW = (int)Math.Round(origW * scale);
        int newH = (int)Math.Round(origH * scale);

        int padX = (_imgSize - newW) / 2;
        int padY = (_imgSize - newH) / 2;

        Cv2.Resize(bgrMat, _resized, new Size(newW, newH), interpolation: InterpolationFlags.Linear);

        _padded.SetTo(new Scalar(114, 114, 114));
        // The ROI indexer returns a new Mat header into _padded's pixels; it must be
        // disposed or its native header leaks every frame (was leaked before).
        using (var roi = _padded[new Rect(padX, padY, newW, newH)])
            _resized.CopyTo(roi);

        Cv2.CvtColor(_padded, _rgb, ColorConversionCodes.BGR2RGB);

        _rgb.ConvertTo(_floatMat, MatType.CV_32FC3, 1.0 / 255.0);

        // HWC → NCHW via channel split + Marshal.Copy into pooled buffer
        int pixels = _imgSize * _imgSize;
        Cv2.Split(_floatMat, out Mat[] channels);
        try
        {
            for (int c = 0; c < 3; c++)
                Marshal.Copy(channels[c].Data, _tensorBuffer, c * pixels, pixels);
        }
        finally
        {
            foreach (var ch in channels) ch.Dispose();
        }

        // Wrap pooled buffer in tensor (no copy — DenseTensor references the array)
        var tensor = new DenseTensor<float>(_tensorBuffer, [1, 3, _imgSize, _imgSize]);
        return (tensor, scale, padX, padY);
    }

    /// <summary>
    /// Decode the raw YOLO output ([1, 4+nc, N]: cx/cy/w/h rows then one row per class),
    /// threshold by confidence, undo the letterbox transform, and run NMS.
    /// </summary>
    private List<DetectedBoss> Postprocess(Tensor<float> output, float scale,
        int padX, int padY, int origW, int origH)
    {
        int numClasses = _classNames.Length;
        int numDetections = output.Dimensions[2];
        int rowSize = output.Dimensions[1]; // 4 + nc
        int flatSize = rowSize * numDetections;

        // Reuse output buffer (resize only if model output shape changed)
        if (_outputBuffer == null || _outputBuffer.Length < flatSize)
            _outputBuffer = new float[flatSize];

        // Fast path: bulk-copy the dense backing store; fallback walks element-by-element.
        if (output is DenseTensor<float> dense)
            dense.Buffer.Span.CopyTo(_outputBuffer);
        else
            for (int r = 0; r < rowSize; r++)
                for (int i = 0; i < numDetections; i++)
                    _outputBuffer[r * numDetections + i] = output[0, r, i];

        var boxes = new List<Rect>();
        var confidences = new List<float>();
        var classIds = new List<int>();

        for (int i = 0; i < numDetections; i++)
        {
            // Best class score for this candidate (rows 4..4+nc are per-class confidences).
            float bestConf = 0;
            int bestClass = 0;
            for (int c = 0; c < numClasses; c++)
            {
                float conf = _outputBuffer[(4 + c) * numDetections + i];
                if (conf > bestConf)
                {
                    bestConf = conf;
                    bestClass = c;
                }
            }

            if (bestConf < _confThreshold) continue;

            float cx = _outputBuffer[0 * numDetections + i];
            float cy = _outputBuffer[1 * numDetections + i];
            float w = _outputBuffer[2 * numDetections + i];
            float h = _outputBuffer[3 * numDetections + i];

            // Undo the letterbox: remove padding, then rescale to original image size.
            float x1 = (cx - w / 2 - padX) / scale;
            float y1 = (cy - h / 2 - padY) / scale;
            float bw = w / scale;
            float bh = h / scale;

            // Clamp to the original image bounds.
            x1 = Math.Max(0, x1);
            y1 = Math.Max(0, y1);
            bw = Math.Min(bw, origW - x1);
            bh = Math.Min(bh, origH - y1);

            boxes.Add(new Rect((int)x1, (int)y1, (int)bw, (int)bh));
            confidences.Add(bestConf);
            classIds.Add(bestClass);
        }

        if (boxes.Count == 0)
            return [];

        CvDnn.NMSBoxes(boxes, confidences, _confThreshold, _iouThreshold, out int[] indices);

        var detections = new List<DetectedBoss>(indices.Length);
        foreach (var idx in indices)
        {
            var box = boxes[idx];
            detections.Add(new DetectedBoss(
                _classNames[classIds[idx]],
                confidences[idx],
                box.X, box.Y, box.Width, box.Height,
                box.X + box.Width / 2,
                box.Y + box.Height / 2));
        }

        return detections;
    }

    /// <summary>Releases the ONNX session and all pooled native Mats. Idempotent.</summary>
    public void Dispose()
    {
        if (_disposed) return;
        _disposed = true;

        _session.Dispose();
        _resized.Dispose();
        _padded.Dispose();
        _rgb.Dispose();
        _floatMat.Dispose();

        GC.SuppressFinalize(this);
    }
}
|