using System.Runtime.InteropServices;
using Microsoft.ML.OnnxRuntime;
using Microsoft.ML.OnnxRuntime.Tensors;
using OpenCvSharp;
using OpenCvSharp.Dnn;
using Serilog;

namespace Automata.Screen;

/// <summary>
/// YOLO11 object detection via ONNX Runtime on the CPU execution provider.
/// Handles letterbox preprocessing, inference, and NMS postprocessing.
/// Buffers are pooled to avoid LOH allocations that trigger Gen2 GC pauses.
/// NOT thread-safe: the pooled Mats and buffers assume one Detect call at a time.
/// </summary>
public class OnnxYoloDetector : IDisposable
{
    private readonly InferenceSession _session;
    private readonly string[] _classNames;
    private readonly int _imgSize;
    private readonly float _confThreshold;
    private readonly float _iouThreshold;
    private readonly string _inputName;
    private bool _warmedUp;

    // Pooled buffers — allocated once, reused every inference (avoids LOH/GC pressure)
    private readonly float[] _tensorBuffer;   // 3 * imgSize * imgSize (~1.2MB for 640)
    private float[]? _outputBuffer;           // rowSize * numDetections, sized on first use

    // Pre-allocated Mats for preprocessing (reused every inference — avoids alloc/GC per frame)
    private readonly Mat _resized = new();
    private readonly Mat _padded;
    private readonly Mat _rgb = new();
    private readonly Mat _floatMat = new();

    /// <summary>
    /// Loads the ONNX model and configures a CPU-only inference session.
    /// The square input size is read from the model's own NCHW input metadata.
    /// </summary>
    /// <param name="modelPath">Path to the .onnx model file.</param>
    /// <param name="classNames">Class labels, indexed by the model's class ids.</param>
    /// <param name="confThreshold">Minimum class confidence to keep a detection.</param>
    /// <param name="iouThreshold">IoU threshold used by NMS to suppress overlaps.</param>
    public OnnxYoloDetector(string modelPath, string[] classNames,
        float confThreshold = 0.40f, float iouThreshold = 0.45f)
    {
        _classNames = classNames;
        _confThreshold = confThreshold;
        _iouThreshold = iouThreshold;

        var opts = new SessionOptions();
        opts.GraphOptimizationLevel = GraphOptimizationLevel.ORT_ENABLE_ALL;
        opts.InterOpNumThreads = 1; // single model, no inter-op parallelism needed
        // Use half the cores (leave room for game + pipeline) — but never 0 on a 1-core box.
        opts.IntraOpNumThreads = Math.Max(1, Environment.ProcessorCount / 2);
        opts.ExecutionMode = ExecutionMode.ORT_SEQUENTIAL; // sequential is faster for single inference

        // CPU EP — avoids GPU contention with DXGI screen capture
        Log.Information("OnnxYolo: using CPU EP, intra threads={Threads}", opts.IntraOpNumThreads);

        _session = new InferenceSession(modelPath, opts);
        _inputName = _session.InputNames[0];

        // Read imgSize from the model's input shape (NCHW: [1, 3, H, W])
        var inputMeta = _session.InputMetadata[_inputName];
        _imgSize = inputMeta.Dimensions[2]; // H == W for square YOLO input

        _tensorBuffer = new float[3 * _imgSize * _imgSize];
        _padded = new Mat(_imgSize, _imgSize, MatType.CV_8UC3, new Scalar(114, 114, 114));

        Log.Information("OnnxYolo: loaded {Path} (input: {Input}, imgSize: {ImgSize})",
            modelPath, _inputName, _imgSize);
    }

    /// <summary>
    /// Run detection on a BGR Mat. Returns detected bosses in original image
    /// coordinates, plus per-stage timings in milliseconds.
    /// </summary>
    public (List<DetectedBoss> Detections, float TotalMs, float PreMs, float InfMs) Detect(Mat bgrMat)
    {
        var swTotal = System.Diagnostics.Stopwatch.StartNew();

        // 1. Letterbox preprocess (reuses _tensorBuffer)
        var swPre = System.Diagnostics.Stopwatch.StartNew();
        var (tensor, scale, padX, padY) = Preprocess(bgrMat);
        swPre.Stop();

        // 2. Run inference
        var swInf = System.Diagnostics.Stopwatch.StartNew();
        var inputs = new List<NamedOnnxValue> { NamedOnnxValue.CreateFromTensor(_inputName, tensor) };
        using var results = _session.Run(inputs);
        swInf.Stop();

        // 3. Parse output (reuses _outputBuffer)
        var outputTensor = results.First().AsTensor<float>();
        var detections = Postprocess(outputTensor, scale, padX, padY, bgrMat.Width, bgrMat.Height);

        swTotal.Stop();
        var totalMs = (float)swTotal.Elapsed.TotalMilliseconds;
        if (!_warmedUp)
        {
            // First call includes ORT graph warmup — log it once so it isn't mistaken
            // for steady-state latency.
            _warmedUp = true;
            Log.Information("OnnxYolo warmup: pre={Pre:F0}ms inf={Inf:F0}ms total={Total:F0}ms",
                swPre.Elapsed.TotalMilliseconds, swInf.Elapsed.TotalMilliseconds, totalMs);
        }

        return (detections, totalMs,
            (float)swPre.Elapsed.TotalMilliseconds,
            (float)swInf.Elapsed.TotalMilliseconds);
    }

    /// <summary>
    /// Letterbox-resize a BGR frame into the model's square input and fill the pooled
    /// NCHW float tensor (RGB, [0,1]). Returns the tensor plus the scale and padding
    /// needed to map detections back to original image coordinates.
    /// </summary>
    private (DenseTensor<float> tensor, float scale, int padX, int padY) Preprocess(Mat bgrMat)
    {
        int origW = bgrMat.Width, origH = bgrMat.Height;
        float scale = Math.Min((float)_imgSize / origW, (float)_imgSize / origH);
        int newW = (int)Math.Round(origW * scale);
        int newH = (int)Math.Round(origH * scale);
        int padX = (_imgSize - newW) / 2;
        int padY = (_imgSize - newH) / 2;

        Cv2.Resize(bgrMat, _resized, new Size(newW, newH), interpolation: InterpolationFlags.Linear);
        _padded.SetTo(new Scalar(114, 114, 114)); // standard YOLO letterbox gray
        // The ROI indexer returns a new Mat header — dispose it (the pixels stay in _padded).
        using (var roi = _padded[new Rect(padX, padY, newW, newH)])
            _resized.CopyTo(roi);
        Cv2.CvtColor(_padded, _rgb, ColorConversionCodes.BGR2RGB);
        _rgb.ConvertTo(_floatMat, MatType.CV_32FC3, 1.0 / 255.0);

        // HWC → NCHW via channel split + Marshal.Copy into pooled buffer
        int pixels = _imgSize * _imgSize;
        Cv2.Split(_floatMat, out Mat[] channels);
        try
        {
            for (int c = 0; c < 3; c++)
                Marshal.Copy(channels[c].Data, _tensorBuffer, c * pixels, pixels);
        }
        finally
        {
            foreach (var ch in channels) ch.Dispose();
        }

        // Wrap pooled buffer in tensor (no copy — DenseTensor references the array)
        var tensor = new DenseTensor<float>(_tensorBuffer, [1, 3, _imgSize, _imgSize]);
        return (tensor, scale, padX, padY);
    }

    /// <summary>
    /// Decode the raw YOLO output ([1, 4+nc, numDetections], coords in letterbox space),
    /// threshold by confidence, map boxes back to original image coordinates, and run NMS.
    /// </summary>
    private List<DetectedBoss> Postprocess(Tensor<float> output, float scale, int padX, int padY,
        int origW, int origH)
    {
        int numClasses = _classNames.Length;
        int numDetections = output.Dimensions[2];
        int rowSize = output.Dimensions[1]; // 4 + nc
        int flatSize = rowSize * numDetections;

        // Reuse output buffer (resize only if model output shape changed)
        if (_outputBuffer == null || _outputBuffer.Length < flatSize)
            _outputBuffer = new float[flatSize];
        if (output is DenseTensor<float> dense)
            dense.Buffer.Span.CopyTo(_outputBuffer); // fast path: bulk copy
        else
            for (int r = 0; r < rowSize; r++)
                for (int i = 0; i < numDetections; i++)
                    _outputBuffer[r * numDetections + i] = output[0, r, i];

        var boxes = new List<Rect>();
        var confidences = new List<float>();
        var classIds = new List<int>();

        for (int i = 0; i < numDetections; i++)
        {
            // Best class score for this candidate (rows 4..4+nc are per-class scores).
            float bestConf = 0;
            int bestClass = 0;
            for (int c = 0; c < numClasses; c++)
            {
                float conf = _outputBuffer[(4 + c) * numDetections + i];
                if (conf > bestConf) { bestConf = conf; bestClass = c; }
            }
            if (bestConf < _confThreshold) continue;

            // cx/cy/w/h are in letterbox space; undo padding then scale to original coords.
            float cx = _outputBuffer[0 * numDetections + i];
            float cy = _outputBuffer[1 * numDetections + i];
            float w = _outputBuffer[2 * numDetections + i];
            float h = _outputBuffer[3 * numDetections + i];
            float x1 = (cx - w / 2 - padX) / scale;
            float y1 = (cy - h / 2 - padY) / scale;

            // Clip to image bounds. Clamp the far edge FIRST so a box spilling past the
            // left/top edge shrinks correctly instead of being widened by the x1/y1 clamp.
            float x2 = Math.Min(origW, x1 + w / scale);
            float y2 = Math.Min(origH, y1 + h / scale);
            x1 = Math.Max(0, x1);
            y1 = Math.Max(0, y1);
            float bw = x2 - x1;
            float bh = y2 - y1;
            if (bw <= 0 || bh <= 0) continue; // box lies entirely outside the frame

            boxes.Add(new Rect((int)x1, (int)y1, (int)bw, (int)bh));
            confidences.Add(bestConf);
            classIds.Add(bestClass);
        }

        if (boxes.Count == 0) return [];

        CvDnn.NMSBoxes(boxes, confidences, _confThreshold, _iouThreshold, out int[] indices);

        var detections = new List<DetectedBoss>(indices.Length);
        foreach (var idx in indices)
        {
            var box = boxes[idx];
            detections.Add(new DetectedBoss(
                _classNames[classIds[idx]], confidences[idx],
                box.X, box.Y, box.Width, box.Height,
                box.X + box.Width / 2, box.Y + box.Height / 2));
        }
        return detections;
    }

    /// <summary>Releases the ONNX session and all pooled native Mats.</summary>
    public void Dispose()
    {
        _session.Dispose();
        _resized.Dispose();
        _padded.Dispose();
        _rgb.Dispose();
        _floatMat.Dispose();
        GC.SuppressFinalize(this); // CA1816: class is unsealed and IDisposable
    }
}