using System.Runtime.InteropServices;

using Microsoft.ML.OnnxRuntime;
using Microsoft.ML.OnnxRuntime.Tensors;
using OpenCvSharp;
using OpenCvSharp.Dnn;
using Serilog;

namespace Automata.Screen;

/// <summary>
/// YOLO11 object detection via ONNX Runtime on the CPU execution provider
/// (the GPU is deliberately left free for DXGI screen capture and the game).
/// Handles letterbox preprocessing, inference, and NMS postprocessing.
/// Buffers are pooled to avoid LOH allocations that trigger Gen2 GC pauses.
/// </summary>
public class OnnxYoloDetector : IDisposable
{
    private readonly InferenceSession _session;
    private readonly string[] _classNames;
    private readonly int _imgSize;
    private readonly float _confThreshold;
    private readonly float _iouThreshold;
    private readonly string _inputName;
    private bool _warmedUp;
    private bool _disposed;

    // Pooled buffers — allocated once, reused every inference (avoids LOH/GC pressure)
    private readonly float[] _tensorBuffer; // 3 * imgSize * imgSize (~1.2MB for 640)
    private float[]? _outputBuffer;         // rowSize * numDetections, sized on first use

    // Pre-allocated Mats for preprocessing (reused every inference — avoids alloc/GC per frame)
    private readonly Mat _resized = new();
    private readonly Mat _padded;
    private readonly Mat _rgb = new();
    private readonly Mat _floatMat = new();

    /// <summary>
    /// Loads the ONNX model and configures a CPU-only session tuned for a single
    /// sequential inference stream.
    /// </summary>
    /// <param name="modelPath">Path to the YOLO .onnx model file.</param>
    /// <param name="classNames">Class labels indexed by class id; must be non-empty.</param>
    /// <param name="confThreshold">Minimum class confidence required to keep a detection.</param>
    /// <param name="iouThreshold">IoU threshold used by NMS to suppress overlapping boxes.</param>
    /// <exception cref="ArgumentException">Model path is null/blank or class list is empty.</exception>
    /// <exception cref="ArgumentNullException"><paramref name="classNames"/> is null.</exception>
    public OnnxYoloDetector(string modelPath, string[] classNames,
        float confThreshold = 0.40f, float iouThreshold = 0.45f)
    {
        ArgumentException.ThrowIfNullOrWhiteSpace(modelPath);
        ArgumentNullException.ThrowIfNull(classNames);
        if (classNames.Length == 0)
            throw new ArgumentException("At least one class name is required.", nameof(classNames));

        _classNames = classNames;
        _confThreshold = confThreshold;
        _iouThreshold = iouThreshold;

        // SessionOptions is IDisposable; it is safe to dispose after the session is
        // constructed (the native session copies the configuration). Previously leaked.
        using var opts = new SessionOptions
        {
            GraphOptimizationLevel = GraphOptimizationLevel.ORT_ENABLE_ALL,
            InterOpNumThreads = 1, // single model, no inter-op parallelism needed
            // Use half the cores (leave room for game + pipeline); clamp to >= 1 so a
            // single-core host doesn't pass 0 (which means "ORT default" = all cores).
            IntraOpNumThreads = Math.Max(1, Environment.ProcessorCount / 2),
            ExecutionMode = ExecutionMode.ORT_SEQUENTIAL, // sequential is faster for single inference
        };
        // CPU EP — avoids GPU contention with DXGI screen capture
        Log.Information("OnnxYolo: using CPU EP, intra threads={Threads}", opts.IntraOpNumThreads);

        _session = new InferenceSession(modelPath, opts);
        _inputName = _session.InputNames[0];

        // Read imgSize from the model's input shape (NCHW: [1, 3, H, W]).
        // NOTE(review): assumes a fixed square input; a dynamic axis would report -1 here — confirm model export.
        var inputMeta = _session.InputMetadata[_inputName];
        _imgSize = inputMeta.Dimensions[2]; // H == W for square YOLO input

        _tensorBuffer = new float[3 * _imgSize * _imgSize];
        _padded = new Mat(_imgSize, _imgSize, MatType.CV_8UC3, new Scalar(114, 114, 114));

        Log.Information("OnnxYolo: loaded {Path} (input: {Input}, imgSize: {ImgSize})",
            modelPath, _inputName, _imgSize);
    }

    /// <summary>
    /// Run detection on a BGR Mat. Returns detected bosses in original image coordinates
    /// plus a timing breakdown (total / preprocess / inference) in milliseconds.
    /// </summary>
    /// <param name="bgrMat">Non-empty 8-bit 3-channel BGR frame.</param>
    /// <exception cref="ObjectDisposedException">The detector has been disposed.</exception>
    public (List<DetectedBoss> Detections, float TotalMs, float PreMs, float InfMs) Detect(Mat bgrMat)
    {
        ArgumentNullException.ThrowIfNull(bgrMat);
        ObjectDisposedException.ThrowIf(_disposed, this);

        var swTotal = System.Diagnostics.Stopwatch.StartNew();

        // 1. Letterbox preprocess (reuses _tensorBuffer)
        var swPre = System.Diagnostics.Stopwatch.StartNew();
        var (tensor, scale, padX, padY) = Preprocess(bgrMat);
        swPre.Stop();

        // 2. Run inference
        var swInf = System.Diagnostics.Stopwatch.StartNew();
        var inputs = new List<NamedOnnxValue>
        {
            NamedOnnxValue.CreateFromTensor(_inputName, tensor)
        };
        using var results = _session.Run(inputs);
        swInf.Stop();

        // 3. Parse output (reuses _outputBuffer)
        var outputTensor = results.First().AsTensor<float>();
        var detections = Postprocess(outputTensor, scale, padX, padY, bgrMat.Width, bgrMat.Height);

        swTotal.Stop();
        var totalMs = (float)swTotal.Elapsed.TotalMilliseconds;

        // Log the first call once — it includes one-time graph/thread warmup cost.
        if (!_warmedUp)
        {
            _warmedUp = true;
            Log.Information("OnnxYolo warmup: pre={Pre:F0}ms inf={Inf:F0}ms total={Total:F0}ms",
                swPre.Elapsed.TotalMilliseconds, swInf.Elapsed.TotalMilliseconds, totalMs);
        }

        return (detections, totalMs, (float)swPre.Elapsed.TotalMilliseconds, (float)swInf.Elapsed.TotalMilliseconds);
    }

    /// <summary>
    /// Letterbox a BGR frame into the model's square input: aspect-preserving resize,
    /// 114-gray padding, BGR→RGB, [0,1] normalization, then HWC→NCHW into the pooled buffer.
    /// </summary>
    /// <returns>
    /// The input tensor (wrapping <see cref="_tensorBuffer"/> — no copy) plus the scale
    /// and padding needed to map detections back to original-image coordinates.
    /// </returns>
    private (DenseTensor<float> tensor, float scale, int padX, int padY) Preprocess(Mat bgrMat)
    {
        int origW = bgrMat.Width, origH = bgrMat.Height;
        if (origW <= 0 || origH <= 0)
            throw new ArgumentException("Input Mat is empty.", nameof(bgrMat));

        float scale = Math.Min((float)_imgSize / origW, (float)_imgSize / origH);
        int newW = (int)Math.Round(origW * scale);
        int newH = (int)Math.Round(origH * scale);

        int padX = (_imgSize - newW) / 2;
        int padY = (_imgSize - newH) / 2;

        Cv2.Resize(bgrMat, _resized, new Size(newW, newH), interpolation: InterpolationFlags.Linear);

        _padded.SetTo(new Scalar(114, 114, 114));
        // The ROI indexer returns a new Mat header into _padded's pixels; it must be
        // disposed or its native header leaks every frame (was leaked before).
        using (var roi = _padded[new Rect(padX, padY, newW, newH)])
            _resized.CopyTo(roi);

        Cv2.CvtColor(_padded, _rgb, ColorConversionCodes.BGR2RGB);

        _rgb.ConvertTo(_floatMat, MatType.CV_32FC3, 1.0 / 255.0);

        // HWC → NCHW via channel split + Marshal.Copy into pooled buffer
        int pixels = _imgSize * _imgSize;
        Cv2.Split(_floatMat, out Mat[] channels);
        try
        {
            for (int c = 0; c < 3; c++)
                Marshal.Copy(channels[c].Data, _tensorBuffer, c * pixels, pixels);
        }
        finally
        {
            foreach (var ch in channels) ch.Dispose();
        }

        // Wrap pooled buffer in tensor (no copy — DenseTensor references the array)
        var tensor = new DenseTensor<float>(_tensorBuffer, [1, 3, _imgSize, _imgSize]);
        return (tensor, scale, padX, padY);
    }

    /// <summary>
    /// Decode the raw YOLO output ([1, 4+nc, N]: cx/cy/w/h rows then one row per class),
    /// threshold by confidence, undo the letterbox transform, and run NMS.
    /// </summary>
    private List<DetectedBoss> Postprocess(Tensor<float> output, float scale,
        int padX, int padY, int origW, int origH)
    {
        int numClasses = _classNames.Length;
        int numDetections = output.Dimensions[2];
        int rowSize = output.Dimensions[1]; // 4 + nc
        int flatSize = rowSize * numDetections;

        // Reuse output buffer (resize only if model output shape changed)
        if (_outputBuffer == null || _outputBuffer.Length < flatSize)
            _outputBuffer = new float[flatSize];

        // Fast path: bulk-copy the dense backing store; fallback walks element-by-element.
        if (output is DenseTensor<float> dense)
            dense.Buffer.Span.CopyTo(_outputBuffer);
        else
            for (int r = 0; r < rowSize; r++)
                for (int i = 0; i < numDetections; i++)
                    _outputBuffer[r * numDetections + i] = output[0, r, i];

        var boxes = new List<Rect>();
        var confidences = new List<float>();
        var classIds = new List<int>();

        for (int i = 0; i < numDetections; i++)
        {
            // Best class score for this candidate (rows 4..4+nc are per-class confidences).
            float bestConf = 0;
            int bestClass = 0;
            for (int c = 0; c < numClasses; c++)
            {
                float conf = _outputBuffer[(4 + c) * numDetections + i];
                if (conf > bestConf)
                {
                    bestConf = conf;
                    bestClass = c;
                }
            }

            if (bestConf < _confThreshold) continue;

            float cx = _outputBuffer[0 * numDetections + i];
            float cy = _outputBuffer[1 * numDetections + i];
            float w = _outputBuffer[2 * numDetections + i];
            float h = _outputBuffer[3 * numDetections + i];

            // Undo the letterbox: remove padding, then rescale to original image size.
            float x1 = (cx - w / 2 - padX) / scale;
            float y1 = (cy - h / 2 - padY) / scale;
            float bw = w / scale;
            float bh = h / scale;

            // Clamp to the original image bounds.
            x1 = Math.Max(0, x1);
            y1 = Math.Max(0, y1);
            bw = Math.Min(bw, origW - x1);
            bh = Math.Min(bh, origH - y1);

            boxes.Add(new Rect((int)x1, (int)y1, (int)bw, (int)bh));
            confidences.Add(bestConf);
            classIds.Add(bestClass);
        }

        if (boxes.Count == 0)
            return [];

        CvDnn.NMSBoxes(boxes, confidences, _confThreshold, _iouThreshold, out int[] indices);

        var detections = new List<DetectedBoss>(indices.Length);
        foreach (var idx in indices)
        {
            var box = boxes[idx];
            detections.Add(new DetectedBoss(
                _classNames[classIds[idx]],
                confidences[idx],
                box.X, box.Y, box.Width, box.Height,
                box.X + box.Width / 2,
                box.Y + box.Height / 2));
        }

        return detections;
    }

    /// <summary>Releases the ONNX session and all pooled native Mats. Idempotent.</summary>
    public void Dispose()
    {
        if (_disposed) return;
        _disposed = true;

        _session.Dispose();
        _resized.Dispose();
        _padded.Dispose();
        _rgb.Dispose();
        _floatMat.Dispose();

        GC.SuppressFinalize(this);
    }
}
|