using System.Runtime.CompilerServices;
using Microsoft.ML.OnnxRuntime;
using Microsoft.ML.OnnxRuntime.Tensors;

namespace splitter.algo;

/// <summary>
/// Object detector that runs a YOLOv10 ONNX model through ONNX Runtime (DirectML execution
/// provider) and reports detections of a single class (<c>_personClassIndex</c>).
/// </summary>
/// <remarks>
/// All scratch buffers (resize/colour-conversion mats, the input tensor, the detection lists and
/// the result list) are allocated once and reused on every call. Consequently the list returned
/// by <see cref="DetectAll"/> is the same instance each time and is overwritten by the next call,
/// and the shared buffers are mutated without synchronization, so a single instance must not be
/// used from several threads at once.
/// </remarks>
public sealed class YoloV10ObjectDetector : LoggingBase, IObjectDetector, IDisposable
{
    private readonly InferenceSession _session;
    private readonly string _inputName;
    private readonly string _outputName;

    // Network input resolution; frames are resized to this before inference.
    private const int _inputWidth = 640;
    private const int _inputHeight = 640;

    // Not referenced anywhere in this class: DetectAll takes the score threshold from the job.
    private const float _scoreThreshold = 0.35f;
    private const float _nmsThreshold = 0.45f;
    private const int _personClassIndex = 0;

    // Scales 8-bit channel values into [0, 1].
    private const float _inv255 = 1f / 255f;

    // Reusable buffers, so that a detection pass does not allocate them anew.
    private readonly Mat _resizeMat = new();
    private readonly Mat _rgbMat = new();
    private readonly float[] _inputBuffer;
    private readonly DenseTensor<float> _inputTensor;
    private readonly List<NamedOnnxValue> _inputs = new(1);
    private readonly List<Detection> _detections = new(256);
    private readonly List<Detection> _nmsBuffer = new(256);
    private readonly List<(Rect box, Point2f center)> _results = new(64);

    /// <summary>
    /// A single detection: top-left corner, size and confidence score, as produced by
    /// <c>ParseYoloV10</c> (already scaled to the original frame).
    /// </summary>
    private readonly struct Detection
    {
        public readonly float X;
        public readonly float Y;
        public readonly float Width;
        public readonly float Height;
        public readonly float Score;

        public Detection(float x, float y, float w, float h, float score)
        {
            X = x;
            Y = y;
            Width = w;
            Height = h;
            Score = score;
        }
    }

    /// <summary>
    /// Loads <c>models/yolov10m.onnx</c> from the application base directory and prepares the
    /// reusable input tensor.
    /// </summary>
    /// <param name="logger">Logger forwarded to the <c>LoggingBase</c> constructor.</param>
    public YoloV10ObjectDetector(ILogger logger) : base(logger, -1)
    {
        // SessionOptions is IDisposable and is only needed while the session is being created.
        using var options = new SessionOptions();
        options.AppendExecutionProvider_DML();

        var basePath = AppDomain.CurrentDomain.BaseDirectory;
        var modelPath = Path.Combine(basePath, "models", "yolov10m.onnx");

        _session = new InferenceSession(modelPath, options);
        _inputName = _session.InputMetadata.Keys.First();
        _outputName = _session.OutputMetadata.Keys.First();

        // NCHW layout: [batch = 1, channels = 3, height, width]. The tensor wraps _inputBuffer,
        // so refilling the array updates the tensor that is fed to the session.
        _inputBuffer = new float[1 * 3 * _inputHeight * _inputWidth];
        _inputTensor = new DenseTensor<float>(_inputBuffer, new[] { 1, 3, _inputHeight, _inputWidth });
        _inputs.Add(NamedOnnxValue.CreateFromTensor(_inputName, _inputTensor));
    }

    /// <summary>
    /// Runs the model on <paramref name="frameCont"/> and returns the detected boxes clipped to
    /// the frame, each paired with its centre point.
    /// </summary>
    /// <param name="job">Task whose <c>Job.ScoreThreshold</c> is used as the minimum score.</param>
    /// <param name="frameCont">
    /// Frame to analyse. It is converted with <c>BGR2RGB</c>, so BGR channel order is expected.
    /// </param>
    /// <returns>
    /// The detector's internal result list (empty when the frame is empty or the expected output
    /// is missing). The same instance is returned and overwritten on every call; copy it if it
    /// has to outlive the next call.
    /// </returns>
    public List<(Rect box, Point2f center)> DetectAll(SingleTask job, Mat frameCont)
    {
        _results.Clear();

        if (frameCont.Empty())
        {
            return _results;
        }

        Cv2.Resize(frameCont, _resizeMat, new Size(_inputWidth, _inputHeight));
        Cv2.CvtColor(_resizeMat, _rgbMat, ColorConversionCodes.BGR2RGB);
        FillInputTensor(_rgbMat);

        using var results = _session.Run(_inputs);

        Tensor<float>? output = null;
        foreach (var r in results)
        {
            if (r.Name == _outputName)
            {
                output = r.AsTensor<float>();
                break;
            }
        }

        if (output is null)
        {
            return _results;
        }

        var frameWidth = frameCont.Width;
        var frameHeight = frameCont.Height;

        ParseYoloV10(
            output,
            frameWidth,
            frameHeight,
            job.Job.ScoreThreshold,
            _personClassIndex,
            _detections);

        var final = ApplyNms(_detections, _nmsThreshold, _nmsBuffer);

        for (var i = 0; i < final.Count; i++)
        {
            var d = final[i];

            var rawX = (int)d.X;
            var rawY = (int)d.Y;

            // Far edges are computed from the unclamped origin so that a box starting outside
            // the frame is clipped rather than shifted: clamping only the origin would keep the
            // full width/height and push the box further into the frame than it really reaches.
            var right = rawX + (int)d.Width;
            var bottom = rawY + (int)d.Height;

            var x = Math.Clamp(rawX, 0, frameWidth - 1);
            var y = Math.Clamp(rawY, 0, frameHeight - 1);

            // Keep at least one pixel of extent and never exceed the frame.
            right = Math.Clamp(right, x + 1, frameWidth);
            bottom = Math.Clamp(bottom, y + 1, frameHeight);

            var w = right - x;
            var h = bottom - y;

            var rect = new Rect(x, y, w, h);
            var center = new Point2f(x + w / 2f, y + h / 2f);
            _results.Add((rect, center));
        }

        return _results;
    }

    /// <summary>
    /// Copies an interleaved 3-channel image into the planar (CHW) input buffer, scaling each
    /// channel value by 1/255.
    /// </summary>
    /// <param name="rgb">
    /// Image of size <c>_inputWidth</c> x <c>_inputHeight</c>. Rows are read as
    /// <c>width * 3</c> bytes, i.e. an 8-bit, 3-channel mat is assumed.
    /// </param>
    [MethodImpl(MethodImplOptions.AggressiveInlining)]
    private void FillInputTensor(Mat rgb)
    {
        var height = _inputHeight;
        var width = _inputWidth;
        var planeSize = height * width;

        Span<float> dst = _inputBuffer.AsSpan();

        unsafe
        {
            for (var y = 0; y < height; y++)
            {
                // Ptr(y) is used per row instead of one flat pointer so that any row padding
                // in the mat is respected.
                var rowPtr = (byte*)rgb.Ptr(y).ToPointer();
                var rowSpan = new Span<byte>(rowPtr, width * 3);

                var srcIndex = 0;
                for (var x = 0; x < width; x++)
                {
                    var r = rowSpan[srcIndex + 0];
                    var g = rowSpan[srcIndex + 1];
                    var b = rowSpan[srcIndex + 2];

                    // One plane per channel: R, then G, then B.
                    var offset = y * width + x;
                    dst[offset] = r * _inv255;
                    dst[planeSize + offset] = g * _inv255;
                    dst[2 * planeSize + offset] = b * _inv255;

                    srcIndex += 3;
                }
            }
        }
    }

    /// <summary>
    /// YOLOv10 parser: reads an output of shape [1, 300, 6] whose rows are
    /// x1, y1, x2, y2, score, class_id, and collects the rows that match
    /// <paramref name="classIndex"/> and reach <paramref name="scoreThreshold"/>.
    /// </summary>
    /// <param name="output">Model output tensor.</param>
    /// <param name="originalWidth">Width of the original frame, used to rescale boxes.</param>
    /// <param name="originalHeight">Height of the original frame, used to rescale boxes.</param>
    /// <param name="scoreThreshold">Minimum score for a row to be kept.</param>
    /// <param name="classIndex">Class id to keep; all other classes are skipped.</param>
    /// <param name="detections">Destination list; cleared before it is filled.</param>
    private static void ParseYoloV10(
        Tensor<float> output,
        int originalWidth,
        int originalHeight,
        float scoreThreshold,
        int classIndex,
        List<Detection> detections)
    {
        detections.Clear();

        // dims: [1, 300, 6]
        var count = output.Dimensions[1];

        // Box coordinates are treated as being in network-input pixels and are mapped back to
        // the original frame, which was stretched (not letterboxed) to the input size.
        var xScale = (float)originalWidth / _inputWidth;
        var yScale = (float)originalHeight / _inputHeight;

        for (var i = 0; i < count; i++)
        {
            var x1 = output[0, i, 0];
            var y1 = output[0, i, 1];
            var x2 = output[0, i, 2];
            var y2 = output[0, i, 3];
            var score = output[0, i, 4];
            var cls = (int)output[0, i, 5];

            if (cls != classIndex)
            {
                continue;
            }

            if (score < scoreThreshold)
            {
                continue;
            }

            var left = x1 * xScale;
            var top = y1 * yScale;
            var width = (x2 - x1) * xScale;
            var height = (y2 - y1) * yScale;

            detections.Add(new Detection(left, top, width, height, score));
        }
    }

    /// <summary>
    /// Greedy non-maximum suppression: visits detections in descending score order and keeps a
    /// detection only if its IoU with every already kept one is below
    /// <paramref name="nmsThreshold"/>.
    /// </summary>
    /// <param name="detections">Candidates; sorted in place by descending score.</param>
    /// <param name="nmsThreshold">IoU at or above which a candidate is suppressed.</param>
    /// <param name="nmsBuffer">Destination list; cleared, filled and returned.</param>
    /// <returns><paramref name="nmsBuffer"/>, containing the surviving detections.</returns>
    private static List<Detection> ApplyNms(
        List<Detection> detections,
        float nmsThreshold,
        List<Detection> nmsBuffer)
    {
        nmsBuffer.Clear();

        if (detections.Count == 0)
        {
            return nmsBuffer;
        }

        detections.Sort(static (a, b) => b.Score.CompareTo(a.Score));

        for (var i = 0; i < detections.Count; i++)
        {
            var candidate = detections[i];
            var keep = true;

            for (var j = 0; j < nmsBuffer.Count; j++)
            {
                if (IoU(candidate, nmsBuffer[j]) >= nmsThreshold)
                {
                    keep = false;
                    break;
                }
            }

            if (keep)
            {
                nmsBuffer.Add(candidate);
            }
        }

        return nmsBuffer;
    }

    /// <summary>
    /// Intersection-over-union of two boxes; 0 when they do not overlap or the union area is
    /// not positive.
    /// </summary>
    [MethodImpl(MethodImplOptions.AggressiveInlining)]
    private static float IoU(in Detection a, in Detection b)
    {
        var x1 = MathF.Max(a.X, b.X);
        var y1 = MathF.Max(a.Y, b.Y);
        var x2 = MathF.Min(a.X + a.Width, b.X + b.Width);
        var y2 = MathF.Min(a.Y + a.Height, b.Y + b.Height);

        var interW = x2 - x1;
        if (interW <= 0f)
        {
            return 0f;
        }

        var interH = y2 - y1;
        if (interH <= 0f)
        {
            return 0f;
        }

        var interArea = interW * interH;
        var areaA = a.Width * a.Height;
        var areaB = b.Width * b.Height;
        var union = areaA + areaB - interArea;

        if (union <= 0f)
        {
            return 0f;
        }

        return interArea / union;
    }

    /// <summary>
    /// Releases the inference session and the reusable mats.
    /// </summary>
    public void Dispose()
    {
        _session?.Dispose();
        _resizeMat?.Dispose();
        _rgbMat?.Dispose();
    }
}