Compare commits

...

2 Commits

24 changed files with 665 additions and 119 deletions

View File

@ -13,7 +13,7 @@ Splitter pipeline is:
* FFMpeg used to decode video frames into OpenCVSharp.Mat
* One of detectors used:
- For face detection: [opencv_zoo/models/face_detection_yunet at main opencv/opencv_zoo](https://github.com/opencv/opencv_zoo/tree/main/models/face_detection_yunet)
- For body detection: [yolov8s.pt Ultralytics/YOLOv8 at main](https://huggingface.co/Ultralytics/YOLOv8/blob/main/yolov8s.pt)
- For body detection: [THU-MIG/yolov10: YOLOv10: Real-Time End-to-End Object Detection [NeurIPS 2024]](https://github.com/THU-MIG/yolov10/tree/main)
* Camera control applied (CameraController class)
* Final video frames are encoded back to video file using FFMpeg

View File

@ -6,13 +6,17 @@ public class PreviewData
public IReadOnlyList<OpenCvSharp.Rect> DetectedBoxes { get; }
public Rect? CropRect { get; }
public Point2f GravitateTo { get; }
public TimeSpan Position { get; }
public int? Rotate { get; }
public PreviewData(Avalonia.Media.Imaging.Bitmap? frame, IReadOnlyList<OpenCvSharp.Rect> boxes, Rect? crop, Point2f gravitateTo)
public PreviewData(Avalonia.Media.Imaging.Bitmap? frame, IReadOnlyList<OpenCvSharp.Rect> boxes, Rect? crop, Point2f gravitateTo, TimeSpan position, int? rotate)
{
Frame = frame;
DetectedBoxes = boxes;
CropRect = crop;
GravitateTo = gravitateTo;
Position = position;
Rotate = rotate;
}
}

View File

@ -36,16 +36,19 @@ internal sealed class Program
// splitter services
services.AddSingleton<UltraFaceDetector>();
services.AddSingleton<YoloOnnxObjectDetector>();
services.AddSingleton( x => new SingleThreadedDetector<UltraFaceDetector>(x.GetRequiredService<UltraFaceDetector>()) );
services.AddSingleton(x => new SingleThreadedDetector<YoloOnnxObjectDetector>(x.GetRequiredService<YoloOnnxObjectDetector>()));
services.AddSingleton<YoloV10ObjectDetector>();
services.AddSingleton<OSNetEmbeddingExtractor>();
services.AddSingleton<IObjectTracker, ObjectTracker>();
services.AddSingleton(x => new SingleThreadedDetector<UltraFaceDetector>(x.GetRequiredService<UltraFaceDetector>()));
services.AddSingleton(x => new SingleThreadedDetector<YoloV10ObjectDetector>(x.GetRequiredService<YoloV10ObjectDetector>()));
services.AddSingleton(x => new SingleThreadedDetector<DummyDetector>(x.GetRequiredService<DummyDetector>()));
services.AddSingleton<IEmbeddingExtractor>(x => new SingleThreadedEmbeddingExtractor<OSNetEmbeddingExtractor>(x.GetRequiredService<OSNetEmbeddingExtractor>()));
services.AddSingleton<Func<string, IObjectDetector>>( x => detectorName =>
{
return detectorName switch
{
"face" => x.GetRequiredService<SingleThreadedDetector<UltraFaceDetector>>(),
"body" => x.GetRequiredService<SingleThreadedDetector<YoloOnnxObjectDetector>>(),
"body" => x.GetRequiredService<SingleThreadedDetector<YoloV10ObjectDetector>>(),
"none" => x.GetRequiredService<SingleThreadedDetector<DummyDetector>>(),
_ => new DummyDetector()
};

View File

@ -5,7 +5,7 @@ public class SingleThreadedDetector<T>(IObjectDetector _detector) : IObjectDetec
{
private Lock _lock = new();
public List<(OpenCvSharp.Rect box, Point2f center)> DetectAll(SingleTask job, Mat frameCont)
public List<DetectedPerson> DetectAll(SingleTask job, Mat frameCont)
{
lock (_lock)
{
@ -19,3 +19,24 @@ public class SingleThreadedDetector<T>(IObjectDetector _detector) : IObjectDetec
d.Dispose();
}
}
/// <summary>
/// Decorator around an <see cref="IEmbeddingExtractor"/> that lets only one thread at a time use the
/// wrapped extractor.
/// </summary>
/// <typeparam name="T">
/// Concrete extractor type being wrapped. The implementation does not use it; it only makes each closed
/// generic type distinct (for example for dependency-injection registrations).
/// </typeparam>
public class SingleThreadedEmbeddingExtractor<T>(IEmbeddingExtractor _extractor) : IEmbeddingExtractor
    where T : IEmbeddingExtractor
{
    // Never reassigned: readonly guarantees that every caller synchronizes on the same lock object.
    private readonly Lock _lock = new();

    /// <summary>
    /// Forwards to the wrapped extractor while holding the lock.
    /// </summary>
    /// <param name="frame">Frame passed through to the wrapped extractor.</param>
    /// <param name="box">Region passed through to the wrapped extractor.</param>
    /// <returns>Whatever the wrapped extractor returns for this frame and region.</returns>
    public float[] Extract(Mat frame, OpenCvSharp.Rect box)
    {
        lock (_lock)
        {
            return _extractor.Extract(frame, box);
        }
    }

    /// <summary>
    /// Disposes the wrapped extractor.
    /// </summary>
    public void Dispose()
    {
        // Take the same lock as Extract so the wrapped extractor is not torn down
        // while another thread is still running a call on it.
        lock (_lock)
        {
            if (_extractor is IDisposable d)
                d.Dispose();
        }
    }
}

View File

@ -15,7 +15,7 @@ public partial class JobViewModel : ObservableObject
public SingleJob GetJob() => Job;
[ObservableProperty] private VideoInfo? _probe;
[ObservableProperty] private PreviewData? _preview = new(null, [], null, new(0.5f, 0.5f));
[ObservableProperty] private PreviewData? _preview = new(null, [], null, new(0.5f, 0.5f), TimeSpan.Zero, null);
[ObservableProperty] private Bitmap? _thumbnail;
[ObservableProperty] private double _sliderLiveValue;
[ObservableProperty] private double _positionSeconds;
@ -70,7 +70,7 @@ public partial class JobViewModel : ObservableObject
{
if (string.IsNullOrWhiteSpace(value))
{
Job.GravitateTo = null;
Job.GravitateTo = new Point2f(0.5f, 0.5f);
}
else
{
@ -109,6 +109,19 @@ public partial class JobViewModel : ObservableObject
}
}
/// <summary>
/// Detection score threshold stored on the underlying job.
/// Assigning a value that differs from the current one by at least 0.001 stores it on the job,
/// raises the property-changed notification and queues a preview refresh via <c>Task.Run</c>.
/// </summary>
public float ScoreThreshold
{
    get
    {
        return Job.ScoreThreshold;
    }
    set
    {
        var difference = Math.Abs(Job.ScoreThreshold - value);
        if (difference < 0.001)
        {
            // Change is below the tolerance: skip the notification and the preview rebuild.
            return;
        }

        Job.ScoreThreshold = value;
        OnPropertyChanged();
        Task.Run(CreatePreview);
    }
}
public string? Mask
{
get => Job.Mask;
@ -182,10 +195,10 @@ public partial class JobViewModel : ObservableObject
public Point2f GravitateTo
{
get => Job.GravitateTo ?? new Point2f(0.5f, 0.5f);
get => Job.GravitateTo;
set
{
if (Job.GravitateTo != null && Math.Abs(Job.GravitateTo.Value.X - value.X) < 0.001 && Math.Abs(Job.GravitateTo.Value.Y - value.Y) < 0.001)
if (Math.Abs(Job.GravitateTo.X - value.X) < 0.001 && Math.Abs(Job.GravitateTo.Y - value.Y) < 0.001)
return;
Job.GravitateTo = value;
@ -199,6 +212,8 @@ public partial class JobViewModel : ObservableObject
get => Job.DetectAbove;
set
{
if (Math.Abs(Job.DetectAbove - value) < 0.001 )
return;
Job.DetectAbove = value;
OnPropertyChanged();
Task.Run(CreatePreview);
@ -262,11 +277,17 @@ public partial class JobViewModel : ObservableObject
return;
try
{
var frame = await _thumbnails.CreateThumbnailAsync(Job.InputFile, Probe, TimeSpan.FromSeconds(PositionSeconds), Probe.Width, Probe.Height, Job.Rotate);
var pos = TimeSpan.FromSeconds(PositionSeconds);
Bitmap? frame;
if (Preview?.Frame == null || Preview.Position != pos)
frame = await _thumbnails.CreateThumbnailAsync(Job.InputFile, Probe, pos, Probe.Width, Probe.Height, Job.Rotate);
else
frame = Preview.Frame;
if ( frame == null )
return;
Preview = new PreviewData(frame, [], null, Job.GravitateTo ?? new (0.5f, 0.5f));
Preview = new PreviewData(frame, [], null, Job.GravitateTo, pos, Job.Rotate);
var detector = _detectorFactory(Job.Detect ?? "");
var j = new SingleTask
@ -286,7 +307,7 @@ public partial class JobViewModel : ObservableObject
if (detections.Count > 0)
{
var primaryDetection = detections
.OrderByDescending(d => d.box.Height * d.box.Width)
.OrderByDescending(d => d.Box.Height * d.Box.Width)
.FirstOrDefault();
var w = Probe.Width;
@ -295,16 +316,16 @@ public partial class JobViewModel : ObservableObject
var cropWidth = Job.Crop?.width ?? CommandLine.DefaultW;
var cropHeight = Job.Crop?.height ?? CommandLine.DefaultH;
var cx = primaryDetection.center.X - cropWidth / 2f;
var cy = primaryDetection.center.Y - cropHeight / 2f;
var cx = primaryDetection.Center.X - cropWidth / 2f;
var cy = primaryDetection.Center.Y - cropHeight / 2f;
var r = new Rect(cx, cy, cropWidth, cropHeight);
crop = ClampCrop(r, w, h);
}
var boxes = detections.Select(x => x.box).ToList();
Preview = new PreviewData(frame, boxes, crop, Job.GravitateTo ?? new (0.5f, 0.5f));
var boxes = detections.Select(x => x.Box).ToList();
Preview = new PreviewData(frame, boxes, crop, Job.GravitateTo, pos, Job.Rotate);
}
catch (Exception ex)
{

View File

@ -86,6 +86,25 @@ x:DataType="vm:InspectorPaneViewModel">
Width="160"/>
</StackPanel>
<!-- ScoreThreshold -->
<StackPanel Orientation="Horizontal" Spacing="8">
<TextBlock Text="Score Threshold" Width="120"/>
<StackPanel Orientation="Vertical" Spacing="4" Width="260">
<Slider Minimum="0"
Maximum="1"
SmallChange="0.01"
LargeChange="0.1"
TickFrequency="0.05"
IsSnapToTickEnabled="False"
Value="{Binding Selected.ScoreThreshold, Mode=TwoWay}"/>
<TextBlock Text="{Binding Selected.ScoreThreshold, StringFormat='0.00'}"
FontSize="10"
HorizontalAlignment="Right"/>
</StackPanel>
</StackPanel>
<!-- DetectAbove -->
<StackPanel Orientation="Horizontal" Spacing="8">
<TextBlock Text="Detect Above" Width="120"/>

View File

@ -8,7 +8,7 @@
x:DataType="vm:MainViewModel"
x:Name="Root"
Width="1800"
Height="790"
Height="830"
Title="Splitter UI"
Icon="avares://Splitter-UI/Assets/splitter.png">

View File

@ -98,6 +98,14 @@ public sealed class CommandLine
else
Master.DetectAbove = 0.7f;
}
else if (arg.StartsWith("--score-threshold="))
{
var val = arg.Substring("--score-threshold=".Length);
if (float.TryParse(val, NumberStyles.Float, CultureInfo.InvariantCulture, out var scoreThreshold) && scoreThreshold >= 0.0f && scoreThreshold <= 1.0f)
Master.ScoreThreshold = scoreThreshold;
else
Master.ScoreThreshold = 0.25f;
}
else if (arg == "--crop")
{
Master.Crop = ParseCrop("");
@ -224,22 +232,22 @@ public sealed class CommandLine
return key.Length > 0;
}
private static Point2f? ParseGravitate(string value)
private static Point2f ParseGravitate(string value)
{
// Expected format: "<x>:<y>"
var parts = value.Split(':');
if (parts.Length != 2)
return null;
return new Point2f(0.5f, 0.5f);
if (!float.TryParse(parts[0], NumberStyles.Float, CultureInfo.InvariantCulture, out var x))
return null;
return new Point2f(0.5f, 0.5f);
if (!float.TryParse(parts[1], NumberStyles.Float, CultureInfo.InvariantCulture, out var y))
return null;
return new Point2f(0.5f, 0.5f);
// Normalized range check (0.0..1.0)
if (x < 0f || x > 1f || y < 0f || y > 1f)
return null;
return new Point2f(0.5f, 0.5f);
return new Point2f(x, y);
}

View File

@ -64,11 +64,13 @@ public class JobProcessor(ILogger logger) : LoggingBase(logger, 0), IJobProcesso
IObjectDetector detector = job.Detect switch
{
"face" => new UltraFaceDetector(_logger),
"body" => new YoloOnnxObjectDetector(_logger),
"body" => new YoloV10ObjectDetector(_logger),
"none" => new DummyDetector(),
_ => throw new InvalidOperationException($"Unknown detector: {job.Detect}")
};
return new TrackingSplitter(i, detector, job, _logger);
var osnet = new OSNetEmbeddingExtractor();
var tracker = new ObjectTracker(detector, osnet);
return new TrackingSplitter(i, tracker, job, _logger);
};
}
else

View File

@ -29,12 +29,7 @@ public class SingleJob
/// such as left-center (0.2, 0.5) or top-right (0.8, 0.2). This can be useful for
/// videos where the subject tends to be off-center or for creative framing choices.
/// </summary>
public Point2f? GravitateTo { get; set; }
/// <summary>
/// Face or human detectors should only report detections if their upper bound starts below this threshold.
/// This is a value between 0.0 and 1.0 mapped to 0..Height.
/// </summary>
public float DetectAbove { get; set; } = 0.3f;
public Point2f GravitateTo { get; set; } = new Point2f(0.5f, 0.5f);
/// <summary>
/// Destination file mask.
/// </summary>
@ -50,6 +45,15 @@ public class SingleJob
/// </summary>
public string? Detect { get; set; }
/// <summary>
/// Detection confidence threshold. This is a value between 0.0 and 1.0 that sets the minimum confidence
/// score a detection must reach in order to be reported.
/// </summary>
public float ScoreThreshold { get; set; } = 0.25f;
/// <summary>
/// Face or human detectors should only report detections if their upper bound starts below this threshold.
/// This is a value between 0.0 and 1.0 mapped to 0..Height.
/// </summary>
public float DetectAbove { get; set; } = 0.7f;
/// <summary>
/// Set target segment length explicitly. By default, the splitter calculates segment
/// lengths to be equal and not exceed 58 seconds.
/// </summary>

View File

@ -4,24 +4,18 @@ using System.Runtime.InteropServices;
namespace splitter;
public class TrackingSplitter : LoggingBase, ISegmentProcessor, IDisposable
public class TrackingSplitter : LoggingBase, ISegmentProcessor
{
private readonly IObjectDetector _detector;
private readonly IObjectTracker _tracker;
public TrackingSplitter(
int progressLine,
IObjectDetector detector,
IObjectTracker tracker,
SingleJob cmd,
ILogger logger)
: base(logger, progressLine)
{
_detector = detector;
}
public void Dispose()
{
if (_detector is IDisposable d)
d.Dispose();
_tracker = tracker;
}
public async Task ProcessSegment(SingleTask job, CancellationToken token)
@ -103,12 +97,12 @@ public class TrackingSplitter : LoggingBase, ISegmentProcessor, IDisposable
var kalman = new KalmanTracker();
var camera = new CameraController(
videoWidth,
videoHeight,
job.Job.Crop.Value.width,
job.Job.Crop.Value.height,
kalman,
job.Job);
videoWidth,
videoHeight,
job.Job.Crop.Value.width,
job.Job.Crop.Value.height,
kalman,
job.Job);
try
{
@ -130,12 +124,7 @@ public class TrackingSplitter : LoggingBase, ISegmentProcessor, IDisposable
Marshal.Copy(inBuffer, 0, frameMat.Data, inBytes);
var objects = _detector.DetectAll(job, frameMat);
// Ignore detections starting in the lower 1/2 of the frame
objects = objects.Where(o => o.center.Y <= frameMat.Height * job.Job.DetectAbove).ToList();
var primary = SelectTrackedObject(objects, kalman.LastMeasurement);
var (objects, primary) = _tracker.SelectTrackedObject(job, frameMat, kalman.LastMeasurement);
camera.Update(primary);
var roi = camera.Roi;
@ -389,7 +378,7 @@ public class TrackingSplitter : LoggingBase, ISegmentProcessor, IDisposable
private void DrawDebug(
Mat frame,
System.Collections.Generic.List<(Rect box, Point2f center)> objects,
List<DetectedPerson> objects,
CameraController camera,
KalmanTracker kalman)
{
@ -418,52 +407,4 @@ public class TrackingSplitter : LoggingBase, ISegmentProcessor, IDisposable
HersheyFonts.HersheySimplex, 0.6, color, 2);
}
private (Rect box, Point2f center)? SelectTrackedObject(
List<(Rect box, Point2f center)> foundObjects,
Point2f? previousCenter)
{
if (foundObjects == null || foundObjects.Count == 0)
return null;
if (!previousCenter.HasValue)
{
var bestIndex = 0;
var bestArea = float.MinValue;
for (var i = 0; i < foundObjects.Count; i++)
{
var f = foundObjects[i];
var area = f.box.Width * f.box.Height;
if (area > bestArea)
{
bestArea = area;
bestIndex = i;
}
}
return foundObjects[bestIndex];
}
else
{
var prev = previousCenter.Value;
var bestIndex = 0;
var bestDist2 = float.MaxValue;
for (var i = 0; i < foundObjects.Count; i++)
{
var f = foundObjects[i];
var dx = f.center.X - prev.X;
var dy = f.center.Y - prev.Y;
var d2 = dx * dx + dy * dy;
if (d2 < bestDist2)
{
bestDist2 = d2;
bestIndex = i;
}
}
return foundObjects[bestIndex];
}
}
}

View File

@ -58,7 +58,7 @@ public sealed class CameraController
_kalman.Reset(_cameraCenter);
}
private Point2f DefaultCenter => _cmd.GravitateTo ?? new Point2f(_videoWidth / 2f, _videoHeight / 2f);
private Point2f DefaultCenter => _cmd.GravitateTo;
public int LostFrames => _lostFrames;
public Point2f CameraCenter => _cameraCenter;
@ -68,15 +68,15 @@ public sealed class CameraController
public Point2f? ObjectCenter => _objectCenter;
public Rect Roi => _roi;
public void Update((Rect box, Point2f center)? primary)
public void Update(DetectedPerson? primary)
{
Rect? objectBox = null;
Point2f? objectCenter = null;
if (primary.HasValue)
{
objectCenter = primary.Value.center;
objectBox = primary.Value.box;
objectCenter = primary.Value.Center;
objectBox = primary.Value.Box;
}
// ---------------------------------------------------------

View File

@ -0,0 +1,8 @@
namespace splitter.algo;
/// <summary>
/// One detection reported by an object detector, optionally tagged with an identifier by the tracker.
/// This is a mutable struct: assigning to a field of a copy does not change the original, so callers
/// that modify an element of a list have to write the modified copy back.
/// </summary>
public struct DetectedPerson
{
    /// <summary>
    /// Identifier of the detection. Detectors leave it at its default value; ObjectTracker fills it
    /// with a hash of the appearance embedding computed for <see cref="Box"/>.
    /// </summary>
    public ulong Id;
    /// <summary>
    /// Bounding box of the detection. NOTE(review): presumably in pixel coordinates of the analyzed frame - confirm per detector.
    /// </summary>
    public Rect Box;
    /// <summary>
    /// Center point of the detection; the detectors compute it as the midpoint of <see cref="Box"/>.
    /// </summary>
    public Point2f Center;
}

View File

@ -2,19 +2,19 @@
public sealed class DummyDetector : IObjectDetector
{
public List<(Rect box, Point2f center)> DetectAll(SingleTask job, Mat frameCont)
public List<DetectedPerson> DetectAll(SingleTask job, Mat frameCont)
{
var h = job.Info.Height;
var w = job.Info.Width;
var c = job.Job.GravitateTo ?? new Point2f(0.5f, 0.5f);
var c = job.Job.GravitateTo;
var x = (int)(c.X * w);
var y = (int)(c.Y * h);
var center = new Point2f(x, y);
var rect = new Rect(x - 1, y - 1, 2, 2);
return [(rect, center)];
return [new DetectedPerson { Box = rect, Center = center }];
}
public void Dispose() {}

View File

@ -0,0 +1,6 @@
namespace splitter.algo;
/// <summary>
/// Produces a numeric embedding vector for a rectangular region of a frame.
/// </summary>
public interface IEmbeddingExtractor : IDisposable
{
    /// <summary>
    /// Computes the embedding of the region <paramref name="box"/> inside <paramref name="frame"/>.
    /// </summary>
    /// <param name="frame">Frame that contains the region.</param>
    /// <param name="box">Region of the frame to describe. NOTE(review): presumably it must lie inside the frame - confirm with implementations.</param>
    /// <returns>
    /// The embedding vector. NOTE(review): implementations may return a buffer that is reused between
    /// calls (OSNetEmbeddingExtractor does) - copy it if it has to outlive the next call.
    /// </returns>
    float[] Extract(Mat frame, Rect box);
}

View File

@ -2,5 +2,5 @@
public interface IObjectDetector : IDisposable
{
List<(Rect box, Point2f center)> DetectAll(SingleTask job, Mat frameCont);
List<DetectedPerson> DetectAll(SingleTask job, Mat frameCont);
}

View File

@ -0,0 +1,6 @@
namespace splitter.algo;
/// <summary>
/// Detects objects in a frame and chooses the one that should be followed.
/// </summary>
public interface IObjectTracker
{
    /// <summary>
    /// Analyzes <paramref name="frameMat"/> and selects the primary object.
    /// </summary>
    /// <param name="job">Task whose settings are used for the detection.</param>
    /// <param name="frameMat">Frame to analyze.</param>
    /// <param name="lastMeasurement">
    /// Last measured position of the followed object, or <c>null</c> when there is none.
    /// </param>
    /// <returns>
    /// A tuple of all detections that were taken into account and the selected primary detection;
    /// the primary detection is <c>null</c> when nothing was selected.
    /// </returns>
    (List<DetectedPerson>, DetectedPerson?) SelectTrackedObject(SingleTask job, Mat frameMat, Point2f? lastMeasurement);
}

View File

@ -0,0 +1,127 @@
using System.Runtime.CompilerServices;
using Microsoft.ML.OnnxRuntime;
using Microsoft.ML.OnnxRuntime.Tensors;
namespace splitter.algo;
/// <summary>
/// Computes appearance embeddings for image regions with the OSNet ONNX model
/// (<c>models/osnet_x0_25_msmt17.onnx</c> below the application base directory), executed through
/// ONNX Runtime with the DirectML execution provider.
/// </summary>
/// <remarks>
/// The input tensor, the scratch Mats and the output array are allocated once and reused by every call.
/// The instance therefore must not be used concurrently, and the array returned by
/// <see cref="Extract"/> is overwritten by the next call.
/// </remarks>
public sealed class OSNetEmbeddingExtractor : IDisposable, IEmbeddingExtractor
{
    private readonly InferenceSession _session;
    private readonly string _inputName;
    private readonly string _outputName;

    // Input tensor layout is [batch, channels, height, width]; only batch slot 0 is ever filled.
    private const int _batchSize = 16;
    private const int _inputHeight = 256;
    private const int _inputWidth = 128;
    private const int _channels = 3;

    private readonly float[] _inputBuffer;
    private readonly DenseTensor<float> _inputTensor;
    private readonly List<NamedOnnxValue> _inputs = new(1);
    private readonly float[] _embedding;
    private readonly Mat _resizeMat = new();
    private readonly Mat _rgbMat = new();
    private readonly float _inv255 = 1f / 255f;

    /// <summary>
    /// Loads the model and allocates the reusable input and output buffers.
    /// </summary>
    public OSNetEmbeddingExtractor()
    {
        var opt = new SessionOptions();
        opt.AppendExecutionProvider_DML();
        var modelPath = Path.Combine(AppDomain.CurrentDomain.BaseDirectory, "models", "osnet_x0_25_msmt17.onnx");
        _session = new InferenceSession(modelPath, opt);
        _inputName = _session.InputMetadata.Keys.First();
        _outputName = _session.OutputMetadata.Keys.First();

        int inputSize = _batchSize * _channels * _inputHeight * _inputWidth;
        _inputBuffer = new float[inputSize];
        _inputTensor = new DenseTensor<float>(
            _inputBuffer,
            new[] { _batchSize, _channels, _inputHeight, _inputWidth }
        );
        _inputs.Add(NamedOnnxValue.CreateFromTensor(_inputName, _inputTensor));

        // Second output dimension is the embedding length.
        int outDim = _session.OutputMetadata[_outputName].Dimensions[1];
        _embedding = new float[outDim];
    }

    /// <summary>
    /// Computes the L2-normalized embedding of the region <paramref name="box"/> of <paramref name="frame"/>.
    /// </summary>
    /// <param name="frame">Source frame; converted with BGR2RGB, so it is expected to be a BGR image.</param>
    /// <param name="box">Region of <paramref name="frame"/> to describe.</param>
    /// <returns>
    /// The embedding of the region. The returned array is an internal buffer that the next call
    /// overwrites; copy it if it has to be kept.
    /// </returns>
    public float[] Extract(Mat frame, Rect box)
    {
        // No buffer clearing is needed here: FillBatch0 overwrites every element of batch slot 0 and
        // the remaining slots are never written, so they keep their initial zeros.

        // The ROI is only a header onto the frame data; dispose it so the native header is released
        // immediately instead of waiting for finalization.
        using var roi = new Mat(frame, box);
        Cv2.Resize(roi, _resizeMat, new Size(_inputWidth, _inputHeight));
        Cv2.CvtColor(_resizeMat, _rgbMat, ColorConversionCodes.BGR2RGB);
        FillBatch0(_rgbMat);

        using var results = _session.Run(_inputs);
        var output = results.First(v => v.Name == _outputName).AsTensor<float>();

        // Read the embedding that belongs to batch slot 0.
        for (int i = 0; i < _embedding.Length; i++)
            _embedding[i] = output[0, i];

        NormalizeL2(_embedding);
        return _embedding;
    }

    /// <summary>
    /// Writes the interleaved RGB pixels of <paramref name="rgb"/> into batch slot 0 of the input
    /// buffer as three consecutive planes (R, G, B) scaled to the range 0..1.
    /// </summary>
    [MethodImpl(MethodImplOptions.AggressiveInlining)]
    private void FillBatch0(Mat rgb)
    {
        int plane = _inputHeight * _inputWidth;
        unsafe
        {
            for (int y = 0; y < _inputHeight; y++)
            {
                var rowPtr = (byte*)rgb.Ptr(y).ToPointer();
                var rowSpan = new Span<byte>(rowPtr, _inputWidth * 3);
                int src = 0;
                for (int x = 0; x < _inputWidth; x++)
                {
                    int off = y * _inputWidth + x;
                    _inputBuffer[off] = rowSpan[src + 0] * _inv255; // R
                    _inputBuffer[plane + off] = rowSpan[src + 1] * _inv255; // G
                    _inputBuffer[2 * plane + off] = rowSpan[src + 2] * _inv255; // B
                    src += 3;
                }
            }
        }
    }

    /// <summary>
    /// Scales <paramref name="v"/> in place to unit Euclidean length. A vector whose squared length is
    /// not positive is left unchanged, because dividing by a zero norm would fill it with NaN/Infinity.
    /// </summary>
    [MethodImpl(MethodImplOptions.AggressiveInlining)]
    private static void NormalizeL2(float[] v)
    {
        float sum = 0f;
        for (int i = 0; i < v.Length; i++)
            sum += v[i] * v[i];

        if (sum <= 0f)
            return;

        float inv = 1f / MathF.Sqrt(sum);
        for (int i = 0; i < v.Length; i++)
            v[i] *= inv;
    }

    /// <summary>
    /// Releases the inference session and the scratch Mats.
    /// </summary>
    public void Dispose()
    {
        _session?.Dispose();
        _resizeMat?.Dispose();
        _rgbMat?.Dispose();
    }
}

View File

@ -0,0 +1,98 @@
namespace splitter.algo;
/// <summary>
/// Runs the detector on a frame, tags every remaining detection with an identifier derived from its
/// appearance embedding and picks the detection that should be followed.
/// </summary>
public class ObjectTracker(IObjectDetector _detector, IEmbeddingExtractor _embeddingExtractor) : IObjectTracker
{
    /// <summary>
    /// Detects objects in <paramref name="frameMat"/> and selects the primary one.
    /// </summary>
    /// <param name="job">Task whose job settings (such as <c>DetectAbove</c>) drive the detection.</param>
    /// <param name="frameMat">Frame to analyze.</param>
    /// <param name="lastMeasurement">
    /// Last measured center of the followed object, or <c>null</c> when there is none.
    /// </param>
    /// <returns>
    /// The filtered detections (a new list owned by the caller) and the primary detection, which is
    /// <c>null</c> when no detection passed the filter.
    /// </returns>
    public (List<DetectedPerson> /*objects*/, DetectedPerson? /*primary*/) SelectTrackedObject(SingleTask job, Mat frameMat, Point2f? lastMeasurement)
    {
        var detected = _detector.DetectAll(job, frameMat) ?? [];

        // Keep only detections whose center Y does not exceed DetectAbove * frame height.
        // ToList also copies the detections, so the list returned by the detector is not modified.
        var objects = detected.Where(o => o.Center.Y <= frameMat.Height * job.Job.DetectAbove).ToList();

        // Attach an embedding-based ID to every remaining detection.
        for (int i = 0; i < objects.Count; i++)
        {
            var p = objects[i]; // DetectedPerson is a struct: this is a copy
            var rect = ClipToFrame(p.Box, frameMat.Width, frameMat.Height);
            var embedding = _embeddingExtractor.Extract(frameMat, rect);
            p.Id = HashEmbedding(embedding);
            objects[i] = p; // write the modified copy back
        }

        var primary = SelectPrimaryObject(objects, lastMeasurement);
        return (objects, primary);
    }

    /// <summary>
    /// Restricts <paramref name="box"/> to the frame. The right and bottom edges are taken from the
    /// original box before the origin is clamped, so a box that starts outside the frame is cut off
    /// rather than shifted into it. The result is always at least 1x1 pixel.
    /// </summary>
    private static Rect ClipToFrame(Rect box, int frameWidth, int frameHeight)
    {
        var right = box.X + box.Width;
        var bottom = box.Y + box.Height;
        var x = Math.Clamp(box.X, 0, frameWidth - 1);
        var y = Math.Clamp(box.Y, 0, frameHeight - 1);
        var width = Math.Clamp(right - x, 1, frameWidth - x);
        var height = Math.Clamp(bottom - y, 1, frameHeight - y);
        return new Rect(x, y, width, height);
    }

    /// <summary>
    /// Folds the raw bit patterns of all embedding components into a 64-bit value
    /// (multiply-then-xor per component). Two embeddings get the same value only if they are
    /// bit-identical or collide; nearby but different embeddings produce unrelated values.
    /// </summary>
    private static ulong HashEmbedding(float[] emb)
    {
        unchecked
        {
            ulong hash = 146527;
            for (int i = 0; i < emb.Length; i++)
            {
                // Reinterpret the float as its 32 raw bits.
                uint bits = (uint)BitConverter.SingleToInt32Bits(emb[i]);
                hash = (hash * 16777619) ^ bits;
            }
            return hash;
        }
    }

    /// <summary>
    /// Chooses the detection to follow: the first one with the largest box area when there is no
    /// previous center, otherwise the first one whose center is closest to the previous center.
    /// </summary>
    /// <returns>The chosen detection, or <c>null</c> when the list is null or empty.</returns>
    private DetectedPerson? SelectPrimaryObject(
        List<DetectedPerson> foundObjects,
        Point2f? previousCenter)
    {
        if (foundObjects == null || foundObjects.Count == 0)
            return null;

        if (!previousCenter.HasValue)
        {
            var bestIndex = 0;
            var bestArea = float.MinValue;
            for (var i = 0; i < foundObjects.Count; i++)
            {
                var f = foundObjects[i];
                var area = f.Box.Width * f.Box.Height;
                if (area > bestArea)
                {
                    bestArea = area;
                    bestIndex = i;
                }
            }
            return foundObjects[bestIndex];
        }
        else
        {
            var prev = previousCenter.Value;
            var bestIndex = 0;
            var bestDist2 = float.MaxValue;
            for (var i = 0; i < foundObjects.Count; i++)
            {
                var f = foundObjects[i];
                var dx = f.Center.X - prev.X;
                var dy = f.Center.Y - prev.Y;
                // Squared distance is enough for comparing candidates.
                var d2 = dx * dx + dy * dy;
                if (d2 < bestDist2)
                {
                    bestDist2 = d2;
                    bestIndex = i;
                }
            }
            return foundObjects[bestIndex];
        }
    }
}

View File

@ -23,14 +23,14 @@ public sealed class UltraFaceDetector: LoggingBase, IDisposable, IObjectDetector
_ultraFace = UltraFace.Create(param);
}
public List<(Rect box, Point2f center)> DetectAll(SingleTask job, Mat frameCont)
public List<DetectedPerson> DetectAll(SingleTask job, Mat frameCont)
{
// Convert to byte[] for UltraFace
var bytesFull = frameCont.Rows * frameCont.Cols * frameCont.ElemSize();
var bgr = new byte[bytesFull];
Marshal.Copy(frameCont.Data, bgr, 0, bytesFull);
var results = new List<(Rect box, Point2f center)>();
var results = new List<DetectedPerson>();
if (bgr == null || bgr.Length == 0)
return results;
@ -69,7 +69,7 @@ public sealed class UltraFaceDetector: LoggingBase, IDisposable, IObjectDetector
rect.X + rect.Width / 2f,
rect.Y + rect.Height / 2f);
results.Add((rect, center));
results.Add(new DetectedPerson{ Box = rect, Center = center });
}
}
}

View File

@ -0,0 +1,278 @@
using System.Runtime.CompilerServices;
using Microsoft.ML.OnnxRuntime;
using Microsoft.ML.OnnxRuntime.Tensors;
namespace splitter.algo;
/// <summary>
/// Person detector backed by the YOLOv10 ONNX model (<c>models/yolov10m.onnx</c> below the application
/// base directory), executed through ONNX Runtime with the DirectML execution provider.
/// </summary>
/// <remarks>
/// All scratch buffers, including the list returned by <see cref="DetectAll"/>, are allocated once and
/// reused by every call. The instance therefore must not be used concurrently, and callers that need
/// the detections after the next call have to copy them.
/// </remarks>
public sealed class YoloV10ObjectDetector : LoggingBase, IObjectDetector, IDisposable
{
    private readonly InferenceSession _session;
    private readonly string _inputName;
    private readonly string _outputName;

    // Size of the image fed to the network; output coordinates are expressed in this space.
    private const int _inputWidth = 640;
    private const int _inputHeight = 640;
    private const float _nmsThreshold = 0.45f;
    private const int _personClassIndex = 0;

    private readonly Mat _resizeMat = new();
    private readonly Mat _rgbMat = new();
    private readonly float[] _inputBuffer;
    private readonly DenseTensor<float> _inputTensor;
    private readonly List<NamedOnnxValue> _inputs = new(1);
    private readonly List<Detection> _detections = new(256);
    private readonly List<Detection> _nmsBuffer = new(256);
    private readonly List<DetectedPerson> _results = new(64);
    private readonly float _inv255 = 1f / 255f;

    /// <summary>
    /// Candidate box (left, top, width, height in source-frame pixels) together with its score.
    /// </summary>
    private readonly struct Detection
    {
        public readonly float X;
        public readonly float Y;
        public readonly float Width;
        public readonly float Height;
        public readonly float Score;

        public Detection(float x, float y, float w, float h, float score)
        {
            X = x;
            Y = y;
            Width = w;
            Height = h;
            Score = score;
        }
    }

    /// <summary>
    /// Loads the model and allocates the reusable input tensor.
    /// </summary>
    /// <param name="logger">Logger handed to the <c>LoggingBase</c> base class.</param>
    public YoloV10ObjectDetector(ILogger logger) : base(logger, -1)
    {
        var options = new SessionOptions();
        options.AppendExecutionProvider_DML();
        var basePath = AppDomain.CurrentDomain.BaseDirectory;
        var modelPath = Path.Combine(basePath, "models", "yolov10m.onnx");
        _session = new InferenceSession(modelPath, options);
        _inputName = _session.InputMetadata.Keys.First();
        _outputName = _session.OutputMetadata.Keys.First();
        _inputBuffer = new float[1 * 3 * _inputHeight * _inputWidth];
        _inputTensor = new DenseTensor<float>(_inputBuffer, new[] { 1, 3, _inputHeight, _inputWidth });
        _inputs.Add(NamedOnnxValue.CreateFromTensor(_inputName, _inputTensor));
    }

    /// <summary>
    /// Detects persons in <paramref name="frameCont"/>.
    /// </summary>
    /// <param name="job">Task whose <c>Job.ScoreThreshold</c> is the minimum score of a reported detection.</param>
    /// <param name="frameCont">Frame to analyze; converted with BGR2RGB, so it is expected to be a BGR image.</param>
    /// <returns>
    /// Detections clipped to the frame. The list is an internal buffer that the next call clears and
    /// refills; it is empty when the frame is empty or the expected output is missing.
    /// </returns>
    public List<DetectedPerson> DetectAll(SingleTask job, Mat frameCont)
    {
        if (frameCont.Empty())
        {
            _results.Clear();
            return _results;
        }

        Cv2.Resize(frameCont, _resizeMat, new Size(_inputWidth, _inputHeight));
        Cv2.CvtColor(_resizeMat, _rgbMat, ColorConversionCodes.BGR2RGB);
        FillInputTensor(_rgbMat);

        using var results = _session.Run(_inputs);
        Tensor<float>? output = null;
        foreach (var r in results)
        {
            if (r.Name == _outputName)
            {
                output = r.AsTensor<float>();
                break;
            }
        }

        if (output is null)
        {
            _results.Clear();
            return _results;
        }

        ParseYoloV10(
            output,
            frameCont.Width,
            frameCont.Height,
            job.Job.ScoreThreshold,
            _personClassIndex,
            _detections);

        var final = ApplyNms(_detections, _nmsThreshold, _nmsBuffer);

        _results.Clear();
        for (var i = 0; i < final.Count; i++)
        {
            var d = final[i];
            var left = (int)d.X;
            var top = (int)d.Y;
            // Remember the far edges before clamping the origin, so a box that starts outside the
            // frame is cut off at the border instead of being shifted into the frame.
            var right = left + (int)d.Width;
            var bottom = top + (int)d.Height;

            var x = Math.Clamp(left, 0, frameCont.Width - 1);
            var y = Math.Clamp(top, 0, frameCont.Height - 1);
            var w = Math.Clamp(right - x, 1, frameCont.Width - x);
            var h = Math.Clamp(bottom - y, 1, frameCont.Height - y);

            var rect = new Rect(x, y, w, h);
            var center = new Point2f(x + w / 2f, y + h / 2f);
            _results.Add(new DetectedPerson{ Box = rect, Center = center });
        }

        return _results;
    }

    /// <summary>
    /// Writes the interleaved RGB pixels of <paramref name="rgb"/> into the input buffer as three
    /// consecutive planes (R, G, B) scaled to the range 0..1.
    /// </summary>
    [MethodImpl(MethodImplOptions.AggressiveInlining)]
    private void FillInputTensor(Mat rgb)
    {
        var height = _inputHeight;
        var width = _inputWidth;
        var planeSize = height * width;
        Span<float> dst = _inputBuffer.AsSpan();
        unsafe
        {
            for (var y = 0; y < height; y++)
            {
                var rowPtr = (byte*)rgb.Ptr(y).ToPointer();
                var rowSpan = new Span<byte>(rowPtr, width * 3);
                var srcIndex = 0;
                for (var x = 0; x < width; x++)
                {
                    var r = rowSpan[srcIndex + 0];
                    var g = rowSpan[srcIndex + 1];
                    var b = rowSpan[srcIndex + 2];
                    var offset = y * width + x;
                    dst[offset] = r * _inv255;
                    dst[planeSize + offset] = g * _inv255;
                    dst[2 * planeSize + offset] = b * _inv255;
                    srcIndex += 3;
                }
            }
        }
    }

    /// <summary>
    /// Converts the raw network output into candidate boxes in source-frame pixels.
    /// Expected output layout: [1, N, 6] with rows of x1, y1, x2, y2, score, class_id
    /// (N is 300 for the model this was written for).
    /// </summary>
    /// <param name="output">Raw output tensor.</param>
    /// <param name="originalWidth">Width of the source frame in pixels.</param>
    /// <param name="originalHeight">Height of the source frame in pixels.</param>
    /// <param name="scoreThreshold">Rows with a lower score are skipped.</param>
    /// <param name="classIndex">Only rows of this class are kept.</param>
    /// <param name="detections">Destination list; cleared before it is filled.</param>
    private static void ParseYoloV10(
        Tensor<float> output,
        int originalWidth,
        int originalHeight,
        float scoreThreshold,
        int classIndex,
        List<Detection> detections)
    {
        detections.Clear();

        var count = output.Dimensions[1];

        // Box coordinates are relative to the network input size; derive the scale from the same
        // constants that size the input tensor so the two cannot drift apart.
        var xScale = (float)originalWidth / _inputWidth;
        var yScale = (float)originalHeight / _inputHeight;

        for (var i = 0; i < count; i++)
        {
            var x1 = output[0, i, 0];
            var y1 = output[0, i, 1];
            var x2 = output[0, i, 2];
            var y2 = output[0, i, 3];
            var score = output[0, i, 4];
            var cls = (int)output[0, i, 5];

            if (cls != classIndex)
                continue;
            if (score < scoreThreshold)
                continue;

            var left = x1 * xScale;
            var top = y1 * yScale;
            var width = (x2 - x1) * xScale;
            var height = (y2 - y1) * yScale;
            detections.Add(new Detection(left, top, width, height, score));
        }
    }

    /// <summary>
    /// Greedy non-maximum suppression: visits candidates by descending score and keeps one only if its
    /// IoU with every already kept candidate is below <paramref name="nmsThreshold"/>.
    /// </summary>
    /// <param name="detections">Candidates; sorted in place by descending score.</param>
    /// <param name="nmsThreshold">IoU at or above which a candidate is dropped.</param>
    /// <param name="nmsBuffer">Destination list; cleared, filled and returned.</param>
    /// <returns><paramref name="nmsBuffer"/> holding the kept candidates.</returns>
    private static List<Detection> ApplyNms(
        List<Detection> detections,
        float nmsThreshold,
        List<Detection> nmsBuffer)
    {
        nmsBuffer.Clear();
        if (detections.Count == 0)
            return nmsBuffer;

        detections.Sort(static (a, b) => b.Score.CompareTo(a.Score));

        for (var i = 0; i < detections.Count; i++)
        {
            var candidate = detections[i];
            var keep = true;
            for (var j = 0; j < nmsBuffer.Count; j++)
            {
                if (IoU(candidate, nmsBuffer[j]) >= nmsThreshold)
                {
                    keep = false;
                    break;
                }
            }

            if (keep)
                nmsBuffer.Add(candidate);
        }

        return nmsBuffer;
    }

    /// <summary>
    /// Intersection-over-union of two boxes; 0 when they do not overlap or the union area is not positive.
    /// </summary>
    [MethodImpl(MethodImplOptions.AggressiveInlining)]
    private static float IoU(in Detection a, in Detection b)
    {
        var x1 = MathF.Max(a.X, b.X);
        var y1 = MathF.Max(a.Y, b.Y);
        var x2 = MathF.Min(a.X + a.Width, b.X + b.Width);
        var y2 = MathF.Min(a.Y + a.Height, b.Y + b.Height);

        var interW = x2 - x1;
        if (interW <= 0f) return 0f;
        var interH = y2 - y1;
        if (interH <= 0f) return 0f;

        var interArea = interW * interH;
        var areaA = a.Width * a.Height;
        var areaB = b.Width * b.Height;
        var union = areaA + areaB - interArea;
        if (union <= 0f) return 0f;

        return interArea / union;
    }

    /// <summary>
    /// Releases the inference session and the scratch Mats.
    /// </summary>
    public void Dispose()
    {
        _session?.Dispose();
        _resizeMat?.Dispose();
        _rgbMat?.Dispose();
    }
}

View File

@ -4,7 +4,7 @@ using Microsoft.ML.OnnxRuntime.Tensors;
namespace splitter.algo;
public sealed class YoloOnnxObjectDetector : LoggingBase, IObjectDetector, IDisposable
public sealed class YoloV8ObjectDetector : LoggingBase, IObjectDetector, IDisposable
{
private readonly InferenceSession _session;
private readonly string _inputName;
@ -32,7 +32,7 @@ public sealed class YoloOnnxObjectDetector : LoggingBase, IObjectDetector, IDisp
private readonly List<Detection> _nmsBuffer = new(256);
// Reusable result list
private readonly List<(Rect box, Point2f center)> _results = new(64);
private readonly List<DetectedPerson> _results = new(64);
private readonly float _inv255 = 1f / 255f;
@ -54,7 +54,7 @@ public sealed class YoloOnnxObjectDetector : LoggingBase, IObjectDetector, IDisp
}
}
public YoloOnnxObjectDetector(ILogger logger) : base(logger, -1)
public YoloV8ObjectDetector(ILogger logger) : base(logger, -1)
{
var options = new SessionOptions();
options.AppendExecutionProvider_DML();
@ -78,7 +78,7 @@ public sealed class YoloOnnxObjectDetector : LoggingBase, IObjectDetector, IDisp
_inputs.Add(NamedOnnxValue.CreateFromTensor(_inputName, _inputTensor));
}
public List<(Rect box, Point2f center)> DetectAll(SingleTask job, Mat frameCont)
public List<DetectedPerson> DetectAll(SingleTask job, Mat frameCont)
{
if (frameCont.Empty())
{
@ -142,7 +142,7 @@ public sealed class YoloOnnxObjectDetector : LoggingBase, IObjectDetector, IDisp
var rect = new Rect(x, y, w, h);
var center = new Point2f(x + w / 2f, y + h / 2f);
_results.Add((rect, center));
_results.Add(new DetectedPerson{ Box = rect, Center = center });
}
return _results;

Binary file not shown.