Switched body detection model to yolov10m.

This commit is contained in:
Alexander Shabarshov 2026-06-08 12:45:35 +01:00
parent d3c82ce924
commit 78c9713425
13 changed files with 362 additions and 28 deletions

View File

@ -13,7 +13,7 @@ Splitter pipeline is:
* FFMpeg used to decode video frames into OpenCVSharp.Mat * FFMpeg used to decode video frames into OpenCVSharp.Mat
* One of detectors used: * One of detectors used:
- For face detection: [opencv_zoo/models/face_detection_yunet at main opencv/opencv_zoo](https://github.com/opencv/opencv_zoo/tree/main/models/face_detection_yunet) - For face detection: [opencv_zoo/models/face_detection_yunet at main opencv/opencv_zoo](https://github.com/opencv/opencv_zoo/tree/main/models/face_detection_yunet)
- For body detection: [yolov8s.pt Ultralytics/YOLOv8 at main](https://huggingface.co/Ultralytics/YOLOv8/blob/main/yolov8s.pt) - For body detection: [THU-MIG/yolov10: YOLOv10: Real-Time End-to-End Object Detection [NeurIPS 2024]](https://github.com/THU-MIG/yolov10/tree/main)
* Camera control applied (CameraControl class) * Camera control applied (CameraControl class)
* Final video frames are encoded back to video file using FFMpeg * Final video frames are encoded back to video file using FFMpeg

View File

@ -6,13 +6,17 @@ public class PreviewData
public IReadOnlyList<OpenCvSharp.Rect> DetectedBoxes { get; } public IReadOnlyList<OpenCvSharp.Rect> DetectedBoxes { get; }
public Rect? CropRect { get; } public Rect? CropRect { get; }
public Point2f GravitateTo { get; } public Point2f GravitateTo { get; }
public TimeSpan Position { get; }
public int? Rotate { get; }
public PreviewData(Avalonia.Media.Imaging.Bitmap? frame, IReadOnlyList<OpenCvSharp.Rect> boxes, Rect? crop, Point2f gravitateTo) public PreviewData(Avalonia.Media.Imaging.Bitmap? frame, IReadOnlyList<OpenCvSharp.Rect> boxes, Rect? crop, Point2f gravitateTo, TimeSpan position, int? rotate)
{ {
Frame = frame; Frame = frame;
DetectedBoxes = boxes; DetectedBoxes = boxes;
CropRect = crop; CropRect = crop;
GravitateTo = gravitateTo; GravitateTo = gravitateTo;
Position = position;
Rotate = rotate;
} }
} }

View File

@ -36,16 +36,16 @@ internal sealed class Program
// splitter services // splitter services
services.AddSingleton<UltraFaceDetector>(); services.AddSingleton<UltraFaceDetector>();
services.AddSingleton<YoloOnnxObjectDetector>(); services.AddSingleton<YoloV10ObjectDetector>();
services.AddSingleton( x => new SingleThreadedDetector<UltraFaceDetector>(x.GetRequiredService<UltraFaceDetector>()) ); services.AddSingleton( x => new SingleThreadedDetector<UltraFaceDetector>(x.GetRequiredService<UltraFaceDetector>()) );
services.AddSingleton(x => new SingleThreadedDetector<YoloOnnxObjectDetector>(x.GetRequiredService<YoloOnnxObjectDetector>())); services.AddSingleton(x => new SingleThreadedDetector<YoloV10ObjectDetector>(x.GetRequiredService<YoloV10ObjectDetector>()));
services.AddSingleton(x => new SingleThreadedDetector<DummyDetector>(x.GetRequiredService<DummyDetector>())); services.AddSingleton(x => new SingleThreadedDetector<DummyDetector>(x.GetRequiredService<DummyDetector>()));
services.AddSingleton<Func<string, IObjectDetector>>( x => detectorName => services.AddSingleton<Func<string, IObjectDetector>>( x => detectorName =>
{ {
return detectorName switch return detectorName switch
{ {
"face" => x.GetRequiredService<SingleThreadedDetector<UltraFaceDetector>>(), "face" => x.GetRequiredService<SingleThreadedDetector<UltraFaceDetector>>(),
"body" => x.GetRequiredService<SingleThreadedDetector<YoloOnnxObjectDetector>>(), "body" => x.GetRequiredService<SingleThreadedDetector<YoloV10ObjectDetector>>(),
"none" => x.GetRequiredService<SingleThreadedDetector<DummyDetector>>(), "none" => x.GetRequiredService<SingleThreadedDetector<DummyDetector>>(),
_ => new DummyDetector() _ => new DummyDetector()
}; };

View File

@ -15,7 +15,7 @@ public partial class JobViewModel : ObservableObject
public SingleJob GetJob() => Job; public SingleJob GetJob() => Job;
[ObservableProperty] private VideoInfo? _probe; [ObservableProperty] private VideoInfo? _probe;
[ObservableProperty] private PreviewData? _preview = new(null, [], null, new(0.5f, 0.5f)); [ObservableProperty] private PreviewData? _preview = new(null, [], null, new(0.5f, 0.5f), TimeSpan.Zero, null);
[ObservableProperty] private Bitmap? _thumbnail; [ObservableProperty] private Bitmap? _thumbnail;
[ObservableProperty] private double _sliderLiveValue; [ObservableProperty] private double _sliderLiveValue;
[ObservableProperty] private double _positionSeconds; [ObservableProperty] private double _positionSeconds;
@ -70,7 +70,7 @@ public partial class JobViewModel : ObservableObject
{ {
if (string.IsNullOrWhiteSpace(value)) if (string.IsNullOrWhiteSpace(value))
{ {
Job.GravitateTo = null; Job.GravitateTo = new Point2f(0.5f, 0.5f);
} }
else else
{ {
@ -109,6 +109,19 @@ public partial class JobViewModel : ObservableObject
} }
} }
public float ScoreThreshold
{
get => Job.ScoreThreshold;
set
{
if (Math.Abs(Job.ScoreThreshold - value) < 0.001)
return;
Job.ScoreThreshold = value;
OnPropertyChanged();
Task.Run(CreatePreview);
}
}
public string? Mask public string? Mask
{ {
get => Job.Mask; get => Job.Mask;
@ -182,10 +195,10 @@ public partial class JobViewModel : ObservableObject
public Point2f GravitateTo public Point2f GravitateTo
{ {
get => Job.GravitateTo ?? new Point2f(0.5f, 0.5f); get => Job.GravitateTo;
set set
{ {
if (Job.GravitateTo != null && Math.Abs(Job.GravitateTo.Value.X - value.X) < 0.001 && Math.Abs(Job.GravitateTo.Value.Y - value.Y) < 0.001) if (Math.Abs(Job.GravitateTo.X - value.X) < 0.001 && Math.Abs(Job.GravitateTo.Y - value.Y) < 0.001)
return; return;
Job.GravitateTo = value; Job.GravitateTo = value;
@ -199,6 +212,8 @@ public partial class JobViewModel : ObservableObject
get => Job.DetectAbove; get => Job.DetectAbove;
set set
{ {
if (Math.Abs(Job.DetectAbove - value) < 0.001 )
return;
Job.DetectAbove = value; Job.DetectAbove = value;
OnPropertyChanged(); OnPropertyChanged();
Task.Run(CreatePreview); Task.Run(CreatePreview);
@ -262,11 +277,17 @@ public partial class JobViewModel : ObservableObject
return; return;
try try
{ {
var frame = await _thumbnails.CreateThumbnailAsync(Job.InputFile, Probe, TimeSpan.FromSeconds(PositionSeconds), Probe.Width, Probe.Height, Job.Rotate); var pos = TimeSpan.FromSeconds(PositionSeconds);
Bitmap? frame;
if (Preview?.Frame == null || Preview.Position != pos)
frame = await _thumbnails.CreateThumbnailAsync(Job.InputFile, Probe, pos, Probe.Width, Probe.Height, Job.Rotate);
else
frame = Preview.Frame;
if ( frame == null ) if ( frame == null )
return; return;
Preview = new PreviewData(frame, [], null, Job.GravitateTo ?? new (0.5f, 0.5f)); Preview = new PreviewData(frame, [], null, Job.GravitateTo, pos, Job.Rotate);
var detector = _detectorFactory(Job.Detect ?? ""); var detector = _detectorFactory(Job.Detect ?? "");
var j = new SingleTask var j = new SingleTask
@ -304,7 +325,7 @@ public partial class JobViewModel : ObservableObject
} }
var boxes = detections.Select(x => x.box).ToList(); var boxes = detections.Select(x => x.box).ToList();
Preview = new PreviewData(frame, boxes, crop, Job.GravitateTo ?? new (0.5f, 0.5f)); Preview = new PreviewData(frame, boxes, crop, Job.GravitateTo, pos, Job.Rotate);
} }
catch (Exception ex) catch (Exception ex)
{ {

View File

@ -86,6 +86,25 @@ x:DataType="vm:InspectorPaneViewModel">
Width="160"/> Width="160"/>
</StackPanel> </StackPanel>
<!-- ScoreThreshold -->
<StackPanel Orientation="Horizontal" Spacing="8">
<TextBlock Text="Score Threshold" Width="120"/>
<StackPanel Orientation="Vertical" Spacing="4" Width="260">
<Slider Minimum="0"
Maximum="1"
SmallChange="0.01"
LargeChange="0.1"
TickFrequency="0.05"
IsSnapToTickEnabled="False"
Value="{Binding Selected.ScoreThreshold, Mode=TwoWay}"/>
<TextBlock Text="{Binding Selected.ScoreThreshold, StringFormat='0.00'}"
FontSize="10"
HorizontalAlignment="Right"/>
</StackPanel>
</StackPanel>
<!-- DetectAbove --> <!-- DetectAbove -->
<StackPanel Orientation="Horizontal" Spacing="8"> <StackPanel Orientation="Horizontal" Spacing="8">
<TextBlock Text="Detect Above" Width="120"/> <TextBlock Text="Detect Above" Width="120"/>

View File

@ -98,6 +98,14 @@ public sealed class CommandLine
else else
Master.DetectAbove = 0.7f; Master.DetectAbove = 0.7f;
} }
else if (arg.StartsWith("--score-threshold="))
{
var val = arg.Substring("--score-threshold=".Length);
if (float.TryParse(val, NumberStyles.Float, CultureInfo.InvariantCulture, out var scoreThreshold) && scoreThreshold >= 0.0f && scoreThreshold <= 1.0f)
Master.ScoreThreshold = scoreThreshold;
else
Master.ScoreThreshold = 0.25f;
}
else if (arg == "--crop") else if (arg == "--crop")
{ {
Master.Crop = ParseCrop(""); Master.Crop = ParseCrop("");
@ -224,22 +232,22 @@ public sealed class CommandLine
return key.Length > 0; return key.Length > 0;
} }
private static Point2f? ParseGravitate(string value) private static Point2f ParseGravitate(string value)
{ {
// Expected format: "<x>:<y>" // Expected format: "<x>:<y>"
var parts = value.Split(':'); var parts = value.Split(':');
if (parts.Length != 2) if (parts.Length != 2)
return null; return new Point2f(0.5f, 0.5f);
if (!float.TryParse(parts[0], NumberStyles.Float, CultureInfo.InvariantCulture, out var x)) if (!float.TryParse(parts[0], NumberStyles.Float, CultureInfo.InvariantCulture, out var x))
return null; return new Point2f(0.5f, 0.5f);
if (!float.TryParse(parts[1], NumberStyles.Float, CultureInfo.InvariantCulture, out var y)) if (!float.TryParse(parts[1], NumberStyles.Float, CultureInfo.InvariantCulture, out var y))
return null; return new Point2f(0.5f, 0.5f);
// Normalized range check (0.0-1.0) // Normalized range check (0.0-1.0)
if (x < 0f || x > 1f || y < 0f || y > 1f) if (x < 0f || x > 1f || y < 0f || y > 1f)
return null; return new Point2f(0.5f, 0.5f);
return new Point2f(x, y); return new Point2f(x, y);
} }

View File

@ -64,7 +64,7 @@ public class JobProcessor(ILogger logger) : LoggingBase(logger, 0), IJobProcesso
IObjectDetector detector = job.Detect switch IObjectDetector detector = job.Detect switch
{ {
"face" => new UltraFaceDetector(_logger), "face" => new UltraFaceDetector(_logger),
"body" => new YoloOnnxObjectDetector(_logger), "body" => new YoloV10ObjectDetector(_logger),
"none" => new DummyDetector(), "none" => new DummyDetector(),
_ => throw new InvalidOperationException($"Unknown detector: {job.Detect}") _ => throw new InvalidOperationException($"Unknown detector: {job.Detect}")
}; };

View File

@ -29,12 +29,7 @@ public class SingleJob
/// such as left-center (0.2, 0.5) or top-right (0.8, 0.2). This can be useful for /// such as left-center (0.2, 0.5) or top-right (0.8, 0.2). This can be useful for
/// videos where the subject tends to be off-center or for creative framing choices. /// videos where the subject tends to be off-center or for creative framing choices.
/// </summary> /// </summary>
public Point2f? GravitateTo { get; set; } public Point2f GravitateTo { get; set; } = new Point2f(0.5f, 0.5f);
/// <summary>
/// Face or human detectors should only report detections if their upper bound starts below this threshold.
/// This is a value between 0.0 and 1.0 mapped to 0..Height.
/// </summary>
public float DetectAbove { get; set; } = 0.3f;
/// <summary> /// <summary>
/// Destination file mask. /// Destination file mask.
/// </summary> /// </summary>
@ -50,6 +45,15 @@ public class SingleJob
/// </summary> /// </summary>
public string? Detect { get; set; } public string? Detect { get; set; }
/// <summary> /// <summary>
/// Detection confidence threshold. This is a value between 0.0 and 1.0 that sets the minimum confidence score a detection must reach to be kept.
/// </summary>
public float ScoreThreshold { get; set; } = 0.25f;
/// <summary>
/// Face or human detectors should only report detections if their upper bound starts below this threshold.
/// This is a value between 0.0 and 1.0 mapped to 0..Height.
/// </summary>
public float DetectAbove { get; set; } = 0.7f;
/// <summary>
/// Set target segment length explicitly. By default, the splitter calculates segment /// Set target segment length explicitly. By default, the splitter calculates segment
/// lengths to be equal and not exceed 58 seconds. /// lengths to be equal and not exceed 58 seconds.
/// </summary> /// </summary>

View File

@ -58,7 +58,7 @@ public sealed class CameraController
_kalman.Reset(_cameraCenter); _kalman.Reset(_cameraCenter);
} }
private Point2f DefaultCenter => _cmd.GravitateTo ?? new Point2f(_videoWidth / 2f, _videoHeight / 2f); private Point2f DefaultCenter => _cmd.GravitateTo;
public int LostFrames => _lostFrames; public int LostFrames => _lostFrames;
public Point2f CameraCenter => _cameraCenter; public Point2f CameraCenter => _cameraCenter;

View File

@ -7,7 +7,7 @@ public sealed class DummyDetector : IObjectDetector
var h = job.Info.Height; var h = job.Info.Height;
var w = job.Info.Width; var w = job.Info.Width;
var c = job.Job.GravitateTo ?? new Point2f(0.5f, 0.5f); var c = job.Job.GravitateTo;
var x = (int)(c.X * w); var x = (int)(c.X * w);
var y = (int)(c.Y * h); var y = (int)(c.Y * h);

View File

@ -0,0 +1,278 @@
using System.Runtime.CompilerServices;
using Microsoft.ML.OnnxRuntime;
using Microsoft.ML.OnnxRuntime.Tensors;
namespace splitter.algo;
/// <summary>
/// Object detector that runs a YOLOv10 ONNX model (<c>models/yolov10m.onnx</c> under the
/// application base directory) and reports boxes for a single class
/// (<see cref="_personClassIndex"/>).
/// </summary>
/// <remarks>
/// All intermediate buffers (mats, input tensor, detection lists, result list) are owned by the
/// instance and reused on every call, so concurrent calls on the same instance would interfere
/// with each other.
/// </remarks>
public sealed class YoloV10ObjectDetector : LoggingBase, IObjectDetector, IDisposable
{
    private readonly InferenceSession _session;
    private readonly string _inputName;
    private readonly string _outputName;

    // Network input resolution; frames are resized to this before inference.
    private const int _inputWidth = 640;
    private const int _inputHeight = 640;

    // IoU at or above which a lower-scored box is suppressed by a higher-scored one.
    private const float _nmsThreshold = 0.45f;

    // Class id kept by the parser; every other class is dropped.
    private const int _personClassIndex = 0;

    private readonly Mat _resizeMat = new();
    private readonly Mat _rgbMat = new();
    private readonly float[] _inputBuffer;
    private readonly DenseTensor<float> _inputTensor;
    private readonly List<NamedOnnxValue> _inputs = new(1);
    private readonly List<Detection> _detections = new(256);
    private readonly List<Detection> _nmsBuffer = new(256);
    private readonly List<(Rect box, Point2f center)> _results = new(64);
    private readonly float _inv255 = 1f / 255f;

    /// <summary>
    /// A scored box in source-frame pixel coordinates (top-left corner plus size).
    /// </summary>
    private readonly struct Detection
    {
        public readonly float X;
        public readonly float Y;
        public readonly float Width;
        public readonly float Height;
        public readonly float Score;

        public Detection(float x, float y, float w, float h, float score)
        {
            X = x;
            Y = y;
            Width = w;
            Height = h;
            Score = score;
        }
    }

    /// <summary>
    /// Creates the inference session with the DirectML execution provider and allocates the
    /// reusable input tensor.
    /// </summary>
    /// <param name="logger">Logger forwarded to <see cref="LoggingBase"/>.</param>
    public YoloV10ObjectDetector(ILogger logger) : base(logger, -1)
    {
        // The options object wraps a native handle and is only needed while the session is
        // being created, so release it as soon as the constructor is done with it.
        using var options = new SessionOptions();
        options.AppendExecutionProvider_DML();

        var basePath = AppDomain.CurrentDomain.BaseDirectory;
        var modelPath = Path.Combine(basePath, "models", "yolov10m.onnx");
        _session = new InferenceSession(modelPath, options);

        // The first declared input/output are the ones used for inference.
        _inputName = _session.InputMetadata.Keys.First();
        _outputName = _session.OutputMetadata.Keys.First();

        // The tensor wraps _inputBuffer, so filling the array updates the tensor in place.
        _inputBuffer = new float[1 * 3 * _inputHeight * _inputWidth];
        _inputTensor = new DenseTensor<float>(_inputBuffer, new[] { 1, 3, _inputHeight, _inputWidth });
        _inputs.Add(NamedOnnxValue.CreateFromTensor(_inputName, _inputTensor));
    }

    /// <summary>
    /// Detects objects of class <see cref="_personClassIndex"/> in <paramref name="frameCont"/>.
    /// </summary>
    /// <param name="job">Task whose <c>Job.ScoreThreshold</c> is the minimum accepted score.</param>
    /// <param name="frameCont">Source frame; presumably 8-bit BGR, confirm against callers.</param>
    /// <returns>
    /// Boxes clipped to the frame together with their centres, in source-frame pixels. The
    /// returned list is an instance buffer that is cleared and refilled on every call, so
    /// callers that need to keep the result must copy it.
    /// </returns>
    public List<(Rect box, Point2f center)> DetectAll(SingleTask job, Mat frameCont)
    {
        _results.Clear();
        if (frameCont.Empty())
            return _results;

        Cv2.Resize(frameCont, _resizeMat, new Size(_inputWidth, _inputHeight));
        Cv2.CvtColor(_resizeMat, _rgbMat, ColorConversionCodes.BGR2RGB);
        FillInputTensor(_rgbMat);

        using var results = _session.Run(_inputs);

        Tensor<float>? output = null;
        foreach (var r in results)
        {
            if (r.Name == _outputName)
            {
                output = r.AsTensor<float>();
                break;
            }
        }

        if (output is null)
            return _results;

        var frameWidth = frameCont.Width;
        var frameHeight = frameCont.Height;

        ParseYoloV10(
            output,
            frameWidth,
            frameHeight,
            job.Job.ScoreThreshold,
            _personClassIndex,
            _detections);

        var final = ApplyNms(_detections, _nmsThreshold, _nmsBuffer);

        for (var i = 0; i < final.Count; i++)
        {
            var d = final[i];

            // Work with both edges so that a box sticking out of the frame is clipped rather
            // than shifted: clamping only the origin while keeping the full size would move
            // the far edge (and the centre) further into the frame.
            var rawLeft = (int)d.X;
            var rawTop = (int)d.Y;
            var rawRight = rawLeft + (int)d.Width;
            var rawBottom = rawTop + (int)d.Height;

            var left = Math.Clamp(rawLeft, 0, frameWidth - 1);
            var top = Math.Clamp(rawTop, 0, frameHeight - 1);

            // Lower bound of left + 1 / top + 1 guarantees a box of at least one pixel.
            var right = Math.Clamp(rawRight, left + 1, frameWidth);
            var bottom = Math.Clamp(rawBottom, top + 1, frameHeight);

            var w = right - left;
            var h = bottom - top;

            var rect = new Rect(left, top, w, h);
            var center = new Point2f(left + w / 2f, top + h / 2f);
            _results.Add((rect, center));
        }

        return _results;
    }

    /// <summary>
    /// Copies an interleaved 3-channel byte image into the planar (channel-first) float input
    /// buffer, scaling every value by 1/255.
    /// </summary>
    /// <param name="rgb">
    /// Image of exactly <see cref="_inputWidth"/> x <see cref="_inputHeight"/> pixels with
    /// three bytes per pixel, as produced by the resize and colour conversion in
    /// <see cref="DetectAll"/>.
    /// </param>
    [MethodImpl(MethodImplOptions.AggressiveInlining)]
    private void FillInputTensor(Mat rgb)
    {
        var height = _inputHeight;
        var width = _inputWidth;
        var planeSize = height * width;
        Span<float> dst = _inputBuffer.AsSpan();

        unsafe
        {
            for (var y = 0; y < height; y++)
            {
                // Fetch each row through Ptr(y) instead of assuming rows are contiguous.
                var rowPtr = (byte*)rgb.Ptr(y).ToPointer();
                var rowSpan = new Span<byte>(rowPtr, width * 3);
                var srcIndex = 0;

                for (var x = 0; x < width; x++)
                {
                    var r = rowSpan[srcIndex + 0];
                    var g = rowSpan[srcIndex + 1];
                    var b = rowSpan[srcIndex + 2];

                    // One plane per channel: R, then G, then B.
                    var offset = y * width + x;
                    dst[offset] = r * _inv255;
                    dst[planeSize + offset] = g * _inv255;
                    dst[2 * planeSize + offset] = b * _inv255;

                    srcIndex += 3;
                }
            }
        }
    }

    /// <summary>
    /// Converts the raw model output into detections in source-frame pixels.
    /// Expected layout: [1, N, 6] with rows of x1, y1, x2, y2, score, class_id in network-input
    /// pixel coordinates.
    /// </summary>
    /// <param name="output">Raw output tensor.</param>
    /// <param name="originalWidth">Width of the source frame in pixels.</param>
    /// <param name="originalHeight">Height of the source frame in pixels.</param>
    /// <param name="scoreThreshold">Rows with a lower score are dropped.</param>
    /// <param name="classIndex">Only rows with this class id are kept.</param>
    /// <param name="detections">Destination list; cleared before it is filled.</param>
    private static void ParseYoloV10(
        Tensor<float> output,
        int originalWidth,
        int originalHeight,
        float scoreThreshold,
        int classIndex,
        List<Detection> detections)
    {
        detections.Clear();

        var count = output.Dimensions[1];

        // Scale factors from network-input coordinates back to the source frame.
        var xScale = (float)originalWidth / _inputWidth;
        var yScale = (float)originalHeight / _inputHeight;

        for (var i = 0; i < count; i++)
        {
            var cls = (int)output[0, i, 5];
            if (cls != classIndex)
                continue;

            var score = output[0, i, 4];
            if (score < scoreThreshold)
                continue;

            var x1 = output[0, i, 0];
            var y1 = output[0, i, 1];
            var x2 = output[0, i, 2];
            var y2 = output[0, i, 3];

            var left = x1 * xScale;
            var top = y1 * yScale;
            var width = (x2 - x1) * xScale;
            var height = (y2 - y1) * yScale;

            detections.Add(new Detection(left, top, width, height, score));
        }
    }

    /// <summary>
    /// Greedy non-maximum suppression: visits detections from highest to lowest score and keeps
    /// one only if its IoU with every already kept detection is below
    /// <paramref name="nmsThreshold"/>.
    /// </summary>
    /// <param name="detections">Candidates; sorted in place by descending score.</param>
    /// <param name="nmsThreshold">IoU at or above which a candidate is suppressed.</param>
    /// <param name="nmsBuffer">Destination list; cleared, filled and returned.</param>
    /// <returns><paramref name="nmsBuffer"/> holding the kept detections.</returns>
    private static List<Detection> ApplyNms(
        List<Detection> detections,
        float nmsThreshold,
        List<Detection> nmsBuffer)
    {
        nmsBuffer.Clear();
        if (detections.Count == 0)
            return nmsBuffer;

        detections.Sort(static (a, b) => b.Score.CompareTo(a.Score));

        for (var i = 0; i < detections.Count; i++)
        {
            var candidate = detections[i];
            var keep = true;

            for (var j = 0; j < nmsBuffer.Count; j++)
            {
                if (IoU(candidate, nmsBuffer[j]) >= nmsThreshold)
                {
                    keep = false;
                    break;
                }
            }

            if (keep)
                nmsBuffer.Add(candidate);
        }

        return nmsBuffer;
    }

    /// <summary>
    /// Intersection-over-union of two boxes; 0 when they do not overlap or the union is empty.
    /// </summary>
    [MethodImpl(MethodImplOptions.AggressiveInlining)]
    private static float IoU(in Detection a, in Detection b)
    {
        var x1 = MathF.Max(a.X, b.X);
        var y1 = MathF.Max(a.Y, b.Y);
        var x2 = MathF.Min(a.X + a.Width, b.X + b.Width);
        var y2 = MathF.Min(a.Y + a.Height, b.Y + b.Height);

        var interW = x2 - x1;
        if (interW <= 0f) return 0f;

        var interH = y2 - y1;
        if (interH <= 0f) return 0f;

        var interArea = interW * interH;
        var areaA = a.Width * a.Height;
        var areaB = b.Width * b.Height;
        var union = areaA + areaB - interArea;
        if (union <= 0f) return 0f;

        return interArea / union;
    }

    /// <summary>
    /// Releases the inference session and the reusable mats.
    /// </summary>
    public void Dispose()
    {
        _session?.Dispose();
        _resizeMat?.Dispose();
        _rgbMat?.Dispose();
    }
}

View File

@ -4,7 +4,7 @@ using Microsoft.ML.OnnxRuntime.Tensors;
namespace splitter.algo; namespace splitter.algo;
public sealed class YoloOnnxObjectDetector : LoggingBase, IObjectDetector, IDisposable public sealed class YoloV8ObjectDetector : LoggingBase, IObjectDetector, IDisposable
{ {
private readonly InferenceSession _session; private readonly InferenceSession _session;
private readonly string _inputName; private readonly string _inputName;
@ -54,7 +54,7 @@ public sealed class YoloOnnxObjectDetector : LoggingBase, IObjectDetector, IDisp
} }
} }
public YoloOnnxObjectDetector(ILogger logger) : base(logger, -1) public YoloV8ObjectDetector(ILogger logger) : base(logger, -1)
{ {
var options = new SessionOptions(); var options = new SessionOptions();
options.AppendExecutionProvider_DML(); options.AppendExecutionProvider_DML();