diff --git a/AGENTS.md b/AGENTS.md index efd91a9..d1c07a9 100644 --- a/AGENTS.md +++ b/AGENTS.md @@ -13,7 +13,7 @@ Splitter pipeline is: * FFMpeg used to decode video frames into OpenCVSharp.Mat * One of detectors used: - For face detection: [opencv_zoo/models/face_detection_yunet at main opencv/opencv_zoo](https://github.com/opencv/opencv_zoo/tree/main/models/face_detection_yunet) - - For body detection: [yolov8s.pt Ultralytics/YOLOv8 at main](https://huggingface.co/Ultralytics/YOLOv8/blob/main/yolov8s.pt) + - For body detection: [THU-MIG/yolov10: YOLOv10: Real-Time End-to-End Object Detection [NeurIPS 2024]](https://github.com/THU-MIG/yolov10/tree/main) * Camera control aplied (CameraControl class) * Final video frames are encoded back to video file using FFMpeg diff --git a/Splitter-UI/Models/PreviewData.cs b/Splitter-UI/Models/PreviewData.cs index 5dcdb87..ea76d07 100644 --- a/Splitter-UI/Models/PreviewData.cs +++ b/Splitter-UI/Models/PreviewData.cs @@ -6,13 +6,17 @@ public class PreviewData public IReadOnlyList DetectedBoxes { get; } public Rect? CropRect { get; } public Point2f GravitateTo { get; } + public TimeSpan Position { get; } + public int? Rotate { get; } - public PreviewData(Avalonia.Media.Imaging.Bitmap? frame, IReadOnlyList boxes, Rect? crop, Point2f gravitateTo) + public PreviewData(Avalonia.Media.Imaging.Bitmap? frame, IReadOnlyList boxes, Rect? crop, Point2f gravitateTo, TimeSpan position, int? 
rotate) { Frame = frame; DetectedBoxes = boxes; CropRect = crop; GravitateTo = gravitateTo; + Position = position; + Rotate = rotate; } } \ No newline at end of file diff --git a/Splitter-UI/Program.cs b/Splitter-UI/Program.cs index f62385e..9c44a27 100644 --- a/Splitter-UI/Program.cs +++ b/Splitter-UI/Program.cs @@ -36,16 +36,16 @@ internal sealed class Program // splitter services services.AddSingleton(); - services.AddSingleton(); + services.AddSingleton(); services.AddSingleton( x => new SingleThreadedDetector(x.GetRequiredService()) ); - services.AddSingleton(x => new SingleThreadedDetector(x.GetRequiredService())); + services.AddSingleton(x => new SingleThreadedDetector(x.GetRequiredService())); services.AddSingleton(x => new SingleThreadedDetector(x.GetRequiredService())); services.AddSingleton>( x => detectorName => { return detectorName switch { "face" => x.GetRequiredService>(), - "body" => x.GetRequiredService>(), + "body" => x.GetRequiredService>(), "none" => x.GetRequiredService>(), _ => new DummyDetector() }; diff --git a/Splitter-UI/ViewModels/JobViewModel.cs b/Splitter-UI/ViewModels/JobViewModel.cs index 2bccbb9..84b5750 100644 --- a/Splitter-UI/ViewModels/JobViewModel.cs +++ b/Splitter-UI/ViewModels/JobViewModel.cs @@ -15,7 +15,7 @@ public partial class JobViewModel : ObservableObject public SingleJob GetJob() => Job; [ObservableProperty] private VideoInfo? _probe; - [ObservableProperty] private PreviewData? _preview = new(null, [], null, new(0.5f, 0.5f)); + [ObservableProperty] private PreviewData? _preview = new(null, [], null, new(0.5f, 0.5f), TimeSpan.Zero, null); [ObservableProperty] private Bitmap? 
_thumbnail; [ObservableProperty] private double _sliderLiveValue; [ObservableProperty] private double _positionSeconds; @@ -70,7 +70,7 @@ public partial class JobViewModel : ObservableObject { if (string.IsNullOrWhiteSpace(value)) { - Job.GravitateTo = null; + Job.GravitateTo = new Point2f(0.5f, 0.5f); } else { @@ -109,6 +109,19 @@ public partial class JobViewModel : ObservableObject } } + public float ScoreThreshold + { + get => Job.ScoreThreshold; + set + { + if (Math.Abs(Job.ScoreThreshold - value) < 0.001) + return; + Job.ScoreThreshold = value; + OnPropertyChanged(); + Task.Run(CreatePreview); + } + } + public string? Mask { get => Job.Mask; @@ -182,10 +195,10 @@ public partial class JobViewModel : ObservableObject public Point2f GravitateTo { - get => Job.GravitateTo ?? new Point2f(0.5f, 0.5f); + get => Job.GravitateTo; set { - if (Job.GravitateTo != null && Math.Abs(Job.GravitateTo.Value.X - value.X) < 0.001 && Math.Abs(Job.GravitateTo.Value.Y - value.Y) < 0.001) + if (Math.Abs(Job.GravitateTo.X - value.X) < 0.001 && Math.Abs(Job.GravitateTo.Y - value.Y) < 0.001) return; Job.GravitateTo = value; @@ -199,6 +212,8 @@ public partial class JobViewModel : ObservableObject get => Job.DetectAbove; set { + if (Math.Abs(Job.DetectAbove - value) < 0.001 ) + return; Job.DetectAbove = value; OnPropertyChanged(); Task.Run(CreatePreview); @@ -262,11 +277,17 @@ public partial class JobViewModel : ObservableObject return; try { - var frame = await _thumbnails.CreateThumbnailAsync(Job.InputFile, Probe, TimeSpan.FromSeconds(PositionSeconds), Probe.Width, Probe.Height, Job.Rotate); + var pos = TimeSpan.FromSeconds(PositionSeconds); + + Bitmap? frame; + if (Preview?.Frame == null || Preview.Position != pos) + frame = await _thumbnails.CreateThumbnailAsync(Job.InputFile, Probe, pos, Probe.Width, Probe.Height, Job.Rotate); + else + frame = Preview.Frame; if ( frame == null ) return; - Preview = new PreviewData(frame, [], null, Job.GravitateTo ?? 
new (0.5f, 0.5f)); + Preview = new PreviewData(frame, [], null, Job.GravitateTo, pos, Job.Rotate); var detector = _detectorFactory(Job.Detect ?? ""); var j = new SingleTask @@ -304,7 +325,7 @@ public partial class JobViewModel : ObservableObject } var boxes = detections.Select(x => x.box).ToList(); - Preview = new PreviewData(frame, boxes, crop, Job.GravitateTo ?? new (0.5f, 0.5f)); + Preview = new PreviewData(frame, boxes, crop, Job.GravitateTo, pos, Job.Rotate); } catch (Exception ex) { diff --git a/Splitter-UI/Views/InspectorPane.axaml b/Splitter-UI/Views/InspectorPane.axaml index 1457c41..3b0cd7f 100644 --- a/Splitter-UI/Views/InspectorPane.axaml +++ b/Splitter-UI/Views/InspectorPane.axaml @@ -86,6 +86,25 @@ x:DataType="vm:InspectorPaneViewModel"> Width="160"/> + + + + + + + + + + + diff --git a/splitter-cli/CommandLine.cs b/splitter-cli/CommandLine.cs index a15551c..4a3fa3b 100644 --- a/splitter-cli/CommandLine.cs +++ b/splitter-cli/CommandLine.cs @@ -98,6 +98,14 @@ public sealed class CommandLine else Master.DetectAbove = 0.7f; } + else if (arg.StartsWith("--score-threshold=")) + { + var val = arg.Substring("--score-threshold=".Length); + if (float.TryParse(val, NumberStyles.Float, CultureInfo.InvariantCulture, out var scoreThreshold) && scoreThreshold >= 0.0f && scoreThreshold <= 1.0f) + Master.ScoreThreshold = scoreThreshold; + else + Master.ScoreThreshold = 0.25f; + } else if (arg == "--crop") { Master.Crop = ParseCrop(""); @@ -224,22 +232,22 @@ public sealed class CommandLine return key.Length > 0; } - private static Point2f? 
ParseGravitate(string value) + private static Point2f ParseGravitate(string value) { // Expected format: ":" var parts = value.Split(':'); if (parts.Length != 2) - return null; + return new Point2f(0.5f, 0.5f); if (!float.TryParse(parts[0], NumberStyles.Float, CultureInfo.InvariantCulture, out var x)) - return null; + return new Point2f(0.5f, 0.5f); if (!float.TryParse(parts[1], NumberStyles.Float, CultureInfo.InvariantCulture, out var y)) - return null; + return new Point2f(0.5f, 0.5f); // Normalized range check (0.0–1.0) if (x < 0f || x > 1f || y < 0f || y > 1f) - return null; + return new Point2f(0.5f, 0.5f); return new Point2f(x, y); } diff --git a/splitter-cli/JobProcessor.cs b/splitter-cli/JobProcessor.cs index 3cacb0f..5ed6060 100644 --- a/splitter-cli/JobProcessor.cs +++ b/splitter-cli/JobProcessor.cs @@ -64,7 +64,7 @@ public class JobProcessor(ILogger logger) : LoggingBase(logger, 0), IJobProcesso IObjectDetector detector = job.Detect switch { "face" => new UltraFaceDetector(_logger), - "body" => new YoloOnnxObjectDetector(_logger), + "body" => new YoloV10ObjectDetector(_logger), "none" => new DummyDetector(), _ => throw new InvalidOperationException($"Unknown detector: {job.Detect}") }; diff --git a/splitter-cli/SingleJob.cs b/splitter-cli/SingleJob.cs index 3a1d1c5..33ba9df 100644 --- a/splitter-cli/SingleJob.cs +++ b/splitter-cli/SingleJob.cs @@ -29,12 +29,7 @@ public class SingleJob /// such as left-center (0.2, 0.5) or top-right (0.8, 0.2). This can be useful for /// videos where the subject tends to be off-center or for creative framing choices. /// - public Point2f? GravitateTo { get; set; } - /// - /// Face or human detectors should only report detections if their upper bound starts below this threshold. - /// This is a value between 0.0 and 1.0 mapped to 0..Height. - /// - public float DetectAbove { get; set; } = 0.3f; + public Point2f GravitateTo { get; set; } = new Point2f(0.5f, 0.5f); /// /// Destination file mask. 
/// @@ -50,6 +45,15 @@ public class SingleJob /// public string? Detect { get; set; } /// + /// Detection confidence threshold. This is a value between 0.0 and 1.0 that sets the minimum confidence + /// + public float ScoreThreshold { get; set; } = 0.25f; + /// + /// Face or human detectors should only report detections if their upper bound starts below this threshold. + /// This is a value between 0.0 and 1.0 mapped to 0..Height. + /// + public float DetectAbove { get; set; } = 0.7f; + /// /// Set starget segments length explicitly. By default, the splitter calculates segment /// lengths to be equal and not exceed 58 seconds. /// diff --git a/splitter-cli/algo/CameraController.cs b/splitter-cli/algo/CameraController.cs index 531f191..370d8af 100644 --- a/splitter-cli/algo/CameraController.cs +++ b/splitter-cli/algo/CameraController.cs @@ -58,7 +58,7 @@ public sealed class CameraController _kalman.Reset(_cameraCenter); } - private Point2f DefaultCenter => _cmd.GravitateTo ?? new Point2f(_videoWidth / 2f, _videoHeight / 2f); + private Point2f DefaultCenter => _cmd.GravitateTo; public int LostFrames => _lostFrames; public Point2f CameraCenter => _cameraCenter; diff --git a/splitter-cli/algo/DummyDetector.cs b/splitter-cli/algo/DummyDetector.cs index c507e1a..687fbaa 100644 --- a/splitter-cli/algo/DummyDetector.cs +++ b/splitter-cli/algo/DummyDetector.cs @@ -7,7 +7,7 @@ public sealed class DummyDetector : IObjectDetector var h = job.Info.Height; var w = job.Info.Width; - var c = job.Job.GravitateTo ?? 
/// <summary>
/// Person detector that runs a YOLOv10 ONNX model (<c>models/yolov10m.onnx</c>) through
/// ONNX Runtime with the DirectML execution provider.
/// </summary>
/// <remarks>
/// Every scratch buffer (resize/RGB mats, input tensor, detection lists, result list) is a field
/// reused between calls. The list returned by <see cref="DetectAll"/> is therefore overwritten by
/// the next call; copy it if it must outlive that call.
/// </remarks>
public sealed class YoloV10ObjectDetector : LoggingBase, IObjectDetector, IDisposable
{
    private readonly InferenceSession _session;
    private readonly string _inputName;
    private readonly string _outputName;

    // Network input resolution; frames are stretched to this size (no letterboxing).
    private const int _inputWidth = 640;
    private const int _inputHeight = 640;
    private const float _nmsThreshold = 0.45f;
    private const int _personClassIndex = 0;
    private const float _inv255 = 1f / 255f;

    private readonly Mat _resizeMat = new();
    private readonly Mat _rgbMat = new();

    private readonly float[] _inputBuffer;
    private readonly DenseTensor<float> _inputTensor;

    private readonly List<NamedOnnxValue> _inputs = new(1);

    private readonly List<Detection> _detections = new(256);
    private readonly List<Detection> _nmsBuffer = new(256);

    private readonly List<(Rect box, Point2f center)> _results = new(64);

    /// <summary>Axis-aligned box in source-frame pixels together with its confidence.</summary>
    private readonly struct Detection
    {
        public readonly float X;
        public readonly float Y;
        public readonly float Width;
        public readonly float Height;
        public readonly float Score;

        public Detection(float x, float y, float w, float h, float score)
        {
            X = x;
            Y = y;
            Width = w;
            Height = h;
            Score = score;
        }
    }

    /// <summary>
    /// Loads the model from <c>models/yolov10m.onnx</c> under the application base directory and
    /// prepares the reusable input tensor.
    /// </summary>
    /// <param name="logger">Logger forwarded to <see cref="LoggingBase"/>.</param>
    public YoloV10ObjectDetector(ILogger logger) : base(logger, -1)
    {
        // SessionOptions owns a native handle that is only needed while the session is created,
        // so it is released as soon as the constructor finishes.
        using var options = new SessionOptions();
        options.AppendExecutionProvider_DML();

        var basePath = AppDomain.CurrentDomain.BaseDirectory;
        var modelPath = Path.Combine(basePath, "models", "yolov10m.onnx");

        _session = new InferenceSession(modelPath, options);

        _inputName = _session.InputMetadata.Keys.First();
        _outputName = _session.OutputMetadata.Keys.First();

        // The tensor wraps _inputBuffer, so filling the array updates the tensor in place.
        _inputBuffer = new float[1 * 3 * _inputHeight * _inputWidth];
        _inputTensor = new DenseTensor<float>(_inputBuffer, new[] { 1, 3, _inputHeight, _inputWidth });

        _inputs.Add(NamedOnnxValue.CreateFromTensor(_inputName, _inputTensor));
    }

    /// <summary>
    /// Detects persons in <paramref name="frameCont"/>.
    /// </summary>
    /// <param name="job">Task whose <c>Job.ScoreThreshold</c> is the minimum accepted confidence.</param>
    /// <param name="frameCont">Source frame; an empty frame yields an empty result.</param>
    /// <returns>
    /// Boxes clipped to the frame with their centers, in source-frame pixels. The list instance
    /// is reused by the next call.
    /// </returns>
    public List<(Rect box, Point2f center)> DetectAll(SingleTask job, Mat frameCont)
    {
        _results.Clear();

        if (frameCont.Empty())
            return _results;

        Cv2.Resize(frameCont, _resizeMat, new Size(_inputWidth, _inputHeight));
        Cv2.CvtColor(_resizeMat, _rgbMat, ColorConversionCodes.BGR2RGB);

        FillInputTensor(_rgbMat);

        using var results = _session.Run(_inputs);

        Tensor<float>? output = null;
        foreach (var r in results)
        {
            if (r.Name == _outputName)
            {
                output = r.AsTensor<float>();
                break;
            }
        }

        if (output is null)
            return _results;

        ParseYoloV10(
            output,
            frameCont.Width,
            frameCont.Height,
            job.Job.ScoreThreshold,
            _personClassIndex,
            _detections);

        // YOLOv10 is trained to work without NMS; this pass is kept as a cheap guard
        // against near-duplicate boxes.
        var final = ApplyNms(_detections, _nmsThreshold, _nmsBuffer);

        for (var i = 0; i < final.Count; i++)
        {
            var d = final[i];

            var rawLeft = (int)d.X;
            var rawTop = (int)d.Y;
            var rawRight = rawLeft + (int)d.Width;
            var rawBottom = rawTop + (int)d.Height;

            var x = Math.Clamp(rawLeft, 0, frameCont.Width - 1);
            var y = Math.Clamp(rawTop, 0, frameCont.Height - 1);

            // Size is measured from the clamped origin to the original far edge, so a box that
            // starts outside the frame is clipped rather than shifted into it.
            var w = Math.Clamp(rawRight - x, 1, frameCont.Width - x);
            var h = Math.Clamp(rawBottom - y, 1, frameCont.Height - y);

            var rect = new Rect(x, y, w, h);
            var center = new Point2f(x + w / 2f, y + h / 2f);

            _results.Add((rect, center));
        }

        return _results;
    }

    /// <summary>
    /// Copies an interleaved 3-channel byte image into the planar (channel, row, column) input
    /// buffer, scaling each byte to 0..1.
    /// </summary>
    /// <param name="rgb">Image of size <c>_inputWidth</c> x <c>_inputHeight</c> with 3 bytes per pixel.</param>
    [MethodImpl(MethodImplOptions.AggressiveInlining)]
    private void FillInputTensor(Mat rgb)
    {
        var height = _inputHeight;
        var width = _inputWidth;

        var planeSize = height * width;

        Span<float> dst = _inputBuffer.AsSpan();

        unsafe
        {
            for (var y = 0; y < height; y++)
            {
                // Rows are addressed through Mat.Ptr so any row padding is honoured.
                var rowPtr = (byte*)rgb.Ptr(y).ToPointer();
                var rowSpan = new Span<byte>(rowPtr, width * 3);

                var srcIndex = 0;

                for (var x = 0; x < width; x++)
                {
                    var r = rowSpan[srcIndex + 0];
                    var g = rowSpan[srcIndex + 1];
                    var b = rowSpan[srcIndex + 2];

                    var offset = y * width + x;

                    dst[offset] = r * _inv255;
                    dst[planeSize + offset] = g * _inv255;
                    dst[2 * planeSize + offset] = b * _inv255;

                    srcIndex += 3;
                }
            }
        }
    }

    /// <summary>
    /// Converts the raw model output into detections in source-frame pixels.
    /// </summary>
    /// <remarks>
    /// YOLOv10 parser: [1, 300, 6] => x1, y1, x2, y2, score, class_id, with coordinates in
    /// network-input pixels.
    /// </remarks>
    /// <param name="output">Model output tensor of rank 3 with at least 6 values per row.</param>
    /// <param name="originalWidth">Width of the source frame.</param>
    /// <param name="originalHeight">Height of the source frame.</param>
    /// <param name="scoreThreshold">Rows scoring below this value are dropped.</param>
    /// <param name="classIndex">Only rows with this class id are kept.</param>
    /// <param name="detections">Destination list; cleared before it is filled.</param>
    /// <exception cref="InvalidOperationException">The tensor does not have the expected shape.</exception>
    private static void ParseYoloV10(
        Tensor<float> output,
        int originalWidth,
        int originalHeight,
        float scoreThreshold,
        int classIndex,
        List<Detection> detections)
    {
        detections.Clear();

        var dims = output.Dimensions;
        if (dims.Length != 3 || dims[2] < 6)
        {
            throw new InvalidOperationException(
                $"Unexpected YOLOv10 output shape [{string.Join(", ", dims.ToArray())}]; expected [1, N, 6].");
        }

        var count = dims[1];

        // Scale from network-input pixels back to the source frame.
        var xScale = originalWidth / (float)_inputWidth;
        var yScale = originalHeight / (float)_inputHeight;

        for (var i = 0; i < count; i++)
        {
            var score = output[0, i, 4];

            // The class id is stored as a float; round it so a value such as 0.9999 is not
            // truncated to class 0.
            var cls = (int)MathF.Round(output[0, i, 5]);

            if (cls != classIndex)
                continue;

            if (score < scoreThreshold)
                continue;

            var x1 = output[0, i, 0];
            var y1 = output[0, i, 1];
            var x2 = output[0, i, 2];
            var y2 = output[0, i, 3];

            // Skip degenerate boxes (zero or negative extent).
            if (x2 <= x1 || y2 <= y1)
                continue;

            var left = x1 * xScale;
            var top = y1 * yScale;
            var width = (x2 - x1) * xScale;
            var height = (y2 - y1) * yScale;

            detections.Add(new Detection(left, top, width, height, score));
        }
    }

    /// <summary>
    /// Greedy non-maximum suppression: detections are visited by descending score and kept
    /// unless they overlap an already kept one by at least <paramref name="nmsThreshold"/> IoU.
    /// </summary>
    /// <param name="detections">Candidates; sorted in place.</param>
    /// <param name="nmsThreshold">IoU at or above which a candidate is suppressed.</param>
    /// <param name="nmsBuffer">Reusable output list; cleared first and returned.</param>
    /// <returns><paramref name="nmsBuffer"/> holding the surviving detections.</returns>
    private static List<Detection> ApplyNms(
        List<Detection> detections,
        float nmsThreshold,
        List<Detection> nmsBuffer)
    {
        nmsBuffer.Clear();

        if (detections.Count == 0)
            return nmsBuffer;

        detections.Sort(static (a, b) => b.Score.CompareTo(a.Score));

        for (var i = 0; i < detections.Count; i++)
        {
            var candidate = detections[i];
            var keep = true;

            for (var j = 0; j < nmsBuffer.Count; j++)
            {
                if (IoU(candidate, nmsBuffer[j]) >= nmsThreshold)
                {
                    keep = false;
                    break;
                }
            }

            if (keep)
                nmsBuffer.Add(candidate);
        }

        return nmsBuffer;
    }

    /// <summary>
    /// Intersection-over-union of two boxes; 0 when they do not overlap or the union is empty.
    /// </summary>
    [MethodImpl(MethodImplOptions.AggressiveInlining)]
    private static float IoU(in Detection a, in Detection b)
    {
        var x1 = MathF.Max(a.X, b.X);
        var y1 = MathF.Max(a.Y, b.Y);
        var x2 = MathF.Min(a.X + a.Width, b.X + b.Width);
        var y2 = MathF.Min(a.Y + a.Height, b.Y + b.Height);

        var interW = x2 - x1;
        if (interW <= 0f) return 0f;

        var interH = y2 - y1;
        if (interH <= 0f) return 0f;

        var interArea = interW * interH;

        var areaA = a.Width * a.Height;
        var areaB = b.Width * b.Height;

        var union = areaA + areaB - interArea;
        if (union <= 0f) return 0f;

        return interArea / union;
    }

    /// <summary>Releases the inference session and the scratch mats.</summary>
    public void Dispose()
    {
        _session.Dispose();
        _resizeMat.Dispose();
        _rgbMat.Dispose();
    }
}