@stephanschulz · Last active February 19, 2025
Hopefully fixes the model-load crash reported in https://github.com/roymacdonald/ofxVision/issues/9.
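Below is the patched ofxVisionDetection.h / ofxVisionDetection.m pair. For orientation, a minimal usage sketch (an assumption about the host app, not part of the gist): in an openFrameworks ofApp, with `vision` an ofxVisionDetection member and `grabber` an ofVideoGrabber member, the built-in detectors would be driven roughly like this:

void ofApp::update(){
    grabber.update();
    if(grabber.isFrameNew()){
        vision.detect(grabber.getPixels()); // runs whichever detectors are toggled on via the ofParameters
    }
}
void ofApp::draw(){
    grabber.draw(0, 0);
    vision.draw(ofRectangle(0, 0, grabber.getWidth(), grabber.getHeight()));
}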
//
// ofxVisionDetection.h
// example
//
// Created by Roy Macdonald on 20-03-24.
//
#pragma once
#include "ofxVisionResults.h"
#ifdef __OBJC__
#import "Detection.h"
#import "ObjectRecognition.h"
#import <AVKit/AVKit.h>
#endif
template<typename resultsType>
class ofxVisionObjectDetection_ {
public:
virtual ~ofxVisionObjectDetection_();
// ofxVisionObjectDetection_();
virtual void detect(ofPixels & image);
void detect(ofVideoGrabber& vidGrabber);
///\brief Load a Core ML model.
///\param modelPath File path to the model to load.
///\param loadLabels If true, load the class labels stored in the MLModel (YOLO models ship with them).
///\note The type of result the model generates is fixed by the class template parameter resultsType and depends purely on the model you use; for example, YOLO uses DETECTION_RECT.
///\return true if successfully loaded, false otherwise.
bool loadModel(const std::string& modelPath, bool loadLabels = false);
const resultsType& getDetectionResults(){return detectionResults;}
void draw(ofRectangle rect);
protected:
#ifdef __OBJC__
void detect_(CGImageRef image);
void processResults(NSArray* results);
__strong ObjectRecognition* objectRecognition;
#else
void * objectRecognition;
#endif
resultsType detectionResults;
private:
//int requiredInputWidth = 0;
//int requiredInputHeight = 0;
ofPixels requiredPixels;
bool bRemapRects = false;
glm::vec2 pixTranslation, pixScaling;
std::shared_ptr<ofFbo> helperFbo = nullptr;
};
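// Usage sketch for the templated detector (assumptions: a compiled YOLO model
// named "yolov8n.mlmodelc" in the app's data folder; the file name is hypothetical):
//
//   ofxVisionObjectDetection_<ofxVision::RectsCollection> detector;
//   if(detector.loadModel(ofToDataPath("yolov8n.mlmodelc", true), true)){
//       detector.detect(grabber); // pads/scales the frame to the model's required input size
//       detector.draw(ofRectangle(0, 0, ofGetWidth(), ofGetHeight()));
//   }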
class ofxVisionDetection:
public ofxVisionObjectDetection_<ofxVision::RectsCollection>
{
public:
ofxVisionDetection();
virtual void detect(ofPixels & image) override;
ofParameter<bool> detectAnimal = {"Detect Animal", false};
ofParameter<bool> detectText = {"Detect Text", false};
ofParameter<bool> detectHand = {"Detect Hand", false};
ofParameter<bool> detectFace = {"Detect Face", true};
ofParameter<bool> detectBody = {"Detect Body", false};
#ifdef OFX_VISON_ENABLE_3D_BODY
ofParameter<bool> detectBody3D = {"Detect Body3D", false};
#endif
ofParameter<bool> recognizeObjects = {"Recognize Objects", true};
ofParameterGroup parameters = {"ofxVision Detect", recognizeObjects, detectAnimal, detectText, detectHand, detectFace, detectBody
#ifdef OFX_VISON_ENABLE_3D_BODY
, detectBody3D
#endif
};
void draw(const ofRectangle& rect);
// Superseded by the base-class loadModel(); the old declaration is kept for reference:
// bool loadModel(const std::string& modelPath);
const ofxVision::RectsCollection& getObjectDetections(){return getDetectionResults();}
const ofxVision::RectsCollection & getAnimalResults(){return animalResults;}
const ofxVision::PointsCollection& getBodyResults(){return bodyResults;}
const ofxVision::Body3DCollection& getBody3DResults(){
#ifndef OFX_VISON_ENABLE_3D_BODY
ofLogError("ofxVision::Body3DCollection& getBody3DResults") << "This feature is not enabled or available on your macOS version";
#endif
return body3DResults;
}
const ofxVision::RectsCollection& getTextResults(){return textResults;}
const ofxVision::PointsCollection& getHandResults(){return handResults;}
const ofxVision::FaceDetectionsCollection& getFaceResults(){return faceResults;}
protected:
// ofxVision::DetectionResultsType objectResultsType = ofxVision::DETECTION_RECT_COLLECTION;
#ifdef __OBJC__
// ObjectRecognition * objectRecognition;
Detection* detection;
#else
// void * objectRecognition;
void * detection;
#endif
// ofxVision::RectsCollection rectObjectDetections;
ofxVision::RectsCollection animalResults;
ofxVision::PointsCollection bodyResults;
ofxVision::Body3DCollection body3DResults;
ofxVision::RectsCollection textResults;
ofxVision::PointsCollection handResults;
ofxVision::FaceDetectionsCollection faceResults;
};
//#endif
//
// ofxVisionDetection.m
// example
//
// Created by Roy Macdonald on 20-03-24.
//
#include "ofxVisionDetection.h"
#include "ofxVisionHelpers.h"
#pragma clang diagnostic ignored "-Wunguarded-availability"
int getBodyPoseIDfromKey(VNHumanBodyPoseObservationJointName joint_name)
{
if (joint_name == VNHumanBodyPoseObservationJointNameNose) { return BODY_NOSE; }
else if (joint_name == VNHumanBodyPoseObservationJointNameLeftEye) { return BODY_LEFTEYE; }
else if (joint_name == VNHumanBodyPoseObservationJointNameRightEye) { return BODY_RIGHTEYE; }
else if (joint_name == VNHumanBodyPoseObservationJointNameLeftEar) { return BODY_LEFTEAR; }
else if (joint_name == VNHumanBodyPoseObservationJointNameRightEar) { return BODY_RIGHTEAR; }
else if (joint_name == VNHumanBodyPoseObservationJointNameLeftWrist) { return BODY_LEFTWRIST; }
else if (joint_name == VNHumanBodyPoseObservationJointNameRightWrist) { return BODY_RIGHTWRIST; }
else if (joint_name == VNHumanBodyPoseObservationJointNameLeftElbow) { return BODY_LEFTELBOW; }
else if (joint_name == VNHumanBodyPoseObservationJointNameRightElbow) { return BODY_RIGHTELBOW; }
else if (joint_name == VNHumanBodyPoseObservationJointNameLeftShoulder) { return BODY_LEFTSHOULDER; }
else if (joint_name == VNHumanBodyPoseObservationJointNameRightShoulder) { return BODY_RIGHTSHOULDER; }
else if (joint_name == VNHumanBodyPoseObservationJointNameLeftHip) { return BODY_LEFTHIP; }
else if (joint_name == VNHumanBodyPoseObservationJointNameRightHip) { return BODY_RIGHTHIP; }
else if (joint_name == VNHumanBodyPoseObservationJointNameLeftKnee) { return BODY_LEFTKNEE; }
else if (joint_name == VNHumanBodyPoseObservationJointNameRightKnee) { return BODY_RIGHTKNEE; }
else if (joint_name == VNHumanBodyPoseObservationJointNameLeftAnkle) { return BODY_LEFTANKLE; }
else if (joint_name == VNHumanBodyPoseObservationJointNameRightAnkle) { return BODY_RIGHTANKLE; }
else { return BODY_N_PART; }
}
#ifdef OFX_VISON_ENABLE_3D_BODY
int getBodyPose3DIDfromKey(VNHumanBodyPose3DObservationJointName joint_name)
{
if (joint_name == VNHumanBodyPose3DObservationJointNameTopHead) { return BODY3D_TOPHEAD; }
else if (joint_name == VNHumanBodyPose3DObservationJointNameCenterHead) { return BODY3D_CENTERHEAD; }
else if (joint_name == VNHumanBodyPose3DObservationJointNameCenterShoulder) { return BODY3D_CENTERSHOULDER; }
else if (joint_name == VNHumanBodyPose3DObservationJointNameLeftShoulder) { return BODY3D_LEFTSHOULDER; }
else if (joint_name == VNHumanBodyPose3DObservationJointNameRightShoulder) { return BODY3D_RIGHTSHOULDER; }
else if (joint_name == VNHumanBodyPose3DObservationJointNameLeftElbow) { return BODY3D_LEFTELBOW; }
else if (joint_name == VNHumanBodyPose3DObservationJointNameRightElbow) { return BODY3D_RIGHTELBOW; }
else if (joint_name == VNHumanBodyPose3DObservationJointNameLeftWrist) { return BODY3D_LEFTWRIST; }
else if (joint_name == VNHumanBodyPose3DObservationJointNameRightWrist) { return BODY3D_RIGHTWRIST; }
else if (joint_name == VNHumanBodyPose3DObservationJointNameLeftHip) { return BODY3D_LEFTHIP; }
else if (joint_name == VNHumanBodyPose3DObservationJointNameRightHip) { return BODY3D_RIGHTHIP; }
else if (joint_name == VNHumanBodyPose3DObservationJointNameLeftKnee) { return BODY3D_LEFTKNEE; }
else if (joint_name == VNHumanBodyPose3DObservationJointNameRightKnee) { return BODY3D_RIGHTKNEE; }
else if (joint_name == VNHumanBodyPose3DObservationJointNameLeftAnkle) { return BODY3D_LEFTANKLE; }
else if (joint_name == VNHumanBodyPose3DObservationJointNameRightAnkle) { return BODY3D_RIGHTANKLE; }
else if (joint_name == VNHumanBodyPose3DObservationJointNameRoot) { return BODY3D_ROOT; }
else if (joint_name == VNHumanBodyPose3DObservationJointNameSpine) { return BODY3D_SPINE; }
else { return BODY3D_N_PART; }
}
#endif
int getHandPoseIDfromKey(VNHumanHandPoseObservationJointName joint_name)
{
if (joint_name == VNHumanHandPoseObservationJointNameWrist) { return HAND_WRIST; }
else if (joint_name == VNHumanHandPoseObservationJointNameThumbCMC) { return HAND_THUMB0; }
else if (joint_name == VNHumanHandPoseObservationJointNameThumbMP) { return HAND_THUMB1; }
else if (joint_name == VNHumanHandPoseObservationJointNameThumbIP) { return HAND_THUMB2; }
else if (joint_name == VNHumanHandPoseObservationJointNameThumbTip) { return HAND_THUMB3; }
else if (joint_name == VNHumanHandPoseObservationJointNameIndexMCP) { return HAND_INDEX0; }
else if (joint_name == VNHumanHandPoseObservationJointNameIndexPIP) { return HAND_INDEX1; }
else if (joint_name == VNHumanHandPoseObservationJointNameIndexDIP) { return HAND_INDEX2; }
else if (joint_name == VNHumanHandPoseObservationJointNameIndexTip) { return HAND_INDEX3; }
else if (joint_name == VNHumanHandPoseObservationJointNameMiddleMCP) { return HAND_MIDDLE0; }
else if (joint_name == VNHumanHandPoseObservationJointNameMiddlePIP) { return HAND_MIDDLE1; }
else if (joint_name == VNHumanHandPoseObservationJointNameMiddleDIP) { return HAND_MIDDLE2; }
else if (joint_name == VNHumanHandPoseObservationJointNameMiddleTip) { return HAND_MIDDLE3; }
else if (joint_name == VNHumanHandPoseObservationJointNameRingMCP) { return HAND_RING0; }
else if (joint_name == VNHumanHandPoseObservationJointNameRingPIP) { return HAND_RING1; }
else if (joint_name == VNHumanHandPoseObservationJointNameRingDIP) { return HAND_RING2; }
else if (joint_name == VNHumanHandPoseObservationJointNameRingTip) { return HAND_RING3; }
else if (joint_name == VNHumanHandPoseObservationJointNameLittleMCP) { return HAND_PINKY0; }
else if (joint_name == VNHumanHandPoseObservationJointNameLittlePIP) { return HAND_PINKY1; }
else if (joint_name == VNHumanHandPoseObservationJointNameLittleDIP) { return HAND_PINKY2; }
else if (joint_name == VNHumanHandPoseObservationJointNameLittleTip) { return HAND_PINKY3; }
else { return HAND_N_PART; }
}
void setPoint(ofxVision::PointsDetection& points, const size_t& index, VNRecognizedPoint * point){
points.setPoint(index, point.location.x, point.location.y, point.confidence);
}
void setPoint(ofxVision::Body3DDetection& points, const size_t& index, VNPoint * point){
points.setPoint(index, point.x, point.y, 0.0f); // VNPoint carries no confidence, so store 0
}
#ifdef OFX_VISON_ENABLE_3D_BODY
void setPoint3D(ofxVision::Body3DDetection& points, const size_t& index, VNHumanBodyRecognizedPoint3D* point){
glm::vec3 position = glm::vec3(point.position.columns[3].x, point.position.columns[3].y, point.position.columns[3].z);
points.setGlobalPosition(index, position);
}
#endif
void processAnimalResults(NSArray* source, ofxVision::RectsCollection& dest){
dest.detections.clear();
for(VNRecognizedObjectObservation *observation in source){
dest.detections.push_back(ofxVision::RectDetection(ofxVisionHelper::toOf(observation.boundingBox), observation.confidence) );
for (int i = 0; i < observation.labels.count; i++){
std::string text = [observation.labels[i].identifier UTF8String];
if(i < observation.labels.count -1){
text += ", ";
}
dest.detections.back().label += text;
}
}
}
//------------------------------------------------------------------------------------------------------------------------
void processFaceResults(NSArray* source, ofxVision::FaceDetectionsCollection& dest){
auto & faces = dest.detections;
faces.clear();
for(VNFaceObservation *observation in source){
auto r = ofxVisionHelper::toOf(observation.boundingBox);
faces.push_back({});
faces.back().orientation.x = 0;
faces.back().orientation.y = [observation.yaw floatValue];
faces.back().orientation.z = [observation.roll floatValue];
VNFaceLandmarkRegion2D* landmarks = observation.landmarks.allPoints;
const CGPoint * points = landmarks.normalizedPoints;
for (int i = 0; i < landmarks.pointCount; i++){
// Landmark points are normalized to the face bounding box; map them into image
// coordinates and flip y (Vision's origin is bottom-left).
faces.back().points[i].x = points[i].x * r.width + r.x;
faces.back().points[i].y = (1-points[i].y) * r.height + r.y;
faces.back().points[i].z = [landmarks.precisionEstimatesPerPoint[i] floatValue]; // per-point precision estimate kept in z
}
faces.back().updateRect();
faces.back().score = observation.confidence;
}
}
//------------------------------------------------------------------------------------------------------------------------
void processBodyResults(NSArray* source, ofxVision::PointsCollection& dest){
NSError *err = nil; // initialize so the error check below is well-defined
dest.detections.clear();
for(VNHumanBodyPoseObservation *observation in source){
NSDictionary <VNHumanBodyPoseObservationJointName, VNRecognizedPoint *> *allPts = [observation recognizedPointsForJointsGroupName:VNHumanBodyPoseObservationJointsGroupNameAll error:&err];
if(err){
NSLog(@"ofxVisionResults::processBodyResults Error: %@", [err localizedDescription]);
continue;
}
dest.detections.emplace_back(ofxVision::PointsDetection(BODY_N_PART));
auto & body = dest.detections.back();
body.score = observation.confidence;
for(VNHumanBodyPoseObservationJointName key in allPts)
{
VNRecognizedPoint* point = [allPts objectForKey:key];
setPoint(body, getBodyPoseIDfromKey(key), point);
}
body.updateRect();
}
}
#ifdef OFX_VISON_ENABLE_3D_BODY
//------------------------------------------------------------------------------------------------------------------------
void processBody3DResults(NSArray* source, ofxVision::Body3DCollection& dest){
NSError *err = nil;
dest.detections.clear();
for(VNHumanBodyPose3DObservation *observation in source){
NSDictionary <VNHumanBodyPose3DObservationJointName, VNHumanBodyRecognizedPoint3D *> *allPts = [observation recognizedPointsForJointsGroupName:VNHumanBodyPose3DObservationJointsGroupNameAll error:&err];
if(err){
NSLog(@"ofxVisionResults::processBody3DResults Error: %@", [err localizedDescription]);
continue;
}
dest.detections.emplace_back(ofxVision::Body3DDetection());
auto & body = dest.detections.back();
body.score = observation.confidence;
body.bodyHeight = observation.bodyHeight;
simd_float4x4 camera_pos = observation.cameraOriginMatrix;
// glm::vec3 position = glm::vec3(camera_pos.columns[3].x, camera_pos.columns[3].y, -camera_pos.columns[3].z);
glm::vec3 position = glm::vec3(camera_pos.columns[3].x, -camera_pos.columns[3].y, -camera_pos.columns[3].z);
glm::vec3 x_axis = glm::vec3(camera_pos.columns[0].x, -camera_pos.columns[0].y, -camera_pos.columns[0].z);
glm::vec3 y_axis = glm::vec3(camera_pos.columns[1].x, -camera_pos.columns[1].y, -camera_pos.columns[1].z);
glm::vec3 z_axis = glm::vec3(camera_pos.columns[2].x, -camera_pos.columns[2].y, -camera_pos.columns[2].z);
glm::vec3 scale = glm::vec3(glm::length(x_axis), glm::length(y_axis), glm::length(z_axis)); // scale is always 1
glm::mat3 rotation_mat = glm::mat3(x_axis / scale.x, y_axis / scale.y, z_axis / scale.z);
glm::quat quaternion = glm::quat_cast(rotation_mat);
body.camera.setGlobalPosition(position);
body.camera.setGlobalOrientation(quaternion);
for(VNHumanBodyPose3DObservationJointName key in allPts)
{
VNPoint* point2d = [observation pointInImageForJointName:key error:&err];
setPoint(body, getBodyPose3DIDfromKey(key), point2d);
VNHumanBodyRecognizedPoint3D* point3d = [allPts objectForKey:key];
NSString* parent = point3d.parentJoint;
body.setParent(getBodyPose3DIDfromKey(key), getBodyPose3DIDfromKey(parent));
setPoint3D(body, getBodyPose3DIDfromKey(key), point3d);
}
body.updateRect();
}
}
void processBody3DResults_2(NSArray* source, ofxVision::Body3DCollection& dest) {
NSError *err = nil;
dest.detections.clear();
for(VNHumanBodyPose3DObservation *observation in source) {
NSDictionary <VNHumanBodyPose3DObservationJointName, VNHumanBodyRecognizedPoint3D *> *allPts = [observation recognizedPointsForJointsGroupName:VNHumanBodyPose3DObservationJointsGroupNameAll error:&err];
if(err) {
NSLog(@"ofxVisionResults::processBody3DResults Error: %@", [err localizedDescription]);
continue;
}
dest.detections.emplace_back(ofxVision::Body3DDetection());
auto & body = dest.detections.back();
body.score = observation.confidence;
body.bodyHeight = observation.bodyHeight;
// Get camera transform from root joint
simd_float4x4 camera_pos = observation.cameraOriginMatrix;
glm::vec3 position = glm::vec3(camera_pos.columns[3].x, -camera_pos.columns[3].y, -camera_pos.columns[3].z);
glm::vec3 x_axis = glm::vec3(camera_pos.columns[0].x, -camera_pos.columns[0].y, -camera_pos.columns[0].z);
glm::vec3 y_axis = glm::vec3(camera_pos.columns[1].x, -camera_pos.columns[1].y, -camera_pos.columns[1].z);
glm::vec3 z_axis = glm::vec3(camera_pos.columns[2].x, -camera_pos.columns[2].y, -camera_pos.columns[2].z);
glm::vec3 scale = glm::vec3(glm::length(x_axis), glm::length(y_axis), glm::length(z_axis));
glm::mat3 rotation_mat = glm::mat3(x_axis / scale.x, y_axis / scale.y, z_axis / scale.z);
glm::quat quaternion = glm::quat_cast(rotation_mat);
body.camera.setGlobalPosition(position);
body.camera.setGlobalOrientation(quaternion);
// Create inverse camera transform to convert from root-relative to camera-relative
glm::mat4 cameraTransform = glm::translate(glm::mat4(1.0f), position) * glm::toMat4(quaternion);
glm::mat4 inverseCameraTransform = glm::inverse(cameraTransform);
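// (For a rigid transform M = translate(p) * rotate(q), the inverse is
// rotate(q)^-1 * translate(-p); glm::inverse computes the same thing here, so
// inverseCameraTransform maps root-relative points into the camera's frame.)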
for(VNHumanBodyPose3DObservationJointName key in allPts) {
// Get 2D point for visualization
VNPoint* point2d = [observation pointInImageForJointName:key error:&err];
setPoint(body, getBodyPose3DIDfromKey(key), point2d);
// Get root-relative 3D position
VNHumanBodyRecognizedPoint3D* point3d = [allPts objectForKey:key];
simd_float4x4 jointTransform = point3d.position;
glm::vec3 rootRelativePos = glm::vec3(
jointTransform.columns[3].x,
jointTransform.columns[3].y,
jointTransform.columns[3].z
);
// Convert from root-relative to camera-relative position
glm::vec4 pos4 = glm::vec4(rootRelativePos, 1.0);
glm::vec4 cameraRelativePos = inverseCameraTransform * pos4;
// Mirror the X coordinate to match the camera view
cameraRelativePos.x = -cameraRelativePos.x;
body.setGlobalPosition(getBodyPose3DIDfromKey(key), glm::vec3(cameraRelativePos));
// Set parent relationship
NSString* parent = point3d.parentJoint;
body.setParent(getBodyPose3DIDfromKey(key), getBodyPose3DIDfromKey(parent));
}
body.updateRect();
}
}
void processBody3DResults_relative(NSArray* source, ofxVision::Body3DCollection& dest) {
NSError *err = nil;
dest.detections.clear();
for(VNHumanBodyPose3DObservation *observation in source) {
NSDictionary <VNHumanBodyPose3DObservationJointName, VNHumanBodyRecognizedPoint3D *> *allPts = [observation recognizedPointsForJointsGroupName:VNHumanBodyPose3DObservationJointsGroupNameAll error:&err];
if(err) {
NSLog(@"ofxVisionResults::processBody3DResults Error: %@", [err localizedDescription]);
continue;
}
dest.detections.emplace_back(ofxVision::Body3DDetection());
auto & body = dest.detections.back();
body.score = observation.confidence;
body.bodyHeight = observation.bodyHeight;
// Set up camera transform
simd_float4x4 camera_pos = observation.cameraOriginMatrix;
glm::vec3 position = glm::vec3(camera_pos.columns[3].x, camera_pos.columns[3].y, camera_pos.columns[3].z);
glm::vec3 x_axis = glm::vec3(camera_pos.columns[0].x, camera_pos.columns[0].y, camera_pos.columns[0].z);
glm::vec3 y_axis = glm::vec3(camera_pos.columns[1].x, camera_pos.columns[1].y, camera_pos.columns[1].z);
glm::vec3 z_axis = glm::vec3(camera_pos.columns[2].x, camera_pos.columns[2].y, camera_pos.columns[2].z);
glm::vec3 scale = glm::vec3(glm::length(x_axis), glm::length(y_axis), glm::length(z_axis));
glm::mat3 rotation_mat = glm::mat3(x_axis / scale.x, y_axis / scale.y, z_axis / scale.z);
glm::quat quaternion = glm::quat_cast(rotation_mat);
body.camera.setGlobalPosition(position);
body.camera.setGlobalOrientation(quaternion);
for(VNHumanBodyPose3DObservationJointName key in allPts) {
// Get 2D point for visualization
VNPoint* point2d = [observation pointInImageForJointName:key error:&err];
setPoint(body, getBodyPose3DIDfromKey(key), point2d);
// Get camera-relative 3D position
simd_float4x4 relativeTransform;
if ([observation getCameraRelativePosition:&relativeTransform forJointName:key error:&err]) {
glm::vec3 position = glm::vec3(
relativeTransform.columns[3].x, // Flip X to mirror
relativeTransform.columns[3].y, // Keep Y as is
-relativeTransform.columns[3].z // Negate Z to move in front of frustum
);
body.setGlobalPosition(getBodyPose3DIDfromKey(key), position);
} else if (err) {
NSLog(@"Error getting camera relative position for joint: %@", [err localizedDescription]);
}
// Set parent relationship
VNHumanBodyRecognizedPoint3D* point3d = [allPts objectForKey:key];
NSString* parent = point3d.parentJoint;
body.setParent(getBodyPose3DIDfromKey(key), getBodyPose3DIDfromKey(parent));
}
body.updateRect();
}
}
#endif
//------------------------------------------------------------------------------------------------------------------------
void processHandResults(NSArray* source, ofxVision::PointsCollection& dest){
NSError *err = nil;
dest.detections.clear();
for(VNHumanHandPoseObservation *observation in source){
NSDictionary <VNHumanHandPoseObservationJointName, VNRecognizedPoint *> *allPts = [observation recognizedPointsForJointsGroupName:VNHumanHandPoseObservationJointsGroupNameAll error:&err];
if(err){
NSLog(@"ofxVisionResults::processHandResults Error: %@", [err localizedDescription]);
continue;
}
dest.detections.push_back(ofxVision::PointsDetection(HAND_N_PART));
auto & hand = dest.detections.back();
hand.score = observation.confidence;
for(VNHumanHandPoseObservationJointName key in allPts)
{
VNRecognizedPoint *point = [allPts objectForKey:key];
setPoint(hand, getHandPoseIDfromKey(key), point);
}
hand.updateRect();
}
}
//------------------------------------------------------------------------------------------------------------------------
void processTextResults(NSArray* source, ofxVision::RectsCollection& dest){
dest.detections.clear();
for(VNRecognizedTextObservation *observation in source){
dest.detections.push_back(ofxVision::RectDetection(ofxVisionHelper::toOf(observation.boundingBox), observation.confidence) );
NSArray<VNRecognizedText*>* candidates = [observation topCandidates:1];
if(candidates.count > 0){ // guard against observations with no text candidates
dest.detections.back().label += std::string([candidates[0].string UTF8String]);
}
}
}
//------------------------------------------------------------------------------------------------------------------------
ofxVisionDetection::ofxVisionDetection(){
detection = [[Detection alloc] init];
}
//
//bool ofxVisionDetection::loadModel(const std::string& modelPath){
//
// NSString * s = [NSString stringWithUTF8String:modelPath.c_str()];
// objectRecognition = [[ObjectRecognition alloc] initWithModelPath: s];
//
// return (objectRecognition != nil);
//}
void ofxVisionDetection::detect(ofPixels &pix)
{
// cout << "ofxVisionDetection::detect...";
CGImageRef image = ofxVisionHelper::CGImageRefFromOfPixels(pix,(int)pix.getWidth(),(int)pix.getHeight(),(int)pix.getNumChannels());
if(recognizeObjects.get()) {
detect_(image);
}
// if(!objectRecognition){
// ofLogError("ofxVisionDetection::detect", "model not loaded. Call ofxVisionDetection::loadModel(...) before calling this method");
// // return;
// }else if(recognizeObjects.get() && [objectRecognition process:image]){
// rectObjectDetections.detections.clear();
// for(VNRecognizedObjectObservation *observation in [objectRecognition results]){
//
// ofxVision::RectDetection det(ofxVisionHelper::toOf(observation.boundingBox),0);
//
// // VNClassificationObservation
// if (observation.labels.count > 0){
// det.label = [observation.labels[0].identifier UTF8String];
// det.score = observation.labels[0].confidence ;
// }
// rectObjectDetections.detections.push_back(det);
// }
//
// }
if( detectAnimal.get() || detectText.get() || detectHand.get() || detectFace.get() || detectBody.get()
#ifdef OFX_VISON_ENABLE_3D_BODY
|| detectBody3D.get()
#endif
){
[detection detect:image
detectAnimal: this->detectAnimal.get()
detectText: this->detectText.get()
detectHand: this->detectHand.get()
detectFace: this->detectFace.get()
detectBody: this->detectBody.get()
#ifdef OFX_VISON_ENABLE_3D_BODY
detectBody3D: this->detectBody3D.get()
#endif
];
if( this->detectAnimal.get() ) {processAnimalResults([detection animalResults], animalResults);}
if( this->detectText.get() ) {processTextResults([detection textResults], textResults);}
if( this->detectHand.get() ) {processHandResults([detection handResults], handResults);}
if( this->detectFace.get() ) {processFaceResults([detection faceResults], faceResults);}
if( this->detectBody.get() ) {processBodyResults([detection bodyResults], bodyResults);}
#ifdef OFX_VISON_ENABLE_3D_BODY
if( this->detectBody3D.get() ) {processBody3DResults([detection body3DResults], body3DResults);}
// if( this->detectBody3D.get() ) {processBody3DResults_relative([detection body3DResults], body3DResults);}
// if( this->detectBody3D.get() ) {processBody3DResults_2([detection body3DResults], body3DResults);}
#endif
}
CGImageRelease(image);
}
void ofxVisionDetection::draw(const ofRectangle& rect){
if(recognizeObjects.get()) detectionResults.draw(rect, true, true);
if( this->detectAnimal.get() ) { animalResults.draw(rect); }
if( this->detectText.get() ) { textResults.draw(rect); }
if( this->detectHand.get() ) { handResults.draw(rect); }
if( this->detectFace.get() ) { faceResults.draw(rect); }
if( this->detectBody.get() ) { bodyResults.draw(rect); }
#ifdef OFX_VISON_ENABLE_3D_BODY
if( this->detectBody3D.get() ) { body3DResults.draw(rect); }
#endif
}
//------------------------------------------------------------------------------------------------------------------------
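// Fix for https://github.com/roymacdonald/ofxVision/issues/9: constructing the
// ObjectRecognition / Core ML model off the main thread could crash, so the
// load is funneled onto the main queue and the caller waits, with a timeout,
// for it to finish.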
template<typename resultsType>
bool ofxVisionObjectDetection_<resultsType>::loadModel(const std::string& modelPath, bool _loadLabels) {
__block bool success = false;
__block ObjectRecognition* newObjectRecognition = nil;
// The actual loading work; must run on the main thread.
void (^loadBlock)(void) = ^{
@autoreleasepool {
@try {
// Clear existing model first
objectRecognition = nil;
// Load new model
NSString* s = [NSString stringWithUTF8String:modelPath.c_str()];
newObjectRecognition = [[ObjectRecognition alloc] initWithModelPath:s loadLabels:(_loadLabels?YES:NO)];
if (newObjectRecognition != nil) {
objectRecognition = newObjectRecognition;
success = true;
// Get the model's required input size if available, e.g. "[640, 640]"
NSString* NSimgsz = [objectRecognition getImgsz];
if(NSimgsz) {
std::string imgsz([NSimgsz UTF8String]);
imgsz = ofTrim(imgsz);
imgsz.pop_back(); // drop trailing ']'
imgsz = imgsz.substr(1); // drop leading '['
auto imgszSplit = ofSplitString(imgsz, ",");
if(imgszSplit.size() == 2) {
int requiredInputWidth = ofToInt(imgszSplit[0]);
int requiredInputHeight = ofToInt(imgszSplit[1]);
requiredPixels.allocate(requiredInputWidth, requiredInputHeight, 3);
requiredPixels.set(0);
}
}
}
} @catch (NSException *exception) {
NSLog(@"Exception during model loading: %@", exception);
success = false;
}
}
};
// If we are already on the main thread, dispatching to the main queue and then
// blocking on it would deadlock (the wait below never returns before the
// timeout), so run the block directly.
if ([NSThread isMainThread]) {
loadBlock();
return success;
}
// Otherwise run on the main thread and wait for completion with a timeout.
dispatch_group_t group = dispatch_group_create();
dispatch_group_enter(group);
dispatch_async(dispatch_get_main_queue(), ^{
loadBlock();
dispatch_group_leave(group);
});
dispatch_time_t timeout = dispatch_time(DISPATCH_TIME_NOW, 5 * NSEC_PER_SEC);
if (dispatch_group_wait(group, timeout) == 0) {
return success;
}
ofLogError("ofxVisionObjectDetection") << "Model loading timed out";
return false;
}
template<typename resultsType>
ofxVisionObjectDetection_<resultsType>::~ofxVisionObjectDetection_() {
// With ARC, dropping the strong reference is all the cleanup needed.
objectRecognition = nil;
}
template<typename resultsType>
void ofxVisionObjectDetection_<resultsType>::detect(ofVideoGrabber& vidGrabber){
ofPixels* pixPtr = nullptr;
if(requiredPixels.isAllocated() && requiredPixels.getWidth() > 0 && requiredPixels.getHeight() >0){
auto& pix = vidGrabber.getPixels();
if(!(pix.getWidth() == requiredPixels.getWidth() && pix.getHeight() == requiredPixels.getHeight())){
if(pix.getWidth() == requiredPixels.getWidth() && pix.getHeight() < requiredPixels.getHeight()){
requiredPixels.set(0);
int y = (requiredPixels.getHeight() - pix.getHeight())/2;
pix.pasteInto(requiredPixels, 0, y);
pixPtr = &requiredPixels;
bRemapRects = true;
pixTranslation = {0, y/float(requiredPixels.getHeight())};
pixScaling = {1, float(requiredPixels.getHeight())/float( pix.getHeight())};
// std::cout << "pastePix\n";
}else{
ofRectangle srcRect(0,0,pix.getWidth(), pix.getHeight());
ofRectangle dstRect(0,0, requiredPixels.getWidth(), requiredPixels.getHeight());
srcRect.scaleTo(dstRect);
if(!helperFbo){
helperFbo = std::make_shared<ofFbo>();
helperFbo->allocate(requiredPixels.getWidth(), requiredPixels.getHeight());
}
if(helperFbo){
helperFbo->begin();
ofClear(0, 255);
vidGrabber.draw(srcRect);
helperFbo->end();
helperFbo->readToPixels(requiredPixels);
pixPtr = &requiredPixels;
// std::cout << "readToPix\n";
pixTranslation = {srcRect.x/dstRect.getWidth(), srcRect.y/dstRect.getHeight()};
pixScaling = {requiredPixels.getWidth()/ srcRect.getWidth(), requiredPixels.getHeight()/ srcRect.getHeight()};
bRemapRects = true;
}else{
ofLogWarning("ofxVisionObjectDetection_<resultsType>::detect") << "Invalid helperFbo";
}
}
}else{
pixPtr = &pix;
// this->detect(pix);
}
}else{
pixPtr = &vidGrabber.getPixels();
}
// if(bRemapRects){
// std::cout << "pixTranslation: " << pixTranslation << "\n";
// std::cout << "pixScaling: " << pixScaling << "\n";
// }
if(pixPtr){
this->detect(*pixPtr);
}else{
ofLogError("ofxVisionObjectDetection_<resultsType>::detect") << "pixPtr is NULL";
}
}
template<typename resultsType>
void ofxVisionObjectDetection_<resultsType>::detect(ofPixels & pix){
// cout << "ofxVisionDetection::detect...";
CGImageRef image = ofxVisionHelper::CGImageRefFromOfPixels(pix,(int)pix.getWidth(),(int)pix.getHeight(),(int)pix.getNumChannels());
detect_(image);
CGImageRelease(image);
}
template<typename resultsType>
void ofxVisionObjectDetection_<resultsType>::detect_(CGImageRef image) {
if (!objectRecognition) return;
// Ensure processing happens on main thread
if ([NSThread isMainThread]) {
if ([objectRecognition process:image]) {
processResults([objectRecognition results]);
}
} else {
dispatch_sync(dispatch_get_main_queue(), ^{
@autoreleasepool {
if ([objectRecognition process:image]) {
processResults([objectRecognition results]);
}
}
});
}
}
//
//template<typename resultsType>
//void ofxVisionObjectDetection_<resultsType>::handleYOLOOutput(VNCoreMLFeatureValueObservation *observation, ofxVision::RectsCollection& collection, float scale, bool bBreakpoint) {
// // Get the feature value (output) from the observation
// MLFeatureValue *featureValue = observation.featureValue;
//
// // Check if it is an MLMultiArray
// if (featureValue.type == MLFeatureTypeMultiArray) {
// MLMultiArray *multiArray = featureValue.multiArrayValue;
// for(NSNumber * stride in multiArray.strides){
// std::cout << "Stride " << stride.intValue <<"\n";
// }
//
// // Get the dimensions of the output array
// NSInteger numBoxes = [multiArray.shape[0] integerValue]; // N (number of bounding boxes)
// NSInteger numAttributes = [multiArray.shape[1] integerValue]; // Typically 85 for YOLO
//
// // Ensure the dimensions are as expected
// if (numAttributes < 5) {
// ofLogWarning("ofxVisionObjectDetection::handleYOLOOutput")<< "Unexpected output dimensions.";
// return;
// }
// if(bBreakpoint){
// ofLogNotice("break");
// }
// // Access the raw data pointer (assuming the data type is Float32)
// const float *dataPointer = (const float *)multiArray.dataPointer;
//
// // Iterate through each bounding box
// for (NSInteger i = 0; i < numBoxes; i++) {
// // Extract values from the multi-array
// float x = dataPointer[i * numAttributes + 0] / scale;
// float y = dataPointer[i * numAttributes + 1] / scale;
// float width = dataPointer[i * numAttributes + 2] / scale;
// float height = dataPointer[i * numAttributes + 3] / scale;
// float confidence = dataPointer[i * numAttributes + 4];
//
//
// // Check if the confidence is above a threshold (e.g., 0.5)
// if (confidence > 0.5) {
//
// ofxVision::RectDetection det;
// det.rect.set(x, y, width, height);
// det.score = confidence;
//
//
// //NSLog(@"Box %ld: x=%f, y=%f, width=%f, height=%f, confidence=%f", (long)i, x, y, width, height, confidence);
//
// // Optionally extract class scores and find the highest one
//
//// if(objectRecognition){
//// auto labels = [objectRecognition getLabels];
////
//// if(labels){
//
// float maxClassScore = 0.0;
// NSInteger maxClassIndex = -1;
// for (NSInteger j = 5; j < numAttributes; j++) {
// float classScore = dataPointer[i * numAttributes + j];
// if (classScore > maxClassScore) {
// maxClassScore = classScore;
// maxClassIndex = j - 5;
// }
// }
//
// if (maxClassIndex >= 0){//} && maxClassIndex < [labels count]) {
// // NSLog(@"Detected class %ld with score %f", (long)maxClassIndex, maxClassScore);
//// det.label = [[labels objectAtIndex:maxClassIndex] UTF8String];
// det.label = ofToString(maxClassIndex);
//
// }
//// }
//// }
// collection.detections.push_back(det);
//
//
// }
// }
// } else {
// NSLog(@"Unexpected feature value type.");
// }
//}
template<>
void ofxVisionObjectDetection_<ofxVision::RectsCollection>::draw(ofRectangle rect){
// if(bRemapRects){
// rect.x -= pixTranslation.x;
// rect.y -= pixTranslation.y;
// rect.width *= pixScaling.x;
// rect.height *= pixScaling.y;
// }
////
detectionResults.draw(rect, true, true);
}
template<>
void ofxVisionObjectDetection_<ofxVision::LabelsCollection>::draw(ofRectangle rect){
detectionResults.draw(rect.x, rect.y);
}
template<>
void ofxVisionObjectDetection_<ofxVision::RectsCollection>::processResults(NSArray* results){
detectionResults.detections.clear();
for (id object in results) {
if ([object isKindOfClass:[VNRecognizedObjectObservation class]]) {
// cout << "VNClassificationObservation\n";
VNRecognizedObjectObservation *observation = object;
auto bb = ofxVisionHelper::toOf(observation.boundingBox);
if(bRemapRects){
bb.x -= pixTranslation.x;
bb.y -= pixTranslation.y;
bb.x *= pixScaling.x;
bb.y *= pixScaling.y;
bb.width *= pixScaling.x;
bb.height *= pixScaling.y;
}
ofxVision::RectDetection det(bb,0);
// VNClassificationObservation
if (observation.labels.count > 0){
det.label = [observation.labels[0].identifier UTF8String];
det.score = observation.labels[0].confidence ;
}
detectionResults.detections.push_back(det);
}else if ([object isKindOfClass:[VNCoreMLFeatureValueObservation class]]) {
VNCoreMLFeatureValueObservation* observation = object;
cout << "observation: " <<[observation.featureName UTF8String] << "\n";
// handleYOLOOutput(observation,detectionResults, drawScale.get(), breakpoint.get());
// info = iterateMultiArrayDynamically(observation.featureValue.multiArrayValue);
}else if([object isMemberOfClass:[VNRecognizedObjectObservation class]]){ // unreachable: the isKindOfClass branch above already matches exact instances
cout <<"isMemberOfClass VNRecognizedObjectObservation"<<endl;
VNRecognizedObjectObservation *observation = (VNRecognizedObjectObservation *)object;
cout<<endl;
cout<<"observation.confidence "<<observation.confidence<<endl;
NSLog(@"observation.boundingBox %@", NSStringFromRect(observation.boundingBox));
NSLog(@"observation %@", object);
cout<<endl;
auto bb = ofxVisionHelper::toOf(observation.boundingBox);
if(bRemapRects){
bb.x -= pixTranslation.x;
bb.y -= pixTranslation.y;
bb.x *= pixScaling.x;
bb.y *= pixScaling.y;
bb.width *= pixScaling.x;
bb.height *= pixScaling.y;
}
ofxVision::RectDetection det(bb,0);
if (observation.labels.count > 0){
det.label = [observation.labels[0].identifier UTF8String];
det.score = observation.labels[0].confidence;
}
detectionResults.detections.push_back(det);
}else{
NSString *className = NSStringFromClass([object class]);
cout << "NOT " << [className UTF8String] << "\n";
}
}
// for(VNRecognizedObjectObservation *observation in results){
//
// }
}
template<>
void ofxVisionObjectDetection_<ofxVision::LabelsCollection>::processResults(NSArray* results){
detectionResults.detections.clear();
for (id object in results) {
if ([object isKindOfClass:[VNClassificationObservation class]]) {
//cout << "VNClassificationObservation\n";
VNClassificationObservation *observation = object;
detectionResults.detections.push_back(ofxVision::LabelDetection ([observation.identifier UTF8String], observation.confidence));
}else if ([object isKindOfClass:[VNCoreMLFeatureValueObservation class]]) {
// Raw Core ML output (VNCoreMLFeatureValueObservation); not handled for label collections.
}else if([object isMemberOfClass:[VNRecognizedObjectObservation class]]){
cout <<"isMemberOfClass VNRecognizedObjectObservation"<<endl;
VNRecognizedObjectObservation *observation = (VNRecognizedObjectObservation *)object;
cout<<endl;
cout<<"observation.confidence "<<observation.confidence<<endl;
NSLog(@"observation.boundingBox %@", NSStringFromRect(observation.boundingBox));
NSLog(@"observation %@", object);
cout<<endl;
auto bb = ofxVisionHelper::toOf(observation.boundingBox);
if(bRemapRects){
bb.x -= pixTranslation.x;
bb.y -= pixTranslation.y;
bb.x *= pixScaling.x;
bb.y *= pixScaling.y;
bb.width *= pixScaling.x;
bb.height *= pixScaling.y;
}
ofxVision::RectDetection det(bb,0);
if (observation.labels.count > 0){
det.label = [observation.labels[0].identifier UTF8String];
det.score = observation.labels[0].confidence;
}
detectionResults.detections.push_back(det);
}else{
NSString *className = NSStringFromClass([object class]);
cout << "LabelsCollection NOT " << [className UTF8String] << "\n";
}
}
// for(VNClassificationObservation *observation in results){
// detectionResults.detections.push_back(ofxVision::LabelDetection ([observation.identifier UTF8String], observation.confidence));
// }
}
template class ofxVisionObjectDetection_<ofxVision::LabelsCollection>;
template class ofxVisionObjectDetection_<ofxVision::RectsCollection>;
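// Explicit instantiations: the template member definitions live in this
// Objective-C++ translation unit, so only the two results types instantiated
// above can be used by code that includes just the header.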