Hopefully fixes the model-load crash reported here: https://github.com/roymacdonald/ofxVision/issues/9
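For context, here is a minimal usage sketch of the intended call pattern, assuming an openFrameworks app. The app class, camera size, worker thread, and model file name ("yolov8n.mlmodelc") are illustrative assumptions, not part of this gist. Note that loadModel() below dispatches the actual Core ML load to the main queue and blocks the calling thread until it completes or times out, so the sketch calls it from a worker thread:

#include <thread>
#include "ofMain.h"
#include "ofxVisionDetection.h"

class ofApp : public ofBaseApp {
public:
    void setup() {
        grabber.setup(1280, 720);
        // Load off the main thread: loadModel() queues the Core ML load on the
        // main queue and waits for it, so the caller must not be the main thread.
        loader = std::thread([this]{
            if(!detector.loadModel(ofToDataPath("yolov8n.mlmodelc", true), true)){
                ofLogError("ofApp") << "model failed to load";
            }
        });
    }
    void update() {
        grabber.update();
        // detect_() returns early while the model is still loading,
        // so calling detect() every frame is safe.
        if(grabber.isFrameNew()) detector.detect(grabber);
    }
    void draw() {
        grabber.draw(0, 0);
        detector.draw(ofRectangle(0, 0, grabber.getWidth(), grabber.getHeight()));
    }
    void exit() {
        if(loader.joinable()) loader.join();
    }
    ofVideoGrabber grabber;
    ofxVisionDetection detector;
    std::thread loader;
};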
//
//  ofxVisionDetection.h
//  example
//
//  Created by Roy Macdonald on 20-03-24.
//
#pragma once
#include "ofxVisionResults.h"
#ifdef __OBJC__
#import "Detection.h"
#import "ObjectRecognition.h"
#import <AVKit/AVKit.h>
#endif

template<typename resultsType>
class ofxVisionObjectDetection_ {
public:
    virtual ~ofxVisionObjectDetection_();
    // ofxVisionObjectDetection_();
    virtual void detect(ofPixels & image);
    void detect(ofVideoGrabber& vidGrabber);
    ///\brief Load a Core ML model.
    ///\param modelPath file path to the model to load
    ///\param loadLabels whether to read the class labels embedded in the MLModel (YOLO models ship with them)
    ///\note The type of result the model generates depends purely on the model you use; the class template
    ///      parameter resultsType must match it. YOLO, for example, produces rect detections.
    ///\return true if successfully loaded, false otherwise
    bool loadModel(const std::string& modelPath, bool loadLabels = false);
    const resultsType& getDetectionResults(){return detectionResults;}
    void draw(ofRectangle rect);
protected:
#ifdef __OBJC__
    void detect_(CGImageRef image);
    void processResults(NSArray* results);
    __strong ObjectRecognition* objectRecognition;
#else
    void * objectRecognition;
#endif
    resultsType detectionResults;
private:
    //int requiredInputWidth = 0;
    //int requiredInputHeight = 0;
    ofPixels requiredPixels;
    bool bRemapRects = false;
    glm::vec2 pixTranslation, pixScaling;
    std::shared_ptr<ofFbo> helperFbo = nullptr;
};
class ofxVisionDetection
: public ofxVisionObjectDetection_<ofxVision::RectsCollection>
{
public:
    ofxVisionDetection();
    virtual void detect(ofPixels & image) override;
    ofParameter<bool> detectAnimal = {"Detect Animal", false};
    ofParameter<bool> detectText = {"Detect Text", false};
    ofParameter<bool> detectHand = {"Detect Hand", false};
    ofParameter<bool> detectFace = {"Detect Face", true};
    ofParameter<bool> detectBody = {"Detect Body", false};
#ifdef OFX_VISON_ENABLE_3D_BODY
    ofParameter<bool> detectBody3D = {"Detect Body3D", false};
#endif
    ofParameter<bool> recognizeObjects = {"Recognize Objects", true};
    ofParameterGroup parameters = {"ofxVision Detect", recognizeObjects, detectAnimal, detectText, detectHand, detectFace, detectBody
#ifdef OFX_VISON_ENABLE_3D_BODY
        , detectBody3D
#endif
    };
    void draw(const ofRectangle& rect);
    // bool loadModel(const std::string& modelPath); // superseded by ofxVisionObjectDetection_::loadModel
    const ofxVision::RectsCollection& getObjectDetections(){return getDetectionResults();}
    const ofxVision::RectsCollection& getAnimalResults(){return animalResults;}
    const ofxVision::PointsCollection& getBodyResults(){return bodyResults;}
    const ofxVision::Body3DCollection& getBody3DResults(){
#ifndef OFX_VISON_ENABLE_3D_BODY
        ofLogError("ofxVision::Body3DCollection& getBody3DResults") << "This feature is not enabled or available on your macOS version";
#endif
        return body3DResults;
    }
    const ofxVision::RectsCollection& getTextResults(){return textResults;}
    const ofxVision::PointsCollection& getHandResults(){return handResults;}
    const ofxVision::FaceDetectionsCollection& getFaceResults(){return faceResults;}
protected:
    // ofxVision::DetectionResultsType objectResultsType = ofxVision::DETECTION_RECT_COLLECTION;
#ifdef __OBJC__
    // ObjectRecognition * objectRecognition;
    Detection* detection;
#else
    // void * objectRecognition;
    void * detection;
#endif
    // ofxVision::RectsCollection rectObjectDetections;
    ofxVision::RectsCollection animalResults;
    ofxVision::PointsCollection bodyResults;
    ofxVision::Body3DCollection body3DResults;
    ofxVision::RectsCollection textResults;
    ofxVision::PointsCollection handResults;
    ofxVision::FaceDetectionsCollection faceResults;
};
//#endif
//
//  ofxVisionDetection.m
//  example
//
//  Created by Roy Macdonald on 20-03-24.
//
#include "ofxVisionDetection.h"
#include "ofxVisionHelpers.h"
#pragma clang diagnostic ignored "-Wunguarded-availability"
int getBodyPoseIDfromKey(VNHumanBodyPoseObservationJointName joint_name)
{
    if (joint_name == VNHumanBodyPoseObservationJointNameNose) { return BODY_NOSE; }
    else if (joint_name == VNHumanBodyPoseObservationJointNameLeftEye) { return BODY_LEFTEYE; }
    else if (joint_name == VNHumanBodyPoseObservationJointNameRightEye) { return BODY_RIGHTEYE; }
    else if (joint_name == VNHumanBodyPoseObservationJointNameLeftEar) { return BODY_LEFTEAR; }
    else if (joint_name == VNHumanBodyPoseObservationJointNameRightEar) { return BODY_RIGHTEAR; }
    else if (joint_name == VNHumanBodyPoseObservationJointNameLeftWrist) { return BODY_LEFTWRIST; }
    else if (joint_name == VNHumanBodyPoseObservationJointNameRightWrist) { return BODY_RIGHTWRIST; }
    else if (joint_name == VNHumanBodyPoseObservationJointNameLeftElbow) { return BODY_LEFTELBOW; }
    else if (joint_name == VNHumanBodyPoseObservationJointNameRightElbow) { return BODY_RIGHTELBOW; }
    else if (joint_name == VNHumanBodyPoseObservationJointNameLeftShoulder) { return BODY_LEFTSHOULDER; }
    else if (joint_name == VNHumanBodyPoseObservationJointNameRightShoulder) { return BODY_RIGHTSHOULDER; }
    else if (joint_name == VNHumanBodyPoseObservationJointNameLeftHip) { return BODY_LEFTHIP; }
    else if (joint_name == VNHumanBodyPoseObservationJointNameRightHip) { return BODY_RIGHTHIP; }
    else if (joint_name == VNHumanBodyPoseObservationJointNameLeftKnee) { return BODY_LEFTKNEE; }
    else if (joint_name == VNHumanBodyPoseObservationJointNameRightKnee) { return BODY_RIGHTKNEE; }
    else if (joint_name == VNHumanBodyPoseObservationJointNameLeftAnkle) { return BODY_LEFTANKLE; }
    else if (joint_name == VNHumanBodyPoseObservationJointNameRightAnkle) { return BODY_RIGHTANKLE; }
    else { return BODY_N_PART; }
}
#ifdef OFX_VISON_ENABLE_3D_BODY
int getBodyPose3DIDfromKey(VNHumanBodyPose3DObservationJointName joint_name)
{
    if (joint_name == VNHumanBodyPose3DObservationJointNameTopHead) { return BODY3D_TOPHEAD; }
    else if (joint_name == VNHumanBodyPose3DObservationJointNameCenterHead) { return BODY3D_CENTERHEAD; }
    else if (joint_name == VNHumanBodyPose3DObservationJointNameCenterShoulder) { return BODY3D_CENTERSHOULDER; }
    else if (joint_name == VNHumanBodyPose3DObservationJointNameLeftShoulder) { return BODY3D_LEFTSHOULDER; }
    else if (joint_name == VNHumanBodyPose3DObservationJointNameRightShoulder) { return BODY3D_RIGHTSHOULDER; }
    else if (joint_name == VNHumanBodyPose3DObservationJointNameLeftElbow) { return BODY3D_LEFTELBOW; }
    else if (joint_name == VNHumanBodyPose3DObservationJointNameRightElbow) { return BODY3D_RIGHTELBOW; }
    else if (joint_name == VNHumanBodyPose3DObservationJointNameLeftWrist) { return BODY3D_LEFTWRIST; }
    else if (joint_name == VNHumanBodyPose3DObservationJointNameRightWrist) { return BODY3D_RIGHTWRIST; }
    else if (joint_name == VNHumanBodyPose3DObservationJointNameLeftHip) { return BODY3D_LEFTHIP; }
    else if (joint_name == VNHumanBodyPose3DObservationJointNameRightHip) { return BODY3D_RIGHTHIP; }
    else if (joint_name == VNHumanBodyPose3DObservationJointNameLeftKnee) { return BODY3D_LEFTKNEE; }
    else if (joint_name == VNHumanBodyPose3DObservationJointNameRightKnee) { return BODY3D_RIGHTKNEE; }
    else if (joint_name == VNHumanBodyPose3DObservationJointNameLeftAnkle) { return BODY3D_LEFTANKLE; }
    else if (joint_name == VNHumanBodyPose3DObservationJointNameRightAnkle) { return BODY3D_RIGHTANKLE; }
    else if (joint_name == VNHumanBodyPose3DObservationJointNameRoot) { return BODY3D_ROOT; }
    else if (joint_name == VNHumanBodyPose3DObservationJointNameSpine) { return BODY3D_SPINE; }
    else { return BODY3D_N_PART; }
}
#endif
int getHandPoseIDfromKey(VNHumanHandPoseObservationJointName joint_name)
{
    if (joint_name == VNHumanHandPoseObservationJointNameWrist) { return HAND_WRIST; }
    else if (joint_name == VNHumanHandPoseObservationJointNameThumbCMC) { return HAND_THUMB0; }
    else if (joint_name == VNHumanHandPoseObservationJointNameThumbMP) { return HAND_THUMB1; }
    else if (joint_name == VNHumanHandPoseObservationJointNameThumbIP) { return HAND_THUMB2; }
    else if (joint_name == VNHumanHandPoseObservationJointNameThumbTip) { return HAND_THUMB3; }
    else if (joint_name == VNHumanHandPoseObservationJointNameIndexMCP) { return HAND_INDEX0; }
    else if (joint_name == VNHumanHandPoseObservationJointNameIndexPIP) { return HAND_INDEX1; }
    else if (joint_name == VNHumanHandPoseObservationJointNameIndexDIP) { return HAND_INDEX2; }
    else if (joint_name == VNHumanHandPoseObservationJointNameIndexTip) { return HAND_INDEX3; }
    else if (joint_name == VNHumanHandPoseObservationJointNameMiddleMCP) { return HAND_MIDDLE0; }
    else if (joint_name == VNHumanHandPoseObservationJointNameMiddlePIP) { return HAND_MIDDLE1; }
    else if (joint_name == VNHumanHandPoseObservationJointNameMiddleDIP) { return HAND_MIDDLE2; }
    else if (joint_name == VNHumanHandPoseObservationJointNameMiddleTip) { return HAND_MIDDLE3; }
    else if (joint_name == VNHumanHandPoseObservationJointNameRingMCP) { return HAND_RING0; }
    else if (joint_name == VNHumanHandPoseObservationJointNameRingPIP) { return HAND_RING1; }
    else if (joint_name == VNHumanHandPoseObservationJointNameRingDIP) { return HAND_RING2; }
    else if (joint_name == VNHumanHandPoseObservationJointNameRingTip) { return HAND_RING3; }
    else if (joint_name == VNHumanHandPoseObservationJointNameLittleMCP) { return HAND_PINKY0; }
    else if (joint_name == VNHumanHandPoseObservationJointNameLittlePIP) { return HAND_PINKY1; }
    else if (joint_name == VNHumanHandPoseObservationJointNameLittleDIP) { return HAND_PINKY2; }
    else if (joint_name == VNHumanHandPoseObservationJointNameLittleTip) { return HAND_PINKY3; }
    else { return HAND_N_PART; }
}
void setPoint(ofxVision::PointsDetection& points, const size_t& index, VNRecognizedPoint * point){
    points.setPoint(index, point.location.x, point.location.y, point.confidence);
}
void setPoint(ofxVision::Body3DDetection& points, const size_t& index, VNPoint * point){
    points.setPoint(index, point.x, point.y, 0.0f);
}
#ifdef OFX_VISON_ENABLE_3D_BODY
void setPoint3D(ofxVision::Body3DDetection& points, const size_t& index, VNHumanBodyRecognizedPoint3D* point){
    glm::vec3 position = glm::vec3(point.position.columns[3].x, point.position.columns[3].y, point.position.columns[3].z);
    points.setGlobalPosition(index, position);
}
#endif
void processAnimalResults(NSArray* source, ofxVision::RectsCollection& dest){
    dest.detections.clear();
    for(VNRecognizedObjectObservation *observation in source){
        dest.detections.push_back(ofxVision::RectDetection(ofxVisionHelper::toOf(observation.boundingBox), observation.confidence));
        // concatenate all candidate labels into a single comma-separated string
        for (int i = 0; i < observation.labels.count; i++){
            std::string text = [observation.labels[i].identifier UTF8String];
            if(i < observation.labels.count - 1){
                text += ", ";
            }
            dest.detections.back().label += text;
        }
    }
}
//------------------------------------------------------------------------------------------------------------------------
void processFaceResults(NSArray* source, ofxVision::FaceDetectionsCollection& dest){
    auto & faces = dest.detections;
    faces.clear();
    for(VNFaceObservation *observation in source){
        auto r = ofxVisionHelper::toOf(observation.boundingBox);
        faces.push_back({});
        faces.back().orientation.x = 0; // pitch is left at 0; yaw and roll come from the observation
        faces.back().orientation.y = [observation.yaw floatValue];
        faces.back().orientation.z = [observation.roll floatValue];
        VNFaceLandmarkRegion2D* landmarks = observation.landmarks.allPoints;
        const CGPoint * points = landmarks.normalizedPoints;
        for (int i = 0; i < landmarks.pointCount; i++){
            // landmark points are normalized to the face rect with a bottom-left origin,
            // so flip Y while mapping them into the (top-left origin) bounding rect;
            // the z component is repurposed to store Vision's per-point precision estimate
            faces.back().points[i].x = points[i].x * r.width + r.x;
            faces.back().points[i].y = (1 - points[i].y) * r.height + r.y;
            faces.back().points[i].z = [landmarks.precisionEstimatesPerPoint[i] floatValue];
        }
        faces.back().updateRect();
        faces.back().score = observation.confidence;
    }
}
//------------------------------------------------------------------------------------------------------------------------
void processBodyResults(NSArray* source, ofxVision::PointsCollection& dest){
    NSError *err;
    dest.detections.clear();
    for(VNHumanBodyPoseObservation *observation in source){
        NSDictionary <VNHumanBodyPoseObservationJointName, VNRecognizedPoint *> *allPts = [observation recognizedPointsForJointsGroupName:VNHumanBodyPoseObservationJointsGroupNameAll error:&err];
        if(err){
            NSLog(@"ofxVisionResults::processBodyResults Error: %@", [err localizedDescription]);
            continue;
        }
        dest.detections.emplace_back(ofxVision::PointsDetection(BODY_N_PART));
        auto & body = dest.detections.back();
        body.score = observation.confidence;
        for(VNHumanBodyPoseObservationJointName key in allPts)
        {
            VNRecognizedPoint* point = [allPts objectForKey:key];
            setPoint(body, getBodyPoseIDfromKey(key), point);
        }
        body.updateRect();
    }
}
#ifdef OFX_VISON_ENABLE_3D_BODY
//------------------------------------------------------------------------------------------------------------------------
void processBody3DResults(NSArray* source, ofxVision::Body3DCollection& dest){
    NSError *err;
    dest.detections.clear();
    for(VNHumanBodyPose3DObservation *observation in source){
        NSDictionary <VNHumanBodyPose3DObservationJointName, VNHumanBodyRecognizedPoint3D *> *allPts = [observation recognizedPointsForJointsGroupName:VNHumanBodyPose3DObservationJointsGroupNameAll error:&err];
        if(err){
            NSLog(@"ofxVisionResults::processBody3DResults Error: %@", [err localizedDescription]);
            continue;
        }
        dest.detections.emplace_back(ofxVision::Body3DDetection());
        auto & body = dest.detections.back();
        body.score = observation.confidence;
        body.bodyHeight = observation.bodyHeight;
        // negate the Y and Z components to convert the camera transform from Vision's convention into oF's
        simd_float4x4 camera_pos = observation.cameraOriginMatrix;
        // glm::vec3 position = glm::vec3(camera_pos.columns[3].x, camera_pos.columns[3].y, -camera_pos.columns[3].z);
        glm::vec3 position = glm::vec3(camera_pos.columns[3].x, -camera_pos.columns[3].y, -camera_pos.columns[3].z);
        glm::vec3 x_axis = glm::vec3(camera_pos.columns[0].x, -camera_pos.columns[0].y, -camera_pos.columns[0].z);
        glm::vec3 y_axis = glm::vec3(camera_pos.columns[1].x, -camera_pos.columns[1].y, -camera_pos.columns[1].z);
        glm::vec3 z_axis = glm::vec3(camera_pos.columns[2].x, -camera_pos.columns[2].y, -camera_pos.columns[2].z);
        glm::vec3 scale = glm::vec3(glm::length(x_axis), glm::length(y_axis), glm::length(z_axis)); // scale is always 1
        glm::mat3 rotation_mat = glm::mat3(x_axis / scale.x, y_axis / scale.y, z_axis / scale.z);
        glm::quat quaternion = glm::quat_cast(rotation_mat);
        body.camera.setGlobalPosition(position);
        body.camera.setGlobalOrientation(quaternion);
        for(VNHumanBodyPose3DObservationJointName key in allPts)
        {
            VNPoint* point2d = [observation pointInImageForJointName:key error:&err];
            setPoint(body, getBodyPose3DIDfromKey(key), point2d);
            VNHumanBodyRecognizedPoint3D* point3d = [allPts objectForKey:key];
            NSString* parent = point3d.parentJoint;
            body.setParent(getBodyPose3DIDfromKey(key), getBodyPose3DIDfromKey(parent));
            setPoint3D(body, getBodyPose3DIDfromKey(key), point3d);
        }
        body.updateRect();
    }
}
void processBody3DResults_2(NSArray* source, ofxVision::Body3DCollection& dest) {
    NSError *err;
    dest.detections.clear();
    for(VNHumanBodyPose3DObservation *observation in source) {
        NSDictionary <VNHumanBodyPose3DObservationJointName, VNHumanBodyRecognizedPoint3D *> *allPts = [observation recognizedPointsForJointsGroupName:VNHumanBodyPose3DObservationJointsGroupNameAll error:&err];
        if(err) {
            NSLog(@"ofxVisionResults::processBody3DResults Error: %@", [err localizedDescription]);
            continue;
        }
        dest.detections.emplace_back(ofxVision::Body3DDetection());
        auto & body = dest.detections.back();
        body.score = observation.confidence;
        body.bodyHeight = observation.bodyHeight;
        // Get camera transform from root joint
        simd_float4x4 camera_pos = observation.cameraOriginMatrix;
        glm::vec3 position = glm::vec3(camera_pos.columns[3].x, -camera_pos.columns[3].y, -camera_pos.columns[3].z);
        glm::vec3 x_axis = glm::vec3(camera_pos.columns[0].x, -camera_pos.columns[0].y, -camera_pos.columns[0].z);
        glm::vec3 y_axis = glm::vec3(camera_pos.columns[1].x, -camera_pos.columns[1].y, -camera_pos.columns[1].z);
        glm::vec3 z_axis = glm::vec3(camera_pos.columns[2].x, -camera_pos.columns[2].y, -camera_pos.columns[2].z);
        glm::vec3 scale = glm::vec3(glm::length(x_axis), glm::length(y_axis), glm::length(z_axis));
        glm::mat3 rotation_mat = glm::mat3(x_axis / scale.x, y_axis / scale.y, z_axis / scale.z);
        glm::quat quaternion = glm::quat_cast(rotation_mat);
        body.camera.setGlobalPosition(position);
        body.camera.setGlobalOrientation(quaternion);
        // Create inverse camera transform to convert from root-relative to camera-relative
        glm::mat4 cameraTransform = glm::translate(glm::mat4(1.0f), position) * glm::toMat4(quaternion);
        glm::mat4 inverseCameraTransform = glm::inverse(cameraTransform);
        for(VNHumanBodyPose3DObservationJointName key in allPts) {
            // Get 2D point for visualization
            VNPoint* point2d = [observation pointInImageForJointName:key error:&err];
            setPoint(body, getBodyPose3DIDfromKey(key), point2d);
            // Get root-relative 3D position
            VNHumanBodyRecognizedPoint3D* point3d = [allPts objectForKey:key];
            simd_float4x4 jointTransform = point3d.position;
            glm::vec3 rootRelativePos = glm::vec3(
                jointTransform.columns[3].x,
                jointTransform.columns[3].y,
                jointTransform.columns[3].z
            );
            // Convert from root-relative to camera-relative position
            glm::vec4 pos4 = glm::vec4(rootRelativePos, 1.0);
            glm::vec4 cameraRelativePos = inverseCameraTransform * pos4;
            // Mirror the X coordinate to match the camera view
            cameraRelativePos.x = -cameraRelativePos.x;
            body.setGlobalPosition(getBodyPose3DIDfromKey(key), glm::vec3(cameraRelativePos));
            // Set parent relationship
            NSString* parent = point3d.parentJoint;
            body.setParent(getBodyPose3DIDfromKey(key), getBodyPose3DIDfromKey(parent));
        }
        body.updateRect();
    }
}
void processBody3DResults_relative(NSArray* source, ofxVision::Body3DCollection& dest) {
    NSError *err;
    dest.detections.clear();
    for(VNHumanBodyPose3DObservation *observation in source) {
        NSDictionary <VNHumanBodyPose3DObservationJointName, VNHumanBodyRecognizedPoint3D *> *allPts = [observation recognizedPointsForJointsGroupName:VNHumanBodyPose3DObservationJointsGroupNameAll error:&err];
        if(err) {
            NSLog(@"ofxVisionResults::processBody3DResults Error: %@", [err localizedDescription]);
            continue;
        }
        dest.detections.emplace_back(ofxVision::Body3DDetection());
        auto & body = dest.detections.back();
        body.score = observation.confidence;
        body.bodyHeight = observation.bodyHeight;
        // Set up camera transform
        simd_float4x4 camera_pos = observation.cameraOriginMatrix;
        glm::vec3 position = glm::vec3(camera_pos.columns[3].x, camera_pos.columns[3].y, camera_pos.columns[3].z);
        glm::vec3 x_axis = glm::vec3(camera_pos.columns[0].x, camera_pos.columns[0].y, camera_pos.columns[0].z);
        glm::vec3 y_axis = glm::vec3(camera_pos.columns[1].x, camera_pos.columns[1].y, camera_pos.columns[1].z);
        glm::vec3 z_axis = glm::vec3(camera_pos.columns[2].x, camera_pos.columns[2].y, camera_pos.columns[2].z);
        glm::vec3 scale = glm::vec3(glm::length(x_axis), glm::length(y_axis), glm::length(z_axis));
        glm::mat3 rotation_mat = glm::mat3(x_axis / scale.x, y_axis / scale.y, z_axis / scale.z);
        glm::quat quaternion = glm::quat_cast(rotation_mat);
        body.camera.setGlobalPosition(position);
        body.camera.setGlobalOrientation(quaternion);
        for(VNHumanBodyPose3DObservationJointName key in allPts) {
            // Get 2D point for visualization
            VNPoint* point2d = [observation pointInImageForJointName:key error:&err];
            setPoint(body, getBodyPose3DIDfromKey(key), point2d);
            // Get camera-relative 3D position
            simd_float4x4 relativeTransform;
            if ([observation getCameraRelativePosition:&relativeTransform forJointName:key error:&err]) {
                glm::vec3 jointPosition = glm::vec3(
                    relativeTransform.columns[3].x,  // keep X as is
                    relativeTransform.columns[3].y,  // keep Y as is
                    -relativeTransform.columns[3].z  // negate Z to move in front of the frustum
                );
                body.setGlobalPosition(getBodyPose3DIDfromKey(key), jointPosition);
            } else if (err) {
                NSLog(@"Error getting camera relative position for joint: %@", [err localizedDescription]);
            }
            // Set parent relationship
            VNHumanBodyRecognizedPoint3D* point3d = [allPts objectForKey:key];
            NSString* parent = point3d.parentJoint;
            body.setParent(getBodyPose3DIDfromKey(key), getBodyPose3DIDfromKey(parent));
        }
        body.updateRect();
    }
}
#endif
//------------------------------------------------------------------------------------------------------------------------
void processHandResults(NSArray* source, ofxVision::PointsCollection& dest){
    NSError *err;
    dest.detections.clear();
    for(VNHumanHandPoseObservation *observation in source){
        NSDictionary <VNHumanHandPoseObservationJointName, VNRecognizedPoint *> *allPts = [observation recognizedPointsForJointsGroupName:VNHumanHandPoseObservationJointsGroupNameAll error:&err];
        if(err){
            NSLog(@"ofxVisionResults::processHandResults Error: %@", [err localizedDescription]);
            continue;
        }
        dest.detections.push_back(ofxVision::PointsDetection(HAND_N_PART));
        auto & hand = dest.detections.back();
        hand.score = observation.confidence;
        for(VNHumanHandPoseObservationJointName key in allPts)
        {
            VNRecognizedPoint *point = [allPts objectForKey:key];
            setPoint(hand, getHandPoseIDfromKey(key), point);
        }
        hand.updateRect();
    }
}
//------------------------------------------------------------------------------------------------------------------------
void processTextResults(NSArray* source, ofxVision::RectsCollection& dest){
    dest.detections.clear();
    for(VNRecognizedTextObservation *observation in source){
        dest.detections.push_back(ofxVision::RectDetection(ofxVisionHelper::toOf(observation.boundingBox), observation.confidence));
        // guard against an empty candidate list before indexing into it
        NSArray<VNRecognizedText *> *candidates = [observation topCandidates:1];
        if(candidates.count > 0){
            dest.detections.back().label += std::string([candidates[0].string UTF8String]);
        }
    }
}
//------------------------------------------------------------------------------------------------------------------------
ofxVisionDetection::ofxVisionDetection(){
    detection = [[Detection alloc] init];
}
//
//bool ofxVisionDetection::loadModel(const std::string& modelPath){
//
//    NSString * s = [NSString stringWithUTF8String:modelPath.c_str()];
//    objectRecognition = [[ObjectRecognition alloc] initWithModelPath: s];
//
//    return (objectRecognition != nil);
//}
void ofxVisionDetection::detect(ofPixels &pix)
{
    // cout << "ofxVisionDetection::detect...";
    CGImageRef image = ofxVisionHelper::CGImageRefFromOfPixels(pix, (int)pix.getWidth(), (int)pix.getHeight(), (int)pix.getNumChannels());
    if(recognizeObjects.get()) {
        detect_(image);
    }
    // if(!objectRecognition){
    //     ofLogError("ofxVisionDetection::detect", "model not loaded. Call ofxVisionDetection::loadModel(...) before calling this method");
    //     // return;
    // }else if(recognizeObjects.get() && [objectRecognition process:image]){
    //     rectObjectDetections.detections.clear();
    //     for(VNRecognizedObjectObservation *observation in [objectRecognition results]){
    //         ofxVision::RectDetection det(ofxVisionHelper::toOf(observation.boundingBox), 0);
    //         // VNClassificationObservation
    //         if (observation.labels.count > 0){
    //             det.label = [observation.labels[0].identifier UTF8String];
    //             det.score = observation.labels[0].confidence;
    //         }
    //         rectObjectDetections.detections.push_back(det);
    //     }
    // }
    if( detectAnimal.get() || detectText.get() || detectHand.get() || detectFace.get() || detectBody.get()
#ifdef OFX_VISON_ENABLE_3D_BODY
       || detectBody3D.get()
#endif
       ){
        [detection detect:image
             detectAnimal: this->detectAnimal.get()
               detectText: this->detectText.get()
               detectHand: this->detectHand.get()
               detectFace: this->detectFace.get()
               detectBody: this->detectBody.get()
#ifdef OFX_VISON_ENABLE_3D_BODY
             detectBody3D: this->detectBody3D.get()
#endif
        ];
        if( this->detectAnimal.get() ) {processAnimalResults([detection animalResults], animalResults);}
        if( this->detectText.get() )   {processTextResults([detection textResults], textResults);}
        if( this->detectHand.get() )   {processHandResults([detection handResults], handResults);}
        if( this->detectFace.get() )   {processFaceResults([detection faceResults], faceResults);}
        if( this->detectBody.get() )   {processBodyResults([detection bodyResults], bodyResults);}
#ifdef OFX_VISON_ENABLE_3D_BODY
        if( this->detectBody3D.get() ) {processBody3DResults([detection body3DResults], body3DResults);}
        // if( this->detectBody3D.get() ) {processBody3DResults_relative([detection body3DResults], body3DResults);}
        // if( this->detectBody3D.get() ) {processBody3DResults_2([detection body3DResults], body3DResults);}
#endif
    }
    CGImageRelease(image);
}
void ofxVisionDetection::draw(const ofRectangle& rect){
    if(recognizeObjects.get())     { detectionResults.draw(rect, true, true); }
    if( this->detectAnimal.get() ) { animalResults.draw(rect); }
    if( this->detectText.get() )   { textResults.draw(rect); }
    if( this->detectHand.get() )   { handResults.draw(rect); }
    if( this->detectFace.get() )   { faceResults.draw(rect); }
    if( this->detectBody.get() )   { bodyResults.draw(rect); }
#ifdef OFX_VISON_ENABLE_3D_BODY
    if( this->detectBody3D.get() ) { body3DResults.draw(rect); }
#endif
}
//------------------------------------------------------------------------------------------------------------------------
template<typename resultsType>
bool ofxVisionObjectDetection_<resultsType>::loadModel(const std::string& modelPath, bool _loadLabels) {
    __block bool success = false;
    __block ObjectRecognition* newObjectRecognition = nil;
    // Create a dispatch group to synchronize
    dispatch_group_t group = dispatch_group_create();
    dispatch_group_enter(group);
    // Ensure model loading happens on the main thread.
    // Note: if loadModel itself is called from the main thread, this block cannot
    // run until the wait below returns, so the call will fail with a timeout;
    // call loadModel from a worker thread.
    dispatch_async(dispatch_get_main_queue(), ^{
        @autoreleasepool {
            @try {
                // Clear existing model first
                objectRecognition = nil;
                // Load new model
                NSString* s = [NSString stringWithUTF8String:modelPath.c_str()];
                newObjectRecognition = [[ObjectRecognition alloc] initWithModelPath:s loadLabels:(_loadLabels ? YES : NO)];
                if (newObjectRecognition != nil) {
                    objectRecognition = newObjectRecognition;
                    success = true;
                    // Get image size requirements if available.
                    // getImgsz is expected to return a string like "[640, 640]";
                    // strip the surrounding brackets, then split on the comma.
                    NSString* NSimgsz = [objectRecognition getImgsz];
                    if(NSimgsz) {
                        std::string imgsz([NSimgsz UTF8String]);
                        imgsz = ofTrim(imgsz);
                        imgsz.pop_back();
                        imgsz = imgsz.substr(1);
                        auto imgszSplit = ofSplitString(imgsz, ",");
                        if(imgszSplit.size() == 2) {
                            int requiredInputWidth = ofToInt(imgszSplit[0]);
                            int requiredInputHeight = ofToInt(imgszSplit[1]);
                            requiredPixels.allocate(requiredInputWidth, requiredInputHeight, 3);
                            requiredPixels.set(0);
                        }
                    }
                }
            } @catch (NSException *exception) {
                NSLog(@"Exception during model loading: %@", exception);
                success = false;
            }
        }
        dispatch_group_leave(group);
    });
    // Wait for completion with a 5 second timeout
    dispatch_time_t timeout = dispatch_time(DISPATCH_TIME_NOW, 5 * NSEC_PER_SEC);
    if (dispatch_group_wait(group, timeout) == 0) {
        return success;
    } else {
        ofLogError("ofxVisionObjectDetection") << "Model loading timed out";
        return false;
    }
}
// Under ARC no manual release is needed; dropping the reference is enough.
template<typename resultsType>
ofxVisionObjectDetection_<resultsType>::~ofxVisionObjectDetection_() {
    objectRecognition = nil; // ARC releases the underlying Objective-C object
}
template<typename resultsType>
void ofxVisionObjectDetection_<resultsType>::detect(ofVideoGrabber& vidGrabber){
    ofPixels* pixPtr = nullptr;
    if(requiredPixels.isAllocated() && requiredPixels.getWidth() > 0 && requiredPixels.getHeight() > 0){
        auto& pix = vidGrabber.getPixels();
        if(!(pix.getWidth() == requiredPixels.getWidth() && pix.getHeight() == requiredPixels.getHeight())){
            if(pix.getWidth() == requiredPixels.getWidth() && pix.getHeight() < requiredPixels.getHeight()){
                // same width but shorter: letterbox the frame vertically into requiredPixels
                requiredPixels.set(0);
                int y = (requiredPixels.getHeight() - pix.getHeight()) / 2;
                pix.pasteInto(requiredPixels, 0, y);
                pixPtr = &requiredPixels;
                bRemapRects = true;
                pixTranslation = {0, y / float(requiredPixels.getHeight())};
                pixScaling = {1, float(requiredPixels.getHeight()) / float(pix.getHeight())};
                // std::cout << "pastePix\n";
            }else{
                // otherwise scale the frame to fit via an FBO and read the pixels back
                ofRectangle srcRect(0, 0, pix.getWidth(), pix.getHeight());
                ofRectangle dstRect(0, 0, requiredPixels.getWidth(), requiredPixels.getHeight());
                srcRect.scaleTo(dstRect);
                if(!helperFbo){
                    helperFbo = std::make_shared<ofFbo>();
                    helperFbo->allocate(requiredPixels.getWidth(), requiredPixels.getHeight());
                }
                if(helperFbo){
                    helperFbo->begin();
                    ofClear(0, 255);
                    vidGrabber.draw(srcRect);
                    helperFbo->end();
                    helperFbo->readToPixels(requiredPixels);
                    pixPtr = &requiredPixels;
                    // std::cout << "readToPix\n";
                    pixTranslation = {srcRect.x / dstRect.getWidth(), srcRect.y / dstRect.getHeight()};
                    pixScaling = {requiredPixels.getWidth() / srcRect.getWidth(), requiredPixels.getHeight() / srcRect.getHeight()};
                    bRemapRects = true;
                }else{
                    ofLogWarning("ofxVisionObjectDetection_<resultsType>::detect") << "Invalid helperFbo";
                }
            }
        }else{
            // frame already matches the model's required input size
            pixPtr = &pix;
            // this->detect(pix);
        }
    }else{
        // no input-size requirement known; use the frame as-is
        pixPtr = &vidGrabber.getPixels();
    }
    // if(bRemapRects){
    //     std::cout << "pixTranslation: " << pixTranslation << "\n";
    //     std::cout << "pixScaling: " << pixScaling << "\n";
    // }
    if(pixPtr){
        this->detect(*pixPtr);
    }else{
        ofLogError("ofxVisionObjectDetection_<resultsType>::detect") << "pixPtr is NULL";
    }
}
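// Worked example of the remap bookkeeping above (illustrative numbers, not from
// the gist): letterboxing a 1280x720 frame into a 1280x1280 model input gives
//     y = (1280 - 720) / 2 = 280
//     pixTranslation = {0, 280 / 1280.f}  -> {0, 0.21875}
//     pixScaling     = {1, 1280 / 720.f}  -> {1, ~1.778}
// processResults() then maps each normalized detection rect back into the
// original frame as bb.pos = (bb.pos - pixTranslation) * pixScaling and
// bb.size *= pixScaling; e.g. a box centered at normalized y = 0.5 in the
// padded input maps back to (0.5 - 0.21875) * 1.778 ~= 0.5, the frame center.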
template<typename resultsType>
void ofxVisionObjectDetection_<resultsType>::detect(ofPixels & pix){
    // cout << "ofxVisionDetection::detect...";
    CGImageRef image = ofxVisionHelper::CGImageRefFromOfPixels(pix, (int)pix.getWidth(), (int)pix.getHeight(), (int)pix.getNumChannels());
    detect_(image);
    CGImageRelease(image);
}
template<typename resultsType>
void ofxVisionObjectDetection_<resultsType>::detect_(CGImageRef image) {
    if (!objectRecognition) return;
    // Ensure processing happens on the main thread
    if ([NSThread isMainThread]) {
        if ([objectRecognition process:image]) {
            processResults([objectRecognition results]);
        }
    } else {
        dispatch_sync(dispatch_get_main_queue(), ^{
            @autoreleasepool {
                if ([objectRecognition process:image]) {
                    processResults([objectRecognition results]);
                }
            }
        });
    }
}
//
//template<typename resultsType>
//void ofxVisionObjectDetection_<resultsType>::handleYOLOOutput(VNCoreMLFeatureValueObservation *observation, ofxVision::RectsCollection& collection, float scale, bool bBreakpoint) {
//    // Get the feature value (output) from the observation
//    MLFeatureValue *featureValue = observation.featureValue;
//
//    // Check if it is an MLMultiArray
//    if (featureValue.type == MLFeatureTypeMultiArray) {
//        MLMultiArray *multiArray = featureValue.multiArrayValue;
//        for(NSNumber * stride in multiArray.strides){
//            std::cout << "Stride " << stride.intValue << "\n";
//        }
//
//        // Get the dimensions of the output array
//        NSInteger numBoxes = [multiArray.shape[0] integerValue]; // N (number of bounding boxes)
//        NSInteger numAttributes = [multiArray.shape[1] integerValue]; // Typically 85 for YOLO
//
//        // Ensure the dimensions are as expected
//        if (numAttributes < 5) {
//            ofLogWarning("ofxVisionObjectDetection::handleYOLOOutput") << "Unexpected output dimensions.";
//            return;
//        }
//        if(bBreakpoint){
//            ofLogNotice("break");
//        }
//        // Access the raw data pointer (assuming the data type is Float32)
//        const float *dataPointer = (const float *)multiArray.dataPointer;
//
//        // Iterate through each bounding box
//        for (NSInteger i = 0; i < numBoxes; i++) {
//            // Extract values from the multi-array
//            float x = dataPointer[i * numAttributes + 0] / scale;
//            float y = dataPointer[i * numAttributes + 1] / scale;
//            float width = dataPointer[i * numAttributes + 2] / scale;
//            float height = dataPointer[i * numAttributes + 3] / scale;
//            float confidence = dataPointer[i * numAttributes + 4];
//
//            // Check if the confidence is above a threshold (e.g., 0.5)
//            if (confidence > 0.5) {
//                ofxVision::RectDetection det;
//                det.rect.set(x, y, width, height);
//                det.score = confidence;
//
//                //NSLog(@"Box %ld: x=%f, y=%f, width=%f, height=%f, confidence=%f", (long)i, x, y, width, height, confidence);
//
//                // Optionally extract class scores and find the highest one
////                if(objectRecognition){
////                    auto labels = [objectRecognition getLabels];
////                    if(labels){
//                float maxClassScore = 0.0;
//                NSInteger maxClassIndex = -1;
//                for (NSInteger j = 5; j < numAttributes; j++) {
//                    float classScore = dataPointer[i * numAttributes + j];
//                    if (classScore > maxClassScore) {
//                        maxClassScore = classScore;
//                        maxClassIndex = j - 5;
//                    }
//                }
//
//                if (maxClassIndex >= 0){ //} && maxClassIndex < [labels count]) {
//                    // NSLog(@"Detected class %ld with score %f", (long)maxClassIndex, maxClassScore);
////                    det.label = [[labels objectAtIndex:maxClassIndex] UTF8String];
//                    det.label = ofToString(maxClassIndex);
//                }
////                    }
////                }
//                collection.detections.push_back(det);
//            }
//        }
//    } else {
//        NSLog(@"Unexpected feature value type.");
//    }
//}
template<>
void ofxVisionObjectDetection_<ofxVision::RectsCollection>::draw(ofRectangle rect){
    // if(bRemapRects){
    //     rect.x -= pixTranslation.x;
    //     rect.y -= pixTranslation.y;
    //     rect.width *= pixScaling.x;
    //     rect.height *= pixScaling.y;
    // }
    detectionResults.draw(rect, true, true);
}
template<>
void ofxVisionObjectDetection_<ofxVision::LabelsCollection>::draw(ofRectangle rect){
    detectionResults.draw(rect.x, rect.y);
}
template<>
void ofxVisionObjectDetection_<ofxVision::RectsCollection>::processResults(NSArray* results){
    detectionResults.detections.clear();
    for (id object in results) {
        if ([object isKindOfClass:[VNRecognizedObjectObservation class]]) {
            // cout << "VNClassificationObservation\n";
            VNRecognizedObjectObservation *observation = object;
            auto bb = ofxVisionHelper::toOf(observation.boundingBox);
            if(bRemapRects){
                // undo the letterbox/scale applied to the input pixels in detect(ofVideoGrabber&)
                bb.x -= pixTranslation.x;
                bb.y -= pixTranslation.y;
                bb.x *= pixScaling.x;
                bb.y *= pixScaling.y;
                bb.width *= pixScaling.x;
                bb.height *= pixScaling.y;
            }
            ofxVision::RectDetection det(bb, 0);
            // VNClassificationObservation
            if (observation.labels.count > 0){
                det.label = [observation.labels[0].identifier UTF8String];
                det.score = observation.labels[0].confidence;
            }
            detectionResults.detections.push_back(det);
        }else if ([object isKindOfClass:[VNCoreMLFeatureValueObservation class]]) {
            VNCoreMLFeatureValueObservation* observation = object;
            cout << "observation: " << [observation.featureName UTF8String] << "\n";
            // handleYOLOOutput(observation, detectionResults, drawScale.get(), breakpoint.get());
            // info = iterateMultiArrayDynamically(observation.featureValue.multiArrayValue);
        }else if([object isMemberOfClass:[VNRecognizedObjectObservation class]]){
            // unreachable in practice: the isKindOfClass branch above already matches exact instances
            cout << "isMemberOfClass VNRecognizedObjectObservation" << endl;
            VNRecognizedObjectObservation *observation = (VNRecognizedObjectObservation *)object;
            cout << endl;
            cout << "observation.confidence " << observation.confidence << endl;
            NSLog(@"observation.boundingBox %@", NSStringFromRect(observation.boundingBox));
            NSLog(@"observation %@", object);
            cout << endl;
            auto bb = ofxVisionHelper::toOf(observation.boundingBox);
            if(bRemapRects){
                bb.x -= pixTranslation.x;
                bb.y -= pixTranslation.y;
                bb.x *= pixScaling.x;
                bb.y *= pixScaling.y;
                bb.width *= pixScaling.x;
                bb.height *= pixScaling.y;
            }
            ofxVision::RectDetection det(bb, 0);
            if (observation.labels.count > 0){
                det.label = [observation.labels[0].identifier UTF8String];
                det.score = observation.labels[0].confidence;
            }
            detectionResults.detections.push_back(det);
        }else{
            NSString *className = NSStringFromClass([object class]);
            cout << "NOT " << [className UTF8String] << "\n";
        }
    }
    // for(VNRecognizedObjectObservation *observation in results){
    // }
}
template<>
void ofxVisionObjectDetection_<ofxVision::LabelsCollection>::processResults(NSArray* results){
    detectionResults.detections.clear();
    for (id object in results) {
        if ([object isKindOfClass:[VNClassificationObservation class]]) {
            //cout << "VNClassificationObservation\n";
            VNClassificationObservation *observation = object;
            detectionResults.detections.push_back(ofxVision::LabelDetection([observation.identifier UTF8String], observation.confidence));
        }else if ([object isKindOfClass:[VNCoreMLFeatureValueObservation class]]) {
            // raw Core ML feature values are not handled for label collections
        }else if([object isMemberOfClass:[VNRecognizedObjectObservation class]]){
            cout << "isMemberOfClass VNRecognizedObjectObservation" << endl;
            VNRecognizedObjectObservation *observation = (VNRecognizedObjectObservation *)object;
            cout << endl;
            cout << "observation.confidence " << observation.confidence << endl;
            NSLog(@"observation.boundingBox %@", NSStringFromRect(observation.boundingBox));
            NSLog(@"observation %@", object);
            cout << endl;
            auto bb = ofxVisionHelper::toOf(observation.boundingBox);
            if(bRemapRects){
                bb.x -= pixTranslation.x;
                bb.y -= pixTranslation.y;
                bb.x *= pixScaling.x;
                bb.y *= pixScaling.y;
                bb.width *= pixScaling.x;
                bb.height *= pixScaling.y;
            }
            ofxVision::RectDetection det(bb, 0);
            if (observation.labels.count > 0){
                det.label = [observation.labels[0].identifier UTF8String];
                det.score = observation.labels[0].confidence;
            }
            detectionResults.detections.push_back(det);
        }else{
            NSString *className = NSStringFromClass([object class]);
            cout << "LabelsCollection NOT " << [className UTF8String] << "\n";
        }
    }
    // for(VNClassificationObservation *observation in results){
    //     detectionResults.detections.push_back(ofxVision::LabelDetection([observation.identifier UTF8String], observation.confidence));
    // }
}
template class ofxVisionObjectDetection_<ofxVision::LabelsCollection>;
template class ofxVisionObjectDetection_<ofxVision::RectsCollection>;
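For reference, the crash fix in loadModel() above reduces to one reusable pattern: run the Core ML load on the main queue inside @try/@autoreleasepool, and bound the wait with a timeout. A stripped-down sketch of that pattern, independent of ofxVision (loadGuarded and the tryLoad block are illustrative names; the main-thread fast path is an addition not present in loadModel above, which would otherwise wait out the full timeout when called from the main thread):

#import <Foundation/Foundation.h>

// tryLoad stands in for the actual model allocation,
// e.g. [[ObjectRecognition alloc] initWithModelPath:...].
static bool loadGuarded(bool (^tryLoad)(void)) {
    if ([NSThread isMainThread]) {
        // Already on the main thread: run inline. Dispatching to the main
        // queue and waiting below would block the queue and always time out.
        @try { return tryLoad(); }
        @catch (NSException *e) { NSLog(@"load failed: %@", e); return false; }
    }
    __block bool success = false;
    dispatch_group_t group = dispatch_group_create();
    dispatch_group_enter(group);
    dispatch_async(dispatch_get_main_queue(), ^{
        @autoreleasepool {
            @try { success = tryLoad(); }
            @catch (NSException *e) { NSLog(@"load failed: %@", e); }
        }
        dispatch_group_leave(group);
    });
    // Give up after 5 seconds rather than hanging the caller forever.
    dispatch_time_t timeout = dispatch_time(DISPATCH_TIME_NOW, 5 * NSEC_PER_SEC);
    return dispatch_group_wait(group, timeout) == 0 && success;
}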