simonbyrne · August 22, 2025 00:45
diff --git a/matx_alloc.h b/matx_alloc.h
 #include "matx.h"
 #include "holoscan/holoscan.hpp"
 #include <memory>
 #include <type_traits>
 #include <stdexcept>

 namespace holoscan {

 /**
 * @brief Holoscan-based replacement for MatX's raw_pointer_buffer
 * 
 * This class provides the same interface as MatX's raw_pointer_buffer but uses
 * Holoscan allocators for memory management. It integrates seamlessly with
 * MatX's basic_storage system.
 */
 template <typename T>
 class holoscan_pointer_buffer {
 public:
  using value_type = T;
  using iterator = T*;
  using citerator = T const*;

  /**
   * @brief Default constructor
   */
  holoscan_pointer_buffer() = default;

  /**
   * @brief Construct and allocate memory using Holoscan allocator
   * 
   * @param allocator Holoscan allocator to use
   * @param size Size in bytes to allocate
   * @param memory_type Memory storage type
   */
  holoscan_pointer_buffer(std::shared_ptr<Allocator> allocator,
                         size_t size,
                         MemoryStorageType memory_type = MemoryStorageType::kDevice)
      : allocator_(allocator), size_(size) {
    if (!allocator_) {
      throw std::invalid_argument("holoscan_pointer_buffer: allocator cannot be null");
    }
    
    nvidia::byte* raw_ptr = allocator_->allocate(size, memory_type);
    if (!raw_ptr) {
      throw std::bad_alloc();
    }
    
    T* ptr = reinterpret_cast<T*>(raw_ptr);
    ConfigureShared(ptr);
  }

  /**
   * @brief Copy constructor
   */
  holoscan_pointer_buffer(const holoscan_pointer_buffer& other) = default;

  /**
   * @brief Move constructor
   */
  holoscan_pointer_buffer(holoscan_pointer_buffer&& other) noexcept = default;

  /**
   * @brief Assignment operator
   */
  holoscan_pointer_buffer& operator=(const holoscan_pointer_buffer& other) = default;

  /**
   * @brief Move assignment operator
   */
  holoscan_pointer_buffer& operator=(holoscan_pointer_buffer&& other) noexcept = default;

  /**
   * @brief Get raw data pointer
   */
  __MATX_INLINE__ __MATX_HOST__ T* Data() const noexcept {
    return data_.get();
  }

  /**
   * @brief Get raw data pointer (lowercase for basic_storage compatibility)
   */
  __MATX_INLINE__ __MATX_HOST__ T* data() noexcept {
    return data_.get();
  }

  /**
   * @brief Get raw data pointer (const version for basic_storage compatibility)
   */
  __MATX_INLINE__ __MATX_HOST__ const T* data() const noexcept {
    return data_.get();
  }

  /**
   * @brief Get size in elements
   */
  __MATX_INLINE__ __MATX_HOST__ auto Size() const noexcept {
    return size_ / sizeof(T);
  }

  /**
   * @brief Get size in elements (lowercase for basic_storage compatibility)
   */
  __MATX_INLINE__ __MATX_HOST__ auto size() const noexcept {
    return size_ / sizeof(T);
  }

  /**
   * @brief Get capacity in elements (for basic_storage compatibility)
   */
  __MATX_INLINE__ __MATX_HOST__ auto capacity() const noexcept {
    return size_ / sizeof(T);
  }

  /**
   * @brief Get size in bytes
   */
  __MATX_INLINE__ __MATX_HOST__ auto Bytes() const noexcept {
    return size_;
  }

  /**
   * @brief Get iterator to beginning
   */
  __MATX_INLINE__ __MATX_HOST__ __MATX_DEVICE__ iterator begin() const noexcept {
    return data_.get();
  }

  /**
   * @brief Get iterator to end
   */
  __MATX_INLINE__ __MATX_HOST__ __MATX_DEVICE__ iterator end() const noexcept {
    return data_.get() + Size();
  }

  /**
   * @brief Get const iterator to beginning
   */
  __MATX_INLINE__ __MATX_HOST__ __MATX_DEVICE__ citerator cbegin() const noexcept {
    return data_.get();
  }

  /**
   * @brief Get const iterator to end
   */
  __MATX_INLINE__ __MATX_HOST__ __MATX_DEVICE__ citerator cend() const noexcept {
    return data_.get() + Size();
  }

  /**
   * @brief Get reference count
   */
  __MATX_INLINE__ __MATX_HOST__ auto use_count() const noexcept {
    return data_.use_count();
  }

  /**
   * @brief Swap with another buffer
   */
  friend void swap(holoscan_pointer_buffer& lhs, holoscan_pointer_buffer& rhs) noexcept {
    std::swap(lhs.allocator_, rhs.allocator_);
    std::swap(lhs.data_, rhs.data_);
    std::swap(lhs.size_, rhs.size_);
  }

 private:
  std::shared_ptr<Allocator> allocator_;
  std::shared_ptr<T> data_;
  size_t size_ = 0;

  void ConfigureShared(T* ptr) {
    // Always use Holoscan allocator for deallocation since we always own the memory
    data_ = std::shared_ptr<T>(ptr, [allocator = allocator_](auto p) {
      if (allocator && p) {
        allocator->free(reinterpret_cast<nvidia::byte*>(p));
      }
    });
  }
 };

 } // namespace holoscan

 namespace matx {

 /**
 * @brief Create a MatX tensor using a Holoscan allocator with C array shape
 * 
 * This creates a tensor that uses the Holoscan allocator for memory management.
 * The tensor will properly clean up memory when it goes out of scope.
 * 
 * @tparam T Element type of the tensor
 * @tparam RANK Number of dimensions
 * @param allocator Holoscan allocator instance
 * @param shape Shape of tensor as C array
 * @param memory_type Memory storage type (default: device memory)
 * @param stream CUDA stream (optional, defaults to 0)
 * @return New tensor allocated with the Holoscan allocator
 */
 template <typename T, int RANK>
 auto make_tensor(std::shared_ptr<holoscan::Allocator> allocator,
                 const index_t (&shape)[RANK],
                 holoscan::MemoryStorageType memory_type = holoscan::MemoryStorageType::kDevice,
                 cudaStream_t stream = 0) {
  // Create descriptor for the tensor shape
  DefaultDescriptor<RANK> desc{shape};
  
  // Calculate total size needed
  size_t size = static_cast<size_t>(desc.TotalSize()) * sizeof(T);
  
  // Create Holoscan pointer buffer that handles allocation/deallocation
  holoscan::holoscan_pointer_buffer<T> buffer(allocator, size, memory_type);
  
  // Create basic_storage with the Holoscan buffer
  basic_storage<holoscan::holoscan_pointer_buffer<T>> storage{std::move(buffer)};
  
  // Create tensor with the storage
  return tensor_t<T, RANK, decltype(storage), decltype(desc)>{std::move(storage), std::move(desc)};
 }

 /**
 * @brief Create a MatX tensor using a Holoscan allocator with container shape
 * 
 * @tparam T Element type of the tensor
 * @tparam ShapeType Container type for shape (e.g., std::array, std::vector)
 * @param allocator Holoscan allocator instance
 * @param shape Shape of tensor as container
 * @param memory_type Memory storage type (default: device memory)
 * @param stream CUDA stream (optional, defaults to 0)
 * @return New tensor allocated with the Holoscan allocator
 */
 template <typename T, typename ShapeType,
          std::enable_if_t<!std::is_array_v<std::remove_cv_t<std::remove_reference_t<ShapeType>>>, bool> = true>
 auto make_tensor(std::shared_ptr<holoscan::Allocator> allocator,
                 ShapeType&& shape,
                 holoscan::MemoryStorageType memory_type = holoscan::MemoryStorageType::kDevice,
                 cudaStream_t stream = 0) {
  constexpr int rank = static_cast<int>(std::tuple_size_v<std::remove_cv_t<std::remove_reference_t<ShapeType>>>);
  DefaultDescriptor<rank> desc{std::forward<ShapeType>(shape)};
  
  // Calculate total size needed
  size_t size = static_cast<size_t>(desc.TotalSize()) * sizeof(T);
  
  // Create Holoscan pointer buffer that handles allocation/deallocation
  holoscan::holoscan_pointer_buffer<T> buffer(allocator, size, memory_type);
  
  // Create basic_storage with the Holoscan buffer
  basic_storage<holoscan::holoscan_pointer_buffer<T>> storage{std::move(buffer)};
  
  // Create tensor with the storage
  return tensor_t<T, rank, decltype(storage), decltype(desc)>{std::move(storage), std::move(desc)};
 }

 /**
 * @brief Allocate Holoscan-backed storage for an already declared tensor
 *
 * Builds a fresh tensor of TensorType's value type and rank through the
 * C-array make_tensor overload, then passes it to tensor.Shallow().
 * NOTE(review): presumably Shallow() makes @p tensor refer to the new
 * allocation - confirm against the MatX documentation.
 *
 * @tparam TensorType MatX tensor type (must satisfy is_tensor_view_v)
 * @param tensor Tensor to populate
 * @param allocator Holoscan allocator instance
 * @param shape Shape of tensor as C array, one extent per tensor dimension
 * @param memory_type Memory storage type (default: device memory)
 * @param stream CUDA stream, forwarded to the allocating overload (defaults to 0)
 */
 template <typename TensorType,
          std::enable_if_t<is_tensor_view_v<TensorType>, bool> = true>
 void make_tensor(TensorType& tensor,
                 std::shared_ptr<holoscan::Allocator> allocator,
                 const index_t (&shape)[TensorType::Rank()],
                 holoscan::MemoryStorageType memory_type = holoscan::MemoryStorageType::kDevice,
                 cudaStream_t stream = 0) {
  using element_t = typename TensorType::value_type;
  constexpr int rank = TensorType::Rank();

  auto fresh = make_tensor<element_t, rank>(allocator, shape, memory_type, stream);
  tensor.Shallow(fresh);
 }

 /**
 * @brief Create a 0D (scalar) MatX tensor using a Holoscan allocator
 * 
 * @tparam T Element type of the tensor
 * @param allocator Holoscan allocator instance
 * @param memory_type Memory storage type (default: device memory)
 * @param stream CUDA stream (optional, defaults to 0)
 * @return New 0D tensor allocated with the Holoscan allocator
 */
 template <typename T>
 auto make_tensor(std::shared_ptr<holoscan::Allocator> allocator,
                 holoscan::MemoryStorageType memory_type = holoscan::MemoryStorageType::kDevice,
                 cudaStream_t stream = 0) {
  // Create a scalar tensor (0-dimensional) directly without using std::array
  DefaultDescriptor<0> desc{};
  
  // Calculate total size needed (just one element)
  size_t size = sizeof(T);
  
  // Create Holoscan pointer buffer that handles allocation/deallocation
  holoscan::holoscan_pointer_buffer<T> buffer(allocator, size, memory_type);
  
  // Create basic_storage with the Holoscan buffer
  basic_storage<holoscan::holoscan_pointer_buffer<T>> storage{std::move(buffer)};
  
  // Create tensor with the storage
  return tensor_t<T, 0, decltype(storage), decltype(desc)>{std::move(storage), std::move(desc)};
 }

 } // namespace matx
	#include "matx.h"
	#include "holoscan/holoscan.hpp"
	#include <memory>
	#include <type_traits>
	#include <stdexcept>

	namespace holoscan {

	/**
	* @brief Holoscan-based replacement for MatX's raw_pointer_buffer
	*
	* This class provides the same interface as MatX's raw_pointer_buffer but uses
	* Holoscan allocators for memory management. It integrates seamlessly with
	* MatX's basic_storage system.
	*/
	template <typename T>
	class holoscan_pointer_buffer {
	public:
	using value_type = T;
	using iterator = T*;
	using citerator = T const*;

	/**
	* @brief Default constructor
	*/
	holoscan_pointer_buffer() = default;

	/**
	* @brief Construct and allocate memory using Holoscan allocator
	*
	* @param allocator Holoscan allocator to use
	* @param size Size in bytes to allocate
	* @param memory_type Memory storage type
	*/
	holoscan_pointer_buffer(std::shared_ptr<Allocator> allocator,
	size_t size,
	MemoryStorageType memory_type = MemoryStorageType::kDevice)
	: allocator_(allocator), size_(size) {
	if (!allocator_) {
	throw std::invalid_argument("holoscan_pointer_buffer: allocator cannot be null");
	}

	nvidia::byte* raw_ptr = allocator_->allocate(size, memory_type);
	if (!raw_ptr) {
	throw std::bad_alloc();
	}

	T* ptr = reinterpret_cast<T*>(raw_ptr);
	ConfigureShared(ptr);
	}

	/**
	* @brief Copy constructor
	*/
	holoscan_pointer_buffer(const holoscan_pointer_buffer& other) = default;

	/**
	* @brief Move constructor
	*/
	holoscan_pointer_buffer(holoscan_pointer_buffer&& other) noexcept = default;

	/**
	* @brief Assignment operator
	*/
	holoscan_pointer_buffer& operator=(const holoscan_pointer_buffer& other) = default;

	/**
	* @brief Move assignment operator
	*/
	holoscan_pointer_buffer& operator=(holoscan_pointer_buffer&& other) noexcept = default;

	/**
	* @brief Get raw data pointer
	*/
	__MATX_INLINE__ __MATX_HOST__ T* Data() const noexcept {
	return data_.get();
	}

	/**
	* @brief Get raw data pointer (lowercase for basic_storage compatibility)
	*/
	__MATX_INLINE__ __MATX_HOST__ T* data() noexcept {
	return data_.get();
	}

	/**
	* @brief Get raw data pointer (const version for basic_storage compatibility)
	*/
	__MATX_INLINE__ __MATX_HOST__ const T* data() const noexcept {
	return data_.get();
	}

	/**
	* @brief Get size in elements
	*/
	__MATX_INLINE__ __MATX_HOST__ auto Size() const noexcept {
	return size_ / sizeof(T);
	}

	/**
	* @brief Get size in elements (lowercase for basic_storage compatibility)
	*/
	__MATX_INLINE__ __MATX_HOST__ auto size() const noexcept {
	return size_ / sizeof(T);
	}

	/**
	* @brief Get capacity in elements (for basic_storage compatibility)
	*/
	__MATX_INLINE__ __MATX_HOST__ auto capacity() const noexcept {
	return size_ / sizeof(T);
	}

	/**
	* @brief Get size in bytes
	*/
	__MATX_INLINE__ __MATX_HOST__ auto Bytes() const noexcept {
	return size_;
	}

	/**
	* @brief Get iterator to beginning
	*/
	__MATX_INLINE__ __MATX_HOST__ __MATX_DEVICE__ iterator begin() const noexcept {
	return data_.get();
	}

	/**
	* @brief Get iterator to end
	*/
	__MATX_INLINE__ __MATX_HOST__ __MATX_DEVICE__ iterator end() const noexcept {
	return data_.get() + Size();
	}

	/**
	* @brief Get const iterator to beginning
	*/
	__MATX_INLINE__ __MATX_HOST__ __MATX_DEVICE__ citerator cbegin() const noexcept {
	return data_.get();
	}

	/**
	* @brief Get const iterator to end
	*/
	__MATX_INLINE__ __MATX_HOST__ __MATX_DEVICE__ citerator cend() const noexcept {
	return data_.get() + Size();
	}

	/**
	* @brief Get reference count
	*/
	__MATX_INLINE__ __MATX_HOST__ auto use_count() const noexcept {
	return data_.use_count();
	}

	/**
	* @brief Swap with another buffer
	*/
	friend void swap(holoscan_pointer_buffer& lhs, holoscan_pointer_buffer& rhs) noexcept {
	std::swap(lhs.allocator_, rhs.allocator_);
	std::swap(lhs.data_, rhs.data_);
	std::swap(lhs.size_, rhs.size_);
	}

	private:
	std::shared_ptr<Allocator> allocator_;
	std::shared_ptr<T> data_;
	size_t size_ = 0;

	void ConfigureShared(T* ptr) {
	// Always use Holoscan allocator for deallocation since we always own the memory
	data_ = std::shared_ptr<T>(ptr, [allocator = allocator_](auto p) {
	if (allocator && p) {
	allocator->free(reinterpret_cast<nvidia::byte*>(p));
	}
	});
	}
	};

	} // namespace holoscan

	namespace matx {

	/**
	* @brief Create a MatX tensor using a Holoscan allocator with C array shape
	*
	* This creates a tensor that uses the Holoscan allocator for memory management.
	* The tensor will properly clean up memory when it goes out of scope.
	*
	* @tparam T Element type of the tensor
	* @tparam RANK Number of dimensions
	* @param allocator Holoscan allocator instance
	* @param shape Shape of tensor as C array
	* @param memory_type Memory storage type (default: device memory)
	* @param stream CUDA stream (optional, defaults to 0)
	* @return New tensor allocated with the Holoscan allocator
	*/
	template <typename T, int RANK>
	auto make_tensor(std::shared_ptr<holoscan::Allocator> allocator,
	const index_t (&shape)[RANK],
	holoscan::MemoryStorageType memory_type = holoscan::MemoryStorageType::kDevice,
	cudaStream_t stream = 0) {
	// Create descriptor for the tensor shape
	DefaultDescriptor<RANK> desc{shape};

	// Calculate total size needed
	size_t size = static_cast<size_t>(desc.TotalSize()) * sizeof(T);

	// Create Holoscan pointer buffer that handles allocation/deallocation
	holoscan::holoscan_pointer_buffer<T> buffer(allocator, size, memory_type);

	// Create basic_storage with the Holoscan buffer
	basic_storage<holoscan::holoscan_pointer_buffer<T>> storage{std::move(buffer)};

	// Create tensor with the storage
	return tensor_t<T, RANK, decltype(storage), decltype(desc)>{std::move(storage), std::move(desc)};
	}

	/**
	* @brief Create a MatX tensor using a Holoscan allocator with container shape
	*
	* @tparam T Element type of the tensor
	* @tparam ShapeType Container type for shape (e.g., std::array, std::vector)
	* @param allocator Holoscan allocator instance
	* @param shape Shape of tensor as container
	* @param memory_type Memory storage type (default: device memory)
	* @param stream CUDA stream (optional, defaults to 0)
	* @return New tensor allocated with the Holoscan allocator
	*/
	template <typename T, typename ShapeType,
	std::enable_if_t<!std::is_array_v<std::remove_cv_t<std::remove_reference_t<ShapeType>>>, bool> = true>
	auto make_tensor(std::shared_ptr<holoscan::Allocator> allocator,
	ShapeType&& shape,
	holoscan::MemoryStorageType memory_type = holoscan::MemoryStorageType::kDevice,
	cudaStream_t stream = 0) {
	constexpr int rank = static_cast<int>(std::tuple_size_v<std::remove_cv_t<std::remove_reference_t<ShapeType>>>);
	DefaultDescriptor<rank> desc{std::forward<ShapeType>(shape)};

	// Calculate total size needed
	size_t size = static_cast<size_t>(desc.TotalSize()) * sizeof(T);

	// Create Holoscan pointer buffer that handles allocation/deallocation
	holoscan::holoscan_pointer_buffer<T> buffer(allocator, size, memory_type);

	// Create basic_storage with the Holoscan buffer
	basic_storage<holoscan::holoscan_pointer_buffer<T>> storage{std::move(buffer)};

	// Create tensor with the storage
	return tensor_t<T, rank, decltype(storage), decltype(desc)>{std::move(storage), std::move(desc)};
	}

	/**
	 * @brief Allocate Holoscan-backed storage for an already declared tensor
	 *
	 * Builds a fresh tensor of TensorType's value type and rank through the
	 * C-array make_tensor overload, then passes it to tensor.Shallow().
	 * NOTE(review): presumably Shallow() makes @p tensor refer to the new
	 * allocation - confirm against the MatX documentation.
	 *
	 * @tparam TensorType MatX tensor type (must satisfy is_tensor_view_v)
	 * @param tensor Tensor to populate
	 * @param allocator Holoscan allocator instance
	 * @param shape Shape of tensor as C array, one extent per tensor dimension
	 * @param memory_type Memory storage type (default: device memory)
	 * @param stream CUDA stream, forwarded to the allocating overload (defaults to 0)
	 */
	template <typename TensorType,
	          std::enable_if_t<is_tensor_view_v<TensorType>, bool> = true>
	void make_tensor(TensorType& tensor,
	                 std::shared_ptr<holoscan::Allocator> allocator,
	                 const index_t (&shape)[TensorType::Rank()],
	                 holoscan::MemoryStorageType memory_type = holoscan::MemoryStorageType::kDevice,
	                 cudaStream_t stream = 0) {
	  using element_t = typename TensorType::value_type;
	  constexpr int rank = TensorType::Rank();

	  auto fresh = make_tensor<element_t, rank>(allocator, shape, memory_type, stream);
	  tensor.Shallow(fresh);
	}

	/**
	* @brief Create a 0D (scalar) MatX tensor using a Holoscan allocator
	*
	* @tparam T Element type of the tensor
	* @param allocator Holoscan allocator instance
	* @param memory_type Memory storage type (default: device memory)
	* @param stream CUDA stream (optional, defaults to 0)
	* @return New 0D tensor allocated with the Holoscan allocator
	*/
	template <typename T>
	auto make_tensor(std::shared_ptr<holoscan::Allocator> allocator,
	holoscan::MemoryStorageType memory_type = holoscan::MemoryStorageType::kDevice,
	cudaStream_t stream = 0) {
	// Create a scalar tensor (0-dimensional) directly without using std::array
	DefaultDescriptor<0> desc{};

	// Calculate total size needed (just one element)
	size_t size = sizeof(T);

	// Create Holoscan pointer buffer that handles allocation/deallocation
	holoscan::holoscan_pointer_buffer<T> buffer(allocator, size, memory_type);

	// Create basic_storage with the Holoscan buffer
	basic_storage<holoscan::holoscan_pointer_buffer<T>> storage{std::move(buffer)};

	// Create tensor with the storage
	return tensor_t<T, 0, decltype(storage), decltype(desc)>{std::move(storage), std::move(desc)};
	}

	} // namespace matx