jszuppe · March 16, 2016 17:48
diff --git a/reduce.hpp b/reduce.hpp
 // Device-specific helpers for the reduce() sketches that follow.
 // Both bodies are elided ("...") in this sketch, so only the signatures
 // are fixed here.
 namespace detail {

 // GPU flavour of reduce. Takes the same five arguments as the public
 // reduce() sketches (the first/last input iterators, the output iterator,
 // the binary function and the command queue), but the queue has no default.
 // NOTE(review): presumably the target of the "dispatch a GPU algorithm"
 // branch of the device-iterator reduce() -- body not shown, confirm.
 template<class InputIterator, class OutputIterator, class BinaryFunction>
 inline void dispatch_gpu_reduce(InputIterator first,
                                 InputIterator last,
                                 OutputIterator result,
                                 BinaryFunction function,
                                 command_queue &queue)
 {
 ... 
 }

 // CPU flavour of reduce, with the same signature as dispatch_gpu_reduce.
 // NOTE(review): presumably the target of the "dispatch a CPU algorithm"
 // branch of the device-iterator reduce() -- body not shown, confirm.
 template<class InputIterator, class OutputIterator, class BinaryFunction>
 inline void dispatch_cpu_reduce(InputIterator first,
                                 InputIterator last,
                                 OutputIterator result,
                                 BinaryFunction function,
                                 command_queue &queue)
 {
 ... 
 }

 } // detail namespace

 // reduce() sketch for the case where InputIterator is a HOST iterator
 // (the input will need to be copied before a device algorithm can use it).
 // This is pseudo-code: the condition is a prose placeholder, not C++.
 //
 // queue defaults to system::default_queue().
 //
 // NOTE(review): this sketch has exactly the same template signature as the
 // device-iterator reduce() sketch; how the two are told apart is not shown
 // here and has to be decided before this can compile.
 template<class InputIterator, class OutputIterator, class BinaryFunction>
 inline void reduce(InputIterator first,
                    InputIterator last,
                    OutputIterator result,
                    BinaryFunction function,
                    command_queue &queue = system::default_queue())
 {
    if(small number of elements) { // An option to always force copying and use the OpenCL-based algorithm may be added
        // run the STL algorithm (why not?) -- this path does no copy
        // if there's no STL equivalent, implement it in C++ or run a serial version
    }
    else {
        // copy the input (presumably host -> device; TODO confirm)
        // run device algorithm (below)
    }
 }

 // reduce() sketch for the case where InputIterator is a device iterator.
 // This is pseudo-code: the conditions are prose placeholders, not C++.
 //
 // Strategy, in order:
 //   1. small input  -> serial algorithm, then stop;
 //   2. CPU device   -> CPU algorithm (presumably detail::dispatch_cpu_reduce);
 //   3. GPU device   -> GPU algorithm (presumably detail::dispatch_gpu_reduce);
 //   4. anything else (unknown accelerator) -> serial fallback.
 //
 // queue defaults to system::default_queue().
 //
 // NOTE(review): this sketch has exactly the same template signature as the
 // host-iterator reduce() sketch; how the two are told apart is not shown
 // here and has to be decided before this can compile.
 template<class InputIterator, class OutputIterator, class BinaryFunction>
 inline void reduce(InputIterator first,
                    InputIterator last,
                    OutputIterator result,
                    BinaryFunction function,
                    command_queue &queue = system::default_queue())
 {
    if(small number of elements in the input vector) {
        // run serial algorithm
        return; // done: without this the device dispatch would run as well
    }

    if(device is a CPU){
        // dispatch a CPU algorithm
    }
    else if (device is a GPU) {
        // dispatch a GPU algorithm
    }
    // some kind of accelerator, we don't know
    else {
        // run serial algorithm (at least it'll work)
        // for SVM memory we can just map the memory and use STL algorithm
    }
 }
	// Device-specific helpers for the reduce() sketches that follow.
	// Both bodies are elided ("...") in this sketch, so only the signatures
	// are fixed here.
	namespace detail {

	// GPU flavour of reduce. Takes the same five arguments as the public
	// reduce() sketches (the first/last input iterators, the output iterator,
	// the binary function and the command queue), but the queue has no default.
	// NOTE(review): presumably the target of the "dispatch a GPU algorithm"
	// branch of the device-iterator reduce() -- body not shown, confirm.
	template<class InputIterator, class OutputIterator, class BinaryFunction>
	inline void dispatch_gpu_reduce(InputIterator first,
	InputIterator last,
	OutputIterator result,
	BinaryFunction function,
	command_queue &queue)
	{
	...
	}

	// CPU flavour of reduce, with the same signature as dispatch_gpu_reduce.
	// NOTE(review): presumably the target of the "dispatch a CPU algorithm"
	// branch of the device-iterator reduce() -- body not shown, confirm.
	template<class InputIterator, class OutputIterator, class BinaryFunction>
	inline void dispatch_cpu_reduce(InputIterator first,
	InputIterator last,
	OutputIterator result,
	BinaryFunction function,
	command_queue &queue)
	{
	...
	}

	} // detail namespace

	// reduce() sketch for the case where InputIterator is a HOST iterator
	// (the input will need to be copied before a device algorithm can use it).
	// This is pseudo-code: the condition is a prose placeholder, not C++.
	//
	// queue defaults to system::default_queue().
	//
	// NOTE(review): this sketch has exactly the same template signature as the
	// device-iterator reduce() sketch; how the two are told apart is not shown
	// here and has to be decided before this can compile.
	template<class InputIterator, class OutputIterator, class BinaryFunction>
	inline void reduce(InputIterator first,
	InputIterator last,
	OutputIterator result,
	BinaryFunction function,
	command_queue &queue = system::default_queue())
	{
	if(small number of elements) { // An option to always force copying and use the OpenCL-based algorithm may be added
	// run the STL algorithm (why not?) -- this path does no copy
	// if there's no STL equivalent, implement it in C++ or run a serial version
	}
	else {
	// copy the input (presumably host -> device; TODO confirm)
	// run device algorithm (below)
	}
	}

	// reduce() sketch for the case where InputIterator is a device iterator.
	// This is pseudo-code: the conditions are prose placeholders, not C++.
	//
	// Strategy, in order:
	//   1. small input  -> serial algorithm, then stop;
	//   2. CPU device   -> CPU algorithm (presumably detail::dispatch_cpu_reduce);
	//   3. GPU device   -> GPU algorithm (presumably detail::dispatch_gpu_reduce);
	//   4. anything else (unknown accelerator) -> serial fallback.
	//
	// queue defaults to system::default_queue().
	//
	// NOTE(review): this sketch has exactly the same template signature as the
	// host-iterator reduce() sketch; how the two are told apart is not shown
	// here and has to be decided before this can compile.
	template<class InputIterator, class OutputIterator, class BinaryFunction>
	inline void reduce(InputIterator first,
			InputIterator last,
			OutputIterator result,
			BinaryFunction function,
			command_queue &queue = system::default_queue())
	{
		if(small number of elements in the input vector) {
			// run serial algorithm
			return; // done: without this the device dispatch would run as well
		}

		if(device is a CPU){
			// dispatch a CPU algorithm
		}
		else if (device is a GPU) {
			// dispatch a GPU algorithm
		}
		// some kind of accelerator, we don't know
		else {
			// run serial algorithm (at least it'll work)
			// for SVM memory we can just map the memory and use STL algorithm
		}
	}