chenrui333 · April 15, 2026 15:57
diff --git a/llama-pr21869-b8790-backport.patch b/llama-pr21869-b8790-backport.patch
 diff --git a/ggml/include/ggml-backend.h b/ggml/include/ggml-backend.h
 index 3c06aeaffb1fc2324eab663e45bfb223bf7f7433..4a8f6d4287da040efef27e8b394e0a48502db2f8 100644
 --- a/ggml/include/ggml-backend.h
 +++ b/ggml/include/ggml-backend.h
 @@ -348,6 +348,53 @@ extern "C" {
     // Set a callback to be called for each resulting node during graph compute
     GGML_API void                 ggml_backend_sched_set_eval_callback(ggml_backend_sched_t sched, ggml_backend_sched_eval_callback callback, void * user_data);
 
 +    //
 +    // Meta backend
 +    //
 +
 +#define GGML_BACKEND_META_MAX_DEVICES 16
 +
 +    enum ggml_backend_meta_split_axis {
 +        // tensor split by tensor dimensions:
 +        GGML_BACKEND_SPLIT_AXIS_0 = 0,
 +        GGML_BACKEND_SPLIT_AXIS_1 = 1,
 +        GGML_BACKEND_SPLIT_AXIS_2 = 2,
 +        GGML_BACKEND_SPLIT_AXIS_3 = 3,
 +
 +        GGML_BACKEND_SPLIT_AXIS_MIRRORED = 10, // all values on all backends
 +        GGML_BACKEND_SPLIT_AXIS_PARTIAL  = 11, // each backend has a partial sum
 +
 +        // for internal bookkeeping only:
 +        GGML_BACKEND_SPLIT_AXIS_NONE    = 98,
 +        GGML_BACKEND_SPLIT_AXIS_UNKNOWN = 99,
 +    };
 +    GGML_API const char * ggml_backend_meta_split_axis_name(enum ggml_backend_meta_split_axis split_axis);
 +
 +    struct ggml_backend_meta_split_state {
 +        enum ggml_backend_meta_split_axis axis;
 +
 +        // for tensors with axis >= 0 && axis < GGML_MAX_DIMS:
 +        //   - each device has a slice of the tensor along the split axis
 +        //   - most tensors have n_segments == 1 and a contiguous slice of the tensor data
 +        //   - some tensors have an inhomogenenous data layout along the split axis,
 +        //     those tensors are divided into segments which are each individually split across devices
 +        //   - ne has one entry per segment and device that add up to ggml_tensor::ne for that axis,
 +        //     the outer/inner loops are over segments/devices like [seg0_dev0, seg0_dev1, seg1_dev0, seg1_dev1],
 +        //   - for example, a transformer may have a fused QKV matrix rather than 3 matrices, those would be 3 separate segments
 +        //     that each need to be split individually across devices so that each device gets a slice of Q, K, and V
 +        int64_t  ne[16*GGML_BACKEND_META_MAX_DEVICES];
 +        uint32_t n_segments;
 +    };
 +
 +    // function to assign split states for statically allocated tensors, compute tensor split states will be assigned to be compatible:
 +    typedef struct ggml_backend_meta_split_state(*ggml_backend_meta_get_split_state_t)(const struct ggml_tensor * tensor, void * userdata);
 +
 +    // create a new meta device from "simple" devices, meta buffer type/buffer/backend is then derived from this:
 +    // TODO: this looks a bit strange - a backend API creates a device. I think we should try
 +    //       express this as a backend registry functionality instead
 +    GGML_API ggml_backend_dev_t ggml_backend_meta_device(
 +        ggml_backend_dev_t * devs, size_t n_devs, ggml_backend_meta_get_split_state_t get_split_state, void * get_split_state_ud);
 +
     //
     // Utils
     //
 diff --git a/ggml/src/ggml-alloc.c b/ggml/src/ggml-alloc.c
 index e9b70398ffc55812be95a816ebbaa6abf518b6b6..a4b01ccf8a16a9de98335b698f92203404d454ac 100644
 --- a/ggml/src/ggml-alloc.c
 +++ b/ggml/src/ggml-alloc.c
 @@ -2,6 +2,7 @@
 #include "ggml-backend-impl.h"
 #include "ggml.h"
 #include "ggml-impl.h"
 +
 #include <assert.h>
 #include <limits.h>
 #include <stdarg.h>
 diff --git a/ggml/src/ggml-backend-meta.cpp b/ggml/src/ggml-backend-meta.cpp
 index a2ab8872c4a1cbca831276b15f7bbe5f2b4576be..0a8eea4e9450b28aa6396fd5149d00aeb926e679 100644
 --- a/ggml/src/ggml-backend-meta.cpp
 +++ b/ggml/src/ggml-backend-meta.cpp
 @@ -5,9 +5,6 @@
 #include "ggml-alloc.h"
 #include "ggml-cpp.h"
 
 -// TODO: tmp
 -#include "ggml-ext.h"
 -
 #include <algorithm>
 #include <cassert>
 #include <cmath>
 diff --git a/ggml/src/ggml-ext.h b/ggml/src/ggml-ext.h
 deleted file mode 100644
 index 56b0e6d314efd87601fd8bd35b991ce72334ded1..0000000000000000000000000000000000000000
 --- a/ggml/src/ggml-ext.h
 +++ /dev/null
 @@ -1,56 +0,0 @@
 -#pragma once
 -
 -#include "ggml.h"
 -#include "ggml-backend.h"
 -
 -// This is a "staging" header for new ggml API
 -// It is not publicly available and it should not be used by 3rd party projects
 -//
 -// When the API matures enough, it will be moved to the official public API
 -
 -//
 -// Meta backend
 -//
 -
 -#define GGML_BACKEND_META_MAX_DEVICES 16
 -
 -enum ggml_backend_meta_split_axis {
 -    // tensor split by tensor dimensions:
 -    GGML_BACKEND_SPLIT_AXIS_0   =  0,
 -    GGML_BACKEND_SPLIT_AXIS_1   =  1,
 -    GGML_BACKEND_SPLIT_AXIS_2   =  2,
 -    GGML_BACKEND_SPLIT_AXIS_3   =  3,
 -
 -    GGML_BACKEND_SPLIT_AXIS_MIRRORED = 10, // all values on all backends
 -    GGML_BACKEND_SPLIT_AXIS_PARTIAL  = 11, // each backend has a partial sum
 -
 -    // for internal bookkeeping only:
 -    GGML_BACKEND_SPLIT_AXIS_NONE     = 98,
 -    GGML_BACKEND_SPLIT_AXIS_UNKNOWN  = 99,
 -};
 -GGML_API const char * ggml_backend_meta_split_axis_name(enum ggml_backend_meta_split_axis split_axis);
 -
 -struct ggml_backend_meta_split_state {
 -    enum ggml_backend_meta_split_axis axis;
 -
 -    // for tensors with axis >= 0 && axis < GGML_MAX_DIMS:
 -    //   - each device has a slice of the tensor along the split axis
 -    //   - most tensors have n_segments == 1 and a contiguous slice of the tensor data
 -    //   - some tensors have an inhomogenenous data layout along the split axis,
 -    //     those tensors are divided into segments which are each individually split across devices
 -    //   - ne has one entry per segment and device that add up to ggml_tensor::ne for that axis,
 -    //     the outer/inner loops are over segments/devices like [seg0_dev0, seg0_dev1, seg1_dev0, seg1_dev1],
 -    //   - for example, a transformer may have a fused QKV matrix rather than 3 matrices, those would be 3 separate segments
 -    //     that each need to be split individually across devices so that each device gets a slice of Q, K, and V
 -    int64_t  ne[16*GGML_BACKEND_META_MAX_DEVICES];
 -    uint32_t n_segments;
 -};
 -
 -// function to assign split states for statically allocated tensors, compute tensor split states will be assigned to be compatible:
 -typedef struct ggml_backend_meta_split_state(*ggml_backend_meta_get_split_state_t)(const struct ggml_tensor * tensor, void * userdata);
 -
 -// create a new meta device from "simple" devices, meta buffer type/buffer/backend is then derived from this:
 -// TODO: this looks a bit strange - a backend API creates a device. I think we should try
 -//       express this as a backend registry functionality instead
 -GGML_API ggml_backend_dev_t ggml_backend_meta_device(
 -    ggml_backend_dev_t * devs, size_t n_devs, ggml_backend_meta_get_split_state_t get_split_state, void * get_split_state_ud);
 diff --git a/src/llama-model.cpp b/src/llama-model.cpp
 index d2ffc1f45f414670e0b23b27cada2b26238e46d4..b265394ef7366202d19df59d63a5713fb70e3f6f 100644
 --- a/src/llama-model.cpp
 +++ b/src/llama-model.cpp
 @@ -18,9 +18,6 @@
 #include "ggml.h"
 #include "ggml-cpp.h"
 
 -// TODO: tmp until the ggml meta backend matures and becomes public
 -#include "../src/ggml-ext.h"
 -
 #include <algorithm>
 #include <cassert>
 #include <cfloat>
 diff --git a/src/llama.cpp b/src/llama.cpp
 index ce575246714f0f3cce06ec7931e02ca6ad6f68c7..484372d8d106fedbd3f4a29abe87ebb4f02c9d4a 100644
 --- a/src/llama.cpp
 +++ b/src/llama.cpp
 @@ -15,9 +15,6 @@
 #include "ggml-backend.h"
 #include "gguf.h"
 
 -// TODO: tmp until the ggml meta backend matures and becomes public
 -#include "../src/ggml-ext.h"
 -
 #include <algorithm>
 #include <cassert>
 #include <cinttypes>
	diff --git a/ggml/include/ggml-backend.h b/ggml/include/ggml-backend.h
	index 3c06aeaffb1fc2324eab663e45bfb223bf7f7433..4a8f6d4287da040efef27e8b394e0a48502db2f8 100644
	--- a/ggml/include/ggml-backend.h
	+++ b/ggml/include/ggml-backend.h
	@@ -348,6 +348,53 @@ extern "C" {
	// Set a callback to be called for each resulting node during graph compute
	GGML_API void ggml_backend_sched_set_eval_callback(ggml_backend_sched_t sched, ggml_backend_sched_eval_callback callback, void * user_data);

	+ //
	+ // Meta backend
	+ //
	+
	+#define GGML_BACKEND_META_MAX_DEVICES 16
	+
	+ enum ggml_backend_meta_split_axis {
	+ // tensor split by tensor dimensions:
	+ GGML_BACKEND_SPLIT_AXIS_0 = 0,
	+ GGML_BACKEND_SPLIT_AXIS_1 = 1,
	+ GGML_BACKEND_SPLIT_AXIS_2 = 2,
	+ GGML_BACKEND_SPLIT_AXIS_3 = 3,
	+
	+ GGML_BACKEND_SPLIT_AXIS_MIRRORED = 10, // all values on all backends
	+ GGML_BACKEND_SPLIT_AXIS_PARTIAL = 11, // each backend has a partial sum
	+
	+ // for internal bookkeeping only:
	+ GGML_BACKEND_SPLIT_AXIS_NONE = 98,
	+ GGML_BACKEND_SPLIT_AXIS_UNKNOWN = 99,
	+ };
	+ GGML_API const char * ggml_backend_meta_split_axis_name(enum ggml_backend_meta_split_axis split_axis);
	+
	+ struct ggml_backend_meta_split_state {
	+ enum ggml_backend_meta_split_axis axis;
	+
	+ // for tensors with axis >= 0 && axis < GGML_MAX_DIMS:
	+ // - each device has a slice of the tensor along the split axis
	+ // - most tensors have n_segments == 1 and a contiguous slice of the tensor data
	+ // - some tensors have an inhomogenenous data layout along the split axis,
	+ // those tensors are divided into segments which are each individually split across devices
	+ // - ne has one entry per segment and device that add up to ggml_tensor::ne for that axis,
	+ // the outer/inner loops are over segments/devices like [seg0_dev0, seg0_dev1, seg1_dev0, seg1_dev1],
	+ // - for example, a transformer may have a fused QKV matrix rather than 3 matrices, those would be 3 separate segments
	+ // that each need to be split individually across devices so that each device gets a slice of Q, K, and V
	+ int64_t ne[16*GGML_BACKEND_META_MAX_DEVICES];
	+ uint32_t n_segments;
	+ };
	+
	+ // function to assign split states for statically allocated tensors, compute tensor split states will be assigned to be compatible:
	+ typedef struct ggml_backend_meta_split_state(*ggml_backend_meta_get_split_state_t)(const struct ggml_tensor * tensor, void * userdata);
	+
	+ // create a new meta device from "simple" devices, meta buffer type/buffer/backend is then derived from this:
	+ // TODO: this looks a bit strange - a backend API creates a device. I think we should try
	+ // express this as a backend registry functionality instead
	+ GGML_API ggml_backend_dev_t ggml_backend_meta_device(
	+ ggml_backend_dev_t * devs, size_t n_devs, ggml_backend_meta_get_split_state_t get_split_state, void * get_split_state_ud);
	+
	//
	// Utils
	//
	diff --git a/ggml/src/ggml-alloc.c b/ggml/src/ggml-alloc.c
	index e9b70398ffc55812be95a816ebbaa6abf518b6b6..a4b01ccf8a16a9de98335b698f92203404d454ac 100644
	--- a/ggml/src/ggml-alloc.c
	+++ b/ggml/src/ggml-alloc.c
	@@ -2,6 +2,7 @@
	#include "ggml-backend-impl.h"
	#include "ggml.h"
	#include "ggml-impl.h"
	+
	#include <assert.h>
	#include <limits.h>
	#include <stdarg.h>
	diff --git a/ggml/src/ggml-backend-meta.cpp b/ggml/src/ggml-backend-meta.cpp
	index a2ab8872c4a1cbca831276b15f7bbe5f2b4576be..0a8eea4e9450b28aa6396fd5149d00aeb926e679 100644
	--- a/ggml/src/ggml-backend-meta.cpp
	+++ b/ggml/src/ggml-backend-meta.cpp
	@@ -5,9 +5,6 @@
	#include "ggml-alloc.h"
	#include "ggml-cpp.h"

	-// TODO: tmp
	-#include "ggml-ext.h"
	-
	#include <algorithm>
	#include <cassert>
	#include <cmath>
	diff --git a/ggml/src/ggml-ext.h b/ggml/src/ggml-ext.h
	deleted file mode 100644
	index 56b0e6d314efd87601fd8bd35b991ce72334ded1..0000000000000000000000000000000000000000
	--- a/ggml/src/ggml-ext.h
	+++ /dev/null
	@@ -1,56 +0,0 @@
	-#pragma once
	-
	-#include "ggml.h"
	-#include "ggml-backend.h"
	-
	-// This is a "staging" header for new ggml API
	-// It is not publicly available and it should not be used by 3rd party projects
	-//
	-// When the API matures enough, it will be moved to the official public API
	-
	-//
	-// Meta backend
	-//
	-
	-#define GGML_BACKEND_META_MAX_DEVICES 16
	-
	-enum ggml_backend_meta_split_axis {
	- // tensor split by tensor dimensions:
	- GGML_BACKEND_SPLIT_AXIS_0 = 0,
	- GGML_BACKEND_SPLIT_AXIS_1 = 1,
	- GGML_BACKEND_SPLIT_AXIS_2 = 2,
	- GGML_BACKEND_SPLIT_AXIS_3 = 3,
	-
	- GGML_BACKEND_SPLIT_AXIS_MIRRORED = 10, // all values on all backends
	- GGML_BACKEND_SPLIT_AXIS_PARTIAL = 11, // each backend has a partial sum
	-
	- // for internal bookkeeping only:
	- GGML_BACKEND_SPLIT_AXIS_NONE = 98,
	- GGML_BACKEND_SPLIT_AXIS_UNKNOWN = 99,
	-};
	-GGML_API const char * ggml_backend_meta_split_axis_name(enum ggml_backend_meta_split_axis split_axis);
	-
	-struct ggml_backend_meta_split_state {
	- enum ggml_backend_meta_split_axis axis;
	-
	- // for tensors with axis >= 0 && axis < GGML_MAX_DIMS:
	- // - each device has a slice of the tensor along the split axis
	- // - most tensors have n_segments == 1 and a contiguous slice of the tensor data
	- // - some tensors have an inhomogenenous data layout along the split axis,
	- // those tensors are divided into segments which are each individually split across devices
	- // - ne has one entry per segment and device that add up to ggml_tensor::ne for that axis,
	- // the outer/inner loops are over segments/devices like [seg0_dev0, seg0_dev1, seg1_dev0, seg1_dev1],
	- // - for example, a transformer may have a fused QKV matrix rather than 3 matrices, those would be 3 separate segments
	- // that each need to be split individually across devices so that each device gets a slice of Q, K, and V
	- int64_t ne[16*GGML_BACKEND_META_MAX_DEVICES];
	- uint32_t n_segments;
	-};
	-
	-// function to assign split states for statically allocated tensors, compute tensor split states will be assigned to be compatible:
	-typedef struct ggml_backend_meta_split_state(*ggml_backend_meta_get_split_state_t)(const struct ggml_tensor * tensor, void * userdata);
	-
	-// create a new meta device from "simple" devices, meta buffer type/buffer/backend is then derived from this:
	-// TODO: this looks a bit strange - a backend API creates a device. I think we should try
	-// express this as a backend registry functionality instead
	-GGML_API ggml_backend_dev_t ggml_backend_meta_device(
	- ggml_backend_dev_t * devs, size_t n_devs, ggml_backend_meta_get_split_state_t get_split_state, void * get_split_state_ud);
	diff --git a/src/llama-model.cpp b/src/llama-model.cpp
	index d2ffc1f45f414670e0b23b27cada2b26238e46d4..b265394ef7366202d19df59d63a5713fb70e3f6f 100644
	--- a/src/llama-model.cpp
	+++ b/src/llama-model.cpp
	@@ -18,9 +18,6 @@
	#include "ggml.h"
	#include "ggml-cpp.h"

	-// TODO: tmp until the ggml meta backend matures and becomes public
	-#include "../src/ggml-ext.h"
	-
	#include <algorithm>
	#include <cassert>
	#include <cfloat>
	diff --git a/src/llama.cpp b/src/llama.cpp
	index ce575246714f0f3cce06ec7931e02ca6ad6f68c7..484372d8d106fedbd3f4a29abe87ebb4f02c9d4a 100644
	--- a/src/llama.cpp
	+++ b/src/llama.cpp
	@@ -15,9 +15,6 @@
	#include "ggml-backend.h"
	#include "gguf.h"

	-// TODO: tmp until the ggml meta backend matures and becomes public
	-#include "../src/ggml-ext.h"
	-
	#include <algorithm>
	#include <cassert>
	#include <cinttypes>
No results found