Created
April 15, 2026 15:57
-
-
Save chenrui333/ae4d6005cbdddb9e2ba47652cbaa3644 to your computer and use it in GitHub Desktop.
llama.cpp b8790 PR21869 backport patch
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| diff --git a/ggml/include/ggml-backend.h b/ggml/include/ggml-backend.h | |
| index 3c06aeaffb1fc2324eab663e45bfb223bf7f7433..4a8f6d4287da040efef27e8b394e0a48502db2f8 100644 | |
| --- a/ggml/include/ggml-backend.h | |
| +++ b/ggml/include/ggml-backend.h | |
| @@ -348,6 +348,53 @@ extern "C" { | |
| // Set a callback to be called for each resulting node during graph compute | |
| GGML_API void ggml_backend_sched_set_eval_callback(ggml_backend_sched_t sched, ggml_backend_sched_eval_callback callback, void * user_data); | |
| + // | |
| + // Meta backend | |
| + // | |
| + | |
| +#define GGML_BACKEND_META_MAX_DEVICES 16 | |
| + | |
| + enum ggml_backend_meta_split_axis { | |
| + // tensor split by tensor dimensions: | |
| + GGML_BACKEND_SPLIT_AXIS_0 = 0, | |
| + GGML_BACKEND_SPLIT_AXIS_1 = 1, | |
| + GGML_BACKEND_SPLIT_AXIS_2 = 2, | |
| + GGML_BACKEND_SPLIT_AXIS_3 = 3, | |
| + | |
| + GGML_BACKEND_SPLIT_AXIS_MIRRORED = 10, // all values on all backends | |
| + GGML_BACKEND_SPLIT_AXIS_PARTIAL = 11, // each backend has a partial sum | |
| + | |
| + // for internal bookkeeping only: | |
| + GGML_BACKEND_SPLIT_AXIS_NONE = 98, | |
| + GGML_BACKEND_SPLIT_AXIS_UNKNOWN = 99, | |
| + }; | |
| + GGML_API const char * ggml_backend_meta_split_axis_name(enum ggml_backend_meta_split_axis split_axis); | |
| + | |
| + struct ggml_backend_meta_split_state { | |
| + enum ggml_backend_meta_split_axis axis; | |
| + | |
| + // for tensors with axis >= 0 && axis < GGML_MAX_DIMS: | |
| + // - each device has a slice of the tensor along the split axis | |
| + // - most tensors have n_segments == 1 and a contiguous slice of the tensor data | |
| + // - some tensors have an inhomogeneous data layout along the split axis, | |
| + // those tensors are divided into segments which are each individually split across devices | |
| + // - ne has one entry per segment and device that add up to ggml_tensor::ne for that axis, | |
| + // the outer/inner loops are over segments/devices like [seg0_dev0, seg0_dev1, seg1_dev0, seg1_dev1], | |
| + // - for example, a transformer may have a fused QKV matrix rather than 3 matrices, those would be 3 separate segments | |
| + // that each need to be split individually across devices so that each device gets a slice of Q, K, and V | |
| + int64_t ne[16*GGML_BACKEND_META_MAX_DEVICES]; | |
| + uint32_t n_segments; | |
| + }; | |
| + | |
| + // function to assign split states for statically allocated tensors, compute tensor split states will be assigned to be compatible: | |
| + typedef struct ggml_backend_meta_split_state(*ggml_backend_meta_get_split_state_t)(const struct ggml_tensor * tensor, void * userdata); | |
| + | |
| + // create a new meta device from "simple" devices, meta buffer type/buffer/backend is then derived from this: | |
| + // TODO: this looks a bit strange - a backend API creates a device. I think we should try to | |
| + // express this as a backend registry functionality instead | |
| + GGML_API ggml_backend_dev_t ggml_backend_meta_device( | |
| + ggml_backend_dev_t * devs, size_t n_devs, ggml_backend_meta_get_split_state_t get_split_state, void * get_split_state_ud); | |
| + | |
| // | |
| // Utils | |
| // | |
| diff --git a/ggml/src/ggml-alloc.c b/ggml/src/ggml-alloc.c | |
| index e9b70398ffc55812be95a816ebbaa6abf518b6b6..a4b01ccf8a16a9de98335b698f92203404d454ac 100644 | |
| --- a/ggml/src/ggml-alloc.c | |
| +++ b/ggml/src/ggml-alloc.c | |
| @@ -2,6 +2,7 @@ | |
| #include "ggml-backend-impl.h" | |
| #include "ggml.h" | |
| #include "ggml-impl.h" | |
| + | |
| #include <assert.h> | |
| #include <limits.h> | |
| #include <stdarg.h> | |
| diff --git a/ggml/src/ggml-backend-meta.cpp b/ggml/src/ggml-backend-meta.cpp | |
| index a2ab8872c4a1cbca831276b15f7bbe5f2b4576be..0a8eea4e9450b28aa6396fd5149d00aeb926e679 100644 | |
| --- a/ggml/src/ggml-backend-meta.cpp | |
| +++ b/ggml/src/ggml-backend-meta.cpp | |
| @@ -5,9 +5,6 @@ | |
| #include "ggml-alloc.h" | |
| #include "ggml-cpp.h" | |
| -// TODO: tmp | |
| -#include "ggml-ext.h" | |
| - | |
| #include <algorithm> | |
| #include <cassert> | |
| #include <cmath> | |
| diff --git a/ggml/src/ggml-ext.h b/ggml/src/ggml-ext.h | |
| deleted file mode 100644 | |
| index 56b0e6d314efd87601fd8bd35b991ce72334ded1..0000000000000000000000000000000000000000 | |
| --- a/ggml/src/ggml-ext.h | |
| +++ /dev/null | |
| @@ -1,56 +0,0 @@ | |
| -#pragma once | |
| - | |
| -#include "ggml.h" | |
| -#include "ggml-backend.h" | |
| - | |
| -// This is a "staging" header for new ggml API | |
| -// It is not publicly available and it should not be used by 3rd party projects | |
| -// | |
| -// When the API matures enough, it will be moved to the official public API | |
| - | |
| -// | |
| -// Meta backend | |
| -// | |
| - | |
| -#define GGML_BACKEND_META_MAX_DEVICES 16 | |
| - | |
| -enum ggml_backend_meta_split_axis { | |
| - // tensor split by tensor dimensions: | |
| - GGML_BACKEND_SPLIT_AXIS_0 = 0, | |
| - GGML_BACKEND_SPLIT_AXIS_1 = 1, | |
| - GGML_BACKEND_SPLIT_AXIS_2 = 2, | |
| - GGML_BACKEND_SPLIT_AXIS_3 = 3, | |
| - | |
| - GGML_BACKEND_SPLIT_AXIS_MIRRORED = 10, // all values on all backends | |
| - GGML_BACKEND_SPLIT_AXIS_PARTIAL = 11, // each backend has a partial sum | |
| - | |
| - // for internal bookkeeping only: | |
| - GGML_BACKEND_SPLIT_AXIS_NONE = 98, | |
| - GGML_BACKEND_SPLIT_AXIS_UNKNOWN = 99, | |
| -}; | |
| -GGML_API const char * ggml_backend_meta_split_axis_name(enum ggml_backend_meta_split_axis split_axis); | |
| - | |
| -struct ggml_backend_meta_split_state { | |
| - enum ggml_backend_meta_split_axis axis; | |
| - | |
| - // for tensors with axis >= 0 && axis < GGML_MAX_DIMS: | |
| - // - each device has a slice of the tensor along the split axis | |
| - // - most tensors have n_segments == 1 and a contiguous slice of the tensor data | |
| - // - some tensors have an inhomogenenous data layout along the split axis, | |
| - // those tensors are divided into segments which are each individually split across devices | |
| - // - ne has one entry per segment and device that add up to ggml_tensor::ne for that axis, | |
| - // the outer/inner loops are over segments/devices like [seg0_dev0, seg0_dev1, seg1_dev0, seg1_dev1], | |
| - // - for example, a transformer may have a fused QKV matrix rather than 3 matrices, those would be 3 separate segments | |
| - // that each need to be split individually across devices so that each device gets a slice of Q, K, and V | |
| - int64_t ne[16*GGML_BACKEND_META_MAX_DEVICES]; | |
| - uint32_t n_segments; | |
| -}; | |
| - | |
| -// function to assign split states for statically allocated tensors, compute tensor split states will be assigned to be compatible: | |
| -typedef struct ggml_backend_meta_split_state(*ggml_backend_meta_get_split_state_t)(const struct ggml_tensor * tensor, void * userdata); | |
| - | |
| -// create a new meta device from "simple" devices, meta buffer type/buffer/backend is then derived from this: | |
| -// TODO: this looks a bit strange - a backend API creates a device. I think we should try | |
| -// express this as a backend registry functionality instead | |
| -GGML_API ggml_backend_dev_t ggml_backend_meta_device( | |
| - ggml_backend_dev_t * devs, size_t n_devs, ggml_backend_meta_get_split_state_t get_split_state, void * get_split_state_ud); | |
| diff --git a/src/llama-model.cpp b/src/llama-model.cpp | |
| index d2ffc1f45f414670e0b23b27cada2b26238e46d4..b265394ef7366202d19df59d63a5713fb70e3f6f 100644 | |
| --- a/src/llama-model.cpp | |
| +++ b/src/llama-model.cpp | |
| @@ -18,9 +18,6 @@ | |
| #include "ggml.h" | |
| #include "ggml-cpp.h" | |
| -// TODO: tmp until the ggml meta backend matures and becomes public | |
| -#include "../src/ggml-ext.h" | |
| - | |
| #include <algorithm> | |
| #include <cassert> | |
| #include <cfloat> | |
| diff --git a/src/llama.cpp b/src/llama.cpp | |
| index ce575246714f0f3cce06ec7931e02ca6ad6f68c7..484372d8d106fedbd3f4a29abe87ebb4f02c9d4a 100644 | |
| --- a/src/llama.cpp | |
| +++ b/src/llama.cpp | |
| @@ -15,9 +15,6 @@ | |
| #include "ggml-backend.h" | |
| #include "gguf.h" | |
| -// TODO: tmp until the ggml meta backend matures and becomes public | |
| -#include "../src/ggml-ext.h" | |
| - | |
| #include <algorithm> | |
| #include <cassert> | |
| #include <cinttypes> |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment