Skip to content

Instantly share code, notes, and snippets.

@chenrui333
Created April 15, 2026 15:57
Show Gist options
  • Select an option

  • Save chenrui333/ae4d6005cbdddb9e2ba47652cbaa3644 to your computer and use it in GitHub Desktop.

Select an option

Save chenrui333/ae4d6005cbdddb9e2ba47652cbaa3644 to your computer and use it in GitHub Desktop.
llama.cpp b8790 PR21869 backport patch
diff --git a/ggml/include/ggml-backend.h b/ggml/include/ggml-backend.h
index 3c06aeaffb1fc2324eab663e45bfb223bf7f7433..4a8f6d4287da040efef27e8b394e0a48502db2f8 100644
--- a/ggml/include/ggml-backend.h
+++ b/ggml/include/ggml-backend.h
@@ -348,6 +348,53 @@ extern "C" {
// Set a callback to be called for each resulting node during graph compute
GGML_API void ggml_backend_sched_set_eval_callback(ggml_backend_sched_t sched, ggml_backend_sched_eval_callback callback, void * user_data);
+ //
+ // Meta backend
+ //
+
+#define GGML_BACKEND_META_MAX_DEVICES 16
+
+ enum ggml_backend_meta_split_axis {
+ // tensor split by tensor dimensions:
+ GGML_BACKEND_SPLIT_AXIS_0 = 0,
+ GGML_BACKEND_SPLIT_AXIS_1 = 1,
+ GGML_BACKEND_SPLIT_AXIS_2 = 2,
+ GGML_BACKEND_SPLIT_AXIS_3 = 3,
+
+ GGML_BACKEND_SPLIT_AXIS_MIRRORED = 10, // all values on all backends
+ GGML_BACKEND_SPLIT_AXIS_PARTIAL = 11, // each backend has a partial sum
+
+ // for internal bookkeeping only:
+ GGML_BACKEND_SPLIT_AXIS_NONE = 98,
+ GGML_BACKEND_SPLIT_AXIS_UNKNOWN = 99,
+ };
+ GGML_API const char * ggml_backend_meta_split_axis_name(enum ggml_backend_meta_split_axis split_axis);
+
+ struct ggml_backend_meta_split_state {
+ enum ggml_backend_meta_split_axis axis;
+
+ // for tensors with axis >= 0 && axis < GGML_MAX_DIMS:
+ // - each device has a slice of the tensor along the split axis
+ // - most tensors have n_segments == 1 and a contiguous slice of the tensor data
+ // - some tensors have an inhomogeneous data layout along the split axis,
+ // those tensors are divided into segments which are each individually split across devices
+ // - ne has one entry per segment and device that add up to ggml_tensor::ne for that axis,
+ // the outer/inner loops are over segments/devices like [seg0_dev0, seg0_dev1, seg1_dev0, seg1_dev1],
+ // - for example, a transformer may have a fused QKV matrix rather than 3 matrices, those would be 3 separate segments
+ // that each need to be split individually across devices so that each device gets a slice of Q, K, and V
+ int64_t ne[16*GGML_BACKEND_META_MAX_DEVICES];
+ uint32_t n_segments;
+ };
+
+ // function to assign split states for statically allocated tensors, compute tensor split states will be assigned to be compatible:
+ typedef struct ggml_backend_meta_split_state(*ggml_backend_meta_get_split_state_t)(const struct ggml_tensor * tensor, void * userdata);
+
+ // create a new meta device from "simple" devices, meta buffer type/buffer/backend is then derived from this:
+ // TODO: this looks a bit strange - a backend API creates a device. I think we should try to
+ // express this as backend registry functionality instead
+ GGML_API ggml_backend_dev_t ggml_backend_meta_device(
+ ggml_backend_dev_t * devs, size_t n_devs, ggml_backend_meta_get_split_state_t get_split_state, void * get_split_state_ud);
+
//
// Utils
//
diff --git a/ggml/src/ggml-alloc.c b/ggml/src/ggml-alloc.c
index e9b70398ffc55812be95a816ebbaa6abf518b6b6..a4b01ccf8a16a9de98335b698f92203404d454ac 100644
--- a/ggml/src/ggml-alloc.c
+++ b/ggml/src/ggml-alloc.c
@@ -2,6 +2,7 @@
#include "ggml-backend-impl.h"
#include "ggml.h"
#include "ggml-impl.h"
+
#include <assert.h>
#include <limits.h>
#include <stdarg.h>
diff --git a/ggml/src/ggml-backend-meta.cpp b/ggml/src/ggml-backend-meta.cpp
index a2ab8872c4a1cbca831276b15f7bbe5f2b4576be..0a8eea4e9450b28aa6396fd5149d00aeb926e679 100644
--- a/ggml/src/ggml-backend-meta.cpp
+++ b/ggml/src/ggml-backend-meta.cpp
@@ -5,9 +5,6 @@
#include "ggml-alloc.h"
#include "ggml-cpp.h"
-// TODO: tmp
-#include "ggml-ext.h"
-
#include <algorithm>
#include <cassert>
#include <cmath>
diff --git a/ggml/src/ggml-ext.h b/ggml/src/ggml-ext.h
deleted file mode 100644
index 56b0e6d314efd87601fd8bd35b991ce72334ded1..0000000000000000000000000000000000000000
--- a/ggml/src/ggml-ext.h
+++ /dev/null
@@ -1,56 +0,0 @@
-#pragma once
-
-#include "ggml.h"
-#include "ggml-backend.h"
-
-// This is a "staging" header for new ggml API
-// It is not publicly available and it should not be used by 3rd party projects
-//
-// When the API matures enough, it will be moved to the official public API
-
-//
-// Meta backend
-//
-
-#define GGML_BACKEND_META_MAX_DEVICES 16
-
-enum ggml_backend_meta_split_axis {
- // tensor split by tensor dimensions:
- GGML_BACKEND_SPLIT_AXIS_0 = 0,
- GGML_BACKEND_SPLIT_AXIS_1 = 1,
- GGML_BACKEND_SPLIT_AXIS_2 = 2,
- GGML_BACKEND_SPLIT_AXIS_3 = 3,
-
- GGML_BACKEND_SPLIT_AXIS_MIRRORED = 10, // all values on all backends
- GGML_BACKEND_SPLIT_AXIS_PARTIAL = 11, // each backend has a partial sum
-
- // for internal bookkeeping only:
- GGML_BACKEND_SPLIT_AXIS_NONE = 98,
- GGML_BACKEND_SPLIT_AXIS_UNKNOWN = 99,
-};
-GGML_API const char * ggml_backend_meta_split_axis_name(enum ggml_backend_meta_split_axis split_axis);
-
-struct ggml_backend_meta_split_state {
- enum ggml_backend_meta_split_axis axis;
-
- // for tensors with axis >= 0 && axis < GGML_MAX_DIMS:
- // - each device has a slice of the tensor along the split axis
- // - most tensors have n_segments == 1 and a contiguous slice of the tensor data
- // - some tensors have an inhomogenenous data layout along the split axis,
- // those tensors are divided into segments which are each individually split across devices
- // - ne has one entry per segment and device that add up to ggml_tensor::ne for that axis,
- // the outer/inner loops are over segments/devices like [seg0_dev0, seg0_dev1, seg1_dev0, seg1_dev1],
- // - for example, a transformer may have a fused QKV matrix rather than 3 matrices, those would be 3 separate segments
- // that each need to be split individually across devices so that each device gets a slice of Q, K, and V
- int64_t ne[16*GGML_BACKEND_META_MAX_DEVICES];
- uint32_t n_segments;
-};
-
-// function to assign split states for statically allocated tensors, compute tensor split states will be assigned to be compatible:
-typedef struct ggml_backend_meta_split_state(*ggml_backend_meta_get_split_state_t)(const struct ggml_tensor * tensor, void * userdata);
-
-// create a new meta device from "simple" devices, meta buffer type/buffer/backend is then derived from this:
-// TODO: this looks a bit strange - a backend API creates a device. I think we should try
-// express this as a backend registry functionality instead
-GGML_API ggml_backend_dev_t ggml_backend_meta_device(
- ggml_backend_dev_t * devs, size_t n_devs, ggml_backend_meta_get_split_state_t get_split_state, void * get_split_state_ud);
diff --git a/src/llama-model.cpp b/src/llama-model.cpp
index d2ffc1f45f414670e0b23b27cada2b26238e46d4..b265394ef7366202d19df59d63a5713fb70e3f6f 100644
--- a/src/llama-model.cpp
+++ b/src/llama-model.cpp
@@ -18,9 +18,6 @@
#include "ggml.h"
#include "ggml-cpp.h"
-// TODO: tmp until the ggml meta backend matures and becomes public
-#include "../src/ggml-ext.h"
-
#include <algorithm>
#include <cassert>
#include <cfloat>
diff --git a/src/llama.cpp b/src/llama.cpp
index ce575246714f0f3cce06ec7931e02ca6ad6f68c7..484372d8d106fedbd3f4a29abe87ebb4f02c9d4a 100644
--- a/src/llama.cpp
+++ b/src/llama.cpp
@@ -15,9 +15,6 @@
#include "ggml-backend.h"
#include "gguf.h"
-// TODO: tmp until the ggml meta backend matures and becomes public
-#include "../src/ggml-ext.h"
-
#include <algorithm>
#include <cassert>
#include <cinttypes>
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment