Add two float32 tensors using ggml. Each tensor takes about 2 GB of memory.
#include "ggml.h" | |
#include "ggml-cpu.h" | |
#ifdef GGML_USE_CUDA | |
#include "ggml-cuda.h" | |
#endif | |
#include <vector> | |
#include <iostream> | |
#include <chrono> | |
ggml_backend_t backend = NULL; | |
ggml_gallocr_t allocr = NULL; | |
using namespace std::chrono; | |
void init_backend() {
#ifdef GGML_USE_CUDA
    fprintf(stderr, "%s: using CUDA backend\n", __func__);
    backend = ggml_backend_cuda_init(0); // init device 0
    if (!backend) {
        fprintf(stderr, "%s: ggml_backend_cuda_init() failed\n", __func__);
    }
#endif
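
    // fall back to the CPU backend if CUDA is not compiled in or failed to initialize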
    if (!backend) {
        backend = ggml_backend_cpu_init();
    }
}

void init_mem_allocator() {
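    // a graph allocator assigns backend buffers to every tensor in a compute graph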
    allocr = ggml_gallocr_new(ggml_backend_get_default_buffer_type(backend));
}

void predict() {
    // Record initial VRAM state (only available when using the CUDA backend)
#ifdef GGML_USE_CUDA
    size_t free_mem_start, total_mem;
    ggml_backend_cuda_get_device_memory(0, &free_mem_start, &total_mem);
#endif

    // create a context
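    // no_alloc = true: the context only holds tensor and graph metadata;
    // the tensor data itself is allocated later in backend buffers by the graph allocator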
    struct ggml_init_params params = {
        /*.mem_size   =*/ ggml_tensor_overhead() * GGML_DEFAULT_GRAPH_SIZE + ggml_graph_overhead(),
        /*.mem_buffer =*/ NULL,
        /*.no_alloc   =*/ true,
    };
    struct ggml_context* ctx = ggml_init(params);

    const int N = 1024 * 1024 * 500;
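    // 1024 * 1024 * 500 float32 elements, i.e. roughly 2 GB of data per tensor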

    // 1. Define the tensor variables
    struct ggml_tensor* a = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, N);
    struct ggml_tensor* b = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, N);

    // 2. Define the computation graph
    struct ggml_tensor* result = ggml_add(ctx, a, b);

    struct ggml_cgraph* gf = ggml_new_graph(ctx);
    ggml_build_forward_expand(gf, result);

    // 3. Allocate memory for the tensor variables, and assign the data
    ggml_gallocr_alloc_graph(allocr, gf);
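
    // prepare the input data on the host; it is copied into the backend buffers below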
    std::vector<float> a_data(N);
    std::vector<float> b_data(N);
    for (size_t i = 0; i < N; ++i) {
        a_data[i] = 1.5f;
        b_data[i] = 2.5f;
    }

    size_t size = N * ggml_type_size(GGML_TYPE_F32);
    printf("host mem usage: %zu MB\n", 2 * size / (1024 * 1024)); // one per array
    ggml_backend_tensor_set(a, a_data.data(), 0, ggml_nbytes(a));
    ggml_backend_tensor_set(b, b_data.data(), 0, ggml_nbytes(b));

    // 4. Run the computation, and read the result
    auto start = high_resolution_clock::now();

    ggml_backend_graph_compute(backend, gf);

    auto stop = high_resolution_clock::now();
    std::cout << "Time taken: " << duration_cast<milliseconds>(stop - start).count() << " ms" << std::endl;

    // Record final VRAM state and calculate peak VRAM usage (CUDA backend only)
#ifdef GGML_USE_CUDA
    size_t free_mem_end;
    ggml_backend_cuda_get_device_memory(0, &free_mem_end, &total_mem);

    size_t peak_usage = free_mem_start - free_mem_end;
    printf("Peak VRAM usage: %f MB\n", peak_usage / (1024.0 * 1024.0));
#endif

    struct ggml_tensor* result_node = ggml_graph_node(gf, -1); // get the last node in the graph

    int n = ggml_nelements(result_node); // create an array to store the result data
    std::vector<float> result_data(n);

    // copy the data from the backend memory into the result array
    ggml_backend_tensor_get(result_node, result_data.data(), 0, ggml_nbytes(result_node));

    // print the data
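    // each element of the result should be 1.5 + 2.5 = 4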
    for (int i = 0; i < 10; i++) {
        std::cout << result_data[i] << ", ";
    }
    std::cout << std::endl;

    // free the resources
    ggml_free(ctx);
}

int main(int argc, char* argv[]) {
    init_backend();
    init_mem_allocator();
    predict();

    // free the resources
    ggml_gallocr_free(allocr);
    ggml_backend_free(backend);

    return 0;
}