Last active
August 21, 2025 17:29
-
-
Save ivankra/b1c541c5ffba7eff9e436790f8d79b92 to your computer and use it in GitHub Desktop.
Tiny tail-calling interpreter demo
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| // Tiny tail-calling interpreter demo | |
| // https://godbolt.org/z/TPozdErM5 | |
| // | |
| // As seen in | |
| // * Protobuf: | |
| // https://blog.reverberate.org/2021/04/21/musttail-efficient-interpreters.html | |
| // * LuaJIT remake (Deegen): | |
| // https://sillycross.github.io/2022/11/22/2022-11-22/ | |
| // * CPython 3.14: | |
| // https://lwn.net/Articles/1010905/ | |
| // https://github.com/python/cpython/issues/128563 | |
| // | |
| // Idea in a gist: a typical giant switch-case interpreter loop is an extreme | |
| // outlier case for compilers to optimize due to many variables and code paths. | |
| // An alternative here is to split it into many small, easily-optimizable | |
| // functions and dispatch between them using tail calls that, hopefully, | |
| // would get optimized away to a simple jump. | |
| // | |
| // For certainty rather than hope, new 'musttail' annotation is needed from newer | |
| // compilers (gcc 15+, clang 13+), or "return goto" from recent n3266 proposed | |
| // extension to C. | |
| // | |
| // 'preserve_none' annotation (clang only for now) complements it by getting | |
| // rid of prologues/epilogues, allowing data to effortlessly flow in registers. | |
| // Finally, plain C/C++ can catch up with LuaJIT's hand-written assembly and | |
| // fancy engines' macroassemblers (V8 Ignition etc)... | |
| // | |
| // Another benefit is this is easier on CPU's branch predictor as branches | |
| // are spread over multiple code locations rather than one hot spot (but this | |
| // is not unique to tail calling, also can be achieved with computed goto's | |
| // instead of break's in the switch.) | |
| #include <bits/stdc++.h> | |
| using i64 = int64_t; | |
| #if __has_attribute(preserve_none) | |
| #define PRESERVE_NONE __attribute__((preserve_none)) | |
| #elif __has_attribute(preserve_most) | |
| #define PRESERVE_NONE __attribute__((preserve_most)) | |
| #else | |
| #warning "preserve_none not supported" | |
| #define PRESERVE_NONE | |
| #endif | |
| #if defined(__clang__) && __has_cpp_attribute(clang::musttail) | |
| #define MUSTTAIL [[clang::musttail]] | |
| #elif defined(__GNUC__) && __has_cpp_attribute(gnu::musttail) | |
| #define MUSTTAIL [[gnu::musttail]] | |
| #elif __has_attribute(musttail) | |
| #define MUSTTAIL __attribute__((musttail)) | |
| #else | |
| #warning "musttail not supported" | |
| #define MUSTTAIL | |
| #endif | |
| enum Op : i64 { OP_PUSH, OP_DUP, OP_ADD, OP_JNZ, OP_PRINT, OP_HALT }; | |
| struct VM { | |
| std::vector<i64> code; | |
| std::vector<i64> stack; | |
| VM() : stack(1000) {} | |
| // Put any hot vars into parameters to pin them to registers | |
| #define TAIL_PARAMS i64* ip, i64* sp | |
| #define TAIL_ARGS ip, sp | |
| #define HANDLER(name) PRESERVE_NONE static i64 name(TAIL_PARAMS) | |
| #define DISPATCH MUSTTAIL return ((VM::Handler*)jump_table)[*ip](TAIL_ARGS) | |
| using Handler = PRESERVE_NONE i64(*)(TAIL_PARAMS); | |
| static Handler jump_table[]; | |
| i64 run() { | |
| i64* ip = code.data(); | |
| return jump_table[*ip](ip, stack.data()); | |
| } | |
| HANDLER(op_push) { | |
| *sp++ = ip[1]; | |
| ip += 2; | |
| DISPATCH; | |
| } | |
| HANDLER(op_dup) { | |
| *sp++ = sp[-1]; | |
| ip++; | |
| DISPATCH; | |
| } | |
| HANDLER(op_add) { | |
| i64 b = *--sp, a = *--sp; | |
| *sp++ = a + b; | |
| ip++; | |
| DISPATCH; | |
| } | |
| HANDLER(op_jnz) { | |
| ip += *--sp ? ip[1] : 2; | |
| DISPATCH; | |
| } | |
| HANDLER(op_print) { | |
| std::cout << *--sp << std::endl; | |
| ip++; | |
| DISPATCH; | |
| } | |
| HANDLER(op_halt) { | |
| return 0; | |
| } | |
| }; | |
| VM::Handler VM::jump_table[] = { | |
| &VM::op_push, &VM::op_dup, &VM::op_add, | |
| &VM::op_jnz, &VM::op_print, &VM::op_halt | |
| }; | |
| int main() { | |
| std::cout << "Test 1: 10 + 5 - 3 = "; | |
| VM vm; | |
| vm.code = { | |
| OP_PUSH, 10, | |
| OP_PUSH, 5, | |
| OP_ADD, | |
| OP_PUSH, -3, | |
| OP_ADD, | |
| OP_PRINT, | |
| OP_HALT | |
| }; | |
| vm.run(); | |
| std::cout << "Test 2: 1B loop stress test..." << std::endl; | |
| vm.code = { | |
| OP_PUSH, 1000000000LL, | |
| OP_PUSH, -1, | |
| OP_ADD, | |
| OP_DUP, | |
| OP_JNZ, -4, | |
| OP_HALT | |
| }; | |
| auto t0 = std::chrono::high_resolution_clock::now(); | |
| vm.run(); | |
| auto t1 = std::chrono::high_resolution_clock::now(); | |
| auto duration = std::chrono::duration_cast<std::chrono::milliseconds>(t1 - t0); | |
| std::cout << duration.count() << "ms\n"; | |
| } |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment