-
-
Save cipri-tom/e4f28c2785ff0de30b71 to your computer and use it in GitHub Desktop.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
--- Format a string and write it to the default output file.
-- No newline is appended; include one in the format string if needed.
-- @param s   format string (used as the receiver of `:format`)
-- @param ... values substituted into the format string
local function printf(s, ...)
  local text = s:format(...)
  io.write(text)
end
local ffi = require("ffi")

-- Prototypes for the benchmark helpers exported by callback.so.
-- get_i/get_d are declared `void` because that is how callback.c defines
-- them; declaring a non-void result for a void function makes the FFI read
-- a return value that was never produced.
ffi.cdef[[
typedef void (*cb)(void);

void set_v(int n, void (*)(void));
void set_i(int n, void (*)(int));
void set_d(int n, void (*)(double));

void get_i(int n, int    (*)(void));
void get_d(int n, double (*)(void));

void call_v(void);
void call_i(int);
void call_d(double);

void loop(int n);
]]

-- Shared library built from callback.c, and the timing helper module.
local callback = ffi.load("./callback.so")
local timeit = require("timeit")
-- Counter shared by the callbacks below.
local v = 0

--- Callback without arguments: increments the shared counter.
local function lset_v()
  v = v + 1
end

--- Callback with one argument: increments only its local copy.
local function lset_a(a)
  a = a + 1
end

--- Callback producing a value: twice the shared counter.
local function lget()
  return v * 2
end
-- Header row of the result table.
print("operation ", "reps ", "time(s)", "nsec/call")

-- C-into-Lua cases: `name` is the C driver in callback.so, `func` is the Lua
-- function that driver calls back n times.
local c2l = {
  { name = "set_v", func = lset_v },
  { name = "set_i", func = lset_a },
  { name = "set_d", func = lset_a },
  { name = "get_i", func = lget },
  { name = "get_d", func = lget },
}

for idx = 1, #c2l do
  local case = c2l[idx]
  -- timeit supplies the repeat count; the C driver performs the repetition.
  local function run(n)
    callback[case.name](n, case.func)
  end
  printf("C into Lua %-12s %s\n", case.name, timeit(run))
end
-- Remaining cases, run in order. Each entry is {label, body}; the body gets
-- the repeat count from timeit and does the repetition itself.
local direct_cases = {
  { "Lua into C call(void) ", function(n)
    for _ = 1, n do callback.call_v() end
  end },
  { "Lua into C call(int) ", function(n)
    for _ = 1, n do callback.call_i(3) end
  end },
  { "Lua into C call(double)", function(n)
    for _ = 1, n do callback.call_d(3.5) end
  end },
  { "Lua into Lua ", function(n)
    for _ = 1, n do lset_v() end
  end },
  -- Baselines: an empty loop on the C side and on the Lua side.
  { "C empty loop ", function(n)
    callback.loop(n)
  end },
  { "Lua empty loop ", function(n)
    for _ = 1, n do end
  end },
}

for idx = 1, #direct_cases do
  local label, body = direct_cases[idx][1], direct_cases[idx][2]
  print(label, timeit(body))
end
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
// gcc -std=c99 -Wall -pedantic -O3 -shared -static-libgcc -fPIC callback.c -o callback.so
// --- CALLS ------------------------------------------------------------------

// Call target with no arguments and no work.
void call_v(void) {
    return;
}

// Call target taking an int; only the by-value copy is modified.
void call_i(int v) {
    v = v + 5;
}

// Call target taking a double; only the by-value copy is modified.
void call_d(double v) {
    v = v + 5.0;
}
// Run an empty loop body n times.
void loop(int n) {
    int i = 0;
    while (i < n) {
        /* empty asm statement: keeps the compiler from removing the loop */
        __asm__("");
        i++;
    }
}
// --- SETTERS ----------------------------------------------------------------

// Call f() n times.
void set_v(int n, void (*f)(void)) {
    for (int remaining = n; remaining > 0; remaining--) {
        f();
    }
}

// Call f(i) for i = 0 .. n-1, in ascending order.
void set_i(int n, void (*f)(int)) {
    int i = 0;
    while (i < n) {
        f(i);
        i++;
    }
}

// Call f(3.0) n times.
void set_d(int n, void (*f)(double)) {
    const double a = 3.0;
    for (int remaining = n; remaining > 0; remaining--) {
        f(a);
    }
}
// --- GETTERS ----------------------------------------------------------------

// Call f() n times, keeping only the most recent result (which is discarded).
void get_i(int n, int (*f)(void)) {
    int v = 0;  // initialized so the read below is defined even when n <= 0
    for (int i = 0; i < n; i++) {
        v = f();
    }
    (void)v;  // avoid 'set-but-unused' warning
}

// Same as get_i, for a callback returning double.
void get_d(int n, double (*f)(void)) {
    double v = 0.0;  // initialized so the read below is defined even when n <= 0
    for (int i = 0; i < n; i++) {
        v = f();
    }
    (void)v;  // avoid 'set-but-unused' warning
}
// --- PUSH vs PULL -----------------------------------------------------------

// Callback type: given a byte array and its length, returns a multiplier.
typedef double (*getter_fp)(int len, unsigned char mono[len]);

// Element count followed by the elements themselves (flexible array member).
struct Arr {
    int    size;
    double data[];
};

enum constants {MONO_LEN = 5};
unsigned char mono[MONO_LEN] = {1, 2, 3, 4, 5};

// --- --- PUSH style ---------------------------------------------------------

// Multiply every element of a by the value the callback returns for `mono`.
// The callback is invoked once per element.
void push_style(struct Arr *a, getter_fp get_multiplier)
{
    for (int i = 0; i < a->size; ++i)
        a->data[i] *= get_multiplier(MONO_LEN, mono);
}

// --- --- PULL style ---------------------------------------------------------

// Length of the `mono` array. Declared with (void) so the function has a real
// prototype, consistent with call_v(void).
int get_mono_len(void)
{
    return MONO_LEN;
}

// Return the `mono` array; the same array is returned for every idx.
unsigned char*
get_mono(int idx)
{
    (void)idx;  // intentionally unused
    return mono;
}
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
-- OBJECTIVE : apply a LUA function to all members of an array
-- -- PUSH style: do it in C side, with callback to the LUA function
-- -- PULL style: do it in Lua side, with calls to C to get necessary info
local ffi = require "ffi"

-- Declarations for the push/pull helpers exported by callback.so.
-- `data[?]` makes struct Arr variable-length on the Lua side.
ffi.cdef [[
typedef double (*getter_fp)(int len, unsigned char mono[]);
struct Arr {
  int size;
  double data[?];
};
void push_style(struct Arr *a, getter_fp get_multiplier);
int get_mono_len();
unsigned char* get_mono(int idx);
]]

local callback = ffi.load "./callback.so"
local timeit   = require "timeit"
local arr_t    = ffi.typeof "struct Arr"
-- the callback
--- Sum entries 0 .. len-1 of `mono` and return half of that sum.
-- @param len  number of entries to read
-- @param mono 0-indexed array of numbers
-- @return the sum multiplied by 0.5
local function lget_multiplier(len, mono)
  local total = 0
  for idx = 0, len - 1 do
    total = total + mono[idx]
  end
  return total * 0.5
end
-- PUSH style -----------------------------------------------------------------
--- Build an n-element array holding 0 .. n-1 and let the C side scale it,
-- calling back into `lget_multiplier` for every element.
-- @param n number of elements
-- @return the scaled `struct Arr`
local function push_style(n)
  local a = arr_t(n, {n})
  for i = 0, n - 1 do a.data[i] = i end
  local cb = ffi.cast("getter_fp", lget_multiplier)
  callback.push_style(a, cb)
  -- FFI callbacks are never garbage collected; release the slot explicitly,
  -- otherwise every call to this function leaks one.
  cb:free()
  return a
end
-- PULL style -----------------------------------------------------------------
--- Build an n-element array holding 0 .. n-1 and scale it on the Lua side,
-- asking the C library for the byte array once per element.
-- @param n number of elements
-- @return the scaled `struct Arr`
local function pull_style(n)
  local a = arr_t(n, {n})
  for k = 0, n - 1 do
    a.data[k] = k
  end

  local mono_len = callback.get_mono_len()
  for k = 0, n - 1 do
    local mono = callback.get_mono(k)
    a.data[k] = a.data[k] * lget_multiplier(mono_len, mono)
  end
  return a
end
-- CHECK ----------------------------------------------------------------------
-- Both styles must produce the same array before they are timed.
local push_v = push_style(100)
local pull_v = pull_style(100)
assert(push_v.size == pull_v.size)
do
  local idx, count = 0, push_v.size
  while idx < count do
    assert(push_v.data[idx] == pull_v.data[idx])
    idx = idx + 1
  end
end

-- BENCH ----------------------------------------------------------------------
local push_result = timeit(push_style)
print("PUSH style", push_result)
local pull_result = timeit(pull_style)
print("PULL style", pull_result)
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
operation reps time(s) nsec/call | |
C into Lua set_v 10000000 0.498 49.817 | |
C into Lua set_i 10000000 0.662 66.249 | |
C into Lua set_d 10000000 0.681 68.143 | |
C into Lua get_i 10000000 0.633 63.272 | |
C into Lua get_d 10000000 0.650 64.990 | |
Lua into C call(void) 100000000 0.381 3.807 | |
Lua into C call(int) 100000000 0.381 3.815 | |
Lua into C call(double) 100000000 0.415 4.154 | |
Lua into Lua 100000000 0.104 1.039 | |
C empty loop 1000000000 0.695 0.695 | |
Lua empty loop 1000000000 0.693 0.693 | |
PUSH style 1000000 0.158 158.256 | |
PULL style 1000000 0.207 207.297 | |
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
--- Time a function that performs its own repetition.
-- `func` receives a repeat count and must do that many iterations itself.
-- The count starts at 100000 and grows tenfold per attempt until a single run
-- takes more than 0.1 s of os.clock() time or the count reaches 1e9. The
-- function is then run four more times with that count and the smallest
-- elapsed time is kept. Modeled loosely on Python's timeit.
-- @param func function(reps) to measure
-- @return string with the repeat count, the best time in seconds and the
--         nanoseconds per repetition, separated by tabs
return function(func)
  -- One timed run of `func` at the given repeat count.
  local function run_once(count)
    local began = os.clock()
    func(count)
    return os.clock() - began
  end

  local reps = 10000
  local best
  repeat
    reps = reps * 10
    best = run_once(reps)
  until best > 0.1 or reps >= 1e9

  for _ = 1, 4 do
    best = math.min(best, run_once(reps))
  end

  local nsec_per_rep = best / reps * 1e9
  return string.format("%10d\t%.3f\t%7.3f", reps, best, nsec_per_rep)
end
Thanks for the detailed analysis and run on very different architectures!
Really nice to see ARM in there too, we can only expect it to get better, indeed.
I had no idea people are still interested in this benchmark. I re-visited the answers to the question and I find that indeed, this is measuring the CPU more than anything else. I really recommend people take Josh's suggestion and benchmark everything in context, since this empty benchmark may not replicate to one's real scenario.
But other than that, we can see that the performance is about the same regardless of the direction (Lua -> C, C->Lua) except on ARM, so I'd first write for readability and ease of use, and then only move code to the other side if it turns out to be a bottleneck.
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment
I happened to be curious about the real-world difference between push vs pull approaches - how much of a performance impact there is, but also what sort of timescale the slowdown is occurring at (which is arguably more important for orientation).
A quick google found https://stackoverflow.com/questions/12329128/luajit-ffi-callback-performance/12435278#12435278, an answer that referenced a source Gist with benchmark code. Unsure how to run it (-.-) I noticed it had a few forks - then stumbled on this interesting-looking slightly extended version. (Edit: Yup, I didn't scroll slightly further down to find the additional answer directly underneath the one I found, pointing straight here...)
After figuring out that, y'know, you just run the two scripts ;) (but thanks very much for adding the `gcc` invocation, I might not have figured that out myself before giving up), I got sliiiightly sidetracked wondering what sort of performance differences might exist in various compute environments.
My own hardware is getting pretty old; it would be very interesting to see how much faster the latest chipsets perform, and whether they're significantly faster (eg, by 1-10%) or only slightly faster.
Without reference points I'm not sure if the free-tier public cloud resources I've tested against just happen to closely correlate with my own systems' performance, or whether things are broadly neck-and-neck.
It was particularly interesting to discover what hardware I was running on in some environments! EPYC's getting around... (duh)
All results below are from 2.1.0-beta3. I wondered about also comparing with Git HEAD, and while that does sound interesting, I didn't want to make this twice as long...
i3-3220 (3.3GHz) "server" w/ 1600MHz DDR3:
EliteBook 8470p (i5-3360M (2.8GHz), 1600MHz DDR3) + Chrome chewing ~12% CPU 🔥:
Google Cloud shell session (4 cores, EPYC 7B12):
Google Cloud free tier f1-micro (1 core, "Xeon(R) CPU @ 2.20GHz"):
AWS free tier t2-micro (1 core, E5-2676 v3 @ 2.4GHz):
Contabo VPS (2 cores, E5-2620 v3 @ 2.4GHz) + some (I/O-throttled) background processing 💽:
Oracle Cloud free tier, VM.Standard.E2.1.Micro (2 cores, EPYC 7551):
Oracle Cloud free tier, VM.Standard.A1.Flex (4/4 cores enabled, Ampere Neoverse N1):
Of note is that LuaJIT's AArch64 JIT (as of 2.1.0) doesn't optimize Lua-into-C yet. (Maybe Git HEAD does?)
An extra tidbit: I had to download and build LuaJIT from source on a couple of environments to get the same version everywhere. It was a bit of a small-pause moment to realize how fast the ARM system was (you did read the last entry above, right? 😺) in comparison to the rest of the systems.
Using `make -j32`, I saw:
Contabo (2 cores):
AWS t2-micro (1 core):
Google Cloud shell session (4 cores):
Oracle Cloud VM.Standard.A1.Flex (4 cores):
It's been very interesting to realize what sorts of resources are being made available for free (with the Ampere offering).
It's very reasonable to posit that the current situation only exists to bootstrap developers' interest, and may change once there is sufficient mindshare (and tenancy!) saturation - but if this is the sort of performance on offer, once that mindshare has been established, it'll rapidly become entrenched and demand will only increase.
Scaling 4 cores and 24GB RAM out for free is actually kind of interesting - that's significant enough that, if the pricing structure *were* to change (c'est la vie Amazon Cloud Drive), other provider(s) would likely be able to step up and maintain this new status quo, because of the favorable economies of scale (the Ampere compute shape scales out to 80 cores and 512GB RAM).
This has been unexpectedly educational.