Created
September 19, 2019 18:58
-
-
Save omorgan7/aabab1c496fa46a1543323ac0272feaa to your computer and use it in GitHub Desktop.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
// compile with: | |
// c++ range_for.cpp -std=c++11 -m[sse4.2|avx2|avx|arch=native] | |
// run with ./a.out array_size | |
#include <vector> | |
#include <thread> | |
#include <cstdio> | |
#include <numeric> | |
#include <emmintrin.h> | |
#include <smmintrin.h> | |
#include <tmmintrin.h> | |
#include <chrono> | |
#include <cassert> | |
__attribute__((noinline)) | |
int sum(const std::vector<int>& a) | |
{ | |
int i = 0; | |
for (const auto& b : a) i += b; | |
return i; | |
} | |
__attribute__((noinline)) | |
int sum2(const std::vector<int>& a) | |
{ | |
int i = 0; | |
for (const int b : a) i += b; | |
return i; | |
} | |
__attribute__((noinline)) | |
int sum3(const std::vector<int>& a) | |
{ | |
int sum = 0; | |
for (size_t i = 0, I = a.size(); i < I; ++i) sum += a[i]; | |
return sum; | |
} | |
__attribute__((noinline)) | |
int sum4(const std::vector<int>& a) | |
{ | |
size_t i = 0; | |
size_t size = a.size(); | |
constexpr int element_width = 8; | |
static_assert(element_width % 4 == 0, "Must be a multiple of 4!"); | |
size_t size4 = (size / element_width) * element_width; | |
const int* aPtr = a.data(); | |
int sum = 0; | |
// read elements until we align to a 16 byte boundary | |
// this allows the faster aligned load rather than loadu to be used. | |
while (i < size && (reinterpret_cast<unsigned long long>(aPtr + i) % 16) != 0) | |
{ | |
sum += aPtr[i++]; | |
} | |
__m128i a0, a1; | |
a0 = a1 = _mm_setzero_si128(); | |
for (; i < size4; i += element_width) | |
{ | |
__m128i elems0 = _mm_load_si128(reinterpret_cast<const __m128i*>(aPtr + i + 0)); | |
__m128i elems1 = _mm_load_si128(reinterpret_cast<const __m128i*>(aPtr + i + 4)); | |
// __m128i elems2 = _mm_load_si128(reinterpret_cast<const __m128i*>(aPtr + i + 8)); | |
// __m128i elems3 = _mm_load_si128(reinterpret_cast<const __m128i*>(aPtr + i + 12)); | |
a0 = _mm_add_epi32(a0, elems0); | |
a1 = _mm_add_epi32(a1, elems1); | |
// a2 = _mm_add_epi32(a2, elems2); | |
// a3 = _mm_add_epi32(a3, elems3); | |
} | |
a0 = _mm_add_epi32(a0, a1); | |
// a2 = _mm_add_epi32(a2, a3); | |
// a0 = _mm_add_epi32(a0, a2); | |
// handle remaining cases for sizes non-divisible by 4 | |
for (; i < size; ++i) | |
{ | |
sum += aPtr[i]; | |
} | |
a0 = _mm_hadd_epi32(a0, a0); | |
a0 = _mm_hadd_epi32(a0, a0); | |
int out = _mm_extract_epi32(a0, 0); | |
return sum + out; | |
} | |
void do_not_optimise(int a) | |
{ | |
if (std::this_thread::get_id() == std::thread::id()) | |
{ | |
printf("%d\n", a); | |
} | |
} | |
int main(int argc, char** argv) | |
{ | |
if (argc < 2) return 1; | |
std::vector<int> a(static_cast<size_t>(std::atoi(argv[1]))); | |
std::iota(a.begin(), a.end(), 0); | |
{ | |
int s = sum(a); | |
do_not_optimise(s); | |
auto time0 = std::chrono::high_resolution_clock::now(); | |
s = sum(a); | |
auto time1 = std::chrono::high_resolution_clock::now(); | |
do_not_optimise(s); | |
printf("sum() took: %llu nanoseconds\n", (time1 - time0).count()); | |
} | |
{ | |
int s = sum2(a); | |
do_not_optimise(s); | |
auto time0 = std::chrono::high_resolution_clock::now(); | |
s = sum2(a); | |
auto time1 = std::chrono::high_resolution_clock::now(); | |
do_not_optimise(s); | |
printf("sum2() took: %llu nanoseconds\n", (time1 - time0).count()); | |
} | |
{ | |
int s = sum3(a); | |
do_not_optimise(s); | |
auto time0 = std::chrono::high_resolution_clock::now(); | |
s = sum3(a); | |
auto time1 = std::chrono::high_resolution_clock::now(); | |
do_not_optimise(s); | |
printf("sum3() took: %llu nanoseconds\n", (time1 - time0).count()); | |
} | |
{ | |
int s = sum4(a); | |
do_not_optimise(s); | |
auto time0 = std::chrono::high_resolution_clock::now(); | |
s = sum4(a); | |
auto time1 = std::chrono::high_resolution_clock::now(); | |
do_not_optimise(s); | |
printf("sum4() took: %llu nanoseconds\n", (time1 - time0).count()); | |
} | |
} |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment