RealNeGate · September 14, 2023 14:52 · Sep 14, 2023 · Sep 14, 2023 · Sep 14, 2023 · Sep 14, 2023
diff --git a/tb_comparison.md b/tb_comparison.md
@@ -61,17 +61,17 @@ if (cond) {
 }
 ```
 the big win is making it easier to do later loop optimizations.
-their codegen generates `inc rax` instead of `add rax, 1` which doesn't really matter to me
-oh and the other one is that my register allocator does not coalesce a copy of the loop index
-in practice these loops run at about the same speed but the gap grows in more complex examples
-loop rotation eventually becomes helpful once you need to hoist memory ops from loops
+their codegen generates `inc rax` instead of `add rax, 1` which doesn't really matter to me.
+oh and the other one is that my register allocator does not coalesce a copy of the loop index.
+In practice these loops run at about the same speed but the gap grows in more complex examples.
+Loop rotation eventually becomes helpful once you need to hoist memory ops from loops:
 ```c
 // broadcast item across all elements
 for (int i = 0; i < n; i++) {
   dst[i] = src[0];
 }
 ```
-given the src is a pointer i haven't yet accessed and n can be 0, i can't hoist that load because just placing it outside of the loop means it'll force a load even if n == 0 which could be a problem 
+given the src is a pointer i haven't yet accessed and n can be 0, i can't hoist that load because just placing it outside of the loop means it'll force a load even if n == 0 which could be a problem. 
 but when we do loop rotation...
 ```c
 size_t i = 0;
@@ -83,7 +83,7 @@ if (i < n) {
 }
 ```
 we can now guarentee that once we're in the loop body we're doing at least one trip
-so hoisting is now fair game
+so hoisting is now fair game:
 ```diff
 + // if logic proved we do one trip which means we're met the condition to read src[0]
 + size_t hoisted = src[0];

diff --git a/tb_comparison.md b/tb_comparison.md
@@ -60,8 +60,8 @@ if (cond) {
   } while (cond);
 }
 ```
-the big win is making it easier to do later loop optimizations
-their tiling generates inc rax instead of add rax, 1 which doesn't really matter to me
+the big win is making it easier to do later loop optimizations.
+their codegen generates `inc rax` instead of `add rax, 1` which doesn't really matter to me
 oh and the other one is that my register allocator does not coalesce a copy of the loop index
 in practice these loops run at about the same speed but the gap grows in more complex examples
 loop rotation eventually becomes helpful once you need to hoist memory ops from loops

diff --git a/tb_comparison.md b/tb_comparison.md
@@ -30,18 +30,18 @@ L0000021FE98F0540:
 
 # clang -O1
 max_array:
-        test    rdi, rdi
-        je      .LBB0_3
-        xor     eax, eax
+  test    rdi, rdi
+  je      .LBB0_3
+  xor     eax, eax
 .LBB0_2:
-        movss   xmm0, dword ptr [rsi + 4*rax]
-        maxss   xmm0, dword ptr [rdx + 4*rax]
-        movss   dword ptr [rsi + 4*rax], xmm0
-        inc     rax
-        cmp     rdi, rax
-        jne     .LBB0_2
+  movss   xmm0, dword ptr [rsi + 4*rax]
+  maxss   xmm0, dword ptr [rdx + 4*rax]
+  movss   dword ptr [rsi + 4*rax], xmm0
+  inc     rax
+  cmp     rdi, rax
+  jne     .LBB0_2
 .LBB0_3:
-        ret
+  ret
 ```
 
 you'll notice two checks instead of one this is because they do loop rotation simple rundown is this 

diff --git a/tb_comparison.md b/tb_comparison.md
@@ -45,7 +45,7 @@ max_array:
 ```
 
 you'll notice two checks instead of one this is because they do loop rotation simple rundown is this 
-```
+```c
 while (cond) {
   ...
 }
@@ -65,15 +65,15 @@ their tiling generates inc rax instead of add rax, 1 which doesn't really matter
 oh and the other one is that my register allocator does not coalesce a copy of the loop index
 in practice these loops run at about the same speed but the gap grows in more complex examples
 loop rotation eventually becomes helpful once you need to hoist memory ops from loops
-```
+```c
 // broadcast item across all elements
 for (int i = 0; i < n; i++) {
   dst[i] = src[0];
 }
 ```
 given the src is a pointer i haven't yet accessed and n can be 0, i can't hoist that load because just placing it outside of the loop means it'll force a load even if n == 0 which could be a problem 
 but when we do loop rotation...
-```
+```c
 size_t i = 0;
 if (i < n) {
   do {
@@ -84,7 +84,7 @@ if (i < n) {
 ```
 we can now guarentee that once we're in the loop body we're doing at least one trip
 so hoisting is now fair game
-```
+```diff
 + // if logic proved we do one trip which means we're met the condition to read src[0]
 + size_t hoisted = src[0];
   do {

diff --git a/tb_comparison.md b/tb_comparison.md
@@ -92,10 +92,4 @@ so hoisting is now fair game
 +   dst[i] = hoisted;
   } while (i < n);
 ```
-
-yea that's my ted talk.
-
-I started implementing this a while back and stopped because i only do optimizer stuff on the side, it's lower priority than other crap. 
-it won't be ready until far later.
-
-- NeGate
+Yea that's my ted talk. I started implementing this a while back and stopped because I only do optimizer stuff on the side, it's low priority for now.
diff --git a/tb_comparison.md b/tb_comparison.md
@@ -93,6 +93,9 @@ so hoisting is now fair game
   } while (i < n);
 ```
 
-yea that's my ted talk
-i started implementing this a while back and stopped because i only do optimizer stuff on the side, it's lower priority than other crap 
-it won't be ready until far later
+yea that's my ted talk.
+
+I started implementing this a while back and stopped because i only do optimizer stuff on the side, it's lower priority than other crap. 
+it won't be ready until far later.
+
+- NeGate
diff --git a/tb_comparison.md b/tb_comparison.md
@@ -44,9 +44,8 @@ max_array:
         ret
 ```
 
-you'll notice two checks instead of one
-this is because they do loop rotation
-simple rundown is this 
+you'll notice two checks instead of one this is because they do loop rotation simple rundown is this 
+```
 while (cond) {
   ...
 }
@@ -60,7 +59,7 @@ if (cond) {
     ...
   } while (cond);
 }
- 
+```
 the big win is making it easier to do later loop optimizations
 their tiling generates inc rax instead of add rax, 1 which doesn't really matter to me
 oh and the other one is that my register allocator does not coalesce a copy of the loop index

diff --git a/tb_comparison.md b/tb_comparison.md
@@ -0,0 +1,99 @@
+Let's compare Clang and Cuik/TB a bit to get a picture of what i need to do.
+
+Given this C code:
+```c
+void max_array(size_t n, float* x, float* y) {
+    for (size_t i = 0; i < n; i++) {
+        x[i] = x[i] > y[i] ? x[i] : y[i];
+    }
+}
+```
+Clang and I generate:
+```x86
+# cuik -O1
+max_array:
+  xor RAX, RAX
+L0000021FE98F03C0:
+  mov R9, RAX
+  cmp R9, RCX
+  jnb L0000021FE98F0540
+L0000021FE98F0440:
+  movss XMM0, xmmword [R8 + RAX*4]
+  maxss XMM0, xmmword [RDX + RAX*4]
+  movss xmmword [RDX + RAX*4], XMM0
+  add RAX, 1
+  jmp L0000021FE98F03C0
+L0000021FE98F0540:
+  ret
+
+
+
+# clang -O1
+max_array:
+        test    rdi, rdi
+        je      .LBB0_3
+        xor     eax, eax
+.LBB0_2:
+        movss   xmm0, dword ptr [rsi + 4*rax]
+        maxss   xmm0, dword ptr [rdx + 4*rax]
+        movss   dword ptr [rsi + 4*rax], xmm0
+        inc     rax
+        cmp     rdi, rax
+        jne     .LBB0_2
+.LBB0_3:
+        ret
+```
+
+you'll notice two checks instead of one
+this is because they do loop rotation
+simple rundown is this 
+while (cond) {
+  ...
+}
+
+// into...
+
+if (cond) {
+  // loop invariants are placed here
+  //   none in this example :p
+  do {
+    ...
+  } while (cond);
+}
+
+the big win is making it easier to do later loop optimizations
+their tiling generates inc rax instead of add rax, 1 which doesn't really matter to me
+oh and the other one is that my register allocator does not coalesce a copy of the loop index
+in practice these loops run at about the same speed but the gap grows in more complex examples
+loop rotation eventually becomes helpful once you need to hoist memory ops from loops
+```
+// broadcast item across all elements
+for (int i = 0; i < n; i++) {
+  dst[i] = src[0];
+}
+```
+given the src is a pointer i haven't yet accessed and n can be 0, i can't hoist that load because just placing it outside of the loop means it'll force a load even if n == 0 which could be a problem 
+but when we do loop rotation...
+```
+size_t i = 0;
+if (i < n) {
+  do {
+    dst[i] = src[0];
+    i += 1;
+  } while (i < n);
+}
+```
+we can now guarentee that once we're in the loop body we're doing at least one trip
+so hoisting is now fair game
+```
++ // if logic proved we do one trip which means we're met the condition to read src[0]
++ size_t hoisted = src[0];
+  do {
+-   dst[i] = src[0];
++   dst[i] = hoisted;
+  } while (i < n);
+```
+
+yea that's my ted talk
+i started implementing this a while back and stopped because i only do optimizer stuff on the side, it's lower priority than other crap 
+it won't be ready until far later