Skip to content

Instantly share code, notes, and snippets.

@BeMg
Last active May 29, 2020 08:16
Show Gist options
  • Save BeMg/2848694999c420aa1c3221a2ad93517b to your computer and use it in GitHub Desktop.
Save BeMg/2848694999c420aa1c3221a2ad93517b to your computer and use it in GitHub Desktop.
transpose example
; ModuleID = 'transpose_barrier_condition.bc'
source_filename = "transpose_barrier_condition.bc"
target datalayout = "e-m:e-p:32:32-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-v256:256-v512:512-v1024:1024"
target triple = "x86_64-unknown-unknown-unknown"
@transpose.buffer = internal unnamed_addr addrspace(3) global [8 x [8 x float]] undef, align 4
@kernel_config_l2 = external addrspace(2) constant [32 x i32]
; Function Attrs: noduplicate
declare protected void @barrier(i32) #0
; Function Attrs: noinline nounwind
define void @transpose(i32 %P, i32 %Q, float addrspace(1)* nocapture readonly %input, float addrspace(1)* nocapture %output) #1 {
pregion_for_init15:
%0 = load i32, i32 addrspace(2)* getelementptr inbounds ([32 x i32], [32 x i32] addrspace(2)* @kernel_config_l2, i32 0, i32 4), align 4
%1 = load i32, i32 addrspace(2)* getelementptr inbounds ([32 x i32], [32 x i32] addrspace(2)* @kernel_config_l2, i32 0, i32 5), align 4
call void @barrier(i32 0) #2
%2 = load i32, i32 addrspace(2)* getelementptr inbounds ([32 x i32], [32 x i32] addrspace(2)* @kernel_config_l2, i32 0, i32 10), align 4, !tbaa !2
%3 = load i32, i32 addrspace(2)* getelementptr inbounds ([32 x i32], [32 x i32] addrspace(2)* @kernel_config_l2, i32 0, i32 7), align 4, !tbaa !2
%4 = sub i32 %2, %3
%5 = udiv i32 %4, %0
%6 = shl i32 %5, 3
%7 = load i32, i32 addrspace(2)* getelementptr inbounds ([32 x i32], [32 x i32] addrspace(2)* @kernel_config_l2, i32 0, i32 11), align 4, !tbaa !2
%8 = load i32, i32 addrspace(2)* getelementptr inbounds ([32 x i32], [32 x i32] addrspace(2)* @kernel_config_l2, i32 0, i32 8), align 4, !tbaa !2
%9 = sub i32 %7, %8
%10 = udiv i32 %9, %1
%11 = shl i32 %10, 3
%12 = mul i32 %11, %P
%13 = add i32 %6, %12
%14 = getelementptr inbounds float, float addrspace(1)* %input, i32 %13
%15 = load float, float addrspace(1)* %14, align 4, !tbaa !6
store float %15, float addrspace(3)* getelementptr inbounds ([8 x [8 x float]], [8 x [8 x float]] addrspace(3)* @transpose.buffer, i32 0, i32 0, i32 0), align 4, !tbaa !6
%16 = mul i32 %6, %Q
%17 = add i32 %11, %16
%18 = getelementptr inbounds float, float addrspace(1)* %output, i32 %17
store float %15, float addrspace(1)* %18, align 4, !tbaa !6
br label %pregion_for_entry.pregion_for_init12
pregion_for_entry.pregion_for_init12: ; preds = %pregion_for_cond17, %pregion_for_init15
%19 = phi i32 [ 0, %pregion_for_init15 ], [ %32, %pregion_for_cond17 ]
%20 = phi i32 [ 1, %pregion_for_init15 ], [ 0, %pregion_for_cond17 ]
%21 = add i32 %6, %19
%22 = mul i32 %21, %Q
br label %pregion_for_cond14
pregion_for_cond14: ; preds = %pregion_for_cond14, %pregion_for_entry.pregion_for_init12
%23 = phi i32 [ %20, %pregion_for_entry.pregion_for_init12 ], [ %30, %pregion_for_cond14 ]
%24 = load float, float addrspace(1)* %14, align 4, !tbaa !6, !llvm.mem.parallel_loop_access !8
store float %24, float addrspace(3)* getelementptr inbounds ([8 x [8 x float]], [8 x [8 x float]] addrspace(3)* @transpose.buffer, i32 0, i32 0, i32 0), align 4, !tbaa !6, !llvm.mem.parallel_loop_access !8
%25 = add i32 %11, %23
%26 = getelementptr inbounds [8 x [8 x float]], [8 x [8 x float]] addrspace(3)* @transpose.buffer, i32 0, i32 %23, i32 %19
%27 = load float, float addrspace(3)* %26, align 4, !tbaa !6, !llvm.mem.parallel_loop_access !8
%28 = add i32 %25, %22
%29 = getelementptr inbounds float, float addrspace(1)* %output, i32 %28
store float %27, float addrspace(1)* %29, align 4, !tbaa !6, !llvm.mem.parallel_loop_access !8
%30 = add i32 %23, 1
%31 = icmp slt i32 %30, %0
br i1 %31, label %pregion_for_cond14, label %pregion_for_cond17, !llvm.loop !9
pregion_for_cond17: ; preds = %pregion_for_cond14
%32 = add i32 %19, 1
%33 = icmp slt i32 %32, %1
br i1 %33, label %pregion_for_entry.pregion_for_init12, label %exit.barrier, !llvm.loop !10
exit.barrier: ; preds = %pregion_for_cond17
call void @barrier(i32 0) #2
ret void
}
attributes #0 = { noduplicate "frame-pointer"="none" "less-precise-fpmad"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "no-realign-stack" "stack-protector-buffer-size"="0" "unsafe-fp-math"="false" "use-soft-float"="false" }
attributes #1 = { noinline nounwind "_kernel_local_size"="256" "frame-pointer"="none" "less-precise-fpmad"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "no-realign-stack" "stack-protector-buffer-size"="0" "unsafe-fp-math"="false" "use-soft-float"="false" }
attributes #2 = { nounwind }
!llvm.ident = !{!0}
!opencl.kernels = !{!1}
!0 = !{!"clang version 3.6.0 (git://git.ti.com/opencl/clang.git 5b006f07bdc22b5ae6917eecdfe243908dd7b029) (git://git.ti.com/opencl/llvm.git 09780c6750b30da81e4a0a805aedf1699fbc37c7)"}
!1 = !{void (i32, i32, float addrspace(1)*, float addrspace(1)*)* @transpose}
!2 = !{!3, !3, i64 0}
!3 = !{!"int", !4, i64 0}
!4 = !{!"omnipotent char", !5, i64 0}
!5 = !{!"Simple C/C++ TBAA"}
!6 = !{!7, !7, i64 0}
!7 = !{!"float", !4, i64 0}
!8 = !{!9, !10}
!9 = distinct !{!9}
!10 = distinct !{!10}
; ModuleID = 'transpose_barrier_condition.bc'
source_filename = "transpose_barrier_condition.bc"
target datalayout = "e-m:e-p:32:32-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-v256:256-v512:512-v1024:1024"
target triple = "x86_64-unknown-unknown-unknown"
@transpose.buffer = internal unnamed_addr addrspace(3) global [8 x [8 x float]] undef, align 4
@kernel_config_l2 = external addrspace(2) constant [32 x i32]
@str = private unnamed_addr constant [12 x i8] c"Some Print.\00"
@str2 = private unnamed_addr constant [13 x i8] c"Print again.\00"
; Function Attrs: noduplicate
declare protected void @barrier(i32) #0
; Function Attrs: nounwind
declare i32 @puts(i8* nocapture readonly) #1
; Function Attrs: noinline nounwind
define void @transpose(i32 %P, i32 %Q, float addrspace(1)* nocapture readonly %input, float addrspace(1)* nocapture %output) #2 {
pregion_for_init8:
%0 = load i32, i32 addrspace(2)* getelementptr inbounds ([32 x i32], [32 x i32] addrspace(2)* @kernel_config_l2, i32 0, i32 4), align 4
%1 = load i32, i32 addrspace(2)* getelementptr inbounds ([32 x i32], [32 x i32] addrspace(2)* @kernel_config_l2, i32 0, i32 5), align 4
call void @barrier(i32 0) #1
%2 = load i32, i32 addrspace(2)* getelementptr inbounds ([32 x i32], [32 x i32] addrspace(2)* @kernel_config_l2, i32 0, i32 10), align 4, !tbaa !2
%3 = load i32, i32 addrspace(2)* getelementptr inbounds ([32 x i32], [32 x i32] addrspace(2)* @kernel_config_l2, i32 0, i32 7), align 4, !tbaa !2
%4 = sub i32 %2, %3
%5 = udiv i32 %4, %0
%6 = shl i32 %5, 3
%7 = load i32, i32 addrspace(2)* getelementptr inbounds ([32 x i32], [32 x i32] addrspace(2)* @kernel_config_l2, i32 0, i32 11), align 4, !tbaa !2
%8 = load i32, i32 addrspace(2)* getelementptr inbounds ([32 x i32], [32 x i32] addrspace(2)* @kernel_config_l2, i32 0, i32 8), align 4, !tbaa !2
%9 = sub i32 %7, %8
%10 = udiv i32 %9, %1
%11 = shl i32 %10, 3
%12 = mul i32 %11, %P
%13 = add i32 %6, %12
%14 = getelementptr inbounds float, float addrspace(1)* %input, i32 %13
%15 = load float, float addrspace(1)* %14, align 4, !tbaa !6
store float %15, float addrspace(3)* getelementptr inbounds ([8 x [8 x float]], [8 x [8 x float]] addrspace(3)* @transpose.buffer, i32 0, i32 0, i32 0), align 4, !tbaa !6
%puts = tail call i32 @puts(i8* getelementptr inbounds ([12 x i8], [12 x i8]* @str, i32 0, i32 0))
br label %pregion_for_entry.pregion_for_init5
.r_entry.peeled_wi: ; preds = %pregion_for_entry.pregion_for_init5, %.r_entry.peeled_wi
%16 = phi i32 [ %25, %pregion_for_entry.pregion_for_init5 ], [ %22, %.r_entry.peeled_wi ]
%17 = add i32 %6, %16
%18 = add i32 %17, %27
%19 = getelementptr inbounds float, float addrspace(1)* %input, i32 %18
%20 = load float, float addrspace(1)* %19, align 4, !tbaa !6, !llvm.mem.parallel_loop_access !8
%21 = getelementptr inbounds [8 x [8 x float]], [8 x [8 x float]] addrspace(3)* @transpose.buffer, i32 0, i32 %24, i32 %16
store float %20, float addrspace(3)* %21, align 4, !tbaa !6, !llvm.mem.parallel_loop_access !8
%puts.peeled_wi = tail call i32 @puts(i8* getelementptr inbounds ([12 x i8], [12 x i8]* @str, i32 0, i32 0)), !llvm.mem.parallel_loop_access !8
%22 = add i32 %16, 1
%23 = icmp slt i32 %22, %0
br i1 %23, label %.r_entry.peeled_wi, label %pregion_for_cond10, !llvm.loop !9
pregion_for_entry.pregion_for_init5: ; preds = %pregion_for_cond10, %pregion_for_init8
%24 = phi i32 [ 0, %pregion_for_init8 ], [ %28, %pregion_for_cond10 ]
%25 = phi i32 [ 1, %pregion_for_init8 ], [ 0, %pregion_for_cond10 ]
%26 = add i32 %11, %24
%27 = mul i32 %26, %P
br label %.r_entry.peeled_wi
pregion_for_cond10: ; preds = %.r_entry.peeled_wi
%28 = add i32 %24, 1
%29 = icmp slt i32 %28, %1
br i1 %29, label %pregion_for_entry.pregion_for_init5, label %pregion_for_init, !llvm.loop !10
pregion_for_init: ; preds = %pregion_for_cond10
tail call void @barrier(i32 1) #3
br label %pregion_for_entry.pregion_for_init
pregion_for_entry.pregion_for_init: ; preds = %pregion_for_cond4, %pregion_for_init
%30 = phi i32 [ 0, %pregion_for_init ], [ %41, %pregion_for_cond4 ]
%31 = add i32 %6, %30
%32 = mul i32 %31, %Q
br label %pregion_for_cond
pregion_for_cond: ; preds = %pregion_for_cond, %pregion_for_entry.pregion_for_init
%33 = phi i32 [ 0, %pregion_for_entry.pregion_for_init ], [ %39, %pregion_for_cond ]
%puts1 = tail call i32 @puts(i8* getelementptr inbounds ([13 x i8], [13 x i8]* @str2, i32 0, i32 0)), !llvm.mem.parallel_loop_access !11
%34 = add i32 %11, %33
%35 = getelementptr inbounds [8 x [8 x float]], [8 x [8 x float]] addrspace(3)* @transpose.buffer, i32 0, i32 %33, i32 %30
%36 = load float, float addrspace(3)* %35, align 4, !tbaa !6, !llvm.mem.parallel_loop_access !11
%37 = add i32 %34, %32
%38 = getelementptr inbounds float, float addrspace(1)* %output, i32 %37
store float %36, float addrspace(1)* %38, align 4, !tbaa !6, !llvm.mem.parallel_loop_access !11
%39 = add i32 %33, 1
%40 = icmp slt i32 %39, %0
br i1 %40, label %pregion_for_cond, label %pregion_for_cond4, !llvm.loop !12
pregion_for_cond4: ; preds = %pregion_for_cond
%41 = add i32 %30, 1
%42 = icmp slt i32 %41, %1
br i1 %42, label %pregion_for_entry.pregion_for_init, label %exit.barrier2, !llvm.loop !13
exit.barrier2: ; preds = %pregion_for_cond4
call void @barrier(i32 0) #1
ret void
}
attributes #0 = { noduplicate "frame-pointer"="none" "less-precise-fpmad"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "no-realign-stack" "stack-protector-buffer-size"="0" "unsafe-fp-math"="false" "use-soft-float"="false" }
attributes #1 = { nounwind }
attributes #2 = { noinline nounwind "_kernel_local_size"="256" "frame-pointer"="none" "less-precise-fpmad"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "no-realign-stack" "stack-protector-buffer-size"="0" "unsafe-fp-math"="false" "use-soft-float"="false" }
attributes #3 = { noduplicate nounwind }
!llvm.ident = !{!0}
!opencl.kernels = !{!1}
!0 = !{!"clang version 3.6.0 (git://git.ti.com/opencl/clang.git 5b006f07bdc22b5ae6917eecdfe243908dd7b029) (git://git.ti.com/opencl/llvm.git 09780c6750b30da81e4a0a805aedf1699fbc37c7)"}
!1 = !{void (i32, i32, float addrspace(1)*, float addrspace(1)*)* @transpose}
!2 = !{!3, !3, i64 0}
!3 = !{!"int", !4, i64 0}
!4 = !{!"omnipotent char", !5, i64 0}
!5 = !{!"Simple C/C++ TBAA"}
!6 = !{!7, !7, i64 0}
!7 = !{!"float", !4, i64 0}
!8 = !{!9, !10}
!9 = distinct !{!9}
!10 = distinct !{!10}
!11 = !{!12, !13}
!12 = distinct !{!12}
!13 = distinct !{!13}
; ModuleID = 'transpose_barrier_condition.bc'
source_filename = "transpose_barrier_condition.bc"
target datalayout = "e-m:e-p:32:32-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-v256:256-v512:512-v1024:1024"
target triple = "x86_64-unknown-unknown-unknown"
@transpose.buffer = internal unnamed_addr addrspace(3) global [8 x [8 x float]] undef, align 4
@kernel_config_l2 = external addrspace(2) constant [32 x i32]
; Function Attrs: noduplicate
declare protected void @barrier(i32) #0
; Function Attrs: noinline nounwind
define void @transpose(i32 %P, i32 %Q, float addrspace(1)* nocapture readonly %input, float addrspace(1)* nocapture %output) #1 {
pregion_for_init15:
%0 = load i32, i32 addrspace(2)* getelementptr inbounds ([32 x i32], [32 x i32] addrspace(2)* @kernel_config_l2, i32 0, i32 4), align 4
%1 = load i32, i32 addrspace(2)* getelementptr inbounds ([32 x i32], [32 x i32] addrspace(2)* @kernel_config_l2, i32 0, i32 5), align 4
call void @barrier(i32 0) #2
%2 = load i32, i32 addrspace(2)* getelementptr inbounds ([32 x i32], [32 x i32] addrspace(2)* @kernel_config_l2, i32 0, i32 10), align 4, !tbaa !2
%3 = load i32, i32 addrspace(2)* getelementptr inbounds ([32 x i32], [32 x i32] addrspace(2)* @kernel_config_l2, i32 0, i32 7), align 4, !tbaa !2
%4 = sub i32 %2, %3
%5 = udiv i32 %4, %0
%6 = shl i32 %5, 3
%7 = load i32, i32 addrspace(2)* getelementptr inbounds ([32 x i32], [32 x i32] addrspace(2)* @kernel_config_l2, i32 0, i32 11), align 4, !tbaa !2
%8 = load i32, i32 addrspace(2)* getelementptr inbounds ([32 x i32], [32 x i32] addrspace(2)* @kernel_config_l2, i32 0, i32 8), align 4, !tbaa !2
%9 = sub i32 %7, %8
%10 = udiv i32 %9, %1
%11 = shl i32 %10, 3
%12 = mul i32 %11, %P
%13 = add i32 %6, %12
%14 = getelementptr inbounds float, float addrspace(1)* %input, i32 %13
%15 = load float, float addrspace(1)* %14, align 4, !tbaa !6
store float %15, float addrspace(3)* getelementptr inbounds ([8 x [8 x float]], [8 x [8 x float]] addrspace(3)* @transpose.buffer, i32 0, i32 0, i32 0), align 4, !tbaa !6
%16 = mul i32 %6, %Q
%17 = add i32 %11, %16
%18 = getelementptr inbounds float, float addrspace(1)* %output, i32 %17
store float %15, float addrspace(1)* %18, align 4, !tbaa !6
br label %pregion_for_entry.pregion_for_init12
pregion_for_entry.pregion_for_init12: ; preds = %pregion_for_cond17, %pregion_for_init15
%19 = phi i32 [ 0, %pregion_for_init15 ], [ %32, %pregion_for_cond17 ]
%20 = phi i32 [ 1, %pregion_for_init15 ], [ 0, %pregion_for_cond17 ]
%21 = add i32 %6, %19
%22 = mul i32 %21, %Q
br label %pregion_for_cond14
pregion_for_cond14: ; preds = %pregion_for_cond14, %pregion_for_entry.pregion_for_init12
%23 = phi i32 [ %20, %pregion_for_entry.pregion_for_init12 ], [ %30, %pregion_for_cond14 ]
%24 = load float, float addrspace(1)* %14, align 4, !tbaa !6, !llvm.mem.parallel_loop_access !8
store float %24, float addrspace(3)* getelementptr inbounds ([8 x [8 x float]], [8 x [8 x float]] addrspace(3)* @transpose.buffer, i32 0, i32 0, i32 0), align 4, !tbaa !6, !llvm.mem.parallel_loop_access !8
%25 = add i32 %11, %23
%26 = getelementptr inbounds [8 x [8 x float]], [8 x [8 x float]] addrspace(3)* @transpose.buffer, i32 0, i32 %23, i32 %19
%27 = load float, float addrspace(3)* %26, align 4, !tbaa !6, !llvm.mem.parallel_loop_access !8
%28 = add i32 %25, %22
%29 = getelementptr inbounds float, float addrspace(1)* %output, i32 %28
store float %27, float addrspace(1)* %29, align 4, !tbaa !6, !llvm.mem.parallel_loop_access !8
%30 = add i32 %23, 1
%31 = icmp slt i32 %30, %0
br i1 %31, label %pregion_for_cond14, label %pregion_for_cond17, !llvm.loop !9
pregion_for_cond17: ; preds = %pregion_for_cond14
%32 = add i32 %19, 1
%33 = icmp slt i32 %32, %1
br i1 %33, label %pregion_for_entry.pregion_for_init12, label %exit.barrier, !llvm.loop !10
exit.barrier: ; preds = %pregion_for_cond17
call void @barrier(i32 0) #2
ret void
}
attributes #0 = { noduplicate "frame-pointer"="none" "less-precise-fpmad"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "no-realign-stack" "stack-protector-buffer-size"="0" "unsafe-fp-math"="false" "use-soft-float"="false" }
attributes #1 = { noinline nounwind "_kernel_local_size"="256" "frame-pointer"="none" "less-precise-fpmad"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "no-realign-stack" "stack-protector-buffer-size"="0" "unsafe-fp-math"="false" "use-soft-float"="false" }
attributes #2 = { nounwind }
!llvm.ident = !{!0}
!opencl.kernels = !{!1}
!0 = !{!"clang version 3.6.0 (git://git.ti.com/opencl/clang.git 5b006f07bdc22b5ae6917eecdfe243908dd7b029) (git://git.ti.com/opencl/llvm.git 09780c6750b30da81e4a0a805aedf1699fbc37c7)"}
!1 = !{void (i32, i32, float addrspace(1)*, float addrspace(1)*)* @transpose}
!2 = !{!3, !3, i64 0}
!3 = !{!"int", !4, i64 0}
!4 = !{!"omnipotent char", !5, i64 0}
!5 = !{!"Simple C/C++ TBAA"}
!6 = !{!7, !7, i64 0}
!7 = !{!"float", !4, i64 0}
!8 = !{!9, !10}
!9 = distinct !{!9}
!10 = distinct !{!10}
; ModuleID = 'transpose_barrier_condition.bc'
source_filename = "transpose_barrier_condition.bc"
target datalayout = "e-m:e-p:32:32-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-v256:256-v512:512-v1024:1024"
target triple = "x86_64-unknown-unknown-unknown"
@transpose.buffer = internal unnamed_addr addrspace(3) global [8 x [8 x float]] undef, align 4
@kernel_config_l2 = external addrspace(2) constant [32 x i32]
; Function Attrs: noduplicate
declare protected void @barrier(i32) #0
; Function Attrs: noinline nounwind
define void @transpose(i32 %P, i32 %Q, float addrspace(1)* nocapture readonly %input, float addrspace(1)* nocapture %output) #1 {
pregion_for_init15:
%0 = load i32, i32 addrspace(2)* getelementptr inbounds ([32 x i32], [32 x i32] addrspace(2)* @kernel_config_l2, i32 0, i32 4), align 4
%1 = load i32, i32 addrspace(2)* getelementptr inbounds ([32 x i32], [32 x i32] addrspace(2)* @kernel_config_l2, i32 0, i32 5), align 4
call void @barrier(i32 0) #2
%2 = load i32, i32 addrspace(2)* getelementptr inbounds ([32 x i32], [32 x i32] addrspace(2)* @kernel_config_l2, i32 0, i32 10), align 4, !tbaa !2
%3 = load i32, i32 addrspace(2)* getelementptr inbounds ([32 x i32], [32 x i32] addrspace(2)* @kernel_config_l2, i32 0, i32 7), align 4, !tbaa !2
%4 = sub i32 %2, %3
%5 = udiv i32 %4, %0
%6 = shl i32 %5, 3
%7 = load i32, i32 addrspace(2)* getelementptr inbounds ([32 x i32], [32 x i32] addrspace(2)* @kernel_config_l2, i32 0, i32 11), align 4, !tbaa !2
%8 = load i32, i32 addrspace(2)* getelementptr inbounds ([32 x i32], [32 x i32] addrspace(2)* @kernel_config_l2, i32 0, i32 8), align 4, !tbaa !2
%9 = sub i32 %7, %8
%10 = udiv i32 %9, %1
%11 = shl i32 %10, 3
%12 = mul i32 %11, %P
%13 = add i32 %6, %12
%14 = getelementptr inbounds float, float addrspace(1)* %input, i32 %13
%15 = load float, float addrspace(1)* %14, align 4, !tbaa !6
store float %15, float addrspace(3)* getelementptr inbounds ([8 x [8 x float]], [8 x [8 x float]] addrspace(3)* @transpose.buffer, i32 0, i32 0, i32 0), align 4, !tbaa !6
%16 = mul i32 %6, %Q
%17 = add i32 %11, %16
%18 = getelementptr inbounds float, float addrspace(1)* %output, i32 %17
store float %15, float addrspace(1)* %18, align 4, !tbaa !6
br label %pregion_for_entry.pregion_for_init12
pregion_for_entry.pregion_for_init12: ; preds = %pregion_for_cond17, %pregion_for_init15
%19 = phi i32 [ 0, %pregion_for_init15 ], [ %32, %pregion_for_cond17 ]
%20 = phi i32 [ 1, %pregion_for_init15 ], [ 0, %pregion_for_cond17 ]
%21 = add i32 %6, %19
%22 = mul i32 %21, %Q
br label %pregion_for_cond14
pregion_for_cond14: ; preds = %pregion_for_cond14, %pregion_for_entry.pregion_for_init12
%23 = phi i32 [ %20, %pregion_for_entry.pregion_for_init12 ], [ %30, %pregion_for_cond14 ]
%24 = load float, float addrspace(1)* %14, align 4, !tbaa !6, !llvm.mem.parallel_loop_access !8
store float %24, float addrspace(3)* getelementptr inbounds ([8 x [8 x float]], [8 x [8 x float]] addrspace(3)* @transpose.buffer, i32 0, i32 0, i32 0), align 4, !tbaa !6, !llvm.mem.parallel_loop_access !8
%25 = add i32 %11, %23
%26 = getelementptr inbounds [8 x [8 x float]], [8 x [8 x float]] addrspace(3)* @transpose.buffer, i32 0, i32 %23, i32 %19
%27 = load float, float addrspace(3)* %26, align 4, !tbaa !6, !llvm.mem.parallel_loop_access !8
%28 = add i32 %25, %22
%29 = getelementptr inbounds float, float addrspace(1)* %output, i32 %28
store float %27, float addrspace(1)* %29, align 4, !tbaa !6, !llvm.mem.parallel_loop_access !8
%30 = add i32 %23, 1
%31 = icmp slt i32 %30, %0
br i1 %31, label %pregion_for_cond14, label %pregion_for_cond17, !llvm.loop !9
pregion_for_cond17: ; preds = %pregion_for_cond14
%32 = add i32 %19, 1
%33 = icmp slt i32 %32, %1
br i1 %33, label %pregion_for_entry.pregion_for_init12, label %exit.barrier, !llvm.loop !10
exit.barrier: ; preds = %pregion_for_cond17
call void @barrier(i32 0) #2
ret void
}
attributes #0 = { noduplicate "frame-pointer"="none" "less-precise-fpmad"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "no-realign-stack" "stack-protector-buffer-size"="0" "unsafe-fp-math"="false" "use-soft-float"="false" }
attributes #1 = { noinline nounwind "_kernel_local_size"="256" "frame-pointer"="none" "less-precise-fpmad"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "no-realign-stack" "stack-protector-buffer-size"="0" "unsafe-fp-math"="false" "use-soft-float"="false" }
attributes #2 = { nounwind }
!llvm.ident = !{!0}
!opencl.kernels = !{!1}
!0 = !{!"clang version 3.6.0 (git://git.ti.com/opencl/clang.git 5b006f07bdc22b5ae6917eecdfe243908dd7b029) (git://git.ti.com/opencl/llvm.git 09780c6750b30da81e4a0a805aedf1699fbc37c7)"}
!1 = !{void (i32, i32, float addrspace(1)*, float addrspace(1)*)* @transpose}
!2 = !{!3, !3, i64 0}
!3 = !{!"int", !4, i64 0}
!4 = !{!"omnipotent char", !5, i64 0}
!5 = !{!"Simple C/C++ TBAA"}
!6 = !{!7, !7, i64 0}
!7 = !{!"float", !4, i64 0}
!8 = !{!9, !10}
!9 = distinct !{!9}
!10 = distinct !{!10}
// P is matrix row
// Q is matrix column
__kernel void
transpose (const uint P, const uint Q, const __global float *input,
__global float *output)
{
const uint tx = get_local_id (0);
const uint ty = get_local_id (1);
const uint ID0 = get_group_id (0) * 8 + tx;
const uint ID1 = get_group_id (1) * 8 + ty;
__local float buffer[8][8];
buffer[ty][tx] = input[ID1 * P + ID0];
barrier (CLK_LOCAL_MEM_FENCE);
const uint newID0 = get_group_id (1) * 8 + tx;
const uint newID1 = get_group_id (0) * 8 + ty;
output[newID1 * Q + newID0] = buffer[tx][ty];
}
; ModuleID = 'transport_barrier.bc'
source_filename = "transport_barrier.bc"
target datalayout = "e-m:e-p:32:32-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-v256:256-v512:512-v1024:1024"
target triple = "x86_64-unknown-unknown-unknown"
@transpose.buffer = internal unnamed_addr addrspace(3) global [8 x [8 x float]] undef, align 4
@kernel_config_l2 = external addrspace(2) constant [32 x i32]
; Function Attrs: noduplicate
declare protected void @barrier(i32) #0
; Function Attrs: noinline nounwind
define void @transpose(i32 %P, i32 %Q, float addrspace(1)* nocapture readonly %input, float addrspace(1)* nocapture %output) #1 {
pregion_for_init6:
%0 = load i32, i32 addrspace(2)* getelementptr inbounds ([32 x i32], [32 x i32] addrspace(2)* @kernel_config_l2, i32 0, i32 4), align 4
%1 = load i32, i32 addrspace(2)* getelementptr inbounds ([32 x i32], [32 x i32] addrspace(2)* @kernel_config_l2, i32 0, i32 5), align 4
call void @barrier(i32 0) #2
%2 = load i32, i32 addrspace(2)* getelementptr inbounds ([32 x i32], [32 x i32] addrspace(2)* @kernel_config_l2, i32 0, i32 10), align 4, !tbaa !2
%3 = load i32, i32 addrspace(2)* getelementptr inbounds ([32 x i32], [32 x i32] addrspace(2)* @kernel_config_l2, i32 0, i32 7), align 4, !tbaa !2
%4 = sub i32 %2, %3
%5 = udiv i32 %4, %0
%6 = shl i32 %5, 3
%7 = load i32, i32 addrspace(2)* getelementptr inbounds ([32 x i32], [32 x i32] addrspace(2)* @kernel_config_l2, i32 0, i32 11), align 4, !tbaa !2
%8 = load i32, i32 addrspace(2)* getelementptr inbounds ([32 x i32], [32 x i32] addrspace(2)* @kernel_config_l2, i32 0, i32 8), align 4, !tbaa !2
%9 = sub i32 %7, %8
%10 = udiv i32 %9, %1
%11 = shl i32 %10, 3
br label %pregion_for_entry.pregion_for_init3
pregion_for_entry.pregion_for_init3: ; preds = %pregion_for_cond8, %pregion_for_init6
%12 = phi i32 [ 0, %pregion_for_init6 ], [ %23, %pregion_for_cond8 ]
%13 = add i32 %11, %12
%14 = mul i32 %13, %P
br label %pregion_for_cond5
pregion_for_cond5: ; preds = %pregion_for_cond5, %pregion_for_entry.pregion_for_init3
%15 = phi i32 [ 0, %pregion_for_entry.pregion_for_init3 ], [ %21, %pregion_for_cond5 ]
%16 = add i32 %6, %15
%17 = add i32 %16, %14
%18 = getelementptr inbounds float, float addrspace(1)* %input, i32 %17
%19 = load float, float addrspace(1)* %18, align 4, !tbaa !6, !llvm.mem.parallel_loop_access !8
%20 = getelementptr inbounds [8 x [8 x float]], [8 x [8 x float]] addrspace(3)* @transpose.buffer, i32 0, i32 %12, i32 %15
store float %19, float addrspace(3)* %20, align 4, !tbaa !6, !llvm.mem.parallel_loop_access !8
%21 = add i32 %15, 1
%22 = icmp slt i32 %21, %0
br i1 %22, label %pregion_for_cond5, label %pregion_for_cond8, !llvm.loop !9
pregion_for_cond8: ; preds = %pregion_for_cond5
%23 = add i32 %12, 1
%24 = icmp slt i32 %23, %1
br i1 %24, label %pregion_for_entry.pregion_for_init3, label %pregion_for_init, !llvm.loop !10
pregion_for_init: ; preds = %pregion_for_cond8
tail call void @barrier(i32 1) #3
br label %pregion_for_entry.pregion_for_init
pregion_for_entry.pregion_for_init: ; preds = %pregion_for_cond2, %pregion_for_init
%25 = phi i32 [ 0, %pregion_for_init ], [ %36, %pregion_for_cond2 ]
%26 = add i32 %6, %25
%27 = mul i32 %26, %Q
br label %pregion_for_cond
pregion_for_cond: ; preds = %pregion_for_cond, %pregion_for_entry.pregion_for_init
%28 = phi i32 [ 0, %pregion_for_entry.pregion_for_init ], [ %34, %pregion_for_cond ]
%29 = add i32 %11, %28
%30 = getelementptr inbounds [8 x [8 x float]], [8 x [8 x float]] addrspace(3)* @transpose.buffer, i32 0, i32 %28, i32 %25
%31 = load float, float addrspace(3)* %30, align 4, !tbaa !6, !llvm.mem.parallel_loop_access !11
%32 = add i32 %29, %27
%33 = getelementptr inbounds float, float addrspace(1)* %output, i32 %32
store float %31, float addrspace(1)* %33, align 4, !tbaa !6, !llvm.mem.parallel_loop_access !11
%34 = add i32 %28, 1
%35 = icmp slt i32 %34, %0
br i1 %35, label %pregion_for_cond, label %pregion_for_cond2, !llvm.loop !12
pregion_for_cond2: ; preds = %pregion_for_cond
%36 = add i32 %25, 1
%37 = icmp slt i32 %36, %1
br i1 %37, label %pregion_for_entry.pregion_for_init, label %exit.barrier, !llvm.loop !13
exit.barrier: ; preds = %pregion_for_cond2
call void @barrier(i32 0) #2
ret void
}
attributes #0 = { noduplicate "frame-pointer"="none" "less-precise-fpmad"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "no-realign-stack" "stack-protector-buffer-size"="0" "unsafe-fp-math"="false" "use-soft-float"="false" }
attributes #1 = { noinline nounwind "_kernel_local_size"="256" "frame-pointer"="none" "less-precise-fpmad"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "no-realign-stack" "stack-protector-buffer-size"="0" "unsafe-fp-math"="false" "use-soft-float"="false" }
attributes #2 = { nounwind }
attributes #3 = { noduplicate nounwind }
!llvm.ident = !{!0}
!opencl.kernels = !{!1}
!0 = !{!"clang version 3.6.0 (git://git.ti.com/opencl/clang.git 5b006f07bdc22b5ae6917eecdfe243908dd7b029) (git://git.ti.com/opencl/llvm.git 09780c6750b30da81e4a0a805aedf1699fbc37c7)"}
!1 = !{void (i32, i32, float addrspace(1)*, float addrspace(1)*)* @transpose}
!2 = !{!3, !3, i64 0}
!3 = !{!"int", !4, i64 0}
!4 = !{!"omnipotent char", !5, i64 0}
!5 = !{!"Simple C/C++ TBAA"}
!6 = !{!7, !7, i64 0}
!7 = !{!"float", !4, i64 0}
!8 = !{!9, !10}
!9 = distinct !{!9}
!10 = distinct !{!10}
!11 = !{!12, !13}
!12 = distinct !{!12}
!13 = distinct !{!13}
// Simple transpose kernel for a P * Q matrix
__kernel void
transpose (const uint P, const uint Q, const __global float *input,
__global float *output)
{
// Thread identifiers
const uint tx = get_local_id (0);
const uint ty = get_local_id (1);
const uint ID0 = get_group_id (0) * 8 + tx; // 0..P
const uint ID1 = get_group_id (1) * 8 + ty; // 0..Q
// Set-up the local memory for shuffling
__local float buffer[8][8];
// Swap the x and y coordinates to perform the rotation (coalesced)
// if (ID0 < P && ID1 < Q) {
buffer[ty][tx] = input[ID1 * P + ID0];
// }
// Synchronise all threads
if(ty < 30) {
barrier (CLK_LOCAL_MEM_FENCE);
}
// We don't have to swap the x and y thread indices here,
// because that's already done in the local memory
const uint newID0 = get_group_id (1) * 8 + tx;
const uint newID1 = get_group_id (0) * 8 + ty;
// Store the transposed result (coalesced)
// if (newID0 < Q && newID1 < P) {
output[newID1 * Q + newID0] = buffer[tx][ty];
// }
}
; ModuleID = 'transport_barrier_in_condition.bc'
source_filename = "transport_barrier_in_condition.bc"
target datalayout = "e-m:e-p:32:32-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-v256:256-v512:512-v1024:1024"
target triple = "x86_64-unknown-unknown-unknown"
@transpose.buffer = internal unnamed_addr addrspace(3) global [8 x [8 x float]] undef, align 4
@kernel_config_l2 = external addrspace(2) constant [32 x i32]
; Function Attrs: noduplicate
declare protected void @barrier(i32) #0
; Function Attrs: noinline nounwind
define void @transpose(i32 %P, i32 %Q, float addrspace(1)* nocapture readonly %input, float addrspace(1)* nocapture %output) #1 {
pregion_for_init7:
%0 = load i32, i32 addrspace(2)* getelementptr inbounds ([32 x i32], [32 x i32] addrspace(2)* @kernel_config_l2, i32 0, i32 4), align 4
%1 = load i32, i32 addrspace(2)* getelementptr inbounds ([32 x i32], [32 x i32] addrspace(2)* @kernel_config_l2, i32 0, i32 5), align 4
call void @barrier(i32 0) #2
%2 = load i32, i32 addrspace(2)* getelementptr inbounds ([32 x i32], [32 x i32] addrspace(2)* @kernel_config_l2, i32 0, i32 10), align 4, !tbaa !2
%3 = load i32, i32 addrspace(2)* getelementptr inbounds ([32 x i32], [32 x i32] addrspace(2)* @kernel_config_l2, i32 0, i32 7), align 4, !tbaa !2
%4 = sub i32 %2, %3
%5 = udiv i32 %4, %0
%6 = shl i32 %5, 3
%7 = load i32, i32 addrspace(2)* getelementptr inbounds ([32 x i32], [32 x i32] addrspace(2)* @kernel_config_l2, i32 0, i32 11), align 4, !tbaa !2
%8 = load i32, i32 addrspace(2)* getelementptr inbounds ([32 x i32], [32 x i32] addrspace(2)* @kernel_config_l2, i32 0, i32 8), align 4, !tbaa !2
%9 = sub i32 %7, %8
%10 = udiv i32 %9, %1
%11 = shl i32 %10, 3
%12 = mul i32 %11, %P
%13 = add i32 %6, %12
%14 = getelementptr inbounds float, float addrspace(1)* %input, i32 %13
%15 = load float, float addrspace(1)* %14, align 4, !tbaa !6
store float %15, float addrspace(3)* getelementptr inbounds ([8 x [8 x float]], [8 x [8 x float]] addrspace(3)* @transpose.buffer, i32 0, i32 0, i32 0), align 4, !tbaa !6
br label %pregion_for_entry.pregion_for_init4
.r_entry.peeled_wi: ; preds = %pregion_for_entry.pregion_for_init4, %.r_entry.peeled_wi
%16 = phi i32 [ %25, %pregion_for_entry.pregion_for_init4 ], [ %22, %.r_entry.peeled_wi ]
%17 = add i32 %6, %16
%18 = add i32 %17, %27
%19 = getelementptr inbounds float, float addrspace(1)* %input, i32 %18
%20 = load float, float addrspace(1)* %19, align 4, !tbaa !6, !llvm.mem.parallel_loop_access !8
%21 = getelementptr inbounds [8 x [8 x float]], [8 x [8 x float]] addrspace(3)* @transpose.buffer, i32 0, i32 %24, i32 %16
store float %20, float addrspace(3)* %21, align 4, !tbaa !6, !llvm.mem.parallel_loop_access !8
%22 = add i32 %16, 1
%23 = icmp slt i32 %22, %0
br i1 %23, label %.r_entry.peeled_wi, label %pregion_for_cond9, !llvm.loop !9
pregion_for_entry.pregion_for_init4: ; preds = %pregion_for_cond9, %pregion_for_init7
%24 = phi i32 [ 0, %pregion_for_init7 ], [ %28, %pregion_for_cond9 ]
%25 = phi i32 [ 1, %pregion_for_init7 ], [ 0, %pregion_for_cond9 ]
%26 = add i32 %11, %24
%27 = mul i32 %26, %P
br label %.r_entry.peeled_wi
pregion_for_cond9: ; preds = %.r_entry.peeled_wi
%28 = add i32 %24, 1
%29 = icmp slt i32 %28, %1
br i1 %29, label %pregion_for_entry.pregion_for_init4, label %pregion_for_init, !llvm.loop !10
pregion_for_init: ; preds = %pregion_for_cond9
tail call void @barrier(i32 1) #3
br label %pregion_for_entry.pregion_for_init
pregion_for_entry.pregion_for_init: ; preds = %pregion_for_cond3, %pregion_for_init
%30 = phi i32 [ 0, %pregion_for_init ], [ %41, %pregion_for_cond3 ]
%31 = add i32 %6, %30
%32 = mul i32 %31, %Q
br label %pregion_for_cond
pregion_for_cond: ; preds = %pregion_for_cond, %pregion_for_entry.pregion_for_init
%33 = phi i32 [ 0, %pregion_for_entry.pregion_for_init ], [ %39, %pregion_for_cond ]
%34 = add i32 %11, %33
%35 = getelementptr inbounds [8 x [8 x float]], [8 x [8 x float]] addrspace(3)* @transpose.buffer, i32 0, i32 %33, i32 %30
%36 = load float, float addrspace(3)* %35, align 4, !tbaa !6, !llvm.mem.parallel_loop_access !11
%37 = add i32 %34, %32
%38 = getelementptr inbounds float, float addrspace(1)* %output, i32 %37
store float %36, float addrspace(1)* %38, align 4, !tbaa !6, !llvm.mem.parallel_loop_access !11
%39 = add i32 %33, 1
%40 = icmp slt i32 %39, %0
br i1 %40, label %pregion_for_cond, label %pregion_for_cond3, !llvm.loop !12
pregion_for_cond3: ; preds = %pregion_for_cond
%41 = add i32 %30, 1
%42 = icmp slt i32 %41, %1
br i1 %42, label %pregion_for_entry.pregion_for_init, label %exit.barrier1, !llvm.loop !13
exit.barrier1: ; preds = %pregion_for_cond3
call void @barrier(i32 0) #2
ret void
}
attributes #0 = { noduplicate "frame-pointer"="none" "less-precise-fpmad"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "no-realign-stack" "stack-protector-buffer-size"="0" "unsafe-fp-math"="false" "use-soft-float"="false" }
attributes #1 = { noinline nounwind "_kernel_local_size"="256" "frame-pointer"="none" "less-precise-fpmad"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "no-realign-stack" "stack-protector-buffer-size"="0" "unsafe-fp-math"="false" "use-soft-float"="false" }
attributes #2 = { nounwind }
attributes #3 = { noduplicate nounwind }
!llvm.ident = !{!0}
!opencl.kernels = !{!1}
!0 = !{!"clang version 3.6.0 (git://git.ti.com/opencl/clang.git 5b006f07bdc22b5ae6917eecdfe243908dd7b029) (git://git.ti.com/opencl/llvm.git 09780c6750b30da81e4a0a805aedf1699fbc37c7)"}
!1 = !{void (i32, i32, float addrspace(1)*, float addrspace(1)*)* @transpose}
!2 = !{!3, !3, i64 0}
!3 = !{!"int", !4, i64 0}
!4 = !{!"omnipotent char", !5, i64 0}
!5 = !{!"Simple C/C++ TBAA"}
!6 = !{!7, !7, i64 0}
!7 = !{!"float", !4, i64 0}
!8 = !{!9, !10}
!9 = distinct !{!9}
!10 = distinct !{!10}
!11 = !{!12, !13}
!12 = distinct !{!12}
!13 = distinct !{!13}
__kernel void
transpose (const uint P, const uint Q, const __global float *input,
__global float *output)
{
const uint tx = get_local_id (0);
const uint ty = get_local_id (1);
const uint ID0 = get_group_id (0) * 8 + tx;
const uint ID1 = get_group_id (1) * 8 + ty;
__local float buffer[8][8];
buffer[ty][tx] = input[ID1 * P + ID0];
if(ty < 30) {
printf("Some Print.\n");
barrier (CLK_LOCAL_MEM_FENCE);
printf("Print again.\n");
} else {
printf("else here.\n");
}
const uint newID0 = get_group_id (1) * 8 + tx;
const uint newID1 = get_group_id (0) * 8 + ty;
output[newID1 * Q + newID0] = buffer[tx][ty];
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment