Last active
May 29, 2020 08:16
-
-
Save BeMg/2848694999c420aa1c3221a2ad93517b to your computer and use it in GitHub Desktop.
transpose example
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
; ModuleID = 'transpose_barrier_condition.bc' | |
source_filename = "transpose_barrier_condition.bc" | |
target datalayout = "e-m:e-p:32:32-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-v256:256-v512:512-v1024:1024" | |
target triple = "x86_64-unknown-unknown-unknown" | |
@transpose.buffer = internal unnamed_addr addrspace(3) global [8 x [8 x float]] undef, align 4 | |
@kernel_config_l2 = external addrspace(2) constant [32 x i32] | |
; Function Attrs: noduplicate | |
declare protected void @barrier(i32) #0 | |
; Function Attrs: noinline nounwind | |
define void @transpose(i32 %P, i32 %Q, float addrspace(1)* nocapture readonly %input, float addrspace(1)* nocapture %output) #1 { | |
pregion_for_init15: | |
%0 = load i32, i32 addrspace(2)* getelementptr inbounds ([32 x i32], [32 x i32] addrspace(2)* @kernel_config_l2, i32 0, i32 4), align 4 | |
%1 = load i32, i32 addrspace(2)* getelementptr inbounds ([32 x i32], [32 x i32] addrspace(2)* @kernel_config_l2, i32 0, i32 5), align 4 | |
call void @barrier(i32 0) #2 | |
%2 = load i32, i32 addrspace(2)* getelementptr inbounds ([32 x i32], [32 x i32] addrspace(2)* @kernel_config_l2, i32 0, i32 10), align 4, !tbaa !2 | |
%3 = load i32, i32 addrspace(2)* getelementptr inbounds ([32 x i32], [32 x i32] addrspace(2)* @kernel_config_l2, i32 0, i32 7), align 4, !tbaa !2 | |
%4 = sub i32 %2, %3 | |
%5 = udiv i32 %4, %0 | |
%6 = shl i32 %5, 3 | |
%7 = load i32, i32 addrspace(2)* getelementptr inbounds ([32 x i32], [32 x i32] addrspace(2)* @kernel_config_l2, i32 0, i32 11), align 4, !tbaa !2 | |
%8 = load i32, i32 addrspace(2)* getelementptr inbounds ([32 x i32], [32 x i32] addrspace(2)* @kernel_config_l2, i32 0, i32 8), align 4, !tbaa !2 | |
%9 = sub i32 %7, %8 | |
%10 = udiv i32 %9, %1 | |
%11 = shl i32 %10, 3 | |
%12 = mul i32 %11, %P | |
%13 = add i32 %6, %12 | |
%14 = getelementptr inbounds float, float addrspace(1)* %input, i32 %13 | |
%15 = load float, float addrspace(1)* %14, align 4, !tbaa !6 | |
store float %15, float addrspace(3)* getelementptr inbounds ([8 x [8 x float]], [8 x [8 x float]] addrspace(3)* @transpose.buffer, i32 0, i32 0, i32 0), align 4, !tbaa !6 | |
%16 = mul i32 %6, %Q | |
%17 = add i32 %11, %16 | |
%18 = getelementptr inbounds float, float addrspace(1)* %output, i32 %17 | |
store float %15, float addrspace(1)* %18, align 4, !tbaa !6 | |
br label %pregion_for_entry.pregion_for_init12 | |
pregion_for_entry.pregion_for_init12: ; preds = %pregion_for_cond17, %pregion_for_init15 | |
%19 = phi i32 [ 0, %pregion_for_init15 ], [ %32, %pregion_for_cond17 ] | |
%20 = phi i32 [ 1, %pregion_for_init15 ], [ 0, %pregion_for_cond17 ] | |
%21 = add i32 %6, %19 | |
%22 = mul i32 %21, %Q | |
br label %pregion_for_cond14 | |
pregion_for_cond14: ; preds = %pregion_for_cond14, %pregion_for_entry.pregion_for_init12 | |
%23 = phi i32 [ %20, %pregion_for_entry.pregion_for_init12 ], [ %30, %pregion_for_cond14 ] | |
%24 = load float, float addrspace(1)* %14, align 4, !tbaa !6, !llvm.mem.parallel_loop_access !8 | |
store float %24, float addrspace(3)* getelementptr inbounds ([8 x [8 x float]], [8 x [8 x float]] addrspace(3)* @transpose.buffer, i32 0, i32 0, i32 0), align 4, !tbaa !6, !llvm.mem.parallel_loop_access !8 | |
%25 = add i32 %11, %23 | |
%26 = getelementptr inbounds [8 x [8 x float]], [8 x [8 x float]] addrspace(3)* @transpose.buffer, i32 0, i32 %23, i32 %19 | |
%27 = load float, float addrspace(3)* %26, align 4, !tbaa !6, !llvm.mem.parallel_loop_access !8 | |
%28 = add i32 %25, %22 | |
%29 = getelementptr inbounds float, float addrspace(1)* %output, i32 %28 | |
store float %27, float addrspace(1)* %29, align 4, !tbaa !6, !llvm.mem.parallel_loop_access !8 | |
%30 = add i32 %23, 1 | |
%31 = icmp slt i32 %30, %0 | |
br i1 %31, label %pregion_for_cond14, label %pregion_for_cond17, !llvm.loop !9 | |
pregion_for_cond17: ; preds = %pregion_for_cond14 | |
%32 = add i32 %19, 1 | |
%33 = icmp slt i32 %32, %1 | |
br i1 %33, label %pregion_for_entry.pregion_for_init12, label %exit.barrier, !llvm.loop !10 | |
exit.barrier: ; preds = %pregion_for_cond17 | |
call void @barrier(i32 0) #2 | |
ret void | |
} | |
attributes #0 = { noduplicate "frame-pointer"="none" "less-precise-fpmad"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "no-realign-stack" "stack-protector-buffer-size"="0" "unsafe-fp-math"="false" "use-soft-float"="false" } | |
attributes #1 = { noinline nounwind "_kernel_local_size"="256" "frame-pointer"="none" "less-precise-fpmad"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "no-realign-stack" "stack-protector-buffer-size"="0" "unsafe-fp-math"="false" "use-soft-float"="false" } | |
attributes #2 = { nounwind } | |
!llvm.ident = !{!0} | |
!opencl.kernels = !{!1} | |
!0 = !{!"clang version 3.6.0 (git://git.ti.com/opencl/clang.git 5b006f07bdc22b5ae6917eecdfe243908dd7b029) (git://git.ti.com/opencl/llvm.git 09780c6750b30da81e4a0a805aedf1699fbc37c7)"} | |
!1 = !{void (i32, i32, float addrspace(1)*, float addrspace(1)*)* @transpose} | |
!2 = !{!3, !3, i64 0} | |
!3 = !{!"int", !4, i64 0} | |
!4 = !{!"omnipotent char", !5, i64 0} | |
!5 = !{!"Simple C/C++ TBAA"} | |
!6 = !{!7, !7, i64 0} | |
!7 = !{!"float", !4, i64 0} | |
!8 = !{!9, !10} | |
!9 = distinct !{!9} | |
!10 = distinct !{!10} |
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
; ModuleID = 'transpose_barrier_condition.bc' | |
source_filename = "transpose_barrier_condition.bc" | |
target datalayout = "e-m:e-p:32:32-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-v256:256-v512:512-v1024:1024" | |
target triple = "x86_64-unknown-unknown-unknown" | |
@transpose.buffer = internal unnamed_addr addrspace(3) global [8 x [8 x float]] undef, align 4 | |
@kernel_config_l2 = external addrspace(2) constant [32 x i32] | |
@str = private unnamed_addr constant [12 x i8] c"Some Print.\00" | |
@str2 = private unnamed_addr constant [13 x i8] c"Print again.\00" | |
; Function Attrs: noduplicate | |
declare protected void @barrier(i32) #0 | |
; Function Attrs: nounwind | |
declare i32 @puts(i8* nocapture readonly) #1 | |
; Function Attrs: noinline nounwind | |
define void @transpose(i32 %P, i32 %Q, float addrspace(1)* nocapture readonly %input, float addrspace(1)* nocapture %output) #2 { | |
pregion_for_init8: | |
%0 = load i32, i32 addrspace(2)* getelementptr inbounds ([32 x i32], [32 x i32] addrspace(2)* @kernel_config_l2, i32 0, i32 4), align 4 | |
%1 = load i32, i32 addrspace(2)* getelementptr inbounds ([32 x i32], [32 x i32] addrspace(2)* @kernel_config_l2, i32 0, i32 5), align 4 | |
call void @barrier(i32 0) #1 | |
%2 = load i32, i32 addrspace(2)* getelementptr inbounds ([32 x i32], [32 x i32] addrspace(2)* @kernel_config_l2, i32 0, i32 10), align 4, !tbaa !2 | |
%3 = load i32, i32 addrspace(2)* getelementptr inbounds ([32 x i32], [32 x i32] addrspace(2)* @kernel_config_l2, i32 0, i32 7), align 4, !tbaa !2 | |
%4 = sub i32 %2, %3 | |
%5 = udiv i32 %4, %0 | |
%6 = shl i32 %5, 3 | |
%7 = load i32, i32 addrspace(2)* getelementptr inbounds ([32 x i32], [32 x i32] addrspace(2)* @kernel_config_l2, i32 0, i32 11), align 4, !tbaa !2 | |
%8 = load i32, i32 addrspace(2)* getelementptr inbounds ([32 x i32], [32 x i32] addrspace(2)* @kernel_config_l2, i32 0, i32 8), align 4, !tbaa !2 | |
%9 = sub i32 %7, %8 | |
%10 = udiv i32 %9, %1 | |
%11 = shl i32 %10, 3 | |
%12 = mul i32 %11, %P | |
%13 = add i32 %6, %12 | |
%14 = getelementptr inbounds float, float addrspace(1)* %input, i32 %13 | |
%15 = load float, float addrspace(1)* %14, align 4, !tbaa !6 | |
store float %15, float addrspace(3)* getelementptr inbounds ([8 x [8 x float]], [8 x [8 x float]] addrspace(3)* @transpose.buffer, i32 0, i32 0, i32 0), align 4, !tbaa !6 | |
%puts = tail call i32 @puts(i8* getelementptr inbounds ([12 x i8], [12 x i8]* @str, i32 0, i32 0)) | |
br label %pregion_for_entry.pregion_for_init5 | |
.r_entry.peeled_wi: ; preds = %pregion_for_entry.pregion_for_init5, %.r_entry.peeled_wi | |
%16 = phi i32 [ %25, %pregion_for_entry.pregion_for_init5 ], [ %22, %.r_entry.peeled_wi ] | |
%17 = add i32 %6, %16 | |
%18 = add i32 %17, %27 | |
%19 = getelementptr inbounds float, float addrspace(1)* %input, i32 %18 | |
%20 = load float, float addrspace(1)* %19, align 4, !tbaa !6, !llvm.mem.parallel_loop_access !8 | |
%21 = getelementptr inbounds [8 x [8 x float]], [8 x [8 x float]] addrspace(3)* @transpose.buffer, i32 0, i32 %24, i32 %16 | |
store float %20, float addrspace(3)* %21, align 4, !tbaa !6, !llvm.mem.parallel_loop_access !8 | |
%puts.peeled_wi = tail call i32 @puts(i8* getelementptr inbounds ([12 x i8], [12 x i8]* @str, i32 0, i32 0)), !llvm.mem.parallel_loop_access !8 | |
%22 = add i32 %16, 1 | |
%23 = icmp slt i32 %22, %0 | |
br i1 %23, label %.r_entry.peeled_wi, label %pregion_for_cond10, !llvm.loop !9 | |
pregion_for_entry.pregion_for_init5: ; preds = %pregion_for_cond10, %pregion_for_init8 | |
%24 = phi i32 [ 0, %pregion_for_init8 ], [ %28, %pregion_for_cond10 ] | |
%25 = phi i32 [ 1, %pregion_for_init8 ], [ 0, %pregion_for_cond10 ] | |
%26 = add i32 %11, %24 | |
%27 = mul i32 %26, %P | |
br label %.r_entry.peeled_wi | |
pregion_for_cond10: ; preds = %.r_entry.peeled_wi | |
%28 = add i32 %24, 1 | |
%29 = icmp slt i32 %28, %1 | |
br i1 %29, label %pregion_for_entry.pregion_for_init5, label %pregion_for_init, !llvm.loop !10 | |
pregion_for_init: ; preds = %pregion_for_cond10 | |
tail call void @barrier(i32 1) #3 | |
br label %pregion_for_entry.pregion_for_init | |
pregion_for_entry.pregion_for_init: ; preds = %pregion_for_cond4, %pregion_for_init | |
%30 = phi i32 [ 0, %pregion_for_init ], [ %41, %pregion_for_cond4 ] | |
%31 = add i32 %6, %30 | |
%32 = mul i32 %31, %Q | |
br label %pregion_for_cond | |
pregion_for_cond: ; preds = %pregion_for_cond, %pregion_for_entry.pregion_for_init | |
%33 = phi i32 [ 0, %pregion_for_entry.pregion_for_init ], [ %39, %pregion_for_cond ] | |
%puts1 = tail call i32 @puts(i8* getelementptr inbounds ([13 x i8], [13 x i8]* @str2, i32 0, i32 0)), !llvm.mem.parallel_loop_access !11 | |
%34 = add i32 %11, %33 | |
%35 = getelementptr inbounds [8 x [8 x float]], [8 x [8 x float]] addrspace(3)* @transpose.buffer, i32 0, i32 %33, i32 %30 | |
%36 = load float, float addrspace(3)* %35, align 4, !tbaa !6, !llvm.mem.parallel_loop_access !11 | |
%37 = add i32 %34, %32 | |
%38 = getelementptr inbounds float, float addrspace(1)* %output, i32 %37 | |
store float %36, float addrspace(1)* %38, align 4, !tbaa !6, !llvm.mem.parallel_loop_access !11 | |
%39 = add i32 %33, 1 | |
%40 = icmp slt i32 %39, %0 | |
br i1 %40, label %pregion_for_cond, label %pregion_for_cond4, !llvm.loop !12 | |
pregion_for_cond4: ; preds = %pregion_for_cond | |
%41 = add i32 %30, 1 | |
%42 = icmp slt i32 %41, %1 | |
br i1 %42, label %pregion_for_entry.pregion_for_init, label %exit.barrier2, !llvm.loop !13 | |
exit.barrier2: ; preds = %pregion_for_cond4 | |
call void @barrier(i32 0) #1 | |
ret void | |
} | |
attributes #0 = { noduplicate "frame-pointer"="none" "less-precise-fpmad"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "no-realign-stack" "stack-protector-buffer-size"="0" "unsafe-fp-math"="false" "use-soft-float"="false" } | |
attributes #1 = { nounwind } | |
attributes #2 = { noinline nounwind "_kernel_local_size"="256" "frame-pointer"="none" "less-precise-fpmad"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "no-realign-stack" "stack-protector-buffer-size"="0" "unsafe-fp-math"="false" "use-soft-float"="false" } | |
attributes #3 = { noduplicate nounwind } | |
!llvm.ident = !{!0} | |
!opencl.kernels = !{!1} | |
!0 = !{!"clang version 3.6.0 (git://git.ti.com/opencl/clang.git 5b006f07bdc22b5ae6917eecdfe243908dd7b029) (git://git.ti.com/opencl/llvm.git 09780c6750b30da81e4a0a805aedf1699fbc37c7)"} | |
!1 = !{void (i32, i32, float addrspace(1)*, float addrspace(1)*)* @transpose} | |
!2 = !{!3, !3, i64 0} | |
!3 = !{!"int", !4, i64 0} | |
!4 = !{!"omnipotent char", !5, i64 0} | |
!5 = !{!"Simple C/C++ TBAA"} | |
!6 = !{!7, !7, i64 0} | |
!7 = !{!"float", !4, i64 0} | |
!8 = !{!9, !10} | |
!9 = distinct !{!9} | |
!10 = distinct !{!10} | |
!11 = !{!12, !13} | |
!12 = distinct !{!12} | |
!13 = distinct !{!13} |
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
; ModuleID = 'transpose_barrier_condition.bc' | |
source_filename = "transpose_barrier_condition.bc" | |
target datalayout = "e-m:e-p:32:32-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-v256:256-v512:512-v1024:1024" | |
target triple = "x86_64-unknown-unknown-unknown" | |
@transpose.buffer = internal unnamed_addr addrspace(3) global [8 x [8 x float]] undef, align 4 | |
@kernel_config_l2 = external addrspace(2) constant [32 x i32] | |
; Function Attrs: noduplicate | |
declare protected void @barrier(i32) #0 | |
; Function Attrs: noinline nounwind | |
define void @transpose(i32 %P, i32 %Q, float addrspace(1)* nocapture readonly %input, float addrspace(1)* nocapture %output) #1 { | |
pregion_for_init15: | |
%0 = load i32, i32 addrspace(2)* getelementptr inbounds ([32 x i32], [32 x i32] addrspace(2)* @kernel_config_l2, i32 0, i32 4), align 4 | |
%1 = load i32, i32 addrspace(2)* getelementptr inbounds ([32 x i32], [32 x i32] addrspace(2)* @kernel_config_l2, i32 0, i32 5), align 4 | |
call void @barrier(i32 0) #2 | |
%2 = load i32, i32 addrspace(2)* getelementptr inbounds ([32 x i32], [32 x i32] addrspace(2)* @kernel_config_l2, i32 0, i32 10), align 4, !tbaa !2 | |
%3 = load i32, i32 addrspace(2)* getelementptr inbounds ([32 x i32], [32 x i32] addrspace(2)* @kernel_config_l2, i32 0, i32 7), align 4, !tbaa !2 | |
%4 = sub i32 %2, %3 | |
%5 = udiv i32 %4, %0 | |
%6 = shl i32 %5, 3 | |
%7 = load i32, i32 addrspace(2)* getelementptr inbounds ([32 x i32], [32 x i32] addrspace(2)* @kernel_config_l2, i32 0, i32 11), align 4, !tbaa !2 | |
%8 = load i32, i32 addrspace(2)* getelementptr inbounds ([32 x i32], [32 x i32] addrspace(2)* @kernel_config_l2, i32 0, i32 8), align 4, !tbaa !2 | |
%9 = sub i32 %7, %8 | |
%10 = udiv i32 %9, %1 | |
%11 = shl i32 %10, 3 | |
%12 = mul i32 %11, %P | |
%13 = add i32 %6, %12 | |
%14 = getelementptr inbounds float, float addrspace(1)* %input, i32 %13 | |
%15 = load float, float addrspace(1)* %14, align 4, !tbaa !6 | |
store float %15, float addrspace(3)* getelementptr inbounds ([8 x [8 x float]], [8 x [8 x float]] addrspace(3)* @transpose.buffer, i32 0, i32 0, i32 0), align 4, !tbaa !6 | |
%16 = mul i32 %6, %Q | |
%17 = add i32 %11, %16 | |
%18 = getelementptr inbounds float, float addrspace(1)* %output, i32 %17 | |
store float %15, float addrspace(1)* %18, align 4, !tbaa !6 | |
br label %pregion_for_entry.pregion_for_init12 | |
pregion_for_entry.pregion_for_init12: ; preds = %pregion_for_cond17, %pregion_for_init15 | |
%19 = phi i32 [ 0, %pregion_for_init15 ], [ %32, %pregion_for_cond17 ] | |
%20 = phi i32 [ 1, %pregion_for_init15 ], [ 0, %pregion_for_cond17 ] | |
%21 = add i32 %6, %19 | |
%22 = mul i32 %21, %Q | |
br label %pregion_for_cond14 | |
pregion_for_cond14: ; preds = %pregion_for_cond14, %pregion_for_entry.pregion_for_init12 | |
%23 = phi i32 [ %20, %pregion_for_entry.pregion_for_init12 ], [ %30, %pregion_for_cond14 ] | |
%24 = load float, float addrspace(1)* %14, align 4, !tbaa !6, !llvm.mem.parallel_loop_access !8 | |
store float %24, float addrspace(3)* getelementptr inbounds ([8 x [8 x float]], [8 x [8 x float]] addrspace(3)* @transpose.buffer, i32 0, i32 0, i32 0), align 4, !tbaa !6, !llvm.mem.parallel_loop_access !8 | |
%25 = add i32 %11, %23 | |
%26 = getelementptr inbounds [8 x [8 x float]], [8 x [8 x float]] addrspace(3)* @transpose.buffer, i32 0, i32 %23, i32 %19 | |
%27 = load float, float addrspace(3)* %26, align 4, !tbaa !6, !llvm.mem.parallel_loop_access !8 | |
%28 = add i32 %25, %22 | |
%29 = getelementptr inbounds float, float addrspace(1)* %output, i32 %28 | |
store float %27, float addrspace(1)* %29, align 4, !tbaa !6, !llvm.mem.parallel_loop_access !8 | |
%30 = add i32 %23, 1 | |
%31 = icmp slt i32 %30, %0 | |
br i1 %31, label %pregion_for_cond14, label %pregion_for_cond17, !llvm.loop !9 | |
pregion_for_cond17: ; preds = %pregion_for_cond14 | |
%32 = add i32 %19, 1 | |
%33 = icmp slt i32 %32, %1 | |
br i1 %33, label %pregion_for_entry.pregion_for_init12, label %exit.barrier, !llvm.loop !10 | |
exit.barrier: ; preds = %pregion_for_cond17 | |
call void @barrier(i32 0) #2 | |
ret void | |
} | |
attributes #0 = { noduplicate "frame-pointer"="none" "less-precise-fpmad"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "no-realign-stack" "stack-protector-buffer-size"="0" "unsafe-fp-math"="false" "use-soft-float"="false" } | |
attributes #1 = { noinline nounwind "_kernel_local_size"="256" "frame-pointer"="none" "less-precise-fpmad"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "no-realign-stack" "stack-protector-buffer-size"="0" "unsafe-fp-math"="false" "use-soft-float"="false" } | |
attributes #2 = { nounwind } | |
!llvm.ident = !{!0} | |
!opencl.kernels = !{!1} | |
!0 = !{!"clang version 3.6.0 (git://git.ti.com/opencl/clang.git 5b006f07bdc22b5ae6917eecdfe243908dd7b029) (git://git.ti.com/opencl/llvm.git 09780c6750b30da81e4a0a805aedf1699fbc37c7)"} | |
!1 = !{void (i32, i32, float addrspace(1)*, float addrspace(1)*)* @transpose} | |
!2 = !{!3, !3, i64 0} | |
!3 = !{!"int", !4, i64 0} | |
!4 = !{!"omnipotent char", !5, i64 0} | |
!5 = !{!"Simple C/C++ TBAA"} | |
!6 = !{!7, !7, i64 0} | |
!7 = !{!"float", !4, i64 0} | |
!8 = !{!9, !10} | |
!9 = distinct !{!9} | |
!10 = distinct !{!10} |
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
; ModuleID = 'transpose_barrier_condition.bc' | |
source_filename = "transpose_barrier_condition.bc" | |
target datalayout = "e-m:e-p:32:32-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-v256:256-v512:512-v1024:1024" | |
target triple = "x86_64-unknown-unknown-unknown" | |
@transpose.buffer = internal unnamed_addr addrspace(3) global [8 x [8 x float]] undef, align 4 | |
@kernel_config_l2 = external addrspace(2) constant [32 x i32] | |
; Function Attrs: noduplicate | |
declare protected void @barrier(i32) #0 | |
; Function Attrs: noinline nounwind | |
define void @transpose(i32 %P, i32 %Q, float addrspace(1)* nocapture readonly %input, float addrspace(1)* nocapture %output) #1 { | |
pregion_for_init15: | |
%0 = load i32, i32 addrspace(2)* getelementptr inbounds ([32 x i32], [32 x i32] addrspace(2)* @kernel_config_l2, i32 0, i32 4), align 4 | |
%1 = load i32, i32 addrspace(2)* getelementptr inbounds ([32 x i32], [32 x i32] addrspace(2)* @kernel_config_l2, i32 0, i32 5), align 4 | |
call void @barrier(i32 0) #2 | |
%2 = load i32, i32 addrspace(2)* getelementptr inbounds ([32 x i32], [32 x i32] addrspace(2)* @kernel_config_l2, i32 0, i32 10), align 4, !tbaa !2 | |
%3 = load i32, i32 addrspace(2)* getelementptr inbounds ([32 x i32], [32 x i32] addrspace(2)* @kernel_config_l2, i32 0, i32 7), align 4, !tbaa !2 | |
%4 = sub i32 %2, %3 | |
%5 = udiv i32 %4, %0 | |
%6 = shl i32 %5, 3 | |
%7 = load i32, i32 addrspace(2)* getelementptr inbounds ([32 x i32], [32 x i32] addrspace(2)* @kernel_config_l2, i32 0, i32 11), align 4, !tbaa !2 | |
%8 = load i32, i32 addrspace(2)* getelementptr inbounds ([32 x i32], [32 x i32] addrspace(2)* @kernel_config_l2, i32 0, i32 8), align 4, !tbaa !2 | |
%9 = sub i32 %7, %8 | |
%10 = udiv i32 %9, %1 | |
%11 = shl i32 %10, 3 | |
%12 = mul i32 %11, %P | |
%13 = add i32 %6, %12 | |
%14 = getelementptr inbounds float, float addrspace(1)* %input, i32 %13 | |
%15 = load float, float addrspace(1)* %14, align 4, !tbaa !6 | |
store float %15, float addrspace(3)* getelementptr inbounds ([8 x [8 x float]], [8 x [8 x float]] addrspace(3)* @transpose.buffer, i32 0, i32 0, i32 0), align 4, !tbaa !6 | |
%16 = mul i32 %6, %Q | |
%17 = add i32 %11, %16 | |
%18 = getelementptr inbounds float, float addrspace(1)* %output, i32 %17 | |
store float %15, float addrspace(1)* %18, align 4, !tbaa !6 | |
br label %pregion_for_entry.pregion_for_init12 | |
pregion_for_entry.pregion_for_init12: ; preds = %pregion_for_cond17, %pregion_for_init15 | |
%19 = phi i32 [ 0, %pregion_for_init15 ], [ %32, %pregion_for_cond17 ] | |
%20 = phi i32 [ 1, %pregion_for_init15 ], [ 0, %pregion_for_cond17 ] | |
%21 = add i32 %6, %19 | |
%22 = mul i32 %21, %Q | |
br label %pregion_for_cond14 | |
pregion_for_cond14: ; preds = %pregion_for_cond14, %pregion_for_entry.pregion_for_init12 | |
%23 = phi i32 [ %20, %pregion_for_entry.pregion_for_init12 ], [ %30, %pregion_for_cond14 ] | |
%24 = load float, float addrspace(1)* %14, align 4, !tbaa !6, !llvm.mem.parallel_loop_access !8 | |
store float %24, float addrspace(3)* getelementptr inbounds ([8 x [8 x float]], [8 x [8 x float]] addrspace(3)* @transpose.buffer, i32 0, i32 0, i32 0), align 4, !tbaa !6, !llvm.mem.parallel_loop_access !8 | |
%25 = add i32 %11, %23 | |
%26 = getelementptr inbounds [8 x [8 x float]], [8 x [8 x float]] addrspace(3)* @transpose.buffer, i32 0, i32 %23, i32 %19 | |
%27 = load float, float addrspace(3)* %26, align 4, !tbaa !6, !llvm.mem.parallel_loop_access !8 | |
%28 = add i32 %25, %22 | |
%29 = getelementptr inbounds float, float addrspace(1)* %output, i32 %28 | |
store float %27, float addrspace(1)* %29, align 4, !tbaa !6, !llvm.mem.parallel_loop_access !8 | |
%30 = add i32 %23, 1 | |
%31 = icmp slt i32 %30, %0 | |
br i1 %31, label %pregion_for_cond14, label %pregion_for_cond17, !llvm.loop !9 | |
pregion_for_cond17: ; preds = %pregion_for_cond14 | |
%32 = add i32 %19, 1 | |
%33 = icmp slt i32 %32, %1 | |
br i1 %33, label %pregion_for_entry.pregion_for_init12, label %exit.barrier, !llvm.loop !10 | |
exit.barrier: ; preds = %pregion_for_cond17 | |
call void @barrier(i32 0) #2 | |
ret void | |
} | |
attributes #0 = { noduplicate "frame-pointer"="none" "less-precise-fpmad"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "no-realign-stack" "stack-protector-buffer-size"="0" "unsafe-fp-math"="false" "use-soft-float"="false" } | |
attributes #1 = { noinline nounwind "_kernel_local_size"="256" "frame-pointer"="none" "less-precise-fpmad"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "no-realign-stack" "stack-protector-buffer-size"="0" "unsafe-fp-math"="false" "use-soft-float"="false" } | |
attributes #2 = { nounwind } | |
!llvm.ident = !{!0} | |
!opencl.kernels = !{!1} | |
!0 = !{!"clang version 3.6.0 (git://git.ti.com/opencl/clang.git 5b006f07bdc22b5ae6917eecdfe243908dd7b029) (git://git.ti.com/opencl/llvm.git 09780c6750b30da81e4a0a805aedf1699fbc37c7)"} | |
!1 = !{void (i32, i32, float addrspace(1)*, float addrspace(1)*)* @transpose} | |
!2 = !{!3, !3, i64 0} | |
!3 = !{!"int", !4, i64 0} | |
!4 = !{!"omnipotent char", !5, i64 0} | |
!5 = !{!"Simple C/C++ TBAA"} | |
!6 = !{!7, !7, i64 0} | |
!7 = !{!"float", !4, i64 0} | |
!8 = !{!9, !10} | |
!9 = distinct !{!9} | |
!10 = distinct !{!10} |
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
// P is matrix row | |
// Q is matrix column | |
__kernel void | |
transpose (const uint P, const uint Q, const __global float *input, | |
__global float *output) | |
{ | |
const uint tx = get_local_id (0); | |
const uint ty = get_local_id (1); | |
const uint ID0 = get_group_id (0) * 8 + tx; | |
const uint ID1 = get_group_id (1) * 8 + ty; | |
__local float buffer[8][8]; | |
buffer[ty][tx] = input[ID1 * P + ID0]; | |
barrier (CLK_LOCAL_MEM_FENCE); | |
const uint newID0 = get_group_id (1) * 8 + tx; | |
const uint newID1 = get_group_id (0) * 8 + ty; | |
output[newID1 * Q + newID0] = buffer[tx][ty]; | |
} |
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
; ModuleID = 'transport_barrier.bc' | |
source_filename = "transport_barrier.bc" | |
target datalayout = "e-m:e-p:32:32-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-v256:256-v512:512-v1024:1024" | |
target triple = "x86_64-unknown-unknown-unknown" | |
@transpose.buffer = internal unnamed_addr addrspace(3) global [8 x [8 x float]] undef, align 4 | |
@kernel_config_l2 = external addrspace(2) constant [32 x i32] | |
; Function Attrs: noduplicate | |
declare protected void @barrier(i32) #0 | |
; Function Attrs: noinline nounwind | |
define void @transpose(i32 %P, i32 %Q, float addrspace(1)* nocapture readonly %input, float addrspace(1)* nocapture %output) #1 { | |
pregion_for_init6: | |
%0 = load i32, i32 addrspace(2)* getelementptr inbounds ([32 x i32], [32 x i32] addrspace(2)* @kernel_config_l2, i32 0, i32 4), align 4 | |
%1 = load i32, i32 addrspace(2)* getelementptr inbounds ([32 x i32], [32 x i32] addrspace(2)* @kernel_config_l2, i32 0, i32 5), align 4 | |
call void @barrier(i32 0) #2 | |
%2 = load i32, i32 addrspace(2)* getelementptr inbounds ([32 x i32], [32 x i32] addrspace(2)* @kernel_config_l2, i32 0, i32 10), align 4, !tbaa !2 | |
%3 = load i32, i32 addrspace(2)* getelementptr inbounds ([32 x i32], [32 x i32] addrspace(2)* @kernel_config_l2, i32 0, i32 7), align 4, !tbaa !2 | |
%4 = sub i32 %2, %3 | |
%5 = udiv i32 %4, %0 | |
%6 = shl i32 %5, 3 | |
%7 = load i32, i32 addrspace(2)* getelementptr inbounds ([32 x i32], [32 x i32] addrspace(2)* @kernel_config_l2, i32 0, i32 11), align 4, !tbaa !2 | |
%8 = load i32, i32 addrspace(2)* getelementptr inbounds ([32 x i32], [32 x i32] addrspace(2)* @kernel_config_l2, i32 0, i32 8), align 4, !tbaa !2 | |
%9 = sub i32 %7, %8 | |
%10 = udiv i32 %9, %1 | |
%11 = shl i32 %10, 3 | |
br label %pregion_for_entry.pregion_for_init3 | |
pregion_for_entry.pregion_for_init3: ; preds = %pregion_for_cond8, %pregion_for_init6 | |
%12 = phi i32 [ 0, %pregion_for_init6 ], [ %23, %pregion_for_cond8 ] | |
%13 = add i32 %11, %12 | |
%14 = mul i32 %13, %P | |
br label %pregion_for_cond5 | |
pregion_for_cond5: ; preds = %pregion_for_cond5, %pregion_for_entry.pregion_for_init3 | |
%15 = phi i32 [ 0, %pregion_for_entry.pregion_for_init3 ], [ %21, %pregion_for_cond5 ] | |
%16 = add i32 %6, %15 | |
%17 = add i32 %16, %14 | |
%18 = getelementptr inbounds float, float addrspace(1)* %input, i32 %17 | |
%19 = load float, float addrspace(1)* %18, align 4, !tbaa !6, !llvm.mem.parallel_loop_access !8 | |
%20 = getelementptr inbounds [8 x [8 x float]], [8 x [8 x float]] addrspace(3)* @transpose.buffer, i32 0, i32 %12, i32 %15 | |
store float %19, float addrspace(3)* %20, align 4, !tbaa !6, !llvm.mem.parallel_loop_access !8 | |
%21 = add i32 %15, 1 | |
%22 = icmp slt i32 %21, %0 | |
br i1 %22, label %pregion_for_cond5, label %pregion_for_cond8, !llvm.loop !9 | |
pregion_for_cond8: ; preds = %pregion_for_cond5 | |
%23 = add i32 %12, 1 | |
%24 = icmp slt i32 %23, %1 | |
br i1 %24, label %pregion_for_entry.pregion_for_init3, label %pregion_for_init, !llvm.loop !10 | |
pregion_for_init: ; preds = %pregion_for_cond8 | |
tail call void @barrier(i32 1) #3 | |
br label %pregion_for_entry.pregion_for_init | |
pregion_for_entry.pregion_for_init: ; preds = %pregion_for_cond2, %pregion_for_init | |
%25 = phi i32 [ 0, %pregion_for_init ], [ %36, %pregion_for_cond2 ] | |
%26 = add i32 %6, %25 | |
%27 = mul i32 %26, %Q | |
br label %pregion_for_cond | |
pregion_for_cond: ; preds = %pregion_for_cond, %pregion_for_entry.pregion_for_init | |
%28 = phi i32 [ 0, %pregion_for_entry.pregion_for_init ], [ %34, %pregion_for_cond ] | |
%29 = add i32 %11, %28 | |
%30 = getelementptr inbounds [8 x [8 x float]], [8 x [8 x float]] addrspace(3)* @transpose.buffer, i32 0, i32 %28, i32 %25 | |
%31 = load float, float addrspace(3)* %30, align 4, !tbaa !6, !llvm.mem.parallel_loop_access !11 | |
%32 = add i32 %29, %27 | |
%33 = getelementptr inbounds float, float addrspace(1)* %output, i32 %32 | |
store float %31, float addrspace(1)* %33, align 4, !tbaa !6, !llvm.mem.parallel_loop_access !11 | |
%34 = add i32 %28, 1 | |
%35 = icmp slt i32 %34, %0 | |
br i1 %35, label %pregion_for_cond, label %pregion_for_cond2, !llvm.loop !12 | |
pregion_for_cond2: ; preds = %pregion_for_cond | |
%36 = add i32 %25, 1 | |
%37 = icmp slt i32 %36, %1 | |
br i1 %37, label %pregion_for_entry.pregion_for_init, label %exit.barrier, !llvm.loop !13 | |
exit.barrier: ; preds = %pregion_for_cond2 | |
call void @barrier(i32 0) #2 | |
ret void | |
} | |
attributes #0 = { noduplicate "frame-pointer"="none" "less-precise-fpmad"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "no-realign-stack" "stack-protector-buffer-size"="0" "unsafe-fp-math"="false" "use-soft-float"="false" } | |
attributes #1 = { noinline nounwind "_kernel_local_size"="256" "frame-pointer"="none" "less-precise-fpmad"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "no-realign-stack" "stack-protector-buffer-size"="0" "unsafe-fp-math"="false" "use-soft-float"="false" } | |
attributes #2 = { nounwind } | |
attributes #3 = { noduplicate nounwind } | |
!llvm.ident = !{!0} | |
!opencl.kernels = !{!1} | |
!0 = !{!"clang version 3.6.0 (git://git.ti.com/opencl/clang.git 5b006f07bdc22b5ae6917eecdfe243908dd7b029) (git://git.ti.com/opencl/llvm.git 09780c6750b30da81e4a0a805aedf1699fbc37c7)"} | |
!1 = !{void (i32, i32, float addrspace(1)*, float addrspace(1)*)* @transpose} | |
!2 = !{!3, !3, i64 0} | |
!3 = !{!"int", !4, i64 0} | |
!4 = !{!"omnipotent char", !5, i64 0} | |
!5 = !{!"Simple C/C++ TBAA"} | |
!6 = !{!7, !7, i64 0} | |
!7 = !{!"float", !4, i64 0} | |
!8 = !{!9, !10} | |
!9 = distinct !{!9} | |
!10 = distinct !{!10} | |
!11 = !{!12, !13} | |
!12 = distinct !{!12} | |
!13 = distinct !{!13} |
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
// Simple transpose kernel for a P * Q matrix | |
__kernel void | |
transpose (const uint P, const uint Q, const __global float *input, | |
__global float *output) | |
{ | |
// Thread identifiers | |
const uint tx = get_local_id (0); | |
const uint ty = get_local_id (1); | |
const uint ID0 = get_group_id (0) * 8 + tx; // 0..P | |
const uint ID1 = get_group_id (1) * 8 + ty; // 0..Q | |
// Set-up the local memory for shuffling | |
__local float buffer[8][8]; | |
// Swap the x and y coordinates to perform the rotation (coalesced) | |
// if (ID0 < P && ID1 < Q) { | |
buffer[ty][tx] = input[ID1 * P + ID0]; | |
// } | |
// Synchronise all threads | |
if(ty < 30) { | |
barrier (CLK_LOCAL_MEM_FENCE); | |
} | |
// We don't have to swap the x and y thread indices here, | |
// because that's already done in the local memory | |
const uint newID0 = get_group_id (1) * 8 + tx; | |
const uint newID1 = get_group_id (0) * 8 + ty; | |
// Store the transposed result (coalesced) | |
// if (newID0 < Q && newID1 < P) { | |
output[newID1 * Q + newID0] = buffer[tx][ty]; | |
// } | |
} |
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
; ModuleID = 'transport_barrier_in_condition.bc' | |
source_filename = "transport_barrier_in_condition.bc" | |
target datalayout = "e-m:e-p:32:32-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-v256:256-v512:512-v1024:1024" | |
target triple = "x86_64-unknown-unknown-unknown" | |
@transpose.buffer = internal unnamed_addr addrspace(3) global [8 x [8 x float]] undef, align 4 | |
@kernel_config_l2 = external addrspace(2) constant [32 x i32] | |
; Function Attrs: noduplicate | |
declare protected void @barrier(i32) #0 | |
; Function Attrs: noinline nounwind | |
define void @transpose(i32 %P, i32 %Q, float addrspace(1)* nocapture readonly %input, float addrspace(1)* nocapture %output) #1 { | |
pregion_for_init7: | |
%0 = load i32, i32 addrspace(2)* getelementptr inbounds ([32 x i32], [32 x i32] addrspace(2)* @kernel_config_l2, i32 0, i32 4), align 4 | |
%1 = load i32, i32 addrspace(2)* getelementptr inbounds ([32 x i32], [32 x i32] addrspace(2)* @kernel_config_l2, i32 0, i32 5), align 4 | |
call void @barrier(i32 0) #2 | |
%2 = load i32, i32 addrspace(2)* getelementptr inbounds ([32 x i32], [32 x i32] addrspace(2)* @kernel_config_l2, i32 0, i32 10), align 4, !tbaa !2 | |
%3 = load i32, i32 addrspace(2)* getelementptr inbounds ([32 x i32], [32 x i32] addrspace(2)* @kernel_config_l2, i32 0, i32 7), align 4, !tbaa !2 | |
%4 = sub i32 %2, %3 | |
%5 = udiv i32 %4, %0 | |
%6 = shl i32 %5, 3 | |
%7 = load i32, i32 addrspace(2)* getelementptr inbounds ([32 x i32], [32 x i32] addrspace(2)* @kernel_config_l2, i32 0, i32 11), align 4, !tbaa !2 | |
%8 = load i32, i32 addrspace(2)* getelementptr inbounds ([32 x i32], [32 x i32] addrspace(2)* @kernel_config_l2, i32 0, i32 8), align 4, !tbaa !2 | |
%9 = sub i32 %7, %8 | |
%10 = udiv i32 %9, %1 | |
%11 = shl i32 %10, 3 | |
%12 = mul i32 %11, %P | |
%13 = add i32 %6, %12 | |
%14 = getelementptr inbounds float, float addrspace(1)* %input, i32 %13 | |
%15 = load float, float addrspace(1)* %14, align 4, !tbaa !6 | |
store float %15, float addrspace(3)* getelementptr inbounds ([8 x [8 x float]], [8 x [8 x float]] addrspace(3)* @transpose.buffer, i32 0, i32 0, i32 0), align 4, !tbaa !6 | |
br label %pregion_for_entry.pregion_for_init4 | |
.r_entry.peeled_wi: ; preds = %pregion_for_entry.pregion_for_init4, %.r_entry.peeled_wi | |
%16 = phi i32 [ %25, %pregion_for_entry.pregion_for_init4 ], [ %22, %.r_entry.peeled_wi ] | |
%17 = add i32 %6, %16 | |
%18 = add i32 %17, %27 | |
%19 = getelementptr inbounds float, float addrspace(1)* %input, i32 %18 | |
%20 = load float, float addrspace(1)* %19, align 4, !tbaa !6, !llvm.mem.parallel_loop_access !8 | |
%21 = getelementptr inbounds [8 x [8 x float]], [8 x [8 x float]] addrspace(3)* @transpose.buffer, i32 0, i32 %24, i32 %16 | |
store float %20, float addrspace(3)* %21, align 4, !tbaa !6, !llvm.mem.parallel_loop_access !8 | |
%22 = add i32 %16, 1 | |
%23 = icmp slt i32 %22, %0 | |
br i1 %23, label %.r_entry.peeled_wi, label %pregion_for_cond9, !llvm.loop !9 | |
pregion_for_entry.pregion_for_init4: ; preds = %pregion_for_cond9, %pregion_for_init7 | |
%24 = phi i32 [ 0, %pregion_for_init7 ], [ %28, %pregion_for_cond9 ] | |
%25 = phi i32 [ 1, %pregion_for_init7 ], [ 0, %pregion_for_cond9 ] | |
%26 = add i32 %11, %24 | |
%27 = mul i32 %26, %P | |
br label %.r_entry.peeled_wi | |
pregion_for_cond9: ; preds = %.r_entry.peeled_wi | |
%28 = add i32 %24, 1 | |
%29 = icmp slt i32 %28, %1 | |
br i1 %29, label %pregion_for_entry.pregion_for_init4, label %pregion_for_init, !llvm.loop !10 | |
pregion_for_init: ; preds = %pregion_for_cond9 | |
tail call void @barrier(i32 1) #3 | |
br label %pregion_for_entry.pregion_for_init | |
pregion_for_entry.pregion_for_init: ; preds = %pregion_for_cond3, %pregion_for_init | |
%30 = phi i32 [ 0, %pregion_for_init ], [ %41, %pregion_for_cond3 ] | |
%31 = add i32 %6, %30 | |
%32 = mul i32 %31, %Q | |
br label %pregion_for_cond | |
pregion_for_cond: ; preds = %pregion_for_cond, %pregion_for_entry.pregion_for_init | |
%33 = phi i32 [ 0, %pregion_for_entry.pregion_for_init ], [ %39, %pregion_for_cond ] | |
%34 = add i32 %11, %33 | |
%35 = getelementptr inbounds [8 x [8 x float]], [8 x [8 x float]] addrspace(3)* @transpose.buffer, i32 0, i32 %33, i32 %30 | |
%36 = load float, float addrspace(3)* %35, align 4, !tbaa !6, !llvm.mem.parallel_loop_access !11 | |
%37 = add i32 %34, %32 | |
%38 = getelementptr inbounds float, float addrspace(1)* %output, i32 %37 | |
store float %36, float addrspace(1)* %38, align 4, !tbaa !6, !llvm.mem.parallel_loop_access !11 | |
%39 = add i32 %33, 1 | |
%40 = icmp slt i32 %39, %0 | |
br i1 %40, label %pregion_for_cond, label %pregion_for_cond3, !llvm.loop !12 | |
pregion_for_cond3: ; preds = %pregion_for_cond | |
%41 = add i32 %30, 1 | |
%42 = icmp slt i32 %41, %1 | |
br i1 %42, label %pregion_for_entry.pregion_for_init, label %exit.barrier1, !llvm.loop !13 | |
exit.barrier1: ; preds = %pregion_for_cond3 | |
call void @barrier(i32 0) #2 | |
ret void | |
} | |
attributes #0 = { noduplicate "frame-pointer"="none" "less-precise-fpmad"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "no-realign-stack" "stack-protector-buffer-size"="0" "unsafe-fp-math"="false" "use-soft-float"="false" } | |
attributes #1 = { noinline nounwind "_kernel_local_size"="256" "frame-pointer"="none" "less-precise-fpmad"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "no-realign-stack" "stack-protector-buffer-size"="0" "unsafe-fp-math"="false" "use-soft-float"="false" } | |
attributes #2 = { nounwind } | |
attributes #3 = { noduplicate nounwind } | |
!llvm.ident = !{!0} | |
!opencl.kernels = !{!1} | |
!0 = !{!"clang version 3.6.0 (git://git.ti.com/opencl/clang.git 5b006f07bdc22b5ae6917eecdfe243908dd7b029) (git://git.ti.com/opencl/llvm.git 09780c6750b30da81e4a0a805aedf1699fbc37c7)"} | |
!1 = !{void (i32, i32, float addrspace(1)*, float addrspace(1)*)* @transpose} | |
!2 = !{!3, !3, i64 0} | |
!3 = !{!"int", !4, i64 0} | |
!4 = !{!"omnipotent char", !5, i64 0} | |
!5 = !{!"Simple C/C++ TBAA"} | |
!6 = !{!7, !7, i64 0} | |
!7 = !{!"float", !4, i64 0} | |
!8 = !{!9, !10} | |
!9 = distinct !{!9} | |
!10 = distinct !{!10} | |
!11 = !{!12, !13} | |
!12 = distinct !{!12} | |
!13 = distinct !{!13} |
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
__kernel void | |
transpose (const uint P, const uint Q, const __global float *input, | |
__global float *output) | |
{ | |
const uint tx = get_local_id (0); | |
const uint ty = get_local_id (1); | |
const uint ID0 = get_group_id (0) * 8 + tx; | |
const uint ID1 = get_group_id (1) * 8 + ty; | |
__local float buffer[8][8]; | |
buffer[ty][tx] = input[ID1 * P + ID0]; | |
if(ty < 30) { | |
printf("Some Print.\n"); | |
barrier (CLK_LOCAL_MEM_FENCE); | |
printf("Print again.\n"); | |
} else { | |
printf("else here.\n"); | |
} | |
const uint newID0 = get_group_id (1) * 8 + tx; | |
const uint newID1 = get_group_id (0) * 8 + ty; | |
output[newID1 * Q + newID0] = buffer[tx][ty]; | |
} |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment