Last active
May 14, 2020 02:15
-
-
Save BeMg/c9603e554a889138fa6cf99398aa6b43 to your computer and use it in GitHub Desktop.
TI-opencl clocl example
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
__kernel void Conv2D( __global int * image_in, //image input | |
__global int * filter_in, //filter input | |
int K, //filter kernel size | |
__global int * image_out) //feature map output | |
{ | |
int W; //work group global size | |
int Wn; //padded image width | |
int x; //global id x | |
int y; //global id y | |
int ki, kj; //filter coordinate,(kj, ki) | |
int sum = 0; //multiply and sum of filter and data | |
W = get_global_size(0); | |
x = get_global_id(0); | |
y = get_global_id(1); | |
Wn = W + (K - 1); | |
for(ki=0; ki<K; ki++) | |
for(kj=0; kj<K; kj++) | |
{ | |
sum = sum + filter_in[ki*K + kj] * image_in[Wn*(y+ki) + x + kj]; | |
} | |
image_out[y*W + x] = sum; | |
barrier(CLK_GLOBAL_MEM_FENCE); | |
for(ki=0; ki<K; ki++) | |
for(kj=0; kj<K; kj++) | |
{ | |
sum = sum + filter_in[ki*K + kj] * image_in[Wn*(y+ki) + x + kj]; | |
} | |
image_out[y*W + x] = sum; | |
} |
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
; ModuleID = 'conv2d.bc' | |
source_filename = "conv2d.bc" | |
target datalayout = "e-m:e-p:32:32-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-v256:256-v512:512-v1024:1024" | |
target triple = "c6000-unknown-unknown-unknown" | |
@kernel_config_l2 = external addrspace(2) constant [32 x i32] | |
; Function Attrs: noduplicate | |
declare linkonce protected void @barrier(i32) #0 | |
; Function Attrs: noinline nounwind | |
define void @Conv2D(i32 addrspace(1)* nocapture readonly %image_in, i32 addrspace(1)* nocapture readonly %filter_in, i32 %K, i32 addrspace(1)* nocapture %image_out) #1 { | |
pregion_for_init24: | |
%0 = call i32 @__core_num() | |
%1 = load i32, i32 addrspace(2)* getelementptr inbounds ([32 x i32], [32 x i32] addrspace(2)* @kernel_config_l2, i32 0, i32 15), align 4 | |
%2 = mul i32 %0, %1 | |
%3 = load i32, i32 addrspace(2)* getelementptr inbounds ([32 x i32], [32 x i32] addrspace(2)* @kernel_config_l2, i32 0, i32 14), align 4 | |
%4 = add i32 %3, %2 | |
%5 = load i32, i32 addrspace(2)* getelementptr inbounds ([32 x i32], [32 x i32] addrspace(2)* @kernel_config_l2, i32 0, i32 4), align 4 | |
%6 = load i32, i32 addrspace(2)* getelementptr inbounds ([32 x i32], [32 x i32] addrspace(2)* @kernel_config_l2, i32 0, i32 5), align 4 | |
%7 = mul i32 %5, %6 | |
%8 = mul i32 4, %7 | |
%9 = add i32 %4, %8 | |
%10 = add i32 %9, 7 | |
%11 = and i32 %10, -8 | |
%.1.pocl_context = inttoptr i32 %4 to i32 addrspace(1)**, !ocl.restrict !3 | |
%12 = mul i32 4, %7 | |
%13 = add i32 %11, %12 | |
%14 = add i32 %13, 7 | |
%15 = and i32 %14, -8 | |
%.0.pocl_context = inttoptr i32 %11 to i32*, !ocl.restrict !3 | |
call void @barrier(i32 0) #2 | |
%16 = load i32, i32 addrspace(2)* getelementptr inbounds ([32 x i32], [32 x i32] addrspace(2)* @kernel_config_l2, i32 0, i32 1), align 4, !tbaa !4 | |
%17 = load i32, i32 addrspace(2)* getelementptr inbounds ([32 x i32], [32 x i32] addrspace(2)* @kernel_config_l2, i32 0, i32 10), align 4, !tbaa !4 | |
%18 = load i32, i32 addrspace(2)* getelementptr inbounds ([32 x i32], [32 x i32] addrspace(2)* @kernel_config_l2, i32 0, i32 11), align 4, !tbaa !4 | |
%19 = add nsw i32 %K, -1 | |
%20 = add nsw i32 %19, %16 | |
%21 = icmp sgt i32 %K, 0 | |
br label %pregion_for_entry.pregion_for_init21 | |
pregion_for_entry.pregion_for_init21: ; preds = %pregion_for_cond26, %pregion_for_init24 | |
%22 = phi i32 [ 0, %pregion_for_init24 ], [ %56, %pregion_for_cond26 ] | |
%23 = add i32 %22, %18 | |
%24 = mul i32 %22, %5 | |
%25 = mul nsw i32 %23, %16 | |
br label %.r_entry | |
.r_entry: ; preds = %pregion_for_cond23, %pregion_for_entry.pregion_for_init21 | |
%26 = phi i32 [ 0, %pregion_for_entry.pregion_for_init21 ], [ %54, %pregion_for_cond23 ] | |
%27 = add i32 %26, %17 | |
br i1 %21, label %.lr.ph10.preheader, label %pregion_for_cond23 | |
.lr.ph10.preheader: ; preds = %.r_entry | |
br label %.lr.ph10 | |
.lr.ph10: ; preds = %._crit_edge11, %.lr.ph10.preheader | |
%28 = phi i32 [ %46, %._crit_edge11 ], [ 0, %.lr.ph10.preheader ] | |
%.lcssa114 = phi i32 [ %43, %._crit_edge11 ], [ 0, %.lr.ph10.preheader ] | |
%29 = mul nsw i32 %28, %K | |
%30 = add nsw i32 %28, %23 | |
%31 = mul nsw i32 %30, %20 | |
%32 = add nsw i32 %31, %27 | |
br label %33 | |
33: ; preds = %33, %.lr.ph10 | |
%34 = phi i32 [ %44, %33 ], [ 0, %.lr.ph10 ] | |
%35 = phi i32 [ %43, %33 ], [ %.lcssa114, %.lr.ph10 ] | |
%36 = add nsw i32 %34, %29 | |
%37 = getelementptr inbounds i32, i32 addrspace(1)* %filter_in, i32 %36 | |
%38 = load i32, i32 addrspace(1)* %37, align 4, !tbaa !4, !llvm.mem.parallel_loop_access !8 | |
%39 = add nsw i32 %32, %34 | |
%40 = getelementptr inbounds i32, i32 addrspace(1)* %image_in, i32 %39 | |
%41 = load i32, i32 addrspace(1)* %40, align 4, !tbaa !4, !llvm.mem.parallel_loop_access !8 | |
%42 = mul nsw i32 %41, %38 | |
%43 = add nsw i32 %42, %35 | |
%44 = add nuw nsw i32 %34, 1 | |
%45 = icmp slt i32 %44, %K | |
br i1 %45, label %33, label %._crit_edge11 | |
._crit_edge11: ; preds = %33 | |
%46 = add nuw nsw i32 %28, 1 | |
%47 = icmp slt i32 %46, %K | |
br i1 %47, label %.lr.ph10, label %pregion_for_cond23.loopexit | |
pregion_for_cond23.loopexit: ; preds = %._crit_edge11 | |
br label %pregion_for_cond23 | |
pregion_for_cond23: ; preds = %pregion_for_cond23.loopexit, %.r_entry | |
%48 = phi i32 [ 0, %.r_entry ], [ %43, %pregion_for_cond23.loopexit ] | |
%49 = add i32 %26, %24 | |
%50 = getelementptr i32, i32* %.0.pocl_context, i32 %49 | |
store i32 %48, i32* %50, align 4, !llvm.mem.parallel_loop_access !8 | |
%51 = add nsw i32 %25, %27 | |
%52 = getelementptr inbounds i32, i32 addrspace(1)* %image_out, i32 %51 | |
%53 = getelementptr i32 addrspace(1)*, i32 addrspace(1)** %.1.pocl_context, i32 %49 | |
store i32 addrspace(1)* %52, i32 addrspace(1)** %53, align 4, !llvm.mem.parallel_loop_access !8 | |
store i32 %48, i32 addrspace(1)* %52, align 4, !tbaa !4, !llvm.mem.parallel_loop_access !8 | |
%54 = add i32 %26, 1 | |
%55 = icmp slt i32 %54, %5 | |
br i1 %55, label %.r_entry, label %pregion_for_cond26, !llvm.loop !9 | |
pregion_for_cond26: ; preds = %pregion_for_cond23 | |
%56 = add i32 %22, 1 | |
%57 = icmp slt i32 %56, %6 | |
br i1 %57, label %pregion_for_entry.pregion_for_init21, label %pregion_for_init, !llvm.loop !10 | |
pregion_for_init: ; preds = %pregion_for_cond26 | |
tail call void @barrier(i32 2) #3 | |
br label %pregion_for_entry.pregion_for_init | |
pregion_for_entry.pregion_for_init: ; preds = %pregion_for_cond20, %pregion_for_init | |
%58 = phi i32 [ 0, %pregion_for_init ], [ %91, %pregion_for_cond20 ] | |
%59 = mul i32 %58, %5 | |
%60 = add i32 %58, %18 | |
br label %._crit_edge15.r_entry | |
._crit_edge15.r_entry: ; preds = %pregion_for_cond, %pregion_for_entry.pregion_for_init | |
%61 = phi i32 [ 0, %pregion_for_entry.pregion_for_init ], [ %89, %pregion_for_cond ] | |
%62 = add i32 %61, %59 | |
%63 = getelementptr i32, i32* %.0.pocl_context, i32 %62 | |
%64 = load i32, i32* %63, align 4, !llvm.mem.parallel_loop_access !11 | |
br i1 %21, label %.lr.ph.preheader, label %pregion_for_cond | |
.lr.ph.preheader: ; preds = %._crit_edge15.r_entry | |
%65 = add i32 %61, %17 | |
br label %.lr.ph | |
.lr.ph: ; preds = %._crit_edge, %.lr.ph.preheader | |
%66 = phi i32 [ %84, %._crit_edge ], [ 0, %.lr.ph.preheader ] | |
%.lcssa6 = phi i32 [ %81, %._crit_edge ], [ %64, %.lr.ph.preheader ] | |
%67 = mul nsw i32 %66, %K | |
%68 = add nsw i32 %66, %60 | |
%69 = mul nsw i32 %68, %20 | |
%70 = add nsw i32 %69, %65 | |
br label %71 | |
71: ; preds = %71, %.lr.ph | |
%72 = phi i32 [ %82, %71 ], [ 0, %.lr.ph ] | |
%73 = phi i32 [ %81, %71 ], [ %.lcssa6, %.lr.ph ] | |
%74 = add nsw i32 %72, %67 | |
%75 = getelementptr inbounds i32, i32 addrspace(1)* %filter_in, i32 %74 | |
%76 = load i32, i32 addrspace(1)* %75, align 4, !tbaa !4, !llvm.mem.parallel_loop_access !11 | |
%77 = add nsw i32 %70, %72 | |
%78 = getelementptr inbounds i32, i32 addrspace(1)* %image_in, i32 %77 | |
%79 = load i32, i32 addrspace(1)* %78, align 4, !tbaa !4, !llvm.mem.parallel_loop_access !11 | |
%80 = mul nsw i32 %79, %76 | |
%81 = add nsw i32 %80, %73 | |
%82 = add nuw nsw i32 %72, 1 | |
%83 = icmp slt i32 %82, %K | |
br i1 %83, label %71, label %._crit_edge | |
._crit_edge: ; preds = %71 | |
%84 = add nuw nsw i32 %66, 1 | |
%85 = icmp slt i32 %84, %K | |
br i1 %85, label %.lr.ph, label %pregion_for_cond.loopexit | |
pregion_for_cond.loopexit: ; preds = %._crit_edge | |
br label %pregion_for_cond | |
pregion_for_cond: ; preds = %pregion_for_cond.loopexit, %._crit_edge15.r_entry | |
%86 = phi i32 [ %64, %._crit_edge15.r_entry ], [ %81, %pregion_for_cond.loopexit ] | |
%87 = getelementptr i32 addrspace(1)*, i32 addrspace(1)** %.1.pocl_context, i32 %62 | |
%88 = load i32 addrspace(1)*, i32 addrspace(1)** %87, align 4, !llvm.mem.parallel_loop_access !11 | |
store i32 %86, i32 addrspace(1)* %88, align 4, !tbaa !4, !llvm.mem.parallel_loop_access !11 | |
%89 = add i32 %61, 1 | |
%90 = icmp slt i32 %89, %5 | |
br i1 %90, label %._crit_edge15.r_entry, label %pregion_for_cond20, !llvm.loop !12 | |
pregion_for_cond20: ; preds = %pregion_for_cond | |
%91 = add i32 %58, 1 | |
%92 = icmp slt i32 %91, %6 | |
br i1 %92, label %pregion_for_entry.pregion_for_init, label %exit.barrier, !llvm.loop !13 | |
exit.barrier: ; preds = %pregion_for_cond20 | |
call void @barrier(i32 0) #2 | |
ret void | |
} | |
declare i32 @__core_num() | |
attributes #0 = { noduplicate "frame-pointer"="none" "less-precise-fpmad"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "no-realign-stack" "stack-protector-buffer-size"="0" "unsafe-fp-math"="false" "use-soft-float"="false" } | |
attributes #1 = { noinline nounwind "_kernel_local_size"="0" "_wi_alloca_size"="12" "frame-pointer"="none" "less-precise-fpmad"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "no-realign-stack" "stack-protector-buffer-size"="0" "unsafe-fp-math"="false" "use-soft-float"="false" } | |
attributes #2 = { nounwind } | |
attributes #3 = { noduplicate nounwind } | |
!llvm.ident = !{!0} | |
!ocl.restrict = !{!1} | |
!opencl.kernels = !{!2} | |
!0 = !{!"clang version 3.6.0 (git://git.ti.com/opencl/clang.git 5b006f07bdc22b5ae6917eecdfe243908dd7b029) (git://git.ti.com/opencl/llvm.git 09780c6750b30da81e4a0a805aedf1699fbc37c7)"} | |
!1 = distinct !{!1} | |
!2 = !{void (i32 addrspace(1)*, i32 addrspace(1)*, i32, i32 addrspace(1)*)* @Conv2D} | |
!3 = distinct !{!3} | |
!4 = !{!5, !5, i64 0} | |
!5 = !{!"int", !6, i64 0} | |
!6 = !{!"omnipotent char", !7, i64 0} | |
!7 = !{!"Simple C/C++ TBAA"} | |
!8 = !{!9, !10} | |
!9 = distinct !{!9} | |
!10 = distinct !{!10} | |
!11 = !{!12, !13} | |
!12 = distinct !{!12} | |
!13 = distinct !{!13} |
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#define PADDING (32) | |
#define GROUP_DIMX (32) | |
#define LOG_GROUP_DIMX (5) | |
#define GROUP_DIMY (2) | |
#define WIDTH (256) | |
#define HEIGHT (4096) | |
__kernel void | |
matrix_transpose(__global float *output, | |
__global float *input, | |
__local float *tile) | |
{ | |
int block_x = get_group_id(0); | |
int block_y = get_group_id(1); | |
int local_x = get_local_id(0) & (GROUP_DIMX - 1); | |
int local_y = get_local_id(0) >> LOG_GROUP_DIMX; | |
int local_input = mad24(local_y, GROUP_DIMX + 1, local_x); | |
int local_output = mad24(local_x, GROUP_DIMX + 1, local_y); | |
int in_x = mad24(block_x, GROUP_DIMX, local_x); | |
int in_y = mad24(block_y, GROUP_DIMX, local_y); | |
int input_index = mad24(in_y, WIDTH, in_x); | |
int out_x = mad24(block_y, GROUP_DIMX, local_x); | |
int out_y = mad24(block_x, GROUP_DIMX, local_y); | |
int output_index = mad24(out_y, HEIGHT + PADDING, out_x); | |
int global_input_stride = WIDTH * GROUP_DIMY; | |
int global_output_stride = (HEIGHT + PADDING) * GROUP_DIMY; | |
int local_input_stride = GROUP_DIMY * (GROUP_DIMX + 1); | |
int local_output_stride = GROUP_DIMY; | |
tile[local_input] = input[input_index]; | |
local_input += local_input_stride; | |
input_index += global_input_stride; | |
tile[local_input] = input[input_index]; | |
local_input += local_input_stride; | |
input_index += global_input_stride; | |
tile[local_input] = input[input_index]; | |
local_input += local_input_stride; | |
input_index += global_input_stride; | |
tile[local_input] = input[input_index]; | |
local_input += local_input_stride; | |
input_index += global_input_stride; | |
tile[local_input] = input[input_index]; | |
local_input += local_input_stride; | |
input_index += global_input_stride; | |
tile[local_input] = input[input_index]; | |
local_input += local_input_stride; | |
input_index += global_input_stride; | |
tile[local_input] = input[input_index]; | |
local_input += local_input_stride; | |
input_index += global_input_stride; | |
tile[local_input] = input[input_index]; | |
local_input += local_input_stride; | |
input_index += global_input_stride; | |
tile[local_input] = input[input_index]; | |
local_input += local_input_stride; | |
input_index += global_input_stride; | |
tile[local_input] = input[input_index]; | |
local_input += local_input_stride; | |
input_index += global_input_stride; | |
tile[local_input] = input[input_index]; | |
local_input += local_input_stride; | |
input_index += global_input_stride; | |
tile[local_input] = input[input_index]; | |
local_input += local_input_stride; | |
input_index += global_input_stride; | |
tile[local_input] = input[input_index]; | |
local_input += local_input_stride; | |
input_index += global_input_stride; | |
tile[local_input] = input[input_index]; | |
local_input += local_input_stride; | |
input_index += global_input_stride; | |
tile[local_input] = input[input_index]; | |
local_input += local_input_stride; | |
input_index += global_input_stride; | |
tile[local_input] = input[input_index]; | |
barrier(CLK_LOCAL_MEM_FENCE); | |
output[output_index] = tile[local_output]; | |
local_output += local_output_stride; | |
output_index += global_output_stride; | |
output[output_index] = tile[local_output]; | |
local_output += local_output_stride; | |
output_index += global_output_stride; | |
output[output_index] = tile[local_output]; | |
local_output += local_output_stride; | |
output_index += global_output_stride; | |
output[output_index] = tile[local_output]; | |
local_output += local_output_stride; | |
output_index += global_output_stride; | |
output[output_index] = tile[local_output]; | |
local_output += local_output_stride; | |
output_index += global_output_stride; | |
output[output_index] = tile[local_output]; | |
local_output += local_output_stride; | |
output_index += global_output_stride; | |
output[output_index] = tile[local_output]; | |
local_output += local_output_stride; | |
output_index += global_output_stride; | |
output[output_index] = tile[local_output]; | |
local_output += local_output_stride; | |
output_index += global_output_stride; | |
output[output_index] = tile[local_output]; | |
local_output += local_output_stride; | |
output_index += global_output_stride; | |
output[output_index] = tile[local_output]; | |
local_output += local_output_stride; | |
output_index += global_output_stride; | |
output[output_index] = tile[local_output]; | |
local_output += local_output_stride; | |
output_index += global_output_stride; | |
output[output_index] = tile[local_output]; | |
local_output += local_output_stride; | |
output_index += global_output_stride; | |
output[output_index] = tile[local_output]; | |
local_output += local_output_stride; | |
output_index += global_output_stride; | |
output[output_index] = tile[local_output]; | |
local_output += local_output_stride; | |
output_index += global_output_stride; | |
output[output_index] = tile[local_output]; | |
local_output += local_output_stride; | |
output_index += global_output_stride; | |
output[output_index] = tile[local_output]; | |
} |
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
; ModuleID = 'example2.bc' | |
source_filename = "example2.bc" | |
target datalayout = "e-m:e-p:32:32-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-v256:256-v512:512-v1024:1024" | |
target triple = "c6000-unknown-unknown-unknown" | |
@kernel_config_l2 = external addrspace(2) constant [32 x i32] | |
; Function Attrs: noduplicate | |
declare linkonce protected void @barrier(i32) #0 | |
; Function Attrs: noinline nounwind | |
define void @matrix_transpose(float addrspace(1)* nocapture %output, float addrspace(1)* nocapture readonly %input, float addrspace(3)* %tile) #1 { | |
pregion_for_init1: | |
%0 = call i32 @__core_num() | |
%1 = load i32, i32 addrspace(2)* getelementptr inbounds ([32 x i32], [32 x i32] addrspace(2)* @kernel_config_l2, i32 0, i32 15), align 4 | |
%2 = mul i32 %0, %1 | |
%3 = load i32, i32 addrspace(2)* getelementptr inbounds ([32 x i32], [32 x i32] addrspace(2)* @kernel_config_l2, i32 0, i32 14), align 4 | |
%4 = add i32 %3, %2 | |
%5 = load i32, i32 addrspace(2)* getelementptr inbounds ([32 x i32], [32 x i32] addrspace(2)* @kernel_config_l2, i32 0, i32 4), align 4 | |
%6 = mul i32 4, %5 | |
%7 = add i32 %4, %6 | |
%8 = add i32 %7, 7 | |
%9 = and i32 %8, -8 | |
%.0.pocl_context = inttoptr i32 %4 to i32*, !ocl.restrict !3 | |
call void @barrier(i32 0) #2 | |
%10 = load i32, i32 addrspace(2)* getelementptr inbounds ([32 x i32], [32 x i32] addrspace(2)* @kernel_config_l2, i32 0, i32 10), align 4, !tbaa !4 | |
%11 = load i32, i32 addrspace(2)* getelementptr inbounds ([32 x i32], [32 x i32] addrspace(2)* @kernel_config_l2, i32 0, i32 7), align 4, !tbaa !4 | |
%12 = sub i32 %10, %11 | |
%13 = udiv i32 %12, %5 | |
%14 = load i32, i32 addrspace(2)* getelementptr inbounds ([32 x i32], [32 x i32] addrspace(2)* @kernel_config_l2, i32 0, i32 11), align 4, !tbaa !4 | |
%15 = load i32, i32 addrspace(2)* getelementptr inbounds ([32 x i32], [32 x i32] addrspace(2)* @kernel_config_l2, i32 0, i32 8), align 4, !tbaa !4 | |
%16 = sub i32 %14, %15 | |
%17 = load i32, i32 addrspace(2)* getelementptr inbounds ([32 x i32], [32 x i32] addrspace(2)* @kernel_config_l2, i32 0, i32 5), align 4, !tbaa !4 | |
%18 = udiv i32 %16, %17 | |
%19 = shl nsw i32 %13, 5 | |
%20 = shl nsw i32 %18, 5 | |
br label %pregion_for_cond3 | |
pregion_for_cond3: ; preds = %pregion_for_cond3, %pregion_for_init1 | |
%21 = phi i32 [ 0, %pregion_for_init1 ], [ %113, %pregion_for_cond3 ] | |
%22 = and i32 %21, 31 | |
%23 = lshr i32 %21, 5 | |
%24 = mul nsw i32 %23, 33 | |
%25 = add nuw nsw i32 %24, %22 | |
%26 = or i32 %22, %19 | |
%27 = add nsw i32 %23, %20 | |
%28 = shl nsw i32 %27, 8 | |
%29 = add nsw i32 %28, %26 | |
%30 = or i32 %22, %20 | |
%31 = add nsw i32 %23, %19 | |
%32 = mul nsw i32 %31, 4128 | |
%33 = add nsw i32 %32, %30 | |
%34 = getelementptr i32, i32* %.0.pocl_context, i32 %21 | |
store i32 %33, i32* %34, align 4, !llvm.mem.parallel_loop_access !8 | |
%35 = getelementptr inbounds float, float addrspace(1)* %input, i32 %29 | |
%36 = load float, float addrspace(1)* %35, align 4, !tbaa !10, !llvm.mem.parallel_loop_access !8 | |
%37 = getelementptr inbounds float, float addrspace(3)* %tile, i32 %25 | |
store float %36, float addrspace(3)* %37, align 4, !tbaa !10, !llvm.mem.parallel_loop_access !8 | |
%38 = add nuw nsw i32 %25, 66 | |
%39 = add nsw i32 %29, 512 | |
%40 = getelementptr inbounds float, float addrspace(1)* %input, i32 %39 | |
%41 = load float, float addrspace(1)* %40, align 4, !tbaa !10, !llvm.mem.parallel_loop_access !8 | |
%42 = getelementptr inbounds float, float addrspace(3)* %tile, i32 %38 | |
store float %41, float addrspace(3)* %42, align 4, !tbaa !10, !llvm.mem.parallel_loop_access !8 | |
%43 = add nuw nsw i32 %25, 132 | |
%44 = add nsw i32 %29, 1024 | |
%45 = getelementptr inbounds float, float addrspace(1)* %input, i32 %44 | |
%46 = load float, float addrspace(1)* %45, align 4, !tbaa !10, !llvm.mem.parallel_loop_access !8 | |
%47 = getelementptr inbounds float, float addrspace(3)* %tile, i32 %43 | |
store float %46, float addrspace(3)* %47, align 4, !tbaa !10, !llvm.mem.parallel_loop_access !8 | |
%48 = add nuw nsw i32 %25, 198 | |
%49 = add nsw i32 %29, 1536 | |
%50 = getelementptr inbounds float, float addrspace(1)* %input, i32 %49 | |
%51 = load float, float addrspace(1)* %50, align 4, !tbaa !10, !llvm.mem.parallel_loop_access !8 | |
%52 = getelementptr inbounds float, float addrspace(3)* %tile, i32 %48 | |
store float %51, float addrspace(3)* %52, align 4, !tbaa !10, !llvm.mem.parallel_loop_access !8 | |
%53 = add nuw nsw i32 %25, 264 | |
%54 = add nsw i32 %29, 2048 | |
%55 = getelementptr inbounds float, float addrspace(1)* %input, i32 %54 | |
%56 = load float, float addrspace(1)* %55, align 4, !tbaa !10, !llvm.mem.parallel_loop_access !8 | |
%57 = getelementptr inbounds float, float addrspace(3)* %tile, i32 %53 | |
store float %56, float addrspace(3)* %57, align 4, !tbaa !10, !llvm.mem.parallel_loop_access !8 | |
%58 = add nuw nsw i32 %25, 330 | |
%59 = add nsw i32 %29, 2560 | |
%60 = getelementptr inbounds float, float addrspace(1)* %input, i32 %59 | |
%61 = load float, float addrspace(1)* %60, align 4, !tbaa !10, !llvm.mem.parallel_loop_access !8 | |
%62 = getelementptr inbounds float, float addrspace(3)* %tile, i32 %58 | |
store float %61, float addrspace(3)* %62, align 4, !tbaa !10, !llvm.mem.parallel_loop_access !8 | |
%63 = add nuw nsw i32 %25, 396 | |
%64 = add nsw i32 %29, 3072 | |
%65 = getelementptr inbounds float, float addrspace(1)* %input, i32 %64 | |
%66 = load float, float addrspace(1)* %65, align 4, !tbaa !10, !llvm.mem.parallel_loop_access !8 | |
%67 = getelementptr inbounds float, float addrspace(3)* %tile, i32 %63 | |
store float %66, float addrspace(3)* %67, align 4, !tbaa !10, !llvm.mem.parallel_loop_access !8 | |
%68 = add nuw nsw i32 %25, 462 | |
%69 = add nsw i32 %29, 3584 | |
%70 = getelementptr inbounds float, float addrspace(1)* %input, i32 %69 | |
%71 = load float, float addrspace(1)* %70, align 4, !tbaa !10, !llvm.mem.parallel_loop_access !8 | |
%72 = getelementptr inbounds float, float addrspace(3)* %tile, i32 %68 | |
store float %71, float addrspace(3)* %72, align 4, !tbaa !10, !llvm.mem.parallel_loop_access !8 | |
%73 = add nuw nsw i32 %25, 528 | |
%74 = add nsw i32 %29, 4096 | |
%75 = getelementptr inbounds float, float addrspace(1)* %input, i32 %74 | |
%76 = load float, float addrspace(1)* %75, align 4, !tbaa !10, !llvm.mem.parallel_loop_access !8 | |
%77 = getelementptr inbounds float, float addrspace(3)* %tile, i32 %73 | |
store float %76, float addrspace(3)* %77, align 4, !tbaa !10, !llvm.mem.parallel_loop_access !8 | |
%78 = add nuw nsw i32 %25, 594 | |
%79 = add nsw i32 %29, 4608 | |
%80 = getelementptr inbounds float, float addrspace(1)* %input, i32 %79 | |
%81 = load float, float addrspace(1)* %80, align 4, !tbaa !10, !llvm.mem.parallel_loop_access !8 | |
%82 = getelementptr inbounds float, float addrspace(3)* %tile, i32 %78 | |
store float %81, float addrspace(3)* %82, align 4, !tbaa !10, !llvm.mem.parallel_loop_access !8 | |
%83 = add nuw nsw i32 %25, 660 | |
%84 = add nsw i32 %29, 5120 | |
%85 = getelementptr inbounds float, float addrspace(1)* %input, i32 %84 | |
%86 = load float, float addrspace(1)* %85, align 4, !tbaa !10, !llvm.mem.parallel_loop_access !8 | |
%87 = getelementptr inbounds float, float addrspace(3)* %tile, i32 %83 | |
store float %86, float addrspace(3)* %87, align 4, !tbaa !10, !llvm.mem.parallel_loop_access !8 | |
%88 = add nuw nsw i32 %25, 726 | |
%89 = add nsw i32 %29, 5632 | |
%90 = getelementptr inbounds float, float addrspace(1)* %input, i32 %89 | |
%91 = load float, float addrspace(1)* %90, align 4, !tbaa !10, !llvm.mem.parallel_loop_access !8 | |
%92 = getelementptr inbounds float, float addrspace(3)* %tile, i32 %88 | |
store float %91, float addrspace(3)* %92, align 4, !tbaa !10, !llvm.mem.parallel_loop_access !8 | |
%93 = add nuw nsw i32 %25, 792 | |
%94 = add nsw i32 %29, 6144 | |
%95 = getelementptr inbounds float, float addrspace(1)* %input, i32 %94 | |
%96 = load float, float addrspace(1)* %95, align 4, !tbaa !10, !llvm.mem.parallel_loop_access !8 | |
%97 = getelementptr inbounds float, float addrspace(3)* %tile, i32 %93 | |
store float %96, float addrspace(3)* %97, align 4, !tbaa !10, !llvm.mem.parallel_loop_access !8 | |
%98 = add nuw nsw i32 %25, 858 | |
%99 = add nsw i32 %29, 6656 | |
%100 = getelementptr inbounds float, float addrspace(1)* %input, i32 %99 | |
%101 = load float, float addrspace(1)* %100, align 4, !tbaa !10, !llvm.mem.parallel_loop_access !8 | |
%102 = getelementptr inbounds float, float addrspace(3)* %tile, i32 %98 | |
store float %101, float addrspace(3)* %102, align 4, !tbaa !10, !llvm.mem.parallel_loop_access !8 | |
%103 = add nuw nsw i32 %25, 924 | |
%104 = add nsw i32 %29, 7168 | |
%105 = getelementptr inbounds float, float addrspace(1)* %input, i32 %104 | |
%106 = load float, float addrspace(1)* %105, align 4, !tbaa !10, !llvm.mem.parallel_loop_access !8 | |
%107 = getelementptr inbounds float, float addrspace(3)* %tile, i32 %103 | |
store float %106, float addrspace(3)* %107, align 4, !tbaa !10, !llvm.mem.parallel_loop_access !8 | |
%108 = add nuw nsw i32 %25, 990 | |
%109 = add nsw i32 %29, 7680 | |
%110 = getelementptr inbounds float, float addrspace(1)* %input, i32 %109 | |
%111 = load float, float addrspace(1)* %110, align 4, !tbaa !10, !llvm.mem.parallel_loop_access !8 | |
%112 = getelementptr inbounds float, float addrspace(3)* %tile, i32 %108 | |
store float %111, float addrspace(3)* %112, align 4, !tbaa !10, !llvm.mem.parallel_loop_access !8 | |
%113 = add i32 %21, 1 | |
%114 = icmp slt i32 %113, %5 | |
br i1 %114, label %pregion_for_cond3, label %pregion_for_init, !llvm.loop !9 | |
pregion_for_init: ; preds = %pregion_for_cond3 | |
tail call void @barrier(i32 1) #3 | |
br label %pregion_for_cond | |
pregion_for_cond: ; preds = %pregion_for_cond, %pregion_for_init | |
%115 = phi i32 [ 0, %pregion_for_init ], [ %200, %pregion_for_cond ] | |
%116 = and i32 %115, 31 | |
%117 = mul nuw nsw i32 %116, 33 | |
%118 = lshr i32 %115, 5 | |
%119 = add nuw nsw i32 %117, %118 | |
%120 = getelementptr inbounds float, float addrspace(3)* %tile, i32 %119 | |
%121 = load float, float addrspace(3)* %120, align 4, !tbaa !10, !llvm.mem.parallel_loop_access !12 | |
%122 = getelementptr i32, i32* %.0.pocl_context, i32 %115 | |
%123 = load i32, i32* %122, align 4 | |
%124 = getelementptr inbounds float, float addrspace(1)* %output, i32 %123 | |
store float %121, float addrspace(1)* %124, align 4, !tbaa !10, !llvm.mem.parallel_loop_access !12 | |
%125 = add nuw nsw i32 %119, 2 | |
%126 = add nsw i32 %123, 8256 | |
%127 = getelementptr inbounds float, float addrspace(3)* %tile, i32 %125 | |
%128 = load float, float addrspace(3)* %127, align 4, !tbaa !10, !llvm.mem.parallel_loop_access !12 | |
%129 = getelementptr inbounds float, float addrspace(1)* %output, i32 %126 | |
store float %128, float addrspace(1)* %129, align 4, !tbaa !10, !llvm.mem.parallel_loop_access !12 | |
%130 = add nuw nsw i32 %119, 4 | |
%131 = add nsw i32 %123, 16512 | |
%132 = getelementptr inbounds float, float addrspace(3)* %tile, i32 %130 | |
%133 = load float, float addrspace(3)* %132, align 4, !tbaa !10, !llvm.mem.parallel_loop_access !12 | |
%134 = getelementptr inbounds float, float addrspace(1)* %output, i32 %131 | |
store float %133, float addrspace(1)* %134, align 4, !tbaa !10, !llvm.mem.parallel_loop_access !12 | |
%135 = add nuw nsw i32 %119, 6 | |
%136 = add nsw i32 %123, 24768 | |
%137 = getelementptr inbounds float, float addrspace(3)* %tile, i32 %135 | |
%138 = load float, float addrspace(3)* %137, align 4, !tbaa !10, !llvm.mem.parallel_loop_access !12 | |
%139 = getelementptr inbounds float, float addrspace(1)* %output, i32 %136 | |
store float %138, float addrspace(1)* %139, align 4, !tbaa !10, !llvm.mem.parallel_loop_access !12 | |
%140 = add nuw nsw i32 %119, 8 | |
%141 = add nsw i32 %123, 33024 | |
%142 = getelementptr inbounds float, float addrspace(3)* %tile, i32 %140 | |
%143 = load float, float addrspace(3)* %142, align 4, !tbaa !10, !llvm.mem.parallel_loop_access !12 | |
%144 = getelementptr inbounds float, float addrspace(1)* %output, i32 %141 | |
store float %143, float addrspace(1)* %144, align 4, !tbaa !10, !llvm.mem.parallel_loop_access !12 | |
%145 = add nuw nsw i32 %119, 10 | |
%146 = add nsw i32 %123, 41280 | |
%147 = getelementptr inbounds float, float addrspace(3)* %tile, i32 %145 | |
%148 = load float, float addrspace(3)* %147, align 4, !tbaa !10, !llvm.mem.parallel_loop_access !12 | |
%149 = getelementptr inbounds float, float addrspace(1)* %output, i32 %146 | |
store float %148, float addrspace(1)* %149, align 4, !tbaa !10, !llvm.mem.parallel_loop_access !12 | |
%150 = add nuw nsw i32 %119, 12 | |
%151 = add nsw i32 %123, 49536 | |
%152 = getelementptr inbounds float, float addrspace(3)* %tile, i32 %150 | |
%153 = load float, float addrspace(3)* %152, align 4, !tbaa !10, !llvm.mem.parallel_loop_access !12 | |
%154 = getelementptr inbounds float, float addrspace(1)* %output, i32 %151 | |
store float %153, float addrspace(1)* %154, align 4, !tbaa !10, !llvm.mem.parallel_loop_access !12 | |
%155 = add nuw nsw i32 %119, 14 | |
%156 = add nsw i32 %123, 57792 | |
%157 = getelementptr inbounds float, float addrspace(3)* %tile, i32 %155 | |
%158 = load float, float addrspace(3)* %157, align 4, !tbaa !10, !llvm.mem.parallel_loop_access !12 | |
%159 = getelementptr inbounds float, float addrspace(1)* %output, i32 %156 | |
store float %158, float addrspace(1)* %159, align 4, !tbaa !10, !llvm.mem.parallel_loop_access !12 | |
%160 = add nuw nsw i32 %119, 16 | |
%161 = add nsw i32 %123, 66048 | |
%162 = getelementptr inbounds float, float addrspace(3)* %tile, i32 %160 | |
%163 = load float, float addrspace(3)* %162, align 4, !tbaa !10, !llvm.mem.parallel_loop_access !12 | |
%164 = getelementptr inbounds float, float addrspace(1)* %output, i32 %161 | |
store float %163, float addrspace(1)* %164, align 4, !tbaa !10, !llvm.mem.parallel_loop_access !12 | |
%165 = add nuw nsw i32 %119, 18 | |
%166 = add nsw i32 %123, 74304 | |
%167 = getelementptr inbounds float, float addrspace(3)* %tile, i32 %165 | |
%168 = load float, float addrspace(3)* %167, align 4, !tbaa !10, !llvm.mem.parallel_loop_access !12 | |
%169 = getelementptr inbounds float, float addrspace(1)* %output, i32 %166 | |
store float %168, float addrspace(1)* %169, align 4, !tbaa !10, !llvm.mem.parallel_loop_access !12 | |
%170 = add nuw nsw i32 %119, 20 | |
%171 = add nsw i32 %123, 82560 | |
%172 = getelementptr inbounds float, float addrspace(3)* %tile, i32 %170 | |
%173 = load float, float addrspace(3)* %172, align 4, !tbaa !10, !llvm.mem.parallel_loop_access !12 | |
%174 = getelementptr inbounds float, float addrspace(1)* %output, i32 %171 | |
store float %173, float addrspace(1)* %174, align 4, !tbaa !10, !llvm.mem.parallel_loop_access !12 | |
%175 = add nuw nsw i32 %119, 22 | |
%176 = add nsw i32 %123, 90816 | |
%177 = getelementptr inbounds float, float addrspace(3)* %tile, i32 %175 | |
%178 = load float, float addrspace(3)* %177, align 4, !tbaa !10, !llvm.mem.parallel_loop_access !12 | |
%179 = getelementptr inbounds float, float addrspace(1)* %output, i32 %176 | |
store float %178, float addrspace(1)* %179, align 4, !tbaa !10, !llvm.mem.parallel_loop_access !12 | |
%180 = add nuw nsw i32 %119, 24 | |
%181 = add nsw i32 %123, 99072 | |
%182 = getelementptr inbounds float, float addrspace(3)* %tile, i32 %180 | |
%183 = load float, float addrspace(3)* %182, align 4, !tbaa !10, !llvm.mem.parallel_loop_access !12 | |
%184 = getelementptr inbounds float, float addrspace(1)* %output, i32 %181 | |
store float %183, float addrspace(1)* %184, align 4, !tbaa !10, !llvm.mem.parallel_loop_access !12 | |
%185 = add nuw nsw i32 %119, 26 | |
%186 = add nsw i32 %123, 107328 | |
%187 = getelementptr inbounds float, float addrspace(3)* %tile, i32 %185 | |
%188 = load float, float addrspace(3)* %187, align 4, !tbaa !10, !llvm.mem.parallel_loop_access !12 | |
%189 = getelementptr inbounds float, float addrspace(1)* %output, i32 %186 | |
store float %188, float addrspace(1)* %189, align 4, !tbaa !10, !llvm.mem.parallel_loop_access !12 | |
%190 = add nuw nsw i32 %119, 28 | |
%191 = add nsw i32 %123, 115584 | |
%192 = getelementptr inbounds float, float addrspace(3)* %tile, i32 %190 | |
%193 = load float, float addrspace(3)* %192, align 4, !tbaa !10, !llvm.mem.parallel_loop_access !12 | |
%194 = getelementptr inbounds float, float addrspace(1)* %output, i32 %191 | |
store float %193, float addrspace(1)* %194, align 4, !tbaa !10, !llvm.mem.parallel_loop_access !12 | |
%195 = add nuw nsw i32 %119, 30 | |
%196 = add nsw i32 %123, 123840 | |
%197 = getelementptr inbounds float, float addrspace(3)* %tile, i32 %195 | |
%198 = load float, float addrspace(3)* %197, align 4, !tbaa !10, !llvm.mem.parallel_loop_access !12 | |
%199 = getelementptr inbounds float, float addrspace(1)* %output, i32 %196 | |
store float %198, float addrspace(1)* %199, align 4, !tbaa !10, !llvm.mem.parallel_loop_access !12 | |
%200 = add i32 %115, 1 | |
%201 = icmp slt i32 %200, %5 | |
br i1 %201, label %pregion_for_cond, label %exit.barrier, !llvm.loop !13 | |
exit.barrier: ; preds = %pregion_for_cond | |
call void @barrier(i32 0) #2 | |
ret void | |
} | |
declare i32 @__core_num() | |
attributes #0 = { noduplicate "frame-pointer"="none" "less-precise-fpmad"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "no-realign-stack" "stack-protector-buffer-size"="0" "unsafe-fp-math"="false" "use-soft-float"="false" } | |
attributes #1 = { noinline nounwind "_kernel_local_size"="0" "_wi_alloca_size"="4" "frame-pointer"="none" "less-precise-fpmad"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "no-realign-stack" "stack-protector-buffer-size"="0" "unsafe-fp-math"="false" "use-soft-float"="false" } | |
attributes #2 = { nounwind } | |
attributes #3 = { noduplicate nounwind } | |
!llvm.ident = !{!0} | |
!ocl.restrict = !{!1} | |
!opencl.kernels = !{!2} | |
!0 = !{!"clang version 3.6.0 (git://git.ti.com/opencl/clang.git 5b006f07bdc22b5ae6917eecdfe243908dd7b029) (git://git.ti.com/opencl/llvm.git 09780c6750b30da81e4a0a805aedf1699fbc37c7)"} | |
!1 = distinct !{!1} | |
!2 = !{void (float addrspace(1)*, float addrspace(1)*, float addrspace(3)*)* @matrix_transpose} | |
!3 = distinct !{!3} | |
!4 = !{!5, !5, i64 0} | |
!5 = !{!"int", !6, i64 0} | |
!6 = !{!"omnipotent char", !7, i64 0} | |
!7 = !{!"Simple C/C++ TBAA"} | |
!8 = !{!9} | |
!9 = distinct !{!9} | |
!10 = !{!11, !11, i64 0} | |
!11 = !{!"float", !6, i64 0} | |
!12 = !{!13} | |
!13 = distinct !{!13} |
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
kernel void | |
matadd (__global const float *A, | |
__global const float *B, | |
__global float *C) | |
{ | |
size_t X = get_global_id(0); | |
size_t Y = get_global_id(1); | |
size_t Idx = Y*get_global_size(0) + X; | |
C[Idx] = A[Idx] + B[Idx]; | |
} |
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
; ModuleID = 'matadd.bc' | |
source_filename = "matadd.bc" | |
target datalayout = "e-m:e-p:32:32-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-v256:256-v512:512-v1024:1024" | |
target triple = "c6000-unknown-unknown-unknown" | |
@kernel_config_l2 = external addrspace(2) constant [32 x i32] | |
; Function Attrs: nounwind | |
define void @matadd(float addrspace(1)* nocapture readonly %A, float addrspace(1)* nocapture readonly %B, float addrspace(1)* nocapture %C) #0 { | |
.entry: | |
%0 = load i32, i32 addrspace(2)* getelementptr inbounds ([32 x i32], [32 x i32] addrspace(2)* @kernel_config_l2, i32 0, i32 4), align 4, !llvm.mem.parallel_loop_access !7 | |
%1 = load i32, i32 addrspace(2)* getelementptr inbounds ([32 x i32], [32 x i32] addrspace(2)* @kernel_config_l2, i32 0, i32 10), align 4, !tbaa !10, !llvm.mem.parallel_loop_access !7 | |
%2 = load i32, i32 addrspace(2)* getelementptr inbounds ([32 x i32], [32 x i32] addrspace(2)* @kernel_config_l2, i32 0, i32 11), align 4, !tbaa !10, !llvm.mem.parallel_loop_access !7 | |
%3 = load i32, i32 addrspace(2)* getelementptr inbounds ([32 x i32], [32 x i32] addrspace(2)* @kernel_config_l2, i32 0, i32 1), align 4, !tbaa !10, !llvm.mem.parallel_loop_access !7 | |
%4 = load i32, i32 addrspace(2)* getelementptr inbounds ([32 x i32], [32 x i32] addrspace(2)* @kernel_config_l2, i32 0, i32 5), align 4, !llvm.mem.parallel_loop_access !7 | |
%5 = icmp sgt i32 %4, 0 | |
br i1 %5, label %.bodyTop3.preheader, label %.exit2 | |
.bodyTop3.preheader: ; preds = %.entry | |
br label %.bodyTop3 | |
.bodyTop3: ; preds = %.bodyEnd4, %.bodyTop3.preheader | |
%6 = phi i32 [ %21, %.bodyEnd4 ], [ 0, %.bodyTop3.preheader ] | |
%7 = icmp sgt i32 %0, 0 | |
br i1 %7, label %.bodyTop.preheader, label %.bodyEnd4 | |
.bodyTop.preheader: ; preds = %.bodyTop3 | |
br label %.bodyTop | |
.bodyTop: ; preds = %.bodyTop, %.bodyTop.preheader | |
%8 = phi i32 [ %19, %.bodyTop ], [ 0, %.bodyTop.preheader ] | |
%9 = add i32 %8, %1 | |
%10 = add i32 %6, %2 | |
%11 = mul i32 %3, %10 | |
%12 = add i32 %11, %9 | |
%13 = getelementptr inbounds float, float addrspace(1)* %A, i32 %12 | |
%14 = load float, float addrspace(1)* %13, align 4, !tbaa !14, !llvm.mem.parallel_loop_access !7 | |
%15 = getelementptr inbounds float, float addrspace(1)* %B, i32 %12 | |
%16 = load float, float addrspace(1)* %15, align 4, !tbaa !14, !llvm.mem.parallel_loop_access !7 | |
%17 = fadd float %14, %16 | |
%18 = getelementptr inbounds float, float addrspace(1)* %C, i32 %12 | |
store float %17, float addrspace(1)* %18, align 4, !tbaa !14, !llvm.mem.parallel_loop_access !7 | |
%19 = add i32 %8, 1 | |
%20 = icmp slt i32 %19, %0 | |
br i1 %20, label %.bodyTop, label %.bodyEnd4.loopexit, !llvm.loop !8 | |
.bodyEnd4.loopexit: ; preds = %.bodyTop | |
br label %.bodyEnd4 | |
.bodyEnd4: ; preds = %.bodyEnd4.loopexit, %.bodyTop3 | |
%21 = add i32 %6, 1 | |
%22 = icmp slt i32 %21, %4 | |
br i1 %22, label %.bodyTop3, label %.exit2.loopexit, !llvm.loop !9 | |
.exit2.loopexit: ; preds = %.bodyEnd4 | |
br label %.exit2 | |
.exit2: ; preds = %.exit2.loopexit, %.entry | |
ret void | |
} | |
attributes #0 = { nounwind "_kernel_local_size"="0" "frame-pointer"="none" "less-precise-fpmad"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "no-realign-stack" "stack-protector-buffer-size"="0" "unsafe-fp-math"="false" "use-soft-float"="false" } | |
!opencl.kernels = !{!0} | |
!llvm.ident = !{!6} | |
!0 = !{void (float addrspace(1)*, float addrspace(1)*, float addrspace(1)*)* @matadd, !1, !2, !3, !4, !5} | |
!1 = !{!"kernel_arg_addr_space", i32 1, i32 1, i32 1} | |
!2 = !{!"kernel_arg_access_qual", !"none", !"none", !"none"} | |
!3 = !{!"kernel_arg_type", !"float*", !"float*", !"float*"} | |
!4 = !{!"kernel_arg_base_type", !"float*", !"float*", !"float*"} | |
!5 = !{!"kernel_arg_type_qual", !"const", !"const", !""} | |
!6 = !{!"clang version 3.6.0 (git://git.ti.com/opencl/clang.git 5b006f07bdc22b5ae6917eecdfe243908dd7b029) (git://git.ti.com/opencl/llvm.git 09780c6750b30da81e4a0a805aedf1699fbc37c7)"} | |
!7 = !{!8, !9} | |
!8 = distinct !{!8} | |
!9 = distinct !{!9} | |
!10 = !{!11, !11, i64 0} | |
!11 = !{!"int", !12, i64 0} | |
!12 = !{!"omnipotent char", !13, i64 0} | |
!13 = !{!"Simple C/C++ TBAA"} | |
!14 = !{!15, !15, i64 0} | |
!15 = !{!"float", !12, i64 0} |
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
/* Original code: | |
* The MIT License (MIT) | |
* Copyright (c) 2014 SURFsara | |
* https://github.com/CNugteren/myGEMM/blob/master/src/kernels.cl | |
*/ | |
#define ITYPE uint | |
#define TS (100) | |
/* work per thread */ | |
#define WPT (100 / 4) | |
// TS/WPT == RTS | |
#define RTS 4 | |
// Tiled and coalesced version | |
__kernel void | |
myGEMM4 (const __global float *A, const __global float *B, __global float *C, | |
uint M, uint N, uint K) | |
{ | |
// Thread identifiers | |
const ITYPE row = get_local_id (0); // Local row ID (max: TS) | |
const ITYPE col = get_local_id (1); // Local col ID (max: TS/WPT == RTS) | |
const ITYPE globalRow = TS * get_group_id (0) + row; // Row ID of C (0..M) | |
const ITYPE globalCol = TS * get_group_id (1) + col; // Col ID of C (0..N) | |
// Local memory to fit a tile of TS*TS elements of A and B | |
__local float Asub[TS][TS]; | |
__local float Bsub[TS][TS]; | |
// Initialise the accumulation registers | |
float acc[WPT]; | |
for (ITYPE w = 0; w < WPT; w++) | |
{ | |
acc[w] = 0.0f; | |
} | |
// Loop over all tiles | |
const ITYPE numTiles = K / TS; | |
for (ITYPE t = 0; t < numTiles; t++) | |
{ | |
// Load one tile of A and B into local memory | |
for (ITYPE w = 0; w < WPT; w++) | |
{ | |
const ITYPE tiledRow = TS * t + row; | |
const ITYPE tiledCol = TS * t + col; | |
Asub[col + w * RTS][row] = A[(tiledCol + w * RTS) * M + globalRow]; | |
Bsub[col + w * RTS][row] = B[(globalCol + w * RTS) * K + tiledRow]; | |
} | |
// Synchronise to make sure the tile is loaded | |
barrier (CLK_LOCAL_MEM_FENCE); | |
// Perform the computation for a single tile | |
for (ITYPE k = 0; k < TS; k++) | |
{ | |
for (ITYPE w = 0; w < WPT; w++) | |
{ | |
#ifdef USE_FMA | |
acc[w] = fma (Asub[k][row], Bsub[col + w * RTS][k], acc[w]); | |
#else | |
acc[w] += Asub[k][row] * Bsub[col + w * RTS][k]; | |
#endif | |
} | |
} | |
// Synchronise before loading the next tile | |
barrier (CLK_LOCAL_MEM_FENCE); | |
} | |
// Store the final results in C | |
for (ITYPE w = 0; w < WPT; w++) | |
{ | |
C[(globalCol + w * RTS) * M + globalRow] = acc[w]; | |
} | |
} | |
#define TRANSPOSEX 8 | |
#define TRANSPOSEY 8 | |
// Simple transpose kernel for a P * Q matrix | |
__kernel void | |
transpose (const ITYPE P, const ITYPE Q, const __global float *input, | |
__global float *output) | |
{ | |
// Thread identifiers | |
const ITYPE tx = get_local_id (0); | |
const ITYPE ty = get_local_id (1); | |
const ITYPE ID0 = get_group_id (0) * TRANSPOSEX + tx; // 0..P | |
const ITYPE ID1 = get_group_id (1) * TRANSPOSEY + ty; // 0..Q | |
// Set-up the local memory for shuffling | |
__local float buffer[TRANSPOSEX][TRANSPOSEY]; | |
// Swap the x and y coordinates to perform the rotation (coalesced) | |
// if (ID0 < P && ID1 < Q) { | |
buffer[ty][tx] = input[ID1 * P + ID0]; | |
// } | |
// Synchronise all threads | |
barrier (CLK_LOCAL_MEM_FENCE); | |
// We don't have to swap the x and y thread indices here, | |
// because that's already done in the local memory | |
const ITYPE newID0 = get_group_id (1) * TRANSPOSEY + tx; | |
const ITYPE newID1 = get_group_id (0) * TRANSPOSEX + ty; | |
// Store the transposed result (coalesced) | |
// if (newID0 < Q && newID1 < P) { | |
output[newID1 * Q + newID0] = buffer[tx][ty]; | |
// } | |
} |
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
; ModuleID = 'matrix1.bc' | |
source_filename = "matrix1.bc" | |
target datalayout = "e-m:e-p:32:32-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-v256:256-v512:512-v1024:1024" | |
target triple = "c6000-unknown-unknown-unknown" | |
@myGEMM4.Asub = internal unnamed_addr addrspace(3) global [100 x [100 x float]] undef, align 4 | |
@myGEMM4.Bsub = internal unnamed_addr addrspace(3) global [100 x [100 x float]] undef, align 4 | |
@transpose.buffer = internal unnamed_addr addrspace(3) global [8 x [8 x float]] undef, align 4 | |
@kernel_config_l2 = external addrspace(2) constant [32 x i32] | |
; Function Attrs: noduplicate | |
declare linkonce protected void @barrier(i32) #0 | |
; Function Attrs: noinline nounwind | |
define void @myGEMM4(float addrspace(1)* nocapture readonly %A, float addrspace(1)* nocapture readonly %B, float addrspace(1)* nocapture %C, i32 %M, i32 %N, i32 %K) #1 { | |
.r_entry: | |
%0 = call i32 @__core_num() | |
%1 = load i32, i32 addrspace(2)* getelementptr inbounds ([32 x i32], [32 x i32] addrspace(2)* @kernel_config_l2, i32 0, i32 15), align 4 | |
%2 = mul i32 %0, %1 | |
%3 = load i32, i32 addrspace(2)* getelementptr inbounds ([32 x i32], [32 x i32] addrspace(2)* @kernel_config_l2, i32 0, i32 14), align 4 | |
%4 = add i32 %3, %2 | |
%5 = load i32, i32 addrspace(2)* getelementptr inbounds ([32 x i32], [32 x i32] addrspace(2)* @kernel_config_l2, i32 0, i32 4), align 4 | |
%6 = load i32, i32 addrspace(2)* getelementptr inbounds ([32 x i32], [32 x i32] addrspace(2)* @kernel_config_l2, i32 0, i32 5), align 4 | |
%7 = mul i32 %5, %6 | |
%8 = mul i32 100, %7 | |
%9 = add i32 %4, %8 | |
%10 = add i32 %9, 7 | |
%11 = and i32 %10, -8 | |
%.acc.pocl_context = inttoptr i32 %4 to [25 x float]*, !ocl.restrict !4 | |
%12 = mul i32 4, %7 | |
%13 = add i32 %11, %12 | |
%14 = add i32 %13, 7 | |
%15 = and i32 %14, -8 | |
%.t.010.ex_phi.pocl_context = inttoptr i32 %11 to i32*, !ocl.restrict !4 | |
call void @barrier(i32 0) #5 | |
%16 = load i32, i32 addrspace(2)* getelementptr inbounds ([32 x i32], [32 x i32] addrspace(2)* @kernel_config_l2, i32 0, i32 10), align 4, !tbaa !5 | |
%17 = load i32, i32 addrspace(2)* getelementptr inbounds ([32 x i32], [32 x i32] addrspace(2)* @kernel_config_l2, i32 0, i32 7), align 4, !tbaa !5 | |
%18 = sub i32 %16, %17 | |
%19 = udiv i32 %18, %5 | |
%20 = mul i32 %19, 100 | |
%21 = load i32, i32 addrspace(2)* getelementptr inbounds ([32 x i32], [32 x i32] addrspace(2)* @kernel_config_l2, i32 0, i32 11), align 4, !tbaa !5 | |
%22 = load i32, i32 addrspace(2)* getelementptr inbounds ([32 x i32], [32 x i32] addrspace(2)* @kernel_config_l2, i32 0, i32 8), align 4, !tbaa !5 | |
%23 = sub i32 %21, %22 | |
%24 = udiv i32 %23, %6 | |
%25 = mul i32 %24, 100 | |
%26 = udiv i32 %K, 100 | |
%27 = icmp ugt i32 %K, 99 | |
%28 = bitcast [25 x float]* %.acc.pocl_context to i8* | |
call void @llvm.lifetime.start.p0i8(i64 100, i8* %28) #5 | |
call void @llvm.memset.p0i8.i32(i8* align 64 %28, i8 0, i32 100, i1 false) | |
br i1 %27, label %pregion_for_init38, label %.preheader.preheader | |
.preheader.preheader: ; preds = %.r_entry | |
br label %.preheader | |
pregion_for_init38: ; preds = %.r_entry | |
store i32 0, i32* %.t.010.ex_phi.pocl_context, align 64 | |
br label %pregion_for_entry.pregion_for_init35 | |
pregion_for_entry.pregion_for_init35: ; preds = %pregion_for_cond40, %pregion_for_init38 | |
%29 = phi i32 [ 0, %pregion_for_init38 ], [ %39, %pregion_for_cond40 ] | |
%30 = phi i32 [ 1, %pregion_for_init38 ], [ 0, %pregion_for_cond40 ] | |
%31 = mul i32 %29, %5 | |
br label %pregion_for_cond37 | |
pregion_for_cond37: ; preds = %pregion_for_cond37, %pregion_for_entry.pregion_for_init35 | |
%32 = phi i32 [ %30, %pregion_for_entry.pregion_for_init35 ], [ %37, %pregion_for_cond37 ] | |
%33 = add i32 %32, %31 | |
%34 = getelementptr [25 x float], [25 x float]* %.acc.pocl_context, i32 %33 | |
%35 = bitcast [25 x float]* %34 to i8* | |
call void @llvm.lifetime.start.p0i8(i64 100, i8* %35) #5, !llvm.mem.parallel_loop_access !9 | |
call void @llvm.memset.p0i8.i32(i8* align 4 %35, i8 0, i32 100, i1 false) | |
%36 = getelementptr i32, i32* %.t.010.ex_phi.pocl_context, i32 %33 | |
store i32 0, i32* %36, align 4, !llvm.mem.parallel_loop_access !9 | |
%37 = add i32 %32, 1 | |
%38 = icmp slt i32 %37, %5 | |
br i1 %38, label %pregion_for_cond37, label %pregion_for_cond40, !llvm.loop !10 | |
pregion_for_cond40: ; preds = %pregion_for_cond37 | |
%39 = add i32 %29, 1 | |
%40 = icmp slt i32 %39, %6 | |
br i1 %40, label %pregion_for_entry.pregion_for_init35, label %.preheader5.preheader.loopbarrier, !llvm.loop !11 | |
.preheader5.preheader.loopbarrier: ; preds = %pregion_for_cond40 | |
call void @barrier(i32 0) #5 | |
br label %pregion_for_entry.pregion_for_init29 | |
pregion_for_entry.pregion_for_init29: ; preds = %pregion_for_entry.pregion_for_init29.backedge, %.preheader5.preheader.loopbarrier | |
%41 = phi i32 [ 0, %.preheader5.preheader.loopbarrier ], [ %.be, %pregion_for_entry.pregion_for_init29.backedge ] | |
%42 = mul i32 %41, %5 | |
%43 = add i32 %25, %41 | |
br label %pregion_for_entry..preheader5 | |
pregion_for_entry..preheader5: ; preds = %pregion_for_cond31, %pregion_for_entry.pregion_for_init29 | |
%44 = phi i32 [ 0, %pregion_for_entry.pregion_for_init29 ], [ %70, %pregion_for_cond31 ] | |
%45 = add i32 %44, %42 | |
%46 = getelementptr i32, i32* %.t.010.ex_phi.pocl_context, i32 %45 | |
%47 = load i32, i32* %46, align 4, !llvm.mem.parallel_loop_access !12 | |
%48 = mul i32 %47, 100 | |
%49 = add i32 %48, %44 | |
%50 = add i32 %48, %41 | |
%51 = add i32 %20, %44 | |
br label %52 | |
52: ; preds = %52, %pregion_for_entry..preheader5 | |
%53 = phi i32 [ %68, %52 ], [ 0, %pregion_for_entry..preheader5 ] | |
%54 = shl i32 %53, 2 | |
%55 = add i32 %50, %54 | |
%56 = mul i32 %55, %M | |
%57 = add i32 %56, %51 | |
%58 = getelementptr inbounds float, float addrspace(1)* %A, i32 %57 | |
%59 = load float, float addrspace(1)* %58, align 4, !tbaa !15, !llvm.mem.parallel_loop_access !12 | |
%60 = add i32 %54, %41 | |
%61 = getelementptr inbounds [100 x [100 x float]], [100 x [100 x float]] addrspace(3)* @myGEMM4.Asub, i32 0, i32 %60, i32 %44 | |
store float %59, float addrspace(3)* %61, align 4, !tbaa !15, !llvm.mem.parallel_loop_access !12 | |
%62 = add i32 %54, %43 | |
%63 = mul i32 %62, %K | |
%64 = add i32 %49, %63 | |
%65 = getelementptr inbounds float, float addrspace(1)* %B, i32 %64 | |
%66 = load float, float addrspace(1)* %65, align 4, !tbaa !15, !llvm.mem.parallel_loop_access !12 | |
%67 = getelementptr inbounds [100 x [100 x float]], [100 x [100 x float]] addrspace(3)* @myGEMM4.Bsub, i32 0, i32 %60, i32 %44 | |
store float %66, float addrspace(3)* %67, align 4, !tbaa !15, !llvm.mem.parallel_loop_access !12 | |
%68 = add nuw nsw i32 %53, 1 | |
%69 = icmp ult i32 %68, 25 | |
br i1 %69, label %52, label %pregion_for_cond31 | |
pregion_for_cond31: ; preds = %52 | |
%70 = add i32 %44, 1 | |
%71 = icmp slt i32 %70, %5 | |
br i1 %71, label %pregion_for_entry..preheader5, label %pregion_for_cond34, !llvm.loop !13 | |
pregion_for_cond34: ; preds = %pregion_for_cond31 | |
%72 = add i32 %41, 1 | |
%73 = icmp slt i32 %72, %6 | |
br i1 %73, label %pregion_for_entry.pregion_for_init29.backedge, label %pregion_for_init26, !llvm.loop !14 | |
pregion_for_entry.pregion_for_init29.backedge: ; preds = %.brexitbarrier.latchbarrier.postbarrier, %pregion_for_cond34 | |
%.be = phi i32 [ %72, %pregion_for_cond34 ], [ 0, %.brexitbarrier.latchbarrier.postbarrier ] | |
br label %pregion_for_entry.pregion_for_init29 | |
pregion_for_init26: ; preds = %pregion_for_cond34 | |
tail call void @barrier(i32 1) #6 | |
br label %pregion_for_entry.pregion_for_init23 | |
pregion_for_entry.pregion_for_init23: ; preds = %pregion_for_cond28, %pregion_for_init26 | |
%74 = phi i32 [ 0, %pregion_for_init26 ], [ %98, %pregion_for_cond28 ] | |
%75 = mul i32 %74, %5 | |
br label %pregion_for_entry..postbarrier | |
pregion_for_entry..postbarrier: ; preds = %pregion_for_cond25, %pregion_for_entry.pregion_for_init23 | |
%76 = phi i32 [ 0, %pregion_for_entry.pregion_for_init23 ], [ %96, %pregion_for_cond25 ] | |
%77 = add i32 %76, %75 | |
br label %.preheader4 | |
.preheader4: ; preds = %93, %pregion_for_entry..postbarrier | |
%78 = phi i32 [ %94, %93 ], [ 0, %pregion_for_entry..postbarrier ] | |
%79 = getelementptr inbounds [100 x [100 x float]], [100 x [100 x float]] addrspace(3)* @myGEMM4.Asub, i32 0, i32 %78, i32 %76 | |
%80 = load float, float addrspace(3)* %79, align 4, !tbaa !15, !llvm.mem.parallel_loop_access !17 | |
br label %81 | |
81: ; preds = %81, %.preheader4 | |
%82 = phi i32 [ %91, %81 ], [ 0, %.preheader4 ] | |
%83 = shl i32 %82, 2 | |
%84 = add i32 %83, %74 | |
%85 = getelementptr inbounds [100 x [100 x float]], [100 x [100 x float]] addrspace(3)* @myGEMM4.Bsub, i32 0, i32 %84, i32 %78 | |
%86 = load float, float addrspace(3)* %85, align 4, !tbaa !15, !llvm.mem.parallel_loop_access !17 | |
%87 = fmul float %80, %86 | |
%88 = getelementptr inbounds [25 x float], [25 x float]* %.acc.pocl_context, i32 %77, i32 %82 | |
%89 = load float, float* %88, align 4, !tbaa !15, !llvm.mem.parallel_loop_access !17 | |
%90 = fadd float %89, %87 | |
store float %90, float* %88, align 4, !tbaa !15, !llvm.mem.parallel_loop_access !17 | |
%91 = add nuw nsw i32 %82, 1 | |
%92 = icmp ult i32 %91, 25 | |
br i1 %92, label %81, label %93 | |
93: ; preds = %81 | |
%94 = add nuw nsw i32 %78, 1 | |
%95 = icmp ult i32 %94, 100 | |
br i1 %95, label %.preheader4, label %pregion_for_cond25 | |
pregion_for_cond25: ; preds = %93 | |
%96 = add i32 %76, 1 | |
%97 = icmp slt i32 %96, %5 | |
br i1 %97, label %pregion_for_entry..postbarrier, label %pregion_for_cond28, !llvm.loop !18 | |
pregion_for_cond28: ; preds = %pregion_for_cond25 | |
%98 = add i32 %74, 1 | |
%99 = icmp slt i32 %98, %6 | |
br i1 %99, label %pregion_for_entry.pregion_for_init23, label %pregion_for_init20, !llvm.loop !19 | |
pregion_for_init20: ; preds = %pregion_for_cond28 | |
tail call void @barrier(i32 1) #6 | |
%100 = add nuw nsw i32 %47, 1 | |
br label %pregion_for_entry.pregion_for_init17 | |
pregion_for_entry.pregion_for_init17: ; preds = %pregion_for_cond22, %pregion_for_init20 | |
%101 = phi i32 [ 0, %pregion_for_init20 ], [ %108, %pregion_for_cond22 ] | |
%102 = mul i32 %101, %5 | |
br label %pregion_for_cond19 | |
pregion_for_cond19: ; preds = %pregion_for_cond19, %pregion_for_entry.pregion_for_init17 | |
%103 = phi i32 [ 0, %pregion_for_entry.pregion_for_init17 ], [ %106, %pregion_for_cond19 ] | |
%104 = add i32 %103, %102 | |
%105 = getelementptr i32, i32* %.t.010.ex_phi.pocl_context, i32 %104 | |
store i32 %100, i32* %105, align 4, !llvm.mem.parallel_loop_access !20 | |
%106 = add i32 %103, 1 | |
%107 = icmp slt i32 %106, %5 | |
br i1 %107, label %pregion_for_cond19, label %pregion_for_cond22, !llvm.loop !21 | |
pregion_for_cond22: ; preds = %pregion_for_cond19 | |
%108 = add i32 %101, 1 | |
%109 = icmp slt i32 %108, %6 | |
br i1 %109, label %pregion_for_entry.pregion_for_init17, label %.brexitbarrier.latchbarrier.postbarrier, !llvm.loop !22 | |
.brexitbarrier.latchbarrier.postbarrier: ; preds = %pregion_for_cond22 | |
%110 = icmp ult i32 %100, %26 | |
call void @barrier(i32 0) #5 | |
br i1 %110, label %pregion_for_entry.pregion_for_init29.backedge, label %pregion_for_entry.pregion_for_init.preheader | |
pregion_for_entry.pregion_for_init.preheader: ; preds = %.brexitbarrier.latchbarrier.postbarrier | |
br label %pregion_for_entry.pregion_for_init | |
pregion_for_entry.pregion_for_init: ; preds = %pregion_for_cond16, %pregion_for_entry.pregion_for_init.preheader | |
%111 = phi i32 [ %164, %pregion_for_cond16 ], [ 0, %pregion_for_entry.pregion_for_init.preheader ] | |
%112 = mul i32 %111, %5 | |
%113 = add i32 %25, %111 | |
br label %.preheader.preheader.btr | |
.preheader: ; preds = %.preheader, %.preheader.preheader | |
%114 = phi i32 [ %122, %.preheader ], [ 0, %.preheader.preheader ] | |
%115 = getelementptr inbounds [25 x float], [25 x float]* %.acc.pocl_context, i32 0, i32 %114 | |
%116 = load float, float* %115, align 4, !tbaa !15 | |
%117 = shl i32 %114, 2 | |
%118 = add i32 %117, %25 | |
%119 = mul i32 %118, %M | |
%120 = add i32 %119, %20 | |
%121 = getelementptr inbounds float, float addrspace(1)* %C, i32 %120 | |
store float %116, float addrspace(1)* %121, align 4, !tbaa !15 | |
%122 = add nuw nsw i32 %114, 1 | |
%123 = icmp ult i32 %122, 25 | |
br i1 %123, label %.preheader, label %pregion_for_init45 | |
.preheader.peeled_wi: ; preds = %.preheader.preheader.peeled_wi, %.preheader.peeled_wi | |
%124 = phi i32 [ 0, %.preheader.preheader.peeled_wi ], [ %132, %.preheader.peeled_wi ] | |
%125 = getelementptr inbounds [25 x float], [25 x float]* %.acc.pocl_context, i32 %135, i32 %124 | |
%126 = load float, float* %125, align 4, !tbaa !15, !llvm.mem.parallel_loop_access !23 | |
%127 = shl i32 %124, 2 | |
%128 = add i32 %127, %140 | |
%129 = mul i32 %128, %M | |
%130 = add i32 %129, %136 | |
%131 = getelementptr inbounds float, float addrspace(1)* %C, i32 %130 | |
store float %126, float addrspace(1)* %131, align 4, !tbaa !15, !llvm.mem.parallel_loop_access !23 | |
%132 = add nuw nsw i32 %124, 1 | |
%133 = icmp ult i32 %132, 25 | |
br i1 %133, label %.preheader.peeled_wi, label %pregion_for_cond44 | |
.preheader.preheader.peeled_wi: ; preds = %pregion_for_cond44, %pregion_for_entry.pregion_for_init42 | |
%134 = phi i32 [ %138, %pregion_for_entry.pregion_for_init42 ], [ %143, %pregion_for_cond44 ] | |
call void @llvm.lifetime.start.p0i8(i64 100, i8* %28) #5, !llvm.mem.parallel_loop_access !23 | |
call void @llvm.memset.p0i8.i32(i8* align 64 %28, i8 0, i32 100, i1 false) | |
%135 = add i32 %134, %139 | |
%136 = add i32 %20, %134 | |
br label %.preheader.peeled_wi | |
pregion_for_init45: ; preds = %.preheader | |
call void @llvm.lifetime.end.p0i8(i64 100, i8* %28) #5 | |
br label %pregion_for_entry.pregion_for_init42 | |
pregion_for_entry.pregion_for_init42: ; preds = %pregion_for_cond47, %pregion_for_init45 | |
%137 = phi i32 [ 0, %pregion_for_init45 ], [ %145, %pregion_for_cond47 ] | |
%138 = phi i32 [ 1, %pregion_for_init45 ], [ 0, %pregion_for_cond47 ] | |
%139 = mul i32 %137, %5 | |
%140 = add i32 %25, %137 | |
br label %.preheader.preheader.peeled_wi | |
pregion_for_cond44: ; preds = %.preheader.peeled_wi | |
%141 = getelementptr [25 x float], [25 x float]* %.acc.pocl_context, i32 %135 | |
%142 = bitcast [25 x float]* %141 to i8* | |
call void @llvm.lifetime.end.p0i8(i64 100, i8* %142) #5, !llvm.mem.parallel_loop_access !23 | |
%143 = add i32 %134, 1 | |
%144 = icmp slt i32 %143, %5 | |
br i1 %144, label %.preheader.preheader.peeled_wi, label %pregion_for_cond47, !llvm.loop !24 | |
pregion_for_cond47: ; preds = %pregion_for_cond44 | |
%145 = add i32 %137, 1 | |
%146 = icmp slt i32 %145, %6 | |
br i1 %146, label %pregion_for_entry.pregion_for_init42, label %exit.barrier, !llvm.loop !25 | |
exit.barrier: ; preds = %pregion_for_cond47 | |
call void @barrier(i32 0) #5 | |
br label %UnifiedReturnBlock | |
.preheader.preheader.btr: ; preds = %pregion_for_cond, %pregion_for_entry.pregion_for_init | |
%147 = phi i32 [ 0, %pregion_for_entry.pregion_for_init ], [ %162, %pregion_for_cond ] | |
%148 = add i32 %147, %112 | |
%149 = add i32 %20, %147 | |
br label %.preheader.btr | |
.preheader.btr: ; preds = %.preheader.btr, %.preheader.preheader.btr | |
%150 = phi i32 [ %158, %.preheader.btr ], [ 0, %.preheader.preheader.btr ] | |
%151 = getelementptr inbounds [25 x float], [25 x float]* %.acc.pocl_context, i32 %148, i32 %150 | |
%152 = load float, float* %151, align 4, !tbaa !15, !llvm.mem.parallel_loop_access !26 | |
%153 = shl i32 %150, 2 | |
%154 = add i32 %153, %113 | |
%155 = mul i32 %154, %M | |
%156 = add i32 %155, %149 | |
%157 = getelementptr inbounds float, float addrspace(1)* %C, i32 %156 | |
store float %152, float addrspace(1)* %157, align 4, !tbaa !15, !llvm.mem.parallel_loop_access !26 | |
%158 = add nuw nsw i32 %150, 1 | |
%159 = icmp ult i32 %158, 25 | |
br i1 %159, label %.preheader.btr, label %pregion_for_cond | |
pregion_for_cond: ; preds = %.preheader.btr | |
%160 = getelementptr [25 x float], [25 x float]* %.acc.pocl_context, i32 %148 | |
%161 = bitcast [25 x float]* %160 to i8* | |
call void @llvm.lifetime.end.p0i8(i64 100, i8* %161) #5, !llvm.mem.parallel_loop_access !26 | |
%162 = add i32 %147, 1 | |
%163 = icmp slt i32 %162, %5 | |
br i1 %163, label %.preheader.preheader.btr, label %pregion_for_cond16, !llvm.loop !27 | |
pregion_for_cond16: ; preds = %pregion_for_cond | |
%164 = add i32 %111, 1 | |
%165 = icmp slt i32 %164, %6 | |
br i1 %165, label %pregion_for_entry.pregion_for_init, label %exit.barrier14, !llvm.loop !28 | |
exit.barrier14: ; preds = %pregion_for_cond16 | |
call void @barrier(i32 0) #5 | |
br label %UnifiedReturnBlock | |
UnifiedReturnBlock: ; preds = %exit.barrier14, %exit.barrier | |
ret void | |
} | |
; Function Attrs: noinline nounwind | |
define void @transpose(i32 %P, i32 %Q, float addrspace(1)* nocapture readonly %input, float addrspace(1)* nocapture %output) #2 { | |
pregion_for_init6: | |
%0 = load i32, i32 addrspace(2)* getelementptr inbounds ([32 x i32], [32 x i32] addrspace(2)* @kernel_config_l2, i32 0, i32 4), align 4 | |
%1 = load i32, i32 addrspace(2)* getelementptr inbounds ([32 x i32], [32 x i32] addrspace(2)* @kernel_config_l2, i32 0, i32 5), align 4 | |
call void @barrier(i32 0) #5 | |
%2 = load i32, i32 addrspace(2)* getelementptr inbounds ([32 x i32], [32 x i32] addrspace(2)* @kernel_config_l2, i32 0, i32 10), align 4, !tbaa !5 | |
%3 = load i32, i32 addrspace(2)* getelementptr inbounds ([32 x i32], [32 x i32] addrspace(2)* @kernel_config_l2, i32 0, i32 7), align 4, !tbaa !5 | |
%4 = sub i32 %2, %3 | |
%5 = udiv i32 %4, %0 | |
%6 = shl i32 %5, 3 | |
%7 = load i32, i32 addrspace(2)* getelementptr inbounds ([32 x i32], [32 x i32] addrspace(2)* @kernel_config_l2, i32 0, i32 11), align 4, !tbaa !5 | |
%8 = load i32, i32 addrspace(2)* getelementptr inbounds ([32 x i32], [32 x i32] addrspace(2)* @kernel_config_l2, i32 0, i32 8), align 4, !tbaa !5 | |
%9 = sub i32 %7, %8 | |
%10 = udiv i32 %9, %1 | |
%11 = shl i32 %10, 3 | |
br label %pregion_for_entry.pregion_for_init3 | |
pregion_for_entry.pregion_for_init3: ; preds = %pregion_for_cond8, %pregion_for_init6 | |
%12 = phi i32 [ 0, %pregion_for_init6 ], [ %23, %pregion_for_cond8 ] | |
%13 = add i32 %11, %12 | |
%14 = mul i32 %13, %P | |
br label %pregion_for_cond5 | |
pregion_for_cond5: ; preds = %pregion_for_cond5, %pregion_for_entry.pregion_for_init3 | |
%15 = phi i32 [ 0, %pregion_for_entry.pregion_for_init3 ], [ %21, %pregion_for_cond5 ] | |
%16 = add i32 %6, %15 | |
%17 = add i32 %16, %14 | |
%18 = getelementptr inbounds float, float addrspace(1)* %input, i32 %17 | |
%19 = load float, float addrspace(1)* %18, align 4, !tbaa !15, !llvm.mem.parallel_loop_access !29 | |
%20 = getelementptr inbounds [8 x [8 x float]], [8 x [8 x float]] addrspace(3)* @transpose.buffer, i32 0, i32 %12, i32 %15 | |
store float %19, float addrspace(3)* %20, align 4, !tbaa !15, !llvm.mem.parallel_loop_access !29 | |
%21 = add i32 %15, 1 | |
%22 = icmp slt i32 %21, %0 | |
br i1 %22, label %pregion_for_cond5, label %pregion_for_cond8, !llvm.loop !30 | |
pregion_for_cond8: ; preds = %pregion_for_cond5 | |
%23 = add i32 %12, 1 | |
%24 = icmp slt i32 %23, %1 | |
br i1 %24, label %pregion_for_entry.pregion_for_init3, label %pregion_for_init, !llvm.loop !31 | |
pregion_for_init: ; preds = %pregion_for_cond8 | |
tail call void @barrier(i32 1) #6 | |
br label %pregion_for_entry.pregion_for_init | |
pregion_for_entry.pregion_for_init: ; preds = %pregion_for_cond2, %pregion_for_init | |
%25 = phi i32 [ 0, %pregion_for_init ], [ %36, %pregion_for_cond2 ] | |
%26 = add i32 %6, %25 | |
%27 = mul i32 %26, %Q | |
br label %pregion_for_cond | |
pregion_for_cond: ; preds = %pregion_for_cond, %pregion_for_entry.pregion_for_init | |
%28 = phi i32 [ 0, %pregion_for_entry.pregion_for_init ], [ %34, %pregion_for_cond ] | |
%29 = add i32 %11, %28 | |
%30 = getelementptr inbounds [8 x [8 x float]], [8 x [8 x float]] addrspace(3)* @transpose.buffer, i32 0, i32 %28, i32 %25 | |
%31 = load float, float addrspace(3)* %30, align 4, !tbaa !15, !llvm.mem.parallel_loop_access !32 | |
%32 = add i32 %29, %27 | |
%33 = getelementptr inbounds float, float addrspace(1)* %output, i32 %32 | |
store float %31, float addrspace(1)* %33, align 4, !tbaa !15, !llvm.mem.parallel_loop_access !32 | |
%34 = add i32 %28, 1 | |
%35 = icmp slt i32 %34, %0 | |
br i1 %35, label %pregion_for_cond, label %pregion_for_cond2, !llvm.loop !33 | |
pregion_for_cond2: ; preds = %pregion_for_cond | |
%36 = add i32 %25, 1 | |
%37 = icmp slt i32 %36, %1 | |
br i1 %37, label %pregion_for_entry.pregion_for_init, label %exit.barrier, !llvm.loop !34 | |
exit.barrier: ; preds = %pregion_for_cond2 | |
call void @barrier(i32 0) #5 | |
ret void | |
} | |
declare i32 @__core_num() | |
; Function Attrs: argmemonly nounwind willreturn | |
declare void @llvm.lifetime.start.p0i8(i64 immarg, i8* nocapture) #3 | |
; Function Attrs: argmemonly nounwind willreturn | |
declare void @llvm.lifetime.end.p0i8(i64 immarg, i8* nocapture) #3 | |
; Function Attrs: argmemonly nounwind willreturn writeonly | |
declare void @llvm.memset.p0i8.i32(i8* nocapture writeonly, i8, i32, i1 immarg) #4 | |
attributes #0 = { noduplicate "frame-pointer"="none" "less-precise-fpmad"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "no-realign-stack" "stack-protector-buffer-size"="0" "unsafe-fp-math"="false" "use-soft-float"="false" } | |
attributes #1 = { noinline nounwind "_kernel_local_size"="80000" "_wi_alloca_size"="108" "frame-pointer"="none" "less-precise-fpmad"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "no-realign-stack" "stack-protector-buffer-size"="0" "unsafe-fp-math"="false" "use-soft-float"="false" } | |
attributes #2 = { noinline nounwind "_kernel_local_size"="256" "frame-pointer"="none" "less-precise-fpmad"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "no-realign-stack" "stack-protector-buffer-size"="0" "unsafe-fp-math"="false" "use-soft-float"="false" } | |
attributes #3 = { argmemonly nounwind willreturn } | |
attributes #4 = { argmemonly nounwind willreturn writeonly } | |
attributes #5 = { nounwind } | |
attributes #6 = { noduplicate nounwind } | |
!llvm.ident = !{!0} | |
!ocl.restrict = !{!1} | |
!opencl.kernels = !{!2, !3} | |
!0 = !{!"clang version 3.6.0 (git://git.ti.com/opencl/clang.git 5b006f07bdc22b5ae6917eecdfe243908dd7b029) (git://git.ti.com/opencl/llvm.git 09780c6750b30da81e4a0a805aedf1699fbc37c7)"} | |
!1 = distinct !{!1} | |
!2 = !{void (float addrspace(1)*, float addrspace(1)*, float addrspace(1)*, i32, i32, i32)* @myGEMM4} | |
!3 = !{void (i32, i32, float addrspace(1)*, float addrspace(1)*)* @transpose} | |
!4 = distinct !{!4} | |
!5 = !{!6, !6, i64 0} | |
!6 = !{!"int", !7, i64 0} | |
!7 = !{!"omnipotent char", !8, i64 0} | |
!8 = !{!"Simple C/C++ TBAA"} | |
!9 = !{!10, !11} | |
!10 = distinct !{!10} | |
!11 = distinct !{!11} | |
!12 = !{!13, !14} | |
!13 = distinct !{!13} | |
!14 = distinct !{!14} | |
!15 = !{!16, !16, i64 0} | |
!16 = !{!"float", !7, i64 0} | |
!17 = !{!18, !19} | |
!18 = distinct !{!18} | |
!19 = distinct !{!19} | |
!20 = !{!21, !22} | |
!21 = distinct !{!21} | |
!22 = distinct !{!22} | |
!23 = !{!24, !25} | |
!24 = distinct !{!24} | |
!25 = distinct !{!25} | |
!26 = !{!27, !28} | |
!27 = distinct !{!27} | |
!28 = distinct !{!28} | |
!29 = !{!30, !31} | |
!30 = distinct !{!30} | |
!31 = distinct !{!31} | |
!32 = !{!33, !34} | |
!33 = distinct !{!33} | |
!34 = distinct !{!34} |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment