@alecco
Created August 15, 2025 09:24
Example of using static shared memory with array arithmetic
// This is a small example based on a question on GPU mode #beginner
#include <cuda_runtime.h>
#include <stdio.h>
#define WARP_SIZE 32
#define ACTUAL_WARPS_PER_BLOCK 8
#define WARPS_EACH_BLK 4
__global__ void sharedMemoryErrorKernel() {
    __shared__ unsigned int sat[WARPS_EACH_BLK];
    __shared__ unsigned int commons[WARPS_EACH_BLK];
    unsigned int tid = threadIdx.x;
    unsigned int warp_id = tid / WARP_SIZE;
    unsigned int lane_id = tid % WARP_SIZE;
    // In the original question, 'local_sat[0]' is equivalent to 'sat[warp_id]'.
    // For an in-bounds access, 'warp_id' must be < 'WARPS_EACH_BLK';
    // here warps 4..7 write past the end of both arrays.
    if (lane_id == 0) {
        sat[warp_id] = warp_id;
        commons[warp_id] = warp_id;
    }
    __syncthreads();
    // Read back
    if (lane_id == 0) {
        printf("Thread %u (Warp %u, Lane %u):\n", tid, warp_id, lane_id);
        printf("  sat[%u]: %u\n", warp_id, sat[warp_id]);
        printf("  commons[%u]: %u\n", warp_id, commons[warp_id]);
    }
}
int main() {
    const int threadsPerBlock = ACTUAL_WARPS_PER_BLOCK * WARP_SIZE; // 8 * 32 = 256 threads
    sharedMemoryErrorKernel<<<1, threadsPerBlock>>>();
    cudaError_t err = cudaDeviceSynchronize();
    if (err != cudaSuccess) {
        fprintf(stderr, "CUDA Runtime Error: %s\n", cudaGetErrorString(err));
    }
    return 0;
}
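The bug the kernel demonstrates: the launch configures `ACTUAL_WARPS_PER_BLOCK` (8) warps, but the shared arrays hold only `WARPS_EACH_BLK` (4) entries, so warps 4 through 7 write out of bounds. A minimal sketch of the fix (kernel name is my own, not from the gist) is to size the static shared arrays by the warp count actually launched:

```cuda
// Sketch of a fix: the array size and the launch config come from the
// same constant, so every warp_id indexes in bounds.
__global__ void sharedMemoryFixedKernel() {
    __shared__ unsigned int sat[ACTUAL_WARPS_PER_BLOCK];
    __shared__ unsigned int commons[ACTUAL_WARPS_PER_BLOCK];
    unsigned int warp_id = threadIdx.x / WARP_SIZE;
    unsigned int lane_id = threadIdx.x % WARP_SIZE;
    if (lane_id == 0) {   // one writer per warp, now always in bounds
        sat[warp_id] = warp_id;
        commons[warp_id] = warp_id;
    }
    __syncthreads();
}
```

Launched as `sharedMemoryFixedKernel<<<1, ACTUAL_WARPS_PER_BLOCK * WARP_SIZE>>>()`, every write lands inside the arrays.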
alecco commented Aug 15, 2025

You are probably either miscalculating warp_id, or WARPS_EACH_BLK is ill-defined. I'd start by printing both.

Also, once you master static shared memory, you will probably want to switch to dynamic shared memory eventually. It's a bit harder to use and reason about, but it's the normal CUDA way.
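For reference, a minimal sketch of the dynamic variant (kernel name and structure are my own, not from the gist): the two per-warp arrays are carved out of one `extern __shared__` buffer whose size is supplied as the third launch parameter.

```cuda
#include <cuda_runtime.h>
#include <stdio.h>
#define WARP_SIZE 32

__global__ void dynSharedKernel(int warps_per_block) {
    extern __shared__ unsigned int smem[];           // sized at launch time
    unsigned int *sat     = smem;                    // first warps_per_block entries
    unsigned int *commons = smem + warps_per_block;  // next warps_per_block entries
    unsigned int warp_id = threadIdx.x / WARP_SIZE;
    unsigned int lane_id = threadIdx.x % WARP_SIZE;
    if (lane_id == 0) {
        sat[warp_id] = warp_id;
        commons[warp_id] = warp_id;
    }
    __syncthreads();
    if (lane_id == 0)
        printf("warp %u: sat=%u commons=%u\n", warp_id, sat[warp_id], commons[warp_id]);
}

int main() {
    const int warps = 8;
    const int threads = warps * WARP_SIZE;
    // Third launch parameter = bytes of dynamic shared memory per block.
    size_t smemBytes = 2 * warps * sizeof(unsigned int);
    dynSharedKernel<<<1, threads, smemBytes>>>(warps);
    cudaError_t err = cudaDeviceSynchronize();
    if (err != cudaSuccess)
        fprintf(stderr, "CUDA Runtime Error: %s\n", cudaGetErrorString(err));
    return 0;
}
```

The upside is that the same compiled kernel can serve any warps-per-block count; the cost is doing the pointer arithmetic over the single buffer yourself.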
