currybab's blog

pmpp lecture 09 stencil 요약

source: Lecture 09 - Stencil

Stencil

Stencil Code

#define BLOCK_DIM 8

__global__ void stencil_kernel(float* in, float* out, unsigned int N) {
    unsigned int i = blockIdx.z * blockDim.z + threadIdx.z;
    unsigned int j = blockIdx.y * blockDim.y + threadIdx.y;
    unsigned int k = blockIdx.x * blockDim.x + threadIdx.x;
    
    if (i >= 1 && i < N - 1 && j >= 1 && j < N - 1 && k >= 1 && k < N - 1) {
        out[i * N * N + j * N + k] = C0 * in[i * N * N + j * N + k]
                                    + C1 * (in[i * N * N + j * N + (k - 1)]
                                        + in[i * N * N + j * N + (k + 1)]
                                        + in[i * N * N + (j - 1) * N + k]
                                        + in[i * N * N + (j + 1) * N + k]
                                        + in[(i - 1) * N * N + j * N + k]
                                        + in[(i + 1) * N * N + j * N + k]);
    }
}

Data Reuse in Stencil (tiling)

#define BLOCK_DIM 8
#define IN_TILE_DIM BLOCK_DIM
#define OUT_TILE_DIM (IN_TILE_DIM - 2)

__global__ void stencil_kernel(float* in, float* out, unsigned int N) {
    unsigned int i = blockIdx.z * OUT_TILE_DIM + threadIdx.z - 1;
    unsigned int j = blockIdx.y * OUT_TILE_DIM + threadIdx.y - 1;
    unsigned int k = blockIdx.x * OUT_TILE_DIM + threadIdx.x - 1;

    // 입력 타일 로드
    __shared__ float in_s[IN_TILE_DIM][IN_TILE_DIM][IN_TILE_DIM];
    if (i >= 0 && i < N && j >= 0 && j < N && k >= 0 && k < N) {
        in_s[threadIdx.z][threadIdx.y][threadIdx.x] = in[i * N * N + j * N + k];
    }
    __syncthreads();
    
    if (i >= 1 && i < N - 1 && j >= 1 && j < N - 1 && k >= 1 && k < N - 1) {
        if (threadIdx.x >= 1 && threadIdx.x < blockDim.x - 1 
            && threadIdx.y >= 1 && threadIdx.y < blockDim.y - 1 
            && threadIdx.z >= 1 && threadIdx.z < blockDim.z - 1) {
            out[i * N * N + j * N + k] = C0 * in_s[threadIdx.z][threadIdx.y][threadIdx.x]
                                        + C1 * (in_s[threadIdx.z][threadIdx.y][threadIdx.x - 1]
                                            + in_s[threadIdx.z][threadIdx.y][threadIdx.x + 1]
                                            + in_s[threadIdx.z][threadIdx.y - 1][threadIdx.x]
                                            + in_s[threadIdx.z][threadIdx.y + 1][threadIdx.x]
                                            + in_s[threadIdx.z - 1][threadIdx.y][threadIdx.x]
                                            + in_s[threadIdx.z + 1][threadIdx.y][threadIdx.x]);
        }
    }
}

ratio analyze (왜 성능 향상이 없었는가)

Input tile size 늘리기

Stencil with Thread Coarsening Code

__global__ void stencil_kernel(float* in, float* out, unsigned int N) {
    int iStart = blockIdx.z * OUT_TILE_DIM;
    int j = blockIdx.y * OUT_TILE_DIM + threadIdx.y - 1;
    int k = blockIdx.x * OUT_TILE_DIM + threadIdx.x - 1;

    __shared__ float inPrev_s[IN_TILE_DIM][IN_TILE_DIM];
    __shared__ float inCurr_s[IN_TILE_DIM][IN_TILE_DIM];
    __shared__ float inNext_s[IN_TILE_DIM][IN_TILE_DIM];
    if (iStart - 1 >= 0 && iStart - 1 < N && j >= 0 && j < N && k >= 0 && k < N) {
        inPrev_s[threadIdx.y][threadIdx.x] = in[(iStart - 1) * N * N + j * N + k];
    }
    if (iStart >= 0 && iStart < N && j >= 0 && j < N && k >= 0 && k < N) {
        inCurr_s[threadIdx.y][threadIdx.x] = in[iStart * N * N + j * N + k];
    }
    __syncthreads();

    for (int i = iStart; i < iStart + OUT_TILE_DIM; i++) {
        if (i + 1 >= 0 && i + 1 < N && j >= 0 && j < N && k >= 0 && k < N) {
            inNext_s[threadIdx.y][threadIdx.x] = in[(i + 1) * N * N + j * N + k];
        }
        __syncthreads();
        if (i >= 1 && i < N - 1 && j >= 1 && j < N - 1 && k >= 1 && k < N - 1) {
            if (threadIdx.x >= 1 && threadIdx.x < blockDim.x - 1 
            && threadIdx.y >= 1 && threadIdx.y < blockDim.y - 1) {
                out[i * N * N + j * N + k] = C0 * inCurr_s[threadIdx.y][threadIdx.x]
                                        + C1 * (inPrev_s[threadIdx.y][threadIdx.x]
                                            + inNext_s[threadIdx.y][threadIdx.x]
                                            + inCurr_s[threadIdx.y][threadIdx.x - 1]
                                            + inCurr_s[threadIdx.y][threadIdx.x + 1]
                                            + inCurr_s[threadIdx.y - 1][threadIdx.x]
                                            + inCurr_s[threadIdx.y + 1][threadIdx.x]);
            }
        }
        __syncthreads();
        inPrev_s[threadIdx.y][threadIdx.x] = inCurr_s[threadIdx.y][threadIdx.x];
        inCurr_s[threadIdx.y][threadIdx.x] = inNext_s[threadIdx.y][threadIdx.x];
    }
}

Unshared Slices

Stencil with Register Tiling Code

__global__ void stencil_kernel(float* in, float* out, unsigned int N) {
    int iStart = blockIdx.z * OUT_TILE_DIM;
    int j = blockIdx.y * OUT_TILE_DIM + threadIdx.y - 1;
    int k = blockIdx.x * OUT_TILE_DIM + threadIdx.x - 1;

    float inPrev_s;
    __shared__ float inCurr_s[IN_TILE_DIM][IN_TILE_DIM];
    float inNext_s;
    if (iStart - 1 >= 0 && iStart - 1 < N && j >= 0 && j < N && k >= 0 && k < N) {
        inPrev_s = in[(iStart - 1) * N * N + j * N + k];
    }
    if (iStart >= 0 && iStart < N && j >= 0 && j < N && k >= 0 && k < N) {
        inCurr_s[threadIdx.y][threadIdx.x] = in[iStart * N * N + j * N + k];
    }
    __syncthreads();

    for (int i = iStart; i < iStart + OUT_TILE_DIM; i++) {
        if (i + 1 >= 0 && i + 1 < N && j >= 0 && j < N && k >= 0 && k < N) {
            inNext_s = in[(i + 1) * N * N + j * N + k];
        }
        __syncthreads();
        if (i >= 1 && i < N - 1 && j >= 1 && j < N - 1 && k >= 1 && k < N - 1) {
            if (threadIdx.x >= 1 && threadIdx.x < blockDim.x - 1 
            && threadIdx.y >= 1 && threadIdx.y < blockDim.y - 1) {
                out[i * N * N + j * N + k] = C0 * inCurr_s[threadIdx.y][threadIdx.x]
                                        + C1 * (inPrev_s
                                            + inNext_s
                                            + inCurr_s[threadIdx.y][threadIdx.x - 1]
                                            + inCurr_s[threadIdx.y][threadIdx.x + 1]
                                            + inCurr_s[threadIdx.y - 1][threadIdx.x]
                                            + inCurr_s[threadIdx.y + 1][threadIdx.x]);
            }
        }
        __syncthreads();
        inPrev_s = inCurr_s[threadIdx.y][threadIdx.x];
        inCurr_s[threadIdx.y][threadIdx.x] = inNext_s;
    }
}

Q&A

#blog #cuda #gpu #pmpp