currybab's blog

pmpp lecture 10 reduction 요약

Source: Lecture 10 - Reduction

Reduction

Sequential Reduction

    acc = IDENTITY; // 항원으로 초기화
    for (i = 0 ; i < N; ++i) { // 0부터 N까지 반복
        acc = f(acc, input[i]); // 배열을 반복하면서 어떤 연산을 적용함.
    }

Parallel Reduction

Segment Reduction

Reduction Tree (Per Block)

Reduction Tree Per Block

    __global__ void reduce_kernel(float* input, float* partialSums, unsigned int N) {
        unsigned int segment = (blockIdx.x * blockDim.x) * 2;
        unsigned int i = segment + threadIdx.x * 2;

        for (unsigned int stride = 1;  stride <= BLOCK_DIM; stride *= 2) {
            if (threadIdx.x % stride == 0) {
                input[i] += input[i + stride];
            }
            __syncthreads();
        }
        if (threadIdx.x == 0) {
            partialSums[blockIdx.x] = input[segment];
        }
    }

Reduction Code 관찰

Coalescing and Minimizing Control divergence

Control divergence problem

Coalescing and Minimizing Divergence

    __global__ void reduce_kernel(float* input, float* partialSums, unsigned int N) {
        unsigned int segment = (blockIdx.x * blockDim.x) * 2;
        unsigned int i = segment + threadIdx.x;

        for (unsigned int stride = BLOCK_DIM;  stride > 0; stride /= 2) {
            if (threadIdx.x < stride) {
                input[i] += input[i + stride];
            }
            __syncthreads();
        }
        if (threadIdx.x == 0) {
            partialSums[blockIdx.x] = input[segment];
        }
    }

Control Divergence Minimized

Data Reuse Using Shared Memory

Data Reuse Using Shared Memory

    __global__ void reduce_kernel(float* input, float* partialSums, unsigned int N) {
        unsigned int segment = (blockIdx.x * blockDim.x) * 2;
        unsigned int i = segment + threadIdx.x;

        __shared__ float input_s[BLOCK_DIM];
        input_s[threadIdx.x] = input[i] + input[i + BLOCK_DIM];
        __syncthreads();

        for (unsigned int stride = BLOCK_DIM / 2;  stride > 0; stride /= 2) {
            if (threadIdx.x < stride) {
                input_s[threadIdx.x] += input_s[threadIdx.x + stride];
            }
            __syncthreads();
        }
        if (threadIdx.x == 0) {
            partialSums[blockIdx.x] = input_s[0];
        }
    }
    // 실행 결과 2.15ms kernel time

Thread Coarsening

Thread Coarsening 적용

Apply Thread Coarsening

    #define BLOCK_DIM 1024
    #define COARSE_FACTOR 4

    __global__ void reduce_kernel(float* input, float* partialSums, unsigned int N) {
        unsigned int segment = (blockIdx.x * blockDim.x) * 2 * COARSE_FACTOR;
        unsigned int i = segment + threadIdx.x;

        __shared__ float input_s[BLOCK_DIM];
        float sum = 0.0f;
        for (unsigned int tile = 0; tile < COARSE_FACTOR; ++tile) {
            if (i + tile * BLOCK_DIM < N) {
                sum += input[i + tile * BLOCK_DIM];
            }
        }
        input_s[threadIdx.x] = sum;
        __syncthreads();

        for (unsigned int stride = BLOCK_DIM / 2;  stride > 0; stride /= 2) {
            if (threadIdx.x < stride) {
                input_s[threadIdx.x] += input_s[threadIdx.x + stride];
            }
            __syncthreads();
        }
        if (threadIdx.x == 0) {
            partialSums[blockIdx.x] = input_s[0];
        }
    }
    // 실행 결과 0.677ms

Coarsening 강점

#blog #cuda #gpu #pmpp