currybab's blog

pmpp lecture 12 Brent-Kung scan 요약

Source: Lecture 12 - Scan (Brent Kung)

Today

Brent-Kung Parallel Inclusive Scan

brent-kung-inclusive-scan

Kogge-Stone vs Brent-Kung

kogge-stone-vs-brent-kung

Optimizations

Re-index

reindex-threads-to-minimize-divergence

    __global__ void scan_kernel(float* input, float* output, float* partialSums, int N) {
        unsigned int segment = blockIdx.x * blockDim.x * 2; // 연산 스레드 하나당 두개의 값을 로드해놔야함.

        __shared__ float buffer_s[2*BLOCK_DIM];
        buffer_s[threadIdx.x] = input[segment + threadIdx.x];
        buffer_s[threadIdx.x + BLOCK_DIM] = input[segment + threadIdx.x + BLOCK_DIM];
        __syncthreads();

        // Reduction step
        for (unsigned int stride = 1; stride <= BLOCK_DIM; stride *= 2) {
            unsigned int i = (threadIdx.x + 1) * 2 * stride - 1;
            if (i < 2 * BLOCK_DIM) {
                buffer_s[i] += buffer_s[i - stride];
            }
            __syncthreads();
        }

        // Post-reduction step
        for (unsigned int stride = BLOCK_DIM / 2; stride > 0; stride /= 2) {
            unsigned int i = (threadIdx.x + 1) * 2 * stride - 1;
            if (i + stride < 2 * BLOCK_DIM) {
                buffer_s[i + stride] += buffer_s[i];
            }
            __syncthreads();
        }

        if (threadIdx.x == 0) {
            partialSums[blockIdx.x] = buffer_s[2 * BLOCK_DIM - 1]
        }
        output[segment + threadIdx.x] = buffer_s[threadIdx.x];
        output[segment + threadIdx.x + BLOCK_DIM] = buffer_s[threadIdx.x + BLOCK_DIM];
    }

Brent-Kung Exclusive Scan

Brent-Kung Exclusive Scan

작업효율성(실제 발생하는일)

Thread Coarsening In Parallel Scan

    #define BLOCK_DIM 1024
    #define COARSE_FACTOR 8

    __global__ void thread_coarsened_scan(float* input, float* output, float* partialSums, unsigned int N) {
        unsigned int bSegment = BLOCK_DIM * COARSE_FACTOR * blockIdx.x;

        __shared__ float buffer_s[BLOCK_DIM * COARSE_FACTOR];

        for (unsigned int c = 0; c < COARSE_FACTOR; ++c) {
            buffer_s[c * BLOCK_DIM + threadIdx.x] = input[bSegment + c * BLOCK_DIM + threadIdx.x];
        }
        __syncthreads();

        // Thread scan
        unsigned int tSegment = COARSE_FACTOR * threadIdx.x;
        for (unsigned int c = 1; c < COARSE_FACTOR; ++c) {
            buffer_s[tSegment + c] += buffer_s[tSegment + c - 1];
        }

        __shared__ float buffer1_s[BLOCK_DIM];
        __shared__ float buffer2_s[BLOCK_DIM];
        float* inBuffer_s = buffer1_s;
        float* outBuffer_s = buffer2_s;
        inBuffer_s[threadIdx.x] = buffer_s[tSegment + COARSE_FACTOR - 1];
        __syncthreads();

        for (int stride = 1; stride <= BLOCK_DIM / 2; stride *= 2) {
            if (threadIdx.x >= stride) {
                outBuffer_s[threadIdx.x] = inBuffer_s[threadIdx.x] + inBuffer_s[threadIdx.x - stride];
            } else {
                outBuffer_s[threadIdx.x] = inBuffer_s[threadIdx.x];
            }
            __syncthreads();
            
            float* temp = inBuffer_s;
            inBuffer_s = outBuffer_s;
            outBuffer_s = temp;
        }

        if(threadIdx.x > 0) {
            for (unsigned int c = 0; c < COARSE_FACTOR; ++c) {
                buffer_s[tSegment + c] += inBuffer_s[threadIdx.x - 1];
            }
        }

        if (threadIdx.x == BLOCK_DIM - 1) {
            partialSums[blockIdx.x] = inBuffer_s[threadIdx.x];
        }
        __syncthreads();
        for (unsigned int c = 0; c < COARSE_FACTOR; ++c) {
            output[bSegment + c * BLOCK_DIM + threadIdx.x] = buffer_s[c * BLOCK_DIM + threadIdx.x];
        }
    }

#blog #cuda #gpu #pmpp