currybab's blog

pmpp lecture 11 Kogge-Stone scan 요약

Source: Lecture 11 - Scan (Kogge Stone)

Today

Scan

Scan Example

Sequential Scan

    // inclusive scan
    output[0] = input[0];
    for (int i = 1; i < n; i++) {
        output[i] = f(output[i-1], input[i]);
    }

    // exclusive scan
    output[0] = IDENTITY;
    for (int i = 1; i < n; i++) {
        output[i] = f(output[i-1], input[i-1]);
    }

Segmented Scan

segmented scan

Kogge-Stone Parallel (Inclusive) Scan

kogge-stone inclusive scan

    __global__ void scan_kernel(float* input, float* output, float* partialSums, int N) {
        unsigned int i = blockIdx.x * blockDim.x + threadIdx.x;

        output[i] = input[i];
        __syncthreads();

        for (int stride = 1; stride <= BLOCK_DIM / 2; stride *= 2) {
            float v;
            if (threadIdx.x >= stride) {
                v = output[i - stride];
            }
            __syncthreads(); // wait for everyone to read before updating
            if (threadIdx.x >= stride) {
                output[i] += v;
            }
            __syncthreads();
        }

        if (threadIdx.x == BLOCK_DIM - 1) {
            partialSums[blockIdx.x] = output[i];
        }    
    }

    __global__ void add_kernel(float* output, float* partialSums, unsigned int N) {
        unsigned int i = blockIdx.x * blockDim.x + threadIdx.x;

        if (blockIdx.x > 0) {
            output[i] += partialSums[blockIdx.x - 1];
        }
    }
    #define BLOCK_DIM 1024
    __global__ void scan_kernel(float* input, float* output, float* partialSums, int N) {
        unsigned int i = blockIdx.x * blockDim.x + threadIdx.x;

        __shared__ float buffer_s[BLOCK_DIM];
        buffer_s[threadIdx.x] = input[i];
        __syncthreads();

        for (int stride = 1; stride <= BLOCK_DIM / 2; stride *= 2) {
            float v;
            if (threadIdx.x >= stride) {
                v = buffer_s[threadIdx.x - stride];
            }
            __syncthreads(); // wait for everyone to read before updating
            if (threadIdx.x >= stride) {
                buffer_s[threadIdx.x] += v;
            }
            __syncthreads();
        }

        if (threadIdx.x == BLOCK_DIM - 1) {
            partialSums[blockIdx.x] = buffer_s[threadIdx.x];
        }    
        output[i] = buffer_s[threadIdx.x];
    }

Double Buffering

double buffering

    #define BLOCK_DIM 1024
    __global__ void scan_kernel(float* input, float* output, float* partialSums, int N) {
        unsigned int i = blockIdx.x * blockDim.x + threadIdx.x;

        __shared__ float buffer1_s[BLOCK_DIM];
        __shared__ float buffer2_s[BLOCK_DIM];
        float* inBuffer_s = buffer1_s;
        float* outBuffer_s = buffer2_s;

        inBuffer_s[threadIdx.x] = input[i];
        __syncthreads();

        for (int stride = 1; stride <= BLOCK_DIM / 2; stride *= 2) {
            if (threadIdx.x >= stride) {
                outBuffer_s[threadIdx.x] = inBuffer_s[threadIdx.x] + inBuffer_s[threadIdx.x - stride];
            } else {
                outBuffer_s[threadIdx.x] = inBuffer_s[threadIdx.x];
            }
            __syncthreads();
            
            float* temp = inBuffer_s;
            inBuffer_s = outBuffer_s;
            outBuffer_s = temp;
        }

        if (threadIdx.x == BLOCK_DIM - 1) {
            partialSums[blockIdx.x] = inBuffer_s[threadIdx.x];
        }    
        output[i] = inBuffer_s[threadIdx.x];
    }

Exclusive Scan

    #define BLOCK_DIM 1024
    __global__ void scan_kernel(float* input, float* output, float* partialSums, int N) {
        unsigned int i = blockIdx.x * blockDim.x + threadIdx.x;

        __shared__ float buffer1_s[BLOCK_DIM];
        __shared__ float buffer2_s[BLOCK_DIM];
        float* inBuffer_s = buffer1_s;
        float* outBuffer_s = buffer2_s;

        if (threadIdx.x == 0) {
            inBuffer_s[0] = 0.0f;
        } else {
            inBuffer_s[threadIdx.x] = input[i - 1];
        }
        __syncthreads();

        for (int stride = 1; stride <= BLOCK_DIM / 2; stride *= 2) {
            if (threadIdx.x >= stride) {
                outBuffer_s[threadIdx.x] = inBuffer_s[threadIdx.x] + inBuffer_s[threadIdx.x - stride];
            } else {
                outBuffer_s[threadIdx.x] = inBuffer_s[threadIdx.x];
            }
            __syncthreads();
            
            float* temp = inBuffer_s;
            inBuffer_s = outBuffer_s;
            outBuffer_s = temp;
        }

        if (threadIdx.x == BLOCK_DIM - 1) {
            partialSums[blockIdx.x] = inBuffer_s[threadIdx.x] + input[i];
        }    
        output[i] = inBuffer_s[threadIdx.x];
    }
    
    __global__ void add_kernel(float* output, float* partialSums, unsigned int N) {
        unsigned int i = blockIdx.x * blockDim.x + threadIdx.x;

        if (blockIdx.x > 0) {
            output[i] += partialSums[blockIdx.x]; // changed!(removed -1)
        }
    }

작업 효율성 (Work efficiency)

#blog #cuda #gpu #pmpp