currybab's blog

pmpp lecture 20 intra warp synchronization 요약

Source: Lecture 20 - Intra Warp Synchronization

복습: Warp

Intra-Warp Synchronization

Warp Shuffle Functions

Reduction with Warp Shuffle

    #define BLOCK_DIM 1024
    #define WARP_SIZE 32
    
    __global__ void reduce_kernel(float* input, float* partialSums, unsigned int N) {
        unsigned int segment = (blockIdx.x * blockDim.x) * 2;
        unsigned int i = segment + threadIdx.x;

        __shared__ float input_s[BLOCK_DIM];
        input_s[threadIdx.x] = input[i] + input[i + BLOCK_DIM];
        __syncthreads();

        for (unsigned int stride = BLOCK_DIM / 2;  stride > WARP_SIZE; stride /= 2) {
            if (threadIdx.x < stride) {
                input_s[threadIdx.x] += input_s[threadIdx.x + stride];
            }
            __syncthreads();
        }

        // Reduction tree with warp shuffle
        float sum;
        if (threadIdx.x == 0) {
            sum = input_s[threadIdx.x] + input_s[threadIdx.x + WARP_SIZE];
        }
        for (unsigned int stride = WARP_SIZE / 2;  stride > 0; stride /= 2) {
            sum += __shfl_down_sync(0xFFFFFFFF, sum, stride);
        }
        
        if (threadIdx.x == 0) {
            partialSums[blockIdx.x] = sum;
        }
    }

Warp Vote Functions

Optimization with Warp Vote

    #define WARP_SIZE 32

    // enqueue kernel 값이 어떤 작업을 충족하면 enqueue 함.
    __global__ void enqueue_kernel(unsigned int* input, unsigned int N, unsigned int* queueSize) {
        // unsigned int i = blockIdx.x * blockDim.x + threadIdx.x;
        // if (i < N) {
        //     unsigned int val = input[i];
        //     if(cond(val)) {
        //         unsigned int j = atomicAdd(queueSize, 1);
        //         queue[j] = val;
        //     }
        // }
        unsigned int i = blockIdx.x * blockDim.x + threadIdx.x;
        if (i < N) {
            unsigned int val = input[i];
            if(cond(val)) {
                // Assign a leader thread (0이 이 지점에 도달 못할 수도 있음)
                // - __activemask()를 사용하여 활성화된 스레드를 확인함.
                // - 첫번째 활성화된 스레드를 리더로 지정함.
                unsigned int activeThreads = __activemask();
                unsigned int leader = __ffs(activeThreads) - 1; // activemask의 첫번째 비트를 찾는다. 
                // 워프 수준 아니고 그냥 내장 함수임, 인덱스가 1부터 시작함. 0이면 1로 설정된 bit가 없음.

                // Find how many threads need to add to the queue = how many threads are active
                // 또다른 내장 함수가 있음(population count)
                unsigned int numActive = __popc(activeThreads);

                // Have the leader perform the atomic operation
                unsigned int j;
                if (threadIdx.x % WARP_SIZE == leader) {
                    j = atomicAdd(queueSize, numActive);
                }

                // Broadcast the result to all threads
                j = __shfl_sync(activeThreads, j, leader);
                
                // Find offset of each active thread and store result
                // - 활성 상태인 이전 스레드의 수를 찾음
                unsigned int previousThreads = (1 << (threadIdx.x % WARP_SIZE)) - 1;
                unsigned int previousActiveThreads = activeThreads & previousThreads;
                unsigned int offset = __popc(previousActiveThreads);

                // Store the result
                queue[j + offset] = val;
            }
        }
    }

#blog #cuda #gpu #pmpp