currybab's blog

pmpp lecture 14 merge 요약

Source: Lecture 14 - Merge

Merge

    void mergeSequential(float* A, float* B, float* C, unsigned int m, unsigned int n) {
        unsigned int i = 0; // A
        unsigned int j = 0; // B
        unsigned int k = 0; // C
        while (i < m && j < n) {
            if (A[i] < B[j]) {
                C[k++] = A[i++];
            } else {
                C[k++] = B[j++];
            }
        }
        while (i < m) {
            C[k++] = A[i++];
        }
        while (j < n) {
            C[k++] = B[j++];
        }
    }
    // 매우 순차적인 직업같은데 어떻게 병렬화할 수 있을까?

Parallel Merge

Finding Input Segments

finding input segments

    __device__ void mergeSequential(float* A, float* B, float* C, unsigned int m, unsigned int n) {
        unsigned int i = 0; // A
        unsigned int j = 0; // B
        unsigned int k = 0; // C
        while (i < m && j < n) {
            if (A[i] < B[j]) {
                C[k++] = A[i++];
            } else {
                C[k++] = B[j++];
            }
        }
        while (i < m) {
            C[k++] = A[i++];
        }
        while (j < n) {
            C[k++] = B[j++];
        }
    }

    __device__ unsigned int coRank(float* A, float* B, unsigned int m, unsigned int n, unsigned int k) {
        // set bound
        unsigned int iLow = (k > n) ? (k - n) : 0;
        unsigned int iHigh = (k < m) ? k : m;

        // binary search
        while (true) {
            unsigned int i = (iLow + iHigh) / 2;
            unsigned int j = k - i;
            if (i > 0 && j < n && A[i - 1] > B[j]) {
                iHigh = i;
            } else if (j > 0 && i < m && B[j - 1] > A[i]) {
                iLow = i;
            } else {
                return i;
            }
        }
    }

    #define ELEM_PER_THREAD 6
    #define THREADS_PER_BLOCK 128
    #define ELEM_PER_BLOCK (ELEM_PER_THREAD * THREADS_PER_BLOCK)

    // unsigned int numbBlocks = (m + n + ELEM_PER_BLOCK - 1) / ELEM_PER_BLOCK;
    // merge_kernel <<< numBlocks, THREADS_PER_BLOCK >>> (A_d, B_d, C_d, m, n);
    
    __global__ void merge_kernel(float* A, float* B, float* C, unsigned int m, unsigned int n) {
        unsigned int k = (blockIdx.x * blockDimx.x + threadIdx.x) * ELEM_PER_THREAD;
        if (k < m + n) {
            unsigned int i = coRank(A, B, m, n, k);
            unsigned int j = k - i;
            unsigned int kNext = (k + ELEM_PER_THREAD < m + n) ? (k + ELEM_PER_THREAD) : (m + n);
            unsigned int iNext = coRank(A, B, m, n, kNext);
            unsigned int jNext = kNext - iNext;
            mergeSequential(&A[i], &B[j], &C[k], iNext - i, jNext - j);
        }
    }

Shared Memory Tiling

shared memory tiling

    __global__ void merge_kernel(float* A, float* B, float* C, unsigned int m, unsigned int n) {
        // Find the block's segments
        unsigned int kBlock = blockIdx.x * ELEM_PER_BLOCK;
        unsigned int kNextBlock = (blockIdx.x < gridDim.x - 1) ? (kBlock + ELEM_PER_BLOCK) : (m + n);
        __shared__ unsigned int iBlock;
        __shared__ unsigned int iNextBlock;
        if (threadIdx.x == 0) {
            iBlock = coRank(A, B, m, n, kBlock);
            iNextBlock = coRank(A, B, m, n, kNextBlock);
        }
        __syncthreads();

        unsigned int jBlock = kBlock - iBlock;
        unsigned int jNextBlock = kNextBlock - iNextBlock;

        // Load block's segments to shared memory
        __shared__ float A_s[ELEM_PER_BLOCK];
        unsigned int mBlock = iNextBlock - iBlock;
        for (unsigned int i = threadIdx.x; i < mBlock; i += blockDim.x) {
            A_s[i] = A[iBlock + i];
        }
        float* B_s = A_s + mBlock;
        unsigned int nBlock = jNextBlock - jBlock;
        for (unsigned int i = threadIdx.x; i < nBlock; i += blockDim.x) {
            B_s[i] = B[jBlock + i];
        }
        __syncthreads();

        // Merge in shared memory
        __shared__ float C_s[ELEM_PER_BLOCK];
        unsigned int k = threadIdx.x * ELEM_PER_THREAD;
        if (k < mBlock + nBlock) {
            unsigned int i = coRank(A_s, B_s, mBlock, nBlock, k);
            unsigned int j = k - i;
            unsigned int kNext = (k + ELEM_PER_THREAD < mBlock + nBlock) ? (k + ELEM_PER_THREAD) : (mBlock + nBlock);
            unsigned int iNext = coRank(A_s, B_s, mBlock, nBlock, kNext);
            unsigned int jNext = kNext - iNext;
            mergeSequential(&A_s[i], &B_s[j], &C_s[k], iNext - i, jNext - j);
        }
        __syncthreads();

        // Copy result to global memory
        for (unsigned int k = threadIdx.x; k < mBlock + nBlock; k += blockDim.x) {
            C[kBlock + k] = C_s[k];
        }
    }

#blog #cuda #gpu #pmpp