currybab's blog

pmpp lecture 22 dynamic parallelism 정리

Source: Lecture 22 - Dynamic Parallelism

Dynamic Parallelism

dynamic parallelism

Nested Parallelism

Applications of Dynamic Parallelism

Dynamic Parallelism API

프론티어를 사용한 BFS

bfs frontier

    // 이웃의 수만큼 반복하는 대신 각 이웃에 대해 스레드를 실행할 것
    
    __global__ void bfs_child_kernel(CSRGraph, csrGraph, unsigned int* level, unsigned int* currFrontier, unsigned int numPrevFrontier, unsigned int* numCurrFrontier, unsigned int currLevel, unsigned int numNeighbors, unsigned int start) {
        unsigned int i = blockIdx.x * blockDim.x + threadIdx.x;
        if (i < numNeighbors) {
            unsigned int edge = start + i;
            unsigned int neighbor = csrGraph.dst[edge];
            if (atomicCAS(&level[neighbor], UINT_MAX, currLevel) == UINT_MAX) { // 다른 스레드에서 동일한 정점에 대해 방문했을 경우
                unsigned int currFrontierIdx = atomicAdd(numCurrFrontier, 1);
                currFrontier[currFrontierIdx] = neighbor;
            }
        }
    }

    __global__ void bfs_kernel(CSRGraph, csrGraph, unsigned int* level, unsigned int* prevFrontier, unsigned int* currFrontier, unsigned int numPrevFrontier, unsigned int* numCurrFrontier, unsigned int currLevel) {
        unsigned int i = blockIdx.x * blockDim.x + threadIdx.x;
        if (i < numPrevFrontier) {
            unsigned int vertex = prevFrontier[i];
            unsigned int start = csrGraph.srcPtrs[vertex];
            unsigned int numNeighbors = csrGraph.srcPtrs[vertex + 1] - start;
            unsigned int numThreadsPerBloc = 1024;
            unsigned int numBlocks = (numNeighbors + numThreadsPerBloc - 1) / numThreadsPerBloc;
            bfs_child_kernel<<< numBlocks, numThreadsPerBloc >>>(csrGraph, level, currFrontier, numPrevFrontier, numCurrFrontier, currLevel, numNeighbors, start);
        }
    }

    void bfs_levels(CSRGraph, csrGraph, unsigned int* level, unsigned int* prevFrontier, unsigned int* currFrontier, unsigned int* numCurrFrontier) {
        unsigned int numPrevFrontier = 1;
        unsigned int numThreadsPerBlock = 256;
        cudaDeviceSetLimit(cudaLimitDevRuntimePendingLaunchCount, csrGraph.numVertices);
        for (unsigned int currLevel = 1; numPrevFrontier > 0; currLevel++) {
            // Visit vertices in previous frontier
            cudaMemset(numCurrFrontier, 0, sizeof(unsigned int));
            unsigned int numBlocks = (numPrevFrontier + numThreadsPerBlock - 1) / numThreadsPerBlock;
            bfs_child_kernel<<< numBlocks, numThreadsPerBlock >>>(csrGraph, level, prevFrontier, currFrontier, numPrevFrontier, numCurrFrontier, currLevel);
            cudaDeviceSynchronize();
            // Swap buffers
            unsigned int* temp = prevFrontier;
            prevFrontier = currFrontier;
            currFrontier = temp;
            cudaMemcpy(&numPrevFrontier, numCurrFrontier, sizeof(unsigned int), cudaMemcpyDeviceToHost);
        }
        cudaDeviceSynchronize();
    }

Streams

Per-Thread Stream

Optimizationss

    __global__ void bfs_kernel(CSRGraph, csrGraph, unsigned int* level, unsigned int* prevFrontier, unsigned int* currFrontier, unsigned int numPrevFrontier, unsigned int* numCurrFrontier, unsigned int currLevel) {
        unsigned int i = blockIdx.x * blockDim.x + threadIdx.x;
        if (i < numPrevFrontier) {
            unsigned int vertex = prevFrontier[i];
            unsigned int start = csrGraph.srcPtrs[vertex];
            unsigned int numNeighbors = csrGraph.srcPtrs[vertex + 1] - start;
            // 임계값을 넘으면 그리드를 실행하고 그렇지 않으면 직렬화함.
            if (numNeighbors > 1200) {
                unsigned int numThreadsPerBloc = 1024;
                unsigned int numBlocks = (numNeighbors + numThreadsPerBloc - 1) / numThreadsPerBloc;
                bfs_child_kernel<<< numBlocks, numThreadsPerBloc >>>(csrGraph, level, currFrontier, numPrevFrontier, numCurrFrontier, currLevel, numNeighbors, start);
            } else {
                for (unsigned int i = 0; i < numNeighbors; i++) {
                    unsigned int edge = start + i;
                    unsigned int neighbor = csrGraph.dst[edge];
                    if (atomicCAS(&level[neighbor], UINT_MAX, currLevel) == UINT_MAX) { // 다른 스레드에서 동일한 정점에 대해 방문했을 경우
                        unsigned int currFrontierIdx = atomicAdd(numCurrFrontier, 1);
                        currFrontier[currFrontierIdx] = neighbor;
                    }
                }
            }
        }
    }

Offloading Driver Code

    __global__ void bfs_levels_kernel(CSRGraph, csrGraph, unsigned int* level, unsigned int* prevFrontier, unsigned int* currFrontier, unsigned int* numCurrFrontier) {
        unsigned int numPrevFrontier = 1;
        unsigned int numThreadsPerBlock = 256;
        for (unsigned int currLevel = 1; numPrevFrontier > 0; currLevel++) {
            // Visit vertices in previous frontier
            *numCurrFrontier = 0;
            unsigned int numBlocks = (numPrevFrontier + numThreadsPerBlock - 1) / numThreadsPerBlock;
            bfs_child_kernel<<< numBlocks, numThreadsPerBlock >>>(csrGraph, level, prevFrontier, currFrontier, numPrevFrontier, numCurrFrontier, currLevel);
            cudaDeviceSynchronize();
            
            // Swap buffers
            unsigned int* temp = prevFrontier;
            prevFrontier = currFrontier;
            currFrontier = temp;
            numPrevFrontier = *numCurrFrontier;
        }
    }

    void bfs_levels(CSRGraph, csrGraph, unsigned int* level, unsigned int* prevFrontier, unsigned int* currFrontier, unsigned int* numCurrFrontier) {
        cudaDeviceSetLimit(cudaLimitDevRuntimePendingLaunchCount, csrGraph.numVertices);
        bfs_levels_kernel<<< 1, 1 >>>(csrGraph, level, prevFrontier, currFrontier, numCurrFrontier);
        cudaDeviceSynchronize();
    }

Memory Visibility

중첩 깊이 (Nesting Depth)

#blog #cuda #gpu #pmpp