currybab's blog

pmpp lecture 18,19 graph processing 정리

Source:

Representing Graphs

COO and CSR/CSC Representation

coo and csr/csc representation

Approaches to Parallelizing Graph Processing

Approaches to Breadth First Search (BFS)

Dataset Implications

Similarity Between BFS and SpMV

Linear Algebraic Formulation of Graph Problem (그래프 문제의 선형 대수적 공식화)

symmetric vs not symmetric graph

Redundant Work (중복 작업)

Reducing Redundancy

    __global__ void bfs_kernel(CSRGraph, csrGraph, unsigned int* level, unsigned int* prevFrontier, unsigned int* currFrontier, unsigned int numPrevFrontier, unsigned int* numCurrFrontier, unsigned int currLevel) {
        unsigned int i = blockIdx.x * blockDim.x + threadIdx.x;
        if (i < numPrevFrontier) {
            unsigned int vertex = prevFrontier[i];
            for (unsigned int edge = csrGraph.srcPtrs[vertex]; edge < csrGraph.srcPtrs[vertex + 1]; edge++) {
                unsigned int neighbor = csrGraph.dst[edge];
                if (atomicCAS(&level[neighbor], UINT_MAX, currLevel) == UINT_MAX) { // 다른 스레드에서 동일한 정점에 대해 방문했을 경우
                    unsigned int currFrontierIdx = atomicAdd(numCurrFrontier, 1);
                    currFrontier[currFrontierIdx] = neighbor;
                }
            }
        }
    }

Queue Privatization

Queue Privatization

    #define LOCAL_QUEUE_SIZE 2048

    __global__ void bfs_kernel(CSRGraph, csrGraph, unsigned int* level, unsigned int* prevFrontier, unsigned int* currFrontier, unsigned int numPrevFrontier, unsigned int* numCurrFrontier, unsigned int currLevel) {

        __shared__ unsigned int currFrontier_s[LOCAL_QUEUE_SIZE];
        __shared__ unsigned int numCurrFrontier_s;

        if (threadIdx.x == 0) {
            numCurrFrontier_s = 0;
        }
        __syncthreads();

        unsigned int i = blockIdx.x * blockDim.x + threadIdx.x;
        if (i < numPrevFrontier) {
            unsigned int vertex = prevFrontier[i];
            for (unsigned int edge = csrGraph.srcPtrs[vertex]; edge < csrGraph.srcPtrs[vertex + 1]; edge++) {
                unsigned int neighbor = csrGraph.dst[edge];
                if (atomicCAS(&level[neighbor], UINT_MAX, currLevel) == UINT_MAX) { // 다른 스레드에서 동일한 정점에 대해 방문했을 경우
                    unsigned int currFrontierIdx_s = atomicAdd(numCurrFrontier_s, 1);
                    if (currFrontierIdx_s < LOCAL_QUEUE_SIZE) {
                        currFrontier_s[currFrontierIdx_s] = neighbor;
                    } else {
                        // 오버 플로가 발생하면 그냥 global에 직접 넣는다.
                        numCurrFrontier_s = LOCAL_QUEUE_SIZE;
                        unsigned int currFrontierIdx = atomicAdd(numCurrFrontier, 1);
                        currFrontier[currFrontierIdx] = neighbor;
                    }
                }
            }
        }
        __syncthreads();

        __shared__ unsigned int currFrontierStartIdx;
        if (threadIdx.x == 0) {
            currFrontierStartIdx = atomicAdd(numCurrFrontier, numCurrFrontier_s);
        }
        __syncthreads();

        for (unsigned int currFrontierIdx_s = threadIdx.x; currFrontierIdx_s < numCurrFrontier_s; currFrontierIdx_s += blockDim.x) {
            currFrontier[currFrontierStartIdx + currFrontierIdx_s] = currFrontier_s[currFrontierIdx_s];
        }
    }

    // 14.826ms -> 14.676ms로 거의 변화 없음.
    // 하드웨어나 그래프에 따라 달라질 수 있음.

Minimizing Launch Overhead

minimize launch overhead

#blog #cuda #gpu #pmpp