currybab's blog

pmpp lecture 21 pinned memory and streams 정리

Source: Lecture 21 - Pinned Memory and Streams

Today

Direct Memory Access

dma

Pinned Memory

cudaMemcpy의 동작

Faster Copies

시스템 아키텍쳐 수준의 병렬성

벡터 덧셈 utilization

vector add utilization

pipelining vector add

Streams and Asynchronous Copies

Using Streams and Asynchronous Copies

    __global__ void vecadd_kernel(float* x, float* y, float* z, int N) {
        int i = blockDim.x * blockIdx.x +  threadIdx.x;
        if (i < N) {
            z[i] = x[i] + y[i];
        }
    }

    void vecadd_gpu(float* x, float* y, float* z, int N) {

        // Allocate GPU memory
        float *x_d, *y_d, *z_d;
        cudaMalloc((void**) &x_d, N * sizeof(float));
        cudaMalloc((void**) &y_d, N * sizeof(float));
        cudaMalloc((void**) &z_d, N * sizeof(float));

        // Setup streams
        unsigned int numStreams = 32;
        cudaStream_t stream[numStreams];
        for (unsigned int s = 0; s < numStreams; s++) {
            cudaStreamCreate(&stream[s]);
        }
        
        // Stream the segments
        unsigned int numSegments = numStreams;
        unsigned int segmentSize = (N + numSegments - 1) / numSegments;
        for (unsigned int s = 0; s < numSegments; s++) {

            // Finding the segment bounds
            unsigned int start = s * segmentSize;
            unsigned int end = (start + segmentSize < N) ? start + segmentSize : N;
            unsigned int Nsegment = end - start;

            // Copy data to GPU
            cudaMemcpyAsync(&x_d[start], &x[start], Nsegment * sizeof(float), cudaMemcpyHostToDevice, stream[s]);
            cudaMemcpyAsync(&y_d[start], &y[start], Nsegment * sizeof(float), cudaMemcpyHostToDevice, stream[s]);
            
            // Vector addition on GPU
            int numThreadsPerBlock = 512;
            int numBlocks = (Nsegment + numThreadsPerBlock - 1) / numThreadsPerBlock;
            vecadd_kernel<<<numBlocks, numThreadsPerBlock, 0, stream[s]>>>(&x_d[start], &y_d[start], &z_d[start], Nsegment);
            

            // Copy result back to host
            cudaMemcpyAsync(&z[start], &z_d[start], Nsegment * sizeof(float), cudaMemcpyDeviceToHost, stream[s]);

        }
        cudaDeviceSynchronize();

        
        // Free GPU memory
        cudaFree(x_d);
        cudaFree(y_d);
        cudaFree(z_d);
    }

profiling result

#blog #cuda #gpu #pmpp