GPU Programming (Day 2): Hands-on Exercise Instructions

0. Environment Setup

Add the CUDA 9.0 toolchain to your PATH (this appends to ~/.bash_profile, so it only needs to be done once):

$ echo 'export PATH=/octfs/apl/CUDA/cuda-9.0/bin:$PATH' >> ~/.bash_profile
$ source ~/.bash_profile
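
To check that the CUDA toolchain is now on your PATH (optional):

$ nvcc --version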

1. Your First Program

✏️ 1-1. Problem

Write the index number (0 to 15) into each element of a 16-element integer array, then print the array.

✏️ 1-2. Computation with a C++ program (no parallelization)

Source code (ex1-c.cpp)

#include <stdio.h>

void WriteIndex(int* a, int n)
{
    for (int i = 0; i < n; i++) {
        a[i] = i;
    }
}

int main()
{
    // Init
    int N = 16;
    int* h_a = new int[N]; // on host (CPU)

    // Main
    WriteIndex(h_a, N);
    
    // Display results
    for (int i = 0; i < N; i++) {
        printf("%d\n",h_a[i]);
    }
    
    // Free memory
    delete [] h_a;
    
    return 0;
}

Job script (cuda.nqs)

#!/bin/bash
#PBS -q LECTURE
#PBS -l elapstim_req=00:00:10,cpunum_job=1,gpunum_job=1
cd $PBS_O_WORKDIR
./a.out

Compiling and submitting the job

$ g++ ex1-c.cpp
$ qsub cuda.nqs
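
After submitting, you can check the job status with the queue command (assuming the standard NQS commands are available on the login node):

$ qstat

When the job finishes, its standard output and standard error are written to files in the submission directory, named after the script and the request ID (compare prof.nqs.e389728 later in this handout). For this exercise the standard output should contain the numbers 0 through 15, one per line.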

✏️ 1-3. Computation with a CUDA program (using only one block)

Source code (ex1-cuda.cu)

#include <stdio.h>

__global__ void WriteIndex(int* a, int n)
{
    int i = threadIdx.x;
    if (i < n) {
        a[i] = i;
    }
}

int main()
{
    // Init
    int N = 16;
    int* h_a = new int[N]; // on host (CPU)
    int* d_a;              // on device (GPU)
    cudaMalloc(&d_a, N * sizeof(int));

    // Copy data from host to device
    // Do nothing

    // Main
    int blocks = 1;
    int threadsPerBlock = N;
    WriteIndex<<<blocks, threadsPerBlock>>>(d_a, N);
    
    // Copy data from device to host
    cudaMemcpy(h_a, d_a, N * sizeof(int), cudaMemcpyDeviceToHost);

    // Display results
    for (int i = 0; i < N; i++) {
        printf("%d\n",h_a[i]);
    }
    
    // Free memory
    delete [] h_a;
    cudaFree(d_a);
    
    return 0;
}

Source code explanation
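
The CUDA version differs from the C++ version in a few places. WriteIndex is declared __global__, which makes it a kernel: it runs on the GPU and is launched from the host with the <<<blocks, threadsPerBlock>>> syntax. Here one block of N = 16 threads is launched, and each thread writes exactly one element, using its own threadIdx.x as the index. The output array lives in device memory, which is allocated with cudaMalloc, copied back to the host with cudaMemcpy(..., cudaMemcpyDeviceToHost) after the kernel finishes, and released with cudaFree. No host-to-device copy is needed because the kernel only writes to the array.

The CUDA API calls above return error codes that the sample code ignores for brevity. A minimal error-checking helper, as a sketch (the CHECK macro below is our own addition, not part of the handout code; it needs <stdio.h> and <stdlib.h>):

#define CHECK(call)                                                     \
    do {                                                                \
        cudaError_t err = (call);                                       \
        if (err != cudaSuccess) {                                       \
            fprintf(stderr, "CUDA error: %s at %s:%d\n",                \
                    cudaGetErrorString(err), __FILE__, __LINE__);       \
            exit(1);                                                    \
        }                                                               \
    } while (0)

// Example usage:
//   CHECK(cudaMalloc(&d_a, N * sizeof(int)));
//   WriteIndex<<<blocks, threadsPerBlock>>>(d_a, N);
//   CHECK(cudaGetLastError()); // catches kernel-launch errors
//   CHECK(cudaMemcpy(h_a, d_a, N * sizeof(int), cudaMemcpyDeviceToHost));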

Job script (cuda.nqs) (repeated from 1-2)

#!/bin/bash
#PBS -q LECTURE
#PBS -l elapstim_req=00:00:10,cpunum_job=1,gpunum_job=1
cd $PBS_O_WORKDIR
./a.out

Compiling and submitting the job

$ nvcc ex1-cuda.cu
$ qsub cuda.nqs

Discussion of the program's performance

Supplementary notes

✏️ 1-4. Computation with a CUDA program (using multiple blocks)

Source code (ex1-cuda2.cu)

#include <stdio.h>

__global__ void WriteIndex(int* a, int n)
{
    int i = blockDim.x * blockIdx.x + threadIdx.x;
    if (i < n) {
        a[i] = i;
    }
}

int main()
{
    // Init
    int N = 16;
    int* h_a = new int[N]; // on host (CPU)
    int* d_a;              // on device (GPU)
    cudaMalloc(&d_a, N * sizeof(int));
    
    // Measure the elapsed time
    cudaEvent_t start, stop;
    float time_ms;
    cudaEventCreate(&start);
    cudaEventCreate(&stop);
    cudaEventRecord(start, 0);

    // Copy data from host to device
    // Do nothing

    // Main
    int threadsPerBlock = 8;
    int blocks = (N + threadsPerBlock - 1) / threadsPerBlock; // round up to int
    WriteIndex<<<blocks, threadsPerBlock>>>(d_a, N);
    
    // Copy data from device to host
    cudaMemcpy(h_a, d_a, N * sizeof(int), cudaMemcpyDeviceToHost);

    // elapsed time
    cudaEventRecord(stop, 0);
    // Wait until cudaEventRecord is completed. 
    // (Note: cudaEventRecord is an asynchronous function)
    cudaEventSynchronize(stop);
    
    // Display results
    for (int i = 0; i < N; i++) {
        printf("%d\n",h_a[i]);
    }
    cudaEventElapsedTime(&time_ms, start, stop);
    printf("exe time: %f ms\n", time_ms);
    
    // Free memory
    delete [] h_a;
    cudaFree(d_a);
    cudaEventDestroy(start);
    cudaEventDestroy(stop);
    
    return 0;
}

Source code explanation
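
Each thread now computes a global index i = blockDim.x * blockIdx.x + threadIdx.x. With threadsPerBlock = 8 and N = 16 the launch uses blocks = (16 + 8 - 1) / 8 = 2, so, for example, the thread with blockIdx.x = 1 and threadIdx.x = 3 writes element 8 * 1 + 3 = 11. The number of blocks is rounded up so that every element is covered even when N is not a multiple of the block size; the if (i < n) guard in the kernel makes any surplus threads do nothing. The elapsed time is measured with CUDA events: cudaEventRecord enqueues a timestamp, cudaEventSynchronize waits until the stop event has actually been recorded, and cudaEventElapsedTime returns the time between the two events in milliseconds.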

Job script (cuda.nqs) (same as above)

Compiling and submitting the job

$ nvcc -O3 -arch=sm_60 ex1-cuda2.cu
$ qsub cuda.nqs
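
(-O3 enables compiler optimization, and -arch=sm_60 tells nvcc to generate code for compute capability 6.0, i.e., Pascal-generation GPUs.)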

Profiling (performance analysis)

Job script (prof.nqs)

#!/bin/bash
#PBS -q LECTURE
#PBS -l elapstim_req=00:00:10,cpunum_job=1,gpunum_job=1
cd $PBS_O_WORKDIR
nvprof ./a.out

$ qsub prof.nqs
$ cat prof.nqs.e389728
==92462== NVPROF is profiling process 92462, command: ./a.out
==92462== Profiling application: ./a.out
==92462== Profiling result:
            Type  Time(%)      Time     Calls       Avg       Min       Max  Name
 GPU activities:   60.75%  2.0800us         1  2.0800us  2.0800us  2.0800us  WriteIndex(int*, int)
                   39.25%  1.3440us         1  1.3440us  1.3440us  1.3440us  [CUDA memcpy DtoH]
      API calls:   99.75%  520.14ms         1  520.14ms  520.14ms  520.14ms  cudaMalloc
                    0.13%  682.87us        94  7.2640us     114ns  282.80us  cuDeviceGetAttribute
                    0.04%  214.33us         1  214.33us  214.33us  214.33us  cudaFree
                    0.04%  209.93us         1  209.93us  209.93us  209.93us  cuDeviceTotalMem
                    0.01%  56.766us         1  56.766us  56.766us  56.766us  cuDeviceGetName
                    0.01%  45.203us         1  45.203us  45.203us  45.203us  cudaLaunch
                    0.01%  30.483us         1  30.483us  30.483us  30.483us  cudaMemcpy
                    0.00%  11.443us         2  5.7210us  2.2800us  9.1630us  cudaEventRecord
                    0.00%  8.3070us         2  4.1530us     682ns  7.6250us  cudaEventCreate
                    0.00%  6.3850us         1  6.3850us  6.3850us  6.3850us  cudaEventSynchronize
                    0.00%  6.1000us         2  3.0500us     193ns  5.9070us  cudaSetupArgument
                    0.00%  5.0610us         1  5.0610us  5.0610us  5.0610us  cudaEventElapsedTime
                    0.00%  3.0570us         3  1.0190us     185ns  2.6390us  cuDeviceGetCount
                    0.00%  1.9980us         2     999ns     517ns  1.4810us  cudaEventDestroy
                    0.00%  1.2720us         1  1.2720us  1.2720us  1.2720us  cudaConfigureCall
                    0.00%     595ns         2     297ns     146ns     449ns  cuDeviceGet
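
Note that almost all of the API time is spent in the single cudaMalloc call (about 520 ms here). This is most likely because the first CUDA call in a process also triggers device initialization and context creation; the kernel itself runs in only about 2 µs and the device-to-host copy in about 1.3 µs.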

2. A Program for Efficient Reduction

✏️ 2-1. Problem

An array of \(2^{25}\) elements has 0 or 1 written into each element at random. Compute the sum of all elements of this array.

✏️ 2-2. Computation with the cascading algorithm

In the cascading approach, each thread first accumulates many array elements sequentially into a register. Each warp then reduces its 32 per-thread sums in shared memory, lane 0 of every warp stores the warp's sum, the first threads of the block reduce the per-warp sums to a single value per block, and the per-block partial sums are finally added up on the host.

Source code (ex2.cu)

#include <stdio.h>
#include <math.h>
// Thrust library
// #include <thrust/reduce.h>
// #include <thrust/device_ptr.h>


// Device parameters (values matching a Pascal-generation GPU such as the Tesla P100, the sm_60 target used below)
const int NUM_SM = 56;           // number of streaming multiprocessors
const int WARPS_PER_SM = 64;     // maximum number of resident warps per SM
const int WARPS_PER_BLOCK = 32;
const int NUM_BLOCKS = NUM_SM * WARPS_PER_SM / WARPS_PER_BLOCK;    // = 112
const int THREADS_PER_WARP = 32;
const int THREADS_PER_BLOCK = WARPS_PER_BLOCK * THREADS_PER_WARP;  // = 1024


__global__ void ReductionCascading(int* inArray, unsigned int numElements, int* halfwayResult)
{
    const unsigned int tid  = threadIdx.x;            // thread id in each block
    const unsigned int wid  = tid / THREADS_PER_WARP; // warp id in each block
    const unsigned int lid  = tid % THREADS_PER_WARP; // lane id = thread id in each warp
    const unsigned int sid  = blockIdx.x * blockDim.x + threadIdx.x; // sequential thread id in the kernel
    const unsigned int numTotalThreads = gridDim.x * blockDim.x;

    int item1 = 0;
    __shared__ int x[WARPS_PER_BLOCK * THREADS_PER_WARP];
    __shared__ int y[WARPS_PER_BLOCK];
    
    const int numIter = 1 + (numElements - 1) / numTotalThreads; //round up to int
    // Main loop: every thread strides through the array, accumulating one element per full sweep of all threads
    for (int i = 0; i < numIter - 1; i++) {
        item1 += inArray[numTotalThreads * i + sid];
    }
    // Last sweep: it may be partial, so guard against reading past the end of the array
    int idx = numTotalThreads * (numIter - 1) + sid;
    if (idx < numElements) {
        item1 += inArray[idx];
    }
    x[tid] = item1;
    __syncthreads();

    // Tree reduction within each warp: the lower 16 lanes fold in values at strides 16, 8, 4, 2, 1
    if (lid < 16) {
        x[tid] += x[tid + 16];
        __syncthreads();
        x[tid] += x[tid + 8];
        __syncthreads();
        x[tid] += x[tid + 4];
        __syncthreads();
        x[tid] += x[tid + 2];
        __syncthreads();
        x[tid] += x[tid + 1];
        __syncthreads();
    }

    // Lane 0 of each warp stores that warp's partial sum
    if( 0 == lid ){
        y[wid] = x[tid];
    }
    __syncthreads();
    
    // Reduce the 32 per-warp sums; the first 16 threads of the block do the work
    if (tid < 16) {
        y[tid] += y[tid + 16];
        __syncthreads();
        y[tid] += y[tid + 8];
        __syncthreads();
        y[tid] += y[tid + 4];
        __syncthreads();
        y[tid] += y[tid + 2];
        __syncthreads();
        y[tid] += y[tid + 1];
        __syncthreads();
    }
    // Thread 0 writes this block's partial sum; the host adds up the per-block results
    if( 0 == tid ){
        halfwayResult[blockIdx.x] = y[tid];
    }
    
    return;
}


int main()
{    
    // Init
    int logNumElements = 25; // input_size = 2^25
    int numElements = 1 << logNumElements;
    int* h_input = new int[numElements]; // on host (CPU)
    int* h_halfwayResult = new int[NUM_BLOCKS];
    int  h_output = 0;
    int* d_input;                        // on device (GPU)
    int* d_halfwayResult;
    cudaMalloc(&d_input, numElements * sizeof(int));
    cudaMalloc(&d_halfwayResult, NUM_BLOCKS * sizeof(int));
    srand(0);
    for( unsigned int i = 0; i < numElements; ++i) {
        h_input[i] = (int)(rand() % 2);
    }
    int answer = 0;
    for (int i = 0; i < numElements; i++) {
        answer += h_input[i];
    }

    // Measure the elapsed time
    cudaEvent_t start, stop;
    float time_ms;
    cudaEventCreate(&start);
    cudaEventCreate(&stop);

    // Copy data from host to device
    cudaMemcpy(d_input, h_input, numElements * sizeof(int), cudaMemcpyHostToDevice);

    // Main
    printf("numElements = %d (2^%d)\n", numElements, logNumElements);
    cudaEventRecord(start, 0);
    ReductionCascading<<<NUM_BLOCKS, THREADS_PER_BLOCK>>>(d_input, numElements, d_halfwayResult);
    
    // Copy data from device to host
    cudaMemcpy(h_halfwayResult, d_halfwayResult, NUM_BLOCKS * sizeof(int), cudaMemcpyDeviceToHost);

    h_output = 0;
    for (int i = 0; i < NUM_BLOCKS; i++) {
        h_output += h_halfwayResult[i];
    }

    // elapsed time
    cudaEventRecord(stop, 0);
    // Wait until cudaEventRecord is completed. 
    // (Note: cudaEventRecord is an asynchronous function)
    cudaEventSynchronize(stop);
    
    // Display results
    printf("Our result = %d, correct answer = %d\n", h_output, answer);
    cudaEventElapsedTime(&time_ms, start, stop);
    printf("  exe time: %f ms\n", time_ms);

    // Thrust Library
    // int h_outputThrust;
    // thrust::device_ptr<int> d_input_ptr = thrust::device_pointer_cast(d_input);
    // cudaEventRecord(start, 0);
    // h_outputThrust = thrust::reduce(d_input_ptr, d_input_ptr + numElements);
    // cudaEventRecord(stop, 0);
    // // Wait until cudaEventRecord is completed. 
    // // (Note: cudaEventRecord is an asynchronous function)
    // cudaEventSynchronize(stop);
    // printf("Thrust result = %d, correct answer = %d\n", h_outputThrust, answer);
    // cudaEventElapsedTime(&time_ms, start, stop);
    // printf("  exe time: %f ms\n", time_ms);
    
    // Free memory
    delete [] h_input;
    delete [] h_halfwayResult;
    cudaFree(d_input);
    cudaFree(d_halfwayResult);
    cudaEventDestroy(start);
    cudaEventDestroy(stop);
    
    return 0;
}

Job script (cuda.nqs)

#!/bin/bash
#PBS -q LECTURE
#PBS -l elapstim_req=00:00:10,cpunum_job=1,gpunum_job=1
cd $PBS_O_WORKDIR
./a.out

Compiling and submitting the job

$ nvcc -O3 -arch=sm_60 ex2.cu
$ qsub cuda.nqs

Profiling the execution time

Job script (prof.nqs)

#!/bin/bash
#PBS -q LECTURE
#PBS -l elapstim_req=00:00:10,cpunum_job=1,gpunum_job=1
cd $PBS_O_WORKDIR
nvprof ./a.out

$ qsub prof.nqs

Profiling the occupancy

The achieved_occupancy metric reports the ratio of the average number of active warps per SM to the maximum number of warps an SM can hold.

Job script (occupancy.nqs)

#!/bin/bash
#PBS -q LECTURE
#PBS -l elapstim_req=00:00:10,cpunum_job=1,gpunum_job=1
cd $PBS_O_WORKDIR
nvprof --metrics achieved_occupancy ./a.out

$ qsub occupancy.nqs

✏️ 2-3. Comparison with the Thrust library

Source code (modify ex2.cu)
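
For reference, the Thrust-related lines are already present in ex2.cu as comments. Removing the comment markers gives roughly the following (a sketch; the includes go at the top of the file, the block below goes after the hand-written reduction, and the existing start/stop events and time_ms are reused):

#include <thrust/reduce.h>
#include <thrust/device_ptr.h>

    // Thrust Library
    int h_outputThrust;
    thrust::device_ptr<int> d_input_ptr = thrust::device_pointer_cast(d_input);
    cudaEventRecord(start, 0);
    h_outputThrust = thrust::reduce(d_input_ptr, d_input_ptr + numElements);
    cudaEventRecord(stop, 0);
    // Wait until cudaEventRecord is completed.
    // (Note: cudaEventRecord is an asynchronous function)
    cudaEventSynchronize(stop);
    printf("Thrust result = %d, correct answer = %d\n", h_outputThrust, answer);
    cudaEventElapsedTime(&time_ms, start, stop);
    printf("  exe time: %f ms\n", time_ms);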

Job script (cuda.nqs) (same as in 2-2)

Compiling and submitting the job

$ nvcc -O3 -arch=sm_60 ex2.cu
$ qsub cuda.nqs

Profiling the execution time

$ qsub prof.nqs