$ echo 'export PATH=/octfs/apl/CUDA/cuda-9.0/bin:$PATH' >> ~/.bash_profile
$ source ~/.bash_profile
Write the index number (0 to 15) into each element of a 16-element integer array and display the result.
#include <stdio.h>
void WriteIndex(int* a, int n)
{
    for (int i = 0; i < n; i++) {
        a[i] = i;
    }
}
int main()
{
    // Init
    int N = 16;
    int* h_a = new int[N]; // on host (CPU)
    // Main
    WriteIndex(h_a, N);
    // Display results
    for (int i = 0; i < N; i++) {
        printf("%d\n", h_a[i]);
    }
    // Free memory
    delete [] h_a;
    return 0;
}
#!/bin/bash
#PBS -q LECTURE
#PBS -l elapstim_req=00:00:10,cpunum_job=1,gpunum_job=1
cd $PBS_O_WORKDIR
./a.out
$ g++ ex1-c.cpp
$ qsub cuda.nqs
#include <stdio.h>
__global__ void WriteIndex(int* a, int n)
{
    int i = threadIdx.x;
    if (i < n) {
        a[i] = i;
    }
}
int main()
{
    // Init
    int N = 16;
    int* h_a = new int[N]; // on host (CPU)
    int* d_a; // on device (GPU)
    cudaMalloc(&d_a, N * sizeof(int));
    // Copy data from host to device
    // Do nothing
    // Main
    int blocks = 1;
    int threadsPerBlock = N;
    WriteIndex<<<blocks, threadsPerBlock>>>(d_a, N);
    // Copy data from device to host
    cudaMemcpy(h_a, d_a, N * sizeof(int), cudaMemcpyDeviceToHost);
    // Display results
    for (int i = 0; i < N; i++) {
        printf("%d\n", h_a[i]);
    }
    // Free memory
    delete [] h_a;
    cudaFree(d_a);
    return 0;
}
- A function that runs on the GPU (a kernel) is marked with __global__.
- A function called from device code is marked with __device__.
- Each thread in a block is assigned an ID from 0 to blockDim.x - 1 (the maximum number of threads in the block minus 1); the thread ID can be read from threadIdx.x.
- Device memory is allocated with the cudaMalloc function.
- A launch written as <<<blocks, threadsPerBlock>>> runs blocks blocks with threadsPerBlock threads per block; here we use 1 block of 16 threads.
- Data is copied with the cudaMemcpy function.
- To copy from host to device, change cudaMemcpyDeviceToHost to cudaMemcpyHostToDevice.
- Device memory is freed with the cudaFree function.
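The exercise code above never uses __device__. As a minimal sketch (the Square helper and the WriteSquaredIndex kernel are hypothetical names, not part of the exercise files), a __device__ helper called from a __global__ kernel could look like this:
#include <stdio.h>
// Hypothetical helper: marked __device__, callable only from device code.
__device__ int Square(int v)
{
    return v * v;
}
// Kernel (marked __global__) that calls the __device__ helper.
__global__ void WriteSquaredIndex(int* a, int n)
{
    int i = threadIdx.x;
    if (i < n) {
        a[i] = Square(i);
    }
}
int main()
{
    int N = 16;
    int* h_a = new int[N];
    int* d_a;
    cudaMalloc(&d_a, N * sizeof(int));
    WriteSquaredIndex<<<1, N>>>(d_a, N);
    cudaMemcpy(h_a, d_a, N * sizeof(int), cudaMemcpyDeviceToHost);
    for (int i = 0; i < N; i++) {
        printf("%d\n", h_a[i]);
    }
    delete [] h_a;
    cudaFree(d_a);
    return 0;
}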
#!/bin/bash
#PBS -q LECTURE
#PBS -l elapstim_req=00:00:10,cpunum_job=1,gpunum_job=1
cd $PBS_O_WORKDIR
./a.out
$ nvcc ex1-cuda.cu
$ qsub cuda.nqs
#include <stdio.h>
__global__ void WriteIndex(int* a, int n)
{
    int i = blockDim.x * blockIdx.x + threadIdx.x;
    if (i < n) {
        a[i] = i;
    }
}
int main()
{
    // Init
    int N = 16;
    int* h_a = new int[N]; // on host (CPU)
    int* d_a; // on device (GPU)
    cudaMalloc(&d_a, N * sizeof(int));
    // Measure the elapsed time
    cudaEvent_t start, stop;
    float time_ms;
    cudaEventCreate(&start);
    cudaEventCreate(&stop);
    cudaEventRecord(start, 0);
    // Copy data from host to device
    // Do nothing
    // Main
    int threadsPerBlock = 8;
    int blocks = (N + threadsPerBlock - 1) / threadsPerBlock; // round up to int
    WriteIndex<<<blocks, threadsPerBlock>>>(d_a, N);
    // Copy data from device to host
    cudaMemcpy(h_a, d_a, N * sizeof(int), cudaMemcpyDeviceToHost);
    // Elapsed time
    cudaEventRecord(stop, 0);
    // Wait until cudaEventRecord is completed.
    // (Note: cudaEventRecord is an asynchronous function)
    cudaEventSynchronize(stop);
    // Display results
    for (int i = 0; i < N; i++) {
        printf("%d\n", h_a[i]);
    }
    cudaEventElapsedTime(&time_ms, start, stop);
    printf("exe time: %f ms\n", time_ms);
    // Free memory
    delete [] h_a;
    cudaFree(d_a);
    cudaEventDestroy(start);
    cudaEventDestroy(stop);
    return 0;
}
blockIdx.x is the block ID: each block is numbered from 0 to (number of blocks - 1). blockDim.x is the number of threads per block, and threadIdx.x is the thread number within its block. For example, with 8 threads per block, the thread with threadIdx.x = 3 in the block with blockIdx.x = 1 gets the global index i = 8 * 1 + 3 = 11.
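To see this mapping directly, a small throwaway kernel (hypothetical, not one of the exercise files) can print the three built-in values and the resulting global index using device-side printf:
#include <stdio.h>
// Each thread prints its block ID, block size, thread ID, and global index.
__global__ void PrintIndex()
{
    int i = blockDim.x * blockIdx.x + threadIdx.x;
    printf("blockIdx.x=%d blockDim.x=%d threadIdx.x=%d -> i=%d\n",
           blockIdx.x, blockDim.x, threadIdx.x, i);
}
int main()
{
    PrintIndex<<<2, 8>>>();  // 2 blocks x 8 threads = 16 threads
    cudaDeviceSynchronize(); // wait for the kernel so its output is flushed
    return 0;
}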
$ nvcc -O3 -arch=sm_60 ex1-cuda2.cu
$ qsub cuda.nqs
#!/bin/bash
#PBS -q LECTURE
#PBS -l elapstim_req=00:00:10,cpunum_job=1,gpunum_job=1
cd $PBS_O_WORKDIR
nvprof ./a.out
$ qsub prof.nqs
-bash-4.2$ cat prof.nqs.e389728
==92462== NVPROF is profiling process 92462, command: ./a.out
==92462== Profiling application: ./a.out
==92462== Profiling result:
            Type  Time(%)      Time     Calls       Avg       Min       Max  Name
 GPU activities:   60.75%  2.0800us         1  2.0800us  2.0800us  2.0800us  WriteIndex(int*, int)
                   39.25%  1.3440us         1  1.3440us  1.3440us  1.3440us  [CUDA memcpy DtoH]
      API calls:   99.75%  520.14ms         1  520.14ms  520.14ms  520.14ms  cudaMalloc
                    0.13%  682.87us        94  7.2640us     114ns  282.80us  cuDeviceGetAttribute
                    0.04%  214.33us         1  214.33us  214.33us  214.33us  cudaFree
                    0.04%  209.93us         1  209.93us  209.93us  209.93us  cuDeviceTotalMem
                    0.01%  56.766us         1  56.766us  56.766us  56.766us  cuDeviceGetName
                    0.01%  45.203us         1  45.203us  45.203us  45.203us  cudaLaunch
                    0.01%  30.483us         1  30.483us  30.483us  30.483us  cudaMemcpy
                    0.00%  11.443us         2  5.7210us  2.2800us  9.1630us  cudaEventRecord
                    0.00%  8.3070us         2  4.1530us     682ns  7.6250us  cudaEventCreate
                    0.00%  6.3850us         1  6.3850us  6.3850us  6.3850us  cudaEventSynchronize
                    0.00%  6.1000us         2  3.0500us     193ns  5.9070us  cudaSetupArgument
                    0.00%  5.0610us         1  5.0610us  5.0610us  5.0610us  cudaEventElapsedTime
                    0.00%  3.0570us         3  1.0190us     185ns  2.6390us  cuDeviceGetCount
                    0.00%  1.9980us         2     999ns     517ns  1.4810us  cudaEventDestroy
                    0.00%  1.2720us         1  1.2720us  1.2720us  1.2720us  cudaConfigureCall
                    0.00%     595ns         2     297ns     146ns     449ns  cuDeviceGet
An array of \(2^{25}\) elements has been filled with 0s and 1s at random. Find the sum of this array.
#include <stdio.h>
#include <stdlib.h> // srand, rand
#include <math.h>
// Thrust library
// #include <thrust/reduce.h>
// #include <thrust/device_ptr.h>
// Device parameter
const int NUM_SM = 56;
const int WARPS_PER_SM = 64;
const int WARPS_PER_BLOCK = 32;
const int NUM_BLOCKS = NUM_SM * WARPS_PER_SM / WARPS_PER_BLOCK;
const int THREADS_PER_WARP = 32;
const int THREADS_PER_BLOCK = WARPS_PER_BLOCK * THREADS_PER_WARP;
__global__ void ReductionCascading(int* inArray, unsigned int numElements, int* halfwayResult)
{
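    // Cascading reduction:
    //   (1) Each thread accumulates a strided partial sum over the input.
    //   (2) Each warp reduces its 32 partial sums in shared memory (x).
    //   (3) Lane 0 of every warp stores its warp sum in y, and the first
    //       warp reduces these per-warp sums.
    //   (4) Thread 0 writes the block's partial sum to halfwayResult;
    //       the host then adds up the per-block results.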
    const unsigned int tid = threadIdx.x; // thread id in each block
    const unsigned int wid = tid / THREADS_PER_WARP; // warp id in each block
    const unsigned int lid = tid % THREADS_PER_WARP; // lane id = thread id in each warp
    const unsigned int sid = blockIdx.x * blockDim.x + threadIdx.x; // sequential thread id in the kernel
    const unsigned int numTotalThreads = gridDim.x * blockDim.x;
    int item1 = 0;
    __shared__ int x[WARPS_PER_BLOCK * THREADS_PER_WARP];
    __shared__ int y[WARPS_PER_BLOCK];
    const int numIter = 1 + (numElements - 1) / numTotalThreads; // round up to int
    // Main
    for (int i = 0; i < numIter - 1; i++) {
        item1 += inArray[numTotalThreads * i + sid];
    }
    // Last
    int idx = numTotalThreads * (numIter - 1) + sid;
    if (idx < numElements) {
        item1 += inArray[idx];
    }
    x[tid] = item1;
    __syncthreads();
    // Tree reduction of the 32 partial sums held by each warp.
    // Only lanes 0-15 take part, so a block-wide __syncthreads() must not be
    // called inside this divergent branch; __syncwarp(0x0000ffff) (CUDA 9 and
    // later) synchronizes exactly the participating lanes.
    if (lid < 16) {
        x[tid] += x[tid + 16];
        __syncwarp(0x0000ffff);
        x[tid] += x[tid + 8];
        __syncwarp(0x0000ffff);
        x[tid] += x[tid + 4];
        __syncwarp(0x0000ffff);
        x[tid] += x[tid + 2];
        __syncwarp(0x0000ffff);
        x[tid] += x[tid + 1];
    }
    // Lane 0 of each warp stores its warp's sum.
    if (0 == lid) {
        y[wid] = x[tid];
    }
    __syncthreads();
    // The first 16 threads (all in warp 0) reduce the 32 per-warp sums.
    if (tid < 16) {
        y[tid] += y[tid + 16];
        __syncwarp(0x0000ffff);
        y[tid] += y[tid + 8];
        __syncwarp(0x0000ffff);
        y[tid] += y[tid + 4];
        __syncwarp(0x0000ffff);
        y[tid] += y[tid + 2];
        __syncwarp(0x0000ffff);
        y[tid] += y[tid + 1];
    }
    // Thread 0 writes this block's partial sum.
    if (0 == tid) {
        halfwayResult[blockIdx.x] = y[tid];
    }
    return;
}
int main()
{
    // Init
    int logNumElements = 25; // input_size = 2^25
    int numElements = 1 << logNumElements;
    int* h_input = new int[numElements]; // on host (CPU)
    int* h_halfwayResult = new int[NUM_BLOCKS];
    int h_output = 0;
    int* d_input; // on device (GPU)
    int* d_halfwayResult;
    cudaMalloc(&d_input, numElements * sizeof(int));
    cudaMalloc(&d_halfwayResult, NUM_BLOCKS * sizeof(int));
    srand(0);
    for (unsigned int i = 0; i < numElements; ++i) {
        h_input[i] = (int)(rand() % 2);
    }
    int answer = 0;
    for (int i = 0; i < numElements; i++) {
        answer += h_input[i];
    }
    // Measure the elapsed time
    cudaEvent_t start, stop;
    float time_ms;
    cudaEventCreate(&start);
    cudaEventCreate(&stop);
    // Copy data from host to device
    cudaMemcpy(d_input, h_input, numElements * sizeof(int), cudaMemcpyHostToDevice);
    // Main
    printf("numElements = %d (2^%d)\n", numElements, logNumElements);
    cudaEventRecord(start, 0);
    ReductionCascading<<<NUM_BLOCKS, THREADS_PER_BLOCK>>>(d_input, numElements, d_halfwayResult);
    // Copy data from device to host
    cudaMemcpy(h_halfwayResult, d_halfwayResult, NUM_BLOCKS * sizeof(int), cudaMemcpyDeviceToHost);
    h_output = 0;
    for (int i = 0; i < NUM_BLOCKS; i++) {
        h_output += h_halfwayResult[i];
    }
    // Elapsed time
    cudaEventRecord(stop, 0);
    // Wait until cudaEventRecord is completed.
    // (Note: cudaEventRecord is an asynchronous function)
    cudaEventSynchronize(stop);
    // Display results
    printf("Our result = %d, correct answer = %d\n", h_output, answer);
    cudaEventElapsedTime(&time_ms, start, stop);
    printf(" exe time: %f ms\n", time_ms);
    // Thrust Library
    // int h_outputThrust;
    // thrust::device_ptr<int> d_input_ptr = thrust::device_pointer_cast(d_input);
    // cudaEventRecord(start, 0);
    // h_outputThrust = thrust::reduce(d_input_ptr, d_input_ptr + numElements);
    // cudaEventRecord(stop, 0);
    // // Wait until cudaEventRecord is completed.
    // // (Note: cudaEventRecord is an asynchronous function)
    // cudaEventSynchronize(stop);
    // printf("Thrust result = %d, correct answer = %d\n", h_outputThrust, answer);
    // cudaEventElapsedTime(&time_ms, start, stop);
    // printf(" exe time: %f ms\n", time_ms);
    // Free memory
    delete [] h_input;
    delete [] h_halfwayResult;
    cudaFree(d_input);
    cudaFree(d_halfwayResult);
    cudaEventDestroy(start);
    cudaEventDestroy(stop);
    return 0;
}
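For comparison, the Thrust code that is commented out above can be enabled by uncommenting the two #include lines and the block under "// Thrust Library"; Thrust ships with the CUDA Toolkit, so the same nvcc command builds it. As a separate minimal sketch (the file name ex2-thrust.cu is hypothetical, not part of the exercise files), the whole reduction can also be written with thrust::device_vector and thrust::reduce:
#include <stdio.h>
#include <stdlib.h>
#include <thrust/host_vector.h>
#include <thrust/device_vector.h>
#include <thrust/reduce.h>
int main()
{
    const int numElements = 1 << 25;
    // Fill the host array with random 0/1 values, as in ex2.cu.
    thrust::host_vector<int> h_input(numElements);
    srand(0);
    for (int i = 0; i < numElements; i++) {
        h_input[i] = rand() % 2;
    }
    // Copy to the GPU and reduce on the device (default operator is plus).
    thrust::device_vector<int> d_input = h_input;
    int sum = thrust::reduce(d_input.begin(), d_input.end());
    printf("sum = %d\n", sum);
    return 0;
}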
#!/bin/bash
#PBS -q LECTURE
#PBS -l elapstim_req=00:00:10,cpunum_job=1,gpunum_job=1
cd $PBS_O_WORKDIR
./a.out
$ nvcc -O3 -arch=sm_60 ex2.cu
$ qsub cuda.nqs
#!/bin/bash
#PBS -q LECTURE
#PBS -l elapstim_req=00:00:10,cpunum_job=1,gpunum_job=1
cd $PBS_O_WORKDIR
nvprof ./a.out
$ qsub prof.nqs
#!/bin/bash
#PBS -q LECTURE
#PBS -l elapstim_req=00:00:10,cpunum_job=1,gpunum_job=1
cd $PBS_O_WORKDIR
nvprof --metrics achieved_occupancy ./a.out
$ qsub occupancy.nqs
$ nvcc -O3 -arch=sm_60 ex2.cu
$ qsub cuda.nqs
$ qsub prof.nqs