1. Introduction
I am a beginner in GPGPU programming. To compare CPU and GPU performance, I ran a small experiment using the CUDA Toolkit.
2. Prerequisites
GPU: GeForce GTX 1060 6GB *
CPU: AMD Ryzen 7 3700X 8-Core Processor
OS: CentOS 7.7
CUDA: CUDA 11.2
Programming Language: C
* I chose this GPU based on the list in [1]
3. Installing CUDA
(1) Install CUDA 11.2 (the commands below follow the installation guide [2])
wget https://developer.download.nvidia.com/compute/cuda/11.2.0/local_installers/cuda-repo-rhel7-11-2-local-11.2.0_460.27.04-1.x86_64.rpm
sudo rpm -i cuda-repo-rhel7-11-2-local-11.2.0_460.27.04-1.x86_64.rpm
sudo yum clean all
sudo yum -y install nvidia-driver-latest-dkms cuda
sudo yum -y install cuda-drivers
(2) Check that CUDA is installed
nvcc --version
(3) Check that the GPU is active
nvidia-smi
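As a further check, a short CUDA program can list the devices the runtime sees. Below is a minimal sketch using cudaGetDeviceCount and cudaGetDeviceProperties (the file name “query_gpu.cu” is my own choice):
#include <stdio.h>
#include <cuda_runtime.h>
int main(void){
    int count = 0;
    if(cudaGetDeviceCount(&count) != cudaSuccess || count == 0){
        fprintf(stderr, "No CUDA-capable device found\n");
        return 1;
    }
    for(int dev = 0; dev < count; dev++){
        cudaDeviceProp prop;
        cudaGetDeviceProperties(&prop, dev);
        // Name, compute capability, and total global memory of each device
        printf("Device %d: %s (compute %d.%d, %.1f GB)\n",
               dev, prop.name, prop.major, prop.minor,
               prop.totalGlobalMem / (1024.0 * 1024.0 * 1024.0));
    }
    return 0;
}
Build and run with: nvcc ./query_gpu.cu -o ./query_gpu && ./query_gpu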
4. Building and Running the Programs
4.1 Program using CPU
(1) Create the following source file “calculate_cpu.c”. This program combines two input vectors into one output vector, computing each output element from the two corresponding input elements.
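Written out as a formula, each output element is
out[i] = log(1 + exp(-sqrt( log(1 + |10 sin(a[i]) cos(a[i])|) + log(1 + |10 sin(b[i]) cos(b[i])|) )))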
#include <stdio.h>
#include <stdlib.h>
#include <time.h>
#include <math.h>
void vec_func(float *out, float *a, float *b, long long n) {
    for(long long i = 0; i < n; i++){
        // fabs, not abs: abs() from <stdlib.h> takes an int and would truncate the argument
        out[i] = log1p(expf(-sqrt(log1p(fabs(10 * sin(a[i]) * cos(a[i]))) + log1p(fabs(10 * sin(b[i]) * cos(b[i]))))));
    }
}
int main(int argc, char *argv[]){
    float *a, *b, *out;
    clock_t start, end;
    if(argc < 2){
        fprintf(stderr, "usage: %s N [print]\n", argv[0]);
        return 1;
    }
    long long N = atoll(argv[1]);

    // Allocate memory
    a = (float*)malloc(sizeof(float) * N);
    b = (float*)malloc(sizeof(float) * N);
    out = (float*)malloc(sizeof(float) * N);

    // Initialize arrays with random values in [0, 1]
    for(long long i = 0; i < N; i++){
        a[i] = rand() / (float)RAND_MAX;
        b[i] = rand() / (float)RAND_MAX;
        out[i] = 0.0f;
    }

    // start
    start = clock();
    // Execute the function under test
    vec_func(out, a, b, N);
    // end
    end = clock();
    printf("%f\n", ((double)(end - start)) / CLOCKS_PER_SEC);

    // Optionally dump the results when a second argument is given
    if(argc > 2){
        for(long long i = 0; i < N; i++){
            printf("a:%f b:%f out:%f\n", a[i], b[i], out[i]);
        }
    }

    // Deallocate host memory
    free(a);
    free(b);
    free(out);
    return 0;
}
(2) Build and Run. In my setup, the elapsed time of the vec_func call is 51.52 seconds.
gcc -std=c99 ./calculate_cpu.c -o ./calculate_cpu -lm
./calculate_cpu 500000000
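One caveat on the measurement: clock() returns processor time, which matches wall-clock time here only because the program is single-threaded. For a wall-clock measurement, clock_gettime(CLOCK_MONOTONIC) is the more robust choice; a sketch of the replacement timing code (build with -std=gnu99, or add -D_POSIX_C_SOURCE=199309L under strict C99):
struct timespec ts_start, ts_end;
clock_gettime(CLOCK_MONOTONIC, &ts_start);
vec_func(out, a, b, N);
clock_gettime(CLOCK_MONOTONIC, &ts_end);
// Elapsed wall-clock time in seconds
double elapsed = (ts_end.tv_sec - ts_start.tv_sec)
               + (ts_end.tv_nsec - ts_start.tv_nsec) / 1e9;
printf("%f\n", elapsed);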
4.2 Program using GPU
(1) Create the following source file “calculate_gpu.cu”. The per-element computation from the CPU version moves into a __global__ kernel.
#include <stdio.h>
#include <stdlib.h>
#include <cuda.h>
#include <time.h>
#include <math.h>
#include <cuda_runtime.h>
__global__ void vec_func(float *out, float *a, float *b, long long n) {
    long long i = (long long)blockIdx.x * blockDim.x + threadIdx.x;
    // Guard: the grid is rounded up, so the last block may run past the end of the arrays
    if(i < n){
        out[i] = log1p(expf(-sqrt(log1p(fabs(10 * sin(a[i]) * cos(a[i]))) + log1p(fabs(10 * sin(b[i]) * cos(b[i]))))));
    }
}
int main(int argc, char *argv[]){
    float *a, *b, *out;
    float *d_a, *d_b, *d_out;
    cudaEvent_t start, stop;
    cudaEventCreate(&start);
    cudaEventCreate(&stop);
    if(argc < 2){
        fprintf(stderr, "usage: %s N [print]\n", argv[0]);
        return 1;
    }
    long long N = atoll(argv[1]);

    // Allocate host memory
    a = (float*)malloc(sizeof(float) * N);
    b = (float*)malloc(sizeof(float) * N);
    out = (float*)malloc(sizeof(float) * N);

    // Initialize arrays with random values in [0, 1]
    for(long long i = 0; i < N; i++){
        a[i] = rand() / (float)RAND_MAX;
        b[i] = rand() / (float)RAND_MAX;
        out[i] = 0.0f;
    }

    // Allocate device memory
    cudaMalloc((void**)&d_a, sizeof(float) * N);
    cudaMalloc((void**)&d_b, sizeof(float) * N);
    cudaMalloc((void**)&d_out, sizeof(float) * N);

    // Transfer data from host to device memory
    cudaMemcpy(d_a, a, sizeof(float) * N, cudaMemcpyHostToDevice);
    cudaMemcpy(d_b, b, sizeof(float) * N, cudaMemcpyHostToDevice);

    // start
    cudaEventRecord(start);
    // Launch with 1024 threads per block; round the block count up so that
    // all N elements are covered (N / 1024 alone would drop the tail elements)
    vec_func<<<(N + 1023) / 1024, 1024>>>(d_out, d_a, d_b, N);
    // end
    cudaEventRecord(stop);
    cudaEventSynchronize(stop);

    // elapsed time
    float milliseconds = 0;
    cudaEventElapsedTime(&milliseconds, start, stop);
    cudaEventDestroy(start);
    cudaEventDestroy(stop);
    printf("vec_func: %f ms\n", milliseconds);

    // Transfer the result back to host memory
    cudaMemcpy(out, d_out, sizeof(float) * N, cudaMemcpyDeviceToHost);

    // Optionally dump the results when a second argument is given
    if(argc > 2){
        for(long long i = 0; i < N; i++){
            printf("a:%f b:%f out:%f\n", a[i], b[i], out[i]);
        }
    }

    // Deallocate device memory
    cudaFree(d_a);
    cudaFree(d_b);
    cudaFree(d_out);

    // Deallocate host memory
    free(a);
    free(b);
    free(out);
    return 0;
}
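None of the CUDA API calls above check their return codes. With N = 500000000 the three device buffers take about 5.6 GB, close to this card's 6 GB, so a failed cudaMalloc would otherwise go unnoticed. A common guard is a small checking macro; a minimal sketch (the name CUDA_CHECK is my own choice, not part of the CUDA API):
#include <stdio.h>
#include <stdlib.h>
#include <cuda_runtime.h>
// Abort with file/line and a readable message when a CUDA call fails
#define CUDA_CHECK(call) \
    do { \
        cudaError_t err = (call); \
        if (err != cudaSuccess) { \
            fprintf(stderr, "CUDA error at %s:%d: %s\n", \
                    __FILE__, __LINE__, cudaGetErrorString(err)); \
            exit(1); \
        } \
    } while (0)
Usage, for example: CUDA_CHECK(cudaMalloc((void**)&d_a, sizeof(float) * N));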
(2) Build and Run. In my setup, the elapsed time of the vec_func kernel is 0.049 seconds.
nvcc ./calculate_gpu.cu -o ./calculate_gpu
./calculate_gpu 500000000
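Note that the cudaEvent timing above brackets only the kernel launch, so the 49 ms excludes the host-to-device and device-to-host copies; with roughly 5.6 GB moving over PCIe, those transfers likely dominate the end-to-end time. To measure the whole round trip instead, the same events can bracket the copies as well; a sketch of the rearranged timing section:
// Record around the transfers as well as the kernel
cudaEventRecord(start);
cudaMemcpy(d_a, a, sizeof(float) * N, cudaMemcpyHostToDevice);
cudaMemcpy(d_b, b, sizeof(float) * N, cudaMemcpyHostToDevice);
vec_func<<<(N + 1023) / 1024, 1024>>>(d_out, d_a, d_b, N);
cudaMemcpy(out, d_out, sizeof(float) * N, cudaMemcpyDeviceToHost);
cudaEventRecord(stop);
cudaEventSynchronize(stop);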
5. References
[1] CUDA-Enabled GeForce and TITAN Products
https://developer.nvidia.com/cuda-gpus
[2] NVIDIA CUDA Installation Guide for Linux
https://docs.nvidia.com/cuda/cuda-installation-guide-linux/index.html
[3] CUDA C/C++ Basics Supercomputing 2011 Tutorial
https://www.nvidia.com/docs/IO/116711/sc11-cuda-c-basics.pdf