1. Introduction
I am a beginner in GPGPU programming. To compare CPU and GPU performance, I ran a small experiment using the CUDA Toolkit.
2. Prerequisites
GPU: GeForce GTX 1060 6GB *
CPU: AMD Ryzen 7 3700X 8-Core Processor
OS: CentOS 7.7
CUDA: CUDA 11.2
Programming Language: C
* I chose this GPU based on the list in [1]
3. Installing CUDA
(1) Install CUDA 11.2 (the commands below follow the installation guide [2])
wget https://developer.download.nvidia.com/compute/cuda/11.2.0/local_installers/cuda-repo-rhel7-11-2-local-11.2.0_460.27.04-1.x86_64.rpm
sudo rpm -i cuda-repo-rhel7-11-2-local-11.2.0_460.27.04-1.x86_64.rpm
sudo yum clean all
sudo yum -y install nvidia-driver-latest-dkms cuda
sudo yum -y install cuda-drivers
(2) Check that CUDA is installed
nvcc --version
(3) Check that the GPU is active
nvidia-smi
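As a further check, a short CUDA program can list the devices the runtime sees. Below is a minimal sketch using cudaGetDeviceCount and cudaGetDeviceProperties (the file name “query_gpu.cu” is my own choice):
#include <stdio.h>
#include <cuda_runtime.h>
int main(void){
    int count = 0;
    if(cudaGetDeviceCount(&count) != cudaSuccess || count == 0){
        fprintf(stderr, "No CUDA-capable device found\n");
        return 1;
    }
    for(int dev = 0; dev < count; dev++){
        cudaDeviceProp prop;
        cudaGetDeviceProperties(&prop, dev);
        // Name, compute capability, and total global memory of each device
        printf("Device %d: %s (compute %d.%d, %.1f GB)\n",
               dev, prop.name, prop.major, prop.minor,
               prop.totalGlobalMem / (1024.0 * 1024.0 * 1024.0));
    }
    return 0;
}
Build and run with: nvcc ./query_gpu.cu -o ./query_gpu && ./query_gpu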
4. Building and Running the Programs
4.1 Program using CPU
(1) Create the following source file “calculate_cpu.c”. This program combines two input vectors into one output vector, computing each output element from the two corresponding input elements.
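Written out as a formula, each output element is
out[i] = log(1 + exp(-sqrt( log(1 + |10 sin(a[i]) cos(a[i])|) + log(1 + |10 sin(b[i]) cos(b[i])|) )))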
#include <stdio.h>
#include <stdlib.h>
#include <time.h>
#include <math.h>
void vec_func(float *out, float *a, float *b, long long n) {
    for(long long i = 0; i < n; i++){
        // fabs, not abs: abs() from <stdlib.h> takes an int and would truncate the argument
        out[i] = log1p(expf(-sqrt(log1p(fabs(10 * sin(a[i]) * cos(a[i]))) + log1p(fabs(10 * sin(b[i]) * cos(b[i]))))));
    }
}
int main(int argc, char *argv[]){
    float *a, *b, *out;
    clock_t start, end;
    if(argc < 2){
        fprintf(stderr, "usage: %s N [print]\n", argv[0]);
        return 1;
    }
    long long N = atoll(argv[1]);

    // Allocate memory
    a = (float*)malloc(sizeof(float) * N);
    b = (float*)malloc(sizeof(float) * N);
    out = (float*)malloc(sizeof(float) * N);

    // Initialize arrays with random values in [0, 1]
    for(long long i = 0; i < N; i++){
        a[i] = rand() / (float)RAND_MAX;
        b[i] = rand() / (float)RAND_MAX;
        out[i] = 0.0f;
    }

    // start
    start = clock();
    // Execute the function under test
    vec_func(out, a, b, N);
    // end
    end = clock();
    printf("%f\n", ((double)(end - start)) / CLOCKS_PER_SEC);

    // Optionally dump the results when a second argument is given
    if(argc > 2){
        for(long long i = 0; i < N; i++){
            printf("a:%f b:%f out:%f\n", a[i], b[i], out[i]);
        }
    }

    // Deallocate host memory
    free(a);
    free(b);
    free(out);
    return 0;
}
(2) Build and Run. In my setup, the elapsed time of the vec_func call is 51.52 seconds.
gcc -std=c99 ./calculate_cpu.c -o ./calculate_cpu -lm
./calculate_cpu 500000000
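One caveat on the measurement: clock() returns processor time, which matches wall-clock time here only because the program is single-threaded. For a wall-clock measurement, clock_gettime(CLOCK_MONOTONIC) is the more robust choice; a sketch of the replacement timing code (build with -std=gnu99, or add -D_POSIX_C_SOURCE=199309L under strict C99):
struct timespec ts_start, ts_end;
clock_gettime(CLOCK_MONOTONIC, &ts_start);
vec_func(out, a, b, N);
clock_gettime(CLOCK_MONOTONIC, &ts_end);
// Elapsed wall-clock time in seconds
double elapsed = (ts_end.tv_sec - ts_start.tv_sec)
               + (ts_end.tv_nsec - ts_start.tv_nsec) / 1e9;
printf("%f\n", elapsed);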
4.2 Program using GPU
(1) Create the following source file “calculate_gpu.cu”. The per-element computation from the CPU version moves into a __global__ kernel.
#include <stdio.h>
#include <stdlib.h>
#include <cuda.h>
#include <time.h>
#include <math.h>
#include <cuda_runtime.h>
__global__ void vec_func(float *out, float *a, float *b, long long n) {
    long long i = (long long)blockIdx.x * blockDim.x + threadIdx.x;
    // Guard: the grid is rounded up, so the last block may run past the end of the arrays
    if(i < n){
        out[i] = log1p(expf(-sqrt(log1p(fabs(10 * sin(a[i]) * cos(a[i]))) + log1p(fabs(10 * sin(b[i]) * cos(b[i]))))));
    }
}
int main(int argc, char *argv[]){
    float *a, *b, *out;
    float *d_a, *d_b, *d_out;
    cudaEvent_t start, stop;
    cudaEventCreate(&start);
    cudaEventCreate(&stop);
    if(argc < 2){
        fprintf(stderr, "usage: %s N [print]\n", argv[0]);
        return 1;
    }
    long long N = atoll(argv[1]);

    // Allocate host memory
    a = (float*)malloc(sizeof(float) * N);
    b = (float*)malloc(sizeof(float) * N);
    out = (float*)malloc(sizeof(float) * N);

    // Initialize arrays with random values in [0, 1]
    for(long long i = 0; i < N; i++){
        a[i] = rand() / (float)RAND_MAX;
        b[i] = rand() / (float)RAND_MAX;
        out[i] = 0.0f;
    }

    // Allocate device memory
    cudaMalloc((void**)&d_a, sizeof(float) * N);
    cudaMalloc((void**)&d_b, sizeof(float) * N);
    cudaMalloc((void**)&d_out, sizeof(float) * N);

    // Transfer data from host to device memory
    cudaMemcpy(d_a, a, sizeof(float) * N, cudaMemcpyHostToDevice);
    cudaMemcpy(d_b, b, sizeof(float) * N, cudaMemcpyHostToDevice);

    // start
    cudaEventRecord(start);
    // Launch with 1024 threads per block; round the block count up so that
    // all N elements are covered (N / 1024 alone would drop the tail elements)
    vec_func<<<(N + 1023) / 1024, 1024>>>(d_out, d_a, d_b, N);
    // end
    cudaEventRecord(stop);
    cudaEventSynchronize(stop);

    // elapsed time
    float milliseconds = 0;
    cudaEventElapsedTime(&milliseconds, start, stop);
    cudaEventDestroy(start);
    cudaEventDestroy(stop);
    printf("vec_func: %f ms\n", milliseconds);

    // Transfer the result back to host memory
    cudaMemcpy(out, d_out, sizeof(float) * N, cudaMemcpyDeviceToHost);

    // Optionally dump the results when a second argument is given
    if(argc > 2){
        for(long long i = 0; i < N; i++){
            printf("a:%f b:%f out:%f\n", a[i], b[i], out[i]);
        }
    }

    // Deallocate device memory
    cudaFree(d_a);
    cudaFree(d_b);
    cudaFree(d_out);

    // Deallocate host memory
    free(a);
    free(b);
    free(out);
    return 0;
}
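None of the CUDA API calls above check their return codes. With N = 500000000 the three device buffers take about 5.6 GB, close to this card's 6 GB, so a failed cudaMalloc would otherwise go unnoticed. A common guard is a small checking macro; a minimal sketch (the name CUDA_CHECK is my own choice, not part of the CUDA API):
#include <stdio.h>
#include <stdlib.h>
#include <cuda_runtime.h>
// Abort with file/line and a readable message when a CUDA call fails
#define CUDA_CHECK(call) \
    do { \
        cudaError_t err = (call); \
        if (err != cudaSuccess) { \
            fprintf(stderr, "CUDA error at %s:%d: %s\n", \
                    __FILE__, __LINE__, cudaGetErrorString(err)); \
            exit(1); \
        } \
    } while (0)
Usage, for example: CUDA_CHECK(cudaMalloc((void**)&d_a, sizeof(float) * N));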
(2) Build and Run. In my setup, the elapsed time of the vec_func kernel is 0.049 seconds.
nvcc ./calculate_gpu.cu -o ./calculate_gpu
./calculate_gpu 500000000
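Note that the cudaEvent timing above brackets only the kernel launch, so the 49 ms excludes the host-to-device and device-to-host copies; with roughly 5.6 GB moving over PCIe, those transfers likely dominate the end-to-end time. To measure the whole round trip instead, the same events can bracket the copies as well; a sketch of the rearranged timing section:
// Record around the transfers as well as the kernel
cudaEventRecord(start);
cudaMemcpy(d_a, a, sizeof(float) * N, cudaMemcpyHostToDevice);
cudaMemcpy(d_b, b, sizeof(float) * N, cudaMemcpyHostToDevice);
vec_func<<<(N + 1023) / 1024, 1024>>>(d_out, d_a, d_b, N);
cudaMemcpy(out, d_out, sizeof(float) * N, cudaMemcpyDeviceToHost);
cudaEventRecord(stop);
cudaEventSynchronize(stop);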
5. References
[1] CUDA-Enabled GeForce and TITAN Products
https://developer.nvidia.com/cuda-gpus
[2] NVIDIA CUDA Installation Guide for Linux
https://docs.nvidia.com/cuda/cuda-installation-guide-linux/index.html
[3] CUDA C/C++ Basics Supercomputing 2011 Tutorial
https://www.nvidia.com/docs/IO/116711/sc11-cuda-c-basics.pdf