1. Setup
OS: CentOS7
CPU: Ryzen 7 3700X
Memory: TEAM DDR4 3200MHz PC4-25600 32GB x 4
2. Source
(1) no_simd.c, which multiplies two vectors element-wise without SIMD
#include <stdio.h>
#include <stdlib.h>
#include <time.h>
/*
 * Scalar element-wise product: ret[i] = v1[i] * v2[i] for i in [0, vec_len).
 *
 * vec_len  number of elements to process; nothing is written when it is <= 0
 * ret      output array, at least vec_len floats
 * v1, v2   input arrays, at least vec_len floats each
 */
void normal_mul(long long vec_len, float *ret, const float *v1, const float *v2){
    float *out = ret;
    const float *lhs = v1;
    const float *rhs = v2;
    long long remaining = vec_len;

    /* Walk the three arrays in lock-step, one float per iteration. */
    while (remaining > 0) {
        *out = (*lhs) * (*rhs);
        out++;
        lhs++;
        rhs++;
        remaining--;
    }
}
/*
 * Benchmark driver for the scalar multiply.
 *
 * Usage: no_simd <vec_len> [print]
 *   vec_len  number of floats per vector (non-negative decimal integer)
 *   print    if any second argument is given, every result element is
 *            printed, one per line
 *
 * Fills two vectors with pseudo-random values in [0, 1], times a single call
 * to normal_mul() and prints the elapsed CPU time.
 *
 * Returns 0 on success, or EXIT_FAILURE (with a message on stderr) when the
 * argument is missing or invalid, or when the vectors cannot be allocated.
 */
int main(int argc, char *argv[]){
    if (argc < 2) {
        fprintf(stderr, "usage: %s <vec_len> [print]\n",
                argc > 0 ? argv[0] : "no_simd");
        return EXIT_FAILURE;
    }

    /* strtoll, unlike atoll, lets us reject non-numeric input and trailing junk. */
    char *parse_end = NULL;
    const long long vec_len = strtoll(argv[1], &parse_end, 10);
    if (parse_end == argv[1] || *parse_end != '\0' || vec_len < 0) {
        fprintf(stderr, "invalid vector length: %s\n", argv[1]);
        return EXIT_FAILURE;
    }

    /* Guard the byte-count multiplication below against size_t overflow
     * (this also catches the LLONG_MAX that strtoll returns on range error). */
    if ((unsigned long long)vec_len > (size_t)-1 / sizeof(float)) {
        fprintf(stderr, "vector length too large: %s\n", argv[1]);
        return EXIT_FAILURE;
    }
    const size_t byte_count = sizeof(float) * (size_t)vec_len;

    float *v1 = malloc(byte_count);
    float *v2 = malloc(byte_count);
    float *ret = malloc(byte_count);
    /* malloc(0) may legitimately return NULL, so only a non-empty request can fail. */
    if (vec_len > 0 && (v1 == NULL || v2 == NULL || ret == NULL)) {
        fprintf(stderr, "failed to allocate 3 x %zu bytes\n", byte_count);
        free(v1);
        free(v2);
        free(ret);
        return EXIT_FAILURE;
    }

    /* Fixed seed for reproducible input (the former srand(0.5) was silently
     * converted to this same value, 0). */
    srand(0);
    for (long long elem_idx = 0; elem_idx < vec_len; elem_idx++) {
        v1[elem_idx] = rand() / (float)RAND_MAX;
        v2[elem_idx] = rand() / (float)RAND_MAX;
        /* Writing ret here touches every output element before the timed region. */
        ret[elem_idx] = 0.0f;
    }

    /* clock() reports processor time used by the program, not wall-clock time. */
    const clock_t start = clock();
    normal_mul(vec_len, ret, v1, v2);
    const clock_t end = clock();
    printf("%.2f sec elapsed\n", (double)(end - start) / CLOCKS_PER_SEC);

    if (argc > 2) {
        for (long long elem_idx = 0; elem_idx < vec_len; elem_idx++) {
            printf("%f\n", ret[elem_idx]);
        }
    }

    free(v1);
    free(v2);
    free(ret);
    return 0;
}
(2) use_simd.c, which multiplies two vectors element-wise with SIMD (256-bit AVX)
#include <stdio.h>
#include <stdlib.h>
#include <time.h>
//AVX
#include <immintrin.h>
/*
 * Element-wise product ret[i] = v1[i] * v2[i] for i in [0, vec_len).
 *
 * Full groups of eight floats are multiplied with one 256-bit AVX instruction;
 * the remaining vec_len % 8 elements are handled by a scalar loop, so every
 * element is computed even when vec_len is not a multiple of eight.
 *
 * vec_len  number of elements to process; nothing is written when it is <= 0
 * ret      output array, at least vec_len floats
 * v1, v2   input arrays, at least vec_len floats each
 *
 * Unaligned load/store intrinsics are used, so the pointers are not required
 * to be 32-byte aligned (aligned buffers still work).
 */
#if defined(__GNUC__)
/* Generate AVX code for this function even if -mavx is not passed. */
__attribute__((target("avx")))
#endif
void simd_mul(const long long vec_len, float *ret, const float *v1, const float *v2){
    /* One __m256 register holds 8 single-precision (32-bit) floats. */
    enum { FLOATS_PER_VEC = 8 };
    long long idx = 0;

    /* Vector part: eight products per iteration. */
    for (; vec_len - idx >= FLOATS_PER_VEC; idx += FLOATS_PER_VEC) {
        const __m256 lhs = _mm256_loadu_ps(v1 + idx);
        const __m256 rhs = _mm256_loadu_ps(v2 + idx);
        _mm256_storeu_ps(ret + idx, _mm256_mul_ps(lhs, rhs));
    }

    /* Scalar tail: the last vec_len % 8 elements. */
    for (; idx < vec_len; idx++) {
        ret[idx] = v1[idx] * v2[idx];
    }
}
/*
 * Benchmark driver for the AVX multiply.
 *
 * Usage: use_simd <vec_len> [print]
 *   vec_len  number of floats per vector (non-negative decimal integer)
 *   print    if any second argument is given, every element is printed as
 *            "v1 v2 ret", one element per line
 *
 * Fills two 32-byte-aligned vectors with pseudo-random values in [0, 1], times
 * a single call to simd_mul() and prints the elapsed CPU time.
 *
 * Returns 0 on success, or EXIT_FAILURE (with a message on stderr) when the
 * argument is missing or invalid, or when the vectors cannot be allocated.
 */
int main(int argc, char *argv[]){
    if (argc < 2) {
        fprintf(stderr, "usage: %s <vec_len> [print]\n",
                argc > 0 ? argv[0] : "use_simd");
        return EXIT_FAILURE;
    }

    /* strtoll, unlike atoll, lets us reject non-numeric input and trailing junk. */
    char *parse_end = NULL;
    const long long vec_len = strtoll(argv[1], &parse_end, 10);
    if (parse_end == argv[1] || *parse_end != '\0' || vec_len < 0) {
        fprintf(stderr, "invalid vector length: %s\n", argv[1]);
        return EXIT_FAILURE;
    }

    /* Guard the byte-count multiplication below against size_t overflow
     * (this also catches the LLONG_MAX that strtoll returns on range error). */
    if ((unsigned long long)vec_len > (size_t)-1 / sizeof(float)) {
        fprintf(stderr, "vector length too large: %s\n", argv[1]);
        return EXIT_FAILURE;
    }
    const size_t byte_count = sizeof(float) * (size_t)vec_len;

    /* 32-byte alignment matches the width of a 256-bit AVX vector. */
    float *v1 = (float *)_mm_malloc(byte_count, 32);
    float *v2 = (float *)_mm_malloc(byte_count, 32);
    float *ret = (float *)_mm_malloc(byte_count, 32);
    /* A zero-byte request may return NULL, so only a non-empty request can fail. */
    if (vec_len > 0 && (v1 == NULL || v2 == NULL || ret == NULL)) {
        fprintf(stderr, "failed to allocate 3 x %zu bytes\n", byte_count);
        _mm_free(v1);
        _mm_free(v2);
        _mm_free(ret);
        return EXIT_FAILURE;
    }

    /* Fixed seed for reproducible input (the former srand(0.5) was silently
     * converted to this same value, 0). */
    srand(0);
    for (long long elem_idx = 0; elem_idx < vec_len; elem_idx++) {
        v1[elem_idx] = rand() / (float)RAND_MAX;
        v2[elem_idx] = rand() / (float)RAND_MAX;
        /* Writing ret here touches every output element before the timed region. */
        ret[elem_idx] = 0.0f;
    }

    /* clock() reports processor time used by the program, not wall-clock time. */
    const clock_t start = clock();
    simd_mul(vec_len, ret, v1, v2);
    const clock_t end = clock();
    printf("%.2f sec elapsed\n", (double)(end - start) / CLOCKS_PER_SEC);

    if (argc > 2) {
        for (long long elem_idx = 0; elem_idx < vec_len; elem_idx++) {
            printf("%f %f %f\n", v1[elem_idx], v2[elem_idx], ret[elem_idx]);
        }
    }

    /* Memory obtained from _mm_malloc must be released with _mm_free. */
    _mm_free(v1);
    _mm_free(v2);
    _mm_free(ret);
    return 0;
}
3. Build and Run
(1) Build no_simd.c and run it. In my setup, the third command (3,200,000,000 elements) takes 6.5 sec. If we add the option -O2 to the first command (the gcc build), it takes 1.6 sec.
gcc ./no_simd.c -o ./no_simd -std=c99
./no_simd 32 print
./no_simd 3200000000
(2) Build use_simd.c and run it. In my setup, the third command (3,200,000,000 elements) takes 1.6 sec. If we add the option -O2 to the first command (the gcc build), it still takes 1.6 sec.
gcc ./use_simd.c -o ./use_simd -std=c99 -mavx
./use_simd 32 print
./use_simd 3200000000
4. Additional notes
If we change the multiplication to addition, we get results similar to the above.
5. References
[1] Intel Intrinsics Guide
https://software.intel.com/sites/landingpage/IntrinsicsGuide/