SIMD sample code using AVX

1. Setup

OS: CentOS7
CPU: Ryzen 7 3700X
Memory: TEAM DDR4 3200MHz PC4-25600 32GB x 4

2. Source

(1) no_simd.c, which multiplies two vectors element-wise without SIMD

#include <stdio.h>
#include <stdlib.h>
#include <time.h>

// Element-wise product of two float vectors, computed one element at a time.
// This is the scalar baseline that the SIMD version is compared against.
//
//   vec_len: number of elements to process in each array
//   ret:     output array; ret[i] receives v1[i] * v2[i]
//   v1, v2:  input arrays
//
// Nothing is written when vec_len is zero or negative.
void normal_mul(long long vec_len, float *ret, const float *v1, const float *v2){
  const float *lhs = v1;
  const float *rhs = v2;
  float *out = ret;

  // Walk the three arrays in lockstep, counting down the elements left.
  for(long long remaining = vec_len; remaining > 0; remaining--){
    *out = (*lhs) * (*rhs);
    out++;
    lhs++;
    rhs++;
  }
}

// Benchmark driver for normal_mul.
//
// Usage: no_simd vec_len [print]
//   argv[1]: number of elements per vector; must be a positive integer
//   argv[2]: optional; when any second argument is present, every element
//            of the result vector is printed after the timing line
//
// Returns 0 on success, EXIT_FAILURE on a missing/invalid argument or when
// the vectors cannot be allocated.
int main(int argc, char *argv[]){
  if(argc < 2){
    fprintf(stderr, "usage: %s vec_len [print]\n", argc > 0 ? argv[0] : "no_simd");
    return EXIT_FAILURE;
  }

  // strtoll (unlike atoll) lets us reject non-numeric input and trailing junk.
  char *parse_end = NULL;
  const long long vec_len = strtoll(argv[1], &parse_end, 10);
  if(parse_end == argv[1] || *parse_end != '\0' || vec_len <= 0){
    fprintf(stderr, "vec_len must be a positive integer: %s\n", argv[1]);
    return EXIT_FAILURE;
  }

  // Guard the sizeof(float) * vec_len byte count against size_t overflow.
  if((unsigned long long)vec_len > (size_t)-1 / sizeof(float)){
    fprintf(stderr, "vec_len is too large: %s\n", argv[1]);
    return EXIT_FAILURE;
  }

  const size_t vec_bytes = sizeof(float) * (size_t)vec_len;
  float *v1 = malloc(vec_bytes);
  float *v2 = malloc(vec_bytes);
  float *ret = malloc(vec_bytes);
  if(v1 == NULL || v2 == NULL || ret == NULL){
    fprintf(stderr, "failed to allocate three vectors of %lld floats\n", vec_len);
    free(v1);
    free(v2);
    free(ret);
    return EXIT_FAILURE;
  }

  // Fixed seed so every run multiplies the same data. The original
  // srand(0.5) was implicitly converted to this same value, 0.
  srand(0);
  for(long long elem_idx = 0; elem_idx < vec_len; elem_idx++){
    v1[elem_idx] = rand() / (float)RAND_MAX;
    v2[elem_idx] = rand() / (float)RAND_MAX;
    ret[elem_idx] = 0.0f;
  }

  // Only the multiplication itself is timed (CPU time, via clock()).
  const clock_t start = clock();
  normal_mul(vec_len, ret, v1, v2);
  const clock_t end = clock();
  printf("%.2f sec elapsed\n", (double)(end - start) / CLOCKS_PER_SEC);

  if(argc > 2){
    for(long long elem_idx = 0; elem_idx < vec_len; elem_idx++){
      printf("%f\n", ret[elem_idx]);
    }
  }

  free(v1);
  free(v2);
  free(ret);

  return 0;
}

(2) use_simd.c, which multiplies two vectors element-wise with SIMD (256-bit AVX)

#include <stdio.h>
#include <stdlib.h>
#include <time.h>

//AVX
#include <immintrin.h>

// Element-wise product of two float vectors using 256-bit AVX.
//
//   vec_len: number of float elements to process in each array
//   ret:     output array; ret[i] receives v1[i] * v2[i]
//   v1, v2:  input arrays
//
// Eight floats are multiplied per AVX instruction. When vec_len is not a
// multiple of 8, the remaining elements are multiplied one by one so that
// every element of ret up to vec_len is written. Unaligned load/store
// intrinsics are used, so the arrays do not have to be 32-byte aligned
// (aligned buffers, e.g. from _mm_malloc(..., 32), work as before).
// Nothing is written when vec_len is zero or negative.
void simd_mul(const long long vec_len, float *ret, const float *v1, const float *v2){
  //A 256-bit AVX vector holds 8 single-precision (32-bit) floats
  static const long long floats_per_vec = 8;
  //First index that is not covered by a whole 8-float vector
  const long long simd_end = vec_len - (vec_len % floats_per_vec);
  long long elem_idx = 0;

  for(; elem_idx < simd_end; elem_idx += floats_per_vec){
    const __m256 vec1 = _mm256_loadu_ps(v1 + elem_idx);
    const __m256 vec2 = _mm256_loadu_ps(v2 + elem_idx);
    //Multiply packed single-precision (32-bit) floating-point elements
    _mm256_storeu_ps(ret + elem_idx, _mm256_mul_ps(vec1, vec2));
  }

  //Scalar tail: the last (vec_len % 8) elements that do not fill a vector
  for(; elem_idx < vec_len; elem_idx++){
    ret[elem_idx] = v1[elem_idx] * v2[elem_idx];
  }
}

// Benchmark driver for simd_mul.
//
// Usage: use_simd vec_len [print]
//   argv[1]: number of elements per vector; must be a positive integer
//   argv[2]: optional; when any second argument is present, both inputs and
//            the result are printed for every element after the timing line
//
// Returns 0 on success, EXIT_FAILURE on a missing/invalid argument or when
// the vectors cannot be allocated.
int main(int argc, char *argv[]){
  if(argc < 2){
    fprintf(stderr, "usage: %s vec_len [print]\n", argc > 0 ? argv[0] : "use_simd");
    return EXIT_FAILURE;
  }

  // strtoll (unlike atoll) lets us reject non-numeric input and trailing junk.
  char *parse_end = NULL;
  const long long vec_len = strtoll(argv[1], &parse_end, 10);
  if(parse_end == argv[1] || *parse_end != '\0' || vec_len <= 0){
    fprintf(stderr, "vec_len must be a positive integer: %s\n", argv[1]);
    return EXIT_FAILURE;
  }

  // Guard the sizeof(float) * vec_len byte count against size_t overflow.
  if((unsigned long long)vec_len > (size_t)-1 / sizeof(float)){
    fprintf(stderr, "vec_len is too large: %s\n", argv[1]);
    return EXIT_FAILURE;
  }

  //Allocate size bytes of memory, aligned to a 32-byte boundary
  const size_t vec_bytes = sizeof(float) * (size_t)vec_len;
  float *v1 = (float *)_mm_malloc(vec_bytes, 32);
  float *v2 = (float *)_mm_malloc(vec_bytes, 32);
  float *ret = (float *)_mm_malloc(vec_bytes, 32);
  if(v1 == NULL || v2 == NULL || ret == NULL){
    fprintf(stderr, "failed to allocate three vectors of %lld floats\n", vec_len);
    if(v1 != NULL) _mm_free(v1);
    if(v2 != NULL) _mm_free(v2);
    if(ret != NULL) _mm_free(ret);
    return EXIT_FAILURE;
  }

  // Fixed seed so every run multiplies the same data. The original
  // srand(0.5) was implicitly converted to this same value, 0.
  srand(0);
  for(long long elem_idx = 0; elem_idx < vec_len; elem_idx++){
    v1[elem_idx] = rand() / (float)RAND_MAX;
    v2[elem_idx] = rand() / (float)RAND_MAX;
    ret[elem_idx] = 0.0f;
  }

  // Only the multiplication itself is timed (CPU time, via clock()).
  const clock_t start = clock();
  simd_mul(vec_len, ret, v1, v2);
  const clock_t end = clock();
  printf("%.2f sec elapsed\n",(double)(end - start) / CLOCKS_PER_SEC);

  if(argc > 2){
    for(long long elem_idx = 0; elem_idx < vec_len; elem_idx++){
       printf("%f %f %f\n", v1[elem_idx], v2[elem_idx], ret[elem_idx]);
    }
  }

  // _mm_free should be used to free memory that is allocated with _mm_malloc.
  _mm_free(v1);
  _mm_free(v2);
  _mm_free(ret);

  return 0;
}

3. Build and Run

(1) Build no_simd.c and run it. In my setup, the third command below (3,200,000,000 elements) takes 6.5 sec. If we add the -O2 option to the build command (the first command), it takes 1.6 sec.

gcc ./no_simd.c -o ./no_simd -std=c99
./no_simd 32 print
./no_simd 3200000000

(2) Build use_simd.c and run it. In my setup, the third command below (3,200,000,000 elements) takes 1.6 sec. Adding the -O2 option to the build command (the first command) does not change this: it still takes 1.6 sec.

gcc ./use_simd.c -o ./use_simd -std=c99 -mavx 
./use_simd 32 print
./use_simd 3200000000

4. Additional Notes

If we change the multiplication to addition, we get results similar to the above.

5. References

[1] Intel Intrinsics Guide
https://software.intel.com/sites/landingpage/IntrinsicsGuide/

1. Setup

2. Source

3. Build and Run

4. Additional Notes

5. References

Published by ktke109