Don-Reba

Vectorized FGT

Jul 27th, 2016
1,013
0
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
  1. __m256 FgtVolume::operator [] (const Vector3f256 & p) const
  2. {
  3.     __m256 side = _mm256_set1_ps(Side);
  4.  
  5.     // get domain origin
  6.     __m256 minX = _mm256_set1_ps(DomainMin.x());
  7.     __m256 minY = _mm256_set1_ps(DomainMin.y());
  8.     __m256 minZ = _mm256_set1_ps(DomainMin.z());
  9.  
  10.     // get cell coordinates
  11.     __m256 kx = _mm256_floor_ps(_mm256_div_ps(_mm256_sub_ps(p.x, minX), side));
  12.     __m256 ky = _mm256_floor_ps(_mm256_div_ps(_mm256_sub_ps(p.y, minY), side));
  13.     __m256 kz = _mm256_floor_ps(_mm256_div_ps(_mm256_sub_ps(p.z, minZ), side));
  14.  
  15.     // get coefficient addresses
  16.     __m256i indices = _mm256_cvtps_epi32
  17.         ( _mm256_mul_ps
  18.             ( _mm256_fmadd_ps
  19.                     ( _mm256_fmadd_ps(kz, _mm256_set1_ps((float)Count.y()), ky)
  20.                     , _mm256_set1_ps((float)Count.x())
  21.                     , kx
  22.                     )
  23.             , _mm256_set1_ps((float)PD)
  24.             )
  25.         );
  26.  
  27.     // get cell center's position
  28.     __m256 cx = _mm256_fmadd_ps(side, _mm256_add_ps(kx, _mm256_set1_ps(0.5f)), minX);
  29.     __m256 cy = _mm256_fmadd_ps(side, _mm256_add_ps(ky, _mm256_set1_ps(0.5f)), minY);
  30.     __m256 cz = _mm256_fmadd_ps(side, _mm256_add_ps(kz, _mm256_set1_ps(0.5f)), minZ);
  31.  
  32.     // compute distance from cell center in sigmas
  33.     __m256 delta[3] =
  34.         { _mm256_mul_ps(_mm256_sub_ps(p.x, cx), _mm256_set1_ps(1.0f / Sigma))
  35.         , _mm256_mul_ps(_mm256_sub_ps(p.y, cy), _mm256_set1_ps(1.0f / Sigma))
  36.         , _mm256_mul_ps(_mm256_sub_ps(p.z, cz), _mm256_set1_ps(1.0f / Sigma))
  37.         };
  38.  
  39.     __m256 dxx = _mm256_mul_ps(delta[0], delta[0]);
  40.     __m256 dyy = _mm256_mul_ps(delta[1], delta[1]);
  41.     __m256 dzz = _mm256_mul_ps(delta[2], delta[2]);
  42.  
  43.     __m256 dnorm = _mm256_add_ps(_mm256_add_ps(dxx, dyy), dzz);
  44.  
  45.     // gather the polynomial
  46.     assert(PD == 56);
  47.     __m256 polynomial[56];
  48.     polynomial[0] = expf8(_mm256_sub_ps(_mm256_setzero_ps(), dnorm));
  49.  
  50.     Vector3i heads(0, 0, 0);
  51.     int t = 1;
  52.     for (int a = 1; a != Alpha; ++a)
  53.     {
  54.         int tail = t;
  55.         for (int i = 0; i != 3; ++i)
  56.         {
  57.             int head = heads[i];
  58.             heads[i] = t;
  59.             for (int j = head; j != tail; ++j, ++t)
  60.                 polynomial[t] = _mm256_mul_ps(delta[i], polynomial[j]);
  61.         }
  62.     }
  63.  
  64.     __declspec(align(32)) int indicesi32[8];
  65.     _mm256_store_si256((__m256i*)indicesi32, indices);
  66.     bool allEqual = true;
  67.     for (size_t i = 1; i != 8; ++i)
  68.         allEqual = allEqual && (indicesi32[0] == indicesi32[i]);
  69.  
  70.     // normalizationFactor * coefficients ยท polynomial
  71.     __m256 sum = _mm256_setzero_ps();
  72.     if (allEqual)
  73.     {
  74.         // fast path (all indices are the same)
  75.         const float * cp = &Coefficients[indicesi32[0]];
  76.         for (size_t i = 0; i != 56; ++i)
  77.         {
  78.             __m256 coefficients = _mm256_set1_ps(cp[i]);
  79.             sum = _mm256_fmadd_ps(polynomial[i], coefficients, sum);
  80.         }
  81.     }
  82.     else
  83.     {
  84.         // slow path
  85.         for (size_t i = 0; i != 56; ++i)
  86.         {
  87.             __m256 coefficients = _mm256_i32gather_ps(&Coefficients[i], indices, sizeof(float));
  88.             sum = _mm256_fmadd_ps(polynomial[i], coefficients, sum);
  89.         }
  90.     }
  91.     return _mm256_mul_ps(sum, _mm256_set1_ps(normalizationFactor));
  92. }
Advertisement