目前我有一段代码，它是基于这个适用于双精度（double-precision）的回答改写而来的：
/**
 * Horizontal maximum of the eight single-precision lanes of an AVX vector.
 *
 * @param a  vector whose eight float lanes are reduced
 * @return   the largest lane of a (for NaN-free input)
 *
 * Only lane 0 of the reduction is ever returned, so the vector is narrowed
 * to 128 bits in the first step and the remaining work is done on __m128.
 * The operand order of every max is deliberately kept as
 * max(shuffled, unshuffled) / max(unshuffled, shuffled) / max(shuffled,
 * unshuffled): the hardware max instruction is not symmetric for NaN or
 * +0/-0 operands, and keeping the order keeps the returned lane identical
 * to a straightforward 256-bit reduction written in that order.
 */
static inline float fast_hMax_ps(__m256 a){
    /* Fold the upper 128-bit half onto the lower one: 8 candidates -> 4. */
    const __m128 lo = _mm256_castps256_ps128(a);
    const __m128 hi = _mm256_extractf128_ps(a, 1);
    const __m128 m0 = _mm_max_ps(hi, lo);

    /* Swap the two 64-bit pairs (lanes 2,3,0,1): 4 candidates -> 2. */
    const __m128 swappedPairs = _mm_shuffle_ps(m0, m0, _MM_SHUFFLE(1, 0, 3, 2));
    const __m128 m1 = _mm_max_ps(m0, swappedPairs);

    /* Swap neighbouring lanes (lanes 1,0,3,2): 2 candidates -> 1. */
    const __m128 swappedNeighbours = _mm_shuffle_ps(m1, m1, _MM_SHUFFLE(2, 3, 0, 1));
    const __m128 m2 = _mm_max_ps(swappedNeighbours, m1);

    /* Extract lane 0 with the dedicated intrinsic rather than a pointer cast. */
    return _mm_cvtss_f32(m2);
}