所以我有一个矩阵乘法的SSE3实现:
inline void multiply(const float *__restrict affector, const float *__restrict affected, float *__restrict result)
{
__m128 a0 = _mm_load_ps(&affector[0]);
__m128 a1 = _mm_load_ps(&affector[4]);
__m128 a2 = _mm_load_ps(&affector[8]);
__m128 a3 = _mm_load_ps(&affector[12]);
__m128 b0 = _mm_load_ps(&affected[0]);
__m128 b1 = _mm_load_ps(&affected[4]);
__m128 b2 = _mm_load_ps(&affected[8]);
__m128 b3 = _mm_load_ps(&affected[12]);
__m128 tmp0 = _mm_unpacklo_ps(b0, b1);
__m128 tmp1 = _mm_unpackhi_ps(b0, b1);
__m128 tmp2 = _mm_unpacklo_ps(b2, b3);
__m128 tmp3 = _mm_unpackhi_ps(b2, b3);
b0 = _mm_movelh_ps(tmp0, tmp2);
b1 = _mm_movehl_ps(tmp2, tmp0);
b2 = _mm_movelh_ps(tmp1, tmp3);
b3 = _mm_movehl_ps(tmp3, tmp1);
__m128 mul = _mm_mul_ps(a0, b0);
mul = _mm_hadd_ps(mul, mul);
mul = _mm_hadd_ps(mul, mul);
result[0] = _mm_cvtss_f32(mul);
mul = _mm_mul_ps(a0, b1);
mul = _mm_hadd_ps(mul, mul);
mul = _mm_hadd_ps(mul, mul);
result[1] = _mm_cvtss_f32(mul);
mul = _mm_mul_ps(a0, b2);
mul = _mm_hadd_ps(mul, mul);
mul = _mm_hadd_ps(mul, mul);
result[2] = _mm_cvtss_f32(mul);
mul = _mm_mul_ps(a0, b3);
mul = _mm_hadd_ps(mul, mul);
mul = _mm_hadd_ps(mul, mul);
result[3] = _mm_cvtss_f32(mul);
mul = _mm_mul_ps(a1, b0);
mul = _mm_hadd_ps(mul, mul);
mul = _mm_hadd_ps(mul, mul);
result[4] = _mm_cvtss_f32(mul);
mul = _mm_mul_ps(a1, b1);
mul = _mm_hadd_ps(mul, mul);
mul = _mm_hadd_ps(mul, mul);
result[5] = _mm_cvtss_f32(mul);
mul = _mm_mul_ps(a1, b2);
mul = _mm_hadd_ps(mul, mul);
mul = _mm_hadd_ps(mul, mul);
result[6] = _mm_cvtss_f32(mul);
mul = _mm_mul_ps(a1, b3);
mul = _mm_hadd_ps(mul, mul);
mul = _mm_hadd_ps(mul, mul);
result[7] = _mm_cvtss_f32(mul);
mul = _mm_mul_ps(a2, b0);
mul = _mm_hadd_ps(mul, mul);
mul = _mm_hadd_ps(mul, mul);
result[8] = _mm_cvtss_f32(mul);
mul = _mm_mul_ps(a2, b1);
mul = _mm_hadd_ps(mul, mul);
mul = _mm_hadd_ps(mul, mul);
result[9] = _mm_cvtss_f32(mul);
mul = _mm_mul_ps(a2, b2);
mul = _mm_hadd_ps(mul, mul);
mul = _mm_hadd_ps(mul, mul);
result[10] = _mm_cvtss_f32(mul);
mul = _mm_mul_ps(a2, b3);
mul = _mm_hadd_ps(mul, mul);
mul = _mm_hadd_ps(mul, mul);
result[11] = _mm_cvtss_f32(mul);
mul = _mm_mul_ps(a3, b0);
mul = _mm_hadd_ps(mul, mul);
mul = _mm_hadd_ps(mul, mul);
result[12] = _mm_cvtss_f32(mul);
mul = _mm_mul_ps(a3, b1);
mul = _mm_hadd_ps(mul, mul);
mul = _mm_hadd_ps(mul, mul);
result[13] = _mm_cvtss_f32(mul);
mul = _mm_mul_ps(a3, b2);
mul = _mm_hadd_ps(mul, mul);
mul = _mm_hadd_ps(mul, mul);
result[14] = _mm_cvtss_f32(mul);
mul = _mm_mul_ps(a3, b3);
mul = _mm_hadd_ps(mul, mul);
mul = _mm_hadd_ps(mul, mul);
result[15] = _mm_cvtss_f32(mul);
}
运行此功能1.000.000次,速度约为0.04秒
现在,我在想使用点积会加快速度,因为我不必:
1. Multiply
2. Do horizontal addition
3. Do another horizontal addition
而是仅仅:
1. Single Dot product
以下是SSE4.1的实现:
* Loop is unwraped for performance
* @attention As opposed to non-SIMD multiplication we're using column-major
*/
inline void multiply(const float *__restrict affector, const float *__restrict affected, float *__restrict result)
{
// std::cout << "INSIDE SSE4" << std::endl;
// Load rows of matrix B and transpose to columns
__m128 a0 = _mm_load_ps(&affector[0]);
__m128 a1 = _mm_load_ps(&affector[4]);
__m128 a2 = _mm_load_ps(&affector[8]);
__m128 a3 = _mm_load_ps(&affector[12]);
// Load rows of matrix A
__m128 b0 = _mm_load_ps(&affected[0]);
__m128 b1 = _mm_load_ps(&affected[4]);
__m128 b2 = _mm_load_ps(&affected[8]);
__m128 b3 = _mm_load_ps(&affected[12]);
// b0 = [1, 2, 3, 4]
// b1 = [5, 6, 7, 8]
// b2 = [9, 10, 11, 12]
// b3 = [13, 14, 15, 16]
// need to arrive at-> b0 = [1, 5, 9, 10]
// need to arrive at-> b1 = [2, 6, 10, 14]
// need to arrive at-> b2 = [3, 7, 11, 15]
// need to arrive at-> b3 = [4, 8, 12, 16]
// tmp0 = [1, 5, 2, 6]
__m128 tmp0 = _mm_unpacklo_ps(b0, b1);
// tmp1 = [3, 7, 4, 8]
__m128 tmp1 = _mm_unpackhi_ps(b0, b1);
// tmp2 = [9, 13, 10, 14]
__m128 tmp2 = _mm_unpacklo_ps(b2, b3);
// tmp3 = [11, 15, 12, 16]
__m128 tmp3 = _mm_unpackhi_ps(b2, b3);
// b0 = [1, 5, ....] = move tmp2 low into tmp0 high
b0 = _mm_movelh_ps(tmp0, tmp2);
// b1 = [...., 10, 14] = move tmp0 high into tmp tmp2 low
b1 = _mm_movehl_ps(tmp2, tmp0);
// b2 = [3, 7, ....] = move tmp3 lows into tmp1 highs
b2 = _mm_movelh_ps(tmp1, tmp3);
// b3 = [...., 12, 16] = move tmp1 highs into tmp3 lows
b3 = _mm_movehl_ps(tmp3, tmp1);
__m128 mul;
// Perform the matrix multiplication for each element
mul = _mm_dp_ps(a0, b0, 0xF1); // Dot product of a0 and b0, 0xF1 means all four elements
result[0] = _mm_cvtss_f32(mul); // Store result
mul = _mm_dp_ps(a0, b1, 0xF1); // Dot product of a0 and b1
result[1] = _mm_cvtss_f32(mul);
mul = _mm_dp_ps(a0, b2, 0xF1); // Dot product of a0 and b2
result[2] = _mm_cvtss_f32(mul);
mul = _mm_dp_ps(a0, b3, 0xF1); // Dot product of a0 and b3
result[3] = _mm_cvtss_f32(mul);
mul = _mm_dp_ps(a1, b0, 0xF1); // Dot product of a1 and b0
result[4] = _mm_cvtss_f32(mul);
mul = _mm_dp_ps(a1, b1, 0xF1); // Dot product of a1 and b1
result[5] = _mm_cvtss_f32(mul);
mul = _mm_dp_ps(a1, b2, 0xF1); // Dot product of a1 and b2
result[6] = _mm_cvtss_f32(mul);
mul = _mm_dp_ps(a1, b3, 0xF1); // Dot product of a1 and b3
result[7] = _mm_cvtss_f32(mul);
mul = _mm_dp_ps(a2, b0, 0xF1); // Dot product of a2 and b0
result[8] = _mm_cvtss_f32(mul);
mul = _mm_dp_ps(a2, b1, 0xF1); // Dot product of a2 and b1
result[9] = _mm_cvtss_f32(mul);
mul = _mm_dp_ps(a2, b2, 0xF1); // Dot product of a2 and b2
result[10] = _mm_cvtss_f32(mul);
mul = _mm_dp_ps(a2, b3, 0xF1); // Dot product of a2 and b3
result[11] = _mm_cvtss_f32(mul);
mul = _mm_dp_ps(a3, b0, 0xF1); // Dot product of a3 and b0
result[12] = _mm_cvtss_f32(mul);
mul = _mm_dp_ps(a3, b1, 0xF1); // Dot product of a3 and b1
result[13] = _mm_cvtss_f32(mul);
mul = _mm_dp_ps(a3, b2, 0xF1); // Dot product of a3 and b2
result[14] = _mm_cvtss_f32(mul);
mul = _mm_dp_ps(a3, b3, 0xF1); // Dot product of a3 and b3
result[15] = _mm_cvtss_f32(mul);
}
结果是:~0.15秒!!!这甚至比我的不使用内部函数的实现(~0.11-0.12秒)和使用SSE2的实现(-0.10-0.9秒)还要慢。发生什么事??是因为点积在较低级别的实现方式,还是我做错了什么?
所有3个(非intrisct、SSE2、SSE3、SSE4.1)都使用-O2进行了优化。