代码之家  ›  专栏  ›  技术社区  ›  John

两个8位阵列协方差的快速实现

  •  4
  • John  · 技术社区  · 9 年前

    我需要比较大量类似的小尺寸图像(高达200x200)。 所以我尝试实现SSIM(结构相似性参见 https://en.wikipedia.org/wiki/Structural_similarity )算法。 SSIM需要计算两个8位灰度图像的协方差。 一个简单的实现如下:

    float SigmaXY(const uint8_t * x, const uint8_t * y, size_t size, float averageX, float averageY)
    {
        float sum = 0;
        for(size_t i = 0; i < size; ++i)
            sum += (x[i] - averageX) * (y[i] - averageY);
        return sum / size;
    }
    

    但它的性能很差。 所以我希望通过使用SIMD或CUDA来改进它(我听说可以做到)。 不幸的是,我没有这样做的经验。 它看起来怎么样?我该去哪里?

    2 回复  |  直到 9 年前
        1
  •  4
  •   ErmIg    9 年前

    我有另一个很好的解决方案!

    首先,我想提到一些数学公式:

    averageX = Sum(x[i])/size;
    averageY = Sum(y[i])/size;
    

    因此:

    Sum((x[i] - averageX)*(y[i] - averageY))/size = 
    
    Sum(x[i]*y[i])/size - Sum(x[i]*averageY)/size - 
    Sum(averageX*y[i])/size + Sum(averageX*averageY)/size = 
    
    Sum(x[i]*y[i])/size - averageY*Sum(x[i])/size - 
    averageX*Sum(y[i])/size + averageX*averageY*Sum(1)/size =   
    
    Sum(x[i]*y[i])/size - averageY*averageX - 
    averageX*averageY + averageX*averageY = 
    
    Sum(x[i]*y[i])/size - averageY*averageX;     
    

    它允许修改我们的算法:

    float SigmaXY(const uint8_t * x, const uint8_t * y, size_t size, float averageX, float averageY)
    {
        uint32_t sum = 0; // If images will have size greater then 256x256 than you have to use uint64_t.
        for(size_t i = 0; i < size; ++i)
            sum += x[i]*y[i];
        return sum / size - averageY*averageX;
    } 
    

    只有在那之后,我们才能使用SIMD(我使用的是SSE2):

    #include <emmintrin.h>
    
    inline __m128i SigmaXY(__m128i x, __m128i y)
    {
        __m128i lo = _mm_madd_epi16(_mm_unpacklo_epi8(x, _mm_setzero_si128()), _mm_unpacklo_epi8(y, _mm_setzero_si128()));
        __m128i hi = _mm_madd_epi16(_mm_unpackhi_epi8(y, _mm_setzero_si128()), _mm_unpackhi_epi8(y, _mm_setzero_si128()));
        return _mm_add_epi32(lo, hi);
    }
    
    float SigmaXY(const uint8_t * x, const uint8_t * y, size_t size, float averageX, float averageY)
    {
        uint32_t sum = 0;
        size_t i = 0, alignedSize = size/16*16;
        if(size >= 16)
        {
            __m128i sums = _mm_setzero_si128();
            for(; i < alignedSize; i += 16)
            {
                __m128i _x = _mm_loadu_si128((__m128i*)(x + i));
                __m128i _y = _mm_loadu_si128((__m128i*)(y + i));
                sums = _mm_add_epi32(sums, SigmaXY(_x, _y));
            }
            uint32_t _sums[4];
            _mm_storeu_si128(_sums, sums);
            sum = _sums[0] + _sums[1] + _sums[2] + _sums[3];
        } 
        for(; i < size; ++i)
            sum += x[i]*y[i];
        return sum / size - averageY*averageX;
    }
    
        2
  •  3
  •   ErmIg    9 年前

    该算法有一个SIMD实现(我使用SSE4.1):

    #include <smmintrin.h>
    
    template <int shift> inline __m128 SigmaXY(const __m128i & x, const __m128i & y, __m128 & averageX, __m128 & averageY)
    {
        __m128 _x = _mm_cvtepi32_ps(_mm_cvtepu8_epi32(_mm_srli_si128(x, shift)));
        __m128 _y = _mm_cvtepi32_ps(_mm_cvtepu8_epi32(_mm_srli_si128(y, shift)));
        return _mm_mul_ps(_mm_sub_ps(_x, averageX), _mm_sub_ps(_y, averageY))
    }
    
    float SigmaXY(const uint8_t * x, const uint8_t * y, size_t size, float averageX, float averageY)
    {
        float sum = 0;
        size_t i = 0, alignedSize = size/16*16;
        if(size >= 16)
        {
            __m128 sums = _mm_setzero_ps();
            __m128 avgX = _mm_set1_ps(averageX);
            __m128 avgY = _mm_set1_ps(averageY);
            for(; i < alignedSize; i += 16)
            {
                __m128i _x = _mm_loadu_si128((__m128i*)(x + i));
                __m128i _y = _mm_loadu_si128((__m128i*)(y + i));
                sums = _mm_add_ps(sums, SigmaXY<0>(_x, _y, avgX, avgY);
                sums = _mm_add_ps(sums, SigmaXY<4>(_x, _y, avgX, avgY);
                sums = _mm_add_ps(sums, SigmaXY<8>(_x, _y, avgX, avgY);
                sums = _mm_add_ps(sums, SigmaXY<12>(_x, _y, avgX, avgY);
            }
            float _sums[4];
            _mm_storeu_ps(_sums, sums);
            sum = _sums[0] + _sums[1] + _sums[2] + _sums[3];
        }
        for(; i < size; ++i)
            sum += (x[i] - averageX) * (y[i] - averageY);
        return sum / size;
    }
    

    我希望它对你有用。