代码之家  ›  专栏  ›  技术社区  ›  ronag

我的编译器在做什么?(优化memcpy)

  •  9
  • ronag  · 技术社区  · 14 年前

    我正在用 VC++ 2010 以如下设置编译一些代码:/O2 /Ob2 /Oi /Ot

    然而,我在理解生成的程序集的某些部分时遇到了一些困难,我在代码中提出了一些问题作为注释。

    另外,在现代 CPU 上,通常推荐使用多大的预取距离?我当然可以在自己的 CPU 上测试,但我希望找到一些在更多种类的 CPU 上都能表现良好的值。也许可以使用动态预取距离?

    <--编辑:

    另一件令我惊讶的事情是,编译器没有以某种形式交错安排 movdqa 和 movntdq 指令。据我的理解,这些指令在某种意义上是异步的。

    此代码在预取时也假定有32个字节的缓存线,但是高端CPU似乎有64个字节的缓存线,因此可能会删除2个预取。

    -->

    void memcpy_aligned_x86(void* dest, const void* source, size_t size)
    { 
    0052AC20  push        ebp  
    0052AC21  mov         ebp,esp  
     const __m128i* source_128 = reinterpret_cast<const __m128i*>(source);
    
     for(size_t n = 0; n < size/16; n += 8) 
    0052AC23  mov         edx,dword ptr [size]  
    0052AC26  mov         ecx,dword ptr [dest]  
    0052AC29  mov         eax,dword ptr [source]  
    0052AC2C  shr         edx,4  
    0052AC2F  test        edx,edx  
    0052AC31  je          copy+9Eh (52ACBEh)  
     __m128i xmm0 = _mm_setzero_si128();
     __m128i xmm1 = _mm_setzero_si128();
     __m128i xmm2 = _mm_setzero_si128();
     __m128i xmm3 = _mm_setzero_si128();
     __m128i xmm4 = _mm_setzero_si128();
     __m128i xmm5 = _mm_setzero_si128();
     __m128i xmm6 = _mm_setzero_si128();
     __m128i xmm7 = _mm_setzero_si128();
    
     __m128i* dest_128 = reinterpret_cast<__m128i*>(dest);
    0052AC37  push        esi  
    0052AC38  push        edi  
    0052AC39  lea         edi,[edx-1]  
    0052AC3C  shr         edi,3  
    0052AC3F  inc         edi  
     {
      _mm_prefetch(reinterpret_cast<const char*>(source_128+8), _MM_HINT_NTA);
      _mm_prefetch(reinterpret_cast<const char*>(source_128+10), _MM_HINT_NTA);
      _mm_prefetch(reinterpret_cast<const char*>(source_128+12), _MM_HINT_NTA);
      _mm_prefetch(reinterpret_cast<const char*>(source_128+14), _MM_HINT_NTA);
    
      xmm0 = _mm_load_si128(source_128++);
      xmm1 = _mm_load_si128(source_128++);
      xmm2 = _mm_load_si128(source_128++);
      xmm3 = _mm_load_si128(source_128++);
      xmm4 = _mm_load_si128(source_128++);
      xmm5 = _mm_load_si128(source_128++);
      xmm6 = _mm_load_si128(source_128++);
      xmm7 = _mm_load_si128(source_128++);
    0052AC40  movdqa      xmm6,xmmword ptr [eax+70h]  // 1. Why is this moved before the prefetches?
    0052AC45  prefetchnta [eax+80h]  
    0052AC4C  prefetchnta [eax+0A0h]  
    0052AC53  prefetchnta [eax+0C0h]  
    0052AC5A  prefetchnta [eax+0E0h]  
    0052AC61  movdqa      xmm0,xmmword ptr [eax+10h]  
    0052AC66  movdqa      xmm1,xmmword ptr [eax+20h]  
    0052AC6B  movdqa      xmm2,xmmword ptr [eax+30h]  
    0052AC70  movdqa      xmm3,xmmword ptr [eax+40h]  
    0052AC75  movdqa      xmm4,xmmword ptr [eax+50h]  
    0052AC7A  movdqa      xmm5,xmmword ptr [eax+60h]  
    0052AC7F  lea         esi,[eax+70h]  // 2. What is happening in these 2 lines?
    0052AC82  mov         edx,eax        //
    0052AC84  movdqa      xmm7,xmmword ptr [edx]  // 3. Why edx? and not simply eax?
    
      _mm_stream_si128(dest_128++, xmm0);
    0052AC88  mov         esi,ecx  // 4. Is esi never used?
    0052AC8A  movntdq     xmmword ptr [esi],xmm7  
      _mm_stream_si128(dest_128++, xmm1);
    0052AC8E  movntdq     xmmword ptr [ecx+10h],xmm0  
      _mm_stream_si128(dest_128++, xmm2);
    0052AC93  movntdq     xmmword ptr [ecx+20h],xmm1  
      _mm_stream_si128(dest_128++, xmm3);
    0052AC98  movntdq     xmmword ptr [ecx+30h],xmm2  
      _mm_stream_si128(dest_128++, xmm4);
    0052AC9D  movntdq     xmmword ptr [ecx+40h],xmm3  
      _mm_stream_si128(dest_128++, xmm5);
    0052ACA2  movntdq     xmmword ptr [ecx+50h],xmm4  
      _mm_stream_si128(dest_128++, xmm6);
    0052ACA7  movntdq     xmmword ptr [ecx+60h],xmm5  
      _mm_stream_si128(dest_128++, xmm7);
    0052ACAC  lea         edx,[ecx+70h]  
    0052ACAF  sub         eax,0FFFFFF80h  
    0052ACB2  sub         ecx,0FFFFFF80h  
    0052ACB5  dec         edi  
    0052ACB6  movntdq     xmmword ptr [edx],xmm6  // 5. Why not simply ecx?
    0052ACBA  jne         copy+20h (52AC40h)  
    0052ACBC  pop         edi  
    0052ACBD  pop         esi  
     }
    }
    

    原始代码:

    // Copies `size` bytes from `source` to `dest` in 128-byte blocks using
    // aligned SSE2 loads and non-temporal (streaming) stores.
    //
    // Preconditions (checked with assert):
    //  - dest and source are non-null and distinct,
    //  - dest and source are 16-byte aligned, as required by
    //    _mm_load_si128 / _mm_stream_si128,
    //  - size is a multiple of 128.
    //
    // A size of 0 performs no copy.
    void memcpy_aligned_x86(void* dest, const void* source, size_t size)
    { 
     assert(dest != nullptr);
     assert(source != nullptr);
     assert(source != dest);
     assert(size % 128 == 0);
     assert(reinterpret_cast<size_t>(dest) % 16 == 0);
     assert(reinterpret_cast<size_t>(source) % 16 == 0);
    
     __m128i* dest_128 = reinterpret_cast<__m128i*>(dest);
     const __m128i* source_128 = reinterpret_cast<const __m128i*>(source);
    
     // Each iteration moves eight 16-byte vectors (128 bytes).
     for(size_t n = 0; n < size/16; n += 8) 
     {
      // Hint the next 128-byte block in 32-byte steps. Prefetch is only a hint,
      // so touching addresses past the end of the source on the final
      // iteration does not fault.
      _mm_prefetch(reinterpret_cast<const char*>(source_128+8), _MM_HINT_NTA);
      _mm_prefetch(reinterpret_cast<const char*>(source_128+10), _MM_HINT_NTA);
      _mm_prefetch(reinterpret_cast<const char*>(source_128+12), _MM_HINT_NTA);
      _mm_prefetch(reinterpret_cast<const char*>(source_128+14), _MM_HINT_NTA);
    
      // Issue all loads before the stores so they can overlap.
      const __m128i xmm0 = _mm_load_si128(source_128++);
      const __m128i xmm1 = _mm_load_si128(source_128++);
      const __m128i xmm2 = _mm_load_si128(source_128++);
      const __m128i xmm3 = _mm_load_si128(source_128++);
      const __m128i xmm4 = _mm_load_si128(source_128++);
      const __m128i xmm5 = _mm_load_si128(source_128++);
      const __m128i xmm6 = _mm_load_si128(source_128++);
      const __m128i xmm7 = _mm_load_si128(source_128++);
    
      _mm_stream_si128(dest_128++, xmm0);
      _mm_stream_si128(dest_128++, xmm1);
      _mm_stream_si128(dest_128++, xmm2);
      _mm_stream_si128(dest_128++, xmm3);
      _mm_stream_si128(dest_128++, xmm4);
      _mm_stream_si128(dest_128++, xmm5);
      _mm_stream_si128(dest_128++, xmm6);
      _mm_stream_si128(dest_128++, xmm7);
     }
    
     // Streaming stores are weakly ordered; fence so the copied data is
     // ordered before any store the caller issues afterwards (e.g. a flag
     // that publishes the buffer to another thread).
     _mm_sfence();
    }
    
    3 回复  |  直到 12 年前
        1
  •  3
  •   Eugene Smith    14 年前

        2
  •  2
  •   ronag    14 年前

    void* memcpy(void* dest, const void* source, size_t num)
    {   
        __asm
        {
            mov esi, source;    
            mov edi, dest;   
    
            mov ebx, num; 
            shr ebx, 7;      
    
            cpy:
                prefetchnta [esi+80h];
                prefetchnta [esi+0C0h];
    
                movdqa xmm0, [esi+00h];
                movdqa xmm1, [esi+10h];
                movdqa xmm2, [esi+20h];
                movdqa xmm3, [esi+30h];
    
                movntdq [edi+00h], xmm0;
                movntdq [edi+10h], xmm1;
                movntdq [edi+20h], xmm2;
                movntdq [edi+30h], xmm3;
    
                movdqa xmm4, [esi+40h];
                movdqa xmm5, [esi+50h];
                movdqa xmm6, [esi+60h];
                movdqa xmm7, [esi+70h];
    
                movntdq [edi+40h], xmm4;
                movntdq [edi+50h], xmm5;
                movntdq [edi+60h], xmm6;
                movntdq [edi+70h], xmm7;
    
                lea edi, [edi+80h];
                lea esi, [esi+80h];
                dec ebx;
    
            jnz cpy;
        }
        return dest;
    }
    
    void* memcpy_tbb(void* dest, const void* source, size_t num)
    {   
        tbb::parallel_for(tbb::blocked_range<size_t>(0, num/128), [&](const tbb::blocked_range<size_t>& r)
        {
            memcpy_SSE2_3(reinterpret_cast<char*>(dest) + r.begin()*128, reinterpret_cast<const char*>(source) + r.begin()*128, r.size()*128);
        }, tbb::affinity_partitioner());
    
        return dest;
    }
    
        3
  •  1
  •   Quonux    12 年前
    0052AC82  mov         edx,eax        //
    0052AC84  movdqa      xmm7,xmmword ptr [edx]  // 3. Why edx? and not simply eax? <--
    

    0052ACAF  sub         eax,0FFFFFF80h  
    

    推荐文章