代码之家  ›  专栏  ›  技术社区  ›  luispedro

使用整数和无符号整数与双精度整数混合时的速度差

  •  19
  • luispedro  · 技术社区  · 15 年前

    我有一个应用程序,其中内部循环的一部分基本上是:

    double sum = 0;
    for (int i = 0; i != N; ++i, ++data, ++x) sum += *data * x;
    

    这是一个更大的代码库的一部分,但我把它归结为要点:

    #include <iostream>                                      
    #include <cstdlib>                                       
    #include <vector>
    #include <time.h>
    
    typedef unsigned char uint8;
    
    template<typename T>
    double moments(const uint8* data, int N, T wrap) {
        T pos = 0;
        double sum = 0.;
        for (int i = 0; i != N; ++i, ++data) {
            sum += *data * pos;
            ++pos;
            if (pos == wrap) pos = 0;
        }
        return sum;
    }
    
    template<typename T>
    const char* name() { return "unknown"; }
    
    template<>
    const char* name<int>() { return "int"; }
    
    template<>
    const char* name<unsigned int>() { return "unsigned int"; }
    
    const int Nr_Samples = 10 * 1000;
    
    template<typename T>
    void measure(const std::vector<uint8>& data) {
        const uint8* dataptr = &data[0];
        double moments_results[Nr_Samples];
        time_t start, end;
        time(&start);
        for (int i = 0; i != Nr_Samples; ++i) {
            moments_results[i] = moments<T>(dataptr, data.size(), 128);
        }
        time(&end);
        double avg = 0.0;
        for (int i = 0; i != Nr_Samples; ++i) avg += moments_results[i];
        avg /= Nr_Samples;
        std::cout << "With " << name<T>() << ": " << avg << " in " << (end - start) << "secs" << std::endl;
    }
    
    
    int main() {
        std::vector<uint8> data(128*1024);
        for (int i = 0; i != data.size(); ++i) data[i] = std::rand();
        measure<int>(data);
        measure<unsigned int>(data);
        measure<int>(data);
        return 0;
    }
    

    无优化编译:

    luispedro@oakeshott:/home/luispedro/tmp/so §g++  test.cpp    
    luispedro@oakeshott:/home/luispedro/tmp/so §./a.out
    With int: 1.06353e+09 in 9secs
    With unsigned int: 1.06353e+09 in 14secs
    With int: 1.06353e+09 in 9secs
    

    通过优化:

    luispedro@oakeshott:/home/luispedro/tmp/so §g++  -O3  test.cpp
    luispedro@oakeshott:/home/luispedro/tmp/so §./a.out
    With int: 1.06353e+09 in 3secs
    With unsigned int: 1.06353e+09 in 12secs
    With int: 1.06353e+09 in 4secs
    

    这与硬件有关,还是gcc优化机制的局限性?我赌第二个。

    我的机器是一台运行Ubuntu 9.10的英特尔32位计算机。

    :由于Stephen询问,以下是反编译源代码(来自a-O3编译)。我相信我得到了主要的循环:

    int版本:

    40: 0f b6 14 0b             movzbl (%ebx,%ecx,1),%edx
         sum += *data * pos;
    44: 0f b6 d2                movzbl %dl,%edx
    47: 0f af d0                imul   %eax,%edx
          ++pos;
    4a: 83 c0 01                add    $0x1,%eax
          sum += *data * pos;
    4d: 89 95 54 c7 fe ff       mov    %edx,-0x138ac(%ebp)
          ++pos;
          if (pos == wrap) pos = 0;
    53: 31 d2                   xor    %edx,%edx
    55: 3d 80 00 00 00          cmp    $0x80,%eax
    5a: 0f 94 c2                sete   %dl
      T pos = 0;
      double sum = 0.;
      for (int i = 0; i != N; ++i, ++data) {
    5d: 83 c1 01                add    $0x1,%ecx
          sum += *data * pos;
    60: db 85 54 c7 fe ff       fildl  -0x138ac(%ebp)
          ++pos;
          if (pos == wrap) pos = 0;
    66: 83 ea 01                sub    $0x1,%edx
    69: 21 d0                   and    %edx,%eax
      T pos = 0;
      double sum = 0.;
      for (int i = 0; i != N; ++i, ++data) {
    6b: 39 f1                   cmp    %esi,%ecx
          sum += *data * pos;
    6d: de c1                   faddp  %st,%st(1)
      T pos = 0;
      double sum = 0.;
      for (int i = 0; i != N; ++i, ++data) {
    6f: 75 cf                   jne    40
    

    未签名版本:

    50: 0f b6 34 13             movzbl (%ebx,%edx,1),%esi
          sum += *data * pos;
    54: 81 e6 ff 00 00 00       and    $0xff,%esi
    5a: 31 ff                   xor    %edi,%edi
    5c: 0f af f0                imul   %eax,%esi
          ++pos;
    5f: 83 c0 01                add    $0x1,%eax
          if (pos == wrap) pos = 0;
    62: 3d 80 00 00 00          cmp    $0x80,%eax
    67: 0f 94 c1                sete   %cl
      T pos = 0;
      double sum = 0.;
      for (int i = 0; i != N; ++i, ++data) {
    6a: 83 c2 01                add    $0x1,%edx
          sum += *data * pos;
    6d: 89 bd 54 c7 fe ff       mov    %edi,-0x138ac(%ebp)
    73: 89 b5 50 c7 fe ff       mov    %esi,-0x138b0(%ebp)
          ++pos;
          if (pos == wrap) pos = 0;
    79: 89 ce                   mov    %ecx,%esi
    7b: 81 e6 ff 00 00 00       and    $0xff,%esi
          sum += *data * pos;
    81: df ad 50 c7 fe ff       fildll -0x138b0(%ebp)
          ++pos;
          if (pos == wrap) pos = 0;
    87: 83 ee 01                sub    $0x1,%esi
    8a: 21 f0                   and    %esi,%eax
      for (int i = 0; i != N; ++i, ++data) {
    8c: 3b 95 34 c7 fe ff       cmp    -0x138cc(%ebp),%edx
          sum += *data * pos;
    92: de c1                   faddp  %st,%st(1)
      for (int i = 0; i != N; ++i, ++data) {
    94: 75 ba                   jne    50
    

    这是-O3版本,这就是源代码行上下跳跃的原因。 非常感谢。

    4 回复  |  直到 14 年前
        1
  •  34
  •   Stephen Canon    15 年前

    原因如下:许多常见的体系结构(包括x86)都有一条硬件指令将有符号整数转换为双精度整数,但没有从无符号到双精度的硬件转换,因此编译器需要在软件中综合转换。此外,Intel上唯一的无符号乘法是全宽度乘法,而有符号乘法可以使用有符号乘法低位指令。

    GCC的软件从无符号int到double的转换很可能是次优的(考虑到您所观察到的减速幅度,几乎可以肯定是次优的),但使用有符号整数时,代码的速度会更快。

    假设使用智能编译器,64位系统上的差异应该小得多,因为64位有符号整数->双转换可用于高效地进行32位无符号转换。

    举例来说,这是:

    sum += *data * x;
    

    如果整数变量是有符号的,则应编译成以下内容:

    mov       (data),   %eax
    imul      %ecx,     %eax
    cvtsi2sd  %eax,     %xmm1
    addsd     %xmm1,    %xmm0
    

    另一方面,如果整数变量是无符号的, cvtsi2sd 无法用于执行转换,因此需要软件解决方案。我希望看到这样的情况:

        mov       (data),   %eax
        mul       %ecx            // might be slower than imul
        cvtsi2sd  %eax,     %xmm1 // convert as though signed integer
        test      %eax,     %eax  // check if high bit was set
        jge       1f              // if it was, we need to adjust the converted
        addsd     (2^32),   %xmm1 // value by adding 2^32
    1:  addsd     %xmm1,    %xmm0
    

    对于未签名->双转换;情况很可能更糟。

        2
  •  3
  •   anon anon    15 年前

    下面是一些由VC++6.0生成的代码-无优化:

    4:        int x = 12345;
    0040E6D8   mov         dword ptr [ebp-4],3039h
    5:        double d1 = x;
    0040E6DF   fild        dword ptr [ebp-4]
    0040E6E2   fstp        qword ptr [ebp-0Ch]
    6:        unsigned int y = 12345;
    0040E6E5   mov         dword ptr [ebp-10h],3039h
    7:        double d2 = y;
    0040E6EC   mov         eax,dword ptr [ebp-10h]
    0040E6EF   mov         dword ptr [ebp-20h],eax
    0040E6F2   mov         dword ptr [ebp-1Ch],0
    0040E6F9   fild        qword ptr [ebp-20h]
    0040E6FC   fstp        qword ptr [ebp-18h]
    

    正如您所看到的,转换无符号数据需要做更多的工作。

        3
  •  1
  •   Mihai Iorga    12 年前

    发布模式。。。

    With int: 4.23944e+009 in 9secs
    With unsigned int: 4.23944e+009 in 18secs
    With int: 4.23944e+009 in 9secs
    

    调试模式。。。

    With int: 4.23944e+009 in 34secs
    With unsigned int: 4.23944e+009 in 58secs
    With int: 4.23944e+009 in 34secs
    

    ASM处于释放模式((未签名)

        for (int i = 0; i != Nr_Samples; ++i) { 
    011714A1  fldz  
    011714A3  mov         edx,dword ptr [esi+4]  
    011714A6  add         esp,4  
    011714A9  xor         edi,edi  
    011714AB  sub         edx,dword ptr [esi]  
            moments_results[i] = moments<T>(dataptr, data.size(), 128); 
    011714AD  mov         ecx,dword ptr [ebp-1388Ch]  
    011714B3  fld         st(0)  
    011714B5  xor         eax,eax  
    011714B7  test        edx,edx  
    011714B9  je          measure<unsigned int>+79h (11714E9h)  
    011714BB  mov         esi,edx  
    011714BD  movzx       ebx,byte ptr [ecx]  
    011714C0  imul        ebx,eax  
    011714C3  mov         dword ptr [ebp-138A4h],ebx  
    011714C9  fild        dword ptr [ebp-138A4h]  //only in unsigned
    011714CF  test        ebx,ebx  //only in unsigned
    011714D1  jns         measure<unsigned int>+69h (11714D9h)  //only in unsigned
    011714D3  fadd        qword ptr [__real@41f0000000000000 (11731C8h)]  //only in unsigned
    011714D9  inc         eax  
    011714DA  faddp       st(1),st  
    011714DC  cmp         eax,80h  
    011714E1  jne         measure<unsigned int>+75h (11714E5h)  
    011714E3  xor         eax,eax  
    011714E5  inc         ecx  
    011714E6  dec         esi  
    011714E7  jne         measure<unsigned int>+4Dh (11714BDh)  
    011714E9  fstp        qword ptr [ebp+edi*8-13888h]  
    011714F0  inc         edi  
    011714F1  cmp         edi,2710h  
    011714F7  jne         measure<unsigned int>+3Dh (11714ADh)  
        } 
    

        for (int i = 0; i != Nr_Samples; ++i) { 
    012A1351  fldz  
    012A1353  mov         edx,dword ptr [esi+4]  
    012A1356  add         esp,4  
    012A1359  xor         edi,edi  
    012A135B  sub         edx,dword ptr [esi]  
            moments_results[i] = moments<T>(dataptr, data.size(), 128); 
    012A135D  mov         ecx,dword ptr [ebp-13890h]  
    012A1363  fld         st(0)  
    012A1365  xor         eax,eax  
    012A1367  test        edx,edx  
    012A1369  je          measure<int>+6Fh (12A138Fh)  
    012A136B  mov         esi,edx  
    012A136D  movzx       ebx,byte ptr [ecx]  
    012A1370  imul        ebx,eax  
    012A1373  mov         dword ptr [ebp-1388Ch],ebx  
    012A1379  inc         eax  
    012A137A  fild        dword ptr [ebp-1388Ch]  //only in signed
    012A1380  faddp       st(1),st  
    012A1382  cmp         eax,80h  
    012A1387  jne         measure<int>+6Bh (12A138Bh)  
    012A1389  xor         eax,eax  
    012A138B  inc         ecx  
    012A138C  dec         esi  
    012A138D  jne         measure<int>+4Dh (12A136Dh)  
    012A138F  fstp        qword ptr [ebp+edi*8-13888h]  
    012A1396  inc         edi  
    012A1397  cmp         edi,2710h  
    012A139D  jne         measure<int>+3Dh (12A135Dh)  
        } 
    

    有趣的。。。启用释放模式和SSE时(已删除fld和fld说明,但添加了4个说明)

    With int: 4.23944e+009 in 8secs
    With unsigned int: 4.23944e+009 in 10secs
    With int: 4.23944e+009 in 8secs
    
    
        for (int i = 0; i != Nr_Samples; ++i) { 
    00F614C1  mov         edx,dword ptr [esi+4]  
    00F614C4  xorps       xmm0,xmm0  //added in sse version
    00F614C7  add         esp,4  
    00F614CA  xor         edi,edi  
    00F614CC  sub         edx,dword ptr [esi]  
            moments_results[i] = moments<T>(dataptr, data.size(), 128); 
    00F614CE  mov         ecx,dword ptr [ebp-13894h]  
    00F614D4  xor         eax,eax  
    00F614D6  movsd       mmword ptr [ebp-13890h],xmm0  //added in sse version
    00F614DE  test        edx,edx  
    00F614E0  je          measure<unsigned int>+8Ch (0F6151Ch)  
    00F614E2  fld         qword ptr [ebp-13890h]  //added in sse version
    00F614E8  mov         esi,edx  
    00F614EA  movzx       ebx,byte ptr [ecx]  
    00F614ED  imul        ebx,eax  
    00F614F0  mov         dword ptr [ebp-1388Ch],ebx  
    00F614F6  fild        dword ptr [ebp-1388Ch]  
    00F614FC  test        ebx,ebx  
    00F614FE  jns         measure<unsigned int>+76h (0F61506h)  
    00F61500  fadd        qword ptr [__real@41f0000000000000 (0F631C8h)]  
    00F61506  inc         eax  
    00F61507  faddp       st(1),st  
    00F61509  cmp         eax,80h  
    00F6150E  jne         measure<unsigned int>+82h (0F61512h)  
    00F61510  xor         eax,eax  
    00F61512  inc         ecx  
    00F61513  dec         esi  
    00F61514  jne         measure<unsigned int>+5Ah (0F614EAh)  
    00F61516  fstp        qword ptr [ebp-13890h]  
    00F6151C  movsd       xmm1,mmword ptr [ebp-13890h]  //added in sse version
    00F61524  movsd       mmword ptr [ebp+edi*8-13888h],xmm1  //added in sse version
    00F6152D  inc         edi  
    00F6152E  cmp         edi,2710h  
    00F61534  jne         measure<unsigned int>+3Eh (0F614CEh)  
        } 
    
        4
  •  0
  •   anio    12 年前

    我在运行Linux的64位机器上运行了GCC4.7.0。 我把计时电话换成了计时电话。

    CPU:Intel X5680@3.33 GHZ

    GCC标志:-Wall-pedantic-O3-std=c++11

    With int time per operation in ns: 11996, total time sec: 1.57237
    Avg values: 1.06353e+09
    With unsigned int time per operation in ns: 11539, total time sec: 1.5125
    Avg values: 1.06353e+09
    With int time per operation in ns: 11994, total time sec: 1.57217
    Avg values: 1.06353e+09
    

    显然,在我的机器/编译器上,unsigned更快。