代码之家  ›  专栏  ›  技术社区  ›  markzzz

为什么 godbolt 生成的 asm 输出与 Visual Studio 中实际生成的 asm 代码不同?

  •  1
  • markzzz  · 技术社区  · 6 年前

    godbolt .

    下面是 Visual Studio 为我的 main.asm 文件生成的内容(通过 Project > C/C++ > Output Files 启用,在 Assembler Output 字段中选择 Assembly With Source Code (/FAs)):

    ; Listing generated by Microsoft (R) Optimizing Compiler Version 19.15.26732.1 
    
        TITLE   c:\repos\analysis_vectorization\analysis_vectorization\main.cpp
        .686P
        .XMM
        include listing.inc
        .model  flat
    
    INCLUDELIB OLDNAMES
    
    EXTRN   __imp____std_terminate:PROC
    EXTRN   @__security_check_cookie@4:PROC
    EXTRN   __imp____CxxFrameHandler3:PROC
    PUBLIC  ??$?RABNABN@?$less@X@std@@QBE_NABN0@Z       ; std::less<void>::operator()<double const &,double const &>
    PUBLIC  ??$clamp@NU?$less@X@std@@@std@@YAABNABN00U?$less@X@0@@Z ; std::clamp<double,std::less<void> >
    PUBLIC  ??$clamp@N@std@@YAABNABN00@Z            ; std::clamp<double>
    PUBLIC  _main
    PUBLIC  ?ProcessOptimized@MyPlugin@@QAEXH@Z     ; MyPlugin::ProcessOptimized
    PUBLIC  ?Process@MyPlugin@@QAEXH@Z          ; MyPlugin::Process
    PUBLIC  ??1MyPlugin@@QAE@XZ             ; MyPlugin::~MyPlugin
    PUBLIC  ??0MyPlugin@@QAE@XZ             ; MyPlugin::MyPlugin
    PUBLIC  ?ProcessOptimized@Param@@QAEXHH@Z       ; Param::ProcessOptimized
    PUBLIC  ?Process@Param@@QAEXHH@Z            ; Param::Process
    PUBLIC  ??0Param@@QAE@XZ                ; Param::Param
    PUBLIC  __real@3ff0000000000000
    PUBLIC  __real@400921fb54442d18
    PUBLIC  __real@4024000000000000
    PUBLIC  __real@406fe00000000000
    PUBLIC  __xmm@00000003000000020000000100000000
    PUBLIC  __xmm@400921fb54442d18400921fb54442d18
    PUBLIC  __xmm@406fe00000000000406fe00000000000
    EXTRN   __chkstk:PROC
    EXTRN   ___security_cookie:DWORD
    EXTRN   __fltused:DWORD
    ; 16-byte vector constants in the CONST segment; every byte is spelled as a hex literal.
    ;   COMDAT __xmm@406fe00000000000406fe00000000000
    ; Two packed doubles, each with bit pattern 406FE00000000000h, stored little-endian.
    CONST   SEGMENT
    __xmm@406fe00000000000406fe00000000000 DB 00H, 00H, 00H, 00H, 00H, 0e0H, 06fH, 040H
        DB  00H, 00H, 00H, 00H, 00H, 0e0H, 06fH, 040H
    CONST   ENDS
    ;   COMDAT __xmm@400921fb54442d18400921fb54442d18
    ; Two packed doubles, each with bit pattern 400921FB54442D18h, stored little-endian.
    CONST   SEGMENT
    __xmm@400921fb54442d18400921fb54442d18 DB 018H, 02dH, 044H, 054H, 0fbH, 021H, 09H, 040H
        DB  018H, 02dH, 044H, 054H, 0fbH, 021H, 09H, 040H
    CONST   ENDS
    ;   COMDAT __xmm@00000003000000020000000100000000
    ; Four packed 32-bit integers 0, 1, 2, 3 (lowest lane first), stored little-endian.
    CONST   SEGMENT
    __xmm@00000003000000020000000100000000 DB 00H, 00H, 00H, 00H, 01H, 00H, 00H, 00H
        DB  02H, 00H, 00H, 00H, 03H, 00H, 00H, 00H
    CONST   ENDS
    ; Scalar double constants. Each DQ uses MASM's hex-real form (trailing 'r'), i.e. the
    ; raw IEEE-754 bit pattern; the trailing comment on each line is the decimal value.
    ;   COMDAT __real@406fe00000000000
    CONST   SEGMENT
    __real@406fe00000000000 DQ 0406fe00000000000r   ; 255
    CONST   ENDS
    ;   COMDAT __real@4024000000000000
    CONST   SEGMENT
    __real@4024000000000000 DQ 04024000000000000r   ; 10
    CONST   ENDS
    ;   COMDAT __real@400921fb54442d18
    ; Used by the Process routines as the upper clamp bound (PI in the annotated source).
    CONST   SEGMENT
    __real@400921fb54442d18 DQ 0400921fb54442d18r   ; 3.14159
    CONST   ENDS
    ;   COMDAT __real@3ff0000000000000
    CONST   SEGMENT
    __real@3ff0000000000000 DQ 03ff0000000000000r   ; 1
    CONST   ENDS
    ; Function compile flags: /Ogtp
    ; File c:\repos\analysis_vectorization\analysis_vectorization\main.cpp
    ;   COMDAT ??0Param@@QAE@XZ
    ; Param::Param() -- default constructor.
    ; In:  ecx = this.
    ; Out: eax = this.
    ; Writes five doubles at offsets 0, 16, 32, 48 and 64 from this:
    ; 0.0, 0.0, 10.0, 1.0, 1.0. Only xmm0 and eax are modified.
    _TEXT   SEGMENT
    ??0Param@@QAE@XZ PROC                   ; Param::Param, COMDAT
    ; _this$ = ecx
    
    ; 23   :    Param() { }
    
        ; xmm0 = 0.0 for the first two fields
        xorps   xmm0, xmm0
        ; return value: the object pointer
        mov eax, ecx
        movsd   QWORD PTR [ecx], xmm0
        movsd   QWORD PTR [ecx+16], xmm0
        ; 10.0 -> [this+32]
        movsd   xmm0, QWORD PTR __real@4024000000000000
        movsd   QWORD PTR [ecx+32], xmm0
        ; 1.0 -> [this+48] and [this+64]
        movsd   xmm0, QWORD PTR __real@3ff0000000000000
        movsd   QWORD PTR [ecx+48], xmm0
        movsd   QWORD PTR [ecx+64], xmm0
        ret 0
    ??0Param@@QAE@XZ ENDP                   ; Param::Param
    _TEXT   ENDS
    ; Function compile flags: /Ogtp
    ; File c:\repos\analysis_vectorization\analysis_vectorization\main.cpp
    ; File c:\program files (x86)\microsoft visual studio\2017\community\vc\tools\msvc\14.15.26726\include\algorithm
    ; File c:\program files (x86)\microsoft visual studio\2017\community\vc\tools\msvc\14.15.26726\include\xstddef
    ; File c:\program files (x86)\microsoft visual studio\2017\community\vc\tools\msvc\14.15.26726\include\algorithm
    ; File c:\repos\analysis_vectorization\analysis_vectorization\main.cpp
    ;   COMDAT ?Process@Param@@QAEXHH@Z
    ; Param::Process(int voiceIndex, int blockSize) -- scalar clamp-and-accumulate loop.
    ; In:  ecx = this, [ebp+8] = voiceIndex. The blockSize slot ([ebp+12]) is marked
    ;      dead and is never read; the loop below always runs 256 trips.
    ; Out: the running sum is written back to QWORD PTR [this]; callee pops 8 bytes.
    ; Loop registers: xmm1 = running sum (loaded from [this]), xmm2 = upper bound (PI),
    ;      xmm5 = lower bound (0.0), xmm3 = [this+48]*[this+32] (bp0), xmm4 = [this+64],
    ;      edx = address of the current pC element, edx-2048 = matching pB element,
    ;      ecx = trips remaining.
    ; esi and edi are saved and restored; eax, ecx, edx and xmm0-xmm5 are overwritten.
    _TEXT   SEGMENT
    ; Stack temporaries backing the by-reference clamp arguments:
    ; $T1 holds PI, $T2 holds 0.0, $T3 holds the value being clamped.
    $T1 = -24                       ; size = 8
    $T3 = -16                       ; size = 8
    $T2 = -8                        ; size = 8
    _voiceIndex$ = 8                    ; size = 4
    _blockSize$dead$ = 12                   ; size = 4
    ?Process@Param@@QAEXHH@Z PROC               ; Param::Process, COMDAT
    ; _this$ = ecx
    
    ; 25   :    inline void Process(int voiceIndex, int blockSize) {
    
        push    ebp
        mov ebp, esp
        sub esp, 24                 ; 00000018H
    
    ; 26   :        double *pB = b[voiceIndex];
    
        mov eax, DWORD PTR _voiceIndex$[ebp]
        xorps   xmm5, xmm5
    
    ; 32   :            // some other code (that will use phase, like sin(phase))
    ; 33   : 
    ; 34   :            phase += std::clamp(mRadiansPerSample * (bp0 * pB[sampleIndex] + pC[sampleIndex]), 0.0, PI);
    
        movsd   xmm2, QWORD PTR __real@400921fb54442d18
        push    esi
        mov esi, ecx
        ; eax = voiceIndex * 2048, the byte offset of the selected row
        shl eax, 11                 ; 0000000bH
        push    edi
        ; $T1 is written here but not read again anywhere in this routine
        movsd   QWORD PTR $T1[ebp], xmm2
        mov ecx, 256                ; 00000100H
        movsd   QWORD PTR $T2[ebp], xmm5
        movsd   xmm3, QWORD PTR [esi+48]
        ; edx = this + 2128 (+ row offset added below): first pC element of the row
        lea edx, DWORD PTR [esi+2128]
        movsd   xmm1, QWORD PTR [esi]
        add edx, eax
        mulsd   xmm3, QWORD PTR [esi+32]
        movsd   xmm4, QWORD PTR [esi+64]
        npad    11
    $LL4@Process:
        ; xmm0 = (pB[i] * bp0 + pC[i]) * [this+64]
        movsd   xmm0, QWORD PTR [edx-2048]
        mulsd   xmm0, xmm3
        addsd   xmm0, QWORD PTR [edx]
        mulsd   xmm0, xmm4
        ; compare against PI first; movsd below does not change the flags
        comisd  xmm0, xmm2
        movsd   QWORD PTR $T3[ebp], xmm0
    ; File c:\program files (x86)\microsoft visual studio\2017\community\vc\tools\msvc\14.15.26726\include\algorithm
    
    ; 5669 :    return (_DEBUG_LT_PRED(_Pred, _Max_val, _Val)
    
        ; value <= PI (or unordered): go check the lower bound; otherwise result = PI
        jbe SHORT $LN10@Process
        movaps  xmm0, xmm2
        jmp SHORT $LN11@Process
    $LN10@Process:
    ; File c:\program files (x86)\microsoft visual studio\2017\community\vc\tools\msvc\14.15.26726\include\xstddef
    
    ; 287  :        return (static_cast<_Ty1&&>(_Left)
    
        comisd  xmm5, xmm0
    ; File c:\program files (x86)\microsoft visual studio\2017\community\vc\tools\msvc\14.15.26726\include\algorithm
    
    ; 5669 :    return (_DEBUG_LT_PRED(_Pred, _Max_val, _Val)
    
        ; Branch-free select by address: eax starts at &0.0 and is replaced by &value
        ; when 0.0 <= value (or unordered). lea leaves the comisd flags intact.
        lea eax, DWORD PTR $T2[ebp]
        lea edi, DWORD PTR $T3[ebp]
        cmovbe  eax, edi
        movsd   xmm0, QWORD PTR [eax]
    $LN11@Process:
    ; File c:\repos\analysis_vectorization\analysis_vectorization\main.cpp
    
    ; 31   :        for (int sampleIndex = 0; sampleIndex < blockSize; sampleIndex++) {
    
        ; advance one double in both rows
        add edx, 8
    
    ; 32   :            // some other code (that will use phase, like sin(phase))
    ; 33   : 
    ; 34   :            phase += std::clamp(mRadiansPerSample * (bp0 * pB[sampleIndex] + pC[sampleIndex]), 0.0, PI);
    
        addsd   xmm1, xmm0
        sub ecx, 1
        jne SHORT $LL4@Process
    
    ; 35   :        }
    ; 36   : 
    ; 37   :        mPhase = phase;
    ; 38   :    }
    
        pop edi
        movsd   QWORD PTR [esi], xmm1
        pop esi
        mov esp, ebp
        pop ebp
        ret 8
    ?Process@Param@@QAEXHH@Z ENDP               ; Param::Process
    _TEXT   ENDS
    ; Function compile flags: /Ogtp
    ; File c:\repos\analysis_vectorization\analysis_vectorization\main.cpp
    ;   COMDAT ?ProcessOptimized@Param@@QAEXHH@Z
    ; Param::ProcessOptimized(int voiceIndex, int blockSize) -- SSE2 packed-double variant.
    ; In:  ecx = this, [ebx+8] = voiceIndex (arguments are addressed through ebx because
    ;      ebp is re-based onto a 16-byte-aligned frame). The blockSize slot is marked
    ;      dead and never read; the loop always runs 128 trips.
    ; Out: the upper lane of the accumulator is stored to QWORD PTR [this+16];
    ;      callee pops 8 bytes.
    ; Loop registers: xmm0 = accumulator (both lanes start as [this+16]),
    ;      xmm5 = {k, k} with k = [this+48]*[this+32]*[this+64], xmm6 = {[this+64] x2},
    ;      xmm7 = {0.0, 0.0} lower bound, xmm4/xmm1 = scaled pB/pC pair,
    ;      xmm3/xmm2 = scaled pB/pC pair shifted by one element
    ;      (initially {0.0, pB[0]} and {0.0, pC[0]}),
    ;      eax = address of pC[1] advanced by 16 bytes per trip, ecx = trips remaining.
    ; NOTE(review): on the final trip the four reload instructions address elements
    ;      255..257 of each row, i.e. up to two doubles past a 256-element row (row size
    ;      inferred from the 2048-byte row stride). Those values are scaled but never
    ;      accumulated. Confirm against the array bounds in the C++ source.
    _TEXT   SEGMENT
    _v_phase$ = -16                     ; size = 16
    _voiceIndex$ = 8                    ; size = 4
    _blockSize$dead$ = 12                   ; size = 4
    ?ProcessOptimized@Param@@QAEXHH@Z PROC          ; Param::ProcessOptimized, COMDAT
    ; _this$ = ecx
    
    ; 39   :    inline void ProcessOptimized(int voiceIndex, int blockSize) {
    
        ; Stack realignment prologue: ebx keeps the incoming esp for argument access,
        ; esp is rounded down to a 16-byte boundary, and a second frame is built on it
        ; (saved ebp plus a copy of the return address taken from [ebx+4]).
        push    ebx
        mov ebx, esp
        sub esp, 8
        and esp, -16                ; fffffff0H
        add esp, 4
        push    ebp
        mov ebp, DWORD PTR [ebx+4]
        mov DWORD PTR [esp+4], ebp
        mov ebp, esp
    
    ; 40   :        double *pB = b[voiceIndex];
    
        mov eax, DWORD PTR _voiceIndex$[ebx]
        mov edx, ecx
        ; eax = voiceIndex * 2048, the byte offset of the selected row
        shl eax, 11                 ; 0000000bH
        xorps   xmm3, xmm3
        xorps   xmm2, xmm2
        ; reserve the 16-byte _v_phase$ slot at [ebp-16]; ebp is 16-byte aligned here
        sub esp, 16                 ; 00000010H
        xorps   xmm7, xmm7
        mov ecx, 128                ; 00000080H
    
    ; 41   :        double *pC = c[voiceIndex];
    ; 42   :        double phase = mPhaseOptimized;
    ; 43   :        double bp0 = mNoteFrequency * mHostPitch;
    
        movsd   xmm5, QWORD PTR [edx+48]
        mulsd   xmm5, QWORD PTR [edx+32]
    
    ; 44   : 
    ; 45   :        __m128d v_boundLower = _mm_set1_pd(0.0);
    ; 46   :        __m128d v_boundUpper = _mm_set1_pd(PI);
    ; 47   :        __m128d v_radiansPerSampleBp0 = _mm_set1_pd(mRadiansPerSample * bp0);
    
        movsd   xmm6, QWORD PTR [edx+64]
    
    ; 48   :        __m128d v_radiansPerSample = _mm_set1_pd(mRadiansPerSample);
    ; 49   : 
    ; 50   :        __m128d v_pB0 = _mm_load_pd(pB);
    ; 51   :        v_pB0 = _mm_mul_pd(v_pB0, v_radiansPerSampleBp0);
    ; 52   :        __m128d v_pC0 = _mm_load_pd(pC);
    ; 53   :        v_pC0 = _mm_mul_pd(v_pC0, v_radiansPerSample);
    ; 54   : 
    ; 55   :        __m128d v_pB1 = _mm_setr_pd(0.0, pB[0]);
    
        ; The _mm_load_pd calls in the source are emitted as unaligned movups loads here.
        movsd   xmm0, QWORD PTR [eax+edx+80]
        movups  xmm4, XMMWORD PTR [eax+edx+80]
        movups  xmm1, XMMWORD PTR [eax+edx+2128]
        mulsd   xmm5, xmm6
        ; xmm3 = {0.0, pB[0]}
        unpcklpd xmm3, xmm0
    
    ; 56   :        v_pB1 = _mm_mul_pd(v_pB1, v_radiansPerSampleBp0);
    ; 57   :        __m128d v_pC1 = _mm_setr_pd(0.0, pC[0]);
    
        movsd   xmm0, QWORD PTR [eax+edx+2128]
        ; eax becomes this + row offset + 2136, the address of pC[1]
        add eax, 2136               ; 00000858H
        ; xmm2 = {0.0, pC[0]}
        unpcklpd xmm2, xmm0
        add eax, edx
    
    ; 58   :        v_pC1 = _mm_mul_pd(v_pC1, v_radiansPerSample);
    ; 59   : 
    ; 60   :        __m128d v_phaseAcc1;
    ; 61   :        __m128d v_phaseAcc2;
    ; 62   :        __m128d v_phase = _mm_set1_pd(phase);
    
        movsd   xmm0, QWORD PTR [edx+16]
        ; broadcast the two scale factors and the starting phase to both lanes
        unpcklpd xmm5, xmm5
        unpcklpd xmm6, xmm6
        mulpd   xmm4, xmm5
        mulpd   xmm1, xmm6
        mulpd   xmm3, xmm5
        mulpd   xmm2, xmm6
        unpcklpd xmm0, xmm0
        npad    2
    $LL4@ProcessOpt:
    
    ; 63   : 
    ; 64   :        for (int sampleIndex = 0; sampleIndex < blockSize; sampleIndex += 2, pB += 2, pC += 2) {
    ; 65   :            // some other code (that will use phase, like sin(phase))
    ; 66   : 
    ; 67   :            v_phaseAcc1 = _mm_add_pd(v_pB0, v_pC0);
    
        addpd   xmm1, xmm4
    
    ; 68   :            v_phaseAcc1 = _mm_max_pd(v_phaseAcc1, v_boundLower);
    ; 69   :            v_phaseAcc1 = _mm_min_pd(v_phaseAcc1, v_boundUpper);
    ; 70   :            v_phaseAcc2 = _mm_add_pd(v_pB1, v_pC1);
    ; 71   :            v_phaseAcc2 = _mm_max_pd(v_phaseAcc2, v_boundLower);
    ; 72   :            v_phaseAcc2 = _mm_min_pd(v_phaseAcc2, v_boundUpper);
    ; 73   :            v_phase = _mm_add_pd(v_phase, v_phaseAcc1);
    ; 74   :            v_phase = _mm_add_pd(v_phase, v_phaseAcc2);
    ; 75   : 
    ; 76   :            v_pB0 = _mm_load_pd(pB + 2);
    
        ; The reloads for the next trip are interleaved with this trip's clamp and adds.
        ; eax-2040 is the pB pair two elements ahead; eax-2048 is the pair one ahead.
        movups  xmm4, XMMWORD PTR [eax-2040]
        addpd   xmm2, xmm3
    
    ; 77   :            v_pB0 = _mm_mul_pd(v_pB0, v_radiansPerSampleBp0);
    ; 78   :            v_pC0 = _mm_load_pd(pC + 2);
    ; 79   :            v_pC0 = _mm_mul_pd(v_pC0, v_radiansPerSample);
    ; 80   : 
    ; 81   :            v_pB1 = _mm_loadu_pd(pB + 1);
    
        movups  xmm3, XMMWORD PTR [eax-2048]
        ; clamp both partial sums to [0.0, PI] per lane
        maxpd   xmm1, xmm7
        maxpd   xmm2, xmm7
        minpd   xmm1, XMMWORD PTR __xmm@400921fb54442d18400921fb54442d18
        minpd   xmm2, XMMWORD PTR __xmm@400921fb54442d18400921fb54442d18
        addpd   xmm0, xmm1
        movups  xmm1, XMMWORD PTR [eax+8]
        addpd   xmm0, xmm2
    
    ; 82   :            v_pB1 = _mm_mul_pd(v_pB1, v_radiansPerSampleBp0);
    ; 83   :            v_pC1 = _mm_loadu_pd(pC + 1);
    
        movups  xmm2, XMMWORD PTR [eax]
        add eax, 16                 ; 00000010H
        ; the accumulator is spilled to the aligned stack slot on every trip
        movaps  XMMWORD PTR _v_phase$[ebp], xmm0
        mulpd   xmm4, xmm5
        mulpd   xmm1, xmm6
        mulpd   xmm3, xmm5
    
    ; 84   :            v_pC1 = _mm_mul_pd(v_pC1, v_radiansPerSample);
    
        mulpd   xmm2, xmm6
        sub ecx, 1
        jne SHORT $LL4@ProcessOpt
    
    ; 85   :        }
    ; 86   : 
    ; 87   :        mPhaseOptimized = v_phase.m128d_f64[blockSize % 2 == 0 ? 1 : 0];
    
        ; reads the upper lane (offset +8 in the slot); no test on blockSize is emitted
        movsd   xmm0, QWORD PTR _v_phase$[ebp+8]
        movsd   QWORD PTR [edx+16], xmm0
    
    ; 88   :    }
    
        ; unwind the aligned frame first, then the original ebx-based frame
        mov esp, ebp
        pop ebp
        mov esp, ebx
        pop ebx
        ret 8
    ?ProcessOptimized@Param@@QAEXHH@Z ENDP          ; Param::ProcessOptimized
    _TEXT   ENDS
    ; Function compile flags: /Ogtp
    ; File c:\repos\analysis_vectorization\analysis_vectorization\main.cpp
    ;   COMDAT ??0MyPlugin@@QAE@XZ
    ; MyPlugin::MyPlugin() -- constructor.
    ; In:  ecx = this.
    ; Out: eax = this.
    ; 1. Stores the five field defaults at offsets 0/16/32/48/64 (0.0, 0.0, 10.0, 1.0, 1.0).
    ; 2. Fills 256 doubles starting at this+80 with i / 255.0, four values per loop trip.
    ; 3. Zero-fills 2048 bytes starting at this+2128 with rep stosd.
    ; esi and edi are saved and restored; ecx, edx and xmm0-xmm3 are overwritten.
    _TEXT   SEGMENT
    ??0MyPlugin@@QAE@XZ PROC                ; MyPlugin::MyPlugin, COMDAT
    ; _this$ = ecx
    
    ; 97   :        // fill b
    ; 98   :        for (int voiceIndex = 0; voiceIndex < voiceSize; voiceIndex++) {
    ; 99   :            for (int sampleIndex = 0; sampleIndex < bufferSize; sampleIndex++) {
    
        ; xmm2 = integer lanes {0, 1, 2, 3}; xmm3 = {255.0, 255.0}; edx = sampleIndex = 0
        movaps  xmm2, XMMWORD PTR __xmm@00000003000000020000000100000000
        xorps   xmm0, xmm0
        movaps  xmm3, XMMWORD PTR __xmm@406fe00000000000406fe00000000000
        xor edx, edx
        push    esi
        mov esi, ecx
        push    edi
    
    ; 14   :    alignas(16) double mPhase = 0.0;
    
        movsd   QWORD PTR [esi], xmm0
    
    ; 97   :        // fill b
    ; 98   :        for (int voiceIndex = 0; voiceIndex < voiceSize; voiceIndex++) {
    ; 99   :            for (int sampleIndex = 0; sampleIndex < bufferSize; sampleIndex++) {
    
        ; Store cursor. It is bumped by 32 at the top of each trip and the stores use
        ; displacements -40..-16, so the first trip writes this+80..this+111.
        lea ecx, DWORD PTR [esi+88]
    
    ; 15   :    alignas(16) double mPhaseOptimized = 0.0;
    
        movsd   QWORD PTR [esi+16], xmm0
    
    ; 16   :    alignas(16) double mNoteFrequency = 10.0;
    
        movsd   xmm0, QWORD PTR __real@4024000000000000
        movsd   QWORD PTR [esi+32], xmm0
    
    ; 17   :    alignas(16) double mHostPitch = 1.0;
    
        movsd   xmm0, QWORD PTR __real@3ff0000000000000
        movsd   QWORD PTR [esi+48], xmm0
    
    ; 18   :    alignas(16) double mRadiansPerSample = 1.0;
    
        movsd   QWORD PTR [esi+64], xmm0
    $LL7@MyPlugin:
    
    ; 100  :                double value = (sampleIndex / ((double)bufferSize - 1));
    
        ; First pair: broadcast edx, add lanes {0, 1}, convert the low two ints to
        ; doubles and divide by 255.0. eax = edx + 2 is captured for the second pair
        ; before edx is advanced by 4.
        movd    xmm0, edx
        lea eax, DWORD PTR [edx+2]
        pshufd  xmm1, xmm0, 0
        lea ecx, DWORD PTR [ecx+32]
        movq    xmm0, xmm2
        add edx, 4
        paddd   xmm1, xmm0
        cvtdq2pd xmm0, xmm1
        divpd   xmm0, xmm3
    
    ; 101  : 
    ; 102  :                mParam1.b[voiceIndex][sampleIndex] = value;
    
        movlpd  QWORD PTR [ecx-40], xmm0
        movhpd  QWORD PTR [ecx-32], xmm0
        ; Second pair: same computation starting from eax (= old edx + 2).
        movd    xmm0, eax
        pshufd  xmm1, xmm0, 0
        movq    xmm0, xmm2
        paddd   xmm1, xmm0
        cvtdq2pd xmm0, xmm1
        divpd   xmm0, xmm3
        movlpd  QWORD PTR [ecx-24], xmm0
        movhpd  QWORD PTR [ecx-16], xmm0
        cmp edx, 256                ; 00000100H
        jl  SHORT $LL7@MyPlugin
    
    ; 103  :            }
    ; 104  :        }
    ; 105  : 
    ; 106  :        // fill c
    ; 107  :        for (int voiceIndex = 0; voiceIndex < voiceSize; voiceIndex++) {
    ; 108  :            for (int sampleIndex = 0; sampleIndex < bufferSize; sampleIndex++) {
    
        ; The whole fill loop is emitted as one rep stosd: 512 zero dwords (2048 bytes).
        lea edi, DWORD PTR [esi+2128]
        xor eax, eax
        mov ecx, 512                ; 00000200H
        rep stosd
    
    ; 109  :                double value = 0.0;
    ; 110  : 
    ; 111  :                mParam1.c[voiceIndex][sampleIndex] = value;
    ; 112  :            }
    ; 113  :        }
    ; 114  :    }
    
        pop edi
        ; return this
        mov eax, esi
        pop esi
        ret 0
    ??0MyPlugin@@QAE@XZ ENDP                ; MyPlugin::MyPlugin
    _TEXT   ENDS
    ; Function compile flags: /Ogtp
    ; File c:\repos\analysis_vectorization\analysis_vectorization\main.cpp
    ;   COMDAT ??1MyPlugin@@QAE@XZ
    ; MyPlugin::~MyPlugin() -- empty destructor: the body is a bare return and the
    ; incoming this pointer (ecx) is marked dead.
    _TEXT   SEGMENT
    ??1MyPlugin@@QAE@XZ PROC                ; MyPlugin::~MyPlugin, COMDAT
    ; _this$dead$ = ecx
    
    ; 115  :    ~MyPlugin() { }
    
        ret 0
    ??1MyPlugin@@QAE@XZ ENDP                ; MyPlugin::~MyPlugin
    _TEXT   ENDS
    ; Function compile flags: /Ogtp
    ; File c:\repos\analysis_vectorization\analysis_vectorization\main.cpp
    ; File c:\program files (x86)\microsoft visual studio\2017\community\vc\tools\msvc\14.15.26726\include\algorithm
    ; File c:\program files (x86)\microsoft visual studio\2017\community\vc\tools\msvc\14.15.26726\include\xstddef
    ; File c:\program files (x86)\microsoft visual studio\2017\community\vc\tools\msvc\14.15.26726\include\algorithm
    ; File c:\repos\analysis_vectorization\analysis_vectorization\main.cpp
    ;   COMDAT ?Process@MyPlugin@@QAEXH@Z
    ; MyPlugin::Process(int blockSize) -- scalar clamp-and-accumulate loop; the source
    ; annotations show the body of Param::Process (source lines 27-37) expanded in place.
    ; In:  ecx = this. The blockSize slot ([ebp+8]) is marked dead and never read;
    ;      the loop always runs 256 trips over a single row.
    ; Out: the running sum is written back to QWORD PTR [this]; callee pops 4 bytes.
    ; Loop registers: xmm1 = running sum, xmm2 = upper bound (PI), xmm5 = lower bound (0.0),
    ;      xmm3 = [this+48]*[this+32] (bp0), xmm4 = [this+64],
    ;      ecx = address of the current pC element (starts at this+2128),
    ;      ecx-2048 = matching pB element, edx = trips remaining.
    ; esi and edi are saved and restored.
    _TEXT   SEGMENT
    ; Stack temporaries backing the by-reference clamp arguments:
    ; $T2 holds PI, $T3 holds 0.0, $T4 holds the value being clamped.
    $T2 = -28                       ; size = 8
    $T4 = -20                       ; size = 8
    $T3 = -12                       ; size = 8
    _blockSize$dead$ = 8                    ; size = 4
    ?Process@MyPlugin@@QAEXH@Z PROC             ; MyPlugin::Process, COMDAT
    ; _this$ = ecx
    
    ; 117  :    void Process(int blockSize) {
    
        push    ebp
        mov ebp, esp
        sub esp, 28                 ; 0000001cH
    
    ; 34   :            phase += std::clamp(mRadiansPerSample * (bp0 * pB[sampleIndex] + pC[sampleIndex]), 0.0, PI);
    
        movsd   xmm2, QWORD PTR __real@400921fb54442d18
        xorps   xmm5, xmm5
    
    ; 117  :    void Process(int blockSize) {
    
        push    esi
        mov esi, ecx
    
    ; 34   :            phase += std::clamp(mRadiansPerSample * (bp0 * pB[sampleIndex] + pC[sampleIndex]), 0.0, PI);
    
        ; $T2 is written here but not read again anywhere in this routine
        movsd   QWORD PTR $T2[ebp], xmm2
    
    ; 117  :    void Process(int blockSize) {
    
        push    edi
    
    ; 34   :            phase += std::clamp(mRadiansPerSample * (bp0 * pB[sampleIndex] + pC[sampleIndex]), 0.0, PI);
    
        movsd   QWORD PTR $T3[ebp], xmm5
        mov edx, 256                ; 00000100H
        movsd   xmm3, QWORD PTR [esi+48]
    
    ; 27   :        double *pC = c[voiceIndex];
    
        lea ecx, DWORD PTR [esi+2128]
    
    ; 28   :        double phase = mPhase;
    ; 29   :        double bp0 = mNoteFrequency * mHostPitch;
    
        movsd   xmm1, QWORD PTR [esi]
        mulsd   xmm3, QWORD PTR [esi+32]
        movsd   xmm4, QWORD PTR [esi+64]
        npad    3
    $LL9@Process:
    
    ; 32   :            // some other code (that will use phase, like sin(phase))
    ; 33   : 
    ; 34   :            phase += std::clamp(mRadiansPerSample * (bp0 * pB[sampleIndex] + pC[sampleIndex]), 0.0, PI);
    
        ; xmm0 = (pB[i] * bp0 + pC[i]) * [this+64]
        movsd   xmm0, QWORD PTR [ecx-2048]
        mulsd   xmm0, xmm3
        addsd   xmm0, QWORD PTR [ecx]
        mulsd   xmm0, xmm4
        ; compare against PI first; movsd below does not change the flags
        comisd  xmm0, xmm2
        movsd   QWORD PTR $T4[ebp], xmm0
    ; File c:\program files (x86)\microsoft visual studio\2017\community\vc\tools\msvc\14.15.26726\include\algorithm
    
    ; 5669 :    return (_DEBUG_LT_PRED(_Pred, _Max_val, _Val)
    
        ; value <= PI (or unordered): go check the lower bound; otherwise result = PI
        jbe SHORT $LN15@Process
        movaps  xmm0, xmm2
        jmp SHORT $LN16@Process
    $LN15@Process:
    ; File c:\program files (x86)\microsoft visual studio\2017\community\vc\tools\msvc\14.15.26726\include\xstddef
    
    ; 287  :        return (static_cast<_Ty1&&>(_Left)
    
        comisd  xmm5, xmm0
    ; File c:\program files (x86)\microsoft visual studio\2017\community\vc\tools\msvc\14.15.26726\include\algorithm
    
    ; 5669 :    return (_DEBUG_LT_PRED(_Pred, _Max_val, _Val)
    
        ; Branch-free select by address: eax starts at &0.0 and is replaced by &value
        ; when 0.0 <= value (or unordered). lea leaves the comisd flags intact.
        lea eax, DWORD PTR $T3[ebp]
        lea edi, DWORD PTR $T4[ebp]
        cmovbe  eax, edi
        movsd   xmm0, QWORD PTR [eax]
    $LN16@Process:
    ; File c:\repos\analysis_vectorization\analysis_vectorization\main.cpp
    
    ; 31   :        for (int sampleIndex = 0; sampleIndex < blockSize; sampleIndex++) {
    
        ; advance one double in both rows
        add ecx, 8
    
    ; 32   :            // some other code (that will use phase, like sin(phase))
    ; 33   : 
    ; 34   :            phase += std::clamp(mRadiansPerSample * (bp0 * pB[sampleIndex] + pC[sampleIndex]), 0.0, PI);
    
        addsd   xmm1, xmm0
        sub edx, 1
        jne SHORT $LL9@Process
    
    ; 118  :        for (int voiceIndex = 0; voiceIndex < voiceSize; voiceIndex++) {
    ; 119  :            mParam1.Process(voiceIndex, blockSize);
    ; 120  :        }
    ; 121  :    }
    
        pop edi
    
    ; 37   :        mPhase = phase;
    
        movsd   QWORD PTR [esi], xmm1
    
    ; 118  :        for (int voiceIndex = 0; voiceIndex < voiceSize; voiceIndex++) {
    ; 119  :            mParam1.Process(voiceIndex, blockSize);
    ; 120  :        }
    ; 121  :    }
    
        pop esi
        mov esp, ebp
        pop ebp
        ret 4
    ?Process@MyPlugin@@QAEXH@Z ENDP             ; MyPlugin::Process
    _TEXT   ENDS
    ; Function compile flags: /Ogtp
    ; File c:\repos\analysis_vectorization\analysis_vectorization\main.cpp
    ;   COMDAT ?ProcessOptimized@MyPlugin@@QAEXH@Z
    ; MyPlugin::ProcessOptimized(int blockSize) -- SSE2 packed-double variant; the source
    ; annotations show the body of Param::ProcessOptimized (source lines 40-87) expanded
    ; in place.
    ; In:  ecx = this. The blockSize slot is marked dead and never read; the loop
    ;      always runs 128 trips over a single row.
    ; Out: the upper lane of the accumulator is stored to QWORD PTR [this+16];
    ;      callee pops 4 bytes.
    ; Loop registers: xmm5 = accumulator (both lanes start as [this+16]),
    ;      xmm6 = {k, k} with k = [this+48]*[this+32]*[this+64], xmm7 = {[this+64] x2},
    ;      xmm0 = {0.0, 0.0} lower bound, xmm4/xmm1 = scaled pB/pC pair,
    ;      xmm3/xmm2 = scaled pB/pC pair shifted by one element
    ;      (initially {0.0, pB[0]} and {0.0, pC[0]}),
    ;      eax = this+2136 (address of pC[1]) advanced by 16 bytes per trip,
    ;      ecx = trips remaining.
    ; NOTE(review): on the final trip the four reload instructions address elements
    ;      255..257 of each row, i.e. up to two doubles past a 256-element row (row size
    ;      inferred from the 2048-byte gap between the two arrays). Those values are
    ;      scaled but never accumulated. Confirm against the array bounds in the C++ source.
    _TEXT   SEGMENT
    _v_phase$31 = -16                   ; size = 16
    _blockSize$dead$ = 8                    ; size = 4
    ?ProcessOptimized@MyPlugin@@QAEXH@Z PROC        ; MyPlugin::ProcessOptimized, COMDAT
    ; _this$ = ecx
    
    ; 122  :    void ProcessOptimized(int blockSize) {
    
        ; Stack realignment prologue: ebx keeps the incoming esp, esp is rounded down to
        ; a 16-byte boundary, and a second frame is built on it (saved ebp plus a copy
        ; of the return address taken from [ebx+4]).
        push    ebx
        mov ebx, esp
        sub esp, 8
        and esp, -16                ; fffffff0H
        add esp, 4
        push    ebp
        mov ebp, DWORD PTR [ebx+4]
        mov DWORD PTR [esp+4], ebp
        mov ebp, esp
        mov edx, ecx
        xorps   xmm3, xmm3
        xorps   xmm2, xmm2
        ; reserve the 16-byte _v_phase$31 slot at [ebp-16]; ebp is 16-byte aligned here
        sub esp, 16                 ; 00000010H
    
    ; 40   :        double *pB = b[voiceIndex];
    
        mov ecx, 128                ; 00000080H
        movsd   xmm6, QWORD PTR [edx+48]
        lea eax, DWORD PTR [edx+2136]
        mulsd   xmm6, QWORD PTR [edx+32]
    
    ; 41   :        double *pC = c[voiceIndex];
    ; 42   :        double phase = mPhaseOptimized;
    ; 43   :        double bp0 = mNoteFrequency * mHostPitch;
    ; 44   : 
    ; 45   :        __m128d v_boundLower = _mm_set1_pd(0.0);
    ; 46   :        __m128d v_boundUpper = _mm_set1_pd(PI);
    ; 47   :        __m128d v_radiansPerSampleBp0 = _mm_set1_pd(mRadiansPerSample * bp0);
    
        movsd   xmm7, QWORD PTR [edx+64]
    
    ; 54   : 
    ; 55   :        __m128d v_pB1 = _mm_setr_pd(0.0, pB[0]);
    
        ; The _mm_load_pd calls in the source are emitted as unaligned movups loads here.
        movsd   xmm0, QWORD PTR [edx+80]
        movsd   xmm5, QWORD PTR [edx+16]
        movups  xmm4, XMMWORD PTR [edx+80]
        movups  xmm1, XMMWORD PTR [edx+2128]
        mulsd   xmm6, xmm7
        ; xmm3 = {0.0, pB[0]}
        unpcklpd xmm3, xmm0
    
    ; 57   :        __m128d v_pC1 = _mm_setr_pd(0.0, pC[0]);
    
        movsd   xmm0, QWORD PTR [edx+2128]
        unpcklpd xmm7, xmm7
        unpcklpd xmm6, xmm6
        ; xmm2 = {0.0, pC[0]}
        unpcklpd xmm2, xmm0
        ; xmm0 is done as a scratch register; from here on it is the 0.0 lower bound
        xorps   xmm0, xmm0
    
    ; 48   :        __m128d v_radiansPerSample = _mm_set1_pd(mRadiansPerSample);
    ; 49   : 
    ; 50   :        __m128d v_pB0 = _mm_load_pd(pB);
    ; 51   :        v_pB0 = _mm_mul_pd(v_pB0, v_radiansPerSampleBp0);
    
        mulpd   xmm4, xmm6
    
    ; 52   :        __m128d v_pC0 = _mm_load_pd(pC);
    ; 53   :        v_pC0 = _mm_mul_pd(v_pC0, v_radiansPerSample);
    
        mulpd   xmm1, xmm7
    
    ; 56   :        v_pB1 = _mm_mul_pd(v_pB1, v_radiansPerSampleBp0);
    
        mulpd   xmm3, xmm6
    
    ; 58   :        v_pC1 = _mm_mul_pd(v_pC1, v_radiansPerSample);
    
        mulpd   xmm2, xmm7
    
    ; 59   : 
    ; 60   :        __m128d v_phaseAcc1;
    ; 61   :        __m128d v_phaseAcc2;
    ; 62   :        __m128d v_phase = _mm_set1_pd(phase);
    
        unpcklpd xmm5, xmm5
        npad    13
    $LL9@ProcessOpt:
    
    ; 63   : 
    ; 64   :        for (int sampleIndex = 0; sampleIndex < blockSize; sampleIndex += 2, pB += 2, pC += 2) {
    ; 65   :            // some other code (that will use phase, like sin(phase))
    ; 66   : 
    ; 67   :            v_phaseAcc1 = _mm_add_pd(v_pB0, v_pC0);
    
        addpd   xmm1, xmm4
    
    ; 68   :            v_phaseAcc1 = _mm_max_pd(v_phaseAcc1, v_boundLower);
    ; 69   :            v_phaseAcc1 = _mm_min_pd(v_phaseAcc1, v_boundUpper);
    ; 70   :            v_phaseAcc2 = _mm_add_pd(v_pB1, v_pC1);
    ; 71   :            v_phaseAcc2 = _mm_max_pd(v_phaseAcc2, v_boundLower);
    ; 72   :            v_phaseAcc2 = _mm_min_pd(v_phaseAcc2, v_boundUpper);
    ; 73   :            v_phase = _mm_add_pd(v_phase, v_phaseAcc1);
    ; 74   :            v_phase = _mm_add_pd(v_phase, v_phaseAcc2);
    ; 75   : 
    ; 76   :            v_pB0 = _mm_load_pd(pB + 2);
    
        ; The reloads for the next trip are interleaved with this trip's clamp and adds.
        ; eax-2040 is the pB pair two elements ahead; eax-2048 is the pair one ahead.
        movups  xmm4, XMMWORD PTR [eax-2040]
        addpd   xmm2, xmm3
    
    ; 77   :            v_pB0 = _mm_mul_pd(v_pB0, v_radiansPerSampleBp0);
    ; 78   :            v_pC0 = _mm_load_pd(pC + 2);
    ; 79   :            v_pC0 = _mm_mul_pd(v_pC0, v_radiansPerSample);
    ; 80   : 
    ; 81   :            v_pB1 = _mm_loadu_pd(pB + 1);
    
        movups  xmm3, XMMWORD PTR [eax-2048]
        ; clamp both partial sums to [0.0, PI] per lane
        maxpd   xmm1, xmm0
        maxpd   xmm2, xmm0
        minpd   xmm1, XMMWORD PTR __xmm@400921fb54442d18400921fb54442d18
        minpd   xmm2, XMMWORD PTR __xmm@400921fb54442d18400921fb54442d18
        addpd   xmm5, xmm1
        movups  xmm1, XMMWORD PTR [eax+8]
        addpd   xmm5, xmm2
    
    ; 82   :            v_pB1 = _mm_mul_pd(v_pB1, v_radiansPerSampleBp0);
    ; 83   :            v_pC1 = _mm_loadu_pd(pC + 1);
    
        movups  xmm2, XMMWORD PTR [eax]
        add eax, 16                 ; 00000010H
        ; the accumulator is spilled to the aligned stack slot on every trip
        movaps  XMMWORD PTR _v_phase$31[ebp], xmm5
        mulpd   xmm4, xmm6
        mulpd   xmm1, xmm7
        mulpd   xmm3, xmm6
    
    ; 84   :            v_pC1 = _mm_mul_pd(v_pC1, v_radiansPerSample);
    
        mulpd   xmm2, xmm7
        sub ecx, 1
        jne SHORT $LL9@ProcessOpt
    
    ; 85   :        }
    ; 86   : 
    ; 87   :        mPhaseOptimized = v_phase.m128d_f64[blockSize % 2 == 0 ? 1 : 0];
    
        ; reads the upper lane (offset +8 in the slot); no test on blockSize is emitted
        movsd   xmm0, QWORD PTR _v_phase$31[ebp+8]
        movsd   QWORD PTR [edx+16], xmm0
    
    ; 123  :        for (int voiceIndex = 0; voiceIndex < voiceSize; voiceIndex++) {
    ; 124  :            mParam1.ProcessOptimized(voiceIndex, blockSize);
    ; 125  :        }
    ; 126  :    }
    
        ; unwind the aligned frame first, then the original ebx-based frame
        mov esp, ebp
        pop ebp
        mov esp, ebx
        pop ebx
        ret 4
    ?ProcessOptimized@MyPlugin@@QAEXH@Z ENDP        ; MyPlugin::ProcessOptimized
    _TEXT   ENDS
    ; Function compile flags: /Ogtp
    ; File c:\repos\analysis_vectorization\analysis_vectorization\main.cpp
    ; File c:\program files (x86)\microsoft visual studio\2017\community\vc\tools\msvc\14.15.26726\include\algorithm
    ; File c:\program files (x86)\microsoft visual studio\2017\community\vc\tools\msvc\14.15.26726\include\xstddef
    ; File c:\program files (x86)\microsoft visual studio\2017\community\vc\tools\msvc\14.15.26726\include\algorithm
    ; File c:\repos\analysis_vectorization\analysis_vectorization\main.cpp
    ;   COMDAT _main
    _TEXT   SEGMENT
    _counterProcessing$1$ = -4304               ; size = 4
    _counterProcessing$ = -4304             ; size = 8
    _bp0$1$ = -4296                     ; size = 8
    _v_radiansPerSample$1$ = -4288              ; size = 16
    $T3 = -4264                     ; size = 8
    _v_phase$38 = -4256                 ; size = 16
    $T4 = -4256                     ; size = 8
    $T2 = -4232                     ; size = 8
    tv1040 = -4224                      ; size = 16
    tv1039 = -4208                      ; size = 16
    _myPlugin$ = -4192                  ; size = 4176
    __$ArrayPad$ = -4                   ; size = 4
    _main   PROC                        ; COMDAT
    
    ; 129  : int main() {
    
        push    ebp
        mov ebp, esp
        and esp, -16                ; fffffff0H
        mov eax, 4312               ; 000010d8H
        call    __chkstk
        mov eax, DWORD PTR ___security_cookie
        xor eax, esp
        mov DWORD PTR __$ArrayPad$[esp+4312], eax
    
    ; 16   :    alignas(16) double mNoteFrequency = 10.0;
    
        movsd   xmm0, QWORD PTR __real@4024000000000000
    
    ; 99   :            for (int sampleIndex = 0; sampleIndex < bufferSize; sampleIndex++) {
    
        lea ecx, DWORD PTR _myPlugin$[esp+4392]
        movsd   xmm1, QWORD PTR __real@406fe00000000000
        xorps   xmm2, xmm2
    
    ; 16   :    alignas(16) double mNoteFrequency = 10.0;
    
        movsd   QWORD PTR _myPlugin$[esp+4344], xmm0
    
    ; 99   :            for (int sampleIndex = 0; sampleIndex < bufferSize; sampleIndex++) {
    
        xor eax, eax
    
    ; 17   :    alignas(16) double mHostPitch = 1.0;
    
        movsd   xmm0, QWORD PTR __real@3ff0000000000000
    
    ; 129  : int main() {
    
        push    esi
        push    edi
    
    ; 14   :    alignas(16) double mPhase = 0.0;
    
        movsd   QWORD PTR _myPlugin$[esp+4320], xmm2
    
    ; 15   :    alignas(16) double mPhaseOptimized = 0.0;
    
        movsd   QWORD PTR _myPlugin$[esp+4336], xmm2
    
    ; 17   :    alignas(16) double mHostPitch = 1.0;
    
        movsd   QWORD PTR _myPlugin$[esp+4368], xmm0
    
    ; 18   :    alignas(16) double mRadiansPerSample = 1.0;
    
        movsd   QWORD PTR _myPlugin$[esp+4384], xmm0
    $LL11@main:
        movd    xmm0, eax
    
    ; 99   :            for (int sampleIndex = 0; sampleIndex < bufferSize; sampleIndex++) {
    
        lea ecx, DWORD PTR [ecx+8]
    
    ; 100  :                double value = (sampleIndex / ((double)bufferSize - 1));
    
        cvtdq2pd xmm0, xmm0
        inc eax
        divsd   xmm0, xmm1
    
    ; 101  : 
    ; 102  :                mParam1.b[voiceIndex][sampleIndex] = value;
    
        movsd   QWORD PTR [ecx-8], xmm0
        cmp eax, 256                ; 00000100H
        jl  SHORT $LL11@main
    
    ; 34   :            phase += std::clamp(mRadiansPerSample * (bp0 * pB[sampleIndex] + pC[sampleIndex]), 0.0, PI);
    
        movsd   xmm6, QWORD PTR __real@400921fb54442d18
    
    ; 108  :            for (int sampleIndex = 0; sampleIndex < bufferSize; sampleIndex++) {
    
        lea edi, DWORD PTR _myPlugin$[esp+6448]
        mov ecx, 512                ; 00000200H
    
    ; 34   :            phase += std::clamp(mRadiansPerSample * (bp0 * pB[sampleIndex] + pC[sampleIndex]), 0.0, PI);
    
        movsd   QWORD PTR $T2[esp+4320], xmm6
    
    ; 108  :            for (int sampleIndex = 0; sampleIndex < bufferSize; sampleIndex++) {
    
        xor eax, eax
    
    ; 34   :            phase += std::clamp(mRadiansPerSample * (bp0 * pB[sampleIndex] + pC[sampleIndex]), 0.0, PI);
    
        movsd   QWORD PTR $T3[esp+4320], xmm2
    
    ; 108  :            for (int sampleIndex = 0; sampleIndex < bufferSize; sampleIndex++) {
    
        rep stosd
        movsd   xmm3, QWORD PTR _myPlugin$[esp+4352]
        xorps   xmm0, xmm0
        mulsd   xmm3, QWORD PTR _myPlugin$[esp+4368]
    
    ; 55   :        __m128d v_pB1 = _mm_setr_pd(0.0, pB[0]);
    
        movaps  xmm4, xmm2
        movsd   xmm1, QWORD PTR _myPlugin$[esp+4384]
    
    ; 56   :        v_pB1 = _mm_mul_pd(v_pB1, v_radiansPerSampleBp0);
    ; 57   :        __m128d v_pC1 = _mm_setr_pd(0.0, pC[0]);
    
        movsd   xmm5, QWORD PTR _myPlugin$[esp+4336]
    
    ; 130  :    MyPlugin myPlugin;
    ; 131  : 
    ; 132  :    long long numProcessing = 5;
    ; 133  :    long long counterProcessing = 0;
    
        movlpd  QWORD PTR _counterProcessing$[esp+4320], xmm0
    
    ; 55   :        __m128d v_pB1 = _mm_setr_pd(0.0, pB[0]);
    
        movsd   xmm0, QWORD PTR _myPlugin$[esp+4400]
        movaps  xmm7, xmm3
        mulsd   xmm7, QWORD PTR _myPlugin$[esp+4384]
    
    ; 56   :        v_pB1 = _mm_mul_pd(v_pB1, v_radiansPerSampleBp0);
    ; 57   :        __m128d v_pC1 = _mm_setr_pd(0.0, pC[0]);
    
        mov edi, DWORD PTR _counterProcessing$[esp+4324]
        mov esi, DWORD PTR _counterProcessing$[esp+4320]
        unpcklpd xmm4, xmm0
        movsd   xmm0, QWORD PTR _myPlugin$[esp+6448]
        movups  XMMWORD PTR tv1040[esp+4320], xmm4
        movaps  xmm4, xmm2
        unpcklpd xmm1, xmm1
        unpcklpd xmm4, xmm0
        movups  XMMWORD PTR tv1039[esp+4320], xmm4
        movsd   xmm4, QWORD PTR _myPlugin$[esp+4320]
        movsd   QWORD PTR _bp0$1$[esp+4320], xmm3
        unpcklpd xmm7, xmm7
        movaps  XMMWORD PTR _v_radiansPerSample$1$[esp+4320], xmm1
        npad    8
    $LL2@main:
    
    ; 134  : 
    ; 135  :    // I'll only process once block, just for analysis
    ; 136  :    while (counterProcessing++ < numProcessing) {
    
        add esi, 1
    
    ; 26   :        double *pB = b[voiceIndex];
    
        lea ecx, DWORD PTR _myPlugin$[esp+6448]
    
    ; 134  : 
    ; 135  :    // I'll only process once block, just for analysis
    ; 136  :    while (counterProcessing++ < numProcessing) {
    
        mov DWORD PTR _counterProcessing$1$[esp+4320], esi
    
    ; 26   :        double *pB = b[voiceIndex];
    
        mov edx, 256                ; 00000100H
    
    ; 134  : 
    ; 135  :    // I'll only process once block, just for analysis
    ; 136  :    while (counterProcessing++ < numProcessing) {
    
        adc edi, 0
        npad    10
    $LL29@main:
    
    ; 34   :            phase += std::clamp(mRadiansPerSample * (bp0 * pB[sampleIndex] + pC[sampleIndex]), 0.0, PI);
    
        movsd   xmm0, QWORD PTR [ecx-2048]
        mulsd   xmm0, xmm3
        addsd   xmm0, QWORD PTR [ecx]
        mulsd   xmm0, QWORD PTR _myPlugin$[esp+4384]
        comisd  xmm0, xmm6
        movsd   QWORD PTR $T4[esp+4320], xmm0
    ; File c:\program files (x86)\microsoft visual studio\2017\community\vc\tools\msvc\14.15.26726\include\algorithm
    
    ; 5669 :    return (_DEBUG_LT_PRED(_Pred, _Max_val, _Val)
    
        jbe SHORT $LN35@main
        movaps  xmm0, xmm6
        jmp SHORT $LN36@main
    $LN35@main:
    ; File c:\program files (x86)\microsoft visual studio\2017\community\vc\tools\msvc\14.15.26726\include\xstddef
    
    ; 287  :        return (static_cast<_Ty1&&>(_Left)
    
        comisd  xmm2, xmm0
    ; File c:\program files (x86)\microsoft visual studio\2017\community\vc\tools\msvc\14.15.26726\include\algorithm
    
    ; 5669 :    return (_DEBUG_LT_PRED(_Pred, _Max_val, _Val)
    
        lea eax, DWORD PTR $T3[esp+4320]
        lea esi, DWORD PTR $T4[esp+4320]
        cmovbe  eax, esi
        movsd   xmm0, QWORD PTR [eax]
    
    // ...
    

    (注:我删除了一些行,因为 StackOverflow 对帖子长度有限制。)

    两者完全不同。另外,我发现由 VS 生成的代码有点冗余:例如搜索字符串 phase += std::clamp(mRadiansPerSample * (bp0 * pB[sampleIndex] + pC[sampleIndex]), 0.0, PI); ,会发现它出现了很多次。

    0 回复  |  直到 6 年前
        1
  •  2
  •   Hadi Brais    6 年前

    您似乎没有使用相同的编译器标志。Visual Studio 的汇编转储(assembly dump)显示,每个函数都是使用 /Ogtp 标志进行优化的;而在指定 /Og /Ot /O2 时,对应的标志是 /Ogtpy 。如果我手动添加 /Oy 标志,得到的代码会略有不同,但仍然与 Visual Studio 生成的代码不一样。

    我意识到编译器的版本并不完全相同,但是 19.15.26726.0 和 19.15.26732.1 之间的差异非常小,可能只包含错误修复。我认为还有其他标志不同。您可以转到项目的属性页,在“所有选项”和“其他选项”窗格中找到所使用的全部编译器选项。在发布(Release)版本中,除了 /arch:SSE2 /Ot /O2 之外,默认还会启用其他选项。请注意,/arch:SSE2 是默认值(/arch:SSE2 is the default),因此不必显式指定它。另外,/O2 隐含了 /Ot(/O2 implies /Ot)。所以 /arch:SSE2 /Ot /O2 等价于 /O2 。

        2
  •  0
  •   Robert Houghton    6 年前

    一个目的地有多条路径。

    罗杰·奥尔(Roger Orr)对此做过一场很好的演讲: talk at an ACCU conference

    19.15.26726.0 19.15.26732.1

    很接近,但也许足以改变这一切?

    19.15.26726.0

    19.15.26732.1

    MSVC 尤其奇怪:使用 GCC 时,您可以先输出 asm,再把这份 asm 交给 GCC 处理,并得到相同的机器代码;而在 MSVC 中做不到这一点。所以,如果编译器版本完全相同,而您仍然得到不同的 asm,那将是一个有趣的实验。 this article 演示了如何在 Visual Studio 中并排运行两个不同版本的 MSVC。