Why does -O2 seem to optimize this loop better than -O3?



  • Why is this function compiled so differently at -O2 and at -O3?

    // Element-wise sum of two float arrays:
    //   results[i] = dataA[i] + dataB[i]  for every i in [0, count).
    // Does nothing when count <= 0 (the loop condition fails immediately).
    // None of the pointers is restrict-qualified, so nothing in this signature
    // tells the compiler that results does not overlap dataA or dataB.
    void add(int count, float* results, const float* dataA, const float* dataB) {
      for (int i = 0; i < count; ++i)
        results[i] = dataA[i] + dataB[i];
    }
    

    At -O2 it is optimized into smaller code:

    # add(count, results, dataA, dataB) -- compiler output at -O2 (Intel syntax).
    # Register use as seen in this listing (matches the System V AMD64 argument order):
    #   edi = count, rsi = results, rdx = dataA, rcx = dataB, rax = loop index i.
    # A single scalar loop: one float is loaded, added and stored per iteration.
    add(int, float*, float const*, float const*):
            # count <= 0: skip the loop entirely.
            test    edi, edi
            jle     .L1
            # Sign-extend count to 64 bits so it can be compared with the 64-bit index in rax.
            movsx   rdi, edi
            # i = 0
            xor     eax, eax
    .L3:
            # results[i] = dataA[i] + dataB[i]  (rax*4 = byte offset of float i)
            movss   xmm0, DWORD PTR [rdx+rax*4]
            addss   xmm0, DWORD PTR [rcx+rax*4]
            movss   DWORD PTR [rsi+rax*4], xmm0
            # ++i; repeat until i == count
            add     rax, 1
            cmp     rdi, rax
            jne     .L3
    .L1:
            ret
    

    Compare this with the output at -O3:

    # add(count, results, dataA, dataB) -- compiler output at -O3 (Intel syntax).
    # Incoming registers as seen in this listing (System V AMD64 argument order):
    #   edi = count, rsi = results, rdx = dataA, rcx = dataB.
    # After the first three moves: r8d = count, rdi = dataA, rdx = dataB, rsi = results.
    # Structure:
    #   1. run-time checks on count and on the distance between results and each input;
    #   2. .L5: packed loop, 4 floats (16 bytes) per iteration;
    #   3. .L4: one 2-float step for a remainder of 2 or 3;
    #   4. .L7: one scalar step for a final odd element;
    #   5. .L3/.L9: plain scalar loop used when the checks in step 1 fail.
    add(int, float*, float const*, float const*):
            # Shuffle the arguments; the 32-bit write to r8d also zero-extends count into r8.
            mov     r8d, edi
            mov     rdi, rdx
            mov     rdx, rcx
            # count <= 0: nothing to do.
            test    r8d, r8d
            jle     .L1
            # cl = ((results - (dataB + 4 bytes)) > 8), compared as unsigned.
            # NOTE(review): this looks like the compiler's run-time overlap check between
            # the output array and dataB -- confirm against the GCC vectorizer documentation.
            lea     rcx, [rcx+4]
            mov     rax, rsi
            sub     rax, rcx
            cmp     rax, 8
            seta    cl
            # al = (count != 1)
            cmp     r8d, 1
            setne   al
            # If either condition is false, use the scalar loop at .L3.
            test    cl, al
            je      .L3
            # Same distance test for dataA: (results - (dataA + 4 bytes)) <= 8 -> scalar loop.
            lea     rcx, [rdi+4]
            mov     rax, rsi
            sub     rax, rcx
            cmp     rax, 8
            jbe     .L3
            # r9d = count. If count - 1 <= 2 (count is 2 or 3 here), skip the 4-wide loop.
            lea     eax, [r8-1]
            mov     r9d, r8d
            cmp     eax, 2
            jbe     .L11
            # rcx = (count / 4) * 16 = number of bytes the 4-wide loop will process;
            # rax = running byte offset, starting at 0.
            mov     ecx, r8d
            xor     eax, eax
            shr     ecx, 2
            sal     rcx, 4
    .L5:
            # Four floats at once: results[..] = dataA[..] + dataB[..] (unaligned loads/stores).
            movups  xmm0, XMMWORD PTR [rdi+rax]
            movups  xmm2, XMMWORD PTR [rdx+rax]
            addps   xmm0, xmm2
            movups  XMMWORD PTR [rsi+rax], xmm0
            add     rax, 16
            cmp     rax, rcx
            jne     .L5
            # count is a multiple of 4: everything has been processed.
            test    r8b, 3
            je      .L1
            # ecx = eax = count rounded down to a multiple of 4 (index of the first
            # unprocessed element); r9d = number of elements still to do (1..3).
            mov     ecx, r8d
            mov     r9d, r8d
            and     ecx, -4
            sub     r9d, ecx
            mov     eax, ecx
            # Exactly one element left: go straight to the scalar step.
            cmp     r9d, 1
            je      .L7
    .L4:
            # Two floats at once, starting at element index rcx (8-byte load/add/store).
            movq    xmm0, QWORD PTR [rdi+rcx*4]
            movq    xmm1, QWORD PTR [rdx+rcx*4]
            addps   xmm0, xmm1
            movlps  QWORD PTR [rsi+rcx*4], xmm0
            # Remainder was even: done.
            test    r9b, 1
            je      .L1
            # Otherwise advance the index past the pair just handled (r9d & -2).
            and     r9d, -2
            add     eax, r9d
    .L7:
            # Final single element at index eax (sign-extended to rax for addressing).
            cdqe
            movss   xmm0, DWORD PTR [rdi+rax*4]
            addss   xmm0, DWORD PTR [rdx+rax*4]
            movss   DWORD PTR [rsi+rax*4], xmm0
            ret
    .L3:
            # Fallback: scalar loop, one float per iteration, i = 0 .. count-1.
            xor     eax, eax
    .L9:
            movss   xmm0, DWORD PTR [rdi+rax*4]
            addss   xmm0, DWORD PTR [rdx+rax*4]
            movss   DWORD PTR [rsi+rax*4], xmm0
            add     rax, 1
            # r8 holds count zero-extended (set by "mov r8d, edi" at entry).
            cmp     r8, rax
            jne     .L9
    .L1:
            ret
    .L11:
            # Small count (2 or 3): no 4-wide iterations ran, so start the
            # 2-float step at element 0 with r9d still equal to count.
            xor     ecx, ecx
            xor     eax, eax
            jmp     .L4
    

    https://godbolt.org/z/W1jjrPE6a



  • At -O3 the code uses packed SIMD vector instructions (addps on four floats at a time) in addition to the usual scalar code. The longer code first checks whether the parallel calculation can be applied and, if it can, the loop runs several times faster; otherwise it falls back to the ordinary one-element-at-a-time loop. The higher "cost" on entry to the loop is not comparable to the benefit of the parallel calculation, which far outweighs it.



Suggested Topics

  • 2
  • 2
  • 2
  • 2
  • 2
  • 2
  • 2
  • 2
  • 2
  • 2
  • 2
  • 2
  • 2
  • 2
  • 2