ICS-E4020: Branch predication vs. prediction

C++ code

This is the performance-critical inner loop from a student's solution to task IS1.

Original version

for (int y0=0; y0<=ny-h; y0++) {
  for (int x0=0; x0<=nx-w; x0++) {
    y1 = y0 + h;
    x1 = x0 + w;
    vXc = S00[y1*snx + x1] - S00[y1*snx + x0]
        - S00[y0*snx + x1] + S00[y0*snx + x0];
    vYc = vPc - vXc;
    hXY4 = vXc * vXc * divX + vYc * vYc * divY;
    hXY = hXY4[0]+hXY4[1]+hXY4[2];
    if (hXY > max_hXY) {
      max_hXY = hXY;
      tx0 = x0;
      ty0 = y0;
      tx1 = x1;
      ty1 = y1;
    }
  }
}

Modified version 1

for (int y0=0; y0<=ny-h; y0++) {
  for (int x0=0; x0<=nx-w; x0++) {
    y1 = y0 + h;
    x1 = x0 + w;
    vXc = S00[y1*snx + x1] - S00[y1*snx + x0]
        - S00[y0*snx + x1] + S00[y0*snx + x0];
    vYc = vPc - vXc;
    hXY4 = vXc * vXc * divX + vYc * vYc * divY;
    hXY = hXY4[0]+hXY4[1]+hXY4[2];
    if (hXY > max_hXY) {
      max_hXY = hXY;
      tx0 = x0;
      ty0 = y0;
      tx1 = x1;
      ty1 = y1;
      dummy();
    }
  }
}

Here dummy() is an empty function defined in another compilation unit so that GCC does not know it is a no-op.

Modified version 2

for (int y0=0; y0<=ny-h; y0++) {
  for (int x0=0; x0<=nx-w; x0++) {
    y1 = y0 + h;
    x1 = x0 + w;
    vXc = S00[y1*snx + x1] - S00[y1*snx + x0]
        - S00[y0*snx + x1] + S00[y0*snx + x0];
    vYc = vPc - vXc;
    hXY4 = vXc * vXc * divX + vYc * vYc * divY;
    hXY = hXY4[0]+hXY4[1]+hXY4[2];
    if (hXY > max_hXY) {
      max_hXY = hXY;
      tx0 = x0;
      ty0 = y0;
      tx1 = x1;
      ty1 = y1;
      asm ("#dummy");
    }
  }
}

Here we add some inline assembly code that just consists of a comment.

Performance

Here are timings on my Haswell laptop, using just one thread, no OpenMP, for nx = ny = 250:

The version with a dummy function call is much faster. The explanation is in the machine code generated by GCC.

Machine code

These examples are from a Haswell platform, using GCC 5.1.0.

Original version

GCC has chosen to use branch predication when it generates code for the if statement. There are always 27 instructions in the inner loop. The part related to the if statement is highlighted — these instructions do useful work only if the condition is true (i.e., we have discovered a new maximum). This is a very rare event in practice.

L1:
    vmovapd (%rcx,%rsi), %ymm0
    leal    (%r10,%rax), %edi
    vsubpd  (%rcx), %ymm0, %ymm0
    vsubpd  (%rdx,%rsi), %ymm0, %ymm0
    vaddpd  (%rdx), %ymm0, %ymm0
    vsubpd  %ymm0, %ymm7, %ymm2
    vmulpd  %ymm0, %ymm0, %ymm0
    vmovapd %ymm0, %ymm1
    vmulpd  %ymm2, %ymm2, %ymm2
    vmulpd  %ymm5, %ymm2, %ymm2
    vfmadd132pd %ymm6, %ymm2, %ymm1
    vhaddpd %xmm1, %xmm1, %xmm0
    vextractf128 $0x1, %ymm1, %xmm1
    vaddsd  %xmm1, %xmm0, %xmm0
    vucomisd %xmm3, %xmm0
    vmaxsd  %xmm4, %xmm0, %xmm4
    cmova   %eax, %ebx
    cmova   %r8d, %r12d
    vucomisd %xmm3, %xmm0
    vmovapd %xmm4, %xmm3
    cmova   %edi, %r15d
    cmova   %r9d, %r13d
    addq    $1, %rax
    addq    $32, %rcx
    addq    $32, %rdx
    cmpl    %eax, %r14d
    jge L1

Modified version 1

Now that we have a subroutine call in the if statement, branch predication can no longer be used. GCC resorts to a fairly straightforward code: there is a conditional jump that skips the part related to the if block whenever the condition is false. Now there are a lot more instructions. However, whenever the condition is false, we remain within a block that contains just 20 instructions.

    jmp L2

L1:
    addq    $1, %rbx
    addq    $32, %r12
    addq    $32, %r15
    cmpl    %ebx, %esi
    jl  L3

L2:
    vmovapd (%r12,%r14), %ymm0
    leal    0(%r13,%rbx), %edx
    vsubpd  (%r12), %ymm0, %ymm0
    vsubpd  (%r15,%r14), %ymm0, %ymm0
    vaddpd  (%r15), %ymm0, %ymm0
    vsubpd  %ymm0, %ymm5, %ymm1
    vmulpd  %ymm0, %ymm0, %ymm0
    vmulpd  %ymm1, %ymm1, %ymm1
    vmulpd  %ymm3, %ymm1, %ymm1
    vfmadd132pd %ymm4, %ymm1, %ymm0
    vhaddpd %xmm0, %xmm0, %xmm1
    vextractf128 $0x1, %ymm0, %xmm0
    vaddsd  %xmm0, %xmm1, %xmm0
    vucomisd %xmm2, %xmm0
    jbe L1

    movl    %esi, -208(%rbp)
    vmovapd %ymm3, -176(%rbp)
    vmovapd %ymm4, -144(%rbp)
    movl    %edx, -60(%rbp)
    vmovapd %ymm5, -112(%rbp)
    vmovsd  %xmm0, -56(%rbp)
    vzeroupper
    addq    $32, %r12
    addq    $32, %r15
    call    dummy()
    movl    -60(%rbp), %edx
    movl    %ebx, %r8d
    addq    $1, %rbx
    movl    -208(%rbp), %esi
    vmovsd  -56(%rbp), %xmm0
    movl    -76(%rbp), %edi
    movl    %edx, %r10d
    movl    -64(%rbp), %r9d
    vmovapd %xmm0, %xmm2
    vmovapd -176(%rbp), %ymm3
    vmovapd -144(%rbp), %ymm4
    vmovapd -112(%rbp), %ymm5
    cmpl    %ebx, %esi
    jge L2

L3:

Modified version 2

Here the dummy inline assembly code prevents GCC from using predication. Again, whenever the condition is false there are just 20 instructions. Furthermore, all overhead related to the dummy function call is eliminated, too, so the code is not that expensive even if we take the branch.

L24:
    vmovapd (%rcx,%rsi), %ymm0
    leal    (%r9,%rax), %edi
    vsubpd  (%rcx), %ymm0, %ymm0
    vsubpd  (%rdx,%rsi), %ymm0, %ymm0
    vaddpd  (%rdx), %ymm0, %ymm0
    vsubpd  %ymm0, %ymm5, %ymm1
    vmulpd  %ymm0, %ymm0, %ymm0
    vmulpd  %ymm1, %ymm1, %ymm1
    vmulpd  %ymm3, %ymm1, %ymm1
    vfmadd132pd %ymm4, %ymm1, %ymm0
    vhaddpd %xmm0, %xmm0, %xmm1
    vextractf128    $0x1, %ymm0, %xmm0
    vaddsd  %xmm0, %xmm1, %xmm0
    vucomisd    %xmm2, %xmm0
    jbe L22

    #dummy
    movl    %r11d, %r15d
    movl    %edi, %r13d
    movl    %r10d, %r12d
    movl    %eax, %ebx
    vmovapd %xmm0, %xmm2

L22:
    addq    $1, %rax
    addq    $32, %rcx
    addq    $32, %rdx
    cmpl    %eax, %r8d
    jge L24