Today we go back to basics and see how Burst compiles some fundamental language features: switch statements and ref parameters… with surprising results!

Update: A Russian translation of this article is available.

Simple Switch

Let’s start off by writing a job with a simple switch statement:

// Baseline job: selects an output value from InVal using a switch whose
// case labels are plain integer constants.
[BurstCompile]
struct PlainSwitchJob : IJob
{
    // Value the switch selects on.
    [ReadOnly] public int InVal;
    // Destination array; Execute writes only element 0.
    [WriteOnly] public NativeArray<int> OutVal;
 
    // Maps InVal 10 -> 20, 20 -> 40, 30 -> 50, 40 -> 60 and anything
    // else -> 100, then stores the result in OutVal[0].
    public void Execute()
    {
        int outVal;
        switch (InVal)
        {
            case 10: outVal = 20; break;
            case 20: outVal = 40; break;
            case 30: outVal = 50; break;
            case 40: outVal = 60; break;
            default: outVal = 100; break;
        }
        // Single store after the switch; every branch above assigns outVal.
        OutVal[0] = outVal;
    }
}

Now let’s look at the Burst Inspector with Unity 2019.1.10f1 and Burst 1.1.1 to see how this job was compiled for 64-bit macOS:

    ; r10d = InVal, read from the start of the data rdi points at.
    mov     r10d, dword ptr [rdi]
    ; Compare InVal with 20 once. The mov instructions that follow do not
    ; modify the flags, so every cmovg/cmovle before the next cmp tests
    ; this same comparison.
    cmp     r10d, 20
    mov     ecx, 40
    mov     r8d, 20
    mov     esi, 20
    ; esi = first case label to test: 40 if InVal > 20, otherwise 20.
    cmovg   esi, ecx
    mov     edx, 60
    ; edx = result paired with that label: 40 if InVal <= 20, otherwise 60.
    cmovle  edx, ecx
    mov     r9d, 30
    mov     ecx, 10
    ; ecx = second case label to test: 30 if InVal > 20, otherwise 10.
    cmovg   ecx, r9d
    mov     eax, 50
    ; eax = result paired with that label: 20 if InVal <= 20, otherwise 50.
    cmovle  eax, r8d
    ; Does InVal equal the first candidate label?
    cmp     r10d, esi
    ; Start from the default result (100); the mov leaves the flags intact.
    mov     esi, 100
    cmove   esi, edx
    ; Does InVal equal the second candidate label?
    cmp     r10d, ecx
    cmove   esi, eax
    ; Load the pointer stored 8 bytes into the job data (presumably the
    ; OutVal buffer) and store the selected result: OutVal[0] = outVal.
    mov     rax, qword ptr [rdi + 8]
    mov     dword ptr [rax], esi
    ret

Here we see a series of comparisons (cmp) with InVal (r10d) and conditional move instructions depending on whether the result was greater (cmovg), less than or equal (cmovle), or equal (cmove). This is essentially a series of if and else, except that there are no jump instructions.

Switch with When

Now let’s use a C# 7 feature and replace the simple case labels with when clauses:

// Variant of the plain-switch job: the same input-to-output mapping, but each
// case is a type pattern with a discard (`int _`) guarded by a `when` clause.
// NOTE(review): the struct name says "Where" although the C# feature used is
// `when`; confirm whether the name is intentional.
[BurstCompile]
struct WhereSwitchJob : IJob
{
    // Value tested by each `when` guard.
    [ReadOnly] public int InVal;
    // Destination array; Execute writes only element 0.
    [WriteOnly] public NativeArray<int> OutVal;
 
    // Maps InVal 10 -> 20, 20 -> 40, 30 -> 50, 40 -> 60 and anything
    // else -> 100, then stores the result in OutVal[0].
    public void Execute()
    {
        int outVal;
        switch (InVal)
        {
            // The pattern `int _` matches any int; the guard does the real test.
            case int _ when InVal == 10: outVal = 20; break;
            case int _ when InVal == 20: outVal = 40; break;
            case int _ when InVal == 30: outVal = 50; break;
            case int _ when InVal == 40: outVal = 60; break;
            default: outVal = 100; break;
        }
        OutVal[0] = outVal;
    }
}

Note that this code produces the exact same results. Will we see identical assembly output from Burst? Let’s check:

    ; ecx = InVal, read from the start of the data rdi points at.
    mov     ecx, dword ptr [rdi]
    ; Rebase so the smallest case label (10) becomes index 0.
    add     ecx, -10
    ; Unsigned range check: any index above 30 (InVal < 10 wraps to a huge
    ; unsigned value, InVal > 40 is simply too large) takes the default path.
    cmp     ecx, 30
    ja      .LBB0_5
    ; Preload 20, the result for InVal == 10.
    mov     eax, 20
    ; rdx = base of the jump table (the table contents are not shown here).
    movabs  rdx, offset .LJTI0_0
    ; Load the 32-bit signed entry for this index, add the table base to get
    ; the target address, and jump to it.
    movsxd  rcx, dword ptr [rdx + 4*rcx]
    add     rcx, rdx
    jmp     rcx
.LBB0_2:
    ; Result 40 (the InVal == 20 case); falls through into the shared store.
    mov     eax, 40
.LBB0_6:
    ; Store eax through the pointer at [rdi + 8] (presumably the OutVal
    ; buffer). Presumably also the jump target for InVal == 10, using the
    ; preloaded 20.
    mov     rcx, qword ptr [rdi + 8]
    mov     dword ptr [rcx], eax
    ret
.LBB0_5:
    ; Default: result 100, with its own copy of the store.
    mov     eax, 100
    mov     rcx, qword ptr [rdi + 8]
    mov     dword ptr [rcx], eax
    ret
.LBB0_3:
    ; Result 50 (the InVal == 30 case), with its own copy of the store.
    mov     eax, 50
    mov     rcx, qword ptr [rdi + 8]
    mov     dword ptr [rcx], eax
    ret
.LBB0_4:
    ; Result 60 (the InVal == 40 case), with its own copy of the store.
    mov     eax, 60
    mov     rcx, qword ptr [rdi + 8]
    mov     dword ptr [rcx], eax
    ret

The results are definitely not the same! In this case, Burst generated a jump table so that it could perform only one comparison with InVal. Note that the OutVal[0] = outVal code has now been duplicated into most of the cases, slightly bloating the code.

Non-Ref Parameter

Now let’s have a look at how Burst deals with big parameters when they’re not passed as ref:

// Deliberately oversized struct: explicit layout puts its only field at byte
// offset 128, so the 4-byte int is preceded by 128 bytes of unused padding.
[StructLayout(LayoutKind.Explicit)]
struct BigStruct
{
    // The only meaningful data in the struct.
    [FieldOffset(128)] public int Val;
}
 
// Job that passes two BigStruct fields to Foo by value and writes the sum of
// the two results to Result[0].
[BurstCompile]
struct NonRefJob : IJob
{
    // First and second inputs handed to Foo.
    [ReadOnly] public BigStruct Val1;
    [ReadOnly] public BigStruct Val2;
    // Destination array; Execute writes only element 0.
    [WriteOnly] public NativeArray<int> Result;
 
    // Computes Foo(Val1) + Foo(Val2) and stores it in Result[0].
    public void Execute()
    {
        int f = Foo(Val1);
        f += Foo(Val2);
        Result[0] = f;
    }
 
    // Takes the BigStruct by value (no `ref`) and maps val.Val to a result:
    // 10 -> 20, 20 -> 40, 30 -> 50, 40 -> 60, 50 -> 70, 60 -> 80, 70 -> 90,
    // 80 -> 110, anything else -> 100.
    int Foo(BigStruct val)
    {
        int outVal;
        switch (val)
        {
            // Each case binds the struct to `b` and tests its Val in a `when` guard.
            case BigStruct b when b.Val == 10: outVal = 20; break;
            case BigStruct b when b.Val == 20: outVal = 40; break;
            case BigStruct b when b.Val == 30: outVal = 50; break;
            case BigStruct b when b.Val == 40: outVal = 60; break;
            case BigStruct b when b.Val == 50: outVal = 70; break;
            case BigStruct b when b.Val == 60: outVal = 80; break;
            case BigStruct b when b.Val == 70: outVal = 90; break;
            case BigStruct b when b.Val == 80: outVal = 110; break;
            default: outVal = 100; break;
        }
        return outVal;
    }
}

BigStruct is made big by placing Val after 128 bytes of padding. This makes it 132 bytes large, which is pretty big.

After that, we have two calls to pass a BigStruct to Foo as a non-ref parameter. The switch within Foo is an expanded version of what we just saw. By adding a significant instruction count to Foo and calling it twice with different parameters, we ensure that Burst won’t inline the calls and foil our test by removing parameters altogether.

Now let’s see what this compiles to:

; Execute
    ; Save the registers this function reuses (rbp, r14, rbx) and reserve
    ; 288 bytes of stack, used below for two BigStruct copies at [rsp] and
    ; [rsp + 144].
    push    rbp
    push    r14
    push    rbx
    sub     rsp, 288
    ; rbx = incoming rdi (the job data), kept across the calls.
    mov     rbx, rdi
    ; Load Val1: eight 16-byte chunks (128 bytes of padding) ...
    movups  xmm0, xmmword ptr [rbx]
    movups  xmm1, xmmword ptr [rbx + 16]
    movups  xmm2, xmmword ptr [rbx + 32]
    movups  xmm3, xmmword ptr [rbx + 48]
    movups  xmm4, xmmword ptr [rbx + 64]
    movups  xmm5, xmmword ptr [rbx + 80]
    movups  xmm6, xmmword ptr [rbx + 96]
    movups  xmm7, xmmword ptr [rbx + 112]
    ; ... plus the 4-byte Val field at offset 128.
    mov     eax, dword ptr [rbx + 128]
    ; Write the copy of Val1 to the stack at [rsp .. rsp + 131].
    movaps  xmmword ptr [rsp], xmm0
    movaps  xmmword ptr [rsp + 16], xmm1
    movaps  xmmword ptr [rsp + 32], xmm2
    movaps  xmmword ptr [rsp + 48], xmm3
    movaps  xmmword ptr [rsp + 64], xmm4
    movaps  xmmword ptr [rsp + 80], xmm5
    movaps  xmmword ptr [rsp + 96], xmm6
    movaps  xmmword ptr [rsp + 112], xmm7
    mov     dword ptr [rsp + 128], eax
    ; r14 = address of Foo, reused for both calls.
    movabs  r14, offset ".LNonRefJob.Foo(NonRefJob* this, BigStruct val)_D56DDBCB4218697B"
    ; First call: rdi = address of the stack copy of Val1.
    mov     rdi, rsp
    call    r14
    ; Keep the first result in ebp while the second call runs.
    mov     ebp, eax
    ; Load Val2, which starts 132 bytes into the job data.
    movups  xmm0, xmmword ptr [rbx + 132]
    movups  xmm1, xmmword ptr [rbx + 148]
    movups  xmm2, xmmword ptr [rbx + 164]
    movups  xmm3, xmmword ptr [rbx + 180]
    movups  xmm4, xmmword ptr [rbx + 196]
    movups  xmm5, xmmword ptr [rbx + 212]
    movups  xmm6, xmmword ptr [rbx + 228]
    movups  xmm7, xmmword ptr [rbx + 244]
    mov     eax, dword ptr [rbx + 260]
    ; Write the copy of Val2 to the stack at [rsp + 144 .. rsp + 275].
    movaps  xmmword ptr [rsp + 144], xmm0
    movaps  xmmword ptr [rsp + 160], xmm1
    movaps  xmmword ptr [rsp + 176], xmm2
    movaps  xmmword ptr [rsp + 192], xmm3
    movaps  xmmword ptr [rsp + 208], xmm4
    movaps  xmmword ptr [rsp + 224], xmm5
    movaps  xmmword ptr [rsp + 240], xmm6
    movaps  xmmword ptr [rsp + 256], xmm7
    mov     dword ptr [rsp + 272], eax
    ; Second call: rdi = address of the stack copy of Val2.
    lea     rdi, [rsp + 144]
    call    r14
    ; f = Foo(Val1) + Foo(Val2)
    add     eax, ebp
    ; Store f through the pointer at offset 264 of the job data (presumably
    ; the Result buffer, which follows the two 132-byte structs): Result[0] = f.
    mov     rcx, qword ptr [rbx + 264]
    mov     dword ptr [rcx], eax
    ; Release the stack space and restore the saved registers.
    add     rsp, 288
    pop     rbx
    pop     r14
    pop     rbp
    ret
 
; Foo
    ; ecx = val.Val, read 128 bytes past the struct address passed in rdi.
    mov     ecx, dword ptr [rdi + 128]
    ; Rebase so the smallest case label (10) becomes index 0.
    add     ecx, -10
    ; Unsigned range check: indices above 70 (Val < 10 or Val > 80) take the
    ; default path.
    cmp     ecx, 70
    ja      .LBB1_10
    ; Preload 20, the result for Val == 10.
    mov     eax, 20
    ; Jump-table dispatch: load the 32-bit signed entry for this index from
    ; .LJTI1_0 (contents not shown), add the table base, and jump.
    movabs  rdx, offset .LJTI1_0
    movsxd  rcx, dword ptr [rdx + 4*rcx]
    add     rcx, rdx
    jmp     rcx
.LBB1_2:
    ; Val == 20
    mov     eax, 40
    ret
.LBB1_10:
    ; Default result
    mov     eax, 100
    ret
.LBB1_3:
    ; Val == 30
    mov     eax, 50
    ret
.LBB1_4:
    ; Val == 40
    mov     eax, 60
    ret
.LBB1_5:
    ; Val == 50
    mov     eax, 70
    ret
.LBB1_6:
    ; Val == 60
    mov     eax, 80
    ret
.LBB1_7:
    ; Val == 70
    mov     eax, 90
    ret
.LBB1_8:
    ; Val == 80; falls through to the shared ret below.
    mov     eax, 110
.LBB1_9:
    ; Bare return; presumably also the jump target for Val == 10, which
    ; returns the preloaded 20.
    ret

The body of Foo is similar to before since it’s still using a jump table.

In Execute, we see two call instructions showing that Foo is being called twice and hasn’t been inlined. To make those calls, Execute pushes the whole BigStruct to the stack, padding and all.

Ref Parameter

Now let’s try passing BigStruct as a ref parameter:

// Job that passes two BigStruct fields to Foo by reference (`ref`) and writes
// the sum of the two results to Result[0].
[BurstCompile]
struct RefJob : IJob
{
    // First and second inputs handed to Foo.
    [ReadOnly] public BigStruct Val1;
    [ReadOnly] public BigStruct Val2;
    // Destination array; Execute writes only element 0.
    [WriteOnly] public NativeArray<int> Result;
 
    // Computes Foo(ref Val1) + Foo(ref Val2) and stores it in Result[0].
    public void Execute()
    {
        int f = Foo(ref Val1);
        f += Foo(ref Val2);
        Result[0] = f;
    }
 
    // Takes the BigStruct by reference and maps val.Val to a result:
    // 10 -> 20, 20 -> 40, 30 -> 50, 40 -> 60, 50 -> 70, 60 -> 80, 70 -> 90,
    // 80 -> 110, anything else -> 100. The parameter is only read, never
    // assigned.
    int Foo(ref BigStruct val)
    {
        int outVal;
        switch (val)
        {
            // Each case binds the struct to `b` and tests its Val in a `when` guard.
            case BigStruct b when b.Val == 10: outVal = 20; break;
            case BigStruct b when b.Val == 20: outVal = 40; break;
            case BigStruct b when b.Val == 30: outVal = 50; break;
            case BigStruct b when b.Val == 40: outVal = 60; break;
            case BigStruct b when b.Val == 50: outVal = 70; break;
            case BigStruct b when b.Val == 60: outVal = 80; break;
            case BigStruct b when b.Val == 70: outVal = 90; break;
            case BigStruct b when b.Val == 80: outVal = 110; break;
            default: outVal = 100; break;
        }
        return outVal;
    }
}

All we’ve added is the ref keyword, so let’s see how much that affected the Burst output:

; Execute
    ; Save the registers this function reuses. No stack space is reserved:
    ; there is no sub rsp and no struct copy in this version.
    push    rbp
    push    r14
    push    rbx
    ; rbx = incoming rdi (the job data), kept across the calls.
    mov     rbx, rdi
    ; r14 = address of Foo, reused for both calls.
    movabs  r14, offset ".LRefJob.Foo(RefJob* this, ref BigStruct val)_765ED9C01E5BA6A1"
    ; First call: rdi is left unchanged, so Foo receives the address of the
    ; job data itself, where Val1 sits at offset 0.
    call    r14
    ; Keep the first result in ebp while the second call runs.
    mov     ebp, eax
    ; Second call: rdi = address of Val2, 132 bytes into the job data.
    lea     rdi, [rbx + 132]
    call    r14
    ; f = Foo(ref Val1) + Foo(ref Val2)
    add     eax, ebp
    ; Store f through the pointer at offset 264 of the job data (presumably
    ; the Result buffer): Result[0] = f.
    mov     rcx, qword ptr [rbx + 264]
    mov     dword ptr [rcx], eax
    ; Restore the saved registers.
    pop     rbx
    pop     r14
    pop     rbp
    ret
 
; Foo
    ; ecx = val.Val, read 128 bytes past the struct address passed in rdi.
    mov     ecx, dword ptr [rdi + 128]
    ; Rebase so the smallest case label (10) becomes index 0.
    add     ecx, -10
    ; Unsigned range check: indices above 70 (Val < 10 or Val > 80) take the
    ; default path.
    cmp     ecx, 70
    ja      .LBB1_10
    ; Preload 20, the result for Val == 10.
    mov     eax, 20
    ; Jump-table dispatch: load the 32-bit signed entry for this index from
    ; .LJTI1_0 (contents not shown), add the table base, and jump.
    movabs  rdx, offset .LJTI1_0
    movsxd  rcx, dword ptr [rdx + 4*rcx]
    add     rcx, rdx
    jmp     rcx
.LBB1_2:
    ; Val == 20
    mov     eax, 40
    ret
.LBB1_10:
    ; Default result
    mov     eax, 100
    ret
.LBB1_3:
    ; Val == 30
    mov     eax, 50
    ret
.LBB1_4:
    ; Val == 40
    mov     eax, 60
    ret
.LBB1_5:
    ; Val == 50
    mov     eax, 70
    ret
.LBB1_6:
    ; Val == 60
    mov     eax, 80
    ret
.LBB1_7:
    ; Val == 70
    mov     eax, 90
    ret
.LBB1_8:
    ; Val == 80; falls through to the shared ret below.
    mov     eax, 110
.LBB1_9:
    ; Bare return; presumably also the jump target for Val == 10, which
    ; returns the preloaded 20.
    ret

Foo is identical to before, but Execute is now much shorter. Instead of copying every byte of the BigStruct parameter onto the stack, Execute now just passes Foo the address of the BigStruct that’s already present in the job.

Conclusion

When it comes to switch statements, Burst doesn’t figure out that plain case labels and when labels are equivalent, so it doesn’t generate the same code for them. It also doesn’t automatically add ref to large parameters even when it would be much more efficient. These learnings should reinforce the importance of looking at the Burst Inspector to ensure we’re getting the output we want the CPU to actually execute.