# Direct tail-call threading in Rust

2021-05-06

With Clang gaining [guaranteed tail calls][clang tce], Rust will hopefully catch up soon and finally get guaranteed tail-call elimination (TCE) as well. While the `become` keyword is not usable at the moment, I figured it's still worth looking into what is potentially the fastest implementation technique for bytecode interpreters. The benchmarks are performed on a Ryzen 2700X.

## A simple interpreter

To keep things simple, the interpreter only has to be able to execute this pseudocode:

```
let a = 0
let b = 1
let c = 100_000_000
loop:
    a += b
    if a != c goto loop
print a
```

## Implementation in C

While this blog post is about Rust, it is easier to start with an implementation in C, since C allows a lot of unsafe behaviour by default. It also allows using `clang-13`, which has the `musttail` attribute.

### Using a program counter

A naive implementation using a program counter may look like this:

```c
#include <stdio.h>

#define DEF_OP(_name_) static void _name_( \
    const instruction_t *restrict instructions, \
    int *restrict memory, \
    unsigned int pc \
)

#define NEXT_OP do { \
    __attribute__((musttail)) \
    return instructions[pc].handler(instructions, memory, pc); \
} while(0)

#define GET_OP(_instr_) \
    const instruction_t *_instr_ = &instructions[pc];

typedef struct instruction {
    void (*handler)(const struct instruction *restrict, int *restrict, unsigned int);
    union {
        struct {
            unsigned char c;
            unsigned char d;
        };
        unsigned int pc;
        int imm;
    };
    unsigned char a;
    unsigned char b;
} instruction_t;

DEF_OP(load) {
    GET_OP(i);
    pc++;
    memory[i->a] = i->imm;
    NEXT_OP;
}

DEF_OP(add) {
    GET_OP(i);
    pc++;
    memory[i->a] = memory[i->b] + memory[i->c];
    NEXT_OP;
}

DEF_OP(jmpnif) {
    GET_OP(i);
    if (memory[i->a] != memory[i->b]) {
        pc = i->pc;
    } else {
        pc++;
    }
    NEXT_OP;
}

DEF_OP(print) {
    GET_OP(i);
    pc++;
    printf("%d\n", memory[i->a]);
    NEXT_OP;
}

DEF_OP(ret) {
}

int main() {
    int memory[256] = {0};
    instruction_t instructions[] = {
        // Init loop
        [0] = { .handler = load, .a = 0, .imm = 0 },
        [1] = { .handler = load, .a = 1, .imm = 1 },
        [2] = { .handler = load, .a = 2, .imm = 100 * 1000 * 1000 },
        // Loop
        [3] = { .handler = add, .a = 0, .b = 0, .c = 1 },
        [4] = { .handler = jmpnif, .a = 0, .b = 2, .pc = 3 },
        // Finish
        [5] = { .handler = print, .a = 0 },
        [6] = { .handler = ret },
    };
    instructions[0].handler(instructions, memory, 0);
    return 0;
}
```
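Since `musttail` is only honoured by Clang 13 and newer, building and inspecting this example requires something along these lines (the file name is a placeholder matching the binary used in the `perf stat` runs below):

```
$ clang-13 -O2 -o with_program_counter with_program_counter.c
$ objdump -SC with_program_counter
```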
However, this does not generate efficient assembly: inspecting the `add` and `jmpnif` functions with `objdump -SC` shows this:

```
0000000000401210 <add>:
  401210: 89 d0                 mov    %edx,%eax
  401212: 83 c2 01              add    $0x1,%edx
  401215: 48 c1 e0 04           shl    $0x4,%rax
  401219: 44 0f b6 44 07 0d     movzbl 0xd(%rdi,%rax,1),%r8d
  40121f: 0f b6 4c 07 08        movzbl 0x8(%rdi,%rax,1),%ecx
  401224: 8b 0c 8e              mov    (%rsi,%rcx,4),%ecx
  401227: 42 03 0c 86           add    (%rsi,%r8,4),%ecx
  40122b: 0f b6 44 07 0c        movzbl 0xc(%rdi,%rax,1),%eax
  401230: 89 0c 86              mov    %ecx,(%rsi,%rax,4)
  401233: 48 89 d0              mov    %rdx,%rax
  401236: 48 c1 e0 04           shl    $0x4,%rax
  40123a: 48 8b 04 07           mov    (%rdi,%rax,1),%rax
  40123e: ff e0                 jmpq   *%rax

0000000000401240 <jmpnif>:
  401240: 89 d0                 mov    %edx,%eax
  401242: 48 c1 e0 04           shl    $0x4,%rax
  401246: 0f b6 4c 07 0c        movzbl 0xc(%rdi,%rax,1),%ecx
  40124b: 44 8b 04 8e           mov    (%rsi,%rcx,4),%r8d
  40124f: 0f b6 4c 07 0d        movzbl 0xd(%rdi,%rax,1),%ecx
  401254: 44 3b 04 8e           cmp    (%rsi,%rcx,4),%r8d
  401258: 75 0f                 jne    401269
  40125a: 83 c2 01              add    $0x1,%edx
  40125d: 89 d0                 mov    %edx,%eax
  40125f: 48 c1 e0 04           shl    $0x4,%rax
  401263: 48 8b 04 07           mov    (%rdi,%rax,1),%rax
  401267: ff e0                 jmpq   *%rax
  401269: 8b 54 07 08           mov    0x8(%rdi,%rax,1),%edx
  40126d: 89 d0                 mov    %edx,%eax
  40126f: 48 c1 e0 04           shl    $0x4,%rax
  401273: 48 8b 04 07           mov    (%rdi,%rax,1),%rax
  401277: ff e0                 jmpq   *%rax
  401279: 0f 1f 80 00 00 00 00  nopl   0x0(%rax)
```

That's an awful lot of work for what should be two relatively simple operations: every handler has to rebuild the address of the current instruction from the index (the repeated `shl $0x4`) before it can do anything, and then again to dispatch to the next handler.

### Using direct tail-call threading

An implementation in C using what I dub **direct tail-call threading** could look like this:

```c
#include <stdio.h>

#define DEF_OP(_name_) static void _name_( \
    const instruction_t *restrict instruction, \
    int *restrict memory \
)

#define NEXT_OP do { \
    __attribute__((musttail)) \
    return instruction->handler(instruction, memory); \
} while(0)

typedef struct instruction {
    void (*handler)(const struct instruction *restrict, int *restrict);
    union {
        unsigned char c;
        struct instruction *jmp_instruction;
        int imm;
    };
    unsigned char a;
    unsigned char b;
} instruction_t;

DEF_OP(load) {
    memory[instruction->a] = instruction->imm;
    instruction++;
    NEXT_OP;
}

DEF_OP(add) {
    memory[instruction->a] = memory[instruction->b] + memory[instruction->c];
    instruction++;
    NEXT_OP;
}

DEF_OP(jmpnif) {
    if (memory[instruction->a] != memory[instruction->b]) {
        instruction = instruction->jmp_instruction;
    } else {
        instruction++;
    }
    NEXT_OP;
}

DEF_OP(print) {
    printf("%d\n", memory[instruction->a]);
    instruction++;
    NEXT_OP;
}

DEF_OP(ret) {
}

int main() {
    int memory[256] = {0};
    instruction_t instructions[] = {
        // Init loop
        [0] = { .handler = load, .a = 0, .imm = 0 },
        [1] = { .handler = load, .a = 1, .imm = 1 },
        [2] = { .handler = load, .a = 2, .imm = 100 * 1000 * 1000 },
        // Loop
        [3] = { .handler = add, .a = 0, .b = 0, .c = 1 },
        [4] = { .handler = jmpnif, .a = 0, .b = 2, .jmp_instruction = &instructions[3] },
        // Finish
        [5] = { .handler = print, .a = 0 },
        [6] = { .handler = ret },
    };
    instructions[0].handler(instructions, memory);
    return 0;
}
```

The generated assembly is much more efficient (and looks ideal at first sight):

```
0000000000401270 <add>:
  401270: 0f b6 47 11           movzbl 0x11(%rdi),%eax
  401274: 0f b6 4f 08           movzbl 0x8(%rdi),%ecx
  401278: 8b 0c 8e              mov    (%rsi,%rcx,4),%ecx
  40127b: 03 0c 86              add    (%rsi,%rax,4),%ecx
  40127e: 0f b6 47 10           movzbl 0x10(%rdi),%eax
  401282: 89 0c 86              mov    %ecx,(%rsi,%rax,4)
  401285: 48 8b 47 18           mov    0x18(%rdi),%rax
  401289: 48 83 c7 18           add    $0x18,%rdi
  40128d: ff e0                 jmpq   *%rax
  40128f: 90                    nop

0000000000401290 <jmpnif>:
  401290: 0f b6 47 10           movzbl 0x10(%rdi),%eax
  401294: 8b 04 86              mov    (%rsi,%rax,4),%eax
  401297: 0f b6 4f 11           movzbl 0x11(%rdi),%ecx
  40129b: 3b 04 8e              cmp    (%rsi,%rcx,4),%eax
  40129e: 75 09                 jne    4012a9
  4012a0: 48 83 c7 18           add    $0x18,%rdi
  4012a4: 48 8b 07              mov    (%rdi),%rax
  4012a7: ff e0                 jmpq   *%rax
  4012a9: 48 8b 7f 08           mov    0x8(%rdi),%rdi
  4012ad: 48 8b 07              mov    (%rdi),%rax
  4012b0: ff e0                 jmpq   *%rax
  4012b2: 66 2e 0f 1f 84 00 00  nopw   %cs:0x0(%rax,%rax,1)
  4012b9: 00 00 00
  4012bc: 0f 1f 40 00           nopl   0x0(%rax)
```

The difference is clearly visible when running both versions with `perf stat`:

```
$ perf stat ./with_direct_threading 100000000

 Performance counter stats for './build/main':

            229.35 msec task-clock:u              #    0.991 CPUs utilized
                 0      context-switches:u        #    0.000 K/sec
                 0      cpu-migrations:u          #    0.000 K/sec
                50      page-faults:u             #    0.218 K/sec
       957,568,185      cycles:u                  #    4.175 GHz                     (82.76%)
           575,004      stalled-cycles-frontend:u #    0.06% frontend cycles idle    (84.37%)
       538,644,702      stalled-cycles-backend:u  #   56.25% backend cycles idle     (80.98%)
     1,691,695,590      instructions:u            #    1.77  insn per cycle
                                                  #    0.32  stalled cycles per insn (84.43%)
       298,083,096      branches:u                # 1299.694 M/sec                   (82.70%)
               303      branch-misses:u           #    0.00% of all branches         (84.75%)

       0.231515959 seconds time elapsed

       0.225605000 seconds user
       0.004256000 seconds sys

$ perf stat ./with_program_counter 100000000

 Performance counter stats for './build/with_program_counter':

            263.86 msec task-clock:u              #    0.998 CPUs utilized
                 0      context-switches:u        #    0.000 K/sec
                 0      cpu-migrations:u          #    0.000 K/sec
                52      page-faults:u             #    0.197 K/sec
     1,089,623,673      cycles:u                  #    4.129 GHz                     (83.33%)
           223,565      stalled-cycles-frontend:u #    0.02% frontend cycles idle    (83.33%)
       613,199,854      stalled-cycles-backend:u  #   56.28% backend cycles idle     (83.34%)
     2,491,824,728      instructions:u            #    2.29  insn per cycle
                                                  #    0.25  stalled cycles per insn (83.33%)
       299,860,727      branches:u                # 1136.422 M/sec                   (83.34%)
               822      branch-misses:u           #    0.00% of all branches         (83.33%)

       0.264392820 seconds time elapsed

       0.264333000 seconds user
       0.000000000 seconds sys
```

In other words, the threaded version retires roughly a third fewer instructions (1.69 billion instead of 2.49 billion) and finishes about 13% faster.
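Extending the threaded interpreter is mechanical: every new opcode follows the same template as the handlers above. As a sketch, a hypothetical `sub` opcode (not used in the benchmark) would reuse the `DEF_OP`/`NEXT_OP` machinery unchanged:

```c
/* Hypothetical extra opcode for the threaded interpreter above:
 * reads registers b and c, stores the difference in register a,
 * then tail-dispatches to the next instruction. */
DEF_OP(sub) {
    memory[instruction->a] = memory[instruction->b] - memory[instruction->c];
    instruction++;
    NEXT_OP;
}
```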
## Porting the C code to Rust

While Rust doesn't guarantee TCE, `rustc` is still able to optimize simple functions into tail calls. The Rust equivalent of the threaded C implementation looks like this:

```rust
macro_rules! def_op {
    ($name:ident($instruction:ident, $memory:ident) $code:block) => {
        unsafe fn $name($instruction: *const Instruction, $memory: &mut [i32]) {
            $code
        }
    };
}

#[cfg(feature = "unchecked_memory")]
macro_rules! get_mem {
    ($mem:ident[$r:expr]) => {
        $mem.get_unchecked($r as usize)
    };
    (mut $mem:ident[$r:expr]) => {
        $mem.get_unchecked_mut($r as usize)
    };
}

#[cfg(not(feature = "unchecked_memory"))]
macro_rules! get_mem {
    ($mem:ident[$r:expr]) => {
        &$mem[$r as usize]
    };
    (mut $mem:ident[$r:expr]) => {
        &mut $mem[$r as usize]
    };
}

/// ## SAFETY
///
/// `instruction` is a valid pointer.
#[inline(always)]
unsafe fn next_op(instruction: *const Instruction, memory: &mut [i32]) {
    ((*instruction).handler)(instruction, memory)
}

union ExtraParam {
    c: u8,
    jmp: *const Instruction,
    imm: i32,
}

struct Instruction {
    handler: unsafe fn(*const Self, &mut [i32]),
    param: ExtraParam,
    a: u8,
    b: u8,
}

def_op!(load(instruction, memory) {
    *get_mem!(mut memory[(*instruction).a]) = (*instruction).param.imm;
    next_op(instruction.offset(1), memory)
});

def_op!(add(instruction, memory) {
    *get_mem!(mut memory[(*instruction).a]) =
        *get_mem!(memory[(*instruction).b]) + *get_mem!(memory[(*instruction).param.c]);
    next_op(instruction.offset(1), memory)
});

def_op!(jmpnif(instruction, memory) {
    let instruction = if get_mem!(memory[(*instruction).a]) != get_mem!(memory[(*instruction).b]) {
        (*instruction).param.jmp
    } else {
        instruction.offset(1)
    };
    next_op(instruction, memory)
});

def_op!(print(instruction, memory) {
    println!("{}", get_mem!(memory[(*instruction).a]));
    next_op(instruction.offset(1), memory)
});

def_op!(ret(_instruction, _memory) {});

fn main() {
    type I = Instruction;
    type P = ExtraParam;
    let mut memory = [0; 256];
    let mut instructions = [
        // Init loop
        I { handler: load,   a: 0, b: 0, param: P { imm: 0 } },
        I { handler: load,   a: 1, b: 0, param: P { imm: 1 } },
        I { handler: load,   a: 2, b: 0, param: P { imm: 100 * 1000 * 1000 } },
        // Loop
        I { handler: add,    a: 0, b: 0, param: P { c: 1 } },
        I { handler: jmpnif, a: 0, b: 2, param: P { jmp: core::ptr::null() } },
        // Finish
        I { handler: print,  a: 0, b: 0, param: P { c: 0 } },
        I { handler: ret,    a: 0, b: 0, param: P { c: 0 } },
    ];
    instructions[4].param.jmp = &instructions[3];
    // SAFETY: the bytecode is valid.
    unsafe {
        (instructions[0].handler)(&instructions as *const _, &mut memory);
    }
}
```
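One caveat before inspecting the output: nothing in today's Rust guarantees that these calls become tail calls; it is purely an optimization. In an unoptimized build every `next_op` is a real call, so the stack grows by one frame per executed instruction and the interpreter dies long before finishing the loop. Something like this is to be expected (assuming the source is saved as `main.rs`; the exact message is platform-dependent):

```
$ rustc main.rs && ./main

thread 'main' has overflowed its stack
```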
Building with `rustc -C opt-level=3 --cfg 'feature="unchecked_memory"' main.rs` and inspecting the final assembly, we can see that it is almost equivalent to the C version:

```
0000000000006a20 <add>:
  6a20: 0f b6 47 11           movzbl 0x11(%rdi),%eax
  6a24: 0f b6 4f 08           movzbl 0x8(%rdi),%ecx
  6a28: 8b 0c 8e              mov    (%rsi,%rcx,4),%ecx
  6a2b: 03 0c 86              add    (%rsi,%rax,4),%ecx
  6a2e: 0f b6 47 10           movzbl 0x10(%rdi),%eax
  6a32: 89 0c 86              mov    %ecx,(%rsi,%rax,4)
  6a35: 48 8b 47 18           mov    0x18(%rdi),%rax
  6a39: 48 83 c7 18           add    $0x18,%rdi
  6a3d: ff e0                 jmpq   *%rax
  6a3f: 90                    nop

0000000000006a40 <jmpnif>:
  6a40: 0f b6 47 10           movzbl 0x10(%rdi),%eax
  6a44: 0f b6 4f 11           movzbl 0x11(%rdi),%ecx
  6a48: 8b 04 86              mov    (%rsi,%rax,4),%eax
  6a4b: 3b 04 8e              cmp    (%rsi,%rcx,4),%eax
  6a4e: 75 09                 jne    6a59
  6a50: 48 83 c7 18           add    $0x18,%rdi
  6a54: 48 8b 07              mov    (%rdi),%rax
  6a57: ff e0                 jmpq   *%rax
  6a59: 48 8b 7f 08           mov    0x8(%rdi),%rdi
  6a5d: 48 8b 07              mov    (%rdi),%rax
  6a60: ff e0                 jmpq   *%rax
  6a62: 66 2e 0f 1f 84 00 00  nopw   %cs:0x0(%rax,%rax,1)
  6a69: 00 00 00
  6a6c: 0f 1f 40 00           nopl   0x0(%rax)
```

The only difference between the Rust and C versions is two swapped instructions at the top of `jmpnif`:

```
0000000000006a40 <jmpnif>:              (Rust)
  6a40: 0f b6 47 10           movzbl 0x10(%rdi),%eax
  6a44: 0f b6 4f 11           movzbl 0x11(%rdi),%ecx
  6a48: 8b 04 86              mov    (%rsi,%rax,4),%eax
  6a4b: 3b 04 8e              cmp    (%rsi,%rcx,4),%eax

0000000000401290 <jmpnif>:              (C)
  401290: 0f b6 47 10         movzbl 0x10(%rdi),%eax
  401294: 8b 04 86            mov    (%rsi,%rax,4),%eax
  401297: 0f b6 4f 11         movzbl 0x11(%rdi),%ecx
  40129b: 3b 04 8e            cmp    (%rsi,%rcx,4),%eax
```

Rust's version is even slightly faster, since it pipelines better and doesn't stall the frontend as much:

```
$ perf stat ./with_direct_threading 100000000

 Performance counter stats for './build/main':

            229.35 msec task-clock:u              #    0.991 CPUs utilized
                 0      context-switches:u        #    0.000 K/sec
                 0      cpu-migrations:u          #    0.000 K/sec
                50      page-faults:u             #    0.218 K/sec
       957,568,185      cycles:u                  #    4.175 GHz                     (82.76%)
           575,004      stalled-cycles-frontend:u #    0.06% frontend cycles idle    (84.37%)
       538,644,702      stalled-cycles-backend:u  #   56.25% backend cycles idle     (80.98%)
     1,691,695,590      instructions:u            #    1.77  insn per cycle
                                                  #    0.32  stalled cycles per insn (84.43%)
       298,083,096      branches:u                # 1299.694 M/sec                   (82.70%)
               303      branch-misses:u           #    0.00% of all branches         (84.75%)

       0.231515959 seconds time elapsed

       0.225605000 seconds user
       0.004256000 seconds sys

$ perf stat ./build/threaded_tco_rust 100000000

 Performance counter stats for './build/threaded_tco_rust':

            223.33 msec task-clock:u              #    0.998 CPUs utilized
                 0      context-switches:u        #    0.000 K/sec
                 0      cpu-migrations:u          #    0.000 K/sec
                81      page-faults:u             #    0.363 K/sec
       940,918,264      cycles:u                  #    4.213 GHz                     (82.10%)
            50,970      stalled-cycles-frontend:u #    0.01% frontend cycles idle    (83.13%)
       632,340,038      stalled-cycles-backend:u  #   67.20% backend cycles idle     (83.89%)
     1,697,590,700      instructions:u            #    1.80  insn per cycle
                                                  #    0.37  stalled cycles per insn (83.89%)
       299,671,562      branches:u                # 1341.808 M/sec                   (83.89%)
               504      branch-misses:u           #    0.00% of all branches         (83.10%)

       0.223767884 seconds time elapsed

       0.223755000 seconds user
       0.000000000 seconds sys
```

## Conclusion

With TCE and direct threading it is possible to create very efficient interpreters in Rust, at least for simple bytecode instructions. Whether the generated assembly will be as ideal for more complex bytecode instructions (which may suffer from many `push`es and `pop`s) remains to be seen.

[clang tce]: https://reviews.llvm.org/D99517