rust中的「函数」

Function Item

第一次接触到function item这个新概念可能会令人感到困惑，因为在其它语言中几乎都找不到类似的设计，但它却是 Rust 零成本抽象（zero-overhead abstractions）的一个重要体现

#![feature(const_type_name)]
#![feature(const_size_of_val)]

use std::{any::type_name, mem};

const fn type_name_of_val<T>(_: &T) -> &'static str {
    type_name::<T>()
}

const fn size_of_val<T>(t: &T) -> usize {
    mem::size_of_val(t)
}

fn add(a: i32, b: i32) -> i32 {
    a + b
}

fn sub(a: i32, b: i32) -> i32 {
    a - b
}

fn main() {
    println!("add size: {}", size_of_val(&add)); // 0
    println!("sub size: {}", size_of_val(&sub)); // 0
    println!("add type: {}", type_name_of_val(&add)); // func::add
    println!("sub type: {}", type_name_of_val(&sub)); // func::sub
}

上述代码中的 add & sub就是function item，它们的size都为0，类型分别为 func::add & func::sub （func是crate name）

如果按照其它语言中对「函数」这一概念的惯性思维，我们可能会编写出如下的代码：

1 2	let mut f = add; f = sub; //~ ERROR mismatched types

但它并不能通过编译。这是由于function item直接表示函数本身，虽然它们具有相同的函数签名，但实际是两个不同的类型。由此我们也能窥见Rust中对于「函数」的谁急似乎和其他语言存在着差异

Function Pointer

function pointer和 function item的区别可以概括为：function pointer通过存储的地址引用函数；function item通过类型信息定位函数

可以将具有相同函数签名的function item强转（coerce）为function pointer，来支持改变f的指向：

1 2	let mut f: fn(i32, i32) -> i32 = add; f = sub;

下面通过一个完整例子探索一下function item & function pointer

#[inline(never)]
fn add(a: i32, b: i32) -> i32 {
    a + b
}

#[inline(never)]
fn sub(a: i32, b: i32) -> i32 {
    a - b
}

fn main() {
    let item = sub; // function item
    let item1 = sub; // function item

    let mut ptr: fn(i32, i32) -> i32 = add; // function pointer
    let ptr1: fn(i32, i32) -> i32 = sub; // function pointer
    let x = item(2, 2);
    let y = item1(2, 2);
    let z = ptr(1, 2);
    ptr = sub;
    let a = ptr(2, 2);
}

使用rust-lldb，在第19行处打一个断点，然后运行至断点处打印变量观察：

(lldb) v
(int (*)(int, int)) item = 0x0001000017ac0000
(int (*)(int, int)) item1 = 0x000001000017ac00
(int (*)(int, int)) ptr = 0x00000001000017ac (main`main::add::h243b87047811e43e at main.rs:2)
(int (*)(int, int)) ptr1 = 0x00000001000017f0 (main`main::sub::hdca0f0b5bfd66359 at main.rs:7)
(int) x = 0
(int) y = 0
(int) z = 3

item 和 item1 是function item type，虽然lldb这里输出了地址，但是如果尝试访问会发现它是无效地址，可能是lldb无法识别function item（可以看到这里标注的类型还是(int (*)(int, int)) 这种函数指针类型），也可能是调试信息的中间产物，故忽略

主要注意ptr和ptr1，它们分别指向0x00000001000017ac 和 0x00000001000017f0，对这两个地址进行反汇编：

(lldb) disassemble -s 0x00000001000017ac
main`main::add::h243b87047811e43e:
    0x1000017ac <+0>:  sub    sp, sp, #0x20
    0x1000017b0 <+4>:  stp    x29, x30, [sp, #0x10]
    0x1000017b4 <+8>:  add    x29, sp, #0x10
    0x1000017b8 <+12>: str    w0, [sp, #0x8]
    0x1000017bc <+16>: stur   w1, [x29, #-0x4]
    0x1000017c0 <+20>: adds   w8, w0, w1
    0x1000017c4 <+24>: str    w8, [sp, #0x4]
    0x1000017c8 <+28>: cset   w8, vs
(lldb) disassemble -s 0x00000001000017f0
main`main::sub::hdca0f0b5bfd66359:
    0x1000017f0 <+0>:  sub    sp, sp, #0x20
    0x1000017f4 <+4>:  stp    x29, x30, [sp, #0x10]
    0x1000017f8 <+8>:  add    x29, sp, #0x10
    0x1000017fc <+12>: str    w0, [sp, #0x8]
    0x100001800 <+16>: stur   w1, [x29, #-0x4]
    0x100001804 <+20>: subs   w8, w0, w1
    0x100001808 <+24>: str    w8, [sp, #0x4]
    0x10000180c <+28>: cset   w8, vs

可以看到它们指向的正是add & sub 函数的入口地址，因此可以得出结论：function pointer 本质就是一个指针，内部指向一个函数的地址

当程序执行完 ptr = sub; 语句后，再次打印变量:

1 2	(int ()(int, int)) ptr = 0x00000001000017f0 (main`main::sub::hdca0f0b5bfd66359 at main.rs:7) (int ()(int, int)) ptr1 = 0x00000001000017f0 (main`main::sub::hdca0f0b5bfd66359 at main.rs:7)

此时ptr & ptr1 都指向sub的入口地址0x00000001000017f0

为了更进一步理解，可以对main函数进行反汇编：

(lldb) disassemble -n main
main`main::main::h8d79616f9b6b86ce:
    0x100001834 <+0>:   sub    sp, sp, #0x40
    0x100001838 <+4>:   stp    x29, x30, [sp, #0x30]
    0x10000183c <+8>:   add    x29, sp, #0x30
    0x100001840 <+12>:  adrp   x8, 0
    0x100001844 <+16>:  add    x8, x8, #0x7ac            ; main::add::h243b87047811e43e at main.rs:2
    0x100001848 <+20>:  str    x8, [sp, #0x10]
    0x10000184c <+24>:  adrp   x8, 0
    0x100001850 <+28>:  add    x8, x8, #0x7f0            ; main::sub::hdca0f0b5bfd66359 at main.rs:7
    0x100001854 <+32>:  str    x8, [sp]
    0x100001858 <+36>:  str    x8, [sp, #0x18]
    0x10000185c <+40>:  mov    w1, #0x2
    0x100001860 <+44>:  str    w1, [sp, #0x8]
    0x100001864 <+48>:  mov    x0, x1
    0x100001868 <+52>:  bl     0x1000017ac               ; main::add::h243b87047811e43e at main.rs:2
    0x10000186c <+56>:  ldr    w1, [sp, #0x8]
    0x100001870 <+60>:  stur   w0, [x29, #-0x10]
    0x100001874 <+64>:  mov    x0, x1
    0x100001878 <+68>:  bl     0x1000017f0               ; main::sub::hdca0f0b5bfd66359 at main.rs:7
    0x10000187c <+72>:  ldr    w1, [sp, #0x8]
    0x100001880 <+76>:  stur   w0, [x29, #-0xc]
    0x100001884 <+80>:  ldr    x8, [sp, #0x10]
    0x100001888 <+84>:  mov    w0, #0x1
    0x10000188c <+88>:  blr    x8
    0x100001890 <+92>:  ldr    x8, [sp]
    0x100001894 <+96>:  ldr    w1, [sp, #0x8]
    0x100001898 <+100>: stur   w0, [x29, #-0x8]
    0x10000189c <+104>: str    x8, [sp, #0x10]
->  0x1000018a0 <+108>: ldr    x8, [sp, #0x10]
    0x1000018a4 <+112>: mov    x0, x1
    0x1000018a8 <+116>: blr    x8
    0x1000018ac <+120>: stur   w0, [x29, #-0x4]
    0x1000018b0 <+124>: ldp    x29, x30, [sp, #0x30]
    0x1000018b4 <+128>: add    sp, sp, #0x40
    0x1000018b8 <+132>: ret

由于我使用的是Apple芯片的mac，这里输出的是arm汇编，和amd汇编还是有较大区别，但这不妨碍我们挑重点进行阅读：

0x100001844 <+16>:  add    x8, x8, #0x7ac            ; main::add::h243b87047811e43e at main.rs:2
0x100001848 <+20>:  str    x8, [sp, #0x10]
0x10000184c <+24>:  adrp   x8, 0
0x100001850 <+28>:  add    x8, x8, #0x7f0            ; main::sub::hdca0f0b5bfd66359 at main.rs:7
0x100001854 <+32>:  str    x8, [sp]

把0x7ac（add的地址）存储在栈空间偏移 0x10处；把0x7f0（sub的地址）存储在栈空间sp处

1 2	0x100001868 <+52>: bl 0x1000017ac ; main::add::h243b87047811e43e at main.rs:2 0x100001878 <+68>: bl 0x1000017f0 ; main::sub::hdca0f0b5bfd66359 at main.rs:7

这两条语句对应的是item(2, 2) & item1(2, 2)的调用，可以看到这里是直接拿到了对应的函数地址进行执行

1
2
3

0x100001884 <+80>:  ldr    x8, [sp, #0x10]
0x100001888 <+84>:  mov    w0, #0x1
0x10000188c <+88>:  blr    x8

从栈的偏移量0x10处加载函数地址到x8，所以x8当前保存的是add的地址，之后通过blr指令跳转到x8存储的地址并执行，对应ptr(1,2)的调用

0x100001890 <+92>:  ldr    x8, [sp]
0x100001894 <+96>:  ldr    w1, [sp, #0x8]
0x100001898 <+100>: stur   w0, [x29, #-0x8]
0x10000189c <+104>: str    x8, [sp, #0x10]
0x1000018a0 <+108>: ldr    x8, [sp, #0x10]
0x1000018a4 <+112>: mov    x0, x1
0x1000018a8 <+116>: blr    x8

从栈顶加载sub函数地址到x8，str x8, [sp, #0x10] ldr x8, [sp, #0x10] 这两句看着有些费解，先将x8的地址保存在偏移量0x10处，又从偏移量0x10处再次加载函数地址到x8，猜测和未开启编译优化有关。最后还是通过blr指令跳转执行，对应ptr重新赋值为sub后，ptr(2,2)的调用

和直接跳转地址相比，可以发现blr x8这种调用方式多了一层寻址操作

Closure

闭包是将函数，或者说代码和其环境一起存储的一种数据结构。闭包引用的上下文中的自由变量，会被捕获到闭包的结构中，成为闭包类型的一部分。在 Rust 中，闭包可以用 |args| {code} 来表示。写段代码探索一下Rust的闭包：

use std::{collections::HashMap, mem::size_of_val};

fn main() {
    // 长度为 0
    let c1 = || println!("hello world!");

    // 和参数无关，长度也为 0
    let c2 = |i: i32| println!("hello: {}", i);
    let name = String::from("cae");
    let name1 = name.clone();
    let mut table = HashMap::new();
    table.insert("hello", "world");

    // 如果捕获一个引用，长度为 8
    let c3 = || println!("hello: {}", name);

    // 捕获移动的数据 name1(长度 24) + table(长度 48)，closure 长度 72
    let c4 = move || println!("hello: {}, {:?}", name1, table);

    let name2 = name.clone();
    // 和局部变量无关，通过move捕获了一个 String name2，closure 长度 24
    let c5 = move || {
        let x = 1;
        let name3 = String::from("lindsey");
        println!("hello: {}, {:?}, {:?}", x, name2, name3);
    };

    println!(
        "c1: {}\nc2: {}\nc3: {}\nc4: {}\nc5: {}\nmain: {}",
        size_of_val(&c1),
        size_of_val(&c2),
        size_of_val(&c3),
        size_of_val(&c4),
        size_of_val(&c5),
        size_of_val(&main),
    )
}

可以发现：closure的大小跟参数、局部变量都无关，只跟捕获的变量有关。

通过rust-lldb设置断点，观察各个变量的结构：

(lldb) v
(main::main::{closure_env#0}) c1 =
(main::main::{closure_env#1}) c2 =
(alloc::string::String) name = "cae" {
  vec = size=3 {
    [0] = 'c'
    [1] = 'a'
    [2] = 'e'
  }
}
(alloc::string::String) name1 = "cae" {
  vec = size=3 {
    [0] = 'c'
    [1] = 'a'
    [2] = 'e'
  }
}
(std::collections::hash::map::HashMap<&str, &str, std::hash::random::RandomState>) table = size=1 {
  [0] = {
    0 = "hello" {
      [0] = 'h'
      [1] = 'e'
      [2] = 'l'
      [3] = 'l'
      [4] = 'o'
    }
    1 = "world" {
      [0] = 'w'
      [1] = 'o'
      [2] = 'r'
      [3] = 'l'
      [4] = 'd'
    }
  }
}
(main::main::{closure_env#2}) c3 = {
  _ref__name = 0x000000016fdfea68
}
(main::main::{closure_env#3}) c4 = {
  name1 = "cae" {
    vec = size=3 {
      [0] = 'c'
      [1] = 'a'
      [2] = 'e'
    }
  }
  table = size=1 {
    [0] = {
      0 = "hello" {
        [0] = 'h'
        [1] = 'e'
        [2] = 'l'
        [3] = 'l'
        [4] = 'o'
      }
      1 = "world" {
        [0] = 'w'
        [1] = 'o'
        [2] = 'r'
        [3] = 'l'
        [4] = 'd'
      }
    }
  }
}
(alloc::string::String) name2 = "cae" {
  vec = size=3 {
    [0] = 'c'
    [1] = 'a'
    [2] = 'e'
  }
}
(main::main::{closure_env#4}) c5 = {
  name2 = "cae" {
    vec = size=3 {
      [0] = 'c'
      [1] = 'a'
      [2] = 'e'
    }
  }
}

主要观察几个长度不为 0 闭包内存里存放的数据是什么。从输出可以看出c3中的 _ref__name 字段是一个引用，指向地址为 0x000000016fdfea68 的数据。继续看看这个地址里存的数据是什么：

1
2
3

(lldb) x/3xg 0x000000016fdfea68
0x16fdfea68: 0x0000000000000003 0x00006000015cc000
0x16fdfea78: 0x0000000000000003

Rust 的String在内存中表示为ptr | cap | len的结构，因此在内存中占用3个words。由于编译器内存重排，导致输出顺序和逻辑位置有些差异。其中，ptr指向了具体的数据：

1 2	(lldb) x/c 0x00006000015cc000 0x6000015cc000: cae\0\0\0\0\0

closure 类型是由编译器生成的匿名数据类型，故无法在代码标注具体类型。对于closure来说，编译器知道像函数一样调用闭包 c4() 是合法的，并且知道执行 c4() 时，代码应该跳转到什么地址来执行。在执行过程中，如果遇到自由变量，可以从自己的数据结构中获取。从rust-lldb 的输出可得，closure是存储在栈上，并且除了捕获的数据外，closure本身不包含任何额外函数指针指向闭包的代码，至于在调用时跳转到哪个地址，是由编译器在编译期静态生成。

1 2	let mut closure = \|a: i32, b: i32\| a + b; closure = \|a, b\| a * b; //~ ERROR mismatched types

和function item类似，具有相同签名的closure并不是同一个类型。所以上述代码并不能够编译

通过为每个 closure 生成一个匿名类型，使得调用闭包时可以直接和代码一一对应，省去了使用函数指针再次寻址的额外消耗。这种方式比起golang的[function value](golang中的function value | Caelansar)二级指针的设计不知道高到哪里去了🤷

价值

说了这么多，我们已经理解了Rust对于「函数」的设计，至于为什么这么设计，就让我们结合一个实际例子来看看这些设计所带来的实际价值

#[inline(never)]
#[no_mangle]
pub fn calculate<F>(f: F) -> i32
where
    F: FnMut(i32) -> i32,
{
    (0..5).map(f).sum()
}

#[no_mangle]
pub fn f(x: i32) -> i32 {
    x + x
}

#[no_mangle]
pub fn call() -> i32 {
    calculate(f)
}

calculate 函数的作用是使用传入的 f 对range (0..5) 中的每个值进行运算，再将结果累加返回。只要实现了 FnMut(i32) -> i32 的类型，都可以作为参数传入calculate，将函数作为参数的写法在函数作为一等公民的语言中很常见，但Rust在这背后实际上通过单态化（monomorphization）实现了对f的静态派发（static dispatch）

通过 cargo rustc --release -- --emit asm 输出汇编代码

        .section        __TEXT,__text,regular,pure_instructions
        .build_version macos, 11, 0
        .p2align        2
_calculate:
        .cfi_startproc
        mov     w0, #20
        ret
        .cfi_endproc

        .globl  _f
        .p2align        2
_f:
        .cfi_startproc
        lsl     w0, w0, #1
        ret
        .cfi_endproc

        .globl  _call
        .p2align        2
_call:
        .cfi_startproc
        b       _calculate
        .cfi_endproc

.subsections_via_symbols

可以发现calculate函数直接在编译期完成了map & sum 运算，直接得到了结果20
对代码稍作修改，新增f1 & f2 函数，其中f1 和 f 完全一致，f2 和 f 逻辑一致

#[no_mangle]
pub fn f1(x: i32) -> i32 {
    x + x
}

#[no_mangle]
pub fn f2(x: i32) -> i32 {
    x * 2
}

再次观察输出的汇编：

        .section        __TEXT,__text,regular,pure_instructions
        .build_version macos, 11, 0
        .p2align        2
_calculate:
        .cfi_startproc
        mov     w0, #20
        ret
        .cfi_endproc

        .globl  _f
        .p2align        2
_f:
        .cfi_startproc
        lsl     w0, w0, #1
        ret
        .cfi_endproc

        .globl  _call
        .p2align        2
_call:
        .cfi_startproc
        b       _calculate
        .cfi_endproc

        .globl  _f1
.set _f1, _f
        .globl  _f2
.set _f2, _f

可以发现，Rust并未对f1 & f2生成相应的汇编代码，只是将它们的值直接赋值为f。得益于单态化，Rust在编译期就能明确f究竟是什么，无论是function item或是closure背后的匿名结构体，都能通过静态派发为calculate的参数f提供足够的类型信息，也就避免了函数指针间接调用带来的开销，并进行更激进的优化

你可能会好奇这个优化能带来多少性能提升。空口无凭，这时候就得进行benchmark对比看看
在Rust中，一般使用criterion 进行benchmark

use criterion::{black_box, criterion_group, criterion_main, Criterion};

#[inline(never)]
pub fn calculate<F>(f: F) -> u128
where
    F: FnMut(u128) -> u128,
    F: Copy,
{
    (0..1000000).map(f).map(f).map(f).map(f).map(f).sum()
}

#[inline(never)]
pub fn calculate_pointer(f: fn(u128) -> u128) -> u128 {
    (0..1000).map(f).map(f).map(f).map(f).map(f).sum()
}

pub fn f(x: u128) -> u128 {
    x + x
}

pub fn f1(x: u128) -> u128 {
    x * 2
}

pub fn call() -> u128 {
    calculate(f);

    calculate(f1)
}

pub fn call_pointer() -> u128 {
    let mut fp: fn(u128) -> u128 = f;
    calculate_pointer(fp);

    fp = f1;
    calculate_pointer(fp)
}

fn criterion_benchmark(c: &mut Criterion) {
    c.bench_function("func item", |b| {
        b.iter(|| {
            black_box(for _ in 1..=100 {
                black_box(call());
            })
        })
    });

    c.bench_function("func pointer", |b| {
        b.iter(|| {
            black_box(for _ in 1..=100 {
                black_box(call_pointer());
            })
        })
    });
}

criterion_group!(benches, criterion_benchmark);
criterion_main!(benches);

注意这段代码：

calculate 和 calculate_pointer函数中map的执行次数不同，calculate中map的次数要多3个数量级，对于function item，编译器在编译期就能得到函数的所有信息，所以这里的数量级差异对性能影响为零
定义了function pointer mut fp: fn(u128) -> u128 , 并在之后改变了它的指向，如果不进行修改操作，在fp指向未改变的情况下，编译器也会对它进行优化，完成和function item相同的编译期求值

benchmark的结果为：

     Running benches/func.rs (target/release/deps/func-1b1df2d0b22f971a)
Gnuplot not found, using plotters backend
func item               time:   [147.79 ns 148.10 ns 148.42 ns]
                        change: [-0.2788% +0.0758% +0.3918%] (p = 0.67 > 0.05)
                        No change in performance detected.
Found 2 outliers among 100 measurements (2.00%)
  1 (1.00%) high mild
  1 (1.00%) high severe

func pointer            time:   [633.96 µs 635.06 µs 636.35 µs]
                        change: [-0.2078% +0.1107% +0.4070%] (p = 0.49 > 0.05)
                        No change in performance detected.
Found 6 outliers among 100 measurements (6.00%)
  4 (4.00%) high mild
  2 (2.00%) high severe

可以看出相较于function pointer，function item 在性能上有数量级的领先，这还是在循环次数多了3个数量级的前提

总结

通过对function item，function pointer，closure的讨论，我们可以得出：

Rust通过

使用所有权和借用机制，解决了内存归属问题有关。不用费劲分析捕获变量究竟是放在stack还是heap
提供Fn/FnMut/FnOnce trait，对函数调用这一行为定义了统一的抽象，function item，function pointer和closure能够像函数一样调用，仅仅因为他们实现了这些trait，而不是编译器借助函数地址施展的黑魔法
使用单态化，在编译期将泛型代码具体化为特定的类型实例，从而进行优化，在享受泛型灵活性的同时避免了运行时开销

当回归到最初的本质，一个好的设计所解决的不是单个问题，而是由此引发的所有问题。我们不必为堆内存管理设计GC，不必为资源的回收提供 defer 关键字，不必为并发安全进行诸多限制，也不必为闭包性能费尽心思优化

参考

Function item types
Function pointer types
fn - Rust
rust quiz
“Expected fn item, found a different fn item” when working with function pointers
javascript - What are free variables? - Stack Overflow