Rust 和 OCaml谁更合适用来开发编译器？-duidaima 堆代码

Rust 和 OCaml谁更合适用来开发编译器？

发布于 2个月前
 302 热度

 0 评论

BruceLe
1 粉丝 54 篇博客

关于如何选择最合适的编程语言来开发编译器，这个话题在编程语言爱好者中经常引起热议。遗憾的是，许多人的回答要么局限于自己偏爱的语言，没有提供合理解释，要么给出模糊的解释却缺乏具体的例证。这两种回答对提问者来说几乎没有任何帮助。在本文中，我将尝试通过比较两种语言——Rust 和 OCaml，来对这个话题提供更详细的阐述。

CPS 转换
在阐述我的具体观点之前，我会展示一个非常简单的语言的 CPS（延续传递风格）转换的两个相似实现，并不做任何结论性陈述。这一通用方法源于 Andrew W. Appel 的“Compiling with Continuations”。即使你对这个概念不太了解，也不必担心；你只需要关注 Rust 和 OCaml 中是如何具体实现这个理念的。

以下是用 Rust 编写的 CPS 转换：

use std::cell::RefCell;
use std::ops::Deref;
use std::rc::Rc;
// 堆代码 duidaima.com
// lambda 语言的变量标识符 `Term`。
type Var = String;

// lambda语言；直接风格。
type Term = Rc<TermTree>;

enum TermTree {
    Var(Var),
    Fix(Vec<(Var, Vec<Var>, Term)>, Term),
    Appl(Term, Vec<Term>),
    Record(Vec<Term>),
    Select(Term, u32),
}

use TermTree::*;

#[derive(Clone)]
enum CpsVar {
    // 在 CPS 转换过程中从 lambda 项中获取。
    CLamVar(Var),
    // 在 CPS 转换过程中唯一生成。
    CGenVar(u32),
}

use CpsVar::*;

// 结果的 CPS 项。
enum CpsTerm {
    CFix(Vec<(CpsVar, Vec<CpsVar>, CpsTerm)>, Box<CpsTerm>),
    CAppl(CpsVar, Vec<CpsVar>),
    CRecord(Vec<CpsVar>, Binder),
    CSelect(CpsVar, u32, Binder),
    CHalt(CpsVar),
}

use CpsTerm::*;

// 在 `CpsTerm` 内部绑定唯一的 `CpsVar`。
type Binder = (CpsVar, Box<CpsTerm>);

// 根据当前的`i` 生成一个唯一的 CPS 变量。
fn gensym(i: RefCell<u32>) -> CpsVar {
    let x = CGenVar(i.clone().into_inner());
    i.replace_with(|&mut i| i + 1);
    x
}

// 将`Term`转换为`CpsTerm`，并将`finish`应用于结果的CPS变量。
fn convert(gen: RefCell<u32>, finish: impl FnOnce(CpsVar) -> CpsTerm, term: Term) -> CpsTerm {
    match term.deref() {
        Var(x) => finish(CLamVar(x.to_string())),
        Fix(defs, m) => CFix(
            defs.iter()
            .map(|def| convert_def(gen.clone(), def.clone()))
            .collect(),
            Box::new(convert(gen, finish, m.clone())),
        ),
        Appl(f, args) => {
            let ret_k = gensym(gen.clone());
            let ret_k_x = gensym(gen.clone());
            CFix(
                vec![(ret_k.clone(), vec![ret_k_x.clone()], finish(ret_k_x))],
                Box::new(convert(
                    gen.clone(),
                    |f_cps| {
                        convert_list(
                            gen,
                            |args_cps| {
                                CAppl(f_cps, args_cps.into_iter().chain(vec![ret_k]).collect())
                            },
                            args,
                        )
                    },
                    f.clone(),
                )),
            )
        }
        Record(fields) => convert_list(
            gen.clone(),
            |fields_cps| {
                let x = gensym(gen);
                CRecord(fields_cps, (x.clone(), Box::new(finish(x))))
            },
            fields,
        ),
        Select(m, i) => convert(
            gen.clone(),
            |m_cps| {
                let x = gensym(gen);
                CSelect(m_cps, *i, (x.clone(), Box::new(finish(x))))
            },
            m.clone(),
        ),
    }
}

// 将`Vec<Term>`转换为`Vec<CpsVar>`并 调用`finish`
fn convert_list(
    gen: RefCell<u32>,
    finish: impl FnOnce(Vec<CpsVar>) -> CpsTerm,
    terms: &[Term],
) -> CpsTerm {
    fn go(
        gen: RefCell<u32>,
        finish: impl FnOnce(Vec<CpsVar>) -> CpsTerm,
        mut acc: Vec<CpsVar>,
        terms: &[Term],
    ) -> CpsTerm {
        match terms.split_first() {
            None => finish(acc),
            Some((x, xs)) => convert(
                gen.clone(),
                |x_cps| {
                    acc.push(x_cps);
                    go(gen, finish, acc, xs)
                },
                x.clone(),
            ),
        }
    }
    let acc = Vec::with_capacity(terms.len());
    go(gen, finish, acc, terms)
}


// 将单个函数定义转换为其 CPS 形式。
fn convert_def(
    gen: RefCell<u32>,
    (f, params, m): (Var, Vec<Var>, Term),
) -> (CpsVar, Vec<CpsVar>, CpsTerm) {
    let k = gensym(gen.clone());
    (
        CLamVar(f),
        params
            .into_iter()
            .map(CLamVar)
            .chain(std::iter::once(k.clone()))
            .collect(),
        convert(gen, |m_cps| CAppl(k, vec![m_cps]), m),
    )
}

代码包括注释和空行共 145 行。
同样的算法用地道的 OCaml 编写：

(* lambda语言中的变量标识符[term]。 *)
type var = string

(* lambda语言；直接风格。 *)
type term =
  | Var of var
  | Fix of (var * var list * term) list * term
  | Appl of term * term list
  | Record of term list
  | Select of term * int

type cps_var =
  (* 在CPS转换过程中从lambda项中提取。 *)
  | CLamVar of var
  (* 在CPS转换过程中独特生成。 *)
  | CGenVar of int

(* 生成的CPS项。 *)
type cps_term =
  | CFix of (cps_var * cps_var list * cps_term) list * cps_term
  | CAppl of cps_var * cps_var list
  | CRecord of cps_var list * binder
  | CSelect of cps_var * int * binder
  | CHalt of cps_var

(* 在[cps_term]内部绑定唯一的[cps_var]。 *)
and binder = cps_var * cps_term

(* 根据当前的[i]生成唯一的CPS变量。 *)
let gensym i =
  let x = CGenVar !i in
  i := !i + 1;
  x

(* 将[term]转换为[cps_term]，并将[finish]应用于生成的CPS变量。 *)
let rec convert gen finish = function
  | Var x -> finish (CLamVar x)
  | Fix (defs, m) -> CFix (List.map (convert_def gen) defs, convert gen finish m)
  | Appl (f, args) ->
      let ret_k = gensym gen in
      let ret_k_x = gensym gen in
      CFix
        ( [ (ret_k, [ ret_k_x ], finish ret_k_x) ],
          f
          |> convert gen (fun f_cps ->
                 args
                 |> convert_list gen (fun args_cps ->
                        CAppl (f_cps, args_cps @ [ ret_k ]))) )
  | Record fields ->
      fields
      |> convert_list gen (fun fields_cps ->
             let x = gensym gen in
             CRecord (fields_cps, (x, finish x)))
  | Select (m, i) ->
      m
      |> convert gen (fun m_cps ->
             let x = gensym gen in
             CSelect (m_cps, i, (x, finish x)))


(* 将[term list]转换为[cps_var list]并将[finish]应用于它。 *)
and convert_list gen finish =
  let rec go acc = function
    | [] -> finish (List.rev acc)
    | x :: xs -> x |> convert gen (fun x_cps -> go (x_cps :: acc) xs)
  in
  go []

(* 将单个函数定义转换为其CPS形式。 *)
and convert_def gen (f, params, m) =
  let k = gensym gen in
  ( CLamVar f,
    List.map (fun x -> CLamVar x) params @ [ k ],
    m |> convert gen (fun m_cps -> CAppl (k, [ m_cps ])) )

代码包括注释和空行总共有 74 行。这比 Rust 版本短了大概一半。

比较两种实现
开发的核心特点涵盖了：
.大量使用递归定义的数据结构。
.复杂的数据转换流程。
下面简要概述 Rust 和 OCaml 在这两方面的处理差异：

1.递归数据结构：
OCaml：直接支持递归数据结构。
Rust：递归数据结构的实现需要借助Rc 和 Box将TermTree和CpsTerm的递归结构进行封装。
2.复杂数据转换：
OCaml：
递归广泛应用，拥有尾调用和“ Tail Modulo Constructor (TMC )”优化。
模式匹配的实现便捷高效，无需额外缩进和复杂的参数描述。
标准数据结构主要为不可变类型，有助于代码理解。
Rust：
递归使用较少，且并不保证尾递归。
模式匹配的实现相对复杂，涉及额外的缩进和参数详述。
标准数据结构大多为可变类型，倾向于使用命令式风格，需要进行迭代操作才能实现流水线编程。
与 OCaml 相比，Rust 的语法更冗长。作为一门无垃圾回收的语言，它要求开发者必须精确处理内存管理。这确实增加了对执行过程的透明度，但对理解算法本身的帮助却不明显。

在 Rust 中，修改变量或数据结构的值也相对复杂：

fn gensym(i: RefCell<u32>) -> CpsVar {
    let x = CGenVar(i.clone().into_inner());
    i.replace_with(|&mut i| i + 1);
    x
}

与之相比，在 OCaml 中比较简单：

// 堆代码 duidaima.com
let gensym i =
  let x = CGenVar !i in
  i := !i + 1;
  x

为何选择RefCell<u32>而非&mut u32？因为 Rust 强制在任何时刻只允许一个可变引用指向特定值。尽管在多线程代码中这是合理的，但在单线程的算法中，我们需用RefCell来绕过这个限制。

另外，Rust 中convert_list的实现也值得注意。由于fn本质上只是代码指针，所以我们每次调用go都必须明确传递gen和finish，导致了变量类型的重复声明。与之相对，OCaml则会自动捕获gen和finish。

虽然上述算法不算特别复杂，但已经足以体现 ML 家族语言在编程方面的便利性。下面，我们将深入探讨两种语言类型系统的更多细节。

类型安全性：GADTs
与 Rust 相比，OCaml 的类型系统通常更具表现力，并且在资源管理之外具有更多优势。例如，OCaml 支持广义代数数据类型（GADTs），以强化数据结构的某些不变性。考虑一种包含布尔值、整数及其操作的对象语言，其描述如下：

enum Term {
    Bool(bool),
    Not(Box<Term>),
    And(Box<Term>, Box<Term>),
    Int(i32),
    Neg(Box<Term>),
    Add(Box<Term>, Box<Term>),
}

enum Value {
    Bool(bool),
    Int(i32),
}

那么，我们该如何编写该语言的求值器呢？以下是一个可能的解决方案：

fn eval(term: &Term) -> Value {
    use Term::*;

    match term {
        Bool(b) => Value::Bool(*b),
        Not(m) => match eval(m) {
            Value::Bool(b) => Value::Bool(!b),
            _ => panic!("`Not`运算符应用于非布尔值"),
        },
        And(m, n) => match (eval(m), eval(n)) {
            (Value::Bool(b1), Value::Bool(b2)) => Value::Bool(b1 && b2),
            _ => panic!("`And`运算符应用于非布尔值"),
        },
        Int(i) => Value::Int(*i),
        Neg(m) => match eval(m) {
            Value::Int(i) => Value::Int(-i),
            _ => panic!("`Neg`运算符应用于非整数值"),
        },
        Add(m, n) => match (eval(m), eval(n)) {
            (Value::Int(i1), Value::Int(i2)) => Value::Int(i1 + i2),
            _ => panic!("`Add`运算符应用于非整数值"),
        },
    }
}

虽然该解决方案相对简单，但并不够稳健。如果对eval的输入类型不合适会怎样呢？以下示例说明了问题：

fn main() {
    use Term::*;
    let term = Not(Box::new(And(Box::new(Bool(true)), Box::new(Int(42)))));
    dbg!(eval(&term));
}

程序会因为“And运算符的第二操作数必须为布尔值”而出现问题。
为了避免此类错误，我们可以在 OCaml 中采用 GADTs ：

type _ term =
  | Bool : bool -> bool term
  | Not : bool term -> bool term
  | And : bool term * bool term -> bool term
  | Int : int -> int term
  | Neg : int term -> int term
  | Add : int term * int term -> int term

let rec eval : type a. a term -> a = function
  | Bool b -> b
  | Not m -> not (eval m)
  | And (m, n) ->
      let b1, b2 = (eval m, eval n) in
      b1 && b2
  | Int i -> i
  | Neg m -> -eval m
  | Add (m, n) ->
      let i1, i2 = (eval m, eval n) in
      i1 + i2

现在，尝试构造一个不合适的类型会是什么情况呢？

// 堆代码 duidaima.com
let () =
  let _term = Not (And (Bool true, Int 42)) in
  ()

类型检查不会通过！

File "bin/main.ml", line 22, characters 35-41:
22 |   let _term = Not (And (Bool true, Int 42)) in
                                        ^^^^^^
Error: This expression has type int term
       but an expression was expected of type bool term
       Type int is not compatible with type bool

之所以会这样，是因为我们在term的定义中实质上编码了对象语言的类型系统。OCaml 知道And只接受布尔类型的项，而不是整数类型。在实际应用场景中，我们可以有一个无限制的、类似 Rust 的Term项，该项是解析生成的，并可进一步详细阐述为合适的 GADT 风格的term。无论采用eval还是compile，后者均可被处理。

类型灵活性：First-Class Modules
OCaml 中具备一项 Rust 所不具备的独特功能：First-Class Modules。First-Class Modules允许模块存储于变量、作为参数传递，甚至可以从常规函数返回。假设你的目标语言涵盖了各种固定大小的整数，如i8/u8、i16/u16等，那么你可以通过 OCaml 中的（常规）模块来表示这些类型：

fixed_ints.mli

(* [u8], [u16]等由我们定义。*)

module type S = sig
  type t

  val add : t -> t -> t
  val sub : t -> t -> t
  val mul : t -> t -> t
  val div : t -> t -> t
  val rem : t -> t -> t

  (* 这里还有一些操作。*)
end

module U8 : S with type t = u8
module U16 : S with type t = u16
module U32 : S with type t = u32
module U64 : S with type t = u64
module U128 : S with type t = u128
module I8 : S with type t = i8
module I16 : S with type t = i16
module I32 : S with type t = i32
module I64 : S with type t = i64
module I128 : S with type t = i128

(* [u8], [u16]等由我们定义。*)

module type S = sig
  type t

  val add : t -> t -> t
  val sub : t -> t -> t
  val mul : t -> t -> t
  val div : t -> t -> t
  val rem : t -> t -> t

  (* 这里还有一些操作。*)
end

module U8 : S with type t = u8
module U16 : S with type t = u16
module U32 : S with type t = u32
module U64 : S with type t = u64
module U128 : S with type t = u128
module I8 : S with type t = i8
module I16 : S with type t = i16
module I32 : S with type t = i32
module I64 : S with type t = i64
module I128 : S with type t = i128

在抽象语法树（AST）中，整数值可以按照以下方式表示：

type generic =
  | U8 of u8
  | U16 of u16
  | U32 of u32
  | U64 of u64
  | U128 of u128
  | I8 of i8
  | I16 of i16
  | I32 of i32
  | I64 of i64
  | I128 of i128

那么，在存在诸多算术运算符add/sub/mul/div/rem和多种类型的操作数时，该如何高效地实现评估呢？

解决方案如下：
1.定义函数pair_exn，将两个generic映射为一个一等模块Pair。
2.为特定的整数对定义模块Pair，并实现S。
3.定义函数do_int_bin_op，接收Pair作为参数，并执行整数对上的操作op。
4.从eval中调用do_int_bin_op。
在 OCaml 中:

fixed_ints.mli

module type Pair = sig
  type t

  include S with type t := t

  val pair : t * t
end

val pair_exn : generic * generic -> (module Pair)

pair的实现如下：

fixed_ints.ml

let pair_exn =
  let make (type a) (module S : S with type t = a) (x, y) =
    (module struct
      include S

      let pair = x, y
    end : Pair)
  in
  function
  | U8 x, U8 y -> make (module U8) (x, y)
  | U16 x, U16 y -> make (module U16) (x, y)
  | U32 x, U32 y -> make (module U32) (x, y)
  | U64 x, U64 y -> make (module U64) (x, y)
  | U128 x, U128 y -> make (module U128) (x, y)
  | I8 x, I8 y -> make (module I8) (x, y)
  | I16 x, I16 y -> make (module I16) (x, y)
  | I32 x, I32 y -> make (module I32) (x, y)
  | I64 x, I64 y -> make (module I64) (x, y)
  | I128 x, I128 y -> make (module I128) (x, y)
  | _, _ -> raise (invalid_arg "Fixed_ints.pair_exn")
;;

现在，我们可以按如下方式编写 eval：

(* 在 [eval] 的定义中的某处。*)
| IntBinOp (op, ty, m, n) ->
  let x = extract_int_exn (eval m) in
  let y = extract_int_exn (eval n) in
  let (module Pair) = Fixed_ints.pair_exn (x, y) in
  do_int_bin_op op (module Pair)

extract_int_exn 取出一个值，并提取一个整型 generic，如果该值不是整数则抛出异常。

最后，do_int_bin_op 定义如下：

let do_int_bin_op op (module S : Fixed_ints.Pair) =
  let x, y = S.pair in
  match op with
  | Add -> S.add x y |> S.to_value
  | Sub -> S.sub x y |> S.to_value
  | Mul -> S.mul x y |> S.to_value
  | Div -> S.div x y |> S.to_value
  | Rem -> S.rem x y |> S.to_value
;;

S.to_value 将类型化的整数转换回持有 generic 的值。

通过借助 First-Class Modules，我们能够在无需过多样板代码的前提下实现固定大小整数的评估。而在 Rust 中的最佳实践则是使用macro_rules!。然而，该方法因其难以理解的语法、与编程语言其他部分集成不深入，以及较差的 IDE 支持而备受诟病。

结束语
虽然 Rust 在资源管理方面表现优秀，但 OCaml 更适合于编译器的开发。这其中涉及许多引人注目的特性，如多态变体、自定义绑定操作符和Effect Handlers。得益于完全静态且灵活的类型系统，OCaml 在历史上已广泛应用于众多项目，包括 Frama-C 工具链、Coq 定理证明器，以及 Rust 编译器的早期版本。

然而，OCaml 也不是完美的。 OCaml 的标准库和整体生态系统与 Rust 相比显得略逊一筹。在 OCaml 中直接使用 Rust 的完整固定大小整数集合并不可行。不过，通过整合原生 OCaml 整数、Int32、Int64模块和 C FFI，可以达到同样的效果。（专业提示：避免使用[ocaml-stdint]，截至 2023 年 8 月 6 日，该库未维护且存在多个错误。[ocaml-integers]是更稳定的选择，但缺乏Int8、Int16和 128 位整数的支持，并在文档方面有所不足。）

随着 Rust 的日益普及，更多的开发者开始在 GitHub 上使用它来启动编译器项目。我认为，如果你试图借助 Rust 编写大量编译器来深入学习 Rust ，或者确切了解自己的目标，这可能是明智之举。如果你主要关注的是编译器开发，那么 OCaml 将能够为你节省大量时间和精力。

其他值得考虑的选项包括 Haskell 和各类 Lisp 方言。如果你已经熟练掌握了 Haskell（对此我同时表示祝贺和哀悼），那么仅为了编写编译器而学习 OCaml 可能是不必要的。如果你尚未掌握 Haskell，OCaml 可能是更容易上手的选择。尽管 Lisps 极具灵活性，但由于它们通常缺少静态类型安全性，运行时错误可能成为一个棘手问题。

 用户评论

Rust编程
 47 成员 |  312 话题
+我要提问 +随便写写

可能感兴趣的话题

你知道Rust中Any Trait的用法吗？

如何使用Rust构建一个基于QUIC的HTTP/3服务器

为什么用Rust写代码能提高能效？

Rust代码进入Linux内核：争议中前行的未来