import Lean import Mathlib.Lean.Expr.Basic import Mathlib.Lean.CoreM import Batteries.Lean.Util.Path import Batteries.Data.String.Matcher open Lean open Lean.Meta open Lean.Elab open Lean.Elab.Command /- This file is modified from: https://github.com/semorrison/lean-training-data To run this, prepare a Lean project with Mathlib installed. Suppose the project is called LeanProj. Put this extract.lean in LeanProj/scripts/extract.lean In LeanProj/lakefile.lean, below @[default_target] lean_lib «LeanProj» where ```, add this: @[default_target] lean_exe extract where root := `scripts.extract supportInterpreter := true ``` Now in the terminal, run: cd LeanProj mkdir data lake exe extract > data/theorems.txt The stdout (IO.println) will be directed to data/theorems.txt, The stderr (IO.eprintln) will be shown in the terminal. You will see the progress. It takes about 15 minutes. -/ /-- Helps print kinds of constants and distinguish between theorems, definitions, axioms, etc. -/ def Lean.ConstantInfo.kind : ConstantInfo → String | .axiomInfo _ => "axiom" | .defnInfo _ => "def" | .thmInfo _ => "theorem" | .opaqueInfo _ => "opaque" | .quotInfo _ => "quot" -- Not sure what this is! | .inductInfo _ => "inductive" | .ctorInfo _ => "constructor" | .recInfo _ => "recursor" /- Turn Lean.Expr and related types into JSON. -/ def truncatePrint (s : String) : IO Unit := do IO.println (if s.length < 1000 then s else s.take 997 ++ " ...") def main : IO Unit := do IO.eprintln "\n\n" IO.eprintln "importing modules" searchPathRef.set compile_time_search_path% CoreM.withImportModules #[`Mathlib] (options := {entries := [ -- (`maxHeartbeats, .ofNat 0), (`smartUnfolding, .ofBool false), ]}) do IO.eprintln s!"simplification timeout: {← getMaxHeartbeats}" let env ← getEnv IO.eprintln "filtering out unwanted objects" let mut x := 0 let mut lst := #[] lst := lst ++ env.constants.map₁.toArray lst := lst ++ env.constants.map₂.toArray lst := lst.filter fun (name, info) => ( !name.isInternalOrNum -- s.startsWith "_" | .num && !name.isInaccessibleUserName -- s.contains '✝' || s == "_inaccessible" && !name.isImplementationDetail -- s.startsWith "__" && !(name.toString.findSubstr? ".proof_" |>.isSome) && !(name.toString.findSubstr? ".match_" |>.isSome) ) -- name.toString.startsWith "Nat.add" IO.eprintln s!"printing {lst.size} theorems" for (name, info) in lst do IO.println "----ast----" -- print module. -- eg. "import Mathlib.Data.Real.Basic" match (← findModuleOf? name) with | .none => IO.println "unknown" | .some mod => IO.println s!"import {mod}" IO.println "--ast--" -- print kine and decl name -- eg. "theorem Nat.add_comm" or "def Nat.add" IO.println s!"{info.kind} {name}" IO.println "--ast--" -- print decl type. -- eg. "∀ (n m : Nat), n + m = m + n" -- ppExpr is better than dbgToString let ppType := toString (← MetaM.run' do ppExpr info.type) truncatePrint ppType IO.println "--ast--" -- print doc string if exists. -- eg. O(|l| + |r|). Merge two lists using s as a switch. match (← findDocString? env name) with | .none => IO.println "" | .some doc => IO.println doc -- IO.println "--ast--" -- print decl value. These Lean.Expr are very large. -- match info.value? with -- | .none => IO.println "" -- | .some val => IO.println (toString (← MetaM.run' do ppExpr val)) -- IO.println "--ast--" x := x + 1 if x % 100 == 0 then IO.eprintln s!"printed {x} / {lst.size} theorems, {x.toFloat / lst.size.toFloat * 100}%" -- if x ≥ 2 then break IO.eprintln "done"