Spaces:
Running
Running
import Lean | |
import Mathlib.Lean.Expr.Basic | |
import Mathlib.Lean.CoreM | |
import Batteries.Lean.Util.Path | |
import Batteries.Data.String.Matcher | |
open Lean | |
open Lean.Meta | |
open Lean.Elab | |
open Lean.Elab.Command | |
/- | |
This file is modified from: https://github.com/semorrison/lean-training-data | |
To run this, prepare a Lean project with Mathlib installed. Suppose the project is called LeanProj. | |
Put this extract.lean in LeanProj/scripts/extract.lean | |
In LeanProj/lakefile.lean, below | |
@[default_target] | |
lean_lib «LeanProj» where | |
```, | |
add this: | |
@[default_target] | |
lean_exe extract where | |
root := `scripts.extract | |
supportInterpreter := true | |
``` | |
Now in the terminal, run: | |
cd LeanProj | |
mkdir data | |
lake exe extract > data/theorems.txt | |
The stdout (IO.println) will be directed to data/theorems.txt, | |
The stderr (IO.eprintln) will be shown in the terminal. You will see the progress. It takes about 15 minutes. | |
-/ | |
/-- | |
Helps print kinds of constants | |
and distinguish between theorems, definitions, axioms, etc. | |
-/ | |
def Lean.ConstantInfo.kind : ConstantInfo → String | |
| .axiomInfo _ => "axiom" | |
| .defnInfo _ => "def" | |
| .thmInfo _ => "theorem" | |
| .opaqueInfo _ => "opaque" | |
| .quotInfo _ => "quot" -- Not sure what this is! | |
| .inductInfo _ => "inductive" | |
| .ctorInfo _ => "constructor" | |
| .recInfo _ => "recursor" | |
/- | |
Turn Lean.Expr and related types into JSON. | |
-/ | |
def truncatePrint (s : String) : IO Unit := do | |
IO.println (if s.length < 1000 then s else s.take 997 ++ " ...") | |
def main : IO Unit := do | |
IO.eprintln "\n\n" | |
IO.eprintln "importing modules" | |
searchPathRef.set compile_time_search_path% | |
CoreM.withImportModules #[`Mathlib] do | |
IO.eprintln s!"simplification timeout: {← getMaxHeartbeats}" | |
let env ← getEnv | |
IO.eprintln "filtering out unwanted objects" | |
let mut x := 0 | |
let mut lst := #[] | |
lst := lst ++ env.constants.map₁.toArray | |
lst := lst ++ env.constants.map₂.toArray | |
lst := lst.filter fun (name, info) => ( | |
!name.isInternalOrNum -- s.startsWith "_" | .num | |
&& !name.isInaccessibleUserName -- s.contains '✝' || s == "_inaccessible" | |
&& !name.isImplementationDetail -- s.startsWith "__" | |
&& !(name.toString.findSubstr? ".proof_" |>.isSome) | |
&& !(name.toString.findSubstr? ".match_" |>.isSome) | |
) | |
-- name.toString.startsWith "Nat.add" | |
IO.eprintln s!"printing {lst.size} theorems" | |
for (name, info) in lst do | |
IO.println "----ast----" | |
-- print module. | |
-- eg. "import Mathlib.Data.Real.Basic" | |
match (← findModuleOf? name) with | |
| .none => IO.println "unknown" | |
| .some mod => IO.println s!"import {mod}" | |
IO.println "--ast--" | |
-- print kine and decl name | |
-- eg. "theorem Nat.add_comm" or "def Nat.add" | |
IO.println s!"{info.kind} {name}" | |
IO.println "--ast--" | |
-- print decl type. | |
-- eg. "∀ (n m : Nat), n + m = m + n" | |
-- ppExpr is better than dbgToString | |
let ppType := toString (← MetaM.run' do ppExpr info.type) | |
truncatePrint ppType | |
IO.println "--ast--" | |
-- print doc string if exists. | |
-- eg. O(|l| + |r|). Merge two lists using s as a switch. | |
match (← findDocString? env name) with | |
| .none => IO.println "" | |
| .some doc => IO.println doc | |
-- IO.println "--ast--" | |
-- print decl value. These Lean.Expr are very large. | |
-- match info.value? with | |
-- | .none => IO.println "" | |
-- | .some val => IO.println (toString (← MetaM.run' do ppExpr val)) | |
-- IO.println "--ast--" | |
x := x + 1 | |
if x % 100 == 0 then | |
IO.eprintln s!"printed {x} / {lst.size} theorems, {x.toFloat / lst.size.toFloat * 100}%" | |
-- if x ≥ 2 then break | |
IO.eprintln "done" | |