search-mathlib / extract.lean
dx2102's picture
Update extract.lean
13b6cf4 verified
import Lean
import Mathlib.Lean.Expr.Basic
import Mathlib.Lean.CoreM
import Batteries.Lean.Util.Path
import Batteries.Data.String.Matcher
open Lean
open Lean.Meta
open Lean.Elab
open Lean.Elab.Command
/-
This small file is modified from: https://github.com/semorrison/lean-training-data
You can refer to the file structure there for more details.
To run this, there is no need to download another 4GB of Mathlib files.
Just prepare a Lean project with Mathlib installed. Suppose the project is in the folder LeanProj.
Your Mathlib files should live in LeanProj/.lake/packages/mathlib.
My LeanProj/lean-toolchain says I am using version: leanprover/lean4:v4.9.0-rc2
Now, put this extract.lean in LeanProj/scripts/extract.lean
In LeanProj/lakefile.lean, below
@[default_target]
lean_lib «LeanProj» where
add this:
@[default_target]
lean_exe extract where
root := `scripts.extract
supportInterpreter := true
This will make the `lake exe extract` command available,
and enable the correct `compile_time_search_path%` in the script.
Now in the terminal, run:
cd path-to-LeanProj
mkdir data
lake exe extract > data/theorems.txt
The stdout (IO.println) will be directed to data/theorems.txt,
The stderr (IO.eprintln) will be shown in the terminal. You will see the progress. It takes about 15 minutes.
-/
/--
Helps print kinds of constants
and distinguish between theorems, definitions, axioms, etc.
-/
def Lean.ConstantInfo.kind : ConstantInfo → String
| .axiomInfo _ => "axiom"
| .defnInfo _ => "def"
| .thmInfo _ => "theorem"
| .opaqueInfo _ => "opaque"
| .quotInfo _ => "quot" -- Not sure what this is!
| .inductInfo _ => "inductive"
| .ctorInfo _ => "constructor"
| .recInfo _ => "recursor"
def truncatePrint (s : String) : IO Unit := do
IO.println (if s.length < 1000 then s else s.take 997 ++ " ...")
def main : IO Unit := do
IO.eprintln "\n\n"
IO.eprintln "importing modules"
searchPathRef.set compile_time_search_path%
CoreM.withImportModules #[`Mathlib] (options := {entries := [
-- (`maxHeartbeats, .ofNat 0),
(`smartUnfolding, .ofBool false),
]}) do
IO.eprintln s!"simplification timeout: {← getMaxHeartbeats}"
let env ← getEnv
IO.eprintln "filtering out unwanted objects"
let mut x := 0
let mut lst := #[]
lst := lst ++ env.constants.map₁.toArray
lst := lst ++ env.constants.map₂.toArray
lst := lst.filter fun (name, info) => (
!name.isInternalOrNum -- s.startsWith "_" | .num
&& !name.isInaccessibleUserName -- s.contains '✝' || s == "_inaccessible"
&& !name.isImplementationDetail -- s.startsWith "__"
&& !(name.toString.findSubstr? ".proof_" |>.isSome)
&& !(name.toString.findSubstr? ".match_" |>.isSome)
)
-- name.toString.startsWith "Nat.add"
IO.eprintln s!"printing {lst.size} theorems"
for (name, info) in lst do
IO.println "----thm----"
-- print module.
-- eg. "import Mathlib.Data.Real.Basic"
match (← findModuleOf? name) with
| .none => IO.println "unknown"
| .some mod => IO.println s!"import {mod}"
IO.println "--thm--"
-- print kine and decl name
-- eg. "theorem Nat.add_comm" or "def Nat.add"
IO.println s!"{info.kind} {name}"
IO.println "--thm--"
-- print decl type.
-- eg. "∀ (n m : Nat), n + m = m + n"
-- ppExpr is better than dbgToString
let ppType := toString (← MetaM.run' do ppExpr info.type)
truncatePrint ppType
IO.println "--thm--"
-- print doc string if exists.
-- eg. O(|l| + |r|). Merge two lists using s as a switch.
match (← findDocString? env name) with
| .none => IO.println ""
| .some doc => IO.println doc
-- IO.println "--thm--"
-- print decl value. These Lean.Expr are very large.
-- match info.value? with
-- | .none => IO.println ""
-- | .some val => IO.println (toString (← MetaM.run' do ppExpr val))
-- IO.println "--thm--"
x := x + 1
if x % 100 == 0 then
IO.eprintln s!"printed {x} / {lst.size} theorems, {x.toFloat / lst.size.toFloat * 100}%"
-- if x ≥ 2 then break
IO.eprintln "done"