search-mathlib / extract.lean
dx2102's picture
Update extract.lean
6fbc7f0 verified
raw
history blame
3.75 kB
import Lean
import Mathlib.Lean.Expr.Basic
import Mathlib.Lean.CoreM
import Batteries.Lean.Util.Path
import Batteries.Data.String.Matcher
open Lean
open Lean.Meta
open Lean.Elab
open Lean.Elab.Command
/-
This file is modified from: https://github.com/semorrison/lean-training-data
To run this, prepare a Lean project with Mathlib installed. Suppose the project is called LeanProj.
Put this extract.lean in LeanProj/scripts/extract.lean
In LeanProj/lakefile.lean, below
@[default_target]
lean_lib «LeanProj» where
```,
add this:
@[default_target]
lean_exe extract where
root := `scripts.extract
supportInterpreter := true
```
Now in the terminal, run:
cd LeanProj
mkdir data
lake exe extract > data/theorems.txt
The stdout (IO.println) will be directed to data/theorems.txt,
The stderr (IO.eprintln) will be shown in the terminal. You will see the progress. It takes about 15 minutes.
-/
/--
Helps print kinds of constants
and distinguish between theorems, definitions, axioms, etc.
-/
def Lean.ConstantInfo.kind : ConstantInfo → String
| .axiomInfo _ => "axiom"
| .defnInfo _ => "def"
| .thmInfo _ => "theorem"
| .opaqueInfo _ => "opaque"
| .quotInfo _ => "quot" -- Not sure what this is!
| .inductInfo _ => "inductive"
| .ctorInfo _ => "constructor"
| .recInfo _ => "recursor"
/-
Turn Lean.Expr and related types into JSON.
-/
def truncatePrint (s : String) : IO Unit := do
IO.println (if s.length < 1000 then s else s.take 997 ++ " ...")
def main : IO Unit := do
IO.eprintln "\n\n"
IO.eprintln "importing modules"
searchPathRef.set compile_time_search_path%
CoreM.withImportModules #[`Mathlib] do
IO.eprintln s!"simplification timeout: {← getMaxHeartbeats}"
let env ← getEnv
IO.eprintln "filtering out unwanted objects"
let mut x := 0
let mut lst := #[]
lst := lst ++ env.constants.map₁.toArray
lst := lst ++ env.constants.map₂.toArray
lst := lst.filter fun (name, info) => (
!name.isInternalOrNum -- s.startsWith "_" | .num
&& !name.isInaccessibleUserName -- s.contains '✝' || s == "_inaccessible"
&& !name.isImplementationDetail -- s.startsWith "__"
&& !(name.toString.findSubstr? ".proof_" |>.isSome)
&& !(name.toString.findSubstr? ".match_" |>.isSome)
)
-- name.toString.startsWith "Nat.add"
IO.eprintln s!"printing {lst.size} theorems"
for (name, info) in lst do
IO.println "----ast----"
-- print module.
-- eg. "import Mathlib.Data.Real.Basic"
match (← findModuleOf? name) with
| .none => IO.println "unknown"
| .some mod => IO.println s!"import {mod}"
IO.println "--ast--"
-- print kine and decl name
-- eg. "theorem Nat.add_comm" or "def Nat.add"
IO.println s!"{info.kind} {name}"
IO.println "--ast--"
-- print decl type.
-- eg. "∀ (n m : Nat), n + m = m + n"
-- ppExpr is better than dbgToString
let ppType := toString (← MetaM.run' do ppExpr info.type)
truncatePrint ppType
IO.println "--ast--"
-- print doc string if exists.
-- eg. O(|l| + |r|). Merge two lists using s as a switch.
match (← findDocString? env name) with
| .none => IO.println ""
| .some doc => IO.println doc
-- IO.println "--ast--"
-- print decl value. These Lean.Expr are very large.
-- match info.value? with
-- | .none => IO.println ""
-- | .some val => IO.println (toString (← MetaM.run' do ppExpr val))
-- IO.println "--ast--"
x := x + 1
if x % 100 == 0 then
IO.eprintln s!"printed {x} / {lst.size} theorems, {x.toFloat / lst.size.toFloat * 100}%"
-- if x ≥ 2 then break
IO.eprintln "done"