File size: 3,851 Bytes
19cafec
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
import Lean
import Mathlib.Lean.Expr.Basic
import Mathlib.Lean.CoreM
import Batteries.Lean.Util.Path
import Batteries.Data.String.Matcher

open Lean
open Lean.Meta
open Lean.Elab
open Lean.Elab.Command

/-
This file is modified from: https://github.com/semorrison/lean-training-data

To run this, prepare a Lean project with Mathlib installed. Suppose the project is called LeanProj.

Put this extract.lean in LeanProj/scripts/extract.lean

In LeanProj/lakefile.lean, below
@[default_target]
lean_lib «LeanProj» where
```,

add this:
@[default_target]
lean_exe extract where
  root := `scripts.extract
  supportInterpreter := true
```

Now in the terminal, run:
cd LeanProj
mkdir data
lake exe extract > data/theorems.txt

The stdout (IO.println) will be directed to data/theorems.txt,
The stderr (IO.eprintln) will be shown in the terminal. You will see the progress. It takes about 15 minutes.
-/

/--
Helps print kinds of constants
and distinguish between theorems, definitions, axioms, etc.
-/
def Lean.ConstantInfo.kind : ConstantInfo → String
  | .axiomInfo  _ => "axiom"
  | .defnInfo   _ => "def"
  | .thmInfo    _ => "theorem"
  | .opaqueInfo _ => "opaque"
  | .quotInfo   _ => "quot" -- Not sure what this is!
  | .inductInfo _ => "inductive"
  | .ctorInfo   _ => "constructor"
  | .recInfo    _ => "recursor"

/-
Turn Lean.Expr and related types into JSON.
-/


def truncatePrint (s : String) : IO Unit := do
  IO.println (if s.length < 1000 then s else s.take 997 ++ " ...")

def main : IO Unit := do
  IO.eprintln "\n\n"
  IO.eprintln "importing modules"
  searchPathRef.set compile_time_search_path%
  CoreM.withImportModules #[`Mathlib] (options := {entries := [
    -- (`maxHeartbeats, .ofNat 0),
    (`smartUnfolding, .ofBool false),
  ]}) do
    IO.eprintln s!"simplification timeout: {← getMaxHeartbeats}"
    let env ← getEnv

    IO.eprintln "filtering out unwanted objects"
    let mut x := 0
    let mut lst := #[]
    lst := lst ++ env.constants.map₁.toArray
    lst := lst ++ env.constants.map₂.toArray
    lst := lst.filter fun (name, info) => (
      !name.isInternalOrNum -- s.startsWith "_" | .num
      && !name.isInaccessibleUserName -- s.contains '✝' || s == "_inaccessible"
      && !name.isImplementationDetail  -- s.startsWith "__"
      && !(name.toString.findSubstr? ".proof_" |>.isSome)
      && !(name.toString.findSubstr? ".match_" |>.isSome)
    )

      -- name.toString.startsWith "Nat.add"
    IO.eprintln s!"printing {lst.size} theorems"
    for (name, info) in lst do
      IO.println "----ast----"
      -- print module.
      -- eg. "import Mathlib.Data.Real.Basic"
      match (← findModuleOf? name) with
      | .none => IO.println "unknown"
      | .some mod => IO.println s!"import {mod}"
      IO.println "--ast--"

      -- print kine and decl name
      -- eg. "theorem Nat.add_comm" or "def Nat.add"
      IO.println s!"{info.kind} {name}"
      IO.println "--ast--"

      -- print decl type.
      -- eg. "∀ (n m : Nat), n + m = m + n"
      -- ppExpr is better than dbgToString
      let ppType := toString (← MetaM.run' do ppExpr info.type)
      truncatePrint ppType
      IO.println "--ast--"

      -- print doc string if exists.
      -- eg. O(|l| + |r|). Merge two lists using s as a switch.
      match (← findDocString? env name) with
      | .none => IO.println ""
      | .some doc => IO.println doc
      -- IO.println "--ast--"

      -- print decl value. These Lean.Expr are very large.
      -- match info.value? with
      -- | .none => IO.println ""
      -- | .some val => IO.println (toString (← MetaM.run' do ppExpr val))
      -- IO.println "--ast--"

      x := x + 1
      if x % 100 == 0 then
        IO.eprintln s!"printed {x} / {lst.size} theorems, {x.toFloat / lst.size.toFloat * 100}%"
      -- if x ≥ 2 then break
    IO.eprintln "done"