lukbl committed on
Commit
3dd4fff
•
1 Parent(s): 932d4ac

exchange math delimiters

Browse files

+ markdown specific postprocessing

Files changed (1) hide show
  1. app.py +7 -2
app.py CHANGED
@@ -3,6 +3,7 @@ import subprocess
3
  import uuid
4
  import os
5
  import requests
 
6
 
7
 
8
  def get_pdf(pdf_link):
@@ -31,7 +32,8 @@ def nougat_ocr(file_name):
31
  #'--out', unique_filename,
32
  '--out', 'output',
33
  'pdf', f'{file_name}',
34
- '--checkpoint', 'nougat'
 
35
  ]
36
 
37
  # Run the command and capture its output
@@ -64,6 +66,8 @@ def predict(pdf_file, pdf_link):
64
  file_name = file_name.split('/')[-1][:-4]
65
  with open(f'output/{file_name}.mmd', 'r') as file:
66
  content = file.read()
 
 
67
  return content
68
 
69
 
@@ -76,7 +80,8 @@ def nougat_ocr1(file_name):
76
  'nougat',
77
  '--out', 'output',
78
  'pdf', f'{file_name}',
79
- '--checkpoint', 'nougat'
 
80
  ]
81
 
82
  # Run the command and get .mmd file in an output folder
 
3
  import uuid
4
  import os
5
  import requests
6
+ import re
7
 
8
 
9
  def get_pdf(pdf_link):
 
32
  #'--out', unique_filename,
33
  '--out', 'output',
34
  'pdf', f'{file_name}',
35
+ '--checkpoint', 'nougat',
36
+ '--markdown'
37
  ]
38
 
39
  # Run the command and capture its output
 
66
  file_name = file_name.split('/')[-1][:-4]
67
  with open(f'output/{file_name}.mmd', 'r') as file:
68
  content = file.read()
69
+ # switch math delimiters
70
+ content = content.replace(r'\(', '$').replace(r'\)', '$').replace(r'\[', '$$').replace(r'\]', '$$')
71
  return content
72
 
73
 
 
80
  'nougat',
81
  '--out', 'output',
82
  'pdf', f'{file_name}',
83
+ '--checkpoint', 'nougat',
84
+ '--markdown'
85
  ]
86
 
87
  # Run the command and get .mmd file in an output folder