ketanmore's picture
Upload folder using huggingface_hub
2720487 verified
import re
from ftfy import fix_text
def contains_math(text):
    """Return True when *text* begins or ends with a ``$`` fence character.

    Only the first and last characters are inspected; a ``$`` that appears
    solely in the middle of the string does not count.
    """
    if text.startswith("$"):
        return True
    return text.endswith("$")
def fix_math(text):
    """Run the full math clean-up pipeline on *text* and return the result.

    The string is first passed through ftfy's ``fix_text`` and then, in
    order, through ``remove_labels``, ``replace_katex_invalid`` and
    ``fix_fences``.
    """
    # General text repair first, so the LaTeX-specific steps see clean input.
    cleaned = fix_text(text)
    # Label/reference removal, KaTeX-incompatible command rewriting, then
    # fence balancing -- applied strictly in this order.
    for step in (remove_labels, replace_katex_invalid, fix_fences):
        cleaned = step(cleaned)
    return cleaned
def remove_labels(text):
    r"""Delete ``\label{...}``, ``\ref{...}`` and ``\pageref{...}`` from *text*.

    Each command is removed together with its brace argument (everything up
    to the first closing brace). The three substitutions run one after the
    other, in the order listed below.
    """
    command_patterns = (
        r'\\label\{[^}]*\}',
        r'\\ref\{[^}]*\}',
        r'\\pageref\{[^}]*\}',
    )
    for command_pattern in command_patterns:
        text = re.sub(command_pattern, '', text)
    return text
def replace_katex_invalid(string):
    r"""Rewrite LaTeX constructs that KaTeX cannot render.

    The following substitutions are applied in order:

    * ``\tag{...}`` is removed entirely.
    * ``\big{...}``, ``\bigg{...}``, ``\Big{...}`` and ``\Bigg{...}`` are
      replaced by their brace contents.
    * ``\quad\mbox{...}`` and then plain ``\mbox{...}`` are replaced by
      their brace contents.

    Finally the string is passed through ``remove_inner_dollars``.
    """
    # Order matters: the \quad\mbox rule has to run before the bare \mbox
    # rule, otherwise the \quad prefix would be left behind.
    rewrites = (
        (r'\\tag\{.*?\}', ''),
        (r'\\(?:Bigg?|bigg?)\{(.*?)\}', r'\1'),
        (r'\\quad\\mbox\{(.*?)\}', r'\1'),
        (r'\\mbox\{(.*?)\}', r'\1'),
    )
    for pattern, replacement in rewrites:
        string = re.sub(pattern, replacement, string)
    return remove_inner_dollars(string)
def remove_inner_dollars(text):
    """Drop stray ``$`` characters found inside ``$$ ... $$`` blocks.

    Every ``$$ ... $$`` span (which may cover several lines) is rebuilt with
    the same outer ``$$`` fences and with all ``$`` characters removed from
    its interior. Text outside such spans is returned unchanged.
    """
    # DOTALL lets a display block span multiple lines.
    display_math = re.compile(r'\$\$(.*?)\$\$', re.DOTALL)
    return display_math.sub(
        lambda found: '$$' + found.group(1).replace('$', '') + '$$',
        text,
    )
def extract_latex_with_positions(text):
    """Locate every ``$$...$$`` or ``$...$`` span in *text*.

    Returns a list of ``(matched_text, start, end)`` tuples in order of
    appearance, where ``start``/``end`` are the match offsets into *text*.
    The double-dollar form is tried first at each position, and spans may
    cross line breaks.
    """
    fenced = r'(\$\$.*?\$\$|\$.*?\$)'
    return [
        (found.group(), *found.span())
        for found in re.finditer(fenced, text, re.DOTALL)
    ]
def slice_latex(text):
    """Split *text* into an ordered list of plain-text and LaTeX chunks.

    Each chunk is a dict with a ``"text"`` key holding the substring and a
    ``"type"`` key that is either ``"text"`` or ``"latex"``. LaTeX spans are
    those reported by ``extract_latex_with_positions``; the text between
    and around them becomes ``"text"`` chunks. Empty text chunks are
    never emitted.
    """
    pieces = []
    cursor = 0  # offset just past the previously consumed LaTeX span
    for latex, begin, finish in extract_latex_with_positions(text):
        # Plain text sitting between the previous span and this one.
        if cursor < begin:
            pieces.append({"text": text[cursor:begin], "type": "text"})
        pieces.append({"text": latex, "type": "latex"})
        cursor = finish

    # Whatever follows the final LaTeX span (or the whole input if none).
    tail = text[cursor:]
    if tail:
        pieces.append({"text": tail, "type": "text"})
    return pieces
def is_latex(text):
    r"""Report whether *text* contains anything that looks like LaTeX.

    Returns True if any of these occur anywhere in *text*: a
    ``\begin{...}``/``\end{...}`` marker, a ``$...$`` or ``$$...$$`` span
    (possibly spanning lines), a backslash followed by letters, or a
    backslash followed by a single non-letter character. Otherwise False.
    """
    markers = (
        r'\\(?:begin|end)\{[a-zA-Z]*\}',
        r'\$.*?\$',
        r'\$\$.*?\$\$',
        r'\\[a-zA-Z]+',
        r'\\[^a-zA-Z]',
    )
    return re.search('|'.join(markers), text, re.DOTALL) is not None
def fix_fences(text):
    """Balance the ``$`` fences at the start and end of *text*.

    * If one side has ``$$`` and the other does not, the other side is
      topped up to ``$$`` (one ``$`` is added if it already has a single
      ``$``, otherwise two).
    * If only one side has a single ``$`` and the other has none, the
      string ends up wrapped in ``$$`` on both sides.
    * Strings that are already balanced, or carry no fence at either end,
      are returned unchanged.
    """
    # Stage 1: one side has a full "$$" fence, the other side does not.
    opens_display = text.startswith("$$")
    closes_display = text.endswith("$$")
    if opens_display and not closes_display:
        # Text is non-empty here, so indexing the last character is safe.
        text += "$" if text[-1] == "$" else "$$"
    elif closes_display and not opens_display:
        text = ("$" if text[0] == "$" else "$$") + text

    # Stage 2: a lone "$" on exactly one side; pad so both sides read "$$".
    opens_any = text.startswith("$")
    closes_any = text.endswith("$")
    if opens_any and not closes_any:
        text = "$" + text + "$$"
    elif closes_any and not opens_any:
        text = "$$" + text + "$"
    return text
def strip_fences(text):
    """Return *text* with every leading and trailing ``$`` removed.

    ``$`` characters in the interior of the string are left alone. A string
    made up only of ``$`` characters becomes the empty string.
    """
    # str.strip does this in a single pass; the previous character-by-
    # character loops re-sliced the string once per stripped "$".
    return text.strip("$")