|
import re |
|
from ftfy import fix_text |
|
|
|
|
|
def contains_math(text):
    """Report whether *text* is fenced by a dollar sign on either side.

    Only the first and last characters are inspected; a ``$`` that appears
    strictly inside the string does not count.

    Args:
        text: The string to inspect.

    Returns:
        ``True`` if *text* starts with ``$`` or ends with ``$``,
        otherwise ``False``.
    """
    if text.startswith("$"):
        return True
    return text.endswith("$")
|
|
|
|
|
def fix_math(text):
    """Run the full clean-up pipeline over a piece of math text.

    The steps are applied in a fixed order, each receiving the previous
    step's output:

    1. ``fix_text`` (from ``ftfy``),
    2. ``remove_labels``,
    3. ``replace_katex_invalid``,
    4. ``fix_fences``.

    Args:
        text: The raw math string.

    Returns:
        The string after all four steps have been applied.
    """
    pipeline = (fix_text, remove_labels, replace_katex_invalid, fix_fences)
    for step in pipeline:
        text = step(text)
    return text
|
|
|
|
|
def remove_labels(text):
    r"""Delete ``\label{...}``, ``\ref{...}`` and ``\pageref{...}`` commands.

    Each command is removed together with its brace argument; the argument
    is taken to run up to the first ``}``. The three kinds are removed in
    separate passes, in the order label, ref, pageref.

    Args:
        text: The LaTeX source to clean.

    Returns:
        *text* with every matching command replaced by the empty string.
    """
    command_patterns = (
        r'\\label\{[^}]*\}',
        r'\\ref\{[^}]*\}',
        r'\\pageref\{[^}]*\}',
    )
    # Separate passes (rather than one alternation) so that a command
    # exposed by an earlier removal is still caught by a later pass.
    for command_pattern in command_patterns:
        text = re.sub(command_pattern, '', text)
    return text
|
|
|
|
|
def replace_katex_invalid(string):
    r"""Rewrite a handful of LaTeX constructs, then clean display-math blocks.

    The following substitutions are applied in order (none of them lets
    ``.`` match a newline, so each construct must sit on a single line):

    * ``\tag{...}`` is removed entirely.
    * ``\Big{...}``, ``\Bigg{...}``, ``\big{...}`` and ``\bigg{...}`` are
      replaced by their brace content.
    * ``\quad\mbox{...}`` is replaced by the ``\mbox`` content.
    * ``\mbox{...}`` is replaced by its content.

    Finally the result is passed through ``remove_inner_dollars``.

    Args:
        string: The LaTeX source to rewrite.

    Returns:
        The rewritten string.
    """
    rewrites = (
        (r'\\tag\{.*?\}', ''),
        (r'\\(?:Bigg?|bigg?)\{(.*?)\}', r'\1'),
        # The \quad-prefixed form must run before the bare \mbox rule,
        # otherwise the \quad would be left behind.
        (r'\\quad\\mbox\{(.*?)\}', r'\1'),
        (r'\\mbox\{(.*?)\}', r'\1'),
    )
    for pattern, replacement in rewrites:
        string = re.sub(pattern, replacement, string)
    return remove_inner_dollars(string)
|
|
|
|
|
def remove_inner_dollars(text):
    """Strip stray ``$`` characters from inside ``$$...$$`` blocks.

    Every non-overlapping ``$$...$$`` span (shortest match, newlines
    allowed inside) is rewritten so that its interior contains no ``$``;
    the surrounding ``$$`` fences are kept. Text outside such spans is
    left untouched.

    Args:
        text: The string to process.

    Returns:
        *text* with the interior dollar signs of each block removed.
    """
    display_block = r'\$\$(.*?)\$\$'
    return re.sub(
        display_block,
        lambda found: '$$' + found.group(1).replace('$', '') + '$$',
        text,
        flags=re.DOTALL,
    )
|
|
|
|
|
def extract_latex_with_positions(text):
    """Locate every dollar-fenced math span in *text*.

    Spans are found left to right without overlap. At each position the
    ``$$...$$`` form is tried before the ``$...$`` form, both match as
    little as possible, and the interior may contain newlines.

    Args:
        text: The string to scan.

    Returns:
        A list of ``(span_text, start, end)`` tuples, where ``start`` and
        ``end`` are the span's offsets in *text* (``end`` exclusive).
    """
    fenced = r'(\$\$.*?\$\$|\$.*?\$)'
    return [
        (hit.group(), hit.start(), hit.end())
        for hit in re.finditer(fenced, text, re.DOTALL)
    ]
|
|
|
|
|
def slice_latex(text):
    """Split *text* into an ordered list of plain-text and LaTeX chunks.

    The math spans reported by ``extract_latex_with_positions`` become
    ``"latex"`` chunks; whatever lies between, before, or after them
    becomes ``"text"`` chunks. Empty text chunks are not emitted.

    Args:
        text: The string to split.

    Returns:
        A list of dicts, each with a ``"text"`` key (the chunk content)
        and a ``"type"`` key (``"text"`` or ``"latex"``), in document order.
    """
    pieces = []
    cursor = 0  # offset of the first character not yet emitted

    for fragment, begin, finish in extract_latex_with_positions(text):
        # Emit the plain text sitting between the previous span and this one.
        if cursor < begin:
            pieces.append({"text": text[cursor:begin], "type": "text"})
        pieces.append({"text": fragment, "type": "latex"})
        cursor = finish

    # Anything after the last span is trailing plain text.
    if len(text) > cursor:
        pieces.append({"text": text[cursor:], "type": "text"})

    return pieces
|
|
|
|
|
def is_latex(text):
    r"""Heuristically decide whether *text* contains LaTeX markup.

    The text counts as LaTeX if any of the following occurs anywhere in it:

    * a ``\begin{...}`` or ``\end{...}`` command,
    * a ``$...$`` or ``$$...$$`` span (may cross newlines),
    * a backslash followed by one or more ASCII letters,
    * a backslash followed by any single non-letter character.

    Args:
        text: The string to test.

    Returns:
        ``True`` if at least one of the patterns is found, else ``False``.
    """
    latex_signatures = [
        r'\\(?:begin|end)\{[a-zA-Z]*\}',
        r'\$.*?\$',
        r'\$\$.*?\$\$',
        r'\\[a-zA-Z]+',
        r'\\[^a-zA-Z]',
    ]
    any_signature = '|'.join(latex_signatures)
    return re.search(any_signature, text, re.DOTALL) is not None
|
|
|
|
|
def fix_fences(text):
    """Balance the dollar-sign fences at the two ends of *text*.

    If exactly one end carries a fence, or one end has ``$$`` while the
    other has only ``$``, dollar signs are added so that the result both
    starts and ends with ``$$``. Text whose ends already agree
    (``$...$``, ``$$...$$``, or no fence at all) is returned unchanged.

    Args:
        text: The math string to repair.

    Returns:
        The string with balanced fences.
    """
    opens_double = text.startswith("$$")
    closes_double = text.endswith("$$")

    # "$$..." without a closing "$$": top the tail up to two dollars.
    if opens_double and not closes_double:
        return text + ("$" if text[-1] == "$" else "$$")

    # "...$$" without an opening "$$": top the head up to two dollars.
    if closes_double and not opens_double:
        return ("$" if text[0] == "$" else "$$") + text

    opens_single = text.startswith("$")
    closes_single = text.endswith("$")

    # A lone "$" on one side only: promote to a full "$$...$$" block.
    if opens_single and not closes_single:
        return "$" + text + "$$"
    if closes_single and not opens_single:
        return "$$" + text + "$"

    return text
|
|
|
|
|
def strip_fences(text):
    """Remove every leading and trailing ``$`` from *text*.

    Dollar signs in the interior of the string are preserved.

    Args:
        text: The fenced math string.

    Returns:
        *text* without any ``$`` characters at its start or end.
    """
    # str.strip does this in a single pass; peeling one character per
    # iteration re-copies the string each time (quadratic on long runs).
    return text.strip("$")
|
|
|
|
|
|