File size: 3,313 Bytes
2720487
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
import re
from ftfy import fix_text


def contains_math(text):
    """Return True when *text* begins or ends with a ``$`` character.

    Only the first and last characters are inspected; a ``$`` that appears
    strictly inside the string does not count.
    """
    # Slicing (rather than indexing) keeps the empty string safe.
    first_char, last_char = text[:1], text[-1:]
    return "$" in (first_char, last_char)


def fix_math(text):
    """Run the full clean-up pipeline over a piece of math text.

    The steps are applied in this order, each receiving the previous
    step's output:

    1. ``fix_text`` (from ftfy) to repair the raw text,
    2. ``remove_labels`` to drop LaTeX labels and references,
    3. ``replace_katex_invalid`` to rewrite commands KaTeX cannot render,
    4. ``fix_fences`` to balance the ``$`` delimiters.

    Returns the cleaned string.
    """
    pipeline = (fix_text, remove_labels, replace_katex_invalid, fix_fences)
    for step in pipeline:
        text = step(text)
    return text


def remove_labels(text):
    """Delete ``\\label{...}``, ``\\ref{...}`` and ``\\pageref{...}`` commands.

    Each command is removed together with its braced argument (everything
    up to the first closing brace). The three substitutions run one after
    another, in the order listed, and the resulting string is returned.
    """
    command_patterns = (
        r'\\label\{[^}]*\}',
        r'\\ref\{[^}]*\}',
        r'\\pageref\{[^}]*\}',
    )
    for command_pattern in command_patterns:
        text = re.sub(command_pattern, '', text)
    return text


def replace_katex_invalid(string):
    """Rewrite LaTeX constructs that KaTeX cannot render.

    Applied in order:

    * ``\\tag{...}`` is removed entirely.
    * ``\\big{...}``, ``\\bigg{...}``, ``\\Big{...}`` and ``\\Bigg{...}`` are
      replaced by their braced content.
    * ``\\quad\\mbox{...}`` and then plain ``\\mbox{...}`` are replaced by
      their braced content.
    * Finally the text is passed through ``remove_inner_dollars``.

    The braced arguments are matched lazily up to the first ``}`` and do
    not span newlines.
    """
    # Order matters: the ``\quad\mbox`` rule must run before the bare
    # ``\mbox`` rule so the leading ``\quad`` is dropped as well.
    rewrites = (
        (r'\\tag\{.*?\}', ''),
        (r'\\(?:Bigg?|bigg?)\{(.*?)\}', r'\1'),
        (r'\\quad\\mbox\{(.*?)\}', r'\1'),
        (r'\\mbox\{(.*?)\}', r'\1'),
    )
    for pattern, replacement in rewrites:
        string = re.sub(pattern, replacement, string)
    return remove_inner_dollars(string)


def remove_inner_dollars(text):
    """Strip stray ``$`` characters found inside ``$$...$$`` blocks.

    Every ``$$...$$`` span (matched lazily, newlines included) is rebuilt
    with the same ``$$`` delimiters but with all ``$`` characters removed
    from its interior. Text outside such spans is returned unchanged.
    """
    display_block = re.compile(r'\$\$(.*?)\$\$', flags=re.DOTALL)
    return display_block.sub(
        lambda found: '$$' + found.group(1).replace('$', '') + '$$',
        text,
    )


def extract_latex_with_positions(text):
    """Locate every ``$$...$$`` or ``$...$`` span in *text*.

    Returns a list of ``(matched_text, start, end)`` tuples in order of
    appearance, where ``start``/``end`` are the match offsets in *text*.
    Matching is lazy, tries the ``$$`` form before the ``$`` form, and may
    span newlines.
    """
    fenced_math = re.compile(r'(\$\$.*?\$\$|\$.*?\$)', re.DOTALL)
    return [
        (found.group(), found.start(), found.end())
        for found in fenced_math.finditer(text)
    ]


def slice_latex(text):
    """Split *text* into alternating plain-text and LaTeX chunks.

    The LaTeX spans come from ``extract_latex_with_positions``. The result
    is a list of dicts, each with a ``"text"`` key holding the chunk and a
    ``"type"`` key that is either ``"text"`` or ``"latex"``. Chunks appear
    in document order and empty plain-text chunks are omitted.
    """
    pieces = []
    cursor = 0  # offset of the first character not yet emitted

    for latex, begin, finish in extract_latex_with_positions(text):
        # Plain text sitting between the previous span and this one.
        leading = text[cursor:begin]
        if leading:
            pieces.append({"text": leading, "type": "text"})
        pieces.append({"text": latex, "type": "latex"})
        cursor = finish

    # Whatever follows the final LaTeX span (or the whole input if none).
    trailing = text[cursor:]
    if trailing:
        pieces.append({"text": trailing, "type": "text"})

    return pieces


def is_latex(text):
    """Heuristically decide whether *text* contains LaTeX markup.

    Returns True if any of the following occurs anywhere in *text*:
    a ``\\begin{...}``/``\\end{...}`` command, a ``$...$`` or ``$$...$$``
    span, a backslash followed by letters, or a backslash followed by a
    single non-letter character. Otherwise returns False.
    """
    markers = (
        r'\\(?:begin|end)\{[a-zA-Z]*\}',
        r'\$.*?\$',
        r'\$\$.*?\$\$',
        r'\\[a-zA-Z]+',
        r'\\[^a-zA-Z]',
    )
    any_marker = '|'.join(markers)
    return re.search(any_marker, text, re.DOTALL) is not None


def fix_fences(text):
    """Balance the ``$`` fences at the two ends of *text*.

    When exactly one end carries a fence, or the ends carry fences of
    different widths, the result is wrapped in ``$$`` on both sides:

    * ``$$x`` / ``$$x$``  -> ``$$x$$`` (closing fence completed)
    * ``x$$`` / ``$x$$``  -> ``$$x$$`` (opening fence completed)
    * ``$x``              -> ``$$x$$``
    * ``x$``              -> ``$$x$$``

    Text whose ends already agree (``$x$``, ``$$x$$``) or that has no
    fence at either end is returned unchanged.
    """
    opens_double = text.startswith("$$")
    closes_double = text.endswith("$$")

    # Opening ``$$`` without a matching close: top up the right-hand side.
    if opens_double and not closes_double:
        return text + ("$" if text.endswith("$") else "$$")

    # Closing ``$$`` without a matching open: top up the left-hand side.
    if closes_double and not opens_double:
        return ("$" if text.startswith("$") else "$$") + text

    opens_single = text.startswith("$")
    closes_single = text.endswith("$")

    # A lone opening ``$`` and no closing fence at all.
    if opens_single and not closes_single:
        return "$" + text + "$$"

    # A lone closing ``$`` and no opening fence at all.
    if closes_single and not opens_single:
        return "$$" + text + "$"

    return text


def strip_fences(text):
    """Return *text* with every leading and trailing ``$`` removed.

    ``$`` characters that are not part of the leading or trailing run are
    left in place. A string made up solely of ``$`` becomes empty.
    """
    # str.strip does this in one pass; the previous hand-rolled loops
    # re-sliced the whole string once per removed character.
    return text.strip("$")