import markdown import re import os import math from textwrap import dedent from functools import lru_cache from pymdownx.superfences import fence_code_format from latex2mathml.converter import convert as tex2mathml from shared_utils.config_loader import get_conf as get_conf from shared_utils.text_mask import apply_gpt_academic_string_mask markdown_extension_configs = { "mdx_math": { "enable_dollar_delimiter": True, "use_gitlab_delimiters": False, }, } code_highlight_configs = { "pymdownx.superfences": { "css_class": "codehilite", "custom_fences": [ {"name": "mermaid", "class": "mermaid", "format": fence_code_format} ], }, "pymdownx.highlight": { "css_class": "codehilite", "guess_lang": True, # 'auto_title': True, # 'linenums': True }, } code_highlight_configs_block_mermaid = { "pymdownx.superfences": { "css_class": "codehilite", # "custom_fences": [ # {"name": "mermaid", "class": "mermaid", "format": fence_code_format} # ], }, "pymdownx.highlight": { "css_class": "codehilite", "guess_lang": True, # 'auto_title': True, # 'linenums': True }, } def tex2mathml_catch_exception(content, *args, **kwargs): try: content = tex2mathml(content, *args, **kwargs) except: content = content return content def replace_math_no_render(match): content = match.group(1) if "mode=display" in match.group(0): content = content.replace("\n", "
") return f'$${content}$$' else: return f'${content}$' def replace_math_render(match): content = match.group(1) if "mode=display" in match.group(0): if "\\begin{aligned}" in content: content = content.replace("\\begin{aligned}", "\\begin{array}") content = content.replace("\\end{aligned}", "\\end{array}") content = content.replace("&", " ") content = tex2mathml_catch_exception(content, display="block") return content else: return tex2mathml_catch_exception(content) def markdown_bug_hunt(content): """ 解决一个mdx_math的bug（单$包裹begin命令时多余\n", "") return content def is_equation(txt): """ 判定是否为公式 | 测试1 写出洛伦兹定律，使用tex格式公式测试2 给出柯西不等式，使用latex格式测试3 写出麦克斯韦方程组 """ if "```" in txt and "```reference" not in txt: return False if "$" not in txt and "\\[" not in txt: return False mathpatterns = { r"(?^[ \t]*(?:~{3,}|`{3,}))[ ]* # opening fence ((\{(?P[^\}\n]*)\})| # (optional {attrs} or (\.?(?P[\w#.+-]*)[ ]*)? # optional (.)lang (hl_lines=(?P"|')(?P.*?)(?P=quot)[ ]*)?) # optional hl_lines) \n # newline (end of opening fence) (?P

.*?)(?<=\n)                                     # the code block
        (?P=fence)[ ]*$                                          # closing fence
    """
    ),
    re.MULTILINE | re.DOTALL | re.VERBOSE,
)


def get_line_range(re_match_obj, txt):
    """Translate a regex match's character span into a line-index range.

    Args:
        re_match_obj: A match object produced by searching ``txt`` (or a
            string with identical character offsets).
        txt: The text the offsets refer to.

    Returns:
        A ``(line_start, line_end)`` tuple of 0-based indices into
        ``txt.split("\\n")``; ``line_end`` is exclusive, so the matched
        lines are ``lines[line_start:line_end]``.
    """
    match_begin, match_end = re_match_obj.regs[0]
    # Newlines up to and including the first matched character. The extra
    # character only matters if the match itself begins with a newline.
    first_line = txt.count("\n", 0, match_begin + 1)
    # Each newline inside the match adds one more covered line.
    newlines_inside = txt.count("\n", match_begin, match_end)
    return first_line, first_line + newlines_inside + 1


def fix_code_segment_indent(txt):
    """Pad fenced code blocks whose indentation is one space short of a multiple of 4.

    Every fenced block found by ``FENCED_BLOCK_RE`` is inspected. If the
    smallest indentation shared by its non-blank lines leaves a remainder of 3
    when divided by 4 (e.g. 3 or 7 leading whitespace characters), each
    non-blank line of the block is shifted right to the next multiple of 4.

    Whitespace-only lines are ignored when measuring the shared indentation and
    are left untouched; otherwise a single empty line inside the code block
    would force the shared indentation to 0 and the repair would never apply.

    Args:
        txt: Markdown source text.

    Returns:
        The text with the affected blocks re-indented, or ``txt`` itself
        (the same object) when nothing needed fixing.
    """
    lines = None
    change_any = False
    # Working copy in which already-handled blocks are blanked out so the next
    # search finds the following block. Offsets stay aligned with ``txt``.
    txt_tmp = txt
    while True:
        re_match_obj = FENCED_BLOCK_RE.search(txt_tmp)
        if not re_match_obj:
            break
        if lines is None:
            lines = txt.split("\n")

        # Blank out the matched region (same length) for the next search.
        start_pos, end_pos = re_match_obj.regs[0]
        txt_tmp = txt_tmp[:start_pos] + " " * (end_pos - start_pos) + txt_tmp[end_pos:]
        line_start, line_end = get_line_range(re_match_obj, txt)

        # Shared indentation, measured on non-blank lines only.
        indents = [
            len(line) - len(line.lstrip())
            for line in lines[line_start:line_end]
            if line.strip()
        ]
        if not indents:
            continue
        shared_indent_cnt = min(indents)

        # Repair the indentation only for the "one short of a multiple of 4" case.
        if shared_indent_cnt % 4 == 3:
            num_spaces_should_be = (shared_indent_cnt + 3) // 4 * 4
            padding = " " * (num_spaces_should_be - shared_indent_cnt)
            for i in range(line_start, line_end):
                if lines[i].strip():
                    lines[i] = padding + lines[i]
            change_any = True

    if change_any:
        return "\n".join(lines)
    return txt


def markdown_convertion_for_file(txt):
    """
    将Markdown格式的文本转换为HTML格式。如果包含数学公式，则先将公式转换为HTML格式。
    """
    pre = ''
    suf = ""
    if txt.startswith(pre) and txt.endswith(suf):
        # print('警告，输入了已经经过转化的字符串，二次转化可能出问题')
        return txt  # 已经被转化过，不需要再次转化

    find_equation_pattern = r' $等。 """ if "```" in text: # careful input：markdown输入 text = special_render_issues_for_mermaid(text) # 处理特殊的渲染问题 return text elif "</div>" in text: # careful input：html输入 return text else: # whatever input：非markdown输入 lines = text.split("\n") for i, line in enumerate(lines): lines[i] = lines[i].replace(" ", "&nbsp;") # 空格转换为&nbsp; text = "</br>".join(lines) # 换行符转换为</br> return text @lru_cache(maxsize=128) # 使用lru缓存 def simple_markdown_convertion(text): pre = '<div class="markdown-body">' suf = "</div>" if text.startswith(pre) and text.endswith(suf): return text # 已经被转化过，不需要再次转化 text = compat_non_markdown_input(text) # 兼容非markdown输入 text = markdown.markdown( text, extensions=["pymdownx.superfences", "tables", "pymdownx.highlight"], extension_configs=code_highlight_configs, ) return pre + text + suf def format_io(self, y): """ 将输入和输出解析为HTML格式。将y中最后一项的输入部分段落化，并将输出部分的Markdown和数学公式转换为HTML格式。 """ if y is None or y == []: return [] i_ask, gpt_reply = y[-1] i_ask = apply_gpt_academic_string_mask(i_ask, mode="show_render") gpt_reply = apply_gpt_academic_string_mask(gpt_reply, mode="show_render") # 当代码输出半截的时候，试着补上后个``` if gpt_reply is not None: gpt_reply = close_up_code_segment_during_stream(gpt_reply) # 处理提问与输出 y[-1] = ( # 输入部分 None if i_ask is None else simple_markdown_convertion(i_ask), # 输出部分 None if gpt_reply is None else markdown_convertion(gpt_reply), ) return y$