File size: 1,365 Bytes
5a692ce
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49


import gradio as gr
from transformers import pipeline
import tokenizer
from difflib import Differ, SequenceMatcher

# Sample sentence pair used as input by the demo print calls at the end of
# this file (passed to diff_texts and, split on spaces, to markup_diff).
text1 = "Kver á á þenan bússtað"
text2 = "Hver á þennan bústað?"

def diff_texts(text1, text2):
    """Diff two sequences element by element and label each element.

    Both arguments are handed to ``difflib.Differ.compare``; when they are
    plain strings the comparison is therefore character by character.

    Returns a list of ``(element, label)`` tuples, one per entry produced by
    the differ. ``element`` is the entry with its two-character prefix
    removed, and ``label`` is the first character of that prefix (for
    example ``"+"`` or ``"-"``), or ``None`` when that character is a space,
    i.e. the element is common to both inputs.
    """
    labelled = []
    for entry in Differ().compare(text1, text2):
        code = entry[0]
        # A blank code marks an unchanged element; report it as unlabelled.
        label = None if code == " " else code
        labelled.append((entry[2:], label))
    return labelled

def split_text(text):
    """Return the sentences of ``text`` as a list.

    Delegates to ``tokenizer.split_into_sentences`` with ``original=True``
    and materialises whatever it yields into a list.
    NOTE(review): ``original=True`` presumably keeps each sentence's original
    text/spacing -- confirm against the tokenizer package documentation.
    """
    return list(tokenizer.split_into_sentences(text, original=True))

def mark_text(text, tag):
    """Pair ``text`` with ``tag`` and return them as a 2-tuple."""
    pair = text, tag
    return pair
    
def mark_span(text, tag):
    """Apply ``mark_text`` to every element of ``text`` using the same ``tag``.

    ``text`` is any iterable of tokens; the result is a list with one
    ``mark_text(token, tag)`` value per token, in iteration order.
    """
    marked = []
    for token in text:
        marked.append(mark_text(token, tag))
    return marked
    
def markup_diff(a, b,
                mark=mark_span,
                default_mark=lambda x: x,
                isjunk=None):
    """Tag every span of ``a`` and ``b`` with its SequenceMatcher opcode.

    The two sequences are aligned with ``difflib.SequenceMatcher`` (automatic
    junk detection switched off via ``autojunk=False``). For each opcode the
    corresponding slice of ``a`` and of ``b`` is passed to ``mark`` together
    with the opcode tag, and the results are concatenated in order.

    Parameters:
        a, b: the sequences to compare; they must support slicing and
            ``len``.
        mark: callable ``mark(span, tag)`` returning an iterable with one
            item per element of ``span``. It is applied to every span,
            including those tagged ``'equal'``.
        default_mark: accepted for interface compatibility but not used by
            the current implementation.
        isjunk: forwarded unchanged to ``SequenceMatcher`` as its ``isjunk``
            argument.

    Returns:
        A ``(marked_a, marked_b)`` pair of lists.

    Raises:
        AssertionError: if the marked output for either side does not have
            the same length as its input sequence.
    """
    matcher = SequenceMatcher(isjunk=isjunk, a=a, b=b, autojunk=False)

    marked_a = []
    marked_b = []
    # Every opcode goes through `mark`; `default_mark` is intentionally not
    # consulted, so unchanged spans are tagged 'equal' rather than passed
    # through untouched.
    for opcode, a_lo, a_hi, b_lo, b_hi in matcher.get_opcodes():
        marked_a.extend(mark(a[a_lo:a_hi], opcode))
        marked_b.extend(mark(b[b_lo:b_hi], opcode))

    # Sanity check: marking must neither drop nor add elements.
    assert len(marked_a) == len(a)
    assert len(marked_b) == len(b)
    return marked_a, marked_b

# Demo output, executed whenever this module is run or imported.
# First: element-wise diff of the two sample strings (strings are compared
# character by character).
print(diff_texts(text1, text2))
# Second: opcode-tagged comparison of the samples split on single spaces.
print(markup_diff(text1.split(" "), text2.split(" ")))