from io import StringIO import itertools import gradio as gr import pandas as pd import spacy nlp = spacy.load('en_core_web_sm') HTML_RED = '{t}' HTML_GRN = '{t}' HTML_YLW = '{t}' HTML_BLU = '{t}' HTML_PLN = '{t}' TABLE_CSS = ''' th, td { padding: 4px; } table, th, td { border: 1px solid black; border-collapse: collapse; } ''' def colorize(file_obj): with open(file_obj.name, 'r') as f: raw = f.read() raw = raw[raw.find('example_id'):] data = pd.read_csv(StringIO(raw)) table_content = [] for row in data.iterrows(): id_ = row[1]['example_id'] gold, genA, genB = nlp.pipe(( row[1]['target summary'], row[1]['model summary A'], row[1]['model summary B'] )) tokens_gold = {token.lemma_.lower(): 0 for token in gold} for token in itertools.chain(genA, genB): if token.lemma_.lower() in tokens_gold: tokens_gold[token.lemma_.lower()] += 1 gold_text = ''.join([ ( HTML_PLN.format(t=token.text) if token.pos_ not in {'NOUN', 'PROPN', 'VERB'} else ( ( HTML_BLU if tokens_gold[token.lemma_.lower()] > 0 else HTML_YLW ).format(t=token.text) ) ) + token.whitespace_ for token in gold ]) table_content.append( [id_, gold_text] + [ ''.join( ( HTML_PLN.format(t=token.text) if token.pos_ not in {'NOUN', 'PROPN', 'VERB'} else ( HTML_GRN.format(t=token.text) if token.lemma_.lower() in tokens_gold else HTML_RED.format(t=token.text) ) ) + token.whitespace_ for token in gen ) for gen in (genA, genB) ] ) # return an HTML table using data in table_content return '\n'.join(( '
id | ", "Gold | ", "Model A | ", "Model B | ", "
{} | '.format(cell) for cell in row) + '\n