Spaces:

xu1998hz
/

sescore

Build error

App Files Files Community

sescore / app.py

xu1998hz

Update app.py

9c687ce about 2 years ago

raw

history blame contribute delete

3.33 kB

	import evaluate
	import sys
	from pathlib import Path
	from evaluate.utils import infer_gradio_input_types, json_to_string_type, parse_readme, parse_gradio_data, parse_test_cases


	def launch_gradio_widget(metric):
	    """Build and launch the SEScore demo widget for `metric` with Gradio.

	    Args:
	        metric: Metric object exposing `features` (a mapping of input name to
	            feature type, or a list of such mappings), `name` (used as the
	            output label) and `compute(**kwargs)`.

	    Raises:
	        ImportError: If `gradio` is not installed. The problem is logged
	            before the original exception is re-raised.
	    """
	    try:
	        import gradio as gr
	    except ImportError:
	        # No module-level `logger` exists in this file, so obtain one here;
	        # otherwise this branch would die with NameError and hide the cause.
	        import logging

	        logging.getLogger(__name__).error(
	            "To create a metric widget with Gradio make sure gradio is installed."
	        )
	        raise

	    local_path = Path(sys.path[0])
	    # If there are several input types, use the first as default.
	    if isinstance(metric.features, list):
	        features = metric.features[0]
	    else:
	        features = metric.features
	    (feature_names, feature_types) = zip(*features.items())
	    gradio_input_types = infer_gradio_input_types(feature_types)

	    def compute(data):
	        """Convert the submitted table into keyword arguments and score it."""
	        return metric.compute(**parse_gradio_data(data, gradio_input_types))

	    # The literal spans several lines; newlines are flattened to spaces below.
	    header_html = '''<div style="max-width:800px; margin:auto; float:center; margin-top:0; margin-bottom:0; padding:0;">
	<img src="https://huggingface.co/spaces/xu1998hz/sescore/resolve/main/img/logo_sescore.png" style="margin:0; padding:0; margin-top:-10px; margin-bottom:-50px;">
	</div>
	<h2 style='margin-top: 5pt; padding-top:10pt;'>About <i>SEScore</i></h2>

	<p><b>SEScore</b> is a reference-based text-generation evaluation metric that requires no pre-human-annotated error data,
	described in our paper <a href="https://arxiv.org/abs/2210.05035"><b>"Not All Errors are Equal: Learning Text Generation Metrics using
	Stratified Error Synthesis"</b></a> from EMNLP 2022.</p>

	<p>Its effectiveness over prior methods like BLEU, BERTScore, BARTScore, PRISM, COMET and BLEURT has been demonstrated on a diverse set of language generation tasks, including
	translation, captioning, and web text generation. <a href="https://twitter.com/LChoshen/status/1580136005654700033">Readers have even described SEScore as "one unsupervised evaluation to rule them all"</a>
	and we are very excited to share it with you!</p>

	<h2 style='margin-top: 10pt; padding-top:0;'>Try it yourself!</h2>
	<p>Provide sample (gold) reference text and (model output) predicted text below and see how SEScore rates them! It is most performant
	in a relative ranking setting, so in general <b>it will rank better predictions higher than worse ones.</b> Providing useful
	absolute numbers based on SEScore is an ongoing direction of investigation.</p>
	'''.replace('\n',' ')

	    tail_markdown = parse_readme(local_path / "description.md")

	    # Prefer the top-level component classes; the `gr.inputs` / `gr.outputs`
	    # namespaces are legacy and absent from newer Gradio releases. Fall back
	    # to them only when the top-level names are unavailable.
	    dataframe_cls = getattr(gr, "Dataframe", None)
	    if dataframe_cls is None:
	        dataframe_cls = gr.inputs.Dataframe
	    textbox_cls = getattr(gr, "Textbox", None)
	    if textbox_cls is None:
	        textbox_cls = gr.outputs.Textbox

	    iface = gr.Interface(
	        fn=compute,
	        inputs=dataframe_cls(
	            headers=feature_names,
	            col_count=len(feature_names),
	            row_count=2,
	            datatype=json_to_string_type(gradio_input_types),
	        ),
	        outputs=textbox_cls(label=metric.name),
	        description=header_html,
	        #title=f"SEScore Metric Usage Example",
	        article=tail_markdown,
	        # TODO: load test cases and use them to populate examples
	        # examples=[parse_test_cases(test_cases, feature_names, gradio_input_types)]
	    )

	    iface.launch()



	# Script entry: load the metric registered as "xu1998hz/sescore" through
	# `evaluate.load` and hand it to the Gradio widget launcher defined above.
	# Both statements run at import time of this file.
	module = evaluate.load("xu1998hz/sescore")
	launch_gradio_widget(module)