# Author: Larisa Kolesnichenko
# Make processing of punctuation consistent with train data: frame each symbol with spaces
# (commit 099a2f3)
import os
import json
import tempfile
import sys
import datetime
import re
import string
sys.path.append('mtool')
import torch
from model.model import Model
from data.dataset import Dataset
from config.params import Params
from utility.initialize import initialize
from data.batch import Batch
from mtool.main import main as mtool_main
from tqdm import tqdm
class PredictionModel:
    """End-to-end wrapper around a trained graph-based sentiment model.

    Loads a serialized checkpoint together with its training-time parameters,
    rebuilds the model/dataset pair, and exposes ``predict``, which turns raw
    input strings into per-sentence graph predictions serialized through mtool
    (framework 'norec').
    """

    def __init__(self, checkpoint_path=os.path.join('models', 'checkpoint.bin'),
                 default_mrp_path=os.path.join('models', 'default.mrp'),
                 verbose=False):
        """Restore the model from a checkpoint and put it in eval mode.

        checkpoint_path: torch checkpoint containing 'params' and 'model'.
        default_mrp_path: placeholder .mrp file wired into the train/val/test
            slots that the Dataset constructor requires even at inference time.
        verbose: when True, wrap prediction batches in a tqdm progress bar.
        """
        self.verbose = verbose
        # Bug fix: the caller-supplied checkpoint_path was previously ignored
        # in favour of a hard-coded './models/checkpoint.bin'.
        self.checkpoint = torch.load(checkpoint_path, map_location=torch.device('cpu'))
        self.args = Params().load_state_dict(self.checkpoint['params'])
        self.args.log_wandb = False
        self.device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')
        # All three data slots must point at a valid .mrp file for Dataset to
        # build its vocabularies; the default placeholder serves that purpose.
        self.args.training_data = default_mrp_path
        self.args.validation_data = default_mrp_path
        self.args.test_data = default_mrp_path
        self.args.only_train = False
        self.args.encoder = os.path.join('models', 'encoder')
        initialize(self.args, init_wandb=False)
        self.dataset = Dataset(self.args, verbose=False)
        self.model = Model(self.dataset, self.args).to(self.device)
        self.model.load_state_dict(self.checkpoint["model"])
        self.model.eval()

    def _mrp_to_text(self, mrp_list, graph_mode='labeled-edge'):
        """Convert MRP graph dicts to textual output via mtool.

        mrp_list: iterable of JSON-serializable graph dicts (one per sentence).
        graph_mode: 'labeled-edge' or 'node-centric'; selects mtool's reading.
        Returns whatever JSON structure mtool writes for the 'norec' framework.
        Raises ValueError for an unknown graph_mode.
        """
        framework = 'norec'
        # Validate the mode first so a bad value cannot leak temp files.
        if graph_mode == 'labeled-edge':
            mode_args = []
        elif graph_mode == 'node-centric':
            mode_args = ['--node_centric']
        else:
            raise ValueError(f'Unknown graph mode: {graph_mode}')
        # mtool operates on file paths, so stage input and output as named
        # temp files (delete=False: the paths must outlive the `with` blocks).
        with tempfile.NamedTemporaryFile(delete=False, mode='w') as output_text_file:
            output_text_filename = output_text_file.name
        with tempfile.NamedTemporaryFile(delete=False, mode='w') as mrp_file:
            mrp_file.write('\n'.join(json.dumps(entry) for entry in mrp_list))
            mrp_filename = mrp_file.name
        try:
            mtool_main(mode_args + [
                '--strings',
                '--ids',
                '--read', 'mrp',
                '--write', framework,
                mrp_filename, output_text_filename,
            ])
            with open(output_text_filename) as f:
                texts = json.load(f)
        finally:
            # Clean up even if mtool or json.load raises.
            os.unlink(output_text_filename)
            os.unlink(mrp_filename)
        return texts

    def clean_texts(self, texts):
        """Frame each punctuation symbol with spaces, matching the train data.

        E.g. 'Hello, world!' -> 'Hello , world ! '. Runs of spaces produced by
        the substitution are collapsed back to a single space.
        """
        # re.escape builds the same safely-escaped character class that the
        # previous per-character '\\{s}' loop produced by hand.
        punct_class = re.escape(string.punctuation)
        texts = [re.sub(f'([{punct_class}])', ' \\1 ', t) for t in texts]
        texts = [re.sub(r' +', ' ', t) for t in texts]
        return texts

    def _predict_to_mrp(self, texts, graph_mode='labeled-edge'):
        """Run the model over cleaned texts and assemble MRP-style dicts.

        Returns a dict keyed by stringified sentence index; each value carries
        the input sentence, metadata, and the model's predicted graph fields.
        NOTE(review): graph_mode is currently unused here — the graph style is
        fixed by the loaded checkpoint; the parameter is kept for API symmetry.
        """
        texts = self.clean_texts(texts)
        framework, language = self.args.framework, self.args.language
        data = self.dataset.load_sentences(texts, self.args)
        date_str = datetime.datetime.now().date().isoformat()
        # Pre-populate one skeleton entry per sentence; predictions fill in
        # (or overwrite) nodes/edges/tops below.
        res_sentences = {
            str(i): {
                'input': sentence,
                'id': str(i),
                'time': date_str,
                'framework': framework,
                'language': language,
                'nodes': [],
                'edges': [],
                'tops': [],
            }
            for i, sentence in enumerate(texts)
        }
        batches = tqdm(data) if self.verbose else data
        for batch in batches:
            with torch.no_grad():
                predictions = self.model(Batch.to(batch, self.device), inference=True)
            for prediction in predictions:
                # Merge every predicted field into the matching skeleton entry.
                res_sentences[prediction['id']].update(prediction)
        return res_sentences

    def predict(self, text_list, graph_mode='labeled-edge', language='no'):
        """Predict sentiment graphs for a list of raw texts.

        text_list: raw input sentences.
        graph_mode: forwarded to prediction and mtool conversion.
        NOTE(review): `language` is accepted but unused — the language comes
        from the checkpoint's stored args; kept for backward compatibility.
        """
        mrp_predictions = self._predict_to_mrp(text_list, graph_mode)
        predictions = self._mrp_to_text(mrp_predictions.values(), graph_mode)
        return predictions