import re

import torch
from transformers import Pipeline


class MyPipeline(Pipeline):
    def _sanitize_parameters(self, **kwargs):
        # Split user kwargs into (preprocess, forward, postprocess) dicts;
        # only preprocess takes extra parameters in this pipeline.
        preprocess_kwargs = {}
        if "max_length" in kwargs:
            preprocess_kwargs["max_length"] = kwargs["max_length"]
        if "num_beams" in kwargs:
            preprocess_kwargs["num_beams"] = kwargs["num_beams"]
        return preprocess_kwargs, {}, {}

    def preprocess(self, inputs, **kwargs):
        # Keep only Latin letters, Hangul, digits, and a few allowed symbols.
        inputs = re.sub(r"[^A-Za-z가-힣,<>0-9:&# ]", "", inputs)
        # Prepend the task prompt; "질문 생성" is Korean for "question generation".
        inputs = "질문 생성: <unused0>" + inputs
        input_ids = (
            [self.tokenizer.bos_token_id]
            + self.tokenizer.encode(inputs)
            + [self.tokenizer.eos_token_id]
        )
        return {
            "inputs": torch.tensor([input_ids]),
            # Fall back to defaults (assumed values) when the caller passes no
            # generation options, since make_question may omit them.
            "max_length": kwargs.get("max_length", 128),
            "num_beams": kwargs.get("num_beams", 4),
        }

    def _forward(self, model_inputs):
        res_ids = self.model.generate(
            model_inputs["inputs"],
            max_length=model_inputs["max_length"],
            num_beams=model_inputs["num_beams"],
            eos_token_id=self.tokenizer.eos_token_id,
            bad_words_ids=[[self.tokenizer.unk_token_id]],  # never emit <unk>
        )
        # Despite the key name, these are generated token ids, not raw logits.
        return {"logits": res_ids}

    def postprocess(self, model_outputs):
        decoded = self.tokenizer.batch_decode(model_outputs["logits"].tolist())[0]
        # Strip the BOS/EOS markers left in the decoded string.
        out_question = decoded.replace("<s>", "").replace("</s>", "")
        return out_question

    def _inference(self, paragraph, **kwargs):
        # Run the three pipeline stages directly on a single paragraph.
        model_inputs = self.preprocess(paragraph, **kwargs)
        model_outputs = self._forward(model_inputs)
        return self.postprocess(model_outputs)

    def make_question(self, text, **kwargs):
        # Slide a window of frame_size words across the text, advancing by
        # hop_length words each step, and generate one question per window.
        words = text.split(" ")
        frame_size = kwargs["frame_size"]
        hop_length = kwargs["hop_length"]
        # Guarantee at least one window, even for texts shorter than frame_size.
        steps = max(round((len(words) - frame_size) / hop_length) + 1, 1)
        outs = []
        for step in range(steps):
            # Slicing past the end of a list is safe in Python: the final
            # window is simply shorter, so no exception handling is needed.
            script = " ".join(words[step * hop_length:step * hop_length + frame_size])
            outs.append(self._inference(script, **kwargs))
        return outs
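

if __name__ == "__main__":
    # Minimal usage sketch, not part of the pipeline itself. The checkpoint id
    # "author/korean-question-generation" is a hypothetical placeholder, and
    # AutoModelForCausalLM is an assumption about the underlying model type;
    # substitute the checkpoint this pipeline actually ships with.
    from transformers import AutoModelForCausalLM, AutoTokenizer

    ckpt = "author/korean-question-generation"  # placeholder id
    tok = AutoTokenizer.from_pretrained(ckpt)
    mdl = AutoModelForCausalLM.from_pretrained(ckpt)
    pipe = MyPipeline(model=mdl, tokenizer=tok)

    text = "..."  # a long Korean passage to window over
    questions = pipe.make_question(
        text,
        frame_size=100,  # words per window
        hop_length=50,   # words the window advances each step
        max_length=128,
        num_beams=4,
    )
    print(questions)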