---
license: cc-by-nc-nd-4.0
---
A Longformer-encoder KoBART model trained on AIHUB financial and call-center counseling dialogues, using reference summaries generated with ChatGPT as the training targets. Example input dialogue and generated summary:
input = """κ³ κ°: μλ
νμΈμ, μ κ° μ¬κΈ°μ μ¬μ©νλ μ μ©μΉ΄λμ λν΄ κΆκΈν κ² μμ΄μ.
μλ΄μ: μλ
νμΈμ! λ€, μ΄λ€ λ¬Έμκ° μμΌμ κ°μ?
κ³ κ°: μ κ° μ΄λ² λ¬μ μΉ΄λλ₯Ό μ¬μ©νλ©΄μ 리μλ ν¬μΈνΈλ₯Ό μΌλ§λ μμλμ§ νμΈνκ³ μΆμ΄μ.
μλ΄μ: λ€, λΉμ μ 리μλ ν¬μΈνΈ μμ‘μ νμΈν΄ λ릴 μ μμ΅λλ€. μ κ° λΉμ μ μΉ΄λ λ²νΈλ₯Ό μ
λ ₯νκ³ νμΈν΄λ³Όκ²μ. λ²νΈλ₯Ό μλ €μ£Όμ€ μ μμκΉμ?
κ³ κ°: λ€, μ μΉ΄λ λ²νΈλ 1234-5678-9012-3456μ
λλ€.
μλ΄μ: κ°μ¬ν©λλ€. μ μλ§ κΈ°λ€λ €μ£ΌμΈμ. νμΈ μ€μ΄μμ... λ€, νμ¬ λΉμ μ 리μλ ν¬μΈνΈ μμ‘μ 3,250 ν¬μΈνΈμ
λλ€.
κ³ κ°: μκ² μ΄μ, κ°μ¬ν©λλ€! κ·ΈλΌ μΆκ°μ μΈ μ΄μ© ννμ΄λ ν μΈμ κ΄ν μ 보λ μ»μ μ μμκΉμ?
μλ΄μ: λ¬Όλ‘ μ΄μ£ ! μ ν¬ μΉ΄λμ¬λ λ€μν μ΄μ© ννμ μ 곡νκ³ μμ΅λλ€. μλ₯Ό λ€μ΄, μ¬ν, μΌν, μμ¬ λ± λ€μν λΆμΌμμ ν μΈ ννμ λ°μ μ μκ±°λ, 리μλ ν¬μΈνΈλ₯Ό μ¬μ©νμ¬ μνμ΄λ κΈ°ννΈ μΉ΄λλ‘ κ΅νν μ μμ΅λλ€. μ΄λ€ ννμ κ΄μ¬μ΄ μμΌμ κ°μ?
κ³ κ°: μ λ μ¬ν ν μΈμ΄λ λ§μΌλ¦¬μ§ μ 립μ κ΄μ¬μ΄ μμ΄μ.
μλ΄μ: κ·Έλ° κ²½μ°μλ λΉμ μκ² μ ν©ν μ¬ν μΉ΄λ ννμ μ 곡νλ μΉ΄λλ₯Ό μΆμ²ν΄ λ릴 μ μμ΅λλ€. μ¬ν μΉ΄λλ νκ³΅μ¬ λ§μΌλ¦¬μ§λ₯Ό μμ μ μκ³ , νΈν
ν μΈ ννμ λ°μ μλ μμ΅λλ€. μ κ° λͺ κ°μ§ μ΅μ
μ μ μν΄ λ³ΌκΉμ?
κ³ κ°: λ€, κ·Έλ¬λ©΄ μ’μ κ² κ°μμ. κ°μ¬ν©λλ€!
μλ΄μ: λ§μν΄ μ£Όμ
μ κ°μ¬ν©λλ€. μ΄μ μ κ° λͺ κ°μ§ μΆμ²μ λ리λλ‘ νκ² μ΅λλ€. μ΄λ€ ν곡μ¬λ₯Ό μ£Όλ‘ μ΄μ©νμλμ?"""
output ="""
- κ³ κ°μ΄ μ μ©μΉ΄λμ λν΄ κΆκΈν μ¬ν μλ΄
- 리μλ ν¬μΈνΈ νμΈ μμ²
- μλ΄μμ΄ μΉ΄λ λ²νΈμ μμ‘ νμΈ ν μΆκ° μ΄μ© νν μλ΄
- κ³ κ°μ΄ μ¬ν ν μΈ, λ§μΌλ¦¬μ§, νΈν
ν μΈ λ± λ€μν ννμ κ΄μ¬ νν
"""
To use this model, the following classes are required:
```python
from typing import List, Optional, Tuple

import torch
import torch.nn as nn
from transformers import BartConfig, BartForConditionalGeneration
from transformers.models.bart.modeling_bart import BartLearnedPositionalEmbedding
from transformers.models.longformer.modeling_longformer import LongformerSelfAttention


class LongformerSelfAttentionForBart(nn.Module):
    """Longformer self-attention wrapped so it can replace BART encoder self-attention."""

    def __init__(self, config, layer_id):
        super().__init__()
        self.embed_dim = config.d_model
        self.longformer_self_attn = LongformerSelfAttention(config, layer_id=layer_id)
        self.output = nn.Linear(self.embed_dim, self.embed_dim)

    def forward(
        self,
        hidden_states: torch.Tensor,
        key_value_states: Optional[torch.Tensor] = None,
        past_key_value: Optional[Tuple[torch.Tensor]] = None,
        attention_mask: Optional[torch.Tensor] = None,
        layer_head_mask: Optional[torch.Tensor] = None,
        output_attentions: bool = False,
    ) -> Tuple[torch.Tensor, Optional[torch.Tensor], Optional[Tuple[torch.Tensor]]]:
        is_cross_attention = key_value_states is not None
        bsz, tgt_len, embed_dim = hidden_states.size()

        # Collapse the mask from bs x seq_len x seq_len to bs x seq_len,
        # which is the shape LongformerSelfAttention expects.
        attention_mask = attention_mask.squeeze(dim=1)
        attention_mask = attention_mask[:, 0]
        is_index_masked = attention_mask < 0
        is_index_global_attn = attention_mask > 0
        is_global_attn = is_index_global_attn.flatten().any().item()

        outputs = self.longformer_self_attn(
            hidden_states,
            attention_mask=attention_mask,
            layer_head_mask=None,
            is_index_masked=is_index_masked,
            is_index_global_attn=is_index_global_attn,
            is_global_attn=is_global_attn,
            output_attentions=output_attentions,
        )

        attn_output = self.output(outputs[0])

        return (attn_output,) + outputs[1:] if len(outputs) == 2 else (attn_output, None, None)


class LongformerEncoderDecoderForConditionalGeneration(BartForConditionalGeneration):
    def __init__(self, config):
        super().__init__(config)
        if config.attention_mode == 'n2':
            pass  # do nothing, keep the default BART self-attention
        else:
            # Extend the positional embeddings to the longer encoder/decoder context
            # and swap every encoder self-attention layer for the Longformer version.
            self.model.encoder.embed_positions = BartLearnedPositionalEmbedding(
                config.max_encoder_position_embeddings,
                config.d_model)
            self.model.decoder.embed_positions = BartLearnedPositionalEmbedding(
                config.max_decoder_position_embeddings,
                config.d_model)
            for i, layer in enumerate(self.model.encoder.layers):
                layer.self_attn = LongformerSelfAttentionForBart(config, layer_id=i)


class LongformerEncoderDecoderConfig(BartConfig):
    def __init__(self, attention_window: List[int] = None, attention_dilation: List[int] = None,
                 autoregressive: bool = False, attention_mode: str = 'sliding_chunks',
                 gradient_checkpointing: bool = False, **kwargs):
        """
        Args:
            attention_window: list of attention window sizes of length = number of layers.
                window size = number of attention locations on each side.
                For an effective window size of 512, use `attention_window=[256]*num_layers`,
                which is 256 on each side.
            attention_dilation: list of attention dilations of length = number of layers.
                attention dilation of `1` means no dilation.
            autoregressive: do autoregressive attention or attend to both sides
            attention_mode: 'n2' for regular n^2 self-attention, 'tvm' for the TVM implementation
                of Longformer self-attention, 'sliding_chunks' for another implementation of
                Longformer self-attention
        """
        super().__init__(**kwargs)
        self.attention_window = attention_window
        self.attention_dilation = attention_dilation
        self.autoregressive = autoregressive
        self.attention_mode = attention_mode
        self.gradient_checkpointing = gradient_checkpointing
        assert self.attention_mode in ['tvm', 'sliding_chunks', 'n2']
```
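For reference, the documented config arguments can be combined as in the minimal sketch below. The layer count, window sizes, and position-embedding limits are illustrative assumptions, not the settings stored in the published checkpoint; in normal use the config is loaded automatically by `from_pretrained`.

```python
# Minimal sketch only -- the values here are assumptions for illustration,
# not the published checkpoint's actual configuration.
config = LongformerEncoderDecoderConfig(
    attention_window=[256] * 6,   # effective window of 512 tokens per encoder layer
    attention_dilation=[1] * 6,   # no dilation
    autoregressive=False,
    attention_mode='sliding_chunks',
    max_encoder_position_embeddings=4096,  # long input side
    max_decoder_position_embeddings=1024,  # summary side
    d_model=768,
)
```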
After creating the model object, download the weight file separately and load the weights with `load_state_dict`:
```python
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("cocoirun/longforemr-kobart-summary-v1")
model = LongformerEncoderDecoderForConditionalGeneration.from_pretrained("cocoirun/longforemr-kobart-summary-v1")

device = torch.device('cuda')
model.load_state_dict(torch.load("summary weight.ckpt"))
model.to(device)
```
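If no GPU is available, the same checkpoint can also be loaded onto the CPU with `map_location`. This variant is a sketch, not part of the original instructions, and it assumes the weight file name from the snippet above.

```python
# Variant (assumption): load the weights on CPU when CUDA is unavailable.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
state_dict = torch.load("summary weight.ckpt", map_location=device)
model.load_state_dict(state_dict)
model.to(device)
model.eval()  # inference only
```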
Summarization function:
```python
def summarize(text, max_len):
    max_seq_len = 4096
    context_tokens = ['<s>'] + tokenizer.tokenize(text) + ['</s>']
    input_ids = tokenizer.convert_tokens_to_ids(context_tokens)

    # Pad up to the 4096-token encoder context, or truncate and close with </s>.
    if len(input_ids) < max_seq_len:
        while len(input_ids) < max_seq_len:
            input_ids += [tokenizer.pad_token_id]
    else:
        input_ids = input_ids[:max_seq_len - 1] + [tokenizer.eos_token_id]

    res_ids = model.generate(torch.tensor([input_ids]).to(device),
                             max_length=max_len,
                             num_beams=5,
                             no_repeat_ngram_size=3,
                             eos_token_id=tokenizer.eos_token_id,
                             bad_words_ids=[[tokenizer.unk_token_id]])

    res = tokenizer.batch_decode(res_ids.tolist(), skip_special_tokens=True)[0]
    res = res.replace("\n\n", "\n")
    return res
```
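The function can then be applied to the example dialogue shown at the top of this card. The `max_len` value below is an arbitrary choice for illustration, not a setting from the model card.

```python
# Example call; max_len=128 is an arbitrary illustrative limit.
summary = summarize(input, 128)
print(summary)
```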