Spaces:
Runtime error
Runtime error
File size: 2,921 Bytes
eb1ba05 54412bc eb1ba05 e556c82 eb1ba05 9d7201e eb1ba05 aff6f6a eb1ba05 2a014dc eb1ba05 fb622a8 eb1ba05 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 |
import numpy as np
#import itertools
from konlpy.tag import Okt
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import gradio as gr
import pandas as pd
# Fallback: torch and transformers can be installed at runtime through pip.main (the calls below are disabled).
import pip
#pip.main(['install', 'torch'])
#pip.main(['install', 'transformers'])
import torch
import transformers
from transformers import BertTokenizerFast
from transformers import AutoModel
def make_candiadte(prompt):
    """Build candidate key phrases from the nouns found in ``prompt``.

    The text is POS-tagged with Okt, the tokens tagged ``'Noun'`` are joined
    with single spaces, and every 2- and 3-gram that ``CountVectorizer``
    extracts from that noun sequence becomes a candidate.

    Args:
        prompt: Text to extract candidate phrases from.

    Returns:
        Array of candidate phrases as returned by
        ``CountVectorizer.get_feature_names_out()``, or an empty array when
        the text does not yield a single n-gram.
    """
    okt = Okt()
    nouns = " ".join(token for token, tag in okt.pos(prompt) if tag == "Noun")

    vectorizer = CountVectorizer(ngram_range=(2, 3))
    try:
        vectorizer.fit([nouns])
    except ValueError:
        # scikit-learn reports an "empty vocabulary" this way, e.g. when the
        # prompt has fewer than two usable nouns. Treat it as "no candidates"
        # rather than crashing the caller.
        return np.array([], dtype=object)
    return vectorizer.get_feature_names_out()
# saved_model
# Cache for the (model, tokenizer) pair so it is read from disk only once.
_LOADED_MODEL = None


def load_model():
    """Return the encoder model and its tokenizer, loading them on first use.

    The tokenizer is loaded from the ``kykim/bert-kor-base`` checkpoint name
    and the model weights from the local ``./bertmodel/`` directory. The pair
    is cached at module level, so repeated calls return the same objects
    instead of reloading them.

    Returns:
        A ``(model, tokenizer)`` tuple.
    """
    global _LOADED_MODEL
    if _LOADED_MODEL is None:
        pretrained_model_name = "kykim/bert-kor-base"
        tokenizer = BertTokenizerFast.from_pretrained(pretrained_model_name)
        model = AutoModel.from_pretrained("./bertmodel/")
        _LOADED_MODEL = (model, tokenizer)
    return _LOADED_MODEL
# main
def inference(prompt):
    """Return the key phrases of ``prompt`` formatted as hashtags.

    Candidate noun n-grams are ranked by the cosine similarity between their
    ``pooler_output`` embedding and the embedding of the whole prompt. The
    five best candidates are reduced to their nouns and each is prefixed
    with ``#``.

    Args:
        prompt: Text to extract hashtags from.

    Returns:
        The hashtags joined by single spaces; an empty string when there are
        no candidates.
    """
    candidates = make_candiadte(prompt)
    if len(candidates) == 0:
        return ""

    model, tokenizer = load_model()
    # Inputs longer than the model's position table would make the forward
    # pass fail, so cap the encoded length (512 if the config has no limit).
    max_length = getattr(model.config, "max_position_embeddings", 512)

    def embed(text):
        # Encode one text and return its pooled embedding with a batch axis.
        input_ids = tokenizer.encode(text, truncation=True, max_length=max_length)
        return model(torch.tensor(input_ids).unsqueeze(0))["pooler_output"]

    top_n = 5
    # Pure inference: no autograd graph is needed for any forward pass.
    with torch.no_grad():
        doc_embedding = embed(prompt)
        similarities = [
            torch.cosine_similarity(doc_embedding, embed(candidate), dim=1).item()
            for candidate in candidates
        ]

    ranking = pd.DataFrame({"word": candidates, "similarity": similarities})
    # Highest similarity first, then keep the top n.
    ranking = ranking.sort_values(by="similarity", ascending=False).head(top_n)

    # Extract nouns only.
    okt = Okt()
    hashtags = []
    for phrase in ranking["word"].values:
        nouns = " ".join(token for token, tag in okt.pos(phrase) if tag == "Noun")
        if nouns:  # skip phrases that would produce a bare "#"
            hashtags.append("#" + nouns)
    return " ".join(hashtags)
# Gradio UI: a single text box in, the hashtag string from inference() out.
demo = gr.Interface(
    fn=inference,
    inputs="text",
    outputs="text",  # the value returned by inference()
    examples=[
        "지난해 국내 클래식계 최고 스타로 떠오른 피아니스트 임윤찬이 미국 밴 클라이번 국제 콩쿠르 결선에서 연주한 라흐마니노프 피아노 협주곡 제3번 영상이 유튜브에서 조회수 1000만회를 넘겼다. 라흐마니노프 3번 연주 영상 중 단연 최고 조회수다."
    ],
)

# Launch exactly once, on the Interface object itself; `demo` must not be
# bound to the return value of launch().
# Passing share=True to launch() additionally creates a link that can be
# reached from outside.
demo.launch()