from konlpy.tag import Okt
from sklearn.feature_extraction.text import CountVectorizer
import gradio as gr
import pandas as pd
import torch
from transformers import BertTokenizerFast, AutoModel
def make_candidate(prompt):
    # Keep only the nouns from the Okt part-of-speech tags, then enumerate
    # 2- and 3-gram keyword candidates over the noun sequence.
    okt = Okt()
    tokenized_doc = okt.pos(prompt)
    tokenized_nouns = ' '.join([word[0] for word in tokenized_doc if word[1] == 'Noun'])

    n_gram_range = (2, 3)
    count = CountVectorizer(ngram_range=n_gram_range).fit([tokenized_nouns])
    candidates = count.get_feature_names_out()
    return candidates
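# Illustrative usage (the output shown is hypothetical; the actual n-grams
# depend on how Okt tags the input):
#
#   make_candidate("피아니스트 임윤찬이 콩쿠르 결선에서 연주했다")
#   # -> array(['결선 연주', '임윤찬 콩쿠르', '임윤찬 콩쿠르 결선', ...], dtype=object)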
# Load the tokenizer for kykim/bert-kor-base and the model weights saved
# locally under ./bertmodel/.
def load_model():
    pretrained_model_name = "kykim/bert-kor-base"
    tokenizer = BertTokenizerFast.from_pretrained(pretrained_model_name)
    model = AutoModel.from_pretrained("./bertmodel/")
    return model, tokenizer
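# inference() below embeds text via BERT's pooled [CLS] output. A minimal
# sketch of that step (the helper name `embed_text` is ours, not part of the
# original code; inputs longer than the model's 512-token limit would need
# truncation):
#
#   def embed_text(text, model, tokenizer):
#       ids = torch.tensor(tokenizer.encode(text)).unsqueeze(0)  # shape (1, seq_len)
#       with torch.no_grad():                                    # no gradients at inference
#           return model(ids)["pooler_output"]                   # shape (1, hidden_size)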
# Main entry point: extract candidate phrases, embed the prompt and each
# candidate with BERT, and keep the candidates closest to the prompt.
def inference(prompt):
    candidates = make_candidate(prompt)
    model, tokenizer = load_model()

    # Pooled embedding of the full prompt.
    input_ids = torch.tensor(tokenizer.encode(prompt)).unsqueeze(0)
    with torch.no_grad():
        doc_embedding = model(input_ids)["pooler_output"]

    top_n = 5
    words = []
    similarities = []
    for word in candidates:
        # Pooled embedding of the candidate phrase, scored against the prompt.
        input_ids = torch.tensor(tokenizer.encode(word)).unsqueeze(0)
        with torch.no_grad():
            word_embedding = model(input_ids)["pooler_output"]
        similarity = torch.cosine_similarity(doc_embedding, word_embedding, dim=1).item()
        words.append(word)
        similarities.append(similarity)

    # Sort by similarity (higher = closer to the prompt) and keep the top n.
    cos_df = pd.DataFrame({'word': words, 'similarity': similarities})
    cos_df = cos_df.sort_values(by='similarity', ascending=False)
    cos_df = cos_df[:top_n]

    # Extract only the nouns from each kept phrase and format them as hashtags.
    okt = Okt()
    outputs = []
    for word in cos_df["word"].values:
        tokenized_doc = okt.pos(word)
        tokenized_nouns = ' '.join([token[0] for token in tokenized_doc if token[1] == 'Noun'])
        outputs.append("#" + tokenized_nouns)
    return " ".join(outputs)
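# Illustrative call (the exact hashtags depend on the fine-tuned weights):
#
#   print(inference("지난해 피아니스트 임윤찬이 콩쿠르 결선에서 연주했다"))
#   # -> e.g. "#콩쿠르 결선 #임윤찬 콩쿠르 ..."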
demo = gr.Interface(
    fn=inference,
    inputs="text",
    outputs="text",  # the hashtag string returned by inference()
    examples=[
        # Korean news blurb: pianist Yunchan Lim's Van Cliburn finals performance
        # of Rachmaninoff's Piano Concerto No. 3 passed 10 million YouTube views.
        "지난해 국내 클래식계 최고 스타로 떠오른 피아니스트 임윤찬이 미국 밴 클라이번 국제 콩쿠르 결선에서 연주한 라흐마니노프 피아노 협주곡 제3번 영상이 유튜브에서 조회수 1000만회를 넘겼다. 라흐마니노프 3번 연주 영상 중 단연 최고 조회수다."
    ]
)
# Pass share=True to launch() to generate a link that is accessible from outside.
demo.launch()