---
library_name: transformers
license: apache-2.0
datasets:
- jaeyong2/Thai-emb-PreView
language:
- th
base_model:
- Alibaba-NLP/gte-multilingual-base
---
# Model Card for jaeyong2/gte-multilingual-base-Thai-embedding

Thai text-embedding model fine-tuned from [Alibaba-NLP/gte-multilingual-base](https://huggingface.co/Alibaba-NLP/gte-multilingual-base) on [jaeyong2/Thai-emb-PreView](https://huggingface.co/datasets/jaeyong2/Thai-emb-PreView) with a triplet loss: the article `context` is the anchor, the real `Title` the positive, and the `Fake Title` the negative.

## Model Details

- Base model: [Alibaba-NLP/gte-multilingual-base](https://huggingface.co/Alibaba-NLP/gte-multilingual-base)
- Fine-tuning data: [jaeyong2/Thai-emb-PreView](https://huggingface.co/datasets/jaeyong2/Thai-emb-PreView)
- Language: Thai (`th`)
- Objective: triplet loss over `[CLS]`-token embeddings
- License: Apache-2.0
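A minimal usage sketch (assumptions: the fine-tuned weights are published as `jaeyong2/gte-multilingual-base-Thai-embedding`, load like the base model with `trust_remote_code=True`, and use `[CLS]`-token pooling as in the training code below):

```python
import torch
from transformers import AutoModel, AutoTokenizer

model_name = "jaeyong2/gte-multilingual-base-Thai-embedding"  # assumed repository id
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModel.from_pretrained(model_name, trust_remote_code=True)
model.eval()

texts = ["ตัวอย่างข้อความภาษาไทย", "อีกหนึ่งตัวอย่าง"]
inputs = tokenizer(texts, padding=True, truncation=True, max_length=4096, return_tensors="pt")

with torch.no_grad():
    # The [CLS] token of the last hidden state is used as the sentence embedding.
    embeddings = model(**inputs)[0][:, 0, :]

# Cosine similarity between the two embeddings.
print(torch.nn.functional.cosine_similarity(embeddings[0:1], embeddings[1:2]))
```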
## Training

- Hardware: Google Colab, A100 40GB
- Data: [jaeyong2/Thai-emb-PreView](https://huggingface.co/datasets/jaeyong2/Thai-emb-PreView)

Fine-tuning ran for 3 epochs with AdamW (lr 5e-5), batch size 8, and a triplet loss over the `[CLS]` embeddings:
```python
import datasets
import torch
from torch.optim import AdamW
from torch.utils.data import DataLoader
from tqdm import tqdm
from transformers import AutoModel, AutoTokenizer


def batch_to_device(batch, device):
    # Move every tensor produced by the tokenizer to the target device.
    return {key: value.to(device) for key, value in batch.items()}


model_name = "Alibaba-NLP/gte-multilingual-base"
dataset = datasets.load_dataset("jaeyong2/Thai-emb-PreView")
train_dataloader = DataLoader(dataset["train"], batch_size=8, shuffle=True)

tokenizer = AutoTokenizer.from_pretrained(model_name)
# gte-multilingual-base ships custom modeling code, hence trust_remote_code=True.
model = AutoModel.from_pretrained(model_name, trust_remote_code=True).to(torch.bfloat16)
# Stand-in for the TripletLoss helper that was left undefined in the original snippet.
triplet_loss = torch.nn.TripletMarginLoss(margin=1.0)

optimizer = AdamW(model.parameters(), lr=5e-5)

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

for epoch in range(3):  # iterate over epochs
    model.train()
    for batch in tqdm(train_dataloader):
        optimizer.zero_grad()
        loss = None
        for index in range(len(batch["context"])):
            # anchor = article body, positive = real title, negative = fake title
            anchor_encodings = tokenizer([batch["context"][index]], truncation=True, padding="max_length", max_length=4096, return_tensors="pt")
            positive_encodings = tokenizer([batch["Title"][index]], truncation=True, padding="max_length", max_length=256, return_tensors="pt")
            negative_encodings = tokenizer([batch["Fake Title"][index]], truncation=True, padding="max_length", max_length=256, return_tensors="pt")

            anchor_encodings = batch_to_device(anchor_encodings, device)
            positive_encodings = batch_to_device(positive_encodings, device)
            negative_encodings = batch_to_device(negative_encodings, device)

            # Model outputs: the [CLS] token of the last hidden state is the embedding.
            anchor_output = model(**anchor_encodings)[0][:, 0, :]
            positive_output = model(**positive_encodings)[0][:, 0, :]
            negative_output = model(**negative_encodings)[0][:, 0, :]

            # Accumulate the triplet loss over the examples in the batch.
            if loss is None:
                loss = triplet_loss(anchor_output, positive_output, negative_output)
            else:
                loss += triplet_loss(anchor_output, positive_output, negative_output)

        loss /= len(batch["context"])
        loss.backward()
        optimizer.step()
```
## Evaluation

The base and fine-tuned models are compared on the first 1,000 examples of the test split: an example counts as correct when the real `Title` is closer (by cosine distance) to the `context` than the `Fake Title`.

Code:
```python
import datasets
import torch
from sklearn.metrics import pairwise_distances
from tqdm import tqdm


def get_embedding(text, model, tokenizer):
    # get_embedding was not defined in the original snippet; this version mirrors
    # the training code and returns the [CLS] embedding of a single text.
    encodings = tokenizer([text], truncation=True, padding=True, max_length=4096, return_tensors="pt")
    encodings = {key: value.to(model.device) for key, value in encodings.items()}
    with torch.no_grad():
        return model(**encodings)[0][:, 0, :]


dataset = datasets.load_dataset("jaeyong2/Thai-emb-PreView")
validation_dataset = dataset["test"].select(range(1000))

model.eval()


def evaluate(validation_dataset):
    correct_count = 0

    for item in tqdm(validation_dataset):
        query_embedding = get_embedding(item["context"], model, tokenizer)
        document_embedding = get_embedding(item["Title"], model, tokenizer)
        negative_embedding = get_embedding(item["Fake Title"], model, tokenizer)

        # Cosine distance between the query and each candidate title.
        positive_distances = pairwise_distances(query_embedding.detach().cpu().float().numpy(), document_embedding.detach().cpu().float().numpy(), metric="cosine")
        negative_distances = pairwise_distances(query_embedding.detach().cpu().float().numpy(), negative_embedding.detach().cpu().float().numpy(), metric="cosine")

        # Correct when the real title is closer to the context than the fake title.
        if positive_distances[0][0] < negative_distances[0][0]:
            correct_count += 1

    accuracy = correct_count / len(validation_dataset)
    return accuracy


results = evaluate(validation_dataset)
print(f"Validation Results: {results}")
```
Accuracy (share of evaluation pairs where the real title is closer to the context than the fake title; a comparison sketch follows the list):

- Alibaba-NLP/gte-multilingual-base: 0.953
- jaeyong2/gte-multilingual-base-Thai-embedding: 0.991
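For reference, a sketch (under the same loading assumptions as above) of how the two checkpoints can be compared with the `evaluate` helper from the previous code block:

```python
import torch
from transformers import AutoModel, AutoTokenizer

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# evaluate() and validation_dataset come from the evaluation code above; the
# fine-tuned repository is assumed to load with trust_remote_code=True as well.
for name in ["Alibaba-NLP/gte-multilingual-base",
             "jaeyong2/gte-multilingual-base-Thai-embedding"]:
    tokenizer = AutoTokenizer.from_pretrained(name)
    model = AutoModel.from_pretrained(name, trust_remote_code=True).to(torch.bfloat16).to(device)
    model.eval()
    print(name, evaluate(validation_dataset))
```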
### License

- Alibaba-NLP/gte-multilingual-base: https://choosealicense.com/licenses/apache-2.0/