|
import os |
|
from datetime import datetime |
|
|
|
from pathlib import Path |
|
|
|
import polars as pl |
|
import torch |
|
from transformers import AutoModel, AutoTokenizer |
|
from transformers import Trainer, TrainingArguments |
|
from accelerate import Accelerator, DistributedType |
|
from torch.optim import AdamW |
|
from torch.utils.data import DataLoader |
|
|
|
from utils._constants import * |
|
from utils._nlp import get_transformers_word_embeddings |
|
from utils._polars import concat_str_columns, slice_join_dataframes |
|
from utils._articles import ( |
|
convert_text2encoding_with_transformers, |
|
create_article_id_to_value_mapping |
|
) |
|
from utils._behaviors import ( |
|
create_binary_labels_column, |
|
sampling_strategy_wu2019, |
|
truncate_history, |
|
) |
|
from dataset.pytorch_dataloader import ( |
|
ebnerd_from_path, |
|
NRMSDataset, |
|
) |
|
from evaluation import ( |
|
MetricEvaluator, |
|
AucScore, |
|
NdcgScore, |
|
MrrScore, |
|
F1Score, |
|
LogLossScore, |
|
RootMeanSquaredError, |
|
AccuracyScore |
|
) |
|
from models.nrms import NRMSModel |
|
from datasets import Dataset, DatasetDict |
|
import pyarrow as pa |
|
import pyarrow.parquet as pq |
|
import polars as pl |
|
|
|
|
|
COLUMNS = ["impression_id", DEFAULT_USER_COL, DEFAULT_HISTORY_ARTICLE_ID_COL, DEFAULT_INVIEW_ARTICLES_COL]

test_first_df = pl.read_parquet("testset_joined.parquet")

# Arrow schema shared by both output files; int32 keeps the files compact.
schema = pa.schema([
    ("impression_id", pa.int32()),
    ("user_id", pa.int32()),
    ("article_id_fixed", pa.list_(pa.int32())),
    ("article_ids_inview", pa.list_(pa.int32())),
])

# Stream the test set slice-by-slice into two parquet files:
#   - merged_0412_joined_only.parquet: rows as-is
#   - merged_0412_final.parquet: one row per inview article (exploded), with
#     each article id re-wrapped as a single-element list so the column still
#     matches the list(int32) schema.
# The writers are used as context managers so the parquet footers are written
# and the files closed even if a slice fails mid-loop (previously a failure
# left both files unclosed and corrupt).
with pq.ParquetWriter("merged_0412_final.parquet", schema) as exp_writer, \
        pq.ParquetWriter("merged_0412_joined_only.parquet", schema) as only_writer:
    for idx, rows in enumerate(test_first_df.select(COLUMNS).iter_slices()):
        print(idx, "\n")  # progress marker, one line per slice
        org_table = pa.Table.from_pandas(rows.to_pandas(), schema=schema)
        only_writer.write_table(org_table)

        # concat_list wraps each exploded scalar back into a 1-element list
        # at native speed and keeps the int32 dtype, instead of the original
        # per-row Python lambda via map_elements.
        df = rows.explode("article_ids_inview").with_columns(
            pl.concat_list("article_ids_inview")
        )
        exp_table = pa.Table.from_pandas(df.to_pandas(), schema=schema)
        exp_writer.write_table(exp_table)

# Free the (potentially large) source frame and schema before the hub upload
# step re-loads the written files.
del test_first_df
del schema
|
|
|
def _push_split_to_hub(parquet_path: str, config_name: str) -> None:
    """Upload one parquet file as a single-split dataset to the hub.

    Loads *parquet_path* into a `datasets.Dataset`, wraps it in a
    `DatasetDict` under the split name "testset", and pushes it to the
    mbhr/EB-NeRD repo under *config_name* (data stored in
    ``data/<config_name>``).

    The loaded dataset is a local of this helper, so it is released as soon
    as the upload finishes — no manual ``del`` needed between uploads.
    """
    split = Dataset.from_parquet(parquet_path)
    DatasetDict({"testset": split}).push_to_hub(
        repo_id="mbhr/EB-NeRD",
        config_name=config_name,
        data_dir=f"data/{config_name}",
    )


# The two uploads were previously duplicated inline; one helper, two calls.
_push_split_to_hub("merged_0412_joined_only.parquet", "join_test")
_push_split_to_hub("merged_0412_final.parquet", "join_test_exp")
|
|