import os
from datetime import datetime
from pathlib import Path

import polars as pl
import pyarrow as pa
import pyarrow.parquet as pq
import torch
from accelerate import Accelerator, DistributedType
from datasets import Dataset, DatasetDict
from torch.optim import AdamW
from torch.utils.data import DataLoader
from transformers import AutoModel, AutoTokenizer, Trainer, TrainingArguments

from dataset.pytorch_dataloader import (
    NRMSDataset,
    ebnerd_from_path,
)
from evaluation import (
    AccuracyScore,
    AucScore,
    F1Score,
    LogLossScore,
    MetricEvaluator,
    MrrScore,
    NdcgScore,
    RootMeanSquaredError,
)
from models.nrms import NRMSModel
from utils._articles import (
    convert_text2encoding_with_transformers,
    create_article_id_to_value_mapping,
)
from utils._behaviors import (
    create_binary_labels_column,
    sampling_strategy_wu2019,
    truncate_history,
)
from utils._constants import *
from utils._nlp import get_transformers_word_embeddings
from utils._polars import concat_str_columns, slice_join_dataframes

# Columns needed to reconstruct one test impression row.
COLUMNS = [
    "impression_id",
    DEFAULT_USER_COL,
    DEFAULT_HISTORY_ARTICLE_ID_COL,
    DEFAULT_INVIEW_ARTICLES_COL,
]

test_first_df = pl.read_parquet("testset_joined.parquet")

# Fixed Arrow schema shared by both output parquet files.
schema = pa.schema(
    [
        ("impression_id", pa.int32()),
        ("user_id", pa.int32()),
        ("article_id_fixed", pa.list_(pa.int32())),
        ("article_ids_inview", pa.list_(pa.int32())),
    ]
)

exp_writer = pq.ParquetWriter("merged_0412_final.parquet", schema)
only_writer = pq.ParquetWriter("merged_0412_joined_only.parquet", schema)
try:
    # Stream the (large) frame slice-by-slice so the full exploded table is
    # never materialized in memory at once.
    for idx, rows in enumerate(test_first_df.select(COLUMNS).iter_slices()):
        print(idx, "\n")
        # As-is slice: one row per impression.
        org_table = pa.Table.from_pandas(rows.to_pandas(), schema=schema)
        only_writer.write_table(org_table)
        # Exploded slice: one row per (impression, inview article). Re-wrap
        # the exploded scalar id into a singleton list so it still matches
        # the list<int32> column in the schema.
        df = rows.explode("article_ids_inview").with_columns(
            pl.col("article_ids_inview").map_elements(lambda x: [x])
        )
        exp_table = pa.Table.from_pandas(df.to_pandas(), schema=schema)
        exp_writer.write_table(exp_table)
finally:
    # Always close the writers so the parquet footers are flushed even if a
    # slice fails mid-stream; an unclosed ParquetWriter leaves a corrupt file.
    only_writer.close()
    exp_writer.close()

# Free the large inputs before the upload phase below.
del test_first_df
del schema
# Upload both parquet variants produced above to the hub, one config each.
# The upload steps were previously duplicated verbatim; a loop keeps the
# repo_id and split naming in one place and drops each Dataset as soon as
# its push completes (no explicit `del` bookkeeping needed).
for parquet_path, config_name in (
    ("merged_0412_joined_only.parquet", "join_test"),
    ("merged_0412_final.parquet", "join_test_exp"),
):
    testset = Dataset.from_parquet(parquet_path)
    ebnerd_testset = DatasetDict({"testset": testset})
    ebnerd_testset.push_to_hub(
        repo_id="mbhr/EB-NeRD",
        config_name=config_name,
        data_dir=f"data/{config_name}",
    )