pko89403 committed on
Commit
ee41ae8
1 Parent(s): f97e310

Upload upload_final_test_dataset.py

Browse files
Files changed (1) hide show
  1. upload_final_test_dataset.py +95 -0
upload_final_test_dataset.py ADDED
@@ -0,0 +1,95 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ from datetime import datetime
3
+
4
+ from pathlib import Path
5
+
6
+ import polars as pl
7
+ import torch
8
+ from transformers import AutoModel, AutoTokenizer
9
+ from transformers import Trainer, TrainingArguments
10
+ from accelerate import Accelerator, DistributedType
11
+ from torch.optim import AdamW
12
+ from torch.utils.data import DataLoader
13
+
14
+ from utils._constants import *
15
+ from utils._nlp import get_transformers_word_embeddings
16
+ from utils._polars import concat_str_columns, slice_join_dataframes
17
+ from utils._articles import (
18
+ convert_text2encoding_with_transformers,
19
+ create_article_id_to_value_mapping
20
+ )
21
+ from utils._behaviors import (
22
+ create_binary_labels_column,
23
+ sampling_strategy_wu2019,
24
+ truncate_history,
25
+ )
26
+ from dataset.pytorch_dataloader import (
27
+ ebnerd_from_path,
28
+ NRMSDataset,
29
+ )
30
+ from evaluation import (
31
+ MetricEvaluator,
32
+ AucScore,
33
+ NdcgScore,
34
+ MrrScore,
35
+ F1Score,
36
+ LogLossScore,
37
+ RootMeanSquaredError,
38
+ AccuracyScore
39
+ )
40
+ from models.nrms import NRMSModel
41
+ from datasets import Dataset, DatasetDict
42
+ import pyarrow as pa
43
+ import pyarrow.parquet as pq
44
+ import polars as pl
45
+
46
+
47
# Columns pulled from the joined test set; the DEFAULT_* names come from the
# star import of utils._constants above.
COLUMNS = [
    "impression_id",
    DEFAULT_USER_COL,
    DEFAULT_HISTORY_ARTICLE_ID_COL,
    DEFAULT_INVIEW_ARTICLES_COL,
]

# Pre-joined test behaviors produced by an earlier step of the pipeline.
test_first_df = pl.read_parquet("testset_joined.parquet")

# Arrow schema shared by both output files.
# NOTE(review): ids are stored as int32 — confirm no id exceeds 2**31 - 1.
_ID = pa.int32()
schema = pa.schema(
    [
        ("impression_id", _ID),
        ("user_id", _ID),
        ("article_id_fixed", pa.list_(_ID)),
        ("article_ids_inview", pa.list_(_ID)),
    ]
)
# Stream the test set into two parquet files in a single pass:
#   merged_0412_joined_only.parquet - each slice written as-is
#   merged_0412_final.parquet       - one row per inview article (exploded),
#                                     with the scalar id re-wrapped in a
#                                     1-element list so both files share
#                                     the same `schema`.
# Using `with` guarantees both writers are closed (and parquet footers
# flushed) even if a batch raises mid-loop; the original only closed them
# on the happy path, which would leave truncated files on error.
with pq.ParquetWriter("merged_0412_final.parquet", schema) as exp_writer, \
     pq.ParquetWriter("merged_0412_joined_only.parquet", schema) as only_writer:
    for idx, rows in enumerate(test_first_df.select(COLUMNS).iter_slices()):
        print(idx, "\n")  # progress marker, one line per slice

        # Unmodified slice -> "joined only" file.
        org_table = pa.Table.from_pandas(rows.to_pandas(), schema=schema)
        only_writer.write_table(org_table)

        # Explode to one inview article per row, then wrap each scalar back
        # into a single-element list to satisfy the list<int32> column type.
        df = rows.explode("article_ids_inview").with_columns(
            pl.col("article_ids_inview").map_elements(lambda x: [x])
        )
        exp_table = pa.Table.from_pandas(df.to_pandas(), schema=schema)
        exp_writer.write_table(exp_table)
70
+
71
# Release the large intermediates before re-loading the parquet output
# through `datasets`.
del test_first_df
del schema

# Upload the non-exploded variant as its own dataset config on the Hub.
joined_only = Dataset.from_parquet("merged_0412_joined_only.parquet")
ebnerd_testset = DatasetDict({"testset": joined_only})
ebnerd_testset.push_to_hub(
    repo_id="mbhr/EB-NeRD",
    config_name="join_test",
    data_dir="data/join_test",
)

# Drop references so loading the next (larger) file doesn't double peak RSS.
del joined_only
del ebnerd_testset
86
+
87
# Upload the exploded (one row per inview article) variant under a
# separate config name.
merged_0412_final_df = Dataset.from_parquet("merged_0412_final.parquet")
ebnerd_testset = DatasetDict({"testset": merged_0412_final_df})
ebnerd_testset.push_to_hub(
    repo_id="mbhr/EB-NeRD",
    config_name="join_test_exp",
    data_dir="data/join_test_exp",
)