pko89403 committed
Commit f97e310
1 Parent(s): eb00f61

Upload 4 files

upload testset inference

final-testset.ipynb ADDED
@@ -0,0 +1,200 @@
+ {
+ "cells": [
+ {
+ "cell_type": "code",
+ "execution_count": 35,
+ "id": "0562d8a6-e8e3-4659-ab21-e99d76adcf3c",
+ "metadata": {
+ "tags": []
+ },
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "35982,9796527,0.12911629676818848\n",
+ "\n",
+ "35982,9796527,0.12911629676818848\n",
+ "\n",
+ "35982,9796527,0.12911629676818848\n",
+ "\n",
+ "35982,9796527,0.12911629676818848\n",
+ "\n",
+ "35982,9796527,0.12911629676818848\n",
+ "\n",
+ "35982,9796527,0.12911629676818848\n",
+ "\n",
+ "35982,9796527,0.12911629676818848\n",
+ "\n",
+ "35982,9796527,0.12911629676818848\n",
+ "\n",
+ "35982,9796527,0.12911629676818848\n",
+ "\n",
+ "35982,9796527,0.12911629676818848\n",
+ "\n"
+ ]
+ }
+ ],
+ "source": [
+ "for i in range(10):\n",
+ "    with open(\"test_set.txt\") as f:\n",
+ "        print(f.readline())"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 3,
+ "id": "9e72123e-5a81-4fd1-a07b-f847aee5a590",
+ "metadata": {
+ "tags": []
+ },
+ "outputs": [],
+ "source": [
+ "import polars as pl\n",
+ "\n",
+ "test_behavior_path = \"/work/Blue/ebnerd/ebnerd_testset/test/behaviors.parquet\"\n",
+ "\n",
+ "test_behavior_df = pl.read_parquet(test_behavior_path)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 95,
+ "id": "7c337f1c-8a0e-4a61-9916-0c86887f320e",
+ "metadata": {
+ "tags": []
+ },
+ "outputs": [
+ {
+ "name": "stderr",
+ "output_type": "stream",
+ "text": [
+ "100%|██████████| 13536710/13536710 [18:13<00:00, 12380.33it/s]\n"
+ ]
+ },
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "Zipping predictions.txt to predictions.zip\n"
+ ]
+ }
+ ],
+ "source": [
+ "from tqdm import tqdm\n",
+ "import numpy as np\n",
+ "from pathlib import Path\n",
+ "import zipfile\n",
+ "\n",
+ "\n",
+ "# Rank scores in descending order, e.g. [0.2, 0.9, 0.5] -> [3, 1, 2].\n",
+ "def transform_list(input_list):\n",
+ "    # Convert the input list to a NumPy array.\n",
+ "    arr = np.array(input_list)\n",
+ "\n",
+ "    # Indices that sort the array in descending order.\n",
+ "    sorted_indices = np.argsort(-arr)\n",
+ "\n",
+ "    # Assign ranks, starting from 1.\n",
+ "    ranks = np.empty_like(sorted_indices)\n",
+ "    ranks[sorted_indices] = np.arange(1, len(arr) + 1)\n",
+ "\n",
+ "    return ranks.tolist()\n",
+ "\n",
+ "def zip_submission_file(\n",
+ "    path: Path,\n",
+ "    filename_zip: str = None,\n",
+ "    verbose: bool = True,\n",
+ "    rm_file: bool = True,\n",
+ ") -> None:\n",
+ "    \"\"\"\n",
+ "    Compresses a specified file into a ZIP archive within the same directory.\n",
+ "\n",
+ "    Args:\n",
+ "        path (Path): Path of the file to compress; the archive is created next to it.\n",
+ "        filename_zip (str, optional): The name of the output ZIP file. Defaults to the input filename with a '.zip' suffix.\n",
+ "        verbose (bool, optional): If set to True, the function will print the process details. Defaults to True.\n",
+ "        rm_file (bool, optional): If set to True, the original file will be removed after compression. Defaults to True.\n",
+ "\n",
+ "    Returns:\n",
+ "        None: This function does not return any value.\n",
+ "    \"\"\"\n",
+ "    path = Path(path)\n",
+ "    if filename_zip:\n",
+ "        path_zip = path.parent.joinpath(filename_zip)\n",
+ "    else:\n",
+ "        path_zip = path.with_suffix(\".zip\")\n",
+ "\n",
+ "    if path_zip.suffix != \".zip\":\n",
+ "        raise ValueError(f\"suffix for {path_zip.name} has to be '.zip'\")\n",
+ "    if verbose:\n",
+ "        print(f\"Zipping {path} to {path_zip}\")\n",
+ "    with zipfile.ZipFile(path_zip, \"w\", zipfile.ZIP_DEFLATED) as zf:\n",
+ "        zf.write(path, arcname=path.name)\n",
+ "    if rm_file:\n",
+ "        path.unlink()\n",
+ "\n",
+ "with open(\"predictions.txt\", 'w') as wf:\n",
+ "    with open(\"test_set.txt\", 'r') as f:\n",
+ "        behaviors_iter = test_behavior_df.select(\"impression_id\", \"user_id\", \"article_ids_inview\").iter_rows()\n",
+ "        for data in tqdm(behaviors_iter, total=len(test_behavior_df)):\n",
+ "            impression_id = data[0]\n",
+ "            user_id = data[1]\n",
+ "            article_ids_inview = data[2]\n",
+ "\n",
+ "            scores = []\n",
+ "\n",
+ "            for article_id in article_ids_inview:\n",
+ "                preds = f.readline().split(\",\")\n",
+ "\n",
+ "                p_user_id = preds[0]\n",
+ "                p_article_id = preds[1]\n",
+ "                p_score = preds[2]\n",
+ "\n",
+ "                if str(article_id) == str(p_article_id):\n",
+ "                    scores.append(float(p_score))\n",
+ "                else:\n",
+ "                    print(\"Article id mismatch; defaulting score to 0.0\")\n",
+ "                    scores.append(0.0)\n",
+ "\n",
+ "            index_ranked = transform_list(scores)\n",
+ "            preds = \"[\" + \",\".join([str(ir) for ir in index_ranked]) + \"]\"\n",
+ "\n",
+ "            wf.write(\" \".join([str(impression_id), preds]) + \"\\n\")\n",
+ "\n",
+ "zip_submission_file(path=Path(\"predictions.txt\"), rm_file=True)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "2d5c1bcc-e4b0-4217-93ec-4ca3e24dc6ab",
+ "metadata": {},
+ "outputs": [],
+ "source": []
+ }
+ ],
+ "metadata": {
+ "kernelspec": {
+ "display_name": "blue",
+ "language": "python",
+ "name": "blue"
+ },
+ "language_info": {
+ "codemirror_mode": {
+ "name": "ipython",
+ "version": 3
+ },
+ "file_extension": ".py",
+ "mimetype": "text/x-python",
+ "name": "python",
+ "nbconvert_exporter": "python",
+ "pygments_lexer": "ipython3",
+ "version": "3.10.13"
+ }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 5
+ }
nrms_model.epoch0.step10001.pth ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:0fb936151025cc80032c0b77f62c4c9264dc4bcea4dcbd1cfe9b5ff1c9b2f5c7
+ size 324331194
nrms_model.epoch0.step20001.pth ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:0785e0dc9d9b39b97988b06bbd2135bc1f1065aab60de50717ada15d4e10e6a4
+ size 324331194
testset.py ADDED
@@ -0,0 +1,198 @@
+ #!/usr/bin/env python
+ # coding: utf-8
+
+ # In[1]:
+
+
+ import os
+ from datetime import datetime
+
+ from pathlib import Path
+
+ import polars as pl
+ import torch
+ from transformers import AutoModel, AutoTokenizer
+ from transformers import Trainer, TrainingArguments
+ from accelerate import Accelerator, DistributedType
+ from torch.optim import AdamW
+ from torch.utils.data import DataLoader
+
+ from utils._constants import *
+ from utils._nlp import get_transformers_word_embeddings
+ from utils._polars import concat_str_columns, slice_join_dataframes
+ from utils._articles import (
+     convert_text2encoding_with_transformers,
+     create_article_id_to_value_mapping
+ )
+ from utils._python import make_lookup_objects
+ from utils._behaviors import (
+     create_binary_labels_column,
+     sampling_strategy_wu2019,
+     truncate_history,
+ )
+ from utils._articles_behaviors import map_list_article_id_to_value
+ from dataset.pytorch_dataloader import (
+     ebnerd_from_path,
+     NRMSDataset,
+     NewsrecDataset,
+ )
+ from evaluation import (
+     MetricEvaluator,
+     AucScore,
+     NdcgScore,
+     MrrScore,
+     F1Score,
+     LogLossScore,
+     RootMeanSquaredError,
+     AccuracyScore
+ )
+ from models.nrms import NRMSModel
+
+ os.environ["TOKENIZERS_PARALLELISM"] = "false"
+
+
+ # In[2]:
+
+
+ TEST_DATA_PATH = "merged_0412_final.parquet"
+
+
+ # In[3]:
+
+
+ test_df = pl.read_parquet(TEST_DATA_PATH).with_columns(pl.Series("labels", [[]]))
+
+
+ # In[4]:
+
+
+ from transformers import AutoModel, AutoTokenizer
+
+ model_name = "Maltehb/danish-bert-botxo"
+ model = AutoModel.from_pretrained(model_name)
+ tokenizer = AutoTokenizer.from_pretrained(model_name)
+ word2vec_embedding = get_transformers_word_embeddings(model)
+
+
+ # In[5]:
+
+
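+ # Concatenate each article's title and subtitle, tokenize to at most 30
+ # tokens, and build an article_id -> token-ids mapping used by the dataset.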
+ ARTICLES_DATA_PATH = "/work/Blue/ebnerd/ebnerd_testset/articles.parquet"
+ ARTICLE_COLUMNS = [DEFAULT_TITLE_COL, DEFAULT_SUBTITLE_COL]
+ TEXT_MAX_LENGTH = 30
+
+ articles_df = pl.read_parquet(ARTICLES_DATA_PATH)
+ df_articles, cat_col = concat_str_columns(articles_df, columns=ARTICLE_COLUMNS)
+ df_articles, token_col_title = convert_text2encoding_with_transformers(
+     df_articles, tokenizer, cat_col, max_length=TEXT_MAX_LENGTH
+ )
+ article_mapping = create_article_id_to_value_mapping(df=df_articles, value_col=token_col_title)
+
+
+ # In[6]:
+
+
+ from dataclasses import dataclass, field
+ import numpy as np
+
+ @dataclass
+ class NRMSTestDataset(NewsrecDataset):
+     def __post_init__(self):
+         """
+         Post-initialization method. Loads the data and sets additional attributes.
+         """
+         self.lookup_article_index = {id: i for i, id in enumerate(self.article_dict, start=1)}
+         self.lookup_article_matrix = np.array(list(self.article_dict.values()))
+         UNKNOWN_ARRAY = np.zeros(self.lookup_article_matrix.shape[1], dtype=self.lookup_article_matrix.dtype)
+         self.lookup_article_matrix = np.vstack([UNKNOWN_ARRAY, self.lookup_article_matrix])
+
+         self.unknown_index = [0]
+         self.X, self.y = self.load_data()
+         if self.kwargs is not None:
+             self.set_kwargs(self.kwargs)
+
+     def __getitem__(self, idx) -> dict:
+         """
+         history_input_tensor: (samples, history_size, document_dimension)
+         candidate_input_title: (samples, npratio, document_dimension)
+         label: (samples, npratio)
+         """
+         batch_X = self.X[idx]
+         article_id_fixed = [self.lookup_article_index.get(f, 0) for f in batch_X["article_id_fixed"].to_list()[0]]
+         history_input_tensor = self.lookup_article_matrix[article_id_fixed]
+
+         article_id_inview = [self.lookup_article_index.get(f, 0) for f in batch_X["article_ids_inview"].to_list()[0]]
+         candidate_input_title = self.lookup_article_matrix[article_id_inview]
+
+         return {
+             "user_id": self.X[idx]["user_id"][0],
+             "history_input_tensor": history_input_tensor,
+             "candidate_article_id": self.X[idx]["article_ids_inview"][0][0],
+             "candidate_input_title": candidate_input_title,
+             "labels": np.int32(0)
+         }
+
+
+ # In[7]:
+
+
+ test_dataset = NRMSTestDataset(
+     behaviors=test_df,
+     history_column=DEFAULT_HISTORY_ARTICLE_ID_COL,
+     article_dict=article_mapping,
+     unknown_representation="zeros",
+     eval_mode=False,
+ )
+
+
+ # In[8]:
+
+
+ nrms_model = NRMSModel(
+     pretrained_weight=torch.tensor(word2vec_embedding),
+     emb_dim=768,
+     num_heads=16,
+     hidden_dim=128,
+     item_dim=64,
+ )
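+ # Compile before loading: torch.compile wraps the model so its state-dict
+ # keys gain an "_orig_mod." prefix, which presumably matches how this
+ # checkpoint was saved.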
+ state_dict = torch.load("nrms_model.epoch0.step20001.pth")
+ nrms_model = torch.compile(nrms_model)
+ nrms_model.load_state_dict(state_dict["model"])
+ nrms_model.to("cuda:1")
+
+
+ # In[ ]:
+
+
+ import torch._dynamo
+ from tqdm import tqdm
+ import os
+ from torch.utils.data import DataLoader
+
+ BATCH_SIZE = 256
+ test_dataloader = DataLoader(test_dataset, batch_size=BATCH_SIZE, shuffle=False, num_workers=60)
+ os.environ["TOKENIZERS_PARALLELISM"] = "true"
+ torch._dynamo.config.suppress_errors = True
+
+ nrms_model.eval()
+
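+ # Write one "user_id,article_id,score" line per inview candidate article;
+ # final-testset.ipynb reads test_set.txt back in this same order to build
+ # the ranked submission.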
+ with open("test_set.txt", 'w') as f:
+     with torch.no_grad():
+         for i, batch in enumerate(tqdm(test_dataloader)):
+             user_id = batch["user_id"].cpu().tolist()
+             candidate_article_id = batch["candidate_article_id"].cpu().tolist()
+             history_input_tensor = batch["history_input_tensor"].to("cuda:1")
+             candidate_input_title = batch["candidate_input_title"].to("cuda:1")
+
+             output_logits = nrms_model(history_input_tensor, candidate_input_title, None)[:, 0].cpu().tolist()
+
+             for j in range(len(user_id)):
+                 line = f"{user_id[j]},{candidate_article_id[j]},{output_logits[j]}\n"
+                 f.write(line)
+
+
+ # In[ ]:
+
+