andrewzamai committed
Commit 26b46c0
1 Parent(s): b164e8a
Update README.md
README.md
CHANGED
@@ -1,3 +1,57 @@
----
-license: llama2
----
+---
+license: llama2
+---
+
+\begin{table*}[htb]
+\centering
+\footnotesize
+\resizebox{\textwidth}{!}{
+\begin{tabular}{llc|cc|ccccc|c}
+\toprule
+
+%{ll|ccccccc|c} \textbf{Model} & \textbf{Backbone} & \textbf{Movie} & \textbf{Restaurant} & \textbf{AI} & \textbf{Literature} & \textbf{Music} & \textbf{Politics} & \textbf{Science} & \textbf{AVG}\\
+%Model & Backbone & Movie & Restaurant & AI & Literature & Music & Politics & Science & AVG\\
+
+\textbf{Model} & \textbf{Backbone} & \textbf{\#Params} &
+\multicolumn{2}{c|}{\textbf{MIT}} & \multicolumn{5}{c|}{\textbf{CrossNER}} & \textbf{AVG}\\
+
+\textbf{} & \textbf{} & \textbf{} & Movie & Restaurant & AI & Literature & Music & Politics & Science & \\
+\midrule
+
+
+ChatGPT & gpt-3.5-turbo & - & 5.3 & 32.8 & 52.4 & 39.8 & 66.6 & 68.5 & 67.0 & 47.5\\
+
+InstructUIE & Flan-T5-xxl & 11B & 63.0 & 21.0 & 49.0 & 47.2 & 53.2 & 48.2 & 49.3 & 47.3\\
+
+UniNER-type & LLaMA-1 & 7B & 42.4 & 31.7 & 53.5 & 59.4 & 65.0 & 60.8 & 61.1 & 53.4\\
+
+UniNER-def & LLaMA-1 & 7B & 27.1 & 27.9 & 44.5 & 49.2 & 55.8 & 57.5 & 52.9 & 45.0\\
+
+UniNER-type+sup. & LLaMA-1 & 7B & 61.2 & 35.2 & 62.9 & 64.9 & 70.6 & 66.9 & 70.8 & 61.8\\
+
+GoLLIE & Code-LLaMA & 7B & 63.0 & 43.4 & 59.1 & 62.7 & 67.8 & 57.2 & 55.5 & 58.4 \\ % todo uniNER eval
+
+GLiNER-L & DeBERTa-v3 & 0.3B & 57.2 & 42.9 & 57.2 & 64.4 & 69.6 & 72.6 & 62.6 & 60.9 \\ % todo uniNER eval
+
+GNER-T5 & Flan-T5-xxl & 11B & 62.5 & 51.0 & 68.2 & 68.7 & 81.2 & 75.1 & 76.7 & 69.1\\
+
+GNER-LLaMA & LLaMA-1 & 7B & 68.6 & 47.5 & 63.1 & 68.2 & 75.7 & 69.4 & 69.9 & 66.1\\
+
+%\midrule
+%FullPileNER w/o D\&G & LLaMA-2-7B-chat & 49.8 & 33.7 & 52.9 & 60.2 & 67.7 & 60.3 & 60.1 & 55.0\\
+\midrule
+
+%our-391x5-FDef & LLaMA-2-7B-chat & 47.2 & 39.3 & 51.0 & 57.3 & 56.9 & 56.3 & 51.8 & $51.4 \pm {x}$\\
+SLIMER w/o D\&G & LLaMA-2-chat & 7B & $46.4 \pm {1.8}$ & $36.3\pm{2.1}$ & $49.6\pm{3.2}$ & $58.4\pm{1.7}$ & $56.8\pm{2.1}$ & $57.9\pm{2.1}$ & $53.8\pm{1.7}$ & $51.3 \pm {2.0}$\\
+%our-391x5-TDef & LLaMA-2-7B-chat & 48.6 & 38.1 & 52.2 & 57.5 & 58.4 & 62.2 & 56.0 & $53.3 \pm {x}$\\
+\textbf{SLIMER} & \textbf{LLaMA-2-chat} & \textbf{7B} & $\textbf{50.9}\pm\textbf{0.9}$ & $\textbf{38.2}\pm\textbf{0.3}$ & $\textbf{50.1}\pm\textbf{2.4}$ & $\textbf{58.7}\pm\textbf{0.2}$ & $\textbf{60.0}\pm\textbf{0.5}$ & $\textbf{63.9}\pm\textbf{1.0}$ & $\textbf{56.3}\pm\textbf{0.6}$ & $\textbf{54.0}\pm\textbf{0.5}$\\
+
+\bottomrule
+\end{tabular}
+}
+\caption{Comparison of OOD performance for SLIMER and state-of-the-art models on the MIT and CrossNER benchmarks. %for the purpose of evaluating generalization to out-of-distribution input domains.
+With the exception of UniNER-def, all the competitors' results are taken from their respective papers, as listed in Section~\ref{sec:compared_models}.}
+\label{tab:MIT_CrossNER_comparison}
+
+%\caption{\st{Zero-shot} Out-Of-Domain evaluation results on the CrossNER and MIT datasets. While most models have been trained on large NER datasets, either by collecting existing human-annotated datasets or by using synthetic annotated data composed of many samples and containing up to 13020 different NEs, we show that with only 5 positive and 5 negative samples per NE, using only the 391 most frequent NEs in pileNER, we can achieve competitive performance on the MIT and CrossNER datasets. In particular, while other models have already seen most of the NEs (in the case of models trained on pileNER-type, the overlap can be considered 100\%), our model is new to the majority of the NEs. Nevertheless, it shows strong generalizability, not only on out-of-domain inputs, but also on never-seen Named Entities. When guidelines are employed, the model further improves its performance by 1.9 points. A LLaMA-2-7B model was trained on the full pileNER-type for comparison with the UniNER-type 7B, which is based on a LLaMA-1 model instead and uses a different template for instruction tuning.}
+\end{table*}
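
As a quick consistency check on the added table: the AVG column appears to be the unweighted mean of the seven per-dataset scores (an assumption; the table does not state the aggregation). For the SLIMER row:

\[
\mathrm{AVG} = \frac{50.9 + 38.2 + 50.1 + 58.7 + 60.0 + 63.9 + 56.3}{7} = \frac{378.1}{7} \approx 54.0,
\]

which matches the reported $54.0 \pm 0.5$.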