sschet committed on
Commit e08f35a
1 Parent(s): 415b05c

Upload 8 files

README.md ADDED
@@ -0,0 +1,100 @@
+ ---
+ language:
+ - en
+ tags:
+ - ner
+ - ncbi
+ - disease
+ - pubmed
+ - bioinformatics
+ license: apache-2.0
+ datasets:
+ - ncbi-disease
+ - bc5cdr
+ - tner/bc5cdr
+ - jnlpba
+ - bc2gm_corpus
+ - drAbreu/bc4chemd_ner
+ - linnaeus
+ - ncbi_disease
+ widget:
+ - text: "Hepatocyte nuclear factor 4 alpha (HNF4α) is regulated by different promoters to generate two isoforms, one of which functions as a tumor suppressor. Here, the authors reveal that induction of the alternative isoform in hepatocellular carcinoma inhibits the circadian clock by repressing BMAL1, and the reintroduction of BMAL1 prevents HCC tumor growth."
+
+ ---
+
+ # NER to find Diseases
+ > The model was trained on the NCBI Disease and BC5CDR datasets, starting from this [PubMed-pretrained RoBERTa model](/raynardj/roberta-pubmed).
+ All the labels, i.e. the possible token classes:
+ ```json
+ {
+     "label2id": {
+         "O": 0,
+         "Disease": 1
+     }
+ }
+ ```
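+ If you want to confirm the label map programmatically, here is a quick sketch using the standard `AutoConfig` API:
+ ```python
+ from transformers import AutoConfig
+
+ # Load the configuration shipped with this model and inspect its label maps.
+ config = AutoConfig.from_pretrained("raynardj/ner-disease-ncbi-bionlp-bc5cdr-pubmed")
+ print(config.label2id)  # {'O': 0, 'Disease': 1}
+ print(config.id2label)  # {0: 'O', 1: 'Disease'}
+ ```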
+
+ Notice that we removed the 'B-', 'I-' prefixes from the data labels, so every token of a mention carries the bare class name.🗡
+
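+ For illustration, a minimal sketch (not the actual preprocessing code) of how a BIO tag collapses to the bare class name used here:
+ ```python
+ # Illustrative only: collapse BIO tags to the bare class name.
+ def strip_bio_prefix(tag: str) -> str:
+     # "B-Disease" -> "Disease", "I-Disease" -> "Disease", "O" -> "O"
+     return tag.split("-", 1)[-1]
+
+ assert strip_bio_prefix("B-Disease") == "Disease"
+ assert strip_bio_prefix("I-Disease") == "Disease"
+ assert strip_bio_prefix("O") == "O"
+ ```
+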
+ ## This is the template we suggest for using the model
+ ```python
+ from transformers import pipeline
+
+ PRETRAINED = "raynardj/ner-disease-ncbi-bionlp-bc5cdr-pubmed"
+ ner = pipeline(task="ner", model=PRETRAINED, tokenizer=PRETRAINED)
+ ner("Your text", aggregation_strategy="first")
+ ```
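+ For instance, a minimal sketch (the input sentence is only an illustrative example) that collects the unique mention strings from the aggregated output:
+ ```python
+ # Minimal sketch: gather the unique mention strings from the aggregated output.
+ # With an aggregation strategy set, each entry carries "word" and "entity_group".
+ outputs = ner(
+     "Patients with familial adenomatous polyposis often develop colorectal cancer.",
+     aggregation_strategy="first",
+ )
+ mentions = sorted({o["word"].strip() for o in outputs})
+ print(mentions)
+ ```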
+ And here is a helper to merge consecutive tokens back into contiguous entity strings ⭐️
+ ```python
+ import pandas as pd
+ from transformers import AutoTokenizer
+
+ tokenizer = AutoTokenizer.from_pretrained(PRETRAINED)
+
+ def clean_output(outputs):
+     results = []
+     current = []
+     last_idx = 0
+     # group the raw tokens into runs of consecutive positions
+     for output in outputs:
+         if output["index"] - 1 == last_idx:
+             current.append(output)
+         else:
+             if current:
+                 results.append(current)
+             current = [output]
+         last_idx = output["index"]
+     if len(current) > 0:
+         results.append(current)
+
+     # merge each run of tokens back into a single string
+     strings = []
+     for c in results:
+         tokens = [o["word"] for o in c]
+         starts = [o["start"] for o in c]
+         ends = [o["end"] for o in c]
+         new_str = tokenizer.convert_tokens_to_string(tokens)
+         if new_str != "":
+             strings.append(dict(
+                 word=new_str,
+                 start=min(starts),
+                 end=max(ends),
+                 entity=c[0]["entity"],
+             ))
+     return strings
+
+ def entity_table(pipeline, **pipeline_kw):
+     # clean_output relies on the per-token fields "index" and "entity",
+     # which the pipeline only emits with aggregation_strategy="none"
+     if "aggregation_strategy" not in pipeline_kw:
+         pipeline_kw["aggregation_strategy"] = "none"
+     def create_table(text):
+         return pd.DataFrame(
+             clean_output(
+                 pipeline(text, **pipeline_kw)
+             )
+         )
+     return create_table
+
+ # returns a DataFrame with columns: word, start, end, entity
+ entity_table(ner)(YOUR_VERY_CONTENTFUL_TEXT)
+ ```
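+ As a hypothetical usage example (the input text and output file name are placeholders, not from the model card):
+ ```python
+ # Hypothetical usage: tabulate the mentions found in a short abstract, then save them.
+ df = entity_table(ner)(
+     "Mutations in the APC gene cause familial adenomatous polyposis, "
+     "an inherited precursor of colorectal cancer."
+ )
+ print(df)  # one row per merged mention
+ df.to_csv("disease_mentions.csv", index=False)
+ ```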
+ > Check out our other NER models:
+ * [gene and gene products](/raynardj/ner-gene-dna-rna-jnlpba-pubmed)
+ * [chemical substances](/raynardj/ner-chemical-bionlp-bc5cdr-pubmed)
+ * [diseases](/raynardj/ner-disease-ncbi-bionlp-bc5cdr-pubmed) (this model)
config.json ADDED
@@ -0,0 +1,35 @@
+ {
+   "_name_or_path": "raynardj/roberta-pubmed",
+   "architectures": [
+     "RobertaForTokenClassification"
+   ],
+   "attention_probs_dropout_prob": 0.1,
+   "bos_token_id": 0,
+   "eos_token_id": 2,
+   "gradient_checkpointing": false,
+   "hidden_act": "gelu",
+   "hidden_dropout_prob": 0.1,
+   "hidden_size": 768,
+   "id2label": {
+     "0": "O",
+     "1": "Disease"
+   },
+   "initializer_range": 0.02,
+   "intermediate_size": 3072,
+   "label2id": {
+     "Disease": 1,
+     "O": 0
+   },
+   "layer_norm_eps": 1e-05,
+   "max_position_embeddings": 514,
+   "model_type": "roberta",
+   "num_attention_heads": 12,
+   "num_hidden_layers": 12,
+   "pad_token_id": 1,
+   "position_embedding_type": "absolute",
+   "torch_dtype": "float32",
+   "transformers_version": "4.9.1",
+   "type_vocab_size": 1,
+   "use_cache": true,
+   "vocab_size": 50265
+ }
merges.txt ADDED
The diff for this file is too large to render. See raw diff
 
pytorch_model.bin ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:e10d8bbbd5c112c44762c48d04ce312c964b98391fb044685a09a9f7da4b5cdb
+ size 496313335
special_tokens_map.json ADDED
@@ -0,0 +1 @@
+ {"bos_token": "<s>", "eos_token": "</s>", "unk_token": "<unk>", "sep_token": "</s>", "pad_token": "<pad>", "cls_token": "<s>", "mask_token": {"content": "<mask>", "single_word": false, "lstrip": true, "rstrip": false, "normalized": false}}
tokenizer.json ADDED
The diff for this file is too large to render. See raw diff
 
tokenizer_config.json ADDED
@@ -0,0 +1 @@
+ {"unk_token": "<unk>", "bos_token": "<s>", "eos_token": "</s>", "add_prefix_space": true, "errors": "replace", "sep_token": "</s>", "cls_token": "<s>", "pad_token": "<pad>", "mask_token": "<mask>", "model_max_length": 512, "special_tokens_map_file": null, "name_or_path": "raynardj/roberta-pubmed", "tokenizer_class": "RobertaTokenizer"}
vocab.json ADDED
The diff for this file is too large to render. See raw diff