Update geneformer/tokenizer.py
Browse files - geneformer/tokenizer.py +8 -2
geneformer/tokenizer.py
CHANGED
@@ -126,8 +126,11 @@ def sum_ensembl_ids(
|
|
126 |
gene_ids_collapsed = [
|
127 |
gene_mapping_dict.get(gene_id.upper()) for gene_id in data.ra.ensembl_id
|
128 |
]
|
|
|
|
|
|
|
129 |
|
130 |
-
if len(set(gene_ids_in_dict)) == len(set(gene_ids_collapsed)):
|
131 |
# Keep original Ensembl IDs as `ensembl_id_original`
|
132 |
rename_attr(data.ra, "ensembl_id", "ensembl_id_original")
|
133 |
data.ra["ensembl_id"] = gene_ids_collapsed
|
@@ -223,7 +226,10 @@ def sum_ensembl_ids(
|
|
223 |
gene_ids_collapsed = [
|
224 |
gene_mapping_dict.get(gene_id.upper()) for gene_id in data.var.ensembl_id
|
225 |
]
|
226 |
-
|
|
|
|
|
|
|
227 |
data.var.ensembl_id = data.var.ensembl_id.map(gene_mapping_dict)
|
228 |
return data
|
229 |
|
|
|
126 |
gene_ids_collapsed = [
|
127 |
gene_mapping_dict.get(gene_id.upper()) for gene_id in data.ra.ensembl_id
|
128 |
]
|
129 |
+
gene_ids_collapsed_in_dict = [
|
130 |
+
gene for gene in gene_ids_collapsed if gene in gene_token_dict.keys()
|
131 |
+
]
|
132 |
|
133 |
+
if len(set(gene_ids_in_dict)) == len(set(gene_ids_collapsed_in_dict)):
|
134 |
# Keep original Ensembl IDs as `ensembl_id_original`
|
135 |
rename_attr(data.ra, "ensembl_id", "ensembl_id_original")
|
136 |
data.ra["ensembl_id"] = gene_ids_collapsed
|
|
|
226 |
gene_ids_collapsed = [
|
227 |
gene_mapping_dict.get(gene_id.upper()) for gene_id in data.var.ensembl_id
|
228 |
]
|
229 |
+
gene_ids_collapsed_in_dict = [
|
230 |
+
gene for gene in gene_ids_collapsed if gene in gene_token_dict.keys()
|
231 |
+
]
|
232 |
+
if len(set(gene_ids_in_dict)) == len(set(gene_ids_collapsed_in_dict)):
|
233 |
data.var.ensembl_id = data.var.ensembl_id.map(gene_mapping_dict)
|
234 |
return data
|
235 |
|