LucaOne
LucaOne: Generalized Biological Foundation Model with Unified Nucleic Acid and Protein Language.
Github Page: https://github.com/LucaOne/LucaOne
This repo contains the weights (checkpoint=17600000) and the core code (adapted to the Hugging Face API; it may be unstable at the current stage) for the LucaOne general-purpose language model (LucaOneGPLM).
To calculate the embedding of a nucleotide/protein sequence:
import torch
from transformers import AutoModel, AutoTokenizer
# Known bases only (both cases); any character not listed here is encoded as "5".
_GENE_CODE = {
    "A": "1", "a": "1",
    "T": "2", "t": "2",
    "U": "2", "u": "2",
    "C": "3", "c": "3",
    "G": "4", "g": "4",
}


def gene_seq_replace(seq):
    """Encode a nucleic acid sequence as a string of digit characters.

    Mapping (case-insensitive): A -> 1, T/U -> 2, C -> 3, G -> 4.
    Every other character (e.g. N or any unrecognized symbol) -> 5.

    :param seq: nucleotide sequence (DNA or RNA letters) as a string.
    :return: a string of the same length made up of the characters "1".."5".
    """
    # Single pass + join avoids repeated string concatenation in a loop.
    return "".join(_GENE_CODE.get(ch, "5") for ch in seq)
# Load the model and tokenizer from the Hub. trust_remote_code=True is needed
# because the model/tokenizer classes are provided by code in the repo itself.
model = AutoModel.from_pretrained("Yuanfei/LucaOne", trust_remote_code=True)
tokenizer = AutoTokenizer.from_pretrained("Yuanfei/LucaOne", trust_remote_code=True)

# Test input
seq = "ATCGCGAGTAGCGAGNNNAGCGAT"
seq_type = "gene"  # or "prot"
if seq_type == "gene":
    # Nucleotide letters must be converted to the digit alphabet before tokenizing.
    seq = gene_seq_replace(seq)
print("seq len: %d:" % len(seq))

# Test run
seq_encoded = tokenizer.encode(seq)
# Add a leading batch dimension: shape (1, number_of_tokens).
input_ids = torch.tensor(seq_encoded, dtype=torch.int64).unsqueeze(0)
print("input_ids:")
print(input_ids)

# token_type_ids distinguish the two sequence types: 0 for gene, 1 for protein.
if seq_type == "gene":
    token_type_ids = torch.zeros_like(input_ids)
else:
    token_type_ids = torch.ones_like(input_ids)

encoding = {
    "input_ids": input_ids,
    "token_type_ids": token_type_ids,
}
if seq_type == "prot":
    # Protein inputs are passed through the "_b"-suffixed arguments.
    encoding = {name + "_b": tensor for name, tensor in encoding.items()}

# Build the call arguments in a new dict so `encoding` itself is not mutated.
batch = {**encoding, "return_dict": True}

# Inference only: skip autograd bookkeeping to save memory and time.
with torch.no_grad():
    res = model(**batch)

# The output field mirrors the input naming: "_b" suffix for protein.
if seq_type == "prot":
    embedding = res.hidden_states_b
else:
    embedding = res.hidden_states

print("embedding matrix(include [CLS] and [SEP]):")
print(embedding)
print(embedding.shape)

print("[CLS] embedding vector:")
# First sequence in the batch, first token position.
cls_vec = embedding[0, 0, :]
print(cls_vec)
print(cls_vec.shape)
If loading the tokenizer fails with "ValueError: Tokenizer class AlphabetTokenizer does not exist or is not currently imported.", try running alphabet.py first.
- Downloads last month
- 44