Update README.md
## **Usage**
```python
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM

# Load the tokenizer and model
tokenizer = AutoTokenizer.from_pretrained("metagene-ai/METAGENE-1")
model = AutoModelForCausalLM.from_pretrained("metagene-ai/METAGENE-1", torch_dtype=torch.bfloat16)

# Example input: Hexamita inflata 5.8S ribosomal RNA gene sequence
input_sequence = (
    "TCACCGTTCTACAATCCCAAGCTGGAGTCAAGCTCAACAGGGTCTTCTTGCCCCGCTGAGGGTTACACTCGCCCGTTCCCGAGTCTGTGGTTTCGCGAAGATATGACCAGGGACAGTAAGAACC"
)

# Tokenize the input sequence and truncate to the first 12 tokens
input_tokens = tokenizer.encode(input_sequence, return_tensors="pt", add_special_tokens=False)[..., :12]

# Generate output from the model with a max sequence length of 32 tokens
generated_tokens = model.generate(input_tokens, max_length=32)

# Decode the generated output and clean up the result
generated_sequence = tokenizer.decode(generated_tokens[0], skip_special_tokens=True)
generated_sequence = generated_sequence.replace(" ", "").replace("_", "")

# Print the original input and the model's output
print(f"📄 Input Sequence:\n{input_sequence}")
print(f"🔬 Generated Sequence:\n{generated_sequence}")
```
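If you want to see how the byte-pair tokenizer segments a read before generation, the short sketch below may help. It is not part of the original example, and the exact token strings depend on METAGENE-1's BPE vocabulary, so the printed output will vary.

```python
# Minimal sketch (not from the original README): inspect how the BPE tokenizer
# splits a nucleotide fragment. Token strings depend on the model's vocabulary.
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("metagene-ai/METAGENE-1")

fragment = "TCACCGTTCTACAATCCCAAGCTGG"  # first 25 bases of the example input above
token_ids = tokenizer.encode(fragment, add_special_tokens=False)
tokens = tokenizer.convert_ids_to_tokens(token_ids)

print(f"Token IDs: {token_ids}")
print(f"Tokens:    {tokens}")
```

Checking the token count this way can be useful when choosing truncation lengths or `max_length` values for generation.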

## **Benchmark Performance**