import gradio as gr
import torch
import torch.nn.functional as F
from transformers import AutoModelForSequenceClassification, AutoTokenizer
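
# Example DNA sequence, shown as the textbox placeholder and used as the
# default input when no sequence is submitted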
placeholder = 'ACATGCTAAATTAGTTGGCAATTTTTTCTCAGGTAGCTGGGCACAATTTGGTAGTCCAGTTGAACAAAATCCATTAGCTTCTTTTAGCAAGTCCCCTGGTTTGGGCCCTGCCAGTCCCATTAATACCAACCATTTGTCTGGATTGGCTGCAATTCTTTCCCCACAAGCAACAACCTCTACCAAGATTGCACCGATTGGCAAGGACCCTGGAAGGGCTGCAAATCAGATGTTTTCTAACTCTGGATCAACACAAGGAGCAGCTTTTCAGCATTCTATATCCTTTCCTGAGCAAAATGTAAAGGCAAGTCCTAGGCCTATATCTACTTTTGGTGAATCAAGTTCTAGTGCATCAAGTATTGGAACACTGTCCGGTCCTCAATTTCTTTGGGGAAGCCCAACTCCTTACTCTGAGCATTCAAACACTTCTGCCTGGTCTTCATCTTCGGTGGGGCTTCCATTTACATCTAGTGTCCAAAGGCAGGGTTTCCCATATACTAGTAATCACAGTCCTTTTCTTGGCTCCCACTCTCATCATCATGTTGGATCTGCTCCATCTGGCCTTCCGCTTGATAGGCATTTTAGCTACTTCCCTGAGTCACCTGAAGCTTCTCTCATGAGCCCGGTTGCATTTGGGAATTTAAATCACGGTGATGGGAATTTTATGATGAACAACATTAGTGCTCGTGCATCTGTAGGAGCCGGTGTTGGTCTTTCTGGAAATACCCCTGAAATTAGTTCACCCAATTTCAGAATGATGTCTCTGCCTAGGCATGGTTCCTTGTTCCATGGAAATAGTTTGTATTCTGGACCTGGAGCAACTAACATTGAGGGATTAGCTGAACGTGGACGAAGTAGACGACCTGAAAATGGTGGGAACCAAATTGATAGTAAGAAGCTGTACCAGCTTGATCTTGACAAAATCGTCTGTGGTGAAGATACAAGGACTACTTTAATGATTAAAAACATTCCTAACAAGTAAGAATAACTAAACATCTATCCT'
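
# Candidate models; names starting with "plant" get a tokenizer-type suffix below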
model_names = ['plant-dnabert', 'plant-dnagpt', 'plant-nucleotide-transformer', 'plant-dnagemma',
               'dnabert2', 'nucleotide-transformer-v2-100m', 'agront-1b']
tokenizer_type = "6mer"
model_names = [x + '-' + tokenizer_type if x.startswith("plant") else x for x in model_names]
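
# Class labels for each task; the i-th label corresponds to the i-th output logit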
task_map = {
    "promoter": ["Not promoter", "Core promoter"],
    "conservation": ["Not conserved", "Conserved"],
    "H3K27ac": ["Not H3K27ac", "H3K27ac"],
    "H3K27me3": ["Not H3K27me3", "H3K27me3"],
    "H3K4me3": ["Not H3K4me3", "H3K4me3"],
    "lncRNAs": ["Not lncRNA", "lncRNA"],
    "open_chromatin": ['Not open chromatin', 'Full open chromatin', 'Partial open chromatin'],
}
task_lists = list(task_map.keys())

def inference(seq, model, task):
    """Predict class probabilities for a DNA sequence with the selected model and task."""
    # Fall back to the example sequence when the textbox is empty
    if not seq:
        gr.Warning("No sequence provided, using the default sequence.")
        seq = placeholder

    # Load the fine-tuned model and matching tokenizer from the Hugging Face Hub
    model_name = f'zhangtaolab/{model}-{task}'
    model = AutoModelForSequenceClassification.from_pretrained(model_name, ignore_mismatched_sizes=True)
    tokenizer = AutoTokenizer.from_pretrained(model_name)

    # Tokenize the sequence and run a forward pass without gradient tracking
    inputs = tokenizer(seq, return_tensors='pt', padding=True, truncation=True, max_length=512)
    with torch.no_grad():
        outputs = model(**inputs)
    probabilities = F.softmax(outputs.logits, dim=-1).tolist()[0]

    # Map each label to its probability for display in the gr.Label component
    labels = task_map[task]
    result = {labels[i]: probabilities[i] for i in range(len(labels))}
    return result
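

# Build the Gradio interface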
with gr.Blocks() as demo:
    gr.HTML(
        """
        <h1 style="text-align: center;">Prediction of sequence conservation in plants with LLMs</h1>
        """
    )
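    # The task is fixed to 'conservation' for this demo; only the model can be changed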
    with gr.Row():
        drop1 = gr.Dropdown(choices=task_lists,
                            label="Selected Task",
                            interactive=False,
                            value='conservation')
        drop2 = gr.Dropdown(choices=model_names,
                            label="Select Model",
                            interactive=True,
                            value=model_names[0])
    seq_input = gr.Textbox(label="Input Sequence", lines=6, placeholder=placeholder)
    with gr.Row():
        predict_btn = gr.Button("Predict", variant="primary")
        clear_btn = gr.Button("Clear")
    output = gr.Label(label="Prediction result")

    # Predict runs inference with the selected model and task; Clear resets both fields
    predict_btn.click(inference, inputs=[seq_input, drop2, drop1], outputs=output)
    clear_btn.click(lambda: ("", None), inputs=[], outputs=[seq_input, output])


demo.launch()