In [1]:
from transforna import load,predict_transforna_all_models,predict_transforna,fold_sequences
# ---- Input locations -------------------------------------------------------
# Folder holding the trained models, plus the two sequence tables and the JSON
# lookup that later cells use to map 'Net-Label' values to a major class.
models_path = '/nfs/home/yat_ldap/VS_Projects/TransfoRNA-Framework/models/tcga/'
lc_path = '/media/ftp_share/hbdx/annotation/feature_annotation/ANNOTATION/HBDxBase_annotation/TransfoRNA/compare_binning_strategies/v05/2024-04-19__230126_LC_DI_HB_GEL_v23.01.00/sRNA_anno_aggregated_on_seq.csv'
tcga_path = '/media/ftp_share/hbdx/data_for_upload/TransfoRNA/data/TCGA__ngs__miRNA_log2RPM-24.04.0__var.csv'
mapping_dict_path = '/media/ftp_share/hbdx/data_for_upload/TransfoRNA//data/subclass_to_annotation.json'

# ---- Sequence tables -------------------------------------------------------
tcga_df = load(tcga_path)
lc_df = load(lc_path)

# Keep only LC sequences that are at most 30 characters long.
lc_df = lc_df.loc[lc_df['sequence'].str.len() <= 30]

# Combined input for prediction: LC sequences first, TCGA sequences after.
all_seqs = [*lc_df['sequence'].tolist(), *tcga_df['sequence'].tolist()]

# ---- Label lookup ----------------------------------------------------------
mapping_dict = load(mapping_dict_path)
 

 from .autonotebook import tqdm as notebook_tqdm


In [None]:
# Run every model on the combined sequence list and store the result on disk,
# so the table can be reloaded later without predicting again.
predictions = predict_transforna_all_models(
    all_seqs,
    trained_on='full',
    path_to_models=models_path,
)
predictions.to_csv('predictions_lc_tcga.csv', index=False)

In [2]:
# Reload the predictions table that was previously written to
# 'predictions_lc_tcga.csv' instead of recomputing it.
predictions = load('predictions_lc_tcga.csv')

In [None]:
# Build one UMAP table per individual model, keyed by model name.
umaps = {}
models = predictions['Model'].unique()
for model in models:
    # Skip the 'Ensemble' entry: only the individual models are passed on to
    # predict_transforna.
    if model == 'Ensemble':
        continue
    # Rows of the predictions table that belong to this model.
    model_predictions = predictions[predictions['Model'] == model]
    # Re-run the model on its sequences with umap_flag=True to obtain the UMAP
    # table (later cells read 'UMAP1', 'UMAP2', 'Net-Label' and 'Sequence').
    umap_df = predict_transforna(
        model_predictions['Sequence'].tolist(),
        model=model,
        trained_on='full',
        path_to_models=models_path,
        umap_flag=True,
    )
    umaps[model] = umap_df

In [None]:

import os

import plotly.express as px
import numpy as np

# Make sure the output folder exists before any figure is written into it.
os.makedirs('lc_figures', exist_ok=True)

# The major classes found in the 'Seq' model's labels define the colour legend.
mcs = np.unique(umaps['Seq']['Net-Label'].map(mapping_dict))
#filter out the classes that contain ;
mcs = [mc for mc in mcs if ';' not in mc]
# Use the same extended palette as the full-range UMAP cell: with only the 10
# 'Plotly' colours, zip() leaves every class after the tenth unmapped, so it
# falls back to plotly's default colour cycle and can repeat an assigned colour.
colors = px.colors.qualitative.Plotly + px.colors.qualitative.Light24
color_mapping = dict(zip(mcs, colors))

for model, umap_df in umaps.items():
    # Adds the column to the frame stored in `umaps` (re-running is harmless).
    umap_df['Major Class'] = umap_df['Net-Label'].map(mapping_dict)
    #remove rows with Major Class containing ;
    umap_df = umap_df[~umap_df['Major Class'].str.contains(';')]
    fig = px.scatter(
        umap_df,
        x='UMAP1',
        y='UMAP2',
        color='Major Class',
        hover_data=['Sequence'],
        title=model,
        width=800,
        height=800,
        color_discrete_map=color_mapping,
    )
    fig.update_traces(marker=dict(size=1))
    #transparent plot background (alpha 0)
    fig.update_layout(plot_bgcolor='rgba(0,0,0,0)')
    #only show UMAP1 from 4.3 to 11
    fig.update_xaxes(range=[4.3, 11])
    #and UMAP2 from -2.3 to 6.8
    fig.update_yaxes(range=[-2.3, 6.8])
    #fig.show()
    fig.write_image(f'lc_figures/lc_tcga_umap_selected_{model}.png')
    fig.write_image(f'lc_figures/lc_tcga_umap_selected_{model}.svg')


In [None]:
import os

import plotly.express as px
import numpy as np

# Make sure the output folder exists before any figure is written into it.
os.makedirs('lc_figures', exist_ok=True)

# The major classes found in the 'Seq' model's labels define the colour legend.
mcs = np.unique(umaps['Seq']['Net-Label'].map(mapping_dict))
#filter out the classes that contain ;
mcs = [mc for mc in mcs if ';' not in mc]
# Two qualitative palettes chained together so more classes get their own colour.
colors = px.colors.qualitative.Plotly + px.colors.qualitative.Light24
color_mapping = dict(zip(mcs, colors))

for model, umap_df in umaps.items():
    # Adds the column to the frame stored in `umaps` (re-running is harmless).
    umap_df['Major Class'] = umap_df['Net-Label'].map(mapping_dict)
    #remove rows with Major Class containing ;
    umap_df = umap_df[~umap_df['Major Class'].str.contains(';')]
    fig = px.scatter(
        umap_df,
        x='UMAP1',
        y='UMAP2',
        color='Major Class',
        hover_data=['Sequence'],
        title=model,
        width=800,
        height=800,
        color_discrete_map=color_mapping,
    )
    fig.update_traces(marker=dict(size=1))
    #transparent plot background (alpha 0)
    fig.update_layout(plot_bgcolor='rgba(0,0,0,0)')
    #fig.show()
    fig.write_image(f'lc_figures/lc_tcga_umap_{model}.png')
    fig.write_image(f'lc_figures/lc_tcga_umap_{model}.svg')


In [None]:
#plot the 'Seq' UMAP restricted to a circular region, using px.scatter
import plotly.express as px
import numpy as np

# The model is named explicitly: this cell always plots umaps['Seq'], so the
# title must not depend on a `model` variable left over from an earlier loop.
selected_model = 'Seq'
# Circle that delimits the displayed region, in UMAP coordinates.
circle_center_x, circle_center_y = 7.9, 2.5
circle_radius = 4.3

mcs = np.unique(umaps[selected_model]['Net-Label'].map(mapping_dict))
#filter out the classes that contain ;
mcs = [mc for mc in mcs if ';' not in mc]
# Same extended palette as the full-range UMAP cell, so that classes after the
# tenth also receive an explicit colour instead of plotly's default cycle.
colors = px.colors.qualitative.Plotly + px.colors.qualitative.Light24
color_mapping = dict(zip(mcs, colors))

umap_df = umaps[selected_model]
# Adds the column to the frame stored in `umaps` (re-running is harmless).
umap_df['Major Class'] = umap_df['Net-Label'].map(mapping_dict)
# Work on a copy so the extra 'distance' column does not end up in `umaps`.
umap_df_copy = umap_df.copy()
#keep the points whose Euclidean distance to the circle centre is within the radius
umap_df_copy['distance'] = np.sqrt(
    (umap_df_copy['UMAP1'] - circle_center_x) ** 2
    + (umap_df_copy['UMAP2'] - circle_center_y) ** 2
)
umap_df_copy = umap_df_copy[umap_df_copy['distance'] <= circle_radius]
#remove rows with Major Class containing ;
umap_df_copy = umap_df_copy[~umap_df_copy['Major Class'].str.contains(';')]
fig = px.scatter(
    umap_df_copy,
    x='UMAP1',
    y='UMAP2',
    color='Major Class',
    hover_data=['Sequence'],
    title=selected_model,
    width=800,
    height=800,
    color_discrete_map=color_mapping,
)
fig.update_traces(marker=dict(size=1))
#transparent plot background (alpha 0)
fig.update_layout(plot_bgcolor='rgba(0,0,0,0)')
fig.show()
# Export left disabled: these names match the files the rectangular-zoom cell
# writes for the same model, so enabling them would overwrite those figures.
#fig.write_image(f'lc_figures/lc_tcga_umap_selected_{selected_model}.png')
#fig.write_image(f'lc_figures/lc_tcga_umap_selected_{selected_model}.svg')


In [None]:
#compute the secondary-structure ratio used to colour the 'Seq-Struct' UMAP
# Fold the sequences of the 'Seq-Struct' UMAP table itself rather than relying
# on `model_predictions` left over from whichever loop iteration ran last; the
# ratios then follow the rows of the frame they are plotted against.
# NOTE(review): assumes fold_sequences returns one row per input sequence, in
# input order - confirm.
sec_struct = fold_sequences(umaps['Seq-Struct']['Sequence'].tolist())['structure_37']
#sec struct ratio is calculated as the number of non '.' characters divided by the length of the sequence
structure_length = sec_struct.str.len()
unpaired_count = sec_struct.str.count(r'\.')  # pattern is a regex, so the dot is escaped
sec_struct_ratio = (structure_length - unpaired_count) / structure_length


In [40]:
import os

import plotly.express as px

# The model is named explicitly: this cell always plots umaps['Seq-Struct'], so
# the title and file names must not depend on a `model` variable left over from
# an earlier loop.
struct_model = 'Seq-Struct'

# Make sure the output folder exists before any figure is written into it.
os.makedirs('lc_figures', exist_ok=True)

umap_df = umaps[struct_model]
# Colour each point by its secondary-structure ratio (computed in the previous cell).
fig = px.scatter(
    umap_df,
    x='UMAP1',
    y='UMAP2',
    color=sec_struct_ratio,
    hover_data=['Sequence'],
    title=struct_model,
    width=800,
    height=800,
    color_continuous_scale='Viridis',
)
fig.update_traces(marker=dict(size=1))
#transparent plot background (alpha 0)
fig.update_layout(plot_bgcolor='rgba(0,0,0,0)')
#save
fig.write_image(f'lc_figures/lc_tcga_umap_{struct_model}_dot_bracket.png')
fig.write_image(f'lc_figures/lc_tcga_umap_{struct_model}_dot_bracket.svg')