In [1]:

import pandas as pd
# Per-model summary of the 'B. Acc' column of summary_pd.tsv at both label
# granularities, stored as "<mean>+/- (<std>)" strings scaled by 100.
# NOTE(review): presumably one row per training run/split - confirm.
models = ['Baseline','Seq','Seq-Seq','Seq-Struct','Seq-Rev']
models_path = '/nfs/home/yat_ldap/VS_Projects/TransfoRNA-Framework/models/tcga/TransfoRNA_ID'
scores = {'major_class':{},'sub_class':{}}
for model1 in models:
    for level in ('major_class','sub_class'):
        summary_pd = pd.read_csv(f'{models_path}/{level}/{model1}/summary_pd.tsv',sep='\t')
        mean_pct = summary_pd['B. Acc'].mean()*100
        std_pct = summary_pd['B. Acc'].std()*100
        scores[level][model1] = '{}+/- ({})'.format(str(mean_pct),str(std_pct))

In [2]:
# Display the per-model sub-class score strings built in the cell above
# (mean and std of the 'B. Acc' column, each multiplied by 100).
scores['sub_class']

{'Baseline': '52.83789870060305+/- (1.0961119898709506)',
 'Seq': '97.70018230805728+/- (0.3819207447704567)',
 'Seq-Seq': '95.65091330992355+/- (0.4963151975035616)',
 'Seq-Struct': '97.71071590680333+/- (0.6173598637101496)',
 'Seq-Rev': '97.51224133899979+/- (0.3418133671042992)'}

In [3]:

import json
import pandas as pd
# Mapping loaded from subclass_to_annotation.json; it is applied below to
# relabel the sub-class rows/columns of each confusion matrix.
with open('/media/ftp_share/hbdx/data_for_upload/TransfoRNA//data/subclass_to_annotation.json') as f:
    mapping_dict = json.load(f)

# For every sub-class model: collapse each saved confusion matrix to the mapped
# labels and store diagonal-sum / total-sum as "<mean>+/- (<std>)" scaled by 100.
# NOTE(review): diagonal/total is plain accuracy when the matrices hold raw
# counts; whether it matches *balanced* accuracy depends on how the saved
# matrices were normalised - confirm before quoting it as such.
b_acc_sc_to_mc = {}
for model1 in models:
    b_acc = []
    for idx in range(5):  # reads confusion_matrix_0.csv ... confusion_matrix_4.csv
        confusion_matrix = pd.read_csv(models_path+'/sub_class/'+model1+f'/embedds/confusion_matrix_{idx}.csv',sep=',',index_col=0)
        confusion_matrix.index = confusion_matrix.index.map(mapping_dict)
        confusion_matrix.columns = confusion_matrix.columns.map(mapping_dict)
        # Sum rows, then columns, that share a mapped label. Transposing replaces
        # groupby(..., axis=1), which is deprecated in pandas >= 2.1.
        confusion_matrix = confusion_matrix.groupby(level=0).sum().T.groupby(level=0).sum().T
        # Put both axes in the same label order so the positional diagonal pairs
        # each label with itself (a no-op when rows and columns already match).
        major_classes = confusion_matrix.index.union(confusion_matrix.columns)
        confusion_matrix = confusion_matrix.reindex(index=major_classes,columns=major_classes,fill_value=0)
        b_acc.append(confusion_matrix.values.diagonal().sum()/confusion_matrix.values.sum())
    per_matrix_scores = pd.Series(b_acc)
    b_acc_sc_to_mc[model1] = str(per_matrix_scores.mean()*100)+'+/-'+' ('+str(per_matrix_scores.std()*100)+')'


In [4]:
# Display the per-model scores obtained after collapsing the sub-class
# confusion matrices through mapping_dict (computed in the cell above).
b_acc_sc_to_mc

{'Baseline': '89.6182558114013+/- (0.6372156071358975)',
 'Seq': '99.66714304286457+/- (0.1404591049684126)',
 'Seq-Seq': '99.40702944026852+/- (0.18268320317601783)',
 'Seq-Struct': '99.77114728744993+/- (0.06976258667467564)',
 'Seq-Rev': '99.70878801385821+/- (0.11954774341354062)'}

In [5]:

import plotly.express as px
# For each model, the column label with the highest value among the 'Logits'
# columns of no_annotation_embedds.tsv, indexed by RNA sequence.
no_annotation_predictions = {}
for model1 in models:
    embedds_path = models_path+'/sub_class/'+model1+'/embedds/no_annotation_embedds.tsv'
    # the file has a two-row header, so columns load as a MultiIndex
    no_annot_embedds = pd.read_csv(embedds_path,sep='\t',header=[0,1],index_col=[0])
    no_annot_embedds = no_annot_embedds.set_index([('RNA Sequences','0')])
    no_annot_embedds.index.name = 'RNA Sequences'
    no_annotation_predictions[model1] = no_annot_embedds['Logits'].idxmax(axis=1)


In [None]:
from transforna.src.utils.tcga_post_analysis_utils import correct_labels
import pandas as pd
# Pairwise agreement between models on the no-annotation sequences: entry
# (row, col) is the fraction of sequences whose (corrected) row-model
# prediction equals the column-model prediction.
# dtype=float makes the frame numeric from the start; without it the frame is
# object-dtype, which is not a numeric image for px.imshow.
correlation = pd.DataFrame(index=models,columns=models,dtype=float)
for model1 in models:
    for model2 in models:
        # correct_labels is a project helper; presumably it reconciles model1's
        # labels with model2's using mapping_dict - confirm in its source.
        model1_predictions = correct_labels(no_annotation_predictions[model1],no_annotation_predictions[model2],mapping_dict)
        is_equal = model1_predictions == no_annotation_predictions[model2].values
        correlation.loc[model1,model2] = is_equal.sum()/len(is_equal)

font_size = 20
fig = px.imshow(correlation, color_continuous_scale='Blues')

# Write each value into its cell: white text on the diagonal, black elsewhere.
for i in range(len(models)):
    for j in range(len(models)):
        font = dict(color='white' if i == j else 'black', size=font_size)
        fig.add_annotation(
            x=j, y=i,
            text=str(round(correlation.iloc[i,j],2)),
            showarrow=False,
            font=font
        )

#set figure size: width and height
fig.update_layout(width=800, height=800)

fig.update_layout(title='Correlation between models for each sub_class model')
#set x and y axis to Models
fig.update_xaxes(title_text='Models', tickfont=dict(size=font_size))
fig.update_yaxes(title_text='Models', tickfont=dict(size=font_size))
fig.show()

#save (same two files as before: .png then .svg)
for extension in ('png','svg'):
    fig.write_image('/nfs/home/yat_ldap/VS_Projects/TransfoRNA-Framework/transforna/bin/figures/correlation_id_models_sub_class.'+extension)

In [7]:
#create umap for every model from embedds folder
models_path = '/nfs/home/yat_ldap/VS_Projects/TransfoRNA-Framework/models/tcga/TransfoRNA_ID'


def _read_train_embedds(level, model_name):
    """Read <models_path>/<level>/<model_name>/embedds/train_embedds.tsv (two header rows)."""
    return pd.read_csv(f'{models_path}/{level}/{model_name}/embedds/train_embedds.tsv',sep='\t',header=[0,1],index_col=[0])


# Embedding matrices and labels per model, for the sub-class (sc_*) and
# major-class (mc_*) models; sc_to_mc_labels holds the sub-class labels
# passed through mapping_dict.
sc_embedds, mc_embedds = {}, {}
sc_labels, mc_labels = {}, {}
sc_to_mc_labels = {}
for model in models:
    #sub class
    df = _read_train_embedds('sub_class', model)
    sc_embedds[model] = df['RNA Embedds'].values
    sc_labels[model] = df['Labels']['0']
    sc_to_mc_labels[model] = sc_labels[model].map(mapping_dict).values

    #major class
    df = _read_train_embedds('major_class', model)
    mc_embedds[model] = df['RNA Embedds'].values
    mc_labels[model] = df['Labels']['0']

In [8]:
import umap
#compute umap coordinates
# UMAP is stochastic: without a fixed random_state every run yields different
# coordinates, so the saved figures could not be regenerated. Fixing the seed
# makes the projection repeatable (umap-learn then runs single-threaded, so
# this cell may take longer).
UMAP_SEED = 42
umap_kwargs = dict(n_neighbors=5, min_dist=0.3, n_components=2, metric='euclidean', random_state=UMAP_SEED)

sc_umap_coords = {}
mc_umap_coords = {}
for model in models:
    # a fresh reducer per embedding matrix: sub-class model, then major-class model
    sc_umap_coords[model] = umap.UMAP(**umap_kwargs).fit_transform(sc_embedds[model])
    mc_umap_coords[model] = umap.UMAP(**umap_kwargs).fit_transform(mc_embedds[model])

In [None]:
#plot umap
import plotly.express as px
import numpy as np

# One colour per major class, taken from the first model's mapped labels and
# shared by every figure below.
mcs = np.unique(sc_to_mc_labels[models[0]])
colors = px.colors.qualitative.Plotly
# zip stops at the shorter input: classes beyond len(colors) get no explicit colour
color_mapping = dict(zip(mcs,colors))
for model in models:
    # Sub-class model embeddings, coloured by the major class of each point.
    # Fix: the hover entries were swapped - sc_to_mc_labels holds the major
    # classes and sc_labels the sub-classes.
    fig = px.scatter(
        x=sc_umap_coords[model][:,0],
        y=sc_umap_coords[model][:,1],
        color=sc_to_mc_labels[model],
        labels={'color':'Major Class'},
        title=model, width=800, height=800,
        hover_data={'Major Class':sc_to_mc_labels[model],'Sub Class':sc_labels[model]},
        color_discrete_map=color_mapping)

    fig.update_traces(marker=dict(size=1))
    #transparent plot background (alpha = 0)
    fig.update_layout(plot_bgcolor='rgba(0,0,0,0)')

    fig.write_image(models_path+'/sub_class/'+model+'/figures/sc_umap.svg')
    fig.write_image(models_path+'/sub_class/'+model+'/figures/sc_umap.png')
    fig.show()

    #plot umap for major class
    fig = px.scatter(
        x=mc_umap_coords[model][:,0],
        y=mc_umap_coords[model][:,1],
        color=mc_labels[model],
        labels={'color':'Major Class'},
        title=model, width=800, height=800,
        hover_data={'Major Class':mc_labels[model]},
        color_discrete_map=color_mapping)
    fig.update_traces(marker=dict(size=1))
    #transparent plot background (alpha = 0)
    fig.update_layout(plot_bgcolor='rgba(0,0,0,0)')

    fig.write_image(models_path+'/major_class/'+model+'/figures/mc_umap.svg')
    fig.write_image(models_path+'/major_class/'+model+'/figures/mc_umap.png')
    fig.show()

In [None]:
from transforna import fold_sequences
# Seq-Struct major-class UMAP, coloured by how much of each folded structure
# string consists of non-'.' characters.
df = pd.read_csv(models_path+'/major_class/Seq-Struct/embedds/train_embedds.tsv',sep='\t',header=[0,1],index_col=[0])
sec_struct = fold_sequences(df['RNA Sequences']['0'])['structure_37']


def _non_dot_fraction(structure):
    """Number of non-'.' characters in `structure` divided by its length."""
    return (len(structure)-structure.count('.'))/len(structure)


sec_struct_ratio = sec_struct.apply(_non_dot_fraction)
fig = px.scatter(
    x=mc_umap_coords['Seq-Struct'][:,0],
    y=mc_umap_coords['Seq-Struct'][:,1],
    color=sec_struct_ratio,
    labels={'color':'Base Pairing'},
    title='Seq-Struct',
    width=800,
    height=800,
    hover_data={'Major Class':mc_labels['Seq-Struct']},
    color_continuous_scale='Viridis',
    range_color=[0,1],
)

fig.update_traces(marker=dict(size=3))
# rgba alpha of 0 gives a transparent plot background
fig.update_layout(plot_bgcolor='rgba(0,0,0,0)')
fig.show()
fig.write_image(models_path+'/major_class/Seq-Struct/figures/mc_umap_sec_struct.svg')


In [None]:

from transforna import fold_sequences
# Seq-Struct sub-class UMAP, coloured by the share of non-'.' characters in
# each folded structure string.
df = pd.read_csv(models_path+'/sub_class/Seq-Struct/embedds/train_embedds.tsv',sep='\t',header=[0,1],index_col=[0])
sec_struct = fold_sequences(df['RNA Sequences']['0'])['structure_37']
#sec struct ratio is calculated as the number of non '.' characters divided by the length of the sequence
sec_struct_ratio = sec_struct.apply(lambda x: (len(x)-x.count('.'))/len(x))
# Fix: the hover labels must come from the same sub-class embedding file as the
# plotted coordinates. mc_labels is read from the major-class model's file, so
# it is not guaranteed to line up row-for-row with sc_umap_coords;
# sc_to_mc_labels is derived from the sub-class file itself.
fig = px.scatter(
    x=sc_umap_coords['Seq-Struct'][:,0],
    y=sc_umap_coords['Seq-Struct'][:,1],
    color=sec_struct_ratio,
    labels={'color':'Base Pairing'},
    title='Seq-Struct', width=800, height=800,
    hover_data={'Major Class':sc_to_mc_labels['Seq-Struct']},
    color_continuous_scale='Viridis',range_color=[0,1])

fig.update_traces(marker=dict(size=3))
#transparent plot background (alpha = 0)
fig.update_layout(plot_bgcolor='rgba(0,0,0,0)')
fig.show()
fig.write_image(models_path+'/sub_class/Seq-Struct/figures/sc_umap_sec_struct.svg')

In [11]:
from transforna import Results_Handler,get_closest_ngbr_per_split

# Splits handed to Results_Handler, and the splits whose nearest-neighbour
# distances are collected for plotting.
# NOTE(review): 'random', 'recombined' and 'artificial_affix' are not in
# `splits`; presumably append_loco_variants() makes them available - confirm.
splits = ['train','valid','test','ood','artificial','no_annotation']
splits_to_plot = ['test','ood','random','recombined','artificial_affix']
renaming_dict= {'test':'ID (test)','ood':'Rare sub-classes','random':'Random','artificial_affix':'Putative 5\'-adapter prefixes','recombined':'Recombined'}

# Gather one frame per (model, split) and concatenate once at the end;
# concatenating onto a growing frame inside the loop re-copies all earlier
# rows on every iteration.
lev_dist_frames = []
for model in models:
    results = Results_Handler(models_path+f'/sub_class/{model}/embedds',splits=splits,read_dataset=True)
    results.append_loco_variants()
    results.get_knn_model()

    #compute levenstein distance per split
    for split in splits_to_plot:
        split_seqs,split_labels,top_n_seqs,top_n_labels,distances,lev_dist = get_closest_ngbr_per_split(results,split)
        #create df from split and levenstein distance
        lev_dist_split_df = pd.DataFrame({'split':split,'lev_dist':lev_dist,'seqs':split_seqs,'labels':split_labels,'top_n_seqs':top_n_seqs,'top_n_labels':top_n_labels})
        #rename the split to its display name
        lev_dist_split_df['split'] = lev_dist_split_df['split'].map(renaming_dict)
        lev_dist_split_df['model'] = model
        lev_dist_frames.append(lev_dist_split_df)

# pd.concat raises on an empty list, so fall back to an empty frame
lev_dist_df = pd.concat(lev_dist_frames,axis=0) if lev_dist_frames else pd.DataFrame()



In [None]:
#plot the distribution of lev_dist for each split for each model
# Hard-coded per-model values; where they were computed is not shown in this cell.
model_thresholds = {'Baseline':0.267,'Seq':0.246,'Seq-Seq':0.272,'Seq-Struct': 0.242,'Seq-Rev':0.237}
model_aucs = {'Baseline':0.76,'Seq':0.97,'Seq-Seq':0.96,'Seq-Struct': 0.97,'Seq-Rev':0.97}  # not used in this cell
import seaborn as sns
import matplotlib.pyplot as plt
# Configure the theme in ONE call. sns.set is an alias of sns.set_theme and
# every call re-applies its default style, so the previous chain of
# set_theme(...) / set(...) / set(...) discarded the "whitegrid" request.
sns.set_theme(style="whitegrid", font_scale=1.5, rc={'figure.figsize':(15,10)})
ax = sns.boxplot(x="model", y="lev_dist", hue="split", data=lev_dist_df, palette="Set3",order=models,showfliers = True)
#add title
ax.set_facecolor('None')
plt.title('Levenshtein Distance Distribution per Model on ID')
ax.set(xlabel='Model', ylabel='Normalized Levenshtein Distance')
#legend background should transparent
ax.legend(loc='upper left', bbox_to_anchor=(1.05, 1), borderaxespad=0.,facecolor=None,framealpha=0.0)
# add horizontal lines for thresholds for each model while making sure the line is within the boxplot
# xmin/xmax of axhline are fractions of the axes width; each model occupies an
# equal share, so derive the share from len(models) rather than assuming five.
slot_width = 1/len(models)
for slot, model in enumerate(models):
    thresh = model_thresholds[model]
    plt.axhline(y=thresh, color='g', linestyle='--',xmin=slot*slot_width,xmax=(slot+1)*slot_width)

