#%% #read all files ending with dist_df in bin/lc_files/ import pandas as pd import glob from plotly import graph_objects as go from transforna import load,predict_transforna all_df = pd.DataFrame() files = glob.glob('/nfs/home/yat_ldap/VS_Projects/TransfoRNA-Framework/transforna/bin/lc_files/*lev_dist_df.csv') for file in files: df = pd.read_csv(file) all_df = pd.concat([all_df,df]) all_df = all_df.drop(columns=['Unnamed: 0']) all_df.loc[all_df.split.isnull(),'split'] = 'NA' #%% #draw a box plot for the Ensemble model for each of the splits using seaborn ensemble_df = all_df[all_df.Model == 'Ensemble'].reset_index(drop=True) #remove other_affixes ensemble_df = ensemble_df[ensemble_df.split != 'other_affixes'].reset_index(drop=True) #rename ensemble_df['split'] = ensemble_df['split'].replace({'5\'A-affixes':'Putative 5’-adapter prefixes','Fused':'Recombined'}) ensemble_df['split'] = ensemble_df['split'].replace({'Relaxed-miRNA':'Isomirs'}) #%% #plot the boxplot using seaborn import seaborn as sns import matplotlib.pyplot as plt sns.set_theme(style="whitegrid") sns.set(rc={'figure.figsize':(15,10)}) sns.set(font_scale=1.5) order = ['LC-familiar','LC-novel','Random','Putative 5’-adapter prefixes','Recombined','NA','LOCO','Isomirs'] ax = sns.boxplot(x="split", y="NLD", data=ensemble_df, palette="Set3",order=order,showfliers = True) #add Novelty Threshold line ax.axhline(y=ensemble_df['Novelty Threshold'].mean(), color='g', linestyle='--',xmin=0,xmax=1) #annotate mean of Novelty Threshold ax.annotate('NLD threshold', xy=(1.5, ensemble_df['Novelty Threshold'].mean()), xytext=(1.5, ensemble_df['Novelty Threshold'].mean()-0.07), arrowprops=dict(facecolor='black', shrink=0.05)) #rename ax.set_xticklabels(['LC-Familiar','LC-Novel','Random','5’-adapter artefacts','Recombined','NA','LOCO','IsomiRs']) #add title ax.set_facecolor('None') plt.title('Levenshtein Distance Distribution per Split on LC') ax.set(xlabel='Split', ylabel='Normalized Levenshtein Distance') #save legend plt.legend(loc='upper left', bbox_to_anchor=(1.05, 1), borderaxespad=0.,facecolor=None,framealpha=0.0) plt.savefig('/nfs/home/yat_ldap/VS_Projects/TransfoRNA-Framework/transforna/bin/lc_figures/lev_dist_no_out_boxplot.svg',dpi=400) #tilt x axis labels plt.xticks(rotation=-22.5) #save svg plt.savefig('/nfs/home/yat_ldap/VS_Projects/TransfoRNA-Framework/transforna/bin/lc_figures/lev_dist_seaboarn_boxplot.svg',dpi=1000) ##save png plt.savefig('/nfs/home/yat_ldap/VS_Projects/TransfoRNA-Framework/transforna/bin/lc_figures/lev_dist_seaboarn_boxplot.png',dpi=1000) #%% bars = [r for r in ax.get_children()] colors = [] for c in bars[:-1]: try: colors.append(c.get_facecolor()) except: pass isomir_color = colors[len(order)-1] isomir_color = [255*x for x in isomir_color] #covert to rgb('r','g','b','a') isomir_color = 'rgb(%s,%s,%s,%s)'%(isomir_color[0],isomir_color[1],isomir_color[2],isomir_color[3]) #%% relaxed_mirna_df = all_df[all_df.split == 'Relaxed-miRNA'] models = relaxed_mirna_df.Model.unique() percentage_dict = {} for model in models: model_df = relaxed_mirna_df[relaxed_mirna_df.Model == model] #compute the % of sequences with NLD < Novelty Threshold for each model percentage_dict[model] = len(model_df[model_df['NLD'] > model_df['Novelty Threshold']])/len(model_df) percentage_dict[model]*=100 fig = go.Figure() for model in ['Baseline','Seq','Seq-Seq','Seq-Struct','Seq-Rev','Ensemble']: fig.add_trace(go.Bar(x=[model],y=[percentage_dict[model]],name=model,marker_color=isomir_color)) #add percentage on top of each bar fig.add_annotation(x=model,y=percentage_dict[model]+2,text='%s%%'%(round(percentage_dict[model],2)),showarrow=False) #increase size of annotation fig.update_annotations(dict(font_size=13)) #add title in the center fig.update_layout(title='Percentage of Isomirs considered novel per model') fig.update_layout(xaxis_tickangle=+22.5) fig.update_layout(showlegend=False) #make transparent background fig.update_layout(plot_bgcolor='rgba(0,0,0,0)') #y axis label fig.update_yaxes(title_text='Percentage of Novel Sequences') #save svg fig.show() #save svg #fig.write_image('relaxed_mirna_novel_perc_lc_barplot.svg') #%% #here we explore the false familiar of the ood lc set ood_df = pd.read_csv('/nfs/home/yat_ldap/VS_Projects/TransfoRNA/bin/lc_files/LC-novel_lev_dist_df.csv') mapping_dict_path = '/media/ftp_share/hbdx/annotation/feature_annotation/ANNOTATION/HBDxBase_annotation/TransfoRNA/compare_binning_strategies/v02/subclass_to_annotation.json' mapping_dict = load(mapping_dict_path) LC_path = '/media/ftp_share/hbdx/annotation/feature_annotation/ANNOTATION/HBDxBase_annotation/TransfoRNA/compare_binning_strategies/v02/LC__ngs__DI_HB_GEL-23.1.2.h5ad' ad = load(LC_path) #%% model = 'Ensemble' ood_seqs = ood_df[(ood_df.Model == model).values * (ood_df['Is Familiar?'] == True).values].Sequence.tolist() ood_predicted_labels = ood_df[(ood_df.Model == model).values * (ood_df['Is Familiar?'] == True).values].Labels.tolist() ood_actual_labels = ad.var.loc[ood_seqs]['subclass_name'].values.tolist() from transforna import correct_labels ood_predicted_labels = correct_labels(ood_predicted_labels,ood_actual_labels,mapping_dict) #get indices where ood_predicted_labels == ood_actual_labels correct_indices = [i for i, x in enumerate(ood_predicted_labels) if x != ood_actual_labels[i]] #remove the indices from ood_seqs, ood_predicted_labels, ood_actual_labels ood_seqs = [ood_seqs[i] for i in correct_indices] ood_predicted_labels = [ood_predicted_labels[i] for i in correct_indices] ood_actual_labels = [ood_actual_labels[i] for i in correct_indices] #get the major class of the actual labels ood_actual_major_class = [mapping_dict[label] if label in mapping_dict else 'None' for label in ood_actual_labels] ood_predicted_major_class = [mapping_dict[label] if label in mapping_dict else 'None' for label in ood_predicted_labels ] #get frequencies of each major class from collections import Counter ood_actual_major_class_freq = Counter(ood_actual_major_class) ood_predicted_major_class_freq = Counter(ood_predicted_major_class) # %% import plotly.express as px major_classes = list(ood_actual_major_class_freq.keys()) ood_seqs_len = [len(seq) for seq in ood_seqs] ood_seqs_len_freq = Counter(ood_seqs_len) fig = px.bar(x=list(ood_seqs_len_freq.keys()),y=list(ood_seqs_len_freq.values())) fig.show() #%% import plotly.graph_objects as go fig = go.Figure() for major_class in major_classes: len_dist = [len(ood_seqs[i]) for i, x in enumerate(ood_actual_major_class) if x == major_class] len_dist_freq = Counter(len_dist) fig.add_trace(go.Bar(x=list(len_dist_freq.keys()),y=list(len_dist_freq.values()),name=major_class)) #stack fig.update_layout(barmode='stack') #make transparent background fig.update_layout(plot_bgcolor='rgba(0,0,0,0)') #set y axis label to Count and x axis label to Length fig.update_layout(yaxis_title='Count',xaxis_title='Length') #set title fig.update_layout(title_text="Length distribution of false familiar sequences per major class") #save as svg fig.write_image('false_familiar_length_distribution_per_major_class_stacked.svg') fig.show() # %% #for each model, for each split, print Is Familiar? == True and print the number of sequences for model in all_df.Model.unique(): print('\n\n') model_df = all_df[all_df.Model == model] num_hicos = 0 total_samples = 0 for split in ['LC-familiar','LC-novel','LOCO','NA','Relaxed-miRNA']: split_df = model_df[model_df.split == split] #print('Model: %s, Split: %s, Familiar: %s, Number of Sequences: %s'%(model,split,len(split_df[split_df['Is Familiar?'] == True]),len(split_df))) #print model, split % print('%s %s %s'%(model,split,len(split_df[split_df['Is Familiar?'] == True])/len(split_df)*100)) if split != 'LC-novel': num_hicos+=len(split_df[split_df['Is Familiar?'] == True]) total_samples+=len(split_df) #print % of hicos print('%s %s %s'%(model,'HICO',num_hicos/total_samples*100)) print(total_samples) # %%