File size: 8,108 Bytes
0b11a42 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 |
#%%
#read all files ending with lev_dist_df.csv in bin/lc_files/
import pandas as pd
import glob
from plotly import graph_objects as go
from transforna import load,predict_transforna
# Gather every per-split Levenshtein-distance table into one DataFrame.
files = glob.glob('/nfs/home/yat_ldap/VS_Projects/TransfoRNA-Framework/transforna/bin/lc_files/*lev_dist_df.csv')
if not files:
    # Fail early with a readable message; otherwise the drop() below raises
    # an opaque KeyError on an empty frame.
    raise FileNotFoundError('no *lev_dist_df.csv files found in bin/lc_files/')
# Read all files first and concatenate once: growing the frame with
# pd.concat inside a loop re-copies the accumulated rows on every iteration.
# The per-file row index is kept as-is (no ignore_index), matching the
# previous behaviour.
all_df = pd.concat([pd.read_csv(file) for file in files])
# NOTE(review): 'Unnamed: 0' is presumably the index column written when the
# CSVs were saved -- confirm against the script that produces them.
all_df = all_df.drop(columns=['Unnamed: 0'])
# pandas.read_csv treats the literal string "NA" as a missing value by
# default, so the split that is actually called 'NA' comes back as NaN.
# Restore the label here.
all_df.loc[all_df['split'].isnull(), 'split'] = 'NA'
#%%
# Input for the box plot: rows of the Ensemble model only, with the
# 'other_affixes' split left out.
ensemble_rows = (all_df['Model'] == 'Ensemble') & (all_df['split'] != 'other_affixes')
ensemble_df = all_df[ensemble_rows].reset_index(drop=True)
# Map raw split names onto the names used by the plot's `order` list.
split_display_names = {
    '5\'A-affixes': 'Putative 5’-adapter prefixes',
    'Fused': 'Recombined',
    'Relaxed-miRNA': 'Isomirs',
}
ensemble_df['split'] = ensemble_df['split'].replace(split_display_names)
#%%
#plot the boxplot using seaborn
import seaborn as sns
import matplotlib.pyplot as plt
# Configure seaborn in a single call. `sns.set` is an alias of
# `sns.set_theme`, and every call re-applies ALL of its defaults
# (style="darkgrid", font_scale=1, ...). Issuing three separate calls
# therefore discarded the requested "whitegrid" style.
sns.set_theme(style="whitegrid", font_scale=1.5, rc={'figure.figsize': (15, 10)})
# Left-to-right order of the boxes on the x axis.
order = ['LC-familiar','LC-novel','Random','Putative 5’-adapter prefixes','Recombined','NA','LOCO','Isomirs']
ax = sns.boxplot(x="split", y="NLD", data=ensemble_df, palette="Set3", order=order, showfliers=True)
# Mean novelty threshold of the plotted rows: drawn as a dashed green line
# across the full width and labelled with an arrow just below it.
nld_threshold = ensemble_df['Novelty Threshold'].mean()
ax.axhline(y=nld_threshold, color='g', linestyle='--', xmin=0, xmax=1)
ax.annotate('NLD threshold', xy=(1.5, nld_threshold), xytext=(1.5, nld_threshold - 0.07), arrowprops=dict(facecolor='black', shrink=0.05))
# Display labels for the ticks, listed in the same order as `order`.
ax.set_xticklabels(['LC-Familiar','LC-Novel','Random','5’-adapter artefacts','Recombined','NA','LOCO','IsomiRs'])
# Transparent axes background.
ax.set_facecolor('None')
plt.title('Levenshtein Distance Distribution per Split on LC')
ax.set(xlabel='Split', ylabel='Normalized Levenshtein Distance')
# Legend placed outside the axes (top right), with a fully transparent frame.
plt.legend(loc='upper left', bbox_to_anchor=(1.05, 1), borderaxespad=0.,facecolor=None,framealpha=0.0)
# First export is taken BEFORE the tick labels are rotated.
# NOTE(review): the file name says "no_out" but the plot is drawn with
# showfliers=True -- confirm which is intended.
plt.savefig('/nfs/home/yat_ldap/VS_Projects/TransfoRNA-Framework/transforna/bin/lc_figures/lev_dist_no_out_boxplot.svg',dpi=400)
# Tilt the x axis labels, then export the final figure as SVG and PNG.
plt.xticks(rotation=-22.5)
plt.savefig('/nfs/home/yat_ldap/VS_Projects/TransfoRNA-Framework/transforna/bin/lc_figures/lev_dist_seaboarn_boxplot.svg',dpi=1000)
plt.savefig('/nfs/home/yat_ldap/VS_Projects/TransfoRNA-Framework/transforna/bin/lc_figures/lev_dist_seaboarn_boxplot.png',dpi=1000)
#%%
# Recover the fill colour seaborn gave to the last box (Isomirs) so the
# plotly bar chart further down can reuse it.
bars = list(ax.get_children())
colors = []
# The last child is skipped, as before.
# NOTE(review): presumably that is the axes background patch -- confirm.
for child in bars[:-1]:
    try:
        colors.append(child.get_facecolor())
    except AttributeError:
        # Artists without a get_facecolor method have no fill colour to
        # record. Only this case is skipped; other errors now surface.
        continue
# NOTE(review): this relies on the first len(order) coloured children being
# the boxes, in `order` order, so that index len(order)-1 is the Isomirs
# box -- confirm for the seaborn/matplotlib versions in use.
red, green, blue, alpha = colors[len(order) - 1]
# Build a well-formed CSS-style colour for plotly: the channels are scaled
# from 0-1 to 0-255, while alpha stays in 0-1. The earlier string was a
# four-component 'rgb(...)' whose alpha had also been multiplied by 255.
isomir_color = 'rgba(%d,%d,%d,%s)' % (round(255 * red), round(255 * green), round(255 * blue), alpha)
#%%
# Per model: the percentage of Relaxed-miRNA sequences whose NLD is ABOVE
# the model's novelty threshold (the figure title calls these "novel").
relaxed_mirna_df = all_df[all_df.split == 'Relaxed-miRNA']
models = relaxed_mirna_df.Model.unique()
percentage_dict = {}
for model in models:
    model_df = relaxed_mirna_df[relaxed_mirna_df.Model == model]
    above_threshold = model_df['NLD'] > model_df['Novelty Threshold']
    percentage_dict[model] = len(model_df[above_threshold]) / len(model_df) * 100

# One bar per model, in a fixed display order, all in the Isomirs box colour.
fig = go.Figure()
for model in ['Baseline','Seq','Seq-Seq','Seq-Struct','Seq-Rev','Ensemble']:
    novel_pct = percentage_dict[model]
    fig.add_trace(go.Bar(x=[model], y=[novel_pct], name=model, marker_color=isomir_color))
    # Percentage label placed slightly above the bar.
    fig.add_annotation(x=model, y=novel_pct + 2, text='%s%%' % (round(novel_pct, 2)), showarrow=False)
# Enlarge all annotation text at once.
fig.update_annotations(dict(font_size=13))
# Title, tilted tick labels, no legend, transparent plot background.
fig.update_layout(
    title='Percentage of Isomirs considered novel per model',
    xaxis_tickangle=+22.5,
    showlegend=False,
    plot_bgcolor='rgba(0,0,0,0)',
)
fig.update_yaxes(title_text='Percentage of Novel Sequences')
fig.show()
#save svg
#fig.write_image('relaxed_mirna_novel_perc_lc_barplot.svg')
#%%
# Inputs for exploring the "false familiar" sequences of the LC-novel set.
# NOTE(review): this CSV is read from .../VS_Projects/TransfoRNA/..., while
# the tables at the top of the script come from
# .../VS_Projects/TransfoRNA-Framework/transforna/... -- confirm that the
# different project root is intended.
ood_df = pd.read_csv(
    '/nfs/home/yat_ldap/VS_Projects/TransfoRNA/bin/lc_files/'
    'LC-novel_lev_dist_df.csv'
)
# Lookup used below to turn a sub-class label into its major class.
mapping_dict_path = (
    '/media/ftp_share/hbdx/annotation/feature_annotation/ANNOTATION/'
    'HBDxBase_annotation/TransfoRNA/compare_binning_strategies/v02/'
    'subclass_to_annotation.json'
)
mapping_dict = load(mapping_dict_path)
# Annotated LC dataset; its `.var` table is indexed by sequence further down.
# NOTE(review): presumably an AnnData object, given the .h5ad file -- confirm.
LC_path = (
    '/media/ftp_share/hbdx/annotation/feature_annotation/ANNOTATION/'
    'HBDxBase_annotation/TransfoRNA/compare_binning_strategies/v02/'
    'LC__ngs__DI_HB_GEL-23.1.2.h5ad'
)
ad = load(LC_path)
#%%
from collections import Counter

from transforna import correct_labels

# Select the sequences that the chosen model flagged as familiar, then keep
# only those whose predicted label disagrees with the annotated label.
model = 'Ensemble'
# Build the row filter once, combining the conditions with `&`. The earlier
# code multiplied two boolean arrays and evaluated the filter twice.
familiar_mask = (ood_df['Model'] == model) & (ood_df['Is Familiar?'] == True)
familiar_df = ood_df[familiar_mask]
ood_seqs = familiar_df['Sequence'].tolist()
ood_predicted_labels = familiar_df['Labels'].tolist()
# Annotated sub-class of each selected sequence, looked up by sequence.
ood_actual_labels = ad.var.loc[ood_seqs]['subclass_name'].values.tolist()
# NOTE(review): correct_labels is an opaque transforna helper; presumably it
# reconciles predicted labels with the annotation vocabulary -- confirm.
ood_predicted_labels = correct_labels(ood_predicted_labels, ood_actual_labels, mapping_dict)
# Despite its historical name, `correct_indices` holds the positions where
# prediction and annotation DIFFER. The name is kept so that any other code
# referring to it keeps working.
correct_indices = [
    i
    for i, (predicted, actual) in enumerate(zip(ood_predicted_labels, ood_actual_labels))
    if predicted != actual
]
# Restrict all three parallel lists to those mismatching positions.
ood_seqs = [ood_seqs[i] for i in correct_indices]
ood_predicted_labels = [ood_predicted_labels[i] for i in correct_indices]
ood_actual_labels = [ood_actual_labels[i] for i in correct_indices]
# Major class of every label; labels missing from the mapping become the
# string 'None'.
ood_actual_major_class = [mapping_dict.get(label, 'None') for label in ood_actual_labels]
ood_predicted_major_class = [mapping_dict.get(label, 'None') for label in ood_predicted_labels]
# Frequency of each major class among actual and predicted labels.
ood_actual_major_class_freq = Counter(ood_actual_major_class)
ood_predicted_major_class_freq = Counter(ood_predicted_major_class)
# %%
import plotly.express as px
# Major classes present among the annotated labels (used by the next cell).
major_classes = list(ood_actual_major_class_freq)
# Bar chart of how many of the selected sequences have each length.
ood_seqs_len = list(map(len, ood_seqs))
ood_seqs_len_freq = Counter(ood_seqs_len)
fig = px.bar(
    x=list(ood_seqs_len_freq),
    y=list(ood_seqs_len_freq.values()),
)
fig.show()
#%%
import plotly.graph_objects as go
# Stacked bar chart: sequence-length counts, one trace per annotated major class.
fig = go.Figure()
for major_class in major_classes:
    # Lengths of the sequences whose annotated major class is this one.
    len_dist_freq = Counter(
        len(seq)
        for seq, seq_class in zip(ood_seqs, ood_actual_major_class)
        if seq_class == major_class
    )
    fig.add_trace(go.Bar(x=list(len_dist_freq.keys()), y=list(len_dist_freq.values()), name=major_class))
# Stack the traces, use a transparent background, and set axis labels and title.
fig.update_layout(
    barmode='stack',
    plot_bgcolor='rgba(0,0,0,0)',
    yaxis_title='Count',
    xaxis_title='Length',
    title_text="Length distribution of false familiar sequences per major class",
)
# Write the SVG to the current working directory, then display the figure.
fig.write_image('false_familiar_length_distribution_per_major_class_stacked.svg')
fig.show()
# %%
# For every model, print the percentage of sequences flagged as familiar in
# each split. Then print the pooled percentage over all listed splits except
# 'LC-novel' (labelled 'HICO'), and the pooled number of sequences.
# NOTE(review): the source lost its indentation; the nesting below (both
# counters updated only for non-'LC-novel' splits, summary printed once per
# model) is a reconstruction -- confirm against the original script.
for model in all_df.Model.unique():
    print('\n\n')
    model_df = all_df[all_df.Model == model]
    num_hicos = 0
    total_samples = 0
    for split in ['LC-familiar','LC-novel','LOCO','NA','Relaxed-miRNA']:
        split_df = model_df[model_df.split == split]
        num_familiar = len(split_df[split_df['Is Familiar?'] == True])
        num_sequences = len(split_df)
        # A model may have no rows for a split. Report NaN in that case
        # rather than aborting the whole report with ZeroDivisionError.
        if num_sequences:
            familiar_pct = num_familiar / num_sequences * 100
        else:
            familiar_pct = float('nan')
        print('%s %s %s'%(model,split,familiar_pct))
        if split != 'LC-novel':
            num_hicos += num_familiar
            total_samples += num_sequences
    # Pooled percentage over the non-'LC-novel' splits, with the same
    # empty-data guard as above.
    hico_pct = num_hicos / total_samples * 100 if total_samples else float('nan')
    print('%s %s %s'%(model,'HICO',hico_pct))
    print(total_samples)
# %%
|