|
|
|
|
|
|
|
|
|
import glob

import pandas as pd
from plotly import graph_objects as go

from transforna import load, predict_transforna
|
# Collect every per-split Levenshtein-distance table into a single frame.
files = glob.glob('/nfs/home/yat_ldap/VS_Projects/TransfoRNA-Framework/transforna/bin/lc_files/*lev_dist_df.csv')
if not files:
    # Fail with an explicit message instead of an unrelated KeyError further down.
    raise FileNotFoundError('no *lev_dist_df.csv files found in the lc_files directory')

# Read everything first and concatenate once: growing a frame inside the loop
# re-copies all accumulated rows on every iteration, and seeding the
# concatenation with an empty DataFrame is deprecated in recent pandas.
all_df = pd.concat([pd.read_csv(file) for file in files])

# 'Unnamed: 0' is presumably the index column written when the CSVs were saved;
# files that lack it are tolerated rather than aborting the whole load.
all_df = all_df.drop(columns=['Unnamed: 0'], errors='ignore')

# Missing split values are given the label 'NA'. Presumably the split is
# literally called "NA" in the files and read_csv parsed it as missing
# (its default behaviour for that token) -- TODO confirm.
all_df.loc[all_df.split.isnull(), 'split'] = 'NA'
|
|
|
|
|
|
|
# Keep only the Ensemble model's rows, then discard the 'other_affixes' split.
is_ensemble = all_df.Model == 'Ensemble'
ensemble_df = all_df[is_ensemble].reset_index(drop=True)
not_other_affixes = ensemble_df.split != 'other_affixes'
ensemble_df = ensemble_df[not_other_affixes].reset_index(drop=True)

# Rename splits to the labels used when plotting; unlisted splits are left as-is.
split_display_names = {
    '5\'A-affixes': 'Putative 5’-adapter prefixes',
    'Fused': 'Recombined',
    'Relaxed-miRNA': 'Isomirs',
}
ensemble_df['split'] = ensemble_df['split'].replace(split_display_names)
|
|
|
|
|
import seaborn as sns
import matplotlib.pyplot as plt

# Configure the theme in ONE call. sns.set() is an alias of sns.set_theme(),
# and every call re-applies the defaults for the arguments it is not given, so
# the previous sequence set_theme(style="whitegrid") -> set(rc=...) ->
# set(font_scale=1.5) silently reverted the style to the default "darkgrid".
sns.set_theme(style="whitegrid", font_scale=1.5, rc={'figure.figsize': (15, 10)})

# Left-to-right order of the boxes; the tick labels set below follow this order.
order = ['LC-familiar','LC-novel','Random','Putative 5’-adapter prefixes','Recombined','NA','LOCO','Isomirs']
ax = sns.boxplot(x="split", y="NLD", data=ensemble_df, palette="Set3", order=order, showfliers=True)

# Draw the mean novelty threshold as a dashed line across the full axes width
# and point an arrow at it from slightly below.
nld_threshold = ensemble_df['Novelty Threshold'].mean()
ax.axhline(y=nld_threshold, color='g', linestyle='--', xmin=0, xmax=1)
ax.annotate(
    'NLD threshold',
    xy=(1.5, nld_threshold),
    xytext=(1.5, nld_threshold - 0.07),
    arrowprops=dict(facecolor='black', shrink=0.05),
)

# Display names for the ticks, positionally matched to `order`.
ax.set_xticklabels(['LC-Familiar','LC-Novel','Random','5’-adapter artefacts','Recombined','NA','LOCO','IsomiRs'])

# Transparent plotting area.
ax.set_facecolor('None')
plt.title('Levenshtein Distance Distribution per Split on LC')
ax.set(xlabel='Split', ylabel='Normalized Levenshtein Distance')

# NOTE(review): no artist above is given a label, so this legend is presumably
# empty -- confirm whether it is still needed.
plt.legend(loc='upper left', bbox_to_anchor=(1.05, 1), borderaxespad=0., facecolor=None, framealpha=0.0)

# NOTE(review): the file name says "no_out" but the plot is drawn with
# showfliers=True -- confirm which one is intended. This first file is written
# before the tick labels are rotated.
plt.savefig('/nfs/home/yat_ldap/VS_Projects/TransfoRNA-Framework/transforna/bin/lc_figures/lev_dist_no_out_boxplot.svg', dpi=400)

plt.xticks(rotation=-22.5)

plt.savefig('/nfs/home/yat_ldap/VS_Projects/TransfoRNA-Framework/transforna/bin/lc_figures/lev_dist_seaboarn_boxplot.svg', dpi=1000)
plt.savefig('/nfs/home/yat_ldap/VS_Projects/TransfoRNA-Framework/transforna/bin/lc_figures/lev_dist_seaboarn_boxplot.png', dpi=1000)
|
|
|
# Recover the fill colour of the 'Isomirs' box so the plotly figure can reuse it.
bars = list(ax.get_children())

# Collect the face colour of every child artist that has one. The last child is
# skipped (presumably the axes background patch -- TODO confirm). Artists
# without a face colour (e.g. lines) are filtered out explicitly instead of
# hiding every possible error behind a bare `except`.
colors = [c.get_facecolor() for c in bars[:-1] if hasattr(c, 'get_facecolor')]

# Assumes the first len(order) face-coloured children are the boxes, in the
# same order as `order`, which puts 'Isomirs' last -- verify if a split can be
# empty or if the seaborn/matplotlib version changes.
red, green, blue, alpha = colors[len(order) - 1]

# Matplotlib reports RGBA components in [0, 1]. A CSS-style colour string needs
# the colour channels in [0, 255] and alpha left in [0, 1], under the `rgba`
# prefix (the former string put four values, alpha scaled to 255, in `rgb(...)`).
isomir_color = 'rgba(%s,%s,%s,%s)' % (255 * red, 255 * green, 255 * blue, alpha)
|
|
|
|
|
# For each model: percentage of 'Relaxed-miRNA' rows whose NLD exceeds that
# row's novelty threshold.
relaxed_mirna_df = all_df[all_df.split == 'Relaxed-miRNA']
models = relaxed_mirna_df.Model.unique()

percentage_dict = {}
for model in models:
    model_df = relaxed_mirna_df[relaxed_mirna_df.Model == model]
    above_threshold = model_df['NLD'] > model_df['Novelty Threshold']
    novel_fraction = int(above_threshold.sum()) / len(model_df)
    percentage_dict[model] = novel_fraction * 100
|
|
|
# One bar per model showing the percentage computed in `percentage_dict`,
# all drawn in the colour stored in `isomir_color`.
fig = go.Figure()
for model in ['Baseline','Seq','Seq-Seq','Seq-Struct','Seq-Rev','Ensemble']:
    novel_pct = percentage_dict[model]
    fig.add_trace(go.Bar(x=[model], y=[novel_pct], name=model, marker_color=isomir_color))
    # Value label placed two units above the top of the bar.
    fig.add_annotation(x=model, y=novel_pct + 2, text='%s%%' % (round(novel_pct, 2)), showarrow=False)

fig.update_annotations(dict(font_size=13))

# Title, tick angle, hidden legend and transparent background in one update.
fig.update_layout(
    title='Percentage of Isomirs considered novel per model',
    xaxis_tickangle=+22.5,
    showlegend=False,
    plot_bgcolor='rgba(0,0,0,0)',
)
fig.update_yaxes(title_text='Percentage of Novel Sequences')

fig.show()
|
|
|
|
|
|
|
|
|
from transforna import correct_labels

# NOTE(review): this CSV is read from .../TransfoRNA/bin/lc_files/, whereas the
# tables globbed at the top of the script come from
# .../TransfoRNA-Framework/transforna/bin/lc_files/ -- confirm both are intended.
ood_df = pd.read_csv('/nfs/home/yat_ldap/VS_Projects/TransfoRNA/bin/lc_files/LC-novel_lev_dist_df.csv')

mapping_dict_path = '/media/ftp_share/hbdx/annotation/feature_annotation/ANNOTATION/HBDxBase_annotation/TransfoRNA/compare_binning_strategies/v02/subclass_to_annotation.json'
mapping_dict = load(mapping_dict_path)

LC_path = '/media/ftp_share/hbdx/annotation/feature_annotation/ANNOTATION/HBDxBase_annotation/TransfoRNA/compare_binning_strategies/v02/LC__ngs__DI_HB_GEL-23.1.2.h5ad'
ad = load(LC_path)

# Rows of the chosen model that were flagged as familiar.
model = 'Ensemble'
flagged_familiar = (ood_df.Model == model) & ood_df['Is Familiar?'].eq(True)
familiar_rows = ood_df[flagged_familiar]

ood_seqs = familiar_rows.Sequence.tolist()
ood_predicted_labels = familiar_rows.Labels.tolist()

# Annotated sub-class of each of those sequences, looked up by sequence in ad.var.
ood_actual_labels = ad.var.loc[ood_seqs]['subclass_name'].values.tolist()

# Post-process the predictions with transforna's correct_labels helper
# (its exact semantics are not visible in this file).
ood_predicted_labels = correct_labels(ood_predicted_labels, ood_actual_labels, mapping_dict)
|
|
|
|
|
from collections import Counter

# NOTE: despite its name, `correct_indices` holds the positions where the
# predicted label DIFFERS from the annotated one.
correct_indices = [
    idx
    for idx, predicted in enumerate(ood_predicted_labels)
    if predicted != ood_actual_labels[idx]
]

# Keep only those positions in all three parallel lists.
ood_seqs = [ood_seqs[idx] for idx in correct_indices]
ood_predicted_labels = [ood_predicted_labels[idx] for idx in correct_indices]
ood_actual_labels = [ood_actual_labels[idx] for idx in correct_indices]

# Translate labels through mapping_dict; labels it does not contain become the
# string 'None'.
ood_actual_major_class = [mapping_dict.get(label, 'None') for label in ood_actual_labels]
ood_predicted_major_class = [mapping_dict.get(label, 'None') for label in ood_predicted_labels]

# Number of sequences per major class, for annotated and predicted labels.
ood_actual_major_class_freq = Counter(ood_actual_major_class)
ood_predicted_major_class_freq = Counter(ood_predicted_major_class)
|
|
|
|
|
|
|
|
|
import plotly.express as px

# Major classes present among the annotated labels, in first-seen order.
major_classes = list(ood_actual_major_class_freq)

# Bar chart of how many of the retained sequences have each length.
ood_seqs_len = list(map(len, ood_seqs))
ood_seqs_len_freq = Counter(ood_seqs_len)
lengths = list(ood_seqs_len_freq.keys())
counts = list(ood_seqs_len_freq.values())
fig = px.bar(x=lengths, y=counts)
fig.show()
|
|
|
|
|
import plotly.graph_objects as go

# Stacked bars: sequence-length counts, one trace per annotated major class.
fig = go.Figure()
for major_class in major_classes:
    # Lengths of the sequences whose annotated major class is this one.
    len_dist = [
        len(seq)
        for seq, seq_class in zip(ood_seqs, ood_actual_major_class)
        if seq_class == major_class
    ]
    len_dist_freq = Counter(len_dist)
    fig.add_trace(
        go.Bar(
            x=list(len_dist_freq.keys()),
            y=list(len_dist_freq.values()),
            name=major_class,
        )
    )

# Stacking, transparent background, axis titles and figure title in one update.
fig.update_layout(
    barmode='stack',
    plot_bgcolor='rgba(0,0,0,0)',
    yaxis_title='Count',
    xaxis_title='Length',
    title_text="Length distribution of false familiar sequences per major class",
)

# Written relative to the current working directory.
fig.write_image('false_familiar_length_distribution_per_major_class_stacked.svg')
fig.show()
|
|
|
|
|
|
|
# Print, for every model, the percentage of rows flagged familiar in each split,
# followed by the pooled percentage over all listed splits except 'LC-novel'
# (printed under the tag 'HICO') and the number of rows pooled.
for model in all_df.Model.unique():
    print('\n\n')
    model_df = all_df[all_df.Model == model]
    num_hicos = 0
    total_samples = 0
    for split in ['LC-familiar','LC-novel','LOCO','NA','Relaxed-miRNA']:
        split_df = model_df[model_df.split == split]
        num_familiar = int(split_df['Is Familiar?'].eq(True).sum())
        num_rows = len(split_df)

        # A model may have no rows for a split; report nan instead of dying
        # with ZeroDivisionError halfway through the report.
        familiar_pct = num_familiar / num_rows * 100 if num_rows else float('nan')
        print('%s %s %s' % (model, split, familiar_pct))

        # NOTE(review): the source had lost its indentation; both counters are
        # assumed to sit under this condition so that the pooled ratio excludes
        # 'LC-novel' from numerator and denominator alike -- confirm.
        if split != 'LC-novel':
            num_hicos += num_familiar
            total_samples += num_rows

    # Same guard for a model with no rows in any of the pooled splits.
    hico_pct = num_hicos / total_samples * 100 if total_samples else float('nan')
    print('%s %s %s' % (model, 'HICO', hico_pct))
    print(total_samples)
|
|
|
|