|
|
|
import numpy as np |
|
import pandas as pd |
|
|
|
import transforna |
|
from transforna import IDModelAugmenter, load |
|
|
|
|
|
# --- Model setup and inference ------------------------------------------------
# Name of the model variant; it selects the sub-directory holding the
# hyper-parameter settings and is reused later for the similarity query.
model_name = 'Seq'

# Hyper-parameter file of the chosen model variant.
config_path = f'/nfs/home/yat_ldap/VS_Projects/TransfoRNA-Framework/models/tcga/TransfoRNA_ID/sub_class/{model_name}/meta/hp_settings.yaml'
config = load(config_path)

# No dataframe is handed to the augmenter; it is built from the config alone.
model_augmenter = IDModelAugmenter(
    df=None,
    config=config,
)

# Prediction table. The rest of this script reads its 'Sequence',
# 'Net-Label' and 'Is Familiar?' columns.
df = model_augmenter.predict_transforna_na()
|
# --- Annotation table ---------------------------------------------------------
# Load the TCGA annotation table and key it by sequence so rows can be looked
# up with the sequences found in the prediction table.
tcga_df = load('/media/ftp_share/hbdx/data_for_upload/TransfoRNA/data/TCGA__ngs__miRNA_log2RPM-24.04.0__var.csv')
tcga_df = tcga_df.set_index('sequence')

# Rows flagged `hico` keep their annotated sub-class as label; every other row
# gets a missing label.
is_hico = tcga_df['hico'].eq(True)
tcga_df['Labels'] = tcga_df.loc[is_hico, 'subclass_name']
tcga_df['Labels'] = tcga_df['Labels'].astype('category')

# Overwrite the labels of the predicted sequences with the model's 'Net-Label'.
# NOTE(review): 'Labels' is categorical here, so this presumably relies on every
# predicted label already being one of the existing categories - confirm.
predicted_sequences = df['Sequence'].values
predicted_labels = df['Net-Label'].values
tcga_df.loc[predicted_sequences, 'Labels'] = predicted_labels
|
|
|
# --- Agreement between predictions and annotations ----------------------------
# One column per ';'-separated entry of `subclass_name`, restricted to the
# sequences present in the prediction table.
split_annotations = tcga_df['subclass_name'].str.split(';', expand=True)
loco_labels_df = split_annotations.loc[df['Sequence']]

# Discard sequences whose first annotation entry is 'no_annotation'.
has_annotation = (loco_labels_df[0] != 'no_annotation').values
loco_labels_df = loco_labels_df.iloc[has_annotation]

# 'Is Familiar?' flag of the remaining sequences, indexed by sequence.
in_loco = df['Sequence'].isin(loco_labels_df.index)
novelty_prediction_loco_df = df[in_loco].set_index('Sequence')['Is Familiar?']

# Label per remaining sequence, taken from the 'Labels' column.
id_predictions_df = tcga_df.loc[loco_labels_df.index, 'Labels']

# Repeat the label once per annotation column so both frames have the same
# shape and column labels and can be compared cell by cell.
n_annotation_cols = loco_labels_df.shape[1]
id_predictions_df = pd.concat([id_predictions_df] * n_annotation_cols, axis=1)
id_predictions_df.columns = np.arange(n_annotation_cols)
equ_mask = loco_labels_df == id_predictions_df

# A sequence counts as a hit when the label equals any of its annotations.
row_has_match = equ_mask.any(axis=1)
num_true = row_has_match.sum()
print('percentage of all loco RNAs: ', num_true / len(equ_mask))
|
|
|
|
|
|
|
# --- Agreement split by the 'Is Familiar?' flag -------------------------------
# Boolean masks over the annotated sequences (indexed by sequence).
is_familiar = novelty_prediction_loco_df
is_novel = ~novelty_prediction_loco_df

# Annotation columns for each group.
fam_loco_labels_df = loco_labels_df[is_familiar]
novel_loco_labels__df = loco_labels_df[is_novel]

# Matching label frames for each group.
id_predictions_fam_df = id_predictions_df[is_familiar]
id_predictions_novel_df = id_predictions_df[is_novel]

# Number of sequences per group whose label equals any of their annotations.
fam_matches = fam_loco_labels_df == id_predictions_fam_df
novel_matches = novel_loco_labels__df == id_predictions_novel_df
num_true_fam = fam_matches.any(axis=1).sum()
num_true_novel = novel_matches.any(axis=1).sum()

# Report the matching fraction within each group.
print('percentage of similar predictions in familiar: ', num_true_fam / len(fam_loco_labels_df))
print('percentage of similar predictions not in novel: ', num_true_novel / len(novel_loco_labels__df))
print('')
|
|
|
|
|
# --- Familiar sequences whose label matches none of their annotations ---------
# `equ_mask` covers every annotated sequence, while the `fam_*` frames only hold
# the familiar subset. Restrict the mask to the familiar index explicitly
# instead of relying on pandas' implicit reindexing of a mismatched boolean
# Series (which emits "Boolean Series key will be reindexed to match DataFrame
# index").
fam_no_overlap_mask = ~equ_mask.loc[fam_loco_labels_df.index].any(axis=1)

fam_loco_labels_no_overlap_df = fam_loco_labels_df[fam_no_overlap_mask]
id_predictions_fam_no_overlap_df = id_predictions_fam_df[fam_no_overlap_mask]

# Collapse the annotation columns back into one ';'-joined string per sequence,
# skipping the missing cells left by the expanding split.
collapsed_loco_labels_df = fam_loco_labels_no_overlap_df.apply(
    lambda row: ';'.join(row.dropna().astype(str)),
    axis=1,
)

# Every column of the label frame holds the same value, so column 0 suffices.
predicted_fam_but_ann_novel_df = pd.concat(
    [collapsed_loco_labels_df, id_predictions_fam_no_overlap_df[0]],
    axis=1,
)
predicted_fam_but_ann_novel_df.columns = ['KBA_labels', 'predicted_label']
|
|
|
|
|
# --- Map sub-class labels through the subclass -> annotation table ------------
mapping_dict_path = '/media/ftp_share/hbdx/data_for_upload/TransfoRNA/data/subclass_to_annotation.json'
sc_to_mc_mapper_dict = load(mapping_dict_path)


def _map_label(label):
    """Return the mapped value of *label*, or *label* itself when unmapped."""
    return sc_to_mc_mapper_dict.get(label, label)


# The annotation column holds ';'-joined labels: map each entry on its own and
# join the results back together.
kba_label_lists = predicted_fam_but_ann_novel_df['KBA_labels'].str.split(';')
predicted_fam_but_ann_novel_df['KBA_labels_mc'] = kba_label_lists.apply(
    lambda labels: ';'.join([_map_label(label) for label in labels])
)

# The predicted column holds a single label per row.
predicted_fam_but_ann_novel_df['predicted_label_mc'] = (
    predicted_fam_but_ann_novel_df['predicted_label'].apply(_map_label)
)
|
|
|
|
|
from transforna import predict_transforna |
|
|
|
# --- Similarity query for the mismatching sequences ---------------------------
# Query the same model variant in similarity mode (single closest match) for
# every sequence kept in `predicted_fam_but_ann_novel_df`.
query_sequences = predicted_fam_but_ann_novel_df.index.tolist()
sim_df = predict_transforna(
    model=model_name,
    sequences=query_sequences,
    similarity_flag=True,
    n_sim=1,
    trained_on='id',
    path_to_models='/nfs/home/yat_ldap/VS_Projects/TransfoRNA-Framework/models/tcga/',
)
sim_df = sim_df.set_index('Sequence')

# Append the similarity columns (all except 'Labels') side by side, aligned on
# the sequence index.
similarity_columns = sim_df.drop('Labels', axis=1)
predicted_fam_but_ann_novel_df = pd.concat(
    [predicted_fam_but_ann_novel_df, similarity_columns],
    axis=1,
)
|
|
|
|
|
# --- Plots --------------------------------------------------------------------
# Bar chart: number of sequences per mapped predicted label.
predicted_mc_counts = predicted_fam_but_ann_novel_df['predicted_label_mc'].value_counts()
predicted_mc_counts.plot(kind='bar')

# Labels in bar order, i.e. as returned by value_counts(). Kept for
# compatibility with any later use; it is no longer applied to the box plot.
x_labels = predicted_mc_counts.index.tolist()

# Box plot of the 'NLD' column per mapped predicted label. The returned object
# is used as a matplotlib Axes below.
fig = predicted_fam_but_ann_novel_df.boxplot(
    column='NLD',
    by='predicted_label_mc',
    figsize=(20, 10),
    rot=90,
    showfliers=False,
)

# The box plot already labels each box with its own group key, in group order.
# Overwriting those tick labels with `x_labels` (value_counts order) attached
# the wrong label to the boxes whenever the two orders differ, so the tick
# labels are left as pandas set them.

fig.set_xlabel('Predicted Label', fontsize=20)
fig.set_ylabel('Levenstein Distance', fontsize=20)
fig.tick_params(axis='both', which='major', labelsize=20)
|
|
|
# Lift the row limit so dataframes are displayed in full.
pd.options.display.max_rows = None
|
|
|
|