Yak-hbdx's picture
uploaded TransfoRNA repo
0b11a42 verified
raw
history blame
5.29 kB
#%%
import numpy as np
import pandas as pd
import transforna
from transforna import IDModelAugmenter, load
#%%
# Model variant whose settings and predictions are analysed in this script.
model_name = 'Seq'
# Hyper-parameter settings file stored under the model's meta directory.
config_path = f'/nfs/home/yat_ldap/VS_Projects/TransfoRNA-Framework/models/tcga/TransfoRNA_ID/sub_class/{model_name}/meta/hp_settings.yaml'
config = load(config_path)
# The augmenter is built without an input dataframe; the returned df is read
# further down through its 'Sequence', 'Net-Label' and 'Is Familiar?' columns.
# NOTE(review): presumably these are predictions for un-annotated sequences
# (method name ends in "_na") -- confirm against the transforna package.
model_augmenter = IDModelAugmenter(config=config, df=None)
df = model_augmenter.predict_transforna_na()
# TCGA table, keyed by sequence from here on.
tcga_df = load('/media/ftp_share/hbdx/data_for_upload/TransfoRNA/data/TCGA__ngs__miRNA_log2RPM-24.04.0__var.csv').set_index('sequence')
# 'Labels' carries subclass_name only for rows flagged hico; index alignment
# leaves every other row as NaN. The column is then stored as a categorical.
tcga_df['Labels'] = tcga_df.loc[tcga_df['hico'] == True, 'subclass_name']
tcga_df['Labels'] = tcga_df['Labels'].astype('category')
#%%
# Overwrite the label of every predicted sequence with the model's 'Net-Label'.
tcga_df.loc[df['Sequence'].values, 'Labels'] = df['Net-Label'].values
# One column per ';'-separated entry of subclass_name, restricted to (and
# ordered like) the predicted sequences.
loco_labels_df = tcga_df['subclass_name'].str.split(';', expand=True)
loco_labels_df = loco_labels_df.loc[df['Sequence']]
# Keep only rows whose first annotation column is not 'no_annotation'
# (positional boolean selection).
loco_labels_df = loco_labels_df.iloc[(loco_labels_df[0] != 'no_annotation').values]
#%%
# 'Is Familiar?' flag per sequence, limited to the sequences that are still
# present in the index of loco_labels_df.
novelty_prediction_loco_df = df.set_index('Sequence')['Is Familiar?']
novelty_prediction_loco_df = novelty_prediction_loco_df[novelty_prediction_loco_df.index.isin(loco_labels_df.index)]
#%%
# Label stored in tcga_df for every sequence kept in loco_labels_df.
id_predictions_df = tcga_df.loc[loco_labels_df.index, 'Labels']
# Repeat that label once per annotation column and give the copies the column
# labels 0..n-1, so the frame can be compared element-wise with loco_labels_df.
id_predictions_df = pd.concat(
    [id_predictions_df for _ in range(loco_labels_df.shape[1])],
    axis=1,
)
id_predictions_df.columns = np.arange(loco_labels_df.shape[1])
# True wherever the label equals the annotation in that column.
equ_mask = loco_labels_df == id_predictions_df
# Rows with at least one matching annotation; the printed value is the
# fraction of such rows (num_true / number of rows).
num_true = equ_mask.any(axis=1).sum()
print('percentage of all loco RNAs: ', num_true / len(equ_mask))
# Split annotations and labels with the 'Is Familiar?' flag:
# flag set -> familiar, flag not set -> novel.
fam_loco_labels_df, novel_loco_labels__df = (
    loco_labels_df[novelty_prediction_loco_df],
    loco_labels_df[~novelty_prediction_loco_df],
)
id_predictions_fam_df, id_predictions_novel_df = (
    id_predictions_df[novelty_prediction_loco_df],
    id_predictions_df[~novelty_prediction_loco_df],
)
#%%
# For each subset, count the rows whose label equals at least one annotation
# and print that count divided by the subset's number of rows.
num_true_fam = (fam_loco_labels_df == id_predictions_fam_df).any(axis='columns').sum()
num_true_novel = (novel_loco_labels__df == id_predictions_novel_df).any(axis='columns').sum()
print('percentage of similar predictions in familiar: ', num_true_fam / len(fam_loco_labels_df))
print('percentage of similar predictions not in novel: ', num_true_novel / len(novel_loco_labels__df))
print('')
# %%
# Keep the familiar rows whose label matches none of their annotations.
# The mask is built from the familiar-only frames so that its index is exactly
# theirs; a mask computed over all rows (equ_mask) has a different index and
# only works because pandas reindexes the boolean key, which raises a
# UserWarning. The mask is also computed once instead of twice.
fam_no_overlap_mask = ~(fam_loco_labels_df == id_predictions_fam_df).any(axis=1)
fam_loco_labels_no_overlap_df = fam_loco_labels_df[fam_no_overlap_mask]
id_predictions_fam_no_overlap_df = id_predictions_fam_df[fam_no_overlap_mask]
# Collapse each row's non-missing annotations back into one ';'-separated string.
collapsed_loco_labels_df = fam_loco_labels_no_overlap_df.apply(lambda x: ';'.join(x.dropna().astype(str)),axis=1)
# Put the collapsed annotations next to the label; every column of
# id_predictions_fam_no_overlap_df holds the same label, so column 0 is used.
predicted_fam_but_ann_novel_df = pd.concat([collapsed_loco_labels_df,id_predictions_fam_no_overlap_df[0]],axis=1)
# Rename the two columns.
predicted_fam_but_ann_novel_df.columns = ['KBA_labels','predicted_label']
# %%
# Translate the names in KBA_labels and predicted_label through the mapping
# loaded below; a name that is not a key of the mapping is kept unchanged.
mapping_dict_path = '/media/ftp_share/hbdx/data_for_upload/TransfoRNA/data/subclass_to_annotation.json'
sc_to_mc_mapper_dict = load(mapping_dict_path)
# KBA_labels holds several ';'-separated names: map each one, then re-join.
predicted_fam_but_ann_novel_df['KBA_labels_mc'] = predicted_fam_but_ann_novel_df['KBA_labels'].str.split(';').apply(
    lambda sub_classes: ';'.join([sc_to_mc_mapper_dict.get(sc, sc) for sc in sub_classes])
)
# predicted_label holds a single name per row.
predicted_fam_but_ann_novel_df['predicted_label_mc'] = predicted_fam_but_ann_novel_df['predicted_label'].apply(
    lambda sc: sc_to_mc_mapper_dict.get(sc, sc)
)
# %%
# Query the model for every sequence in predicted_fam_but_ann_novel_df with the
# similarity flag on and n_sim=1.
# NOTE(review): presumably this returns the single most similar sequence and
# its distance per input (the 'NLD' column is plotted later) -- confirm
# against the transforna package.
from transforna import predict_transforna
sim_df = predict_transforna(
    model=model_name,
    sequences=predicted_fam_but_ann_novel_df.index.tolist(),
    similarity_flag=True,
    n_sim=1,
    trained_on='id',
    path_to_models='/nfs/home/yat_ldap/VS_Projects/TransfoRNA-Framework/models/tcga/',
).set_index('Sequence')
# Append every sim_df column except 'Labels', aligned on the sequence index.
predicted_fam_but_ann_novel_df = pd.concat(
    [predicted_fam_but_ann_novel_df, sim_df.drop(columns='Labels')],
    axis=1,
)
# %%
# Bar chart of how often each value of predicted_label_mc occurs.
predicted_fam_but_ann_novel_df['predicted_label_mc'].value_counts().plot.bar()
# The same labels in value_counts order, kept for ordering the next plot's x axis.
x_labels = list(predicted_fam_but_ann_novel_df['predicted_label_mc'].value_counts().index)
# %%
# Box plot of the 'NLD' column per predicted_label_mc, with the boxes ordered
# like x_labels.
# DataFrame.boxplot(by=...) draws the groups in sorted key order, so merely
# overwriting the tick labels with x_labels would put the wrong name under the
# boxes. Instead the grouping column is turned into an ordered categorical whose
# categories follow x_labels, which reorders the boxes themselves; the tick
# labels then come out right without being overwritten.
nld_per_mc_df = predicted_fam_but_ann_novel_df[['NLD', 'predicted_label_mc']].copy()
nld_per_mc_df['predicted_label_mc'] = pd.Categorical(
    nld_per_mc_df['predicted_label_mc'],
    categories=x_labels,
    ordered=True,
).remove_unused_categories()  # labels without rows would yield empty groups
# NOTE: fig holds the object returned by boxplot and is used through
# axes-level methods below; the name is kept for any later code that reads it.
fig = nld_per_mc_df.boxplot(column='NLD',by='predicted_label_mc',figsize=(20,10),rot=90,showfliers=False)
# Increase the font size of the axis labels and ticks.
fig.set_xlabel('Predicted Label',fontsize=20)
fig.set_ylabel('Levenstein Distance',fontsize=20)
fig.tick_params(axis='both', which='major', labelsize=20)
# Let pandas display all rows of a dataframe.
pd.set_option('display.max_rows', None)
# %%