# Yak-hbdx's picture
# uploaded TransfoRNA repo
# 0b11a42 verified
# raw
# history blame
# 3.03 kB
#%%
from transforna import load
from transforna import Results_Handler
import pandas as pd
import plotly.graph_objects as go
import plotly.io as pio
# Build `hico_loco_df`: one row per annotation class found among the
# hico-flagged entries of `ad.var`, counting how many hico in-distribution
# labels, hico OOD labels and non-hico annotations contain that class.
path = '/media/ftp_share/hbdx/analysis/tcga/TransfoRNA_I_ID_V4/sub_class/Seq/embedds/'
results:Results_Handler = Results_Handler(path=path,splits=['train','valid','test','ood'],read_ad=True)
# Lookup used below to translate the 'Labels' column into annotations.
mapping_dict = load('/media/ftp_share/hbdx/annotation/feature_annotation/ANNOTATION/HBDxBase_annotation/TransfoRNA/compare_binning_strategies/v02/subclass_to_annotation.json')
# Map 'artificial_affix' onto itself so that label survives the mapping step.
mapping_dict['artificial_affix'] = 'artificial_affix'

ad = results.ad
# Computed once and reused for every split and every class below.
# `== True` / `!= True` are kept as explicit comparisons (rather than `~mask`)
# so entries that are neither True nor False land on the non-hico side,
# exactly as in the original comparisons.
is_hico = ad.var['hico'] == True
hico_sequences = ad.var.index[is_hico]
class_annotation = ad.var['small_RNA_class_annotation']
loco_annotations = class_annotation[ad.var['hico'] != True].values

train_df = results.splits_df_dict['train_df']
valid_df = results.splits_df_dict['valid_df']
test_df = results.splits_df_dict['test_df']
ood_df = results.splits_df_dict['ood_df']
# Keep only rows whose sequence is hico-flagged in ad.var.
# NOTE(review): the ['0'] / ('Labels','0') lookups presumably rely on a
# two-level column index in the split frames - confirm against Results_Handler.
train_df = train_df[train_df['RNA Sequences'].isin(hico_sequences)['0'].values]
valid_df = valid_df[valid_df['RNA Sequences'].isin(hico_sequences)['0'].values]
test_df = test_df[test_df['RNA Sequences'].isin(hico_sequences)['0'].values]
ood_df = ood_df[ood_df['RNA Sequences'].isin(hico_sequences)['0'].values]

# Concatenate train, valid and test. DataFrame.append was removed in
# pandas 2.0; pd.concat is the supported equivalent and keeps the row order.
train_val_test_df = pd.concat([train_df, valid_df, test_df])
# Map labels to annotations.
hico_id_labels = train_val_test_df['Labels','0'].map(mapping_dict).values
hico_ood_labels = ood_df['Labels','0'].map(mapping_dict).values

# Collect all rows first and build the frame once, instead of growing it row
# by row. `mc in label` is Python containment, not equality, as before.
hico_loco_records = [
    {
        'mc': mc,
        'hico_id': sum(mc in label for label in hico_id_labels),
        'hico_ood': sum(mc in label for label in hico_ood_labels),
        'loco': sum(mc in annotation for annotation in loco_annotations),
    }
    for mc in class_annotation[is_hico].unique()
]
# Passing `columns` keeps the four expected columns even with no records.
hico_loco_df = pd.DataFrame(hico_loco_records, columns=['mc','hico_id','hico_ood','loco'])
# %%
# Grouped, log-scaled bar chart of the per-class counts held in hico_loco_df,
# written to disk as PNG and SVG and then displayed.

# Category order applied to the x axis.
order = ['rRNA','tRNA','snoRNA','protein_coding','snRNA','miRNA','miscRNA','lncRNA','piRNA','YRNA','vtRNA']

# (column of hico_loco_df, legend entry, bar colour), in drawing order.
bar_specs = (
    ('hico_id', 'HICO ID', '#00CC96'),
    ('hico_ood', 'HICO OOD', 'darkcyan'),
    ('loco', 'LOCO', '#7f7f7f'),
)

fig = go.Figure()
for count_column, legend_name, bar_colour in bar_specs:
    fig.add_trace(
        go.Bar(
            x=hico_loco_df['mc'],
            y=hico_loco_df[count_column],
            name=legend_name,
            marker_color=bar_colour,
        )
    )

# All layout tweaks gathered in one place and applied in a single call.
layout_settings = {
    'barmode': 'group',
    'width': 800,
    'height': 800,
    # order the x axis
    'xaxis': {'categoryorder': 'array', 'categoryarray': order},
    'xaxis_title': 'Major Class',
    'yaxis_title': 'Number of Sequences',
    'title': 'Number of Sequences per Major Class in ID, OOD and LOCO',
    'yaxis_type': "log",
}
fig.update_layout(**layout_settings)

# save as png, then as svg
for image_file in ('hico_id_ood_loco_proportion.png', 'hico_id_ood_loco_proportion.svg'):
    pio.write_image(fig, image_file)
fig.show()
# %%