|
|
|
from transforna import load |
|
from transforna import Results_Handler |
|
import pandas as pd |
|
import plotly.graph_objects as go |
|
import plotly.io as pio |
|
path = '/media/ftp_share/hbdx/analysis/tcga/TransfoRNA_I_ID_V4/sub_class/Seq/embedds/' |
|
results:Results_Handler = Results_Handler(path=path,splits=['train','valid','test','ood'],read_ad=True) |
|
|
|
mapping_dict = load('/media/ftp_share/hbdx/annotation/feature_annotation/ANNOTATION/HBDxBase_annotation/TransfoRNA/compare_binning_strategies/v02/subclass_to_annotation.json') |
|
mapping_dict['artificial_affix'] = 'artificial_affix' |
|
train_df = results.splits_df_dict['train_df'] |
|
valid_df = results.splits_df_dict['valid_df'] |
|
test_df = results.splits_df_dict['test_df'] |
|
ood_df = results.splits_df_dict['ood_df'] |
|
|
|
train_df = train_df[train_df['RNA Sequences'].isin(results.ad.var[results.ad.var['hico'] == True].index)['0'].values] |
|
valid_df = valid_df[valid_df['RNA Sequences'].isin(results.ad.var.index[results.ad.var['hico'] == True])['0'].values] |
|
test_df = test_df[test_df['RNA Sequences'].isin(results.ad.var.index[results.ad.var['hico'] == True])['0'].values] |
|
ood_df = ood_df[ood_df['RNA Sequences'].isin(results.ad.var.index[results.ad.var['hico'] == True])['0'].values] |
|
|
|
train_val_test_df = train_df.append(valid_df).append(test_df) |
|
|
|
hico_id_labels = train_val_test_df['Labels','0'].map(mapping_dict).values |
|
hico_ood_labels = ood_df['Labels','0'].map(mapping_dict).values |
|
|
|
ad = results.ad |
|
|
|
hico_loco_df = pd.DataFrame(columns=['mc','hico_id','hico_ood','loco']) |
|
for mc in ad.var['small_RNA_class_annotation'][ad.var['hico'] == True].unique(): |
|
hico_loco_df = hico_loco_df.append({'mc':mc, |
|
'hico_id':sum([mc in i for i in hico_id_labels]), |
|
'hico_ood':sum([mc in i for i in hico_ood_labels]), |
|
'loco':sum([mc in i for i in ad.var['small_RNA_class_annotation'][ad.var['hico'] != True].values])},ignore_index=True) |
|
|
|
|
|
|
|
order = ['rRNA','tRNA','snoRNA','protein_coding','snRNA','miRNA','miscRNA','lncRNA','piRNA','YRNA','vtRNA'] |
|
|
|
fig = go.Figure() |
|
fig.add_trace(go.Bar(x=hico_loco_df['mc'],y=hico_loco_df['hico_id'],name='HICO ID',marker_color='#00CC96')) |
|
fig.add_trace(go.Bar(x=hico_loco_df['mc'],y=hico_loco_df['hico_ood'],name='HICO OOD',marker_color='darkcyan')) |
|
fig.add_trace(go.Bar(x=hico_loco_df['mc'],y=hico_loco_df['loco'],name='LOCO',marker_color='#7f7f7f')) |
|
fig.update_layout(barmode='group') |
|
fig.update_layout(width=800,height=800) |
|
|
|
fig.update_layout(xaxis={'categoryorder':'array','categoryarray':order}) |
|
fig.update_layout(xaxis_title='Major Class',yaxis_title='Number of Sequences') |
|
fig.update_layout(title='Number of Sequences per Major Class in ID, OOD and LOCO') |
|
fig.update_layout(yaxis_type="log") |
|
|
|
pio.write_image(fig,'hico_id_ood_loco_proportion.png') |
|
|
|
pio.write_image(fig,'hico_id_ood_loco_proportion.svg') |
|
fig.show() |
|
|
|
|