{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# UMAP plots of TransfoRNA predictions for LC and TCGA sequences\n",
    "\n",
    "Predicts the combined LC (sequence length <= 30) and TCGA sequences with all TransfoRNA models, requests per-model UMAP coordinates and writes scatter plots coloured by major class and by secondary-structure ratio to `lc_figures/`."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "import os\n",
    "\n",
    "import numpy as np\n",
    "import plotly.express as px\n",
    "\n",
    "from transforna import load,predict_transforna_all_models,predict_transforna,fold_sequences"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Input locations: absolute paths on the original analysis machine, adjust when running elsewhere.\n",
    "models_path = '/nfs/home/yat_ldap/VS_Projects/TransfoRNA-Framework/models/tcga/'\n",
    "lc_path = '/media/ftp_share/hbdx/annotation/feature_annotation/ANNOTATION/HBDxBase_annotation/TransfoRNA/compare_binning_strategies/v05/2024-04-19__230126_LC_DI_HB_GEL_v23.01.00/sRNA_anno_aggregated_on_seq.csv'\n",
    "tcga_path = '/media/ftp_share/hbdx/data_for_upload/TransfoRNA/data/TCGA__ngs__miRNA_log2RPM-24.04.0__var.csv'\n",
    "mapping_dict_path = '/media/ftp_share/hbdx/data_for_upload/TransfoRNA//data/subclass_to_annotation.json'\n",
    "\n",
    "# Outputs written by this notebook.\n",
    "PREDICTIONS_PATH = 'predictions_lc_tcga.csv'\n",
    "FIGURES_DIR = 'lc_figures'\n",
    "# Set to True to recompute the predictions even if PREDICTIONS_PATH already exists.\n",
    "FORCE_PREDICT = False\n",
    "\n",
    "# One palette for every major-class plot so a class keeps its colour across figures.\n",
    "CLASS_PALETTE = px.colors.qualitative.Plotly + px.colors.qualitative.Light24\n",
    "# Model whose predicted labels define the class -> colour mapping.\n",
    "REFERENCE_MODEL = 'Seq'\n",
    "# Model whose UMAP is coloured by secondary-structure ratio.\n",
    "STRUCT_MODEL = 'Seq-Struct'\n",
    "\n",
    "# Axis window of the zoomed ('selected') plots.\n",
    "ZOOM_X_RANGE = [4.3,11]\n",
    "ZOOM_Y_RANGE = [-2.3,6.8]\n",
    "# Circle (in UMAP coordinates) used for the interactive close-up of the reference model.\n",
    "CIRCLE_CENTER = (7.9,2.5)\n",
    "CIRCLE_RADIUS = 4.3"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Load data"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "tcga_df = load(tcga_path)\n",
    "lc_df = load(lc_path)\n",
    "\n",
    "# keep only LC sequences of at most 30 characters\n",
    "lc_df = lc_df[lc_df.sequence.str.len() <= 30]\n",
    "\n",
    "all_seqs = lc_df.sequence.tolist()+tcga_df.sequence.tolist()\n",
    "\n",
    "# used below to translate each 'Net-Label' into its 'Major Class'\n",
    "mapping_dict = load(mapping_dict_path)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Predict with all models"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Skip the all-model prediction when its CSV already exists, so Restart & Run All stays feasible.\n",
    "# Set FORCE_PREDICT = True (or delete the CSV) after changing the inputs or the models.\n",
    "if FORCE_PREDICT or not os.path.exists(PREDICTIONS_PATH):\n",
    "    all_model_predictions = predict_transforna_all_models(all_seqs,trained_on='full',path_to_models=models_path)\n",
    "    all_model_predictions.to_csv(PREDICTIONS_PATH,index=False)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "#read predictions\n",
    "predictions = load(PREDICTIONS_PATH)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## UMAP coordinates per model"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "umaps = {}\n",
    "for model_name in predictions['Model'].unique():\n",
    "    # no UMAP is requested for the 'Ensemble' rows\n",
    "    if model_name == 'Ensemble':\n",
    "        continue\n",
    "    model_seqs = predictions.loc[predictions['Model']==model_name,'Sequence'].tolist()\n",
    "    umaps[model_name] = predict_transforna(model_seqs,model=model_name,trained_on='full',path_to_models=models_path,umap_flag=True)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Plot helpers"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "def build_color_mapping(reference_umap, palette=CLASS_PALETTE):\n",
    "    \"\"\"Map each unambiguous major class found in ``reference_umap`` to a fixed colour.\n",
    "\n",
    "    Classes are the sorted unique values of 'Net-Label' translated through\n",
    "    ``mapping_dict``; classes containing ';' are skipped. Labels missing from\n",
    "    ``mapping_dict`` (NaN after mapping) are ignored. If there are more classes\n",
    "    than palette entries, the surplus classes get no fixed colour.\n",
    "    \"\"\"\n",
    "    major_classes = np.unique(reference_umap['Net-Label'].map(mapping_dict).dropna())\n",
    "    return dict(zip([mc for mc in major_classes if ';' not in mc], palette))\n",
    "\n",
    "\n",
    "def with_major_class(umap_df):\n",
    "    \"\"\"Return a new frame with a 'Major Class' column and without ';' classes.\n",
    "\n",
    "    The input frame is left untouched. Rows whose 'Net-Label' is missing from\n",
    "    ``mapping_dict`` are dropped as well (na=True) instead of breaking the mask.\n",
    "    \"\"\"\n",
    "    major_class = umap_df['Net-Label'].map(mapping_dict)\n",
    "    excluded = major_class.str.contains(';', regex=False, na=True).astype(bool)\n",
    "    return umap_df.assign(**{'Major Class': major_class})[~excluded]\n",
    "\n",
    "\n",
    "def plot_major_class_umap(umap_df, title, color_mapping, x_range=None, y_range=None):\n",
    "    \"\"\"Scatter UMAP1 vs UMAP2 coloured by 'Major Class'.\n",
    "\n",
    "    ``x_range`` / ``y_range`` optionally restrict the visible axis window.\n",
    "    Returns the plotly figure.\n",
    "    \"\"\"\n",
    "    fig = px.scatter(umap_df, x='UMAP1', y='UMAP2', color='Major Class',\n",
    "                     hover_data=['Sequence'], title=title,\n",
    "                     width=800, height=800, color_discrete_map=color_mapping)\n",
    "    fig.update_traces(marker=dict(size=1))\n",
    "    # transparent plot background\n",
    "    fig.update_layout(plot_bgcolor='rgba(0,0,0,0)')\n",
    "    if x_range is not None:\n",
    "        fig.update_xaxes(range=x_range)\n",
    "    if y_range is not None:\n",
    "        fig.update_yaxes(range=y_range)\n",
    "    return fig\n",
    "\n",
    "\n",
    "def save_figure(fig, stem):\n",
    "    \"\"\"Write ``fig`` as PNG and SVG to FIGURES_DIR, creating the directory if needed.\"\"\"\n",
    "    os.makedirs(FIGURES_DIR, exist_ok=True)\n",
    "    for extension in ('png', 'svg'):\n",
    "        fig.write_image(f'{FIGURES_DIR}/{stem}.{extension}')\n",
    "\n",
    "\n",
    "def paired_fraction(structure):\n",
    "    \"\"\"Return the share of characters in ``structure`` that are not '.' (NaN if empty).\"\"\"\n",
    "    if not structure:\n",
    "        return np.nan\n",
    "    return (len(structure)-structure.count('.'))/len(structure)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Major-class UMAPs"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# zoomed view of every model, restricted to the ZOOM_X_RANGE x ZOOM_Y_RANGE window\n",
    "color_mapping = build_color_mapping(umaps[REFERENCE_MODEL])\n",
    "for model_name,model_umap in umaps.items():\n",
    "    fig = plot_major_class_umap(with_major_class(model_umap),model_name,color_mapping,\n",
    "                                x_range=ZOOM_X_RANGE,y_range=ZOOM_Y_RANGE)\n",
    "    save_figure(fig,f'lc_tcga_umap_selected_{model_name}')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# full view of every model\n",
    "for model_name,model_umap in umaps.items():\n",
    "    fig = plot_major_class_umap(with_major_class(model_umap),model_name,color_mapping)\n",
    "    save_figure(fig,f'lc_tcga_umap_{model_name}')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# interactive close-up of the reference model: only points inside the circle\n",
    "# with centre CIRCLE_CENTER and radius CIRCLE_RADIUS are displayed\n",
    "reference_umap = with_major_class(umaps[REFERENCE_MODEL])\n",
    "distance_to_center = np.sqrt((reference_umap['UMAP1']-CIRCLE_CENTER[0])**2+(reference_umap['UMAP2']-CIRCLE_CENTER[1])**2)\n",
    "fig = plot_major_class_umap(reference_umap[distance_to_center<=CIRCLE_RADIUS],REFERENCE_MODEL,color_mapping)\n",
    "fig.show()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Secondary-structure ratio"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "struct_umap = umaps[STRUCT_MODEL]\n",
    "# Fold exactly the sequences of the frame that gets plotted, in its row order, so the ratios\n",
    "# line up with the UMAP rows (assumes fold_sequences keeps the input order - TODO confirm).\n",
    "sec_struct = fold_sequences(struct_umap['Sequence'].tolist())['structure_37']\n",
    "#sec struct ratio is calculated as the number of non '.' characters divided by the length of the structure string\n",
    "sec_struct_ratio = sec_struct.apply(paired_fraction)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "fig = px.scatter(struct_umap,x='UMAP1',y='UMAP2',color=sec_struct_ratio,hover_data=['Sequence'],title=STRUCT_MODEL,\n",
    "                 width = 800, height=800,color_continuous_scale='Viridis')\n",
    "fig.update_traces(marker=dict(size=1))\n",
    "# transparent plot background\n",
    "fig.update_layout(plot_bgcolor='rgba(0,0,0,0)')\n",
    "#save\n",
    "save_figure(fig,f'lc_tcga_umap_{STRUCT_MODEL}_dot_bracket')"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "transforna",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.9.18"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}