# File size: 36,695 Bytes (commit 0b11a42)
######################################################################################################
# annotate sequences based on mapping results
######################################################################################################
#%%
import os
import logging
import numpy as np
import pandas as pd
from difflib import get_close_matches
from Levenshtein import distance
import json
from joblib import Parallel, delayed
import multiprocessing
from utils import (fasta2df, fasta2df_subheader,log_time, reverse_complement)
from precursor_bins import get_bin_with_max_overlap
log = logging.getLogger(__name__)
pd.options.mode.chained_assignment = None
######################################################################################################
# paths to reference and mapping files
######################################################################################################
# version suffix that is part of the HBDxBase csv file name
version = '_v4'

# reference files (all located below the same HBDxBase directory)
_HBDXBASE_DIR = '../../references/HBDxBase'
HBDxBase_csv = f'{_HBDXBASE_DIR}/HBDxBase_all{version}.csv'
miRBase_mature_path = f'{_HBDXBASE_DIR}/miRBase/mature.fa'
mat_miRNA_pos_path = f'{_HBDXBASE_DIR}/miRBase/hsa_mature_position.txt'

# mapping result files (relative file names)
mapped_file = 'seqsmapped2HBDxBase_combined.txt'
mapped_genome_file = 'seqsmapped2genome_combined.txt'
TE_file = 'tmp_seqsmapped2genome_intersect_TE.txt'
toomanyloci_genome_file = 'tmp_seqs0mm2genome__toomanyalign.fa'

# fasta files with sequences that did not map in the respective mapping step
unmapped_file = 'tmp_seqs3mm2HBDxBase_pseudo__unmapped.fa'
unmapped_adapter_file = 'tmp_seqs3mm2adapters__unmapped.fa'
unmapped_genome_file = 'tmp_seqs0mm2genome__unmapped.fa'
unmapped_bacterial_file = 'tmp_seqs0mm2bacterial__unmapped.fa'
unmapped_viral_file = 'tmp_seqs0mm2viral__unmapped.fa'

# annotation file names
sRNA_anno_file = 'sRNA_anno_from_mapping.csv'
aggreg_sRNA_anno_file = 'sRNA_anno_aggregated_on_seq.csv'
#%%
######################################################################################################
# specific functions
######################################################################################################
@log_time(log)
def extract_general_info(mapping_file):
    """Load a mapping file and derive general per-alignment information.

    Parameters
    ----------
    mapping_file : str
        Path to a tab-separated file without header whose six columns are
        read as 'tmp_seq_id', 'reference', 'ref_start', 'sequence',
        'other_alignments' and 'mm_descriptors'.

    Returns
    -------
    pd.DataFrame
        One row per alignment, extended by 'precursor_length',
        'precursor_bins' and 'pseudo_class' (left-merged from HBDxBase_csv
        on 'reference') and by the derived columns 'mms',
        'small_RNA_class_annotation', 'subclass_type', 'precursor_name_full',
        'precursor_name', 'seq_length', 'ref_end' and 'mitochondrial'.
    """
    # load mapping file
    mapping_df = pd.read_csv(mapping_file, sep='\t', header=None)
    mapping_df.columns = ['tmp_seq_id','reference','ref_start','sequence','other_alignments','mm_descriptors']
    # if no alignment carries a mismatch descriptor the column is read as float64 (all NaN)
    # and the .str accessor would raise; object dtype keeps the NaNs and allows .str
    mapping_df['mm_descriptors'] = mapping_df['mm_descriptors'].astype(object)

    # add precursor length + number of bins that will be used for names
    HBDxBase_df = pd.read_csv(HBDxBase_csv, index_col=0)
    HBDxBase_df = HBDxBase_df[['precursor_length','precursor_bins','pseudo_class']].reset_index()
    HBDxBase_df.rename(columns={'index': "reference"}, inplace=True)
    mapping_df = mapping_df.merge(HBDxBase_df, on='reference', how='left')

    # number of mismatches = number of '>' in the descriptor string (0 if there is no descriptor)
    mapping_df.loc[:,'mms'] = mapping_df.mm_descriptors.fillna('').str.count('>')
    mapping_df.loc[:,'mm_descriptors'] = mapping_df.mm_descriptors.str.replace(',', ';', regex=False)

    # the reference id is '|'-separated: <class>|<precursor name>|<subclass type>
    reference_parts = mapping_df.reference.str.split('|')
    mapping_df.loc[:,'small_RNA_class_annotation'] = reference_parts.str[0]
    mapping_df.loc[:,'subclass_type'] = reference_parts.str[2]
    mapping_df.loc[:,'precursor_name_full'] = reference_parts.str[1]
    # precursor name without the part following '__'
    mapping_df.loc[:,'precursor_name'] = mapping_df.precursor_name_full.str.split('__').str[0]

    mapping_df.loc[:,'seq_length'] = mapping_df.sequence.str.len()
    mapping_df.loc[:,'ref_end'] = mapping_df.ref_start + mapping_df.seq_length - 1
    # plain alternation (no capture groups) avoids the pandas match-group warning of str.contains
    mapping_df.loc[:,'mitochondrial'] = np.where(mapping_df.reference.str.contains(r'\|MT-|12S|16S'), 'mito', 'nuclear')
    return mapping_df
#%%
@log_time(log)
def tRNA_annotation(mapping_df):
    """Derive the tRNA specific annotation from the mapping.

    Leader and trailer fragments are re-labelled depending on their cutting
    site, all remaining tRNA alignments without a subclass type get a tRF
    subclass based on their position within the precursor and a
    'tRNA_ref_iso' flag. Returns the concatenated DataFrame with the adjusted
    'precursor_name' and the new column 'subclass_name'.
    """
    # --- leader: keep 'leader_tRF' only if start + length lies within 45..55, else 'misc-leader-tRF'
    tRF_leader_df = mapping_df[mapping_df['subclass_type'] == 'leader_tRF']
    leader_cut_site = tRF_leader_df.ref_start + tRF_leader_df.sequence.apply(len)
    leader_in_range = leader_cut_site.between(45, 55, inclusive='both')
    tRF_leader_df.loc[:,'subclass_type'] = np.where(leader_in_range, 'leader_tRF', 'misc-leader-tRF')

    # --- trailer: keep 'trailer_tRF' only if the start lies within 0..5, else 'misc-trailer-tRF'
    tRF_trailer_df = mapping_df[mapping_df['subclass_type'] == 'trailer_tRF']
    trailer_in_range = tRF_trailer_df.ref_start.between(0, 5, inclusive='both')
    tRF_trailer_df.loc[:,'subclass_type'] = np.where(trailer_in_range, 'trailer_tRF', 'misc-trailer-tRF')

    # --- remaining tRNA alignments (no subclass type assigned yet)
    # NOTE: allow more flexibility at ends (similar to miRNA annotation)
    is_untyped_tRNA = (mapping_df['small_RNA_class_annotation'] == 'tRNA') & mapping_df['subclass_type'].isna()
    tRNAs_df = mapping_df[is_untyped_tRNA]

    near_5p_end = tRNAs_df.ref_start < 3
    at_least_30nt = tRNAs_df.seq_length >= 30
    below_30nt = tRNAs_df.seq_length < 30
    # number of precursor positions that follow the last aligned position
    dist_to_3p_end = tRNAs_df.precursor_length - (tRNAs_df.ref_end + 1)

    # the order matters: later assignments overwrite earlier ones
    tRNAs_df.loc[(near_5p_end & at_least_30nt),'subclass_type'] = '5p-tR-half'
    tRNAs_df.loc[(near_5p_end & below_30nt),'subclass_type'] = '5p-tRF'
    tRNAs_df.loc[((dist_to_3p_end < 6) & at_least_30nt),'subclass_type'] = '3p-tR-half'
    # NOTE(review): with inclusive='neither' a distance of exactly 3 is not labelled '3p-tRF'
    # although it is flagged as 'reftRF' below - confirm that this is intended
    tRNAs_df.loc[(dist_to_3p_end.between(3,6,inclusive='neither') & below_30nt),'subclass_type'] = '3p-tRF'
    tRNAs_df.loc[((dist_to_3p_end < 3) & below_30nt),'subclass_type'] = '3p-CCA-tRF'
    tRNAs_df.loc[tRNAs_df.subclass_type.isna(),'subclass_type'] = 'misc-tRF'

    # --- ref_iso flag: 'reftRF' if the fragment starts at position 0, ends at the
    # precursor end or ends 3 positions before the precursor end
    aligned_end = tRNAs_df.ref_end + 1
    is_reference_fragment = (
        (tRNAs_df.ref_start == 0)
        | (aligned_end == tRNAs_df.precursor_length)
        | (aligned_end == (tRNAs_df.precursor_length - 3))
    )
    tRNAs_df['tRNA_ref_iso'] = np.where(is_reference_fragment, 'reftRF', 'isotRF')

    # --- concat tRNA, leader & trailer dfs
    tRNAs_df = pd.concat([tRNAs_df, tRF_leader_df, tRF_trailer_df],axis=0)

    # --- adjust precursor name (first capture group of the pattern) and create tRNA name
    extracted_names = tRNAs_df.precursor_name.str.extract(r"((tRNA-...-...)|(MT-..)|(tRX-...-...)|(tRNA-i...-...))", expand=True)
    tRNAs_df['precursor_name'] = extracted_names[0]
    tRNAs_df['subclass_name'] = tRNAs_df.subclass_type + '__' + tRNAs_df.precursor_name
    return tRNAs_df
#%%
def faustrules_check(row):
    """Check if isomiRs follow Faustrules (based on Tomasello et al. 2021).

    Parameters
    ----------
    row : mapping (e.g. pd.Series or dict)
        Must provide 'ref_start', 'mature_start', 'mm_descriptors' and
        'sequence'. 'mm_descriptors' is either missing (NaN/None) or a
        ';'-separated string of descriptors of the form '<pos>:<orig>><new>'.

    Returns
    -------
    bool
        False if the alignment start is not within +/- 2nt of the mature
        start. Otherwise True if there are no mismatches or if every single
        mismatch is either A>G / C>T within the seed region (position 0-8) or
        lies within the last 3 positions and the sequence from the mismatch
        position to the end is a 3' polyA/polyT (max 3nt).
    """
    # mark seqs that are not in range +/- 2nt of mature start
    ref_start = row['ref_start']
    mature_start = row['mature_start']
    if ref_start < mature_start - 2 or ref_start > mature_start + 2:
        return False

    # no mismatches -> nothing else to check
    mm_descriptors = row['mm_descriptors']
    if pd.isna(mm_descriptors):
        return True

    seed_region_positions = set(range(9))
    allowed_seed_changes = {('A', 'G'), ('C', 'T')}
    non_templated_ends = {'A', 'AA', 'AAA', 'T', 'TT', 'TTT'}
    sequence = row['sequence']
    three_prime_start = len(sequence) - 3

    # every mismatch is validated on its own; summing separate seed and 3' end counters
    # counts a mismatch that satisfies both rules twice, which can hide an invalid mismatch
    # or reject a set of mismatches that are all valid
    for descriptor in mm_descriptors.split(';'):
        pos, change = descriptor.split(':')
        pos = int(pos)
        original, new = change.split('>')
        allowed_in_seed = pos in seed_region_positions and (original, new) in allowed_seed_changes
        allowed_at_three_prime_end = pos >= three_prime_start and sequence[pos:] in non_templated_ends
        if not (allowed_in_seed or allowed_at_three_prime_end):
            return False
    return True
@log_time(log)
def miRNA_annotation(mapping_df):
    """Derive the miRNA specific annotation from the mapping (RaH Faustrules are applied).

    Returns
    -------
    tuple
        (miRNAs_df, nr_missing_alignments_expected): the annotated miRNA
        alignments restricted to the columns of mapping_df plus
        'subclass_name', 'miRNA_ref_iso', 'miRNA_names' and 'ref_miR_seq',
        and the number of alignments that are dropped because the
        'tmp_seq_id'/'reference' combination occurs more than once.
    """
    miRNAs_df = mapping_df[mapping_df.small_RNA_class_annotation == 'miRNA']
    is_repeated_alignment = miRNAs_df.duplicated(['tmp_seq_id','reference'], keep='first')
    nr_missing_alignments_expected = len(miRNAs_df.loc[is_repeated_alignment,:])

    # positions of mature miRNAs within the precursor
    miRNA_pos_df = pd.read_csv(mat_miRNA_pos_path, sep='\t')
    miRNA_pos_df = miRNA_pos_df.drop(columns=['precursor_length'])
    miRNAs_df = miRNAs_df.merge(miRNA_pos_df, left_on='precursor_name_full', right_on='name_precursor', how='left')

    # mature miRNA sequences from miRBase, subset to human and without 'hsa-' in the name
    miRBase_mature_df = fasta2df_subheader(miRBase_mature_path,0)
    is_human = miRBase_mature_df.index.str.contains('hsa-')
    miRBase_mature_df = miRBase_mature_df.loc[is_human,:]
    miRBase_mature_df.index = miRBase_mature_df.index.str.replace('hsa-','')
    miRBase_mature_df = miRBase_mature_df.reset_index()
    miRBase_mature_df.columns = ['name_mature','ref_miR_seq']

    # add 'ref_miR_seq'
    miRNAs_df = miRNAs_df.merge(miRBase_mature_df, on='name_mature', how='left')

    # for each duplicated tmp_seq_id/reference combi, keep the one with the lowest
    # Levenshtein distance of sequence to ref_miR_seq
    def _dist_to_reference(row):
        return distance(row['sequence'], row['ref_miR_seq'])

    miRNAs_df['lev_dist'] = miRNAs_df.apply(_dist_to_reference, axis=1)
    miRNAs_df = miRNAs_df.sort_values(by=['tmp_seq_id','lev_dist'], ascending=[True, True])
    miRNAs_df = miRNAs_df.drop_duplicates(['tmp_seq_id','reference'], keep='first')

    # ref_iso flag: 'refmiR' only for an exact match of start, end and sequence
    is_reference_miR = (
        (miRNAs_df.ref_start == miRNAs_df.mature_start)
        & (miRNAs_df.ref_end == miRNAs_df.mature_end)
        & (miRNAs_df.mms == 0)
    )
    miRNAs_df['miRNA_ref_iso'] = np.where(is_reference_miR, 'refmiR', 'isomiR')

    # apply RaH Faustrules; failing alignments become 'misc-miR'
    miRNAs_df['faustrules_check'] = miRNAs_df.apply(faustrules_check, axis=1)
    miRNAs_df.loc[~miRNAs_df.faustrules_check,'miRNA_ref_iso'] = 'misc-miR'

    # subclass_name: mature name if the Faustrules are met, else the precursor name
    miRNAs_df['subclass_name'] = np.where(miRNAs_df.faustrules_check, miRNAs_df.name_mature, miRNAs_df.precursor_name)
    # miRNA_names (for functional analysis): mature name, with 'miR-' set to 'mir-' if the Faustrules are not met
    lowercase_names = miRNAs_df.name_mature.str.replace('miR-', 'mir-')
    miRNAs_df['miRNA_names'] = np.where(miRNAs_df.faustrules_check, miRNAs_df.name_mature, lowercase_names)

    # subclass type from the suffix of the mature name; if the name has no 5p/3p suffix,
    # use the position of the mature start relative to the precursor half
    three_prime_or_tbd = np.where(miRNAs_df.name_mature.str.endswith('3p'), '3p', 'tbd')
    miRNAs_df['subclass_type'] = np.where(miRNAs_df.name_mature.str.endswith('5p'), '5p', three_prime_or_tbd)
    precursor_half = miRNAs_df.precursor_length/2
    miRNAs_df.loc[((miRNAs_df.subclass_type == 'tbd') & (miRNAs_df.mature_start < precursor_half)), 'subclass_type'] = '5p'
    miRNAs_df.loc[((miRNAs_df.subclass_type == 'tbd') & (miRNAs_df.mature_start >= precursor_half)), 'subclass_type'] = '3p'

    # subset to relevant columns
    relevant_columns = list(mapping_df.columns) + ['subclass_name','miRNA_ref_iso','miRNA_names','ref_miR_seq']
    miRNAs_df = miRNAs_df[relevant_columns]
    return miRNAs_df, nr_missing_alignments_expected
#%%
######################################################################################################
# annotation of other sRNA classes
######################################################################################################
def get_bin_with_max_overlap_parallel(df):
    """Apply get_bin_with_max_overlap to every row of df and return the result."""
    binned_rows = df.apply(get_bin_with_max_overlap, axis=1)
    return binned_rows
def applyParallel(df, func, n_chunks=30):
    """Apply func to consecutive row chunks of df in parallel and concatenate the results.

    Parameters
    ----------
    df : pd.DataFrame
        The frame to process.
    func : callable
        Called once per chunk with a DataFrame holding the rows of that chunk.
    n_chunks : int
        Number of (nearly) equally sized chunks df is split into; defaults to
        the previously hard-coded value of 30.

    Returns
    -------
    pd.DataFrame
        The per-chunk results concatenated in chunk order.
    """
    # split the row positions instead of the frame itself: np.array_split on a
    # DataFrame goes through the deprecated DataFrame.swapaxes
    chunk_positions = np.array_split(np.arange(len(df)), n_chunks)
    chunks = (df.iloc[positions] for positions in chunk_positions)
    retLst = Parallel(n_jobs=multiprocessing.cpu_count())(delayed(func)(chunk) for chunk in chunks)
    return pd.concat(retLst)
@log_time(log)
def other_sRNA_annotation_new_binning(mapping_df):
    """Generate subclass_name for non-tRNA/miRNA sRNAs by precursor-binning.

    New binning approach: bin size is dynamically determined by the precursor length.
    Assignments are based on the bin with the highest overlap.
    """
    # everything that is neither annotated as miRNA nor as tRNA
    is_miRNA_or_tRNA = mapping_df.small_RNA_class_annotation.isin(['miRNA', 'tRNA'])
    other_sRNAs_df = mapping_df[~is_miRNA_or_tRNA]
    # create empty columns; bin start and bin end
    for bin_column in ('bin_start', 'bin_end'):
        other_sRNAs_df[bin_column] = ''
    # the binning itself is done row-wise on chunks that are processed in parallel
    return applyParallel(other_sRNAs_df, get_bin_with_max_overlap_parallel)
#%%
@log_time(log)
def extract_sRNA_class_specific_info(mapping_df):
    """Run the tRNA, miRNA and other-sRNA annotation and re-combine the results.

    Returns the concatenation of the miRNA, tRNA and other-sRNA DataFrames
    (in that order). Asserts that the number of rows equals the number of
    input alignments minus the miRNA alignments that are expected to be
    dropped.
    """
    tRNAs_df = tRNA_annotation(mapping_df)
    miRNAs_df, nr_missing_alignments_expected = miRNA_annotation(mapping_df)
    other_sRNAs_df = other_sRNA_annotation_new_binning(mapping_df)

    # add the miRNA columns (without values) to the non-miRNA frames
    miRNA_columns = ['miRNA_ref_iso', 'miRNA_names', 'ref_miR_seq']
    for class_df in (tRNAs_df, other_sRNAs_df):
        class_df[miRNA_columns] = pd.DataFrame(columns=miRNA_columns)

    # re-concat sRNA class dfs
    sRNA_anno_df = pd.concat([miRNAs_df, tRNAs_df, other_sRNAs_df],axis=0)

    # TEST if alignments were lost or duplicated
    nr_alignments_expected = len(mapping_df) - nr_missing_alignments_expected
    assert (nr_alignments_expected == len(sRNA_anno_df)), "alignments were lost or duplicated"
    return sRNA_anno_df
#%%
def get_nth_nt(row):
    """Return the character of row['sequence'] at the 1-based position row['PTM_position_in_seq']."""
    sequence = row['sequence']
    zero_based_position = int(row['PTM_position_in_seq']) - 1
    return sequence[zero_based_position]
#%%
@log_time(log)
def aggregate_info_per_seq(sRNA_anno_df):
    """Aggregate the per-alignment annotation to one row per sequence.

    Parameters
    ----------
    sRNA_anno_df : pd.DataFrame
        Per-alignment annotation; the columns used here are 'sequence',
        'small_RNA_class_annotation', 'pseudo_class', 'subclass_type',
        'subclass_name', 'subclass_name_bin_pos', 'miRNA_names',
        'precursor_name_full', 'mms', 'reference', 'mitochondrial',
        'ref_miR_seq', 'miRNA_ref_iso' and 'tRNA_ref_iso'.
        NOTE: 'subclass_name_bin_pos' is filled in place.

    Returns
    -------
    pd.DataFrame
        One row per sequence ('sequence' is a regular column). Multiple values
        per sequence are joined by ';', 'reference' holds the number of
        alignments. Sequences listed in unmapped_file (if it exists) are
        appended with 'no_annotation'. 'num_genomic_loci_maps' holds the
        number of genome alignments (101 for sequences in
        toomanyloci_genome_file, 0 for sequences in unmapped_genome_file) and
        'miRNA_seed' holds sequence[1:9] for sequences annotated as miRNA.
    """
    def _join_sorted_unique(values):
        return ';'.join(sorted(values.unique()))

    def _join_sorted_unique_as_str(values):
        return ';'.join(values.astype(str).sort_values(ascending=True).unique())

    def _join_unique_na_as_empty(values):
        return ';'.join(values.fillna('').unique())

    # fillna of 'subclass_name_bin_pos' with 'subclass_name'
    sRNA_anno_df['subclass_name_bin_pos'] = sRNA_anno_df['subclass_name_bin_pos'].fillna(sRNA_anno_df['subclass_name'])

    # get aggregated info per seq
    aggreg_per_seq_df = sRNA_anno_df.groupby(['sequence']).agg({
        'small_RNA_class_annotation': _join_sorted_unique,
        'pseudo_class': _join_sorted_unique_as_str,
        'subclass_type': _join_sorted_unique_as_str,
        'subclass_name': _join_sorted_unique,
        'subclass_name_bin_pos': _join_sorted_unique,
        'miRNA_names': _join_unique_na_as_empty,
        'precursor_name_full': _join_sorted_unique,
        'mms': _join_sorted_unique_as_str,
        'reference': lambda x: len(x),
        'mitochondrial': _join_sorted_unique_as_str,
        'ref_miR_seq': _join_unique_na_as_empty,
    })
    # a trailing ';' results from joining an empty (missing) value last
    aggreg_per_seq_df['miRNA_names'] = aggreg_per_seq_df.miRNA_names.str.replace(r';$','', regex=True)
    aggreg_per_seq_df['ref_miR_seq'] = aggreg_per_seq_df.ref_miR_seq.str.replace(r';$','', regex=True)
    aggreg_per_seq_df['mms'] = aggreg_per_seq_df['mms'].astype(int)

    # re-add 'miRNA_ref_iso','tRNA_ref_iso' (values of the first row per sequence)
    refmir_df = sRNA_anno_df[['sequence','miRNA_ref_iso','tRNA_ref_iso']]
    refmir_df = refmir_df.drop_duplicates('sequence').set_index('sequence')
    aggreg_per_seq_df = aggreg_per_seq_df.merge(refmir_df, left_index=True, right_index=True, how='left')

    # TEST if sequences were lost
    assert (len(aggreg_per_seq_df) == len(sRNA_anno_df.sequence.unique())), "sequences were lost by aggregation"

    # load unmapped seqs, if the file exists
    if os.path.exists(unmapped_file):
        unmapped_df = fasta2df(unmapped_file)
        unmapped_df = pd.DataFrame(data='no_annotation', index=unmapped_df.sequence, columns=aggreg_per_seq_df.columns)
        unmapped_df['mms'] = np.nan
        unmapped_df['reference'] = np.nan
        unmapped_df['pseudo_class'] = True # set no annotation as pseudo_class
        # merge mapped and unmapped
        annotation_df = pd.concat([aggreg_per_seq_df,unmapped_df])
    else:
        annotation_df = aggreg_per_seq_df.copy()

    # load mapping to genome file
    mapping_genome_df = pd.read_csv(mapped_genome_file, index_col=0, sep='\t', header=None)
    mapping_genome_df.columns = ['strand','reference','ref_start','sequence','other_alignments','mm_descriptors']
    mapping_genome_df = mapping_genome_df[['strand','reference','ref_start','sequence','other_alignments']]
    # use reverse complement of 'sequence' for 'strand' == '-'
    mapping_genome_df.loc[:,'sequence'] = np.where(mapping_genome_df.strand == '-', mapping_genome_df.sequence.apply(reverse_complement), mapping_genome_df.sequence)

    # number of genomic loci = number of genome alignments per sequence
    genomic_loci_df = pd.DataFrame(mapping_genome_df.sequence.value_counts())
    genomic_loci_df.columns = ['num_genomic_loci_maps']

    # load too many alignments seqs (recorded with 101 loci)
    if os.path.exists(toomanyloci_genome_file):
        toomanyloci_genome_df = fasta2df(toomanyloci_genome_file)
        toomanyloci_genome_df = pd.DataFrame(data=101, index=toomanyloci_genome_df.sequence, columns=genomic_loci_df.columns)
    else:
        toomanyloci_genome_df = pd.DataFrame(columns=genomic_loci_df.columns)

    # load unmapped seqs (recorded with 0 loci)
    if os.path.exists(unmapped_genome_file):
        unmapped_genome_df = fasta2df(unmapped_genome_file)
        unmapped_genome_df = pd.DataFrame(data=0, index=unmapped_genome_df.sequence, columns=genomic_loci_df.columns)
    else:
        unmapped_genome_df = pd.DataFrame(columns=genomic_loci_df.columns)

    # concat toomanyloci, unmapped, and genomic_loci
    num_genomic_loci_maps_df = pd.concat([genomic_loci_df,toomanyloci_genome_df,unmapped_genome_df])

    # merge to annotation_df
    annotation_df = annotation_df.merge(num_genomic_loci_maps_df, left_index=True, right_index=True, how='left')
    annotation_df.reset_index(inplace=True)

    # add 'miRNA_seed'
    annotation_df.loc[:,"miRNA_seed"] = np.where(annotation_df.small_RNA_class_annotation.str.contains('miRNA', na=False), annotation_df.sequence.str[1:9], "")

    # TEST if nan values in 'num_genomic_loci_maps'
    assert (annotation_df.num_genomic_loci_maps.isna().any() == False), "nan values in 'num_genomic_loci_maps'"
    return annotation_df
#%%
@log_time(log)
def get_five_prime_adapter_info(annotation_df, five_prime_adapter):
    """Flag sequences that start with a suffix of the 5' adapter.

    All suffixes of five_prime_adapter that are longer than 6 characters are
    tested, starting with the complete adapter. The number of matching
    sequences per suffix is printed.

    Parameters
    ----------
    annotation_df : pd.DataFrame
        Must provide the column 'sequence'.
    five_prime_adapter : str
        The 5' adapter sequence.

    Returns
    -------
    pd.DataFrame
        Columns 'sequence', 'five_prime_adapter_length' (length of the matched
        adapter suffix, 0 if no suffix matches) and 'five_prime_adapter_filter'
        (True if no adapter suffix was found).
    """
    adapter_df = pd.DataFrame(index=annotation_df.sequence)
    min_length = 6
    # start from "nothing is prefixed" so that an adapter that is not longer than
    # min_length (no suffix to test) yields an all-zero result instead of a crash
    is_prefixed = np.zeros(len(adapter_df), dtype=bool)
    adapter_df['five_prime_adapter_length'] = 0

    print("5' adapter affixes:")
    for offset in range(0, len(five_prime_adapter) - min_length):
        adapter_suffix = five_prime_adapter[offset:]
        is_prefixed_by_suffix = adapter_df.index.str.startswith(adapter_suffix)
        print(f"{adapter_suffix.ljust(30, ' ')}{is_prefixed_by_suffix.sum()}")
        # NOTE: suffixes are tested from long to short, a later (shorter) match overwrites an earlier one
        adapter_df.loc[is_prefixed_by_suffix, "five_prime_adapter_length"] = len(adapter_suffix)
        is_prefixed = is_prefixed | is_prefixed_by_suffix
    print(f"There are {is_prefixed.sum()} prefixed features.")
    print("\n")

    adapter_df['five_prime_adapter_length'] = adapter_df['five_prime_adapter_length'].astype('int')
    adapter_df['five_prime_adapter_filter'] = np.where(adapter_df['five_prime_adapter_length'] == 0, True, False)
    adapter_df = adapter_df.reset_index()
    return adapter_df
#%%
@log_time(log)
def reduce_ambiguity(annotation_df: pd.DataFrame) -> pd.DataFrame:
    """Reduce ambiguity by

    a) using subclass_name of precursor with shortest genomic context, if all other
       assigned precursors overlap with its genomic region
    b) using subclass_name whose bin is at the 5' or 3' end of the precursor

    If both a) and b) apply, the result of a) is used.

    Parameters
    ----------
    annotation_df : pd.DataFrame
        A DataFrame containing the annotation of the sequences (var)

    Returns
    -------
    pd.DataFrame
        An improved version of the input DataFrame with reduced ambiguity and the
        additional boolean column 'ambiguity_resolved'
    """
    # extract ambiguous assignments for subclass name
    ambigious_matches_df = annotation_df[annotation_df.subclass_name.str.contains(';',na=False)]
    if len(ambigious_matches_df) == 0:
        print('No ambigious assignments for subclass name found.')
        # return the same columns as the regular code path below
        annotation_df = annotation_df.copy()
        annotation_df['ambiguity_resolved'] = False
        return annotation_df
    clear_matches_df = annotation_df[~annotation_df.subclass_name.str.contains(';',na=False)]

    # extract required information from HBDxBase (lookups keyed by precursor name)
    HBDxBase_all_df = pd.read_csv(HBDxBase_csv, index_col=0)
    bin_dict = HBDxBase_all_df[['precursor_name','precursor_bins']].set_index('precursor_name').to_dict()['precursor_bins']
    sRNA_class_dict = HBDxBase_all_df[['precursor_name','small_RNA_class_annotation']].set_index('precursor_name').to_dict()['small_RNA_class_annotation']
    pseudo_class_dict = HBDxBase_all_df[['precursor_name','pseudo_class']].set_index('precursor_name').to_dict()['pseudo_class']
    sc_type_dict = HBDxBase_all_df[['precursor_name','subclass_type']].set_index('precursor_name').to_dict()['subclass_type']

    # genomic location of every precursor in bed-like format
    genomic_context_bed = HBDxBase_all_df[['chr','start','end','precursor_name','score','strand']].copy()
    genomic_context_bed.columns = ['seq_id','start','end','name','score','strand']
    genomic_context_bed.reset_index(drop=True, inplace=True)
    genomic_context_bed['genomic_length'] = genomic_context_bed.end - genomic_context_bed.start

    def get_overlaps(genomic_context_bed: pd.DataFrame, name: str = None, complement: bool = False) -> list:
        """Get genomic overlap of a given precursor name

        Parameters
        ----------
        genomic_context_bed : pd.DataFrame
            A DataFrame containing genomic locations of precursors in bed format
            with column names: 'seq_id','start','end','name','score','strand'
        name : str
            The name of the precursor to get genomic context for
        complement : bool
            If True, return all precursors that do not overlap with the given precursor

        Returns
        -------
        list
            A list containing the precursors in the genomic (anti-)context of the given precursor
            (including the precursor itself); only precursors with the same 'seq_id' and
            'strand' as the given precursor are returned
        """
        # location of the precursor of interest (first entry with that name)
        series_OI = genomic_context_bed[genomic_context_bed['name'] == name]
        start = series_OI['start'].values[0]
        end = series_OI['end'].values[0]
        seq_id = series_OI['seq_id'].values[0]
        strand = series_OI['strand'].values[0]

        overlap_df = genomic_context_bed.copy()

        condition = (((overlap_df.start > start) &
                      (overlap_df.start < end)) |
                     ((overlap_df.end > start) &
                      (overlap_df.end < end)) |
                     ((overlap_df.start < start) &
                      (overlap_df.end > start)) |
                     ((overlap_df.start == start) &
                      (overlap_df.end == end)) |
                     ((overlap_df.start == start) &
                      (overlap_df.end > end)) |
                     ((overlap_df.start < start) &
                      (overlap_df.end == end)))
        if not complement:
            overlap_df = overlap_df[condition]
        else:
            overlap_df = overlap_df[~condition]
        overlap_df = overlap_df[overlap_df.seq_id == seq_id]
        if strand is not None:
            overlap_df = overlap_df[overlap_df.strand == strand]
        overlap_list = overlap_df['name'].tolist()
        return overlap_list

    def check_genomic_ctx_of_smallest_prec(precursor_name: str) -> str:
        """Check for a given ambiguous precursor assignment (several names separated by ';')
        if all assigned precursors overlap with the genomic region
        of the precursor with the shortest genomic context

        Parameters
        ----------
        precursor_name: str
            A string containing several precursor names separated by ';'

        Returns
        -------
        str
            The precursor suggested to be used instead of the multi assignment,
            or None if the ambiguity could not be resolved
        """
        assigned_names = precursor_name.split(';')

        tmp_genomic_context = genomic_context_bed[genomic_context_bed.name.isin(assigned_names)]
        # get name of smallest genomic region
        if len(tmp_genomic_context) > 0:
            smallest_name = tmp_genomic_context.name[tmp_genomic_context.genomic_length.idxmin()]
            # check if all assigned names are in overlap of smallest genomic region
            if set(assigned_names).issubset(set(get_overlaps(genomic_context_bed,smallest_name))):
                return smallest_name
            else:
                return None
        else:
            return None

    def get_subclass_name(subclass_name: str, short_prec_match_new_name: str) -> str:
        """Get subclass name matching to a precursor name from an ambiguous assignment (several names separated by ';')

        Parameters
        ----------
        subclass_name: str
            A string containing several subclass names separated by ';'
        short_prec_match_new_name: str
            The name of the precursor to be used instead of the multi assignment

        Returns
        -------
        str
            The subclass name that is the closest match to the given precursor name,
            None if no precursor name is given, or the unchanged (still ambiguous)
            subclass_name if no close match is found
        """
        if short_prec_match_new_name is not None:
            matches = get_close_matches(short_prec_match_new_name,subclass_name.split(';'),cutoff=0.2)
            if matches:
                return matches[0]
            else:
                # NOTE(review): the caller treats every non-null return value as resolved,
                # i.e. this fallback is flagged as resolved although ';' remains - confirm
                print(f"Could not find match for {short_prec_match_new_name} in {subclass_name}")
                return subclass_name
        else:
            return None

    def check_end_bins(subclass_name: str) -> str:
        """Check for a given ambiguous subclass name assignment (several names separated by ';')
        if ambiguity can be resolved by selecting the subclass name whose bin matches the 3'/5' end of the precursor

        Parameters
        ----------
        subclass_name: str
            A string containing several subclass names separated by ';'

        Returns
        -------
        str
            The first subclass name whose bin number equals the number of bins of its
            precursor or equals 1, or None if the ambiguity could not be resolved
        """
        for name in subclass_name.split(';'):
            if '_bin-' in name:
                name_parts = name.split('_bin-')
                if name_parts[0] in bin_dict and bin_dict[name_parts[0]] == int(name_parts[1]):
                    return name
                elif int(name_parts[1]) == 1:
                    return name
        return None

    def adjust_4_resolved_cases(row: pd.Series) -> tuple:
        """For a resolved ambiguous subclass names return adjusted values of
        precursor_name_full, small_RNA_class_annotation, pseudo_class, and subclass_type

        Parameters
        ----------
        row: pd.Series
            A row of the var annotation containing the columns 'subclass_name', 'precursor_name_full',
            'small_RNA_class_annotation', 'pseudo_class', 'subclass_type', and 'ambiguity_resolved'

        Returns
        -------
        tuple
            A tuple containing the adjusted values of 'precursor_name_full', 'small_RNA_class_annotation',
            'pseudo_class', and 'subclass_type' for resolved ambiguous cases and the original values for unresolved cases
        """
        if row.ambiguity_resolved:
            # precursor whose name is the closest match to the selected subclass name
            matches_prec = get_close_matches(row.subclass_name, row.precursor_name_full.split(';'), cutoff=0.2)
            if matches_prec:
                return matches_prec[0], sRNA_class_dict[matches_prec[0]], pseudo_class_dict[matches_prec[0]], sc_type_dict[matches_prec[0]]
        return row.precursor_name_full, row.small_RNA_class_annotation, row.pseudo_class, row.subclass_type

    # resolve ambiguity by checking genomic context of smallest precursor
    ambigious_matches_df['short_prec_match_new_name'] = ambigious_matches_df.precursor_name_full.apply(check_genomic_ctx_of_smallest_prec)
    ambigious_matches_df['short_prec_match_new_name'] = ambigious_matches_df.apply(lambda x: get_subclass_name(x.subclass_name, x.short_prec_match_new_name), axis=1)
    ambigious_matches_df['short_prec_match'] = ambigious_matches_df['short_prec_match_new_name'].notnull()

    # resolve ambiguity by checking if bin matches 3'/5' end of precursor
    ambigious_matches_df['end_bin_match_new_name'] = ambigious_matches_df.subclass_name.apply(check_end_bins)
    ambigious_matches_df['end_bin_match'] = ambigious_matches_df['end_bin_match_new_name'].notnull()

    # check if short_prec_match and end_bin_match are equal in any case
    test_df = ambigious_matches_df[((ambigious_matches_df.short_prec_match == True) & (ambigious_matches_df.end_bin_match == True))]
    if not (test_df.short_prec_match_new_name == test_df.end_bin_match_new_name).all():
        print('Number of cases where short_prec_match is not matching end_bin_match_new_name:',len(test_df[(test_df.short_prec_match_new_name != test_df.end_bin_match_new_name)]))

    # replace subclass_name with short_prec_match_new_name or end_bin_match_new_name
    # NOTE: if short_prec_match and end_bin_match are True, short_prec_match_new_name is used
    #       (it is applied last and therefore overwrites the end bin result)
    ambigious_matches_df['subclass_name'] = ambigious_matches_df.apply(lambda x: x.end_bin_match_new_name if x.end_bin_match == True else x.subclass_name, axis=1)
    ambigious_matches_df['subclass_name'] = ambigious_matches_df.apply(lambda x: x.short_prec_match_new_name if x.short_prec_match == True else x.subclass_name, axis=1)

    # generate column 'ambiguity_resolved' which is True if short_prec_match and/or end_bin_match is True
    ambigious_matches_df['ambiguity_resolved'] = ambigious_matches_df.short_prec_match | ambigious_matches_df.end_bin_match
    print("Ambiguity resolved?\n",ambigious_matches_df.ambiguity_resolved.value_counts(normalize=True))

    # for resolved ambiguous matches, adjust precursor_name_full, small_RNA_class_annotation, pseudo_class, subclass_type
    ambigious_matches_df[['precursor_name_full','small_RNA_class_annotation','pseudo_class','subclass_type']] = ambigious_matches_df.apply(adjust_4_resolved_cases, axis=1, result_type='expand')

    # drop temporary columns
    ambigious_matches_df.drop(columns=['short_prec_match_new_name','short_prec_match','end_bin_match_new_name','end_bin_match'], inplace=True)

    # concat with clear_matches_df and restore the original row order
    clear_matches_df['ambiguity_resolved'] = False
    improved_annotation_df = pd.concat([clear_matches_df, ambigious_matches_df], axis=0)
    improved_annotation_df = improved_annotation_df.reindex(annotation_df.index)
    return improved_annotation_df
#%%
######################################################################################################
# HICO (=high confidence) annotation
######################################################################################################
@log_time(log)
def add_hico_annotation(annotation_df, five_prime_adapter):
    """Add the filter columns and the high-confidence ('hico') flag.

    Working on the per-sequence annotation table, this
    - left-joins 'TE_annotation' from TE_file,
    - adds the boolean columns 'bacterial', 'viral' and
      'adapter_mapping_filter' from the respective "unmapped" fasta files,
    - merges the 5' adapter information from get_five_prime_adapter_info,
    - runs reduce_ambiguity,
    - adds 'single_class_annotation' and 'single_name_annotation',
    - renames sequences with more than 50 recorded references to
      'hypermapper_<reference count>',
    - converts 'mitochondrial' into a boolean flag,
    - derives 'hico' from the filters above.

    For miRNAs 'hico' is additionally only kept if 'miRNA_ref_iso' is 'refmiR'.

    Returns the extended annotation DataFrame.
    """
    # add 'TE_annotation'
    te_table = pd.read_csv(TE_file, sep='\t', header=None, names=['sequence','TE_annotation'])
    annotation_df = annotation_df.merge(te_table, left_on='sequence', right_on='sequence', how='left')

    # add 'bacterial' and 'viral' mapping filters:
    # a sequence is flagged True when it is NOT listed in the corresponding unmapped file
    for flag_col, unmapped_fasta in (('bacterial', unmapped_bacterial_file),
                                     ('viral', unmapped_viral_file)):
        unmapped_seqs = fasta2df(unmapped_fasta).sequence
        annotation_df.loc[:,flag_col] = ~annotation_df.sequence.isin(unmapped_seqs)

    # add 'adapter_mapping_filter': True when the sequence IS listed in the adapter unmapped file
    adapter_unmapped_seqs = fasta2df(unmapped_adapter_file).sequence
    annotation_df.loc[:,'adapter_mapping_filter'] = annotation_df.sequence.isin(adapter_unmapped_seqs)

    # add filter column 'five_prime_adapter_filter' and column 'five_prime_adapter_length'
    # indicating the length of the prefixed 5' adapter sequence
    five_prime_info_df = get_five_prime_adapter_info(annotation_df, five_prime_adapter)
    annotation_df = annotation_df.merge(five_prime_info_df, left_on='sequence', right_on='sequence', how='left')

    # apply ambiguity reduction
    annotation_df = reduce_ambiguity(annotation_df)

    # 'single_class_annotation' / 'single_name_annotation':
    # False when the value contains ';' or is missing (na=True)
    multi_class = annotation_df.small_RNA_class_annotation.str.contains(';',na=True)
    annotation_df.loc[:,'single_class_annotation'] = np.where(multi_class, False, True)
    multi_name = annotation_df.subclass_name.str.contains(';',na=True)
    annotation_df.loc[:,'single_name_annotation'] = np.where(multi_name, False, True)

    # 'hypermapper': sequences where more than 50 potential mapping references are recorded
    is_hypermapper = annotation_df.reference > 50
    hypermapper_label = 'hypermapper_' + annotation_df.reference.fillna(0).astype(int).astype(str)
    for name_col in ('subclass_name', 'subclass_name_bin_pos', 'precursor_name_full'):
        annotation_df.loc[is_hypermapper,name_col] = hypermapper_label

    # turn 'mitochondrial' into a flag: True when the entry contains 'mito'
    has_mito = annotation_df.mitochondrial.str.contains('mito',na=False)
    annotation_df.loc[:,'mitochondrial'] = np.where(has_mito, True, False)

    # add 'hico': every individual filter has to pass
    passes_all_filters = (
        (annotation_df.mms == 0)
        & annotation_df.single_name_annotation
        & annotation_df.TE_annotation.isna()
        & (annotation_df.bacterial == False)
        & (annotation_df.viral == False)
        & (annotation_df.adapter_mapping_filter == True)
        & (annotation_df.five_prime_adapter_filter == True)
    )
    annotation_df.loc[:,'hico'] = np.where(passes_all_filters, True, False)

    ## NOTE: for miRNAs only use hico annotation if part of refmiR set
    mirna_rows = annotation_df.small_RNA_class_annotation == 'miRNA'
    is_refmir = annotation_df.miRNA_ref_iso == 'refmiR'
    annotation_df.loc[mirna_rows,'hico'] = annotation_df.loc[mirna_rows,'hico'] & is_refmir

    # report hico counts per class for unambiguously classified sequences
    single_class_rows = annotation_df[annotation_df.single_class_annotation == True]
    print(single_class_rows.groupby('small_RNA_class_annotation').hico.value_counts())
    return annotation_df
#%%
######################################################################################################
# annotation pipeline
######################################################################################################
@log_time(log)
def main(five_prime_adapter):
    """Run the annotation pipeline and write its result files.

    Steps performed:
    1. extract general information for the sequences in mapped_file
    2. extract sRNA class specific information and save it to sRNA_anno_file
    3. aggregate the information per sequence
    4. add the hico annotation and save the result, indexed by 'sequence',
       to aggreg_sRNA_anno_file
    5. dump a {subclass_name: small_RNA_class_annotation} dict for all
       unambiguous subclass names to 'subclass_to_annotation.json'
       (written to the current working directory)
    6. delete all files matching '*tmp_*' in the current working directory

    Module-level settings listed by the original documentation as used by the
    pipeline (not all of them are referenced directly in this function):
    - HBDxBase_csv
    - miRBase_mature_path
    - mat_miRNA_pos_path
    - mapping_file
    - unmapped_file
    - mapped_genome_file
    - toomanyloci_genome_file
    - unmapped_genome_file
    - TE_file
    - unmapped_adapter_file
    - unmapped_bacterial_file
    - unmapped_viral_file
    - five_prime_adapter

    Args:
        five_prime_adapter: passed through unchanged to add_hico_annotation.
    """
    print('-------- extract general information for sequences that mapped to the HBDxBase --------')
    mapped_info_df = extract_general_info(mapped_file)
    print("\n")

    print('-------- extract sRNA class specific information for sequences that mapped to the HBDxBase --------')
    mapped_sRNA_anno_df = extract_sRNA_class_specific_info(mapped_info_df)

    print('-------- save to file --------')
    mapped_sRNA_anno_df.to_csv(sRNA_anno_file)
    print("\n")

    print('-------- aggregate information for mapped and unmapped sequences (HBDxBase & human genome) --------')
    sRNA_anno_per_seq_df = aggregate_info_per_seq(mapped_sRNA_anno_df)
    print("\n")

    print('-------- add hico annotation (based on aggregated infos + mapping to viral/bacterial genomes + intersection with TEs) --------')
    sRNA_anno_per_seq_df = add_hico_annotation(sRNA_anno_per_seq_df, five_prime_adapter)
    print("\n")

    print('-------- save to file --------')
    # set sequence as index again
    sRNA_anno_per_seq_df.set_index('sequence', inplace=True)
    sRNA_anno_per_seq_df.to_csv(aggreg_sRNA_anno_file)
    print("\n")

    print('-------- generate subclass_to_annotation dict --------')
    result_df = sRNA_anno_per_seq_df[['subclass_name', 'small_RNA_class_annotation']].copy()
    result_df.reset_index(drop=True, inplace=True)
    result_df.drop_duplicates(inplace=True)
    # keep unambiguous names only; na=True treats a missing name like an ambiguous one,
    # so NaN entries are left out instead of breaking the boolean negation
    result_df = result_df[~result_df["subclass_name"].str.contains(";", na=True)]
    subclass_to_annotation = dict(zip(result_df["subclass_name"],result_df["small_RNA_class_annotation"]))
    with open('subclass_to_annotation.json', 'w') as fp:
        json.dump(subclass_to_annotation, fp)

    print('-------- delete tmp files --------')
    # delete via the standard library instead of spawning a shell (`rm *tmp_*`);
    # glob is imported locally because only this cleanup step needs it
    import glob
    for tmp_path in glob.glob('*tmp_*'):
        # like plain `rm`, leave directories untouched
        if not (os.path.isfile(tmp_path) or os.path.islink(tmp_path)):
            continue
        try:
            os.remove(tmp_path)
        except OSError as err:
            # cleanup stays best-effort: report the problem and carry on
            print('Could not delete tmp file:', tmp_path, err)
#%%
|