comparison dante.py @ 10:d0431a839606 draft

Uploaded
author petr-novak
date Wed, 14 Aug 2019 11:24:15 -0400
parents a38efa4937d7
children a6c55d1bdb6c
comparison
equal deleted inserted replaced
9:ed4d9ede9cb4 10:d0431a839606
6 import time 6 import time
7 from operator import itemgetter 7 from operator import itemgetter
8 from collections import Counter 8 from collections import Counter
9 from itertools import groupby 9 from itertools import groupby
10 import os 10 import os
11 import re
11 import configuration 12 import configuration
12 from tempfile import NamedTemporaryFile 13 from tempfile import NamedTemporaryFile
13 import sys 14 import sys
14 import warnings 15 import warnings
15 import shutil 16 import shutil
16 from collections import defaultdict 17 from collections import defaultdict
17 18
18 np.set_printoptions(threshold=sys.maxsize) 19 np.set_printoptions(threshold=sys.maxsize)
19
20 20
21 def alignment_scoring(): 21 def alignment_scoring():
22 ''' Create hash table for alignment similarity counting: for every 22 ''' Create hash table for alignment similarity counting: for every
23 combination of aminoacids in alignment assign score from protein 23 combination of aminoacids in alignment assign score from protein
24 scoring matrix defined in configuration file ''' 24 scoring matrix defined in configuration file '''
232 232
233 233
234 def score_table(mins, maxs, data, annotations, scores, CLASSIFICATION): 234 def score_table(mins, maxs, data, annotations, scores, CLASSIFICATION):
235 ''' Score table is created based on the annotations occurance in the cluster. 235 ''' Score table is created based on the annotations occurance in the cluster.
236 Matrix axis y corresponds to individual annotations (indexed according to classes_dict), 236 Matrix axis y corresponds to individual annotations (indexed according to classes_dict),
237 axis x represents positions of analyzed seq in a given cluster. 237 axis x represents positions of analyzed seq in a given cluster.
238 For every hit within cluster, array of scores on the corresponding position 238 For every hit within cluster, array of scores on the corresponding position
239 is recorded to the table in case if the score on certain position is so far the highest 239 is recorded to the table in case if the score on certain position is so far the highest
240 for the certain position and certain annotation ''' 240 for the certain position and certain annotation '''
241 classes_dict = annotations_dict(annotations) 241 classes_dict = annotations_dict(annotations)
242 score_matrix = np.zeros((len(classes_dict), maxs - mins + 1), dtype=int) 242 score_matrix = np.zeros((len(classes_dict), maxs - mins + 1), dtype=int)
243 count = 0 243 count = 0
244 for item in annotations: 244 for item in annotations:
298 298
299 299
300 def create_gff3(domain_type, ann_substring, unique_annotations, ann_pos_counts, 300 def create_gff3(domain_type, ann_substring, unique_annotations, ann_pos_counts,
301 dom_start, dom_end, step, best_idx, annotation_best, 301 dom_start, dom_end, step, best_idx, annotation_best,
302 db_name_best, db_starts_best, db_ends_best, strand, score, 302 db_name_best, db_starts_best, db_ends_best, strand, score,
303 seq_id, db_seq, query_seq, domain_size, positions, gff): 303 seq_id, db_seq, query_seq, domain_size, positions, gff, consensus):
304 ''' Record obtained information about domain corresponding to individual cluster to common gff file ''' 304 ''' Record obtained information about domain corresponding to individual cluster to common gff file '''
305 best_start = positions[best_idx][0] 305 best_start = positions[best_idx][0]
306 best_end = positions[best_idx][1] 306 best_end = positions[best_idx][1]
307 best_score = score[best_idx] 307 best_score = score[best_idx]
308 ## proportion of length of the best hit to the whole region length found by base 308 ## proportion of length of the best hit to the whole region length found by base
339 seq_id, SOURCE, configuration.DOMAINS_FEATURE, dom_start, 339 seq_id, SOURCE, configuration.DOMAINS_FEATURE, dom_start,
340 dom_end, strand, configuration.PHASE, domain_type, 340 dom_end, strand, configuration.PHASE, domain_type,
341 unique_annotations)) 341 unique_annotations))
342 else: 342 else:
343 gff.write( 343 gff.write(
344 "{}\t{}\t{}\t{}\t{}\t{}\t{}\t{}\tName={};Final_Classification={};Region_Hits_Classifications={};Best_Hit={}:{}-{}[{}percent];Best_Hit_DB_Pos={}:{}of{};DB_Seq={};Query_Seq={};Identity={};Similarity={};Relat_Length={};Relat_Interruptions={};Hit_to_DB_Length={}\n".format( 344 "{}\t{}\t{}\t{}\t{}\t{}\t{}\t{}\tName={};Final_Classification={};Region_Hits_Classifications={};Best_Hit={}:{}-{}[{}percent];Best_Hit_DB_Pos={}:{}of{};DB_Seq={};Region_Seq={};Query_Seq={};Identity={};Similarity={};Relat_Length={};Relat_Interruptions={};Hit_to_DB_Length={}\n".format(
345 seq_id, SOURCE, configuration.DOMAINS_FEATURE, dom_start, 345 seq_id, SOURCE, configuration.DOMAINS_FEATURE, dom_start,
346 dom_end, best_score, strand, configuration.PHASE, domain_type, 346 dom_end, best_score, strand, configuration.PHASE, domain_type,
347 ann_substring, unique_annotations, annotation_best, best_start, 347 ann_substring, unique_annotations, annotation_best, best_start,
348 best_end, length_proportion, db_starts_best, db_ends_best, 348 best_end, length_proportion, db_starts_best, db_ends_best,
349 domain_size_best, db_seq_best, query_seq_best, percent_ident, 349 domain_size_best, db_seq_best, consensus, query_seq_best, percent_ident,
350 align_similarity, relat_align_len, relat_interrupt, 350 align_similarity, relat_align_len, relat_interrupt,
351 db_len_proportion)) 351 db_len_proportion))
352 352
353 353
354 def filter_params(db, query, protein_len): 354 def filter_params(db, query, protein_len):
449 dom_gff_tmp.write("{}\n".format(configuration.HEADER_GFF)) 449 dom_gff_tmp.write("{}\n".format(configuration.HEADER_GFF))
450 dom_gff_tmp.write(version_string) 450 dom_gff_tmp.write(version_string)
451 451
452 452
453 def domain_search(QUERY, LAST_DB, CLASSIFICATION, OUTPUT_DOMAIN, 453 def domain_search(QUERY, LAST_DB, CLASSIFICATION, OUTPUT_DOMAIN,
454 THRESHOLD_SCORE, WIN_DOM, OVERLAP_DOM): 454 THRESHOLD_SCORE, WIN_DOM, OVERLAP_DOM, SCORING_MATRIX):
455 ''' Search for protein domains using our protein database and external tool LAST, 455 ''' Search for protein domains using our protein database and external tool LAST,
456 stdout is parsed in real time and hits for a single sequence undergo further processing 456 stdout is parsed in real time and hits for a single sequence undergo further processing
457 - tabular format(TAB) to get info about position, score, orientation 457 - tabular format(TAB) to get info about position, score, orientation
458 - MAF format to gain alignment and original sequence 458 - MAF format to gain alignment and original sequence
459 ''' 459 '''
460 460
461 step = WIN_DOM - OVERLAP_DOM 461 step = WIN_DOM - OVERLAP_DOM
462 [headers, above_win, below_win, lens_above_win, seq_starts, seq_ends 462 [headers, above_win, below_win, lens_above_win, seq_starts, seq_ends
463 ] = characterize_fasta(QUERY, WIN_DOM) 463 ] = characterize_fasta(QUERY, WIN_DOM)
464 query_temp = split_fasta(QUERY, WIN_DOM, step, headers, above_win, 464 query_temp = split_fasta(QUERY, WIN_DOM, step, headers, above_win,
465 below_win, lens_above_win, seq_starts, seq_ends) 465 below_win, lens_above_win, seq_starts, seq_ends)
466 466
467 ## TAB output contains all the alignment scores, positions, strands... 467 ## TAB output contains all the alignment scores, positions, strands...
468 lastal_columns = {
469 "BL80" : ("score, name_db, start_db, al_size_db, strand_db,"
470 " seq_size_db, name_q, start_q, al_size_q, strand_q, seq_size_q,"
471 " block1, block2, block3, db_seq, q_seq"),
472 "BL62" : ("score, name_db, start_db, al_size_db, strand_db,"
473 " seq_size_db, name_q, start_q, al_size_q, strand_q,"
474 " seq_size_q, block1, block2, block3, db_seq, q_seq"),
475 "MIQS" : ("score, name_db, start_db, al_size_db, strand_db,"
476 " seq_size_db, name_q, start_q, al_size_q, strand_q,"
477 " seq_size_q, block1, db_seq, q_seq"),
478 }
468 tab = subprocess.Popen( 479 tab = subprocess.Popen(
469 "lastal -F15 {} {} -L 10 -m 70 -p BL80 -f TAB".format(LAST_DB, 480 "lastal -F15 {} {} -L 10 -m 70 -p {} -e 80 -f TAB".format(LAST_DB,
470 query_temp), 481 query_temp,
482 SCORING_MATRIX),
471 stdout=subprocess.PIPE, 483 stdout=subprocess.PIPE,
472 shell=True) 484 shell=True)
473 ## MAF output contains alignment sequences 485 ## MAF output contains alignment sequences
474 maf = subprocess.Popen( 486 maf = subprocess.Popen(
475 "lastal -F15 {} {} -L 10 -m 70 -p BL80 -f MAF".format(LAST_DB, 487 "lastal -F15 {} {} -L 10 -m 70 -p {} -e 80 -f MAF".format(LAST_DB,
476 query_temp), 488 query_temp,
489 SCORING_MATRIX),
477 stdout=subprocess.PIPE, 490 stdout=subprocess.PIPE,
478 shell=True) 491 shell=True)
479 492
480 tab_pipe = tab.stdout 493 tab_pipe = tab.stdout
481 maf_pipe = maf.stdout 494 maf_pipe = maf.stdout
493 try: 506 try:
494 with warnings.catch_warnings(): 507 with warnings.catch_warnings():
495 warnings.simplefilter("ignore") 508 warnings.simplefilter("ignore")
496 sequence_hits = np.genfromtxt( 509 sequence_hits = np.genfromtxt(
497 line_generator(tab_pipe, maf_pipe, start), 510 line_generator(tab_pipe, maf_pipe, start),
498 names= 511 names=lastal_columns[SCORING_MATRIX],
499 "score, name_db, start_db, al_size_db, strand_db, seq_size_db, name_q, start_q, al_size_q, strand_q, seq_size_q, block1, block2, block3, db_seq, q_seq", 512 usecols=("score, name_q, start_q, al_size_q,"
500 usecols= 513 " strand_q, seq_size_q, name_db, db_seq,"
501 "score, name_q, start_q, al_size_q, strand_q, seq_size_q, name_db, db_seq, q_seq, seq_size_db, start_db, al_size_db", 514 " q_seq, seq_size_db, start_db, al_size_db"),
502 dtype=None, 515 dtype=None,
503 comments=None) 516 comments=None)
504 except RuntimeError: 517 except RuntimeError:
505 break 518 break
506 ## if there are no domains found 519 ## if there are no domains found
542 for region in indices_overal: 555 for region in indices_overal:
543 db_names = domain_db[np.array(region)] 556 db_names = domain_db[np.array(region)]
544 db_starts = db_start[np.array(region)] 557 db_starts = db_start[np.array(region)]
545 db_ends = db_end[np.array(region)] 558 db_ends = db_end[np.array(region)]
546 scores = score[np.array(region)] 559 scores = score[np.array(region)]
560 regions_above_threshold = [
561 region[i]
562 for i, _ in enumerate(region)
563 if max(scores) / 100 * THRESHOLD_SCORE < scores[i]
564 ]
565 ## sort by score first:
566 consensus = get_full_translation(
567 translation_alignments(
568 query_seq=sortby(query_seq[regions_above_threshold], score[regions_above_threshold], True),
569 start_hit=sortby(start_hit[regions_above_threshold], score[regions_above_threshold], True),
570 end_hit=sortby(end_hit[regions_above_threshold], score[regions_above_threshold], True))
571 )
572
547 annotations = domain_annotation(db_names, CLASSIFICATION) 573 annotations = domain_annotation(db_names, CLASSIFICATION)
548 [score_matrix, classes_dict] = score_table( 574 [score_matrix, classes_dict] = score_table(
549 mins[count_region], maxs[count_region], data[count_region], 575 mins[count_region], maxs[count_region], data[count_region],
550 annotations, scores, CLASSIFICATION) 576 annotations, scores, CLASSIFICATION)
551 ann_per_reg = score_matrix_evaluation(score_matrix, classes_dict, 577 ann_per_reg = score_matrix_evaluation(score_matrix, classes_dict,
558 db_starts_best = db_starts[best_idx_reg] 584 db_starts_best = db_starts[best_idx_reg]
559 db_ends_best = db_ends[best_idx_reg] 585 db_ends_best = db_ends[best_idx_reg]
560 if count_region == len(indices_plus): 586 if count_region == len(indices_plus):
561 strand_gff = "-" 587 strand_gff = "-"
562 create_gff3(domain_type, ann_substring, unique_annotations, 588 create_gff3(domain_type, ann_substring, unique_annotations,
563 ann_pos_counts, mins[count_region], maxs[count_region], 589 ann_pos_counts, min(start_hit[regions_above_threshold])-1,
590 max(end_hit[regions_above_threshold]),
564 step, best_idx, annotation_best, db_name_best, 591 step, best_idx, annotation_best, db_name_best,
565 db_starts_best, db_ends_best, strand_gff, score, 592 db_starts_best, db_ends_best, strand_gff, score,
566 seq_id, db_seq, query_seq, domain_size, positions, gff) 593 seq_id, db_seq, query_seq, domain_size, positions, gff, consensus)
567 count_region += 1 594 count_region += 1
568 seq_ids.append(seq_id) 595 seq_ids.append(seq_id)
569 os.unlink(query_temp) 596 os.unlink(query_temp)
570 gff.close() 597 gff.close()
571 dom_tmp.close() 598 dom_tmp.close()
574 adjust_gff(OUTPUT_DOMAIN, dom_tmp.name, WIN_DOM, OVERLAP_DOM, step) 601 adjust_gff(OUTPUT_DOMAIN, dom_tmp.name, WIN_DOM, OVERLAP_DOM, step)
575 ## otherwise use the temporary output as the final domains gff 602 ## otherwise use the temporary output as the final domains gff
576 else: 603 else:
577 shutil.copy2(dom_tmp.name, OUTPUT_DOMAIN) 604 shutil.copy2(dom_tmp.name, OUTPUT_DOMAIN)
578 os.unlink(dom_tmp.name) 605 os.unlink(dom_tmp.name)
606
607 def sortby(a, by, reverse=False):
608 ''' sort according values in the by list '''
609 a_sorted = [i[0] for i in
610 sorted(
611 zip(a, by),
612 key=lambda i: i[1],
613 reverse=reverse
614 )]
615 return a_sorted
616
617
618 def a2nnn(s):
619 s1 = "".join([c if c in ['/', '\\'] else c + c + c
620 for c in s.replace("-", "")])
621 # collapse frameshifts (/)
622 s2 = re.sub("[a-zA-Z*]{2}//[a-zA-Z*]{2}", "//", s1)
623 s3 = re.sub("[a-zA-Z*]/[a-zA-Z*]", "/", s2)
624 return (s3)
625
626
627
628 def rle(s):
629 '''run length encoding but max is set to 3 (codon)'''
630 prev = ""
631 count = 1
632 char = []
633 length = []
634 L = 0
635 for n in s:
636 if n == prev and count < (3 - L):
637 count += 1
638 else:
639 char.append(prev)
640 length.append(count)
641 L = 1 if prev == "/" else 0
642 prev = n
643 count = 1
644 char.append(prev)
645 length.append(count)
646 sequence = char[1:]
647 return sequence, length[1:]
648
649 def get_full_translation(translations):
650 '''get one full length translation from multiple partial
651 aligned translation '''
652 # find minimal set of alignements
653 minimal_set = []
654 not_filled_prev = len(translations[0])
655 for s in translations:
656 minimal_set.append(s)
657 # check defined position - is there only '-' character?
658 not_filled = sum([set(i) == {"-"} for i in zip(*minimal_set)])
659 if not_filled == 0:
660 break
661 if not_filled == not_filled_prev:
662 # last added sequence did not improve coverage - remove it.
663 minimal_set.pop()
664 not_filled_prev = not_filled
665 # merge translations
666 final_translation = minimal_set[0]
667 # record positions of joins to correct frameshifts reportings
668 position_of_joins = set()
669 position_of_joins_rle = set()
670 if len(minimal_set) > 1: # translation must be merged
671 for s in minimal_set[1:]:
672 s1 = re.search(r"-\w", final_translation)
673 s2 = re.search(r"\w-", final_translation)
674 if s1:
675 position_of_joins.add(s1.start())
676 if s2:
677 position_of_joins.add((s2.end() - 1))
678 final_translation = "".join(
679 [b if a == "-" else a for a, b in zip(final_translation, s)])
680 translation_rle = rle(final_translation)
681 cumsumed_positions = np.cumsum(translation_rle[1])
682 for p in position_of_joins:
683 position_of_joins_rle.add(sum(cumsumed_positions <= p))
684 # insert /\ when necessary
685 for p in position_of_joins_rle:
686 if translation_rle[0][p] not in ['/',"//","\\", "\\\\"]:
687 if translation_rle[1][p] == 2:
688 translation_rle[0][p] = translation_rle[0][p] + "/"
689 if translation_rle[1][p] == 1:
690 translation_rle[0][p] = "\\"
691 consensus = "".join(translation_rle[0])
692 return consensus
693
694
695 # helper function for debugging
696 def translation_alignments(query_seq, start_hit, end_hit):
697 pstart = min(start_hit)
698 pend = max(end_hit)
699 nnn = list()
700 for s, start, end in zip(query_seq, start_hit, end_hit):
701 nnn.append("-" * (start - pstart) + a2nnn(s) + "-" * (pend - end))
702 return (nnn)
703
579 704
580 705
581 def adjust_gff(OUTPUT_DOMAIN, gff, WIN_DOM, OVERLAP_DOM, step): 706 def adjust_gff(OUTPUT_DOMAIN, gff, WIN_DOM, OVERLAP_DOM, step):
582 ''' Original gff file is adjusted in case of containing cut parts 707 ''' Original gff file is adjusted in case of containing cut parts
583 - for consecutive sequences overlap is divided to half with first half 708 - for consecutive sequences overlap is divided to half with first half
677 NEW_LDB = args.new_ldb 802 NEW_LDB = args.new_ldb
678 OUTPUT_DIR = args.output_dir 803 OUTPUT_DIR = args.output_dir
679 THRESHOLD_SCORE = args.threshold_score 804 THRESHOLD_SCORE = args.threshold_score
680 WIN_DOM = args.win_dom 805 WIN_DOM = args.win_dom
681 OVERLAP_DOM = args.overlap_dom 806 OVERLAP_DOM = args.overlap_dom
807 SCORING_MATRIX = args.scoring_matrix
808 configuration.SC_MATRIX = configuration.SC_MATRIX_SKELETON.format(SCORING_MATRIX)
682 809
683 if OUTPUT_DOMAIN is None: 810 if OUTPUT_DOMAIN is None:
684 OUTPUT_DOMAIN = configuration.DOMAINS_GFF 811 OUTPUT_DOMAIN = configuration.DOMAINS_GFF
685 if os.path.isdir(LAST_DB): 812 if os.path.isdir(LAST_DB):
686 LAST_DB = os.path.join(LAST_DB, configuration.LAST_DB_FILE) 813 LAST_DB = os.path.join(LAST_DB, configuration.LAST_DB_FILE)
700 if not os.path.exists(OUTPUT_DIR): 827 if not os.path.exists(OUTPUT_DIR):
701 os.makedirs(OUTPUT_DIR) 828 os.makedirs(OUTPUT_DIR)
702 OUTPUT_DOMAIN = os.path.join(OUTPUT_DIR, 829 OUTPUT_DOMAIN = os.path.join(OUTPUT_DIR,
703 os.path.basename(OUTPUT_DOMAIN)) 830 os.path.basename(OUTPUT_DOMAIN))
704 domain_search(QUERY, LAST_DB, CLASSIFICATION, OUTPUT_DOMAIN, 831 domain_search(QUERY, LAST_DB, CLASSIFICATION, OUTPUT_DOMAIN,
705 THRESHOLD_SCORE, WIN_DOM, OVERLAP_DOM) 832 THRESHOLD_SCORE, WIN_DOM, OVERLAP_DOM, SCORING_MATRIX)
706 833
707 print("ELAPSED_TIME_DOMAINS = {} s".format(time.time() - t)) 834 print("ELAPSED_TIME_DOMAINS = {} s".format(time.time() - t))
708 835
709 836
710 if __name__ == "__main__": 837 if __name__ == "__main__":
770 "-dir", 897 "-dir",
771 "--output_dir", 898 "--output_dir",
772 type=str, 899 type=str,
773 help="specify if you want to change the output directory") 900 help="specify if you want to change the output directory")
774 parser.add_argument( 901 parser.add_argument(
902 "-M",
903 "--scoring_matrix",
904 type=str,
905 default="BL80",
906 choices=['BL80', 'BL62', 'MIQS'],
907 help="specify scoring matrix to use for similarity search (BL80, BL62, MIQS)")
908 parser.add_argument(
775 "-thsc", 909 "-thsc",
776 "--threshold_score", 910 "--threshold_score",
777 type=int, 911 type=int,
778 default=80, 912 default=80,
779 help= 913 help=