Mercurial > repos > petr-novak > dante
comparison dante.py @ 10:d0431a839606 draft
Uploaded
author | petr-novak |
---|---|
date | Wed, 14 Aug 2019 11:24:15 -0400 |
parents | a38efa4937d7 |
children | a6c55d1bdb6c |
comparison
equal
deleted
inserted
replaced
9:ed4d9ede9cb4 | 10:d0431a839606 |
---|---|
6 import time | 6 import time |
7 from operator import itemgetter | 7 from operator import itemgetter |
8 from collections import Counter | 8 from collections import Counter |
9 from itertools import groupby | 9 from itertools import groupby |
10 import os | 10 import os |
11 import re | |
11 import configuration | 12 import configuration |
12 from tempfile import NamedTemporaryFile | 13 from tempfile import NamedTemporaryFile |
13 import sys | 14 import sys |
14 import warnings | 15 import warnings |
15 import shutil | 16 import shutil |
16 from collections import defaultdict | 17 from collections import defaultdict |
17 | 18 |
18 np.set_printoptions(threshold=sys.maxsize) | 19 np.set_printoptions(threshold=sys.maxsize) |
19 | |
20 | 20 |
21 def alignment_scoring(): | 21 def alignment_scoring(): |
22 ''' Create hash table for alignment similarity counting: for every | 22 ''' Create hash table for alignment similarity counting: for every |
23 combination of aminoacids in alignment assign score from protein | 23 combination of aminoacids in alignment assign score from protein |
24 scoring matrix defined in configuration file ''' | 24 scoring matrix defined in configuration file ''' |
232 | 232 |
233 | 233 |
234 def score_table(mins, maxs, data, annotations, scores, CLASSIFICATION): | 234 def score_table(mins, maxs, data, annotations, scores, CLASSIFICATION): |
235 ''' Score table is created based on the annotations occurance in the cluster. | 235 ''' Score table is created based on the annotations occurance in the cluster. |
236 Matrix axis y corresponds to individual annotations (indexed according to classes_dict), | 236 Matrix axis y corresponds to individual annotations (indexed according to classes_dict), |
237 axis x represents positions of analyzed seq in a given cluster. | 237 axis x represents positions of analyzed seq in a given cluster. |
238 For every hit within cluster, array of scores on the corresponding position | 238 For every hit within cluster, array of scores on the corresponding position |
239 is recorded to the table in case if the score on certain position is so far the highest | 239 is recorded to the table in case if the score on certain position is so far the highest |
240 for the certain position and certain annotation ''' | 240 for the certain position and certain annotation ''' |
241 classes_dict = annotations_dict(annotations) | 241 classes_dict = annotations_dict(annotations) |
242 score_matrix = np.zeros((len(classes_dict), maxs - mins + 1), dtype=int) | 242 score_matrix = np.zeros((len(classes_dict), maxs - mins + 1), dtype=int) |
243 count = 0 | 243 count = 0 |
244 for item in annotations: | 244 for item in annotations: |
298 | 298 |
299 | 299 |
300 def create_gff3(domain_type, ann_substring, unique_annotations, ann_pos_counts, | 300 def create_gff3(domain_type, ann_substring, unique_annotations, ann_pos_counts, |
301 dom_start, dom_end, step, best_idx, annotation_best, | 301 dom_start, dom_end, step, best_idx, annotation_best, |
302 db_name_best, db_starts_best, db_ends_best, strand, score, | 302 db_name_best, db_starts_best, db_ends_best, strand, score, |
303 seq_id, db_seq, query_seq, domain_size, positions, gff): | 303 seq_id, db_seq, query_seq, domain_size, positions, gff, consensus): |
304 ''' Record obtained information about domain corresponding to individual cluster to common gff file ''' | 304 ''' Record obtained information about domain corresponding to individual cluster to common gff file ''' |
305 best_start = positions[best_idx][0] | 305 best_start = positions[best_idx][0] |
306 best_end = positions[best_idx][1] | 306 best_end = positions[best_idx][1] |
307 best_score = score[best_idx] | 307 best_score = score[best_idx] |
308 ## proportion of length of the best hit to the whole region length found by base | 308 ## proportion of length of the best hit to the whole region length found by base |
339 seq_id, SOURCE, configuration.DOMAINS_FEATURE, dom_start, | 339 seq_id, SOURCE, configuration.DOMAINS_FEATURE, dom_start, |
340 dom_end, strand, configuration.PHASE, domain_type, | 340 dom_end, strand, configuration.PHASE, domain_type, |
341 unique_annotations)) | 341 unique_annotations)) |
342 else: | 342 else: |
343 gff.write( | 343 gff.write( |
344 "{}\t{}\t{}\t{}\t{}\t{}\t{}\t{}\tName={};Final_Classification={};Region_Hits_Classifications={};Best_Hit={}:{}-{}[{}percent];Best_Hit_DB_Pos={}:{}of{};DB_Seq={};Query_Seq={};Identity={};Similarity={};Relat_Length={};Relat_Interruptions={};Hit_to_DB_Length={}\n".format( | 344 "{}\t{}\t{}\t{}\t{}\t{}\t{}\t{}\tName={};Final_Classification={};Region_Hits_Classifications={};Best_Hit={}:{}-{}[{}percent];Best_Hit_DB_Pos={}:{}of{};DB_Seq={};Region_Seq={};Query_Seq={};Identity={};Similarity={};Relat_Length={};Relat_Interruptions={};Hit_to_DB_Length={}\n".format( |
345 seq_id, SOURCE, configuration.DOMAINS_FEATURE, dom_start, | 345 seq_id, SOURCE, configuration.DOMAINS_FEATURE, dom_start, |
346 dom_end, best_score, strand, configuration.PHASE, domain_type, | 346 dom_end, best_score, strand, configuration.PHASE, domain_type, |
347 ann_substring, unique_annotations, annotation_best, best_start, | 347 ann_substring, unique_annotations, annotation_best, best_start, |
348 best_end, length_proportion, db_starts_best, db_ends_best, | 348 best_end, length_proportion, db_starts_best, db_ends_best, |
349 domain_size_best, db_seq_best, query_seq_best, percent_ident, | 349 domain_size_best, db_seq_best, consensus, query_seq_best, percent_ident, |
350 align_similarity, relat_align_len, relat_interrupt, | 350 align_similarity, relat_align_len, relat_interrupt, |
351 db_len_proportion)) | 351 db_len_proportion)) |
352 | 352 |
353 | 353 |
354 def filter_params(db, query, protein_len): | 354 def filter_params(db, query, protein_len): |
449 dom_gff_tmp.write("{}\n".format(configuration.HEADER_GFF)) | 449 dom_gff_tmp.write("{}\n".format(configuration.HEADER_GFF)) |
450 dom_gff_tmp.write(version_string) | 450 dom_gff_tmp.write(version_string) |
451 | 451 |
452 | 452 |
453 def domain_search(QUERY, LAST_DB, CLASSIFICATION, OUTPUT_DOMAIN, | 453 def domain_search(QUERY, LAST_DB, CLASSIFICATION, OUTPUT_DOMAIN, |
454 THRESHOLD_SCORE, WIN_DOM, OVERLAP_DOM): | 454 THRESHOLD_SCORE, WIN_DOM, OVERLAP_DOM, SCORING_MATRIX): |
455 ''' Search for protein domains using our protein database and external tool LAST, | 455 ''' Search for protein domains using our protein database and external tool LAST, |
456 stdout is parsed in real time and hits for a single sequence undergo further processing | 456 stdout is parsed in real time and hits for a single sequence undergo further processing |
457 - tabular format(TAB) to get info about position, score, orientation | 457 - tabular format(TAB) to get info about position, score, orientation |
458 - MAF format to gain alignment and original sequence | 458 - MAF format to gain alignment and original sequence |
459 ''' | 459 ''' |
460 | 460 |
461 step = WIN_DOM - OVERLAP_DOM | 461 step = WIN_DOM - OVERLAP_DOM |
462 [headers, above_win, below_win, lens_above_win, seq_starts, seq_ends | 462 [headers, above_win, below_win, lens_above_win, seq_starts, seq_ends |
463 ] = characterize_fasta(QUERY, WIN_DOM) | 463 ] = characterize_fasta(QUERY, WIN_DOM) |
464 query_temp = split_fasta(QUERY, WIN_DOM, step, headers, above_win, | 464 query_temp = split_fasta(QUERY, WIN_DOM, step, headers, above_win, |
465 below_win, lens_above_win, seq_starts, seq_ends) | 465 below_win, lens_above_win, seq_starts, seq_ends) |
466 | 466 |
467 ## TAB output contains all the alignment scores, positions, strands... | 467 ## TAB output contains all the alignment scores, positions, strands... |
468 lastal_columns = { | |
469 "BL80" : ("score, name_db, start_db, al_size_db, strand_db," | |
470 " seq_size_db, name_q, start_q, al_size_q, strand_q, seq_size_q," | |
471 " block1, block2, block3, db_seq, q_seq"), | |
472 "BL62" : ("score, name_db, start_db, al_size_db, strand_db," | |
473 " seq_size_db, name_q, start_q, al_size_q, strand_q," | |
474 " seq_size_q, block1, block2, block3, db_seq, q_seq"), | |
475 "MIQS" : ("score, name_db, start_db, al_size_db, strand_db," | |
476 " seq_size_db, name_q, start_q, al_size_q, strand_q," | |
477 " seq_size_q, block1, db_seq, q_seq"), | |
478 } | |
468 tab = subprocess.Popen( | 479 tab = subprocess.Popen( |
469 "lastal -F15 {} {} -L 10 -m 70 -p BL80 -f TAB".format(LAST_DB, | 480 "lastal -F15 {} {} -L 10 -m 70 -p {} -e 80 -f TAB".format(LAST_DB, |
470 query_temp), | 481 query_temp, |
482 SCORING_MATRIX), | |
471 stdout=subprocess.PIPE, | 483 stdout=subprocess.PIPE, |
472 shell=True) | 484 shell=True) |
473 ## MAF output contains alignment sequences | 485 ## MAF output contains alignment sequences |
474 maf = subprocess.Popen( | 486 maf = subprocess.Popen( |
475 "lastal -F15 {} {} -L 10 -m 70 -p BL80 -f MAF".format(LAST_DB, | 487 "lastal -F15 {} {} -L 10 -m 70 -p {} -e 80 -f MAF".format(LAST_DB, |
476 query_temp), | 488 query_temp, |
489 SCORING_MATRIX), | |
477 stdout=subprocess.PIPE, | 490 stdout=subprocess.PIPE, |
478 shell=True) | 491 shell=True) |
479 | 492 |
480 tab_pipe = tab.stdout | 493 tab_pipe = tab.stdout |
481 maf_pipe = maf.stdout | 494 maf_pipe = maf.stdout |
493 try: | 506 try: |
494 with warnings.catch_warnings(): | 507 with warnings.catch_warnings(): |
495 warnings.simplefilter("ignore") | 508 warnings.simplefilter("ignore") |
496 sequence_hits = np.genfromtxt( | 509 sequence_hits = np.genfromtxt( |
497 line_generator(tab_pipe, maf_pipe, start), | 510 line_generator(tab_pipe, maf_pipe, start), |
498 names= | 511 names=lastal_columns[SCORING_MATRIX], |
499 "score, name_db, start_db, al_size_db, strand_db, seq_size_db, name_q, start_q, al_size_q, strand_q, seq_size_q, block1, block2, block3, db_seq, q_seq", | 512 usecols=("score, name_q, start_q, al_size_q," |
500 usecols= | 513 " strand_q, seq_size_q, name_db, db_seq," |
501 "score, name_q, start_q, al_size_q, strand_q, seq_size_q, name_db, db_seq, q_seq, seq_size_db, start_db, al_size_db", | 514 " q_seq, seq_size_db, start_db, al_size_db"), |
502 dtype=None, | 515 dtype=None, |
503 comments=None) | 516 comments=None) |
504 except RuntimeError: | 517 except RuntimeError: |
505 break | 518 break |
506 ## if there are no domains found | 519 ## if there are no domains found |
542 for region in indices_overal: | 555 for region in indices_overal: |
543 db_names = domain_db[np.array(region)] | 556 db_names = domain_db[np.array(region)] |
544 db_starts = db_start[np.array(region)] | 557 db_starts = db_start[np.array(region)] |
545 db_ends = db_end[np.array(region)] | 558 db_ends = db_end[np.array(region)] |
546 scores = score[np.array(region)] | 559 scores = score[np.array(region)] |
560 regions_above_threshold = [ | |
561 region[i] | |
562 for i, _ in enumerate(region) | |
563 if max(scores) / 100 * THRESHOLD_SCORE < scores[i] | |
564 ] | |
565 ## sort by score first: | |
566 consensus = get_full_translation( | |
567 translation_alignments( | |
568 query_seq=sortby(query_seq[regions_above_threshold], score[regions_above_threshold], True), | |
569 start_hit=sortby(start_hit[regions_above_threshold], score[regions_above_threshold], True), | |
570 end_hit=sortby(end_hit[regions_above_threshold], score[regions_above_threshold], True)) | |
571 ) | |
572 | |
547 annotations = domain_annotation(db_names, CLASSIFICATION) | 573 annotations = domain_annotation(db_names, CLASSIFICATION) |
548 [score_matrix, classes_dict] = score_table( | 574 [score_matrix, classes_dict] = score_table( |
549 mins[count_region], maxs[count_region], data[count_region], | 575 mins[count_region], maxs[count_region], data[count_region], |
550 annotations, scores, CLASSIFICATION) | 576 annotations, scores, CLASSIFICATION) |
551 ann_per_reg = score_matrix_evaluation(score_matrix, classes_dict, | 577 ann_per_reg = score_matrix_evaluation(score_matrix, classes_dict, |
558 db_starts_best = db_starts[best_idx_reg] | 584 db_starts_best = db_starts[best_idx_reg] |
559 db_ends_best = db_ends[best_idx_reg] | 585 db_ends_best = db_ends[best_idx_reg] |
560 if count_region == len(indices_plus): | 586 if count_region == len(indices_plus): |
561 strand_gff = "-" | 587 strand_gff = "-" |
562 create_gff3(domain_type, ann_substring, unique_annotations, | 588 create_gff3(domain_type, ann_substring, unique_annotations, |
563 ann_pos_counts, mins[count_region], maxs[count_region], | 589 ann_pos_counts, min(start_hit[regions_above_threshold])-1, |
590 max(end_hit[regions_above_threshold]), | |
564 step, best_idx, annotation_best, db_name_best, | 591 step, best_idx, annotation_best, db_name_best, |
565 db_starts_best, db_ends_best, strand_gff, score, | 592 db_starts_best, db_ends_best, strand_gff, score, |
566 seq_id, db_seq, query_seq, domain_size, positions, gff) | 593 seq_id, db_seq, query_seq, domain_size, positions, gff, consensus) |
567 count_region += 1 | 594 count_region += 1 |
568 seq_ids.append(seq_id) | 595 seq_ids.append(seq_id) |
569 os.unlink(query_temp) | 596 os.unlink(query_temp) |
570 gff.close() | 597 gff.close() |
571 dom_tmp.close() | 598 dom_tmp.close() |
574 adjust_gff(OUTPUT_DOMAIN, dom_tmp.name, WIN_DOM, OVERLAP_DOM, step) | 601 adjust_gff(OUTPUT_DOMAIN, dom_tmp.name, WIN_DOM, OVERLAP_DOM, step) |
575 ## otherwise use the temporary output as the final domains gff | 602 ## otherwise use the temporary output as the final domains gff |
576 else: | 603 else: |
577 shutil.copy2(dom_tmp.name, OUTPUT_DOMAIN) | 604 shutil.copy2(dom_tmp.name, OUTPUT_DOMAIN) |
578 os.unlink(dom_tmp.name) | 605 os.unlink(dom_tmp.name) |
606 | |
607 def sortby(a, by, reverse=False): | |
608 ''' sort according values in the by list ''' | |
609 a_sorted = [i[0] for i in | |
610 sorted( | |
611 zip(a, by), | |
612 key=lambda i: i[1], | |
613 reverse=reverse | |
614 )] | |
615 return a_sorted | |
616 | |
617 | |
618 def a2nnn(s): | |
619 s1 = "".join([c if c in ['/', '\\'] else c + c + c | |
620 for c in s.replace("-", "")]) | |
621 # collapse frameshifts (/) | |
622 s2 = re.sub("[a-zA-Z*]{2}//[a-zA-Z*]{2}", "//", s1) | |
623 s3 = re.sub("[a-zA-Z*]/[a-zA-Z*]", "/", s2) | |
624 return (s3) | |
625 | |
626 | |
627 | |
628 def rle(s): | |
629 '''run length encoding but max is set to 3 (codon)''' | |
630 prev = "" | |
631 count = 1 | |
632 char = [] | |
633 length = [] | |
634 L = 0 | |
635 for n in s: | |
636 if n == prev and count < (3 - L): | |
637 count += 1 | |
638 else: | |
639 char.append(prev) | |
640 length.append(count) | |
641 L = 1 if prev == "/" else 0 | |
642 prev = n | |
643 count = 1 | |
644 char.append(prev) | |
645 length.append(count) | |
646 sequence = char[1:] | |
647 return sequence, length[1:] | |
648 | |
649 def get_full_translation(translations): | |
650 '''get one full length translation from multiple partial | |
651 aligned translation ''' | |
652 # find minimal set of alignements | |
653 minimal_set = [] | |
654 not_filled_prev = len(translations[0]) | |
655 for s in translations: | |
656 minimal_set.append(s) | |
657 # check defined position - is there only '-' character? | |
658 not_filled = sum([set(i) == {"-"} for i in zip(*minimal_set)]) | |
659 if not_filled == 0: | |
660 break | |
661 if not_filled == not_filled_prev: | |
662 # last added sequence did not improve coverage - remove it. | |
663 minimal_set.pop() | |
664 not_filled_prev = not_filled | |
665 # merge translations | |
666 final_translation = minimal_set[0] | |
667 # record positions of joins to correct frameshifts reportings | |
668 position_of_joins = set() | |
669 position_of_joins_rle = set() | |
670 if len(minimal_set) > 1: # translation must be merged | |
671 for s in minimal_set[1:]: | |
672 s1 = re.search(r"-\w", final_translation) | |
673 s2 = re.search(r"\w-", final_translation) | |
674 if s1: | |
675 position_of_joins.add(s1.start()) | |
676 if s2: | |
677 position_of_joins.add((s2.end() - 1)) | |
678 final_translation = "".join( | |
679 [b if a == "-" else a for a, b in zip(final_translation, s)]) | |
680 translation_rle = rle(final_translation) | |
681 cumsumed_positions = np.cumsum(translation_rle[1]) | |
682 for p in position_of_joins: | |
683 position_of_joins_rle.add(sum(cumsumed_positions <= p)) | |
684 # insert /\ when necessary | |
685 for p in position_of_joins_rle: | |
686 if translation_rle[0][p] not in ['/',"//","\\", "\\\\"]: | |
687 if translation_rle[1][p] == 2: | |
688 translation_rle[0][p] = translation_rle[0][p] + "/" | |
689 if translation_rle[1][p] == 1: | |
690 translation_rle[0][p] = "\\" | |
691 consensus = "".join(translation_rle[0]) | |
692 return consensus | |
693 | |
694 | |
695 # helper function for debugging | |
696 def translation_alignments(query_seq, start_hit, end_hit): | |
697 pstart = min(start_hit) | |
698 pend = max(end_hit) | |
699 nnn = list() | |
700 for s, start, end in zip(query_seq, start_hit, end_hit): | |
701 nnn.append("-" * (start - pstart) + a2nnn(s) + "-" * (pend - end)) | |
702 return (nnn) | |
703 | |
579 | 704 |
580 | 705 |
581 def adjust_gff(OUTPUT_DOMAIN, gff, WIN_DOM, OVERLAP_DOM, step): | 706 def adjust_gff(OUTPUT_DOMAIN, gff, WIN_DOM, OVERLAP_DOM, step): |
582 ''' Original gff file is adjusted in case of containing cut parts | 707 ''' Original gff file is adjusted in case of containing cut parts |
583 - for consecutive sequences overlap is divided to half with first half | 708 - for consecutive sequences overlap is divided to half with first half |
677 NEW_LDB = args.new_ldb | 802 NEW_LDB = args.new_ldb |
678 OUTPUT_DIR = args.output_dir | 803 OUTPUT_DIR = args.output_dir |
679 THRESHOLD_SCORE = args.threshold_score | 804 THRESHOLD_SCORE = args.threshold_score |
680 WIN_DOM = args.win_dom | 805 WIN_DOM = args.win_dom |
681 OVERLAP_DOM = args.overlap_dom | 806 OVERLAP_DOM = args.overlap_dom |
807 SCORING_MATRIX = args.scoring_matrix | |
808 configuration.SC_MATRIX = configuration.SC_MATRIX_SKELETON.format(SCORING_MATRIX) | |
682 | 809 |
683 if OUTPUT_DOMAIN is None: | 810 if OUTPUT_DOMAIN is None: |
684 OUTPUT_DOMAIN = configuration.DOMAINS_GFF | 811 OUTPUT_DOMAIN = configuration.DOMAINS_GFF |
685 if os.path.isdir(LAST_DB): | 812 if os.path.isdir(LAST_DB): |
686 LAST_DB = os.path.join(LAST_DB, configuration.LAST_DB_FILE) | 813 LAST_DB = os.path.join(LAST_DB, configuration.LAST_DB_FILE) |
700 if not os.path.exists(OUTPUT_DIR): | 827 if not os.path.exists(OUTPUT_DIR): |
701 os.makedirs(OUTPUT_DIR) | 828 os.makedirs(OUTPUT_DIR) |
702 OUTPUT_DOMAIN = os.path.join(OUTPUT_DIR, | 829 OUTPUT_DOMAIN = os.path.join(OUTPUT_DIR, |
703 os.path.basename(OUTPUT_DOMAIN)) | 830 os.path.basename(OUTPUT_DOMAIN)) |
704 domain_search(QUERY, LAST_DB, CLASSIFICATION, OUTPUT_DOMAIN, | 831 domain_search(QUERY, LAST_DB, CLASSIFICATION, OUTPUT_DOMAIN, |
705 THRESHOLD_SCORE, WIN_DOM, OVERLAP_DOM) | 832 THRESHOLD_SCORE, WIN_DOM, OVERLAP_DOM, SCORING_MATRIX) |
706 | 833 |
707 print("ELAPSED_TIME_DOMAINS = {} s".format(time.time() - t)) | 834 print("ELAPSED_TIME_DOMAINS = {} s".format(time.time() - t)) |
708 | 835 |
709 | 836 |
710 if __name__ == "__main__": | 837 if __name__ == "__main__": |
770 "-dir", | 897 "-dir", |
771 "--output_dir", | 898 "--output_dir", |
772 type=str, | 899 type=str, |
773 help="specify if you want to change the output directory") | 900 help="specify if you want to change the output directory") |
774 parser.add_argument( | 901 parser.add_argument( |
902 "-M", | |
903 "--scoring_matrix", | |
904 type=str, | |
905 default="BL80", | |
906 choices=['BL80', 'BL62', 'MIQS'], | |
907 help="specify scoring matrix to use for similarity search (BL80, BL62, MIQS)") | |
908 parser.add_argument( | |
775 "-thsc", | 909 "-thsc", |
776 "--threshold_score", | 910 "--threshold_score", |
777 type=int, | 911 type=int, |
778 default=80, | 912 default=80, |
779 help= | 913 help= |