diff dante.py @ 10:d0431a839606 draft

Uploaded
author petr-novak
date Wed, 14 Aug 2019 11:24:15 -0400
parents a38efa4937d7
children a6c55d1bdb6c
line wrap: on
line diff
--- a/dante.py	Wed Jul 03 09:21:52 2019 -0400
+++ b/dante.py	Wed Aug 14 11:24:15 2019 -0400
@@ -8,6 +8,7 @@
 from collections import Counter
 from itertools import groupby
 import os
+import re
 import configuration
 from tempfile import NamedTemporaryFile
 import sys
@@ -17,7 +18,6 @@
 
 np.set_printoptions(threshold=sys.maxsize)
 
-
 def alignment_scoring():
     ''' Create hash table for alignment similarity counting: for every 
 	combination of aminoacids in alignment assign score from protein 
@@ -234,9 +234,9 @@
 def score_table(mins, maxs, data, annotations, scores, CLASSIFICATION):
     ''' Score table is created based on the annotations occurance in the cluster.
 	Matrix axis y corresponds to individual annotations (indexed according to classes_dict),
-	axis x represents positions of analyzed seq in a given cluster.
-	For every hit within cluster, array of scores on the corresponding position 
-	is recorded to the table in case if the score on certain position is so far the highest 
+    axis x represents positions of analyzed seq in a given cluster.
+    For every hit within cluster, array of scores on the corresponding position
+    is recorded to the table in case if the score on certain position is so far the highest
 	for the certain position and certain annotation '''
     classes_dict = annotations_dict(annotations)
     score_matrix = np.zeros((len(classes_dict), maxs - mins + 1), dtype=int)
@@ -300,7 +300,7 @@
 def create_gff3(domain_type, ann_substring, unique_annotations, ann_pos_counts,
                 dom_start, dom_end, step, best_idx, annotation_best,
                 db_name_best, db_starts_best, db_ends_best, strand, score,
-                seq_id, db_seq, query_seq, domain_size, positions, gff):
+                seq_id, db_seq, query_seq, domain_size, positions, gff, consensus):
     ''' Record obtained information about domain corresponding to individual cluster to common gff file '''
     best_start = positions[best_idx][0]
     best_end = positions[best_idx][1]
@@ -341,12 +341,12 @@
                 unique_annotations))
     else:
         gff.write(
-            "{}\t{}\t{}\t{}\t{}\t{}\t{}\t{}\tName={};Final_Classification={};Region_Hits_Classifications={};Best_Hit={}:{}-{}[{}percent];Best_Hit_DB_Pos={}:{}of{};DB_Seq={};Query_Seq={};Identity={};Similarity={};Relat_Length={};Relat_Interruptions={};Hit_to_DB_Length={}\n".format(
+            "{}\t{}\t{}\t{}\t{}\t{}\t{}\t{}\tName={};Final_Classification={};Region_Hits_Classifications={};Best_Hit={}:{}-{}[{}percent];Best_Hit_DB_Pos={}:{}of{};DB_Seq={};Region_Seq={};Query_Seq={};Identity={};Similarity={};Relat_Length={};Relat_Interruptions={};Hit_to_DB_Length={}\n".format(
                 seq_id, SOURCE, configuration.DOMAINS_FEATURE, dom_start,
                 dom_end, best_score, strand, configuration.PHASE, domain_type,
                 ann_substring, unique_annotations, annotation_best, best_start,
                 best_end, length_proportion, db_starts_best, db_ends_best,
-                domain_size_best, db_seq_best, query_seq_best, percent_ident,
+                domain_size_best, db_seq_best, consensus, query_seq_best, percent_ident,
                 align_similarity, relat_align_len, relat_interrupt,
                 db_len_proportion))
 
@@ -451,10 +451,10 @@
 
 
 def domain_search(QUERY, LAST_DB, CLASSIFICATION, OUTPUT_DOMAIN,
-                  THRESHOLD_SCORE, WIN_DOM, OVERLAP_DOM):
+                  THRESHOLD_SCORE, WIN_DOM, OVERLAP_DOM, SCORING_MATRIX):
     ''' Search for protein domains using our protein database and external tool LAST,
 	stdout is parsed in real time and hits for a single sequence undergo further processing
-	- tabular format(TAB) to get info about position, score, orientation 
+	- tabular format(TAB) to get info about position, score, orientation
 	- MAF format to gain alignment and original sequence
 	'''
 
@@ -465,15 +465,28 @@
                              below_win, lens_above_win, seq_starts, seq_ends)
 
     ## TAB output contains all the alignment scores, positions, strands...
+    lastal_columns = {
+        "BL80" : ("score, name_db, start_db, al_size_db, strand_db,"
+                   " seq_size_db, name_q, start_q, al_size_q, strand_q, seq_size_q,"
+                   " block1, block2, block3, db_seq, q_seq"),
+        "BL62" : ("score, name_db, start_db, al_size_db, strand_db,"
+                   " seq_size_db, name_q, start_q, al_size_q, strand_q,"
+                   " seq_size_q, block1, block2, block3, db_seq, q_seq"),
+        "MIQS" : ("score, name_db, start_db, al_size_db, strand_db,"
+                  " seq_size_db, name_q, start_q, al_size_q, strand_q,"
+                  " seq_size_q, block1, db_seq, q_seq"),
+    }
     tab = subprocess.Popen(
-        "lastal -F15 {} {} -L 10 -m 70 -p BL80 -f TAB".format(LAST_DB,
-                                                              query_temp),
+        "lastal -F15 {} {} -L 10 -m 70 -p {} -e 80 -f TAB".format(LAST_DB,
+                                                                  query_temp,
+                                                                  SCORING_MATRIX),
         stdout=subprocess.PIPE,
         shell=True)
     ## MAF output contains alignment sequences
     maf = subprocess.Popen(
-        "lastal -F15 {} {} -L 10 -m 70 -p BL80 -f MAF".format(LAST_DB,
-                                                              query_temp),
+        "lastal -F15 {} {} -L 10 -m 70 -p {}  -e 80 -f MAF".format(LAST_DB,
+                                                                   query_temp,
+                                                                   SCORING_MATRIX),
         stdout=subprocess.PIPE,
         shell=True)
 
@@ -495,10 +508,10 @@
                 warnings.simplefilter("ignore")
                 sequence_hits = np.genfromtxt(
                     line_generator(tab_pipe, maf_pipe, start),
-                    names=
-                    "score, name_db, start_db, al_size_db, strand_db, seq_size_db, name_q, start_q, al_size_q, strand_q, seq_size_q, block1, block2, block3, db_seq, q_seq",
-                    usecols=
-                    "score, name_q, start_q, al_size_q, strand_q, seq_size_q, name_db, db_seq, q_seq, seq_size_db, start_db, al_size_db",
+                    names=lastal_columns[SCORING_MATRIX],
+                    usecols=("score, name_q, start_q, al_size_q,"
+                             " strand_q, seq_size_q, name_db, db_seq,"
+                             " q_seq, seq_size_db, start_db, al_size_db"),
                     dtype=None,
                     comments=None)
         except RuntimeError:
@@ -544,6 +557,19 @@
             db_starts = db_start[np.array(region)]
             db_ends = db_end[np.array(region)]
             scores = score[np.array(region)]
+            regions_above_threshold = [
+                region[i]
+                for i, _ in enumerate(region)
+                if max(scores) / 100 * THRESHOLD_SCORE < scores[i]
+            ]
+            ## sort by score first:
+            consensus = get_full_translation(
+                translation_alignments(
+                    query_seq=sortby(query_seq[regions_above_threshold], score[regions_above_threshold], True),
+                    start_hit=sortby(start_hit[regions_above_threshold], score[regions_above_threshold], True),
+                    end_hit=sortby(end_hit[regions_above_threshold], score[regions_above_threshold], True))
+                )
+
             annotations = domain_annotation(db_names, CLASSIFICATION)
             [score_matrix, classes_dict] = score_table(
                 mins[count_region], maxs[count_region], data[count_region],
@@ -560,10 +586,11 @@
             if count_region == len(indices_plus):
                 strand_gff = "-"
             create_gff3(domain_type, ann_substring, unique_annotations,
-                        ann_pos_counts, mins[count_region], maxs[count_region],
+                        ann_pos_counts, min(start_hit[regions_above_threshold])-1,
+                        max(end_hit[regions_above_threshold]),
                         step, best_idx, annotation_best, db_name_best,
                         db_starts_best, db_ends_best, strand_gff, score,
-                        seq_id, db_seq, query_seq, domain_size, positions, gff)
+                        seq_id, db_seq, query_seq, domain_size, positions, gff, consensus)
             count_region += 1
         seq_ids.append(seq_id)
     os.unlink(query_temp)
@@ -577,6 +604,104 @@
         shutil.copy2(dom_tmp.name, OUTPUT_DOMAIN)
     os.unlink(dom_tmp.name)
 
+def sortby(a, by, reverse=False):
+    ''' sort according to values in the by list '''
+    a_sorted = [i[0] for i in
+                sorted(
+                    zip(a, by),
+                    key=lambda i: i[1],
+                    reverse=reverse
+                )]
+    return a_sorted
+
+
+def a2nnn(s):
+    s1 = "".join([c if c in ['/', '\\'] else c + c + c
+                  for c in s.replace("-", "")])
+    # collapse frameshifts (/)
+    s2 = re.sub("[a-zA-Z*]{2}//[a-zA-Z*]{2}", "//", s1)
+    s3 = re.sub("[a-zA-Z*]/[a-zA-Z*]", "/", s2)
+    return (s3)
+
+
+
+def rle(s):
+    '''run length encoding but max is set to 3 (codon)'''
+    prev = ""
+    count = 1
+    char = []
+    length = []
+    L = 0
+    for n in s:
+        if n == prev and count < (3 - L):
+            count += 1
+        else:
+            char.append(prev)
+            length.append(count)
+            L = 1 if prev == "/" else 0
+            prev = n
+            count = 1
+    char.append(prev)
+    length.append(count)
+    sequence = char[1:]
+    return sequence, length[1:]
+
+def get_full_translation(translations):
+    '''get one full-length translation from multiple partial
+    aligned translation '''
+    # find minimal set of alignments
+    minimal_set = []
+    not_filled_prev = len(translations[0])
+    for s in translations:
+        minimal_set.append(s)
+        # check defined position - is there only '-' character?
+        not_filled = sum([set(i) == {"-"} for i in zip(*minimal_set)])
+        if not_filled == 0:
+            break
+        if not_filled == not_filled_prev:
+            # last added sequence did not improve coverage - remove it.
+            minimal_set.pop()
+        not_filled_prev = not_filled
+    # merge translations
+    final_translation = minimal_set[0]
+    # record positions of joins to correct frameshift reporting
+    position_of_joins = set()
+    position_of_joins_rle = set()
+    if len(minimal_set) > 1:  # translation must be merged
+        for s in minimal_set[1:]:
+            s1 = re.search(r"-\w", final_translation)
+            s2 = re.search(r"\w-", final_translation)
+            if s1:
+                position_of_joins.add(s1.start())
+            if s2:
+                position_of_joins.add((s2.end() - 1))
+            final_translation = "".join(
+                [b if a == "-" else a for a, b in zip(final_translation, s)])
+    translation_rle = rle(final_translation)
+    cumsumed_positions = np.cumsum(translation_rle[1])
+    for p in position_of_joins:
+        position_of_joins_rle.add(sum(cumsumed_positions <= p))
+    # insert /\ when necessary
+    for p in position_of_joins_rle:
+        if translation_rle[0][p] not in ['/',"//","\\", "\\\\"]:
+            if translation_rle[1][p] == 2:
+                translation_rle[0][p] = translation_rle[0][p] + "/"
+            if translation_rle[1][p] == 1:
+                translation_rle[0][p] = "\\"
+    consensus = "".join(translation_rle[0])
+    return consensus
+
+
+# pad per-hit translations with '-' so all strings span the same query region
+def translation_alignments(query_seq, start_hit, end_hit):
+    pstart = min(start_hit)
+    pend = max(end_hit)
+    nnn = list()
+    for s, start, end in zip(query_seq, start_hit, end_hit):
+        nnn.append("-" * (start - pstart) + a2nnn(s) + "-" * (pend - end))
+    return (nnn)
+
+
 
 def adjust_gff(OUTPUT_DOMAIN, gff, WIN_DOM, OVERLAP_DOM, step):
     ''' Original gff file is adjusted in case of containing cut parts 
@@ -679,6 +804,8 @@
     THRESHOLD_SCORE = args.threshold_score
     WIN_DOM = args.win_dom
     OVERLAP_DOM = args.overlap_dom
+    SCORING_MATRIX = args.scoring_matrix
+    configuration.SC_MATRIX = configuration.SC_MATRIX_SKELETON.format(SCORING_MATRIX)
 
     if OUTPUT_DOMAIN is None:
         OUTPUT_DOMAIN = configuration.DOMAINS_GFF
@@ -702,7 +829,7 @@
         OUTPUT_DOMAIN = os.path.join(OUTPUT_DIR,
                                      os.path.basename(OUTPUT_DOMAIN))
     domain_search(QUERY, LAST_DB, CLASSIFICATION, OUTPUT_DOMAIN,
-                  THRESHOLD_SCORE, WIN_DOM, OVERLAP_DOM)
+                  THRESHOLD_SCORE, WIN_DOM, OVERLAP_DOM, SCORING_MATRIX)
 
     print("ELAPSED_TIME_DOMAINS = {} s".format(time.time() - t))
 
@@ -772,6 +899,13 @@
         type=str,
         help="specify if you want to change the output directory")
     parser.add_argument(
+        "-M",
+        "--scoring_matrix",
+        type=str,
+        default="BL80",
+        choices=['BL80', 'BL62', 'MIQS'],
+        help="specify scoring matrix to use for similarity search (BL80, BL62, MIQS)")
+    parser.add_argument(
         "-thsc",
         "--threshold_score",
         type=int,