Mercurial > repos > petr-novak > dante
diff dante.py @ 10:d0431a839606 draft
Uploaded
| field | value |
|---|---|
| author | petr-novak |
| date | Wed, 14 Aug 2019 11:24:15 -0400 |
| parents | a38efa4937d7 |
| children | a6c55d1bdb6c |
line wrap: on
line diff
--- a/dante.py Wed Jul 03 09:21:52 2019 -0400 +++ b/dante.py Wed Aug 14 11:24:15 2019 -0400 @@ -8,6 +8,7 @@ from collections import Counter from itertools import groupby import os +import re import configuration from tempfile import NamedTemporaryFile import sys @@ -17,7 +18,6 @@ np.set_printoptions(threshold=sys.maxsize) - def alignment_scoring(): ''' Create hash table for alignment similarity counting: for every combination of aminoacids in alignment assign score from protein @@ -234,9 +234,9 @@ def score_table(mins, maxs, data, annotations, scores, CLASSIFICATION): ''' Score table is created based on the annotations occurance in the cluster. Matrix axis y corresponds to individual annotations (indexed according to classes_dict), - axis x represents positions of analyzed seq in a given cluster. - For every hit within cluster, array of scores on the corresponding position - is recorded to the table in case if the score on certain position is so far the highest + axis x represents positions of analyzed seq in a given cluster. 
+ For every hit within cluster, array of scores on the corresponding position + is recorded to the table in case if the score on certain position is so far the highest for the certain position and certain annotation ''' classes_dict = annotations_dict(annotations) score_matrix = np.zeros((len(classes_dict), maxs - mins + 1), dtype=int) @@ -300,7 +300,7 @@ def create_gff3(domain_type, ann_substring, unique_annotations, ann_pos_counts, dom_start, dom_end, step, best_idx, annotation_best, db_name_best, db_starts_best, db_ends_best, strand, score, - seq_id, db_seq, query_seq, domain_size, positions, gff): + seq_id, db_seq, query_seq, domain_size, positions, gff, consensus): ''' Record obtained information about domain corresponding to individual cluster to common gff file ''' best_start = positions[best_idx][0] best_end = positions[best_idx][1] @@ -341,12 +341,12 @@ unique_annotations)) else: gff.write( - "{}\t{}\t{}\t{}\t{}\t{}\t{}\t{}\tName={};Final_Classification={};Region_Hits_Classifications={};Best_Hit={}:{}-{}[{}percent];Best_Hit_DB_Pos={}:{}of{};DB_Seq={};Query_Seq={};Identity={};Similarity={};Relat_Length={};Relat_Interruptions={};Hit_to_DB_Length={}\n".format( + "{}\t{}\t{}\t{}\t{}\t{}\t{}\t{}\tName={};Final_Classification={};Region_Hits_Classifications={};Best_Hit={}:{}-{}[{}percent];Best_Hit_DB_Pos={}:{}of{};DB_Seq={};Region_Seq={};Query_Seq={};Identity={};Similarity={};Relat_Length={};Relat_Interruptions={};Hit_to_DB_Length={}\n".format( seq_id, SOURCE, configuration.DOMAINS_FEATURE, dom_start, dom_end, best_score, strand, configuration.PHASE, domain_type, ann_substring, unique_annotations, annotation_best, best_start, best_end, length_proportion, db_starts_best, db_ends_best, - domain_size_best, db_seq_best, query_seq_best, percent_ident, + domain_size_best, db_seq_best, consensus, query_seq_best, percent_ident, align_similarity, relat_align_len, relat_interrupt, db_len_proportion)) @@ -451,10 +451,10 @@ def domain_search(QUERY, LAST_DB, CLASSIFICATION, 
OUTPUT_DOMAIN, - THRESHOLD_SCORE, WIN_DOM, OVERLAP_DOM): + THRESHOLD_SCORE, WIN_DOM, OVERLAP_DOM, SCORING_MATRIX): ''' Search for protein domains using our protein database and external tool LAST, stdout is parsed in real time and hits for a single sequence undergo further processing - - tabular format(TAB) to get info about position, score, orientation + - tabular format(TAB) to get info about position, score, orientation - MAF format to gain alignment and original sequence ''' @@ -465,15 +465,28 @@ below_win, lens_above_win, seq_starts, seq_ends) ## TAB output contains all the alignment scores, positions, strands... + lastal_columns = { + "BL80" : ("score, name_db, start_db, al_size_db, strand_db," + " seq_size_db, name_q, start_q, al_size_q, strand_q, seq_size_q," + " block1, block2, block3, db_seq, q_seq"), + "BL62" : ("score, name_db, start_db, al_size_db, strand_db," + " seq_size_db, name_q, start_q, al_size_q, strand_q," + " seq_size_q, block1, block2, block3, db_seq, q_seq"), + "MIQS" : ("score, name_db, start_db, al_size_db, strand_db," + " seq_size_db, name_q, start_q, al_size_q, strand_q," + " seq_size_q, block1, db_seq, q_seq"), + } tab = subprocess.Popen( - "lastal -F15 {} {} -L 10 -m 70 -p BL80 -f TAB".format(LAST_DB, - query_temp), + "lastal -F15 {} {} -L 10 -m 70 -p {} -e 80 -f TAB".format(LAST_DB, + query_temp, + SCORING_MATRIX), stdout=subprocess.PIPE, shell=True) ## MAF output contains alignment sequences maf = subprocess.Popen( - "lastal -F15 {} {} -L 10 -m 70 -p BL80 -f MAF".format(LAST_DB, - query_temp), + "lastal -F15 {} {} -L 10 -m 70 -p {} -e 80 -f MAF".format(LAST_DB, + query_temp, + SCORING_MATRIX), stdout=subprocess.PIPE, shell=True) @@ -495,10 +508,10 @@ warnings.simplefilter("ignore") sequence_hits = np.genfromtxt( line_generator(tab_pipe, maf_pipe, start), - names= - "score, name_db, start_db, al_size_db, strand_db, seq_size_db, name_q, start_q, al_size_q, strand_q, seq_size_q, block1, block2, block3, db_seq, q_seq", - usecols= - 
"score, name_q, start_q, al_size_q, strand_q, seq_size_q, name_db, db_seq, q_seq, seq_size_db, start_db, al_size_db", + names=lastal_columns[SCORING_MATRIX], + usecols=("score, name_q, start_q, al_size_q," + " strand_q, seq_size_q, name_db, db_seq," + " q_seq, seq_size_db, start_db, al_size_db"), dtype=None, comments=None) except RuntimeError: @@ -544,6 +557,19 @@ db_starts = db_start[np.array(region)] db_ends = db_end[np.array(region)] scores = score[np.array(region)] + regions_above_threshold = [ + region[i] + for i, _ in enumerate(region) + if max(scores) / 100 * THRESHOLD_SCORE < scores[i] + ] + ## sort by score first: + consensus = get_full_translation( + translation_alignments( + query_seq=sortby(query_seq[regions_above_threshold], score[regions_above_threshold], True), + start_hit=sortby(start_hit[regions_above_threshold], score[regions_above_threshold], True), + end_hit=sortby(end_hit[regions_above_threshold], score[regions_above_threshold], True)) + ) + annotations = domain_annotation(db_names, CLASSIFICATION) [score_matrix, classes_dict] = score_table( mins[count_region], maxs[count_region], data[count_region], @@ -560,10 +586,11 @@ if count_region == len(indices_plus): strand_gff = "-" create_gff3(domain_type, ann_substring, unique_annotations, - ann_pos_counts, mins[count_region], maxs[count_region], + ann_pos_counts, min(start_hit[regions_above_threshold])-1, + max(end_hit[regions_above_threshold]), step, best_idx, annotation_best, db_name_best, db_starts_best, db_ends_best, strand_gff, score, - seq_id, db_seq, query_seq, domain_size, positions, gff) + seq_id, db_seq, query_seq, domain_size, positions, gff, consensus) count_region += 1 seq_ids.append(seq_id) os.unlink(query_temp) @@ -577,6 +604,104 @@ shutil.copy2(dom_tmp.name, OUTPUT_DOMAIN) os.unlink(dom_tmp.name) +def sortby(a, by, reverse=False): + ''' sort according values in the by list ''' + a_sorted = [i[0] for i in + sorted( + zip(a, by), + key=lambda i: i[1], + reverse=reverse + )] + return 
a_sorted + + +def a2nnn(s): + s1 = "".join([c if c in ['/', '\\'] else c + c + c + for c in s.replace("-", "")]) + # collapse frameshifts (/) + s2 = re.sub("[a-zA-Z*]{2}//[a-zA-Z*]{2}", "//", s1) + s3 = re.sub("[a-zA-Z*]/[a-zA-Z*]", "/", s2) + return (s3) + + + +def rle(s): + '''run length encoding but max is set to 3 (codon)''' + prev = "" + count = 1 + char = [] + length = [] + L = 0 + for n in s: + if n == prev and count < (3 - L): + count += 1 + else: + char.append(prev) + length.append(count) + L = 1 if prev == "/" else 0 + prev = n + count = 1 + char.append(prev) + length.append(count) + sequence = char[1:] + return sequence, length[1:] + +def get_full_translation(translations): + '''get one full length translation from multiple partial + aligned translation ''' + # find minimal set of alignements + minimal_set = [] + not_filled_prev = len(translations[0]) + for s in translations: + minimal_set.append(s) + # check defined position - is there only '-' character? + not_filled = sum([set(i) == {"-"} for i in zip(*minimal_set)]) + if not_filled == 0: + break + if not_filled == not_filled_prev: + # last added sequence did not improve coverage - remove it. 
+ minimal_set.pop() + not_filled_prev = not_filled + # merge translations + final_translation = minimal_set[0] + # record positions of joins to correct frameshifts reportings + position_of_joins = set() + position_of_joins_rle = set() + if len(minimal_set) > 1: # translation must be merged + for s in minimal_set[1:]: + s1 = re.search(r"-\w", final_translation) + s2 = re.search(r"\w-", final_translation) + if s1: + position_of_joins.add(s1.start()) + if s2: + position_of_joins.add((s2.end() - 1)) + final_translation = "".join( + [b if a == "-" else a for a, b in zip(final_translation, s)]) + translation_rle = rle(final_translation) + cumsumed_positions = np.cumsum(translation_rle[1]) + for p in position_of_joins: + position_of_joins_rle.add(sum(cumsumed_positions <= p)) + # insert /\ when necessary + for p in position_of_joins_rle: + if translation_rle[0][p] not in ['/',"//","\\", "\\\\"]: + if translation_rle[1][p] == 2: + translation_rle[0][p] = translation_rle[0][p] + "/" + if translation_rle[1][p] == 1: + translation_rle[0][p] = "\\" + consensus = "".join(translation_rle[0]) + return consensus + + +# helper function for debugging +def translation_alignments(query_seq, start_hit, end_hit): + pstart = min(start_hit) + pend = max(end_hit) + nnn = list() + for s, start, end in zip(query_seq, start_hit, end_hit): + nnn.append("-" * (start - pstart) + a2nnn(s) + "-" * (pend - end)) + return (nnn) + + def adjust_gff(OUTPUT_DOMAIN, gff, WIN_DOM, OVERLAP_DOM, step): ''' Original gff file is adjusted in case of containing cut parts @@ -679,6 +804,8 @@ THRESHOLD_SCORE = args.threshold_score WIN_DOM = args.win_dom OVERLAP_DOM = args.overlap_dom + SCORING_MATRIX = args.scoring_matrix + configuration.SC_MATRIX = configuration.SC_MATRIX_SKELETON.format(SCORING_MATRIX) if OUTPUT_DOMAIN is None: OUTPUT_DOMAIN = configuration.DOMAINS_GFF @@ -702,7 +829,7 @@ OUTPUT_DOMAIN = os.path.join(OUTPUT_DIR, os.path.basename(OUTPUT_DOMAIN)) domain_search(QUERY, LAST_DB, CLASSIFICATION, 
OUTPUT_DOMAIN, - THRESHOLD_SCORE, WIN_DOM, OVERLAP_DOM) + THRESHOLD_SCORE, WIN_DOM, OVERLAP_DOM, SCORING_MATRIX) print("ELAPSED_TIME_DOMAINS = {} s".format(time.time() - t)) @@ -772,6 +899,13 @@ type=str, help="specify if you want to change the output directory") parser.add_argument( + "-M", + "--scoring_matrix", + type=str, + default="BL80", + choices=['BL80', 'BL62', 'MIQS'], + help="specify scoring matrix to use for similarity search (BL80, BL62, MIQS)") + parser.add_argument( "-thsc", "--threshold_score", type=int,