dante: dante.py comparison

comparison dante.py @ 10:d0431a839606 draft

Uploaded

author	petr-novak
date	Wed, 14 Aug 2019 11:24:15 -0400
parents	a38efa4937d7
children	a6c55d1bdb6c

comparison

equal deleted inserted replaced

-:ed4d9ede9cb4
+:d0431a839606
 import time
 from operator import itemgetter
 from collections import Counter
 from itertools import groupby
 import os
+import re
 import configuration
 from tempfile import NamedTemporaryFile
 import sys
 import warnings
 import shutil
 from collections import defaultdict
 np.set_printoptions(threshold=sys.maxsize)
 def alignment_scoring():
 ''' Create hash table for alignment similarity counting: for every
 	combination of aminoacids in alignment assign score from protein
 	scoring matrix defined in configuration file  '''
 def score_table(mins, maxs, data, annotations, scores, CLASSIFICATION):
 ''' Score table is created based on the annotations occurance in the cluster.
 	Matrix axis y corresponds to individual annotations (indexed according to classes_dict),
-	axis x represents positions of analyzed seq in a given cluster.
+axis x represents positions of analyzed seq in a given cluster.
-	For every hit within cluster, array of scores on the corresponding position
+For every hit within cluster, array of scores on the corresponding position
-	is recorded to the table in case if the score on certain position is so far the highest
+is recorded to the table in case if the score on certain position is so far the highest
 	for the certain position and certain annotation '''
 classes_dict = annotations_dict(annotations)
 score_matrix = np.zeros((len(classes_dict), maxs - mins + 1), dtype=int)
 count = 0
 for item in annotations:
 def create_gff3(domain_type, ann_substring, unique_annotations, ann_pos_counts,
 dom_start, dom_end, step, best_idx, annotation_best,
 db_name_best, db_starts_best, db_ends_best, strand, score,
-seq_id, db_seq, query_seq, domain_size, positions, gff):
+seq_id, db_seq, query_seq, domain_size, positions, gff, consensus):
 ''' Record obtained information about domain corresponding to individual cluster to common gff file '''
 best_start = positions[best_idx][0]
 best_end = positions[best_idx][1]
 best_score = score[best_idx]
 ## proportion of length of the best hit to the whole region length found by base
 seq_id, SOURCE, configuration.DOMAINS_FEATURE, dom_start,
 dom_end, strand, configuration.PHASE, domain_type,
 unique_annotations))
 else:
 gff.write(
-"{}\t{}\t{}\t{}\t{}\t{}\t{}\t{}\tName={};Final_Classification={};Region_Hits_Classifications={};Best_Hit={}:{}-{}[{}percent];Best_Hit_DB_Pos={}:{}of{};DB_Seq={};Query_Seq={};Identity={};Similarity={};Relat_Length={};Relat_Interruptions={};Hit_to_DB_Length={}\n".format(
+"{}\t{}\t{}\t{}\t{}\t{}\t{}\t{}\tName={};Final_Classification={};Region_Hits_Classifications={};Best_Hit={}:{}-{}[{}percent];Best_Hit_DB_Pos={}:{}of{};DB_Seq={};Region_Seq={};Query_Seq={};Identity={};Similarity={};Relat_Length={};Relat_Interruptions={};Hit_to_DB_Length={}\n".format(
 seq_id, SOURCE, configuration.DOMAINS_FEATURE, dom_start,
 dom_end, best_score, strand, configuration.PHASE, domain_type,
 ann_substring, unique_annotations, annotation_best, best_start,
 best_end, length_proportion, db_starts_best, db_ends_best,
-domain_size_best, db_seq_best, query_seq_best, percent_ident,
+domain_size_best, db_seq_best, consensus, query_seq_best, percent_ident,
 align_similarity, relat_align_len, relat_interrupt,
 db_len_proportion))
 def filter_params(db, query, protein_len):
 dom_gff_tmp.write("{}\n".format(configuration.HEADER_GFF))
 dom_gff_tmp.write(version_string)
 def domain_search(QUERY, LAST_DB, CLASSIFICATION, OUTPUT_DOMAIN,
-THRESHOLD_SCORE, WIN_DOM, OVERLAP_DOM):
+THRESHOLD_SCORE, WIN_DOM, OVERLAP_DOM, SCORING_MATRIX):
 ''' Search for protein domains using our protein database and external tool LAST,
 	stdout is parsed in real time and hits for a single sequence undergo further processing
 	- tabular format(TAB) to get info about position, score, orientation
 	- MAF format to gain alignment and original sequence
 	'''
 step = WIN_DOM - OVERLAP_DOM
 [headers, above_win, below_win, lens_above_win, seq_starts, seq_ends
 ] = characterize_fasta(QUERY, WIN_DOM)
 query_temp = split_fasta(QUERY, WIN_DOM, step, headers, above_win,
 below_win, lens_above_win, seq_starts, seq_ends)
 ## TAB output contains all the alignment scores, positions, strands...
+lastal_columns = {
+"BL80" : ("score, name_db, start_db, al_size_db, strand_db,"
+" seq_size_db, name_q, start_q, al_size_q, strand_q, seq_size_q,"
+" block1, block2, block3, db_seq, q_seq"),
+"BL62" : ("score, name_db, start_db, al_size_db, strand_db,"
+" seq_size_db, name_q, start_q, al_size_q, strand_q,"
+" seq_size_q, block1, block2, block3, db_seq, q_seq"),
+"MIQS" : ("score, name_db, start_db, al_size_db, strand_db,"
+" seq_size_db, name_q, start_q, al_size_q, strand_q,"
+" seq_size_q, block1, db_seq, q_seq"),
+}
 tab = subprocess.Popen(
-"lastal -F15 {} {} -L 10 -m 70 -p BL80 -f TAB".format(LAST_DB,
+"lastal -F15 {} {} -L 10 -m 70 -p {} -e 80 -f TAB".format(LAST_DB,
-query_temp),
+query_temp,
+SCORING_MATRIX),
 stdout=subprocess.PIPE,
 shell=True)
 ## MAF output contains alignment sequences
 maf = subprocess.Popen(
-"lastal -F15 {} {} -L 10 -m 70 -p BL80 -f MAF".format(LAST_DB,
+"lastal -F15 {} {} -L 10 -m 70 -p {}  -e 80 -f MAF".format(LAST_DB,
-query_temp),
+query_temp,
+SCORING_MATRIX),
 stdout=subprocess.PIPE,
 shell=True)
 tab_pipe = tab.stdout
 maf_pipe = maf.stdout
 try:
 with warnings.catch_warnings():
 warnings.simplefilter("ignore")
 sequence_hits = np.genfromtxt(
 line_generator(tab_pipe, maf_pipe, start),
-names=
+names=lastal_columns[SCORING_MATRIX],
-"score, name_db, start_db, al_size_db, strand_db, seq_size_db, name_q, start_q, al_size_q, strand_q, seq_size_q, block1, block2, block3, db_seq, q_seq",
+usecols=("score, name_q, start_q, al_size_q,"
-usecols=
+" strand_q, seq_size_q, name_db, db_seq,"
-"score, name_q, start_q, al_size_q, strand_q, seq_size_q, name_db, db_seq, q_seq, seq_size_db, start_db, al_size_db",
+" q_seq, seq_size_db, start_db, al_size_db"),
 dtype=None,
 comments=None)
 except RuntimeError:
 break
 ## if there are no domains found
 for region in indices_overal:
 db_names = domain_db[np.array(region)]
 db_starts = db_start[np.array(region)]
 db_ends = db_end[np.array(region)]
 scores = score[np.array(region)]
+regions_above_threshold = [
+region[i]
+for i, _ in enumerate(region)
+if max(scores) / 100 * THRESHOLD_SCORE < scores[i]
+]
+## sort by score first:
+consensus = get_full_translation(
+translation_alignments(
+query_seq=sortby(query_seq[regions_above_threshold], score[regions_above_threshold], True),
+start_hit=sortby(start_hit[regions_above_threshold], score[regions_above_threshold], True),
+end_hit=sortby(end_hit[regions_above_threshold], score[regions_above_threshold], True))
+)
 annotations = domain_annotation(db_names, CLASSIFICATION)
 [score_matrix, classes_dict] = score_table(
 mins[count_region], maxs[count_region], data[count_region],
 annotations, scores, CLASSIFICATION)
 ann_per_reg = score_matrix_evaluation(score_matrix, classes_dict,
 db_starts_best = db_starts[best_idx_reg]
 db_ends_best = db_ends[best_idx_reg]
 if count_region == len(indices_plus):
 strand_gff = "-"
 create_gff3(domain_type, ann_substring, unique_annotations,
-ann_pos_counts, mins[count_region], maxs[count_region],
+ann_pos_counts, min(start_hit[regions_above_threshold])-1,
+max(end_hit[regions_above_threshold]),
 step, best_idx, annotation_best, db_name_best,
 db_starts_best, db_ends_best, strand_gff, score,
-seq_id, db_seq, query_seq, domain_size, positions, gff)
+seq_id, db_seq, query_seq, domain_size, positions, gff, consensus)
 count_region += 1
 seq_ids.append(seq_id)
 os.unlink(query_temp)
 gff.close()
 dom_tmp.close()
 adjust_gff(OUTPUT_DOMAIN, dom_tmp.name, WIN_DOM, OVERLAP_DOM, step)
 ## otherwise use the temporary output as the final domains gff
 else:
 shutil.copy2(dom_tmp.name, OUTPUT_DOMAIN)
 os.unlink(dom_tmp.name)
+def  sortby(a, by, reverse=False):
+''' sort according values in the by list '''
+a_sorted = [i[0] for i in
+sorted(
+zip(a, by),
+key=lambda i: i[1],
+reverse=reverse
+)]
+return a_sorted
+def a2nnn(s):
+s1 = "".join([c if c in ['/', '\\'] else c + c + c
+for c in s.replace("-", "")])
+# collapse frameshifts (/)
+s2 = re.sub("[a-zA-Z*]{2}//[a-zA-Z*]{2}", "//", s1)
+s3 = re.sub("[a-zA-Z*]/[a-zA-Z*]", "/", s2)
+return (s3)
+def rle(s):
+'''run length encoding but max is set to 3 (codon)'''
+prev = ""
+count = 1
+char = []
+length = []
+L = 0
+for n in s:
+if n == prev and count < (3 - L):
+count += 1
+else:
+char.append(prev)
+length.append(count)
+L = 1 if prev == "/" else 0
+prev = n
+count = 1
+char.append(prev)
+length.append(count)
+sequence = char[1:]
+return sequence, length[1:]
+def get_full_translation(translations):
+'''get one full length translation  from multiple partial
+aligned translation '''
+# find minimal set of alignements
+minimal_set = []
+not_filled_prev = len(translations[0])
+for s in translations:
+minimal_set.append(s)
+# check defined position - is there only '-' character?
+not_filled = sum([set(i) == {"-"} for i in  zip(*minimal_set)])
+if not_filled == 0:
+break
+if not_filled == not_filled_prev:
+# last added sequence did not improve coverage - remove it.
+minimal_set.pop()
+not_filled_prev = not_filled
+# merge translations
+final_translation = minimal_set[0]
+# record positions of joins to correct frameshifts reportings
+position_of_joins = set()
+position_of_joins_rle = set()
+if len(minimal_set) > 1:  # translation must be merged
+for s in minimal_set[1:]:
+s1 = re.search(r"-\w", final_translation)
+s2 = re.search(r"\w-", final_translation)
+if s1:
+position_of_joins.add(s1.start())
+if s2:
+position_of_joins.add((s2.end() - 1))
+final_translation = "".join(
+[b if a == "-" else a for a, b in zip(final_translation, s)])
+translation_rle = rle(final_translation)
+cumsumed_positions = np.cumsum(translation_rle[1])
+for p in position_of_joins:
+position_of_joins_rle.add(sum(cumsumed_positions <= p))
+# insert /\ when necessary
+for p in position_of_joins_rle:
+if translation_rle[0][p] not in ['/',"//","\\", "\\\\"]:
+if translation_rle[1][p] == 2:
+translation_rle[0][p] = translation_rle[0][p] + "/"
+if translation_rle[1][p] == 1:
+translation_rle[0][p] = "\\"
+consensus = "".join(translation_rle[0])
+return consensus
+# helper function for debugging
+def translation_alignments(query_seq, start_hit, end_hit):
+pstart = min(start_hit)
+pend = max(end_hit)
+nnn = list()
+for s, start, end in zip(query_seq, start_hit, end_hit):
+nnn.append("-" * (start - pstart) + a2nnn(s) + "-" * (pend - end))
+return (nnn)
 def adjust_gff(OUTPUT_DOMAIN, gff, WIN_DOM, OVERLAP_DOM, step):
 ''' Original gff file is adjusted in case of containing cut parts
 	- for consecutive sequences overlap is divided to half with first half
 NEW_LDB = args.new_ldb
 OUTPUT_DIR = args.output_dir
 THRESHOLD_SCORE = args.threshold_score
 WIN_DOM = args.win_dom
 OVERLAP_DOM = args.overlap_dom
+SCORING_MATRIX = args.scoring_matrix
+configuration.SC_MATRIX = configuration.SC_MATRIX_SKELETON.format(SCORING_MATRIX)
 if OUTPUT_DOMAIN is None:
 OUTPUT_DOMAIN = configuration.DOMAINS_GFF
 if os.path.isdir(LAST_DB):
 LAST_DB = os.path.join(LAST_DB, configuration.LAST_DB_FILE)
 if not os.path.exists(OUTPUT_DIR):
 os.makedirs(OUTPUT_DIR)
 OUTPUT_DOMAIN = os.path.join(OUTPUT_DIR,
 os.path.basename(OUTPUT_DOMAIN))
 domain_search(QUERY, LAST_DB, CLASSIFICATION, OUTPUT_DOMAIN,
-THRESHOLD_SCORE, WIN_DOM, OVERLAP_DOM)
+THRESHOLD_SCORE, WIN_DOM, OVERLAP_DOM, SCORING_MATRIX)
 print("ELAPSED_TIME_DOMAINS = {} s".format(time.time() - t))
 if __name__ == "__main__":
 "-dir",
 "--output_dir",
 type=str,
 help="specify if you want to change the output directory")
 parser.add_argument(
+"-M",
+"--scoring_matrix",
+type=str,
+default="BL80",
+choices=['BL80', 'BL62', 'MIQS'],
+help="specify scoring matrix to use for similarity search (BL80, BL62, MIQS)")
+parser.add_argument(
 "-thsc",
 "--threshold_score",
 type=int,
 default=80,
 help=

Mercurial > repos > petr-novak > dante

comparison dante.py @ 10:d0431a839606 draft