Mercurial > repos > petr-novak > dante
view dante.py @ 16:0e820310d4dc draft
Uploaded
author | petr-novak |
---|---|
date | Wed, 04 Sep 2019 06:45:18 -0400 |
parents | 3151a72a6671 |
children | 039c45c01b47 |
line wrap: on
line source
#!/usr/bin/env python3
import numpy as np
import subprocess
import math
import time
from operator import itemgetter
from collections import Counter
from itertools import groupby
import os
import re
import configuration
from tempfile import NamedTemporaryFile
import sys
import warnings
import shutil
from collections import defaultdict

np.set_printoptions(threshold=sys.maxsize)


def alignment_scoring():
    ''' Create hash table for alignment similarity counting: for every
	combination of aminoacids in alignment assign score from protein
	scoring matrix defined in configuration file '''
    score_dict = {}
    with open(configuration.SC_MATRIX) as smatrix:
        count = 1
        for line in smatrix:
            if not line.startswith("#"):
                if count == 1:
                    # first non-comment line lists the aminoacid alphabet
                    aa_all = line.rstrip().replace(" ", "")
                else:
                    count_aa = 1
                    line = list(filter(None, line.rstrip().split(" ")))
                    for aa in aa_all:
                        # key is the aminoacid pair, e.g. "AR"
                        score_dict["{}{}".format(line[0], aa)] = line[count_aa]
                        count_aa += 1
                count += 1
    return score_dict


def characterize_fasta(QUERY, WIN_DOM):
    ''' Find the sequences, their lengths, starts, ends and if
	they exceed the window '''
    with open(QUERY) as query:
        headers = []
        fasta_lengths = []
        seq_starts = []
        seq_ends = []
        fasta_chunk_len = 0
        count_line = 1
        for line in query:
            line = line.rstrip()
            if line.startswith(">"):
                headers.append(line.rstrip())
                fasta_lengths.append(fasta_chunk_len)
                fasta_chunk_len = 0
                seq_starts.append(count_line + 1)
                seq_ends.append(count_line - 1)
            else:
                fasta_chunk_len += len(line)
            count_line += 1
        seq_ends.append(count_line)
        seq_ends = seq_ends[1:]
        fasta_lengths.append(fasta_chunk_len)
        fasta_lengths = fasta_lengths[1:]
        # control if there are correct (unique) names for individual seqs:
        # LASTAL takes seqs IDs till the first space which can then create
        # problems with ambiguous records
        if len(headers) > len(set([header.split(" ")[0]
                                   for header in headers])):
            raise NameError(
                '''Sequences in multifasta format are not named correctly:
							seq IDs (before the first space) are the same''')

    above_win = [idx for idx, value in enumerate(fasta_lengths)
                 if value > WIN_DOM]
    below_win = [idx for idx, value in enumerate(fasta_lengths)
                 if value <= WIN_DOM]
    lens_above_win = np.array(fasta_lengths)[above_win]
    return headers, above_win, below_win, lens_above_win, seq_starts, seq_ends


def split_fasta(QUERY, WIN_DOM, step, headers, above_win, below_win,
                lens_above_win, seq_starts, seq_ends):
    ''' Create temporary file containing all sequences - the ones that exceed
	the window are cut with a set overlap (greater than domain size
	with a reserve) '''
    with open(QUERY, "r") as query:
        count_fasta_divided = 0
        count_fasta_not_divided = 0
        ntf = NamedTemporaryFile(delete=False)
        divided = np.array(headers)[above_win]
        row_length = configuration.FASTA_LINE
        for line in query:
            line = line.rstrip()
            if line.startswith(">") and line in divided:
                # number of sequence lines belonging to this header
                stop_line = seq_ends[above_win[
                    count_fasta_divided]] - seq_starts[above_win[
                        count_fasta_divided]] + 1
                count_line = 0
                whole_seq = []
                for line2 in query:
                    whole_seq.append(line2.rstrip())
                    count_line += 1
                    if count_line == stop_line:
                        break
                whole_seq = "".join(whole_seq)
                ## create list of starting positions for individual parts of a seq with a step given by a window and overlap
                windows_starts = list(range(0, lens_above_win[
                    count_fasta_divided], step))
                ## create list of ending positions (starting pos + window), the last element is the whole seq length
                windows_ends = [
                    x + WIN_DOM
                    if x + WIN_DOM < lens_above_win[count_fasta_divided]
                    else lens_above_win[count_fasta_divided]
                    for x in windows_starts
                ]
                count_part = 1
                for start_part, end_part in zip(windows_starts, windows_ends):
                    seq_part = whole_seq[start_part:end_part]
                    if count_part == len(windows_starts):
                        # mark the final window as LAST so adjust_gff can
                        # treat it specially (overlapped from one side only)
                        ntf.write("{}_DANTE_PART{}_LAST:{}-{}\n{}\n".format(
                            line.split(" ")[0], count_part, start_part + 1,
                            end_part, "\n".join([seq_part[i:i + row_length]
                                                 for i in range(0, len(
                                                     seq_part), row_length)
                                                 ])).encode("utf-8"))
                    else:
                        ntf.write("{}_DANTE_PART{}:{}-{}\n{}\n".format(
                            line.split(" ")[0], count_part, start_part + 1,
                            end_part, "\n".join([seq_part[i:i + row_length]
                                                 for i in range(0, len(
                                                     seq_part), row_length)
                                                 ])).encode("utf-8"))
                    count_part += 1
                count_fasta_divided += 1
            elif line.startswith(">") and line not in divided:
                length_seq = seq_ends[below_win[
                    count_fasta_not_divided]] - seq_starts[below_win[
                        count_fasta_not_divided]] + 1
                ntf.write("{}\n{}".format(line, "".join(
                    [query.readline()
                     for x in range(length_seq)])).encode("utf-8"))
                count_fasta_not_divided += 1
        query_temp = ntf.name
        ntf.close()
    return query_temp


def domain_annotation(elements, CLASSIFICATION):
    ''' Assign protein domain to each hit from protein database '''
    domains = []
    annotations = []
    with open(CLASSIFICATION, "r") as cl_tbl:
        annotation = {}
        for line in cl_tbl:
            record = line.rstrip().split("\t")
            annotation[record[0]] = record[1:]
    for i in range(len(elements)):
        domains.append(elements[i].split("__")[0].split("-")[1])
        element_name = "__".join(elements[i].split("__")[1:])
        if element_name in annotation.keys():
            annotations.append("|".join([elements[i].split("__")[0].split(
                "-")[1], ("|".join(annotation[element_name]))]))
        else:
            annotations.append("unknown|unknown")
    return annotations


def hits_processing(seq_len, start, end, strand):
    ''' Gain hits intervals separately for forward and reverse strand '''
    reverse_strand_idx = np.where(strand == "-")[0]
    if not reverse_strand_idx.any():
        start_pos_plus = start + 1
        end_pos_plus = end
        regions_plus = list(zip(start_pos_plus, end_pos_plus))
        regions_minus = []
    else:
        reverse_strand_idx = reverse_strand_idx[0]
        start_pos_plus = start[0:reverse_strand_idx] + 1
        end_pos_plus = end[0:reverse_strand_idx]
        # reverse-strand hits are recalculated to forward coordinates
        start_pos_minus = seq_len[0] - end[reverse_strand_idx:] + 1
        end_pos_minus = seq_len[0] - start[reverse_strand_idx:]
        regions_plus = list(zip(start_pos_plus, end_pos_plus))
        regions_minus = list(zip(start_pos_minus, end_pos_minus))
    return reverse_strand_idx, regions_plus, regions_minus


def overlapping_regions(input_data):
    ''' Join all overlapping intervals(hits) to clusters (potential domains),
	get list of start-end positions of individual hits within the interval,
	list of minimus and maximums as well as the indices in the original
	sequence_hits structure for the hits belonging to the same clusters '''
    if input_data:
        sorted_idx, sorted_data = zip(*sorted(
            [(index, data) for index, data in enumerate(input_data)],
            key=itemgetter(1)))
        merged_ends = input_data[sorted_idx[0]][1]
        intervals = []
        data = []
        output_intervals = []
        output_data = []
        for i, j in zip(sorted_idx, sorted_data):
            if input_data[i][0] < merged_ends:
                merged_ends = max(input_data[i][1], merged_ends)
                intervals.append(i)
                data.append(j)
            else:
                output_intervals.append(intervals)
                output_data.append(data)
                intervals = []
                data = []
                intervals.append(i)
                data.append(j)
                merged_ends = input_data[i][1]
        output_intervals.append(intervals)
        output_data.append(data)
        mins = [x[0][0] for x in output_data]
        maxs = [max(x, key=itemgetter(1))[1] for x in output_data]
    else:
        mins = []
        maxs = []
        output_intervals = []
        output_data = []
    return mins, maxs, output_data, output_intervals


def annotations_dict(annotations):
    ''' Hash table where annotations of the hits within a clusters are the
	keys. Each annotation has serial number assigned which indexes the row
	in the score_table '''
    classes_dict = {classes: idx
                    for idx, classes in enumerate(set(annotations))}
    return classes_dict


def score_table(mins, maxs, data, annotations, scores, CLASSIFICATION):
    ''' Score table is created based on the annotations occurance in the
	cluster. Matrix axis y corresponds to individual annotations (indexed
	according to classes_dict), axis x represents positions of analyzed
	seq in a given cluster. For every hit within cluster, array of scores
	on the corresponding position is recorded to the table in case if the
	score on certain position is so far the highest for the certain
	position and certain annotation '''
    classes_dict = annotations_dict(annotations)
    score_matrix = np.zeros((len(classes_dict), maxs - mins + 1), dtype=int)
    count = 0
    for item in annotations:
        saved_scores = score_matrix[classes_dict[item], data[count][0] - mins:
                                    data[count][1] - mins + 1]
        new_scores = [scores[count]] * len(saved_scores)
        # keep per-position maximum between already recorded and new scores
        score_matrix[classes_dict[item], data[count][0] - mins:data[count][
            1] - mins + 1] = [max(*pos_score)
                              for pos_score in zip(saved_scores, new_scores)]
        count += 1
    return score_matrix, classes_dict


def score_matrix_evaluation(score_matrix, classes_dict, THRESHOLD_SCORE):
    ''' Score matrix is evaluated based on each position. For every position
	the list of annotations with a score which reaches certain percentage
	of the overal best score of the cluster are stored '''
    ann_per_reg = []
    overal_best_score_reg = max((score_matrix.max(axis=1)))
    for position in score_matrix.T:
        ## score threshold calculated as a percentage of the OVERALL best score in the cluster
        threshold = overal_best_score_reg * THRESHOLD_SCORE / 100
        above_th = [idx for idx, score in enumerate(position)
                    if position[idx] >= threshold]
        ## select unique annotations in one position that are above threshold
        ann_per_pos = list(set([key for key, value in classes_dict.items()
                                if value in above_th]))
        ann_per_reg.append(ann_per_pos)
    return ann_per_reg


def group_annot_regs(ann_per_reg):
    ''' Get list of domains, annotations, longest common annotations and
	counts of positions with certain annotation per regions '''
    ## tranform list of lists (potential multiple annotations for every position ) to flat list of all annotations
    all_annotations = [item for sublist in ann_per_reg for item in sublist]
    unique_annotations = list(set(all_annotations))
    ann_pos_counts = [all_annotations.count(x) for x in unique_annotations]
    unique_annotations = list(set([item for sublist in ann_per_reg
                                   for item in sublist]))
    domain_type = list(set([annotation.split("|")[0]
                            for annotation in unique_annotations]))
    classification_list = [annotation.split("|")
                           for annotation in unique_annotations]
    # longest common classification prefix of all annotations in the region
    ann_substring = "|".join(os.path.commonprefix(classification_list))
    domain_type = "/".join(domain_type)
    return domain_type, ann_substring, unique_annotations, ann_pos_counts


def best_score(scores, region):
    ''' From overlapping intervals take the one with the highest score '''
    ## if more hits have the same best score take only the first one
    best_idx = region[np.where(scores == max(scores))[0][0]]
    best_idx_reg = np.where(scores == max(scores))[0][0]
    return best_idx, best_idx_reg


def create_gff3(domain_type, ann_substring, unique_annotations,
                ann_pos_counts, dom_start, dom_end, step, best_idx,
                annotation_best, db_name_best, db_starts_best, db_ends_best,
                strand, score, seq_id, db_seq, query_seq, domain_size,
                positions, gff, consensus):
    ''' Record obtained information about domain corresponding to
	individual cluster to common gff file '''
    best_start = positions[best_idx][0]
    best_end = positions[best_idx][1]
    best_score = score[best_idx]
    ## proportion of length of the best hit to the whole region length found by base
    length_proportion = int((best_end - best_start + 1) /
                            (dom_end - dom_start + 1) * 100)
    db_seq_best = db_seq[best_idx]
    query_seq_best = query_seq[best_idx]
    domain_size_best = domain_size[best_idx]
    [percent_ident, align_similarity, relat_align_len, relat_interrupt,
     db_len_proportion] = filter_params(db_seq_best, query_seq_best,
                                        domain_size_best)
    ann_substring = "|".join(ann_substring.split("|")[1:])
    annotation_best = "|".join([db_name_best] + annotation_best.split("|")[
        1:])
    if "DANTE_PART" in seq_id:
        # shift coordinates of a window part back to the whole-sequence frame
        part = int(seq_id.split("DANTE_PART")[1].split(":")[0].split("_")[0])
        dom_start = dom_start + (part - 1) * step
        dom_end = dom_end + (part - 1) * step
        best_start = best_start + (part - 1) * step
        best_end = best_end + (part - 1) * step
    # FIX: original used "ann_substring is ''" - identity comparison with a
    # str literal is not a reliable equality test (SyntaxWarning in py3.8+)
    if ann_substring == '':
        ann_substring = "NONE(Annotations from different classes)"
    if len(unique_annotations) > 1:
        unique_annotations = ",".join(["{}[{}bp]".format(ann, pos)
                                       for ann, pos in zip(
                                           unique_annotations,
                                           ann_pos_counts)])
    else:
        unique_annotations = unique_annotations[0]
    # source column differs when run standalone vs. imported by profrep
    if __name__ == '__main__':
        SOURCE = configuration.SOURCE_DANTE
    else:
        SOURCE = configuration.SOURCE_PROFREP
    if "/" in domain_type:
        gff.write(
            "{}\t{}\t{}\t{}\t{}\t.\t{}\t{}\tName={};Final_Classification=Ambiguous_domain;Region_Hits_Classifications_={}\n".format(
                seq_id, SOURCE, configuration.DOMAINS_FEATURE, dom_start,
                dom_end, strand, configuration.PHASE, domain_type,
                unique_annotations))
    else:
        gff.write(
            "{}\t{}\t{}\t{}\t{}\t{}\t{}\t{}\tName={};Final_Classification={};Region_Hits_Classifications={};Best_Hit={}:{}-{}[{}percent];Best_Hit_DB_Pos={}:{}of{};DB_Seq={};Region_Seq={};Query_Seq={};Identity={};Similarity={};Relat_Length={};Relat_Interruptions={};Hit_to_DB_Length={}\n".format(
                seq_id, SOURCE, configuration.DOMAINS_FEATURE, dom_start,
                dom_end, best_score, strand, configuration.PHASE, domain_type,
                ann_substring, unique_annotations, annotation_best,
                best_start, best_end, length_proportion, db_starts_best,
                db_ends_best, domain_size_best, db_seq_best, consensus,
                query_seq_best, percent_ident, align_similarity,
                relat_align_len, relat_interrupt, db_len_proportion))


def filter_params(db, query, protein_len):
    ''' Calculate basic statistics of the quality of the alignment '''
    score_dict = alignment_scoring()
    num_ident = 0
    count_interrupt = 0
    count_similarity = 0
    alignment_len = 0
    for i, j in zip(db.upper(), query.upper()):
        if i == j and i != "X":
            num_ident += 1
        if j == "/" or j == "\\" or j == "*":
            count_interrupt += 1
        if (i.isalpha() or i == "*") and (j.isalpha() or j == "*"):
            if int(score_dict["{}{}".format(i, j)]) > 0:
                count_similarity += 1
    ## gapless alignment length proportional to the domain protein length
    relat_align_len = round((len(db) - db.count("-")) / protein_len, 3)
    ## proportional identical bases (except of X) to al.length
    align_identity = round(num_ident / len(db), 2)
    ## proportional count of positive scores from scoring matrix to al. length
    align_similarity = round(count_similarity / len(db), 2)
    ## number of interruptions per 100 bp
    relat_interrupt = round(count_interrupt / math.ceil((len(query) / 100)),
                            2)
    ## Proportion of alignment to the original length of protein domain from database (indels included)
    db_len_proportion = round(len(db) / protein_len, 2)
    return align_identity, align_similarity, relat_align_len, relat_interrupt, db_len_proportion


def line_generator(tab_pipe, maf_pipe, start):
    ''' Yield individual lines of LASTAL stdout for single sequence '''
    # a hit carried over from the previous sequence is replayed first
    if hasattr(line_generator, "dom"):
        seq_id = line_generator.dom.split("\t")[6]
        yield line_generator.dom.encode("utf-8")
        del line_generator.dom
    line_tab = ""
    for line_tab in tab_pipe:
        line_tab = line_tab.decode("utf-8")
        if not line_tab.startswith('#'):
            if start:
                if not ('seq_id' in locals() and
                        seq_id != line_tab.split("\t")[6]):
                    seq_id = line_tab.split("\t")[6]
                start = False
            # every TAB record corresponds to a 4-line MAF record
            line_maf = [maf_pipe.readline() for line_count in range(4)]
            db_seq = line_maf[1].decode("utf-8").rstrip().split(" ")[-1]
            alignment_seq = line_maf[2].decode("utf-8").rstrip().split(" ")[
                -1]
            line = "{}\t{}\t{}".format(line_tab, db_seq, alignment_seq)
            line_id = line.split("\t")[6]
            if seq_id != line_id:
                # hit belongs to the next sequence - stash it for next call
                line_generator.dom = line
                return
            else:
                yield line.encode("utf-8")
        else:
            maf_pipe.readline()
    if line_tab == "":
        # FIX: give the sentinel exception a message (type kept - callers
        # catch RuntimeError to detect end of the LASTAL stream)
        raise RuntimeError("no more TAB output received from LASTAL")
    else:
        return


def get_version(path, LAST_DB):
    '''Return version if run from git repository '''
    try:
        branch = subprocess.check_output("git rev-parse --abbrev-ref HEAD",
                                         shell=True,
                                         cwd=path).decode('ascii').strip()
        shorthash = subprocess.check_output(
            "git log --pretty=format:'%h' -n 1 ",
            shell=True,
            cwd=path).decode('ascii').strip()
        revcount = len(subprocess.check_output(
            "git log --oneline", shell=True,
            cwd=path).decode('ascii').split())
        version_string = (
            "##-----------------------------------------------\n"
            "##PIPELINE VERSION : "
            "{branch}-rv-{revcount}({shorthash})\n"
            "##PROTEIN DATABASE VERSION : {PD}\n"
            "##-----------------------------------------------\n").format(
                branch=branch,
                shorthash=shorthash,
                revcount=revcount,
                PD=os.path.basename(LAST_DB))
    # FIX: original used a bare "except:" which also swallows
    # KeyboardInterrupt/SystemExit; narrowed to what check_output raises
    # when git is missing or we are not inside a repository
    except (subprocess.CalledProcessError, OSError):
        version_string = (
            "##-----------------------------------------------\n"
            "##PROTEIN DATABASE VERSION : {PD}\n"
            "##-----------------------------------------------\n").format(
                PD=os.path.basename(LAST_DB))
    return version_string


def write_info(dom_gff_tmp, version_string):
    # write the common GFF header followed by the pipeline/db version stamp
    dom_gff_tmp.write("{}\n".format(configuration.HEADER_GFF))
    dom_gff_tmp.write(version_string)


def domain_search(QUERY, LAST_DB, CLASSIFICATION, OUTPUT_DOMAIN,
                  THRESHOLD_SCORE, WIN_DOM, OVERLAP_DOM, SCORING_MATRIX):
    ''' Search for protein domains using our protein database and external
	tool LAST, stdout is parsed in real time and hits for a single sequence
	undergo further processing
	- tabular format(TAB) to get info about position, score, orientation
	- MAF format to gain alignment and original sequence
	'''
    step = WIN_DOM - OVERLAP_DOM
    [headers, above_win, below_win, lens_above_win, seq_starts, seq_ends
     ] = characterize_fasta(QUERY, WIN_DOM)
    query_temp = split_fasta(QUERY, WIN_DOM, step, headers, above_win,
                             below_win, lens_above_win, seq_starts, seq_ends)

    ## TAB output contains all the alignment scores, positions, strands...
    lastal_columns = {
        "BL80": ("score, name_db, start_db, al_size_db, strand_db,"
                 " seq_size_db, name_q, start_q, al_size_q, strand_q, seq_size_q,"
                 " block1, block2, block3, db_seq, q_seq"),
        "BL62": ("score, name_db, start_db, al_size_db, strand_db,"
                 " seq_size_db, name_q, start_q, al_size_q, strand_q,"
                 " seq_size_q, block1, block2, block3, db_seq, q_seq"),
        "MIQS": ("score, name_db, start_db, al_size_db, strand_db,"
                 " seq_size_db, name_q, start_q, al_size_q, strand_q,"
                 " seq_size_q, block1, db_seq, q_seq"),
    }
    tab = subprocess.Popen(
        "lastal -F15 {} {} -L 10 -m 70 -p {} -e 80 -f TAB".format(
            LAST_DB, query_temp, SCORING_MATRIX),
        stdout=subprocess.PIPE,
        shell=True)
    ## MAF output contains alignment sequences
    maf = subprocess.Popen(
        "lastal -F15 {} {} -L 10 -m 70 -p {} -e 80 -f MAF".format(
            LAST_DB, query_temp, SCORING_MATRIX),
        stdout=subprocess.PIPE,
        shell=True)

    tab_pipe = tab.stdout
    maf_pipe = maf.stdout
    maf_pipe.readline()

    seq_ids = []
    dom_tmp = NamedTemporaryFile(delete=False)
    with open(dom_tmp.name, "w") as dom_gff_tmp:
        path = os.path.dirname(os.path.realpath(__file__))
        version_string = get_version(path, LAST_DB)
        write_info(dom_gff_tmp, version_string)
    gff = open(dom_tmp.name, "a")
    start = True
    while True:
        try:
            with warnings.catch_warnings():
                warnings.simplefilter("ignore")
                sequence_hits = np.genfromtxt(
                    line_generator(tab_pipe, maf_pipe, start),
                    names=lastal_columns[SCORING_MATRIX],
                    usecols=("score, name_q, start_q, al_size_q,"
                             " strand_q, seq_size_q, name_db, db_seq,"
                             " q_seq, seq_size_db, start_db, al_size_db"),
                    dtype=None,
                    comments=None)
        except RuntimeError:
            break
        ## if there are no domains found
        # FIX: original tested "sequence_hits.size is 0" - identity
        # comparison with an int literal is not a reliable equality test
        if sequence_hits.size == 0:
            gff.write("##NO DOMAINS")
            return [], [], [], []

        ############# PARSING LASTAL OUTPUT ############################
        sequence_hits = np.atleast_1d(sequence_hits)
        score = sequence_hits['score'].astype("int")
        seq_id = sequence_hits['name_q'][0].astype("str")
        start_hit = sequence_hits['start_q'].astype("int")
        end_hit = start_hit + sequence_hits['al_size_q'].astype("int")
        strand = sequence_hits['strand_q'].astype("str")
        seq_len = sequence_hits['seq_size_q'].astype("int")
        domain_db = sequence_hits['name_db'].astype("str")
        db_seq = sequence_hits['db_seq'].astype("str")
        query_seq = sequence_hits['q_seq'].astype("str")
        domain_size = sequence_hits['seq_size_db'].astype("int")
        db_start = sequence_hits['start_db'].astype("int") + 1
        db_end = sequence_hits['start_db'].astype("int") + sequence_hits[
            'al_size_db'].astype("int")

        [reverse_strand_idx, positions_plus, positions_minus
         ] = hits_processing(seq_len, start_hit, end_hit, strand)
        strand_gff = "+"
        [mins_plus, maxs_plus, data_plus, indices_plus
         ] = overlapping_regions(positions_plus)
        [mins_minus, maxs_minus, data_minus, indices_minus
         ] = overlapping_regions(positions_minus)
        positions = positions_plus + positions_minus
        indices_overal = indices_plus + [x + reverse_strand_idx
                                         for x in indices_minus]
        mins = mins_plus + mins_minus
        maxs = maxs_plus + maxs_minus
        data = data_plus + data_minus
        ## process every region (cluster) of overlapping hits sequentially
        count_region = 0
        for region in indices_overal:
            db_names = domain_db[np.array(region)]
            db_starts = db_start[np.array(region)]
            db_ends = db_end[np.array(region)]
            scores = score[np.array(region)]
            regions_above_threshold = [
                region[i] for i, _ in enumerate(region)
                if max(scores) / 100 * THRESHOLD_SCORE < scores[i]
            ]
            ## sort by score first:
            consensus = get_full_translation(
                translation_alignments(
                    query_seq=sortby(query_seq[regions_above_threshold],
                                     score[regions_above_threshold], True),
                    start_hit=sortby(start_hit[regions_above_threshold],
                                     score[regions_above_threshold], True),
                    end_hit=sortby(end_hit[regions_above_threshold],
                                   score[regions_above_threshold], True)))

            annotations = domain_annotation(db_names, CLASSIFICATION)
            [score_matrix, classes_dict] = score_table(
                mins[count_region], maxs[count_region], data[count_region],
                annotations, scores, CLASSIFICATION)
            ann_per_reg = score_matrix_evaluation(score_matrix, classes_dict,
                                                  THRESHOLD_SCORE)
            [domain_type, ann_substring, unique_annotations, ann_pos_counts
             ] = group_annot_regs(ann_per_reg)
            [best_idx, best_idx_reg] = best_score(scores, region)
            annotation_best = annotations[best_idx_reg]
            db_name_best = db_names[best_idx_reg]
            db_starts_best = db_starts[best_idx_reg]
            db_ends_best = db_ends[best_idx_reg]
            # all minus-strand clusters follow the plus-strand ones
            if count_region == len(indices_plus):
                strand_gff = "-"
            if strand_gff == "+":
                feature_start = min(start_hit[regions_above_threshold]) + 1
                feature_end = max(end_hit[regions_above_threshold])
            else:
                feature_end = seq_len[region][0] - min(start_hit[
                    regions_above_threshold])
                feature_start = seq_len[region][0] - max(end_hit[
                    regions_above_threshold]) + 1
            create_gff3(domain_type, ann_substring, unique_annotations,
                        ann_pos_counts, feature_start, feature_end, step,
                        best_idx, annotation_best, db_name_best,
                        db_starts_best, db_ends_best, strand_gff, score,
                        seq_id, db_seq, query_seq, domain_size, positions,
                        gff, consensus)
            count_region += 1
        seq_ids.append(seq_id)
    os.unlink(query_temp)
    gff.close()
    dom_tmp.close()
    ## if any sequence from input data was split into windows, merge and adjust the data from individual windows
    if any("DANTE_PART" in x for x in seq_ids):
        adjust_gff(OUTPUT_DOMAIN, dom_tmp.name, WIN_DOM, OVERLAP_DOM, step)
    ## otherwise use the temporary output as the final domains gff
    else:
        shutil.copy2(dom_tmp.name, OUTPUT_DOMAIN)
    os.unlink(dom_tmp.name)


def sortby(a, by, reverse=False):
    ''' sort according values in the by list '''
    a_sorted = [i[0] for i in sorted(
        zip(a, by),
        key=lambda i: i[1],
        reverse=reverse)]
    return a_sorted


def a2nnn(s):
    # expand each aminoacid to a 3-char codon placeholder, keep frameshifts
    s1 = "".join([c if c in ['/', '\\'] else c + c + c
                  for c in s.replace("-", "")])
    # collapse frameshifts (/)
    s2 = re.sub("[a-zA-Z*]{2}//[a-zA-Z*]{2}", "//", s1)
    s3 = re.sub("[a-zA-Z*]/[a-zA-Z*]", "/", s2)
    return (s3)


def rle(s):
    '''run length encoding but max is set to 3 (codon)'''
    prev = ""
    count = 1
    char = []
    length = []
    L = 0
    for n in s:
        if n == prev and count < (3 - L):
            count += 1
        else:
            char.append(prev)
            length.append(count)
            L = 1 if prev == "/" else 0
            prev = n
            count = 1
    char.append(prev)
    length.append(count)
    sequence = char[1:]
    return sequence, length[1:]


def get_full_translation(translations):
    '''get one full length translation  from multiple partial
	aligned translation '''
    # find minimal set of alignements
    minimal_set = []
    not_filled_prev = len(translations[0])
    for s in translations:
        minimal_set.append(s)
        # check defined position - is there only '-' character?
        not_filled = sum([set(i) == {"-"} for i in zip(*minimal_set)])
        if not_filled == 0:
            break
        if not_filled == not_filled_prev:
            # last added sequence did not improve coverage - remove it.
            minimal_set.pop()
        not_filled_prev = not_filled
    # merge translations
    final_translation = minimal_set[0]
    # record positions of joins to correct frameshifts reportings
    position_of_joins = set()
    position_of_joins_rle = set()
    if len(minimal_set) > 1:  # translation must be merged
        for s in minimal_set[1:]:
            s1 = re.search(r"-\w", final_translation)
            s2 = re.search(r"\w-", final_translation)
            if s1:
                position_of_joins.add(s1.start())
            if s2:
                position_of_joins.add((s2.end() - 1))
            final_translation = "".join(
                [b if a == "-" else a
                 for a, b in zip(final_translation, s)])
        translation_rle = rle(final_translation)
        cumsumed_positions = np.cumsum(translation_rle[1])
        for p in position_of_joins:
            position_of_joins_rle.add(sum(cumsumed_positions <= p))
        # insert /\ when necessary
        for p in position_of_joins_rle:
            if translation_rle[0][p] not in ['/', "//", "\\", "\\\\"]:
                if translation_rle[1][p] == 2:
                    translation_rle[0][p] = translation_rle[0][p] + "/"
                if translation_rle[1][p] == 1:
                    translation_rle[0][p] = "\\"
        consensus = "".join(translation_rle[0])
    return consensus


# helper function for debugging
def translation_alignments(query_seq, start_hit, end_hit):
    # pad each partial translation with '-' so all share one coordinate frame
    pstart = min(start_hit)
    pend = max(end_hit)
    nnn = list()
    for s, start, end in zip(query_seq, start_hit, end_hit):
        nnn.append("-" * (start - pstart) + a2nnn(s) + "-" * (pend - end))
    return (nnn)


def adjust_gff(OUTPUT_DOMAIN, gff, WIN_DOM, OVERLAP_DOM, step):
    ''' Original gff file is adjusted in case of containing cut parts -
	for consecutive sequences overlap is divided to half with first half
	of records(domains) belonging to the first sequence and second to the
	following one. Duplicate domains going through the middle of the overlap
	are removed. First and the last part (marked as LAST) of a certain
	sequence are handled separately as the are overlapped from one side
	only '''
    seq_id_all = []
    class_dict = defaultdict(int)
    seen = set()
    with open(OUTPUT_DOMAIN, "w") as adjusted_gff:
        with open(gff, "r") as primary_gff:
            start = True
            for line in primary_gff:
                if line.startswith("#"):
                    adjusted_gff.write(line)
                else:
                    split_line = line.split("\t")
                    classification = split_line[-1].split(";")[1].split("=")[
                        1]
                    if start:
                        seq_id_all.append(split_line[0].split(
                            "_DANTE_PART")[0])
                        start = False
                    seq_id = split_line[0].split("_DANTE_PART")[0]
                    if "DANTE_PART" in line:
                        line_without_id = "\t".join(split_line[1:])
                        part = int(split_line[0].split("_DANTE_PART")[
                            1].split(":")[0].split("_")[0])
                        if seq_id != seq_id_all[-1]:
                            seq_id_all.append(seq_id)
                        ## first part of the sequence
                        if part == 1:
                            cut_end = WIN_DOM - OVERLAP_DOM / 2
                            if int(split_line[3]) <= cut_end <= int(
                                    split_line[4]):
                                if line_without_id not in seen:
                                    adjusted_gff.write("{}\t{}".format(
                                        seq_id, line_without_id))
                                    class_dict[classification] += 1
                                    seen.add(line_without_id)
                            elif int(split_line[4]) < cut_end:
                                adjusted_gff.write("{}\t{}".format(
                                    seq_id, line_without_id))
                                class_dict[classification] += 1
                        ## last part of the sequence
                        elif "LAST" in split_line[0]:
                            cut_start = OVERLAP_DOM / 2 + (part - 1) * step
                            if int(split_line[3]) <= cut_start <= int(
                                    split_line[4]):
                                if line_without_id not in seen:
                                    adjusted_gff.write("{}\t{}".format(
                                        seq_id, line_without_id))
                                    class_dict[classification] += 1
                                    seen.add(line_without_id)
                            elif int(split_line[3]) > cut_start:
                                adjusted_gff.write("{}\t{}".format(
                                    seq_id, line_without_id))
                                class_dict[classification] += 1
                        ## middle part of the sequence
                        else:
                            cut_start = OVERLAP_DOM / 2 + (part - 1) * step
                            cut_end = WIN_DOM - OVERLAP_DOM / 2 + (
                                part - 1) * step
                            if int(split_line[3]) <= cut_start <= int(
                                    split_line[4]) or int(split_line[
                                        3]) <= cut_end <= int(split_line[4]):
                                if line_without_id not in seen:
                                    adjusted_gff.write("{}\t{}".format(
                                        seq_id, line_without_id))
                                    class_dict[classification] += 1
                                    seen.add(line_without_id)
                            elif int(split_line[3]) > cut_start and int(
                                    split_line[4]) < cut_end:
                                adjusted_gff.write("{}\t{}".format(
                                    seq_id, line_without_id))
                                class_dict[classification] += 1
                    ## not divived
                    else:
                        if seq_id != seq_id_all[-1]:
                            seq_id_all.append(seq_id)
                        adjusted_gff.write(line)
                        class_dict[classification] += 1


def main(args):
    t = time.time()

    QUERY = args.query
    LAST_DB = args.protein_database
    CLASSIFICATION = args.classification
    OUTPUT_DOMAIN = args.domain_gff
    NEW_LDB = args.new_ldb
    OUTPUT_DIR = args.output_dir
    THRESHOLD_SCORE = args.threshold_score
    WIN_DOM = args.win_dom
    OVERLAP_DOM = args.overlap_dom
    SCORING_MATRIX = args.scoring_matrix
    configuration.SC_MATRIX = configuration.SC_MATRIX_SKELETON.format(
        SCORING_MATRIX)

    if OUTPUT_DOMAIN is None:
        OUTPUT_DOMAIN = configuration.DOMAINS_GFF
    if os.path.isdir(LAST_DB):
        LAST_DB = os.path.join(LAST_DB, configuration.LAST_DB_FILE)
    if os.path.isdir(CLASSIFICATION):
        CLASSIFICATION = os.path.join(CLASSIFICATION,
                                      configuration.CLASS_FILE)

    if NEW_LDB:
        subprocess.call("lastdb -p -cR01 {} {}".format(LAST_DB, LAST_DB),
                        shell=True)

    if OUTPUT_DIR and not os.path.exists(OUTPUT_DIR):
        os.makedirs(OUTPUT_DIR)

    if not os.path.isabs(OUTPUT_DOMAIN):
        if OUTPUT_DIR is None:
            OUTPUT_DIR = configuration.TMP
            if not os.path.exists(OUTPUT_DIR):
                os.makedirs(OUTPUT_DIR)
        OUTPUT_DOMAIN = os.path.join(OUTPUT_DIR,
                                     os.path.basename(OUTPUT_DOMAIN))
    domain_search(QUERY, LAST_DB, CLASSIFICATION, OUTPUT_DOMAIN,
                  THRESHOLD_SCORE, WIN_DOM, OVERLAP_DOM, SCORING_MATRIX)

    print("ELAPSED_TIME_DOMAINS = {} s".format(time.time() - t))


if __name__ == "__main__":
    import argparse
    from argparse import RawDescriptionHelpFormatter

    class CustomFormatter(argparse.ArgumentDefaultsHelpFormatter,
                          argparse.RawDescriptionHelpFormatter):
        pass

    parser = argparse.ArgumentParser(
        description=
        '''Script performs similarity search on given DNA sequence(s) in (multi)fasta against our protein domains database of all Transposable element for certain group of organisms (Viridiplantae or Metazoans). Domains are subsequently annotated and classified - in case certain domain has multiple annotations assigned, classifation is derived from the common classification level of all of them. Domains search is accomplished engaging LASTAL alignment tool.

	DEPENDENCIES:
		- python 3.4 or higher with packages:
			-numpy
		- lastal 744 or higher [http://last.cbrc.jp/]
		- configuration.py module

	EXAMPLE OF USAGE:
		./protein_domains_pd.py -q PATH_TO_INPUT_SEQ -pdb PATH_TO_PROTEIN_DB -cs PATH_TO_CLASSIFICATION_FILE

	When running for the first time with a new database use -nld option allowing lastal to create indexed database files:

		-nld True

		''',
        epilog="""""",
        formatter_class=CustomFormatter)

    requiredNamed = parser.add_argument_group('required named arguments')
    requiredNamed.add_argument(
        "-q",
        "--query",
        type=str,
        required=True,
        help=
        'input DNA sequence to search for protein domains in a fasta format. Multifasta format allowed.')
    requiredNamed.add_argument('-pdb',
                               "--protein_database",
                               type=str,
                               required=True,
                               help='protein domains database file')
    requiredNamed.add_argument('-cs',
                               '--classification',
                               type=str,
                               required=True,
                               help='protein domains classification file')
    parser.add_argument("-oug",
                        "--domain_gff",
                        type=str,
                        help="output domains gff format")
    parser.add_argument(
        "-nld",
        "--new_ldb",
        type=str,
        default=False,
        help=
        "create indexed database files for lastal in case of working with new protein db")
    parser.add_argument(
        "-dir",
        "--output_dir",
        type=str,
        help="specify if you want to change the output directory")
    parser.add_argument(
        "-M",
        "--scoring_matrix",
        type=str,
        default="BL80",
        choices=['BL80', 'BL62', 'MIQS'],
        help="specify scoring matrix to use for similarity search (BL80, BL62, MIQS)")
    parser.add_argument(
        "-thsc",
        "--threshold_score",
        type=int,
        default=80,
        help=
        "percentage of the best score in the cluster to be tolerated when assigning annotations per base")
    parser.add_argument(
        "-wd",
        "--win_dom",
        type=int,
        default=10000000,
        help="window to process large input sequences sequentially")
    parser.add_argument("-od",
                        "--overlap_dom",
                        type=int,
                        default=10000,
                        help="overlap of sequences in two consecutive windows")

    args = parser.parse_args()
    main(args)