Mercurial > repos > dcouvin > pmlst_v2

diff pmlst/pmlst.py @ 0:cfab64885f66 draft default tip
Uploaded
author: dcouvin
date: Mon, 06 Sep 2021 18:27:45 +0000
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/pmlst/pmlst.py	Mon Sep 06 18:27:45 2021 +0000
@@ -0,0 +1,691 @@
+#!/usr/bin/env python3
+
+import os, sys, re, time, pprint, io, shutil
+import argparse, subprocess
+
+from cgecore.alignment import extended_cigar
+from cgecore.blaster.blaster import Blaster
+from cgecore.cgefinder import CGEFinder
+import json, gzip
+from tabulate import tabulate
+
+
+def get_read_filename(infiles):
+    ''' Infiles must be a list with 1 or 2 input files.
+        Removes path from given string and removes extensions:
+        .fq .fastq .gz and .trim
+        extract the common sample name i 2 files are given.
+    '''
+    # Remove common fastq extensions
+    seq_path = infiles[0]
+    seq_file = os.path.basename(seq_path)
+    seq_file = seq_file.replace(".fq", "")
+    seq_file = seq_file.replace(".fastq", "")
+    seq_file = seq_file.replace(".gz", "")
+    seq_file = seq_file.replace(".trim", "")
+    if len(infiles) == 1:
+        return seq_file.rstrip()
+
+    # If two files are given get the common sample name
+    sample_name = ""
+    seq_file_2 = os.path.basename(infiles[1])
+    for i in range(len(seq_file)):
+        if seq_file_2[i] == seq_file[i]:
+            sample_name += seq_file[i]
+        else: 
+            break
+    if sample_name == "":
+        sys.error("Input error: sample names of input files, {} and {}, \
+                   does not share a common sample name. If these files \
+                   are paired end reads from the same sample, please rename \
+                   them with a common sample name (e.g. 's22_R1.fq', 's22_R2.fq') \
+                   or input them seperately.".format(infiles[0], infiles[1]))
+
+    return sample_name.rstrip("-").rstrip("_")
+
+def is_gzipped(file_path):
+    ''' Returns True if file is gzipped and False otherwise.
+        The result is inferred from the first two bits in the file read
+        from the input path.
+        On unix systems this should be: 1f 8b
+        Theoretically there could be exceptions to this test but it is
+        unlikely and impossible if the input files are otherwise expected
+        to be encoded in utf-8.
+    '''
+    with open(file_path, mode='rb') as fh:
+        bit_start = fh.read(2)
+    if(bit_start == b'\x1f\x8b'):
+        return True
+    else:
+        return False
+
+def get_file_format(input_files):
+    """
+    Takes all input files and checks their first character to assess
+    the file format. Returns one of the following strings; fasta, fastq, 
+    other or mixed. fasta and fastq indicates that all input files are 
+    of the same format, either fasta or fastq. other indiates that all
+    files are not fasta nor fastq files. mixed indicates that the inputfiles
+    are a mix of different file formats.
+    """
+
+    # Open all input files and get the first character
+    file_format = []
+    invalid_files = []
+    for infile in input_files:
+        if is_gzipped(infile):#[-3:] == ".gz":
+            f = gzip.open(infile, "rb")
+            fst_char = f.read(1);
+        else:
+            f = open(infile, "rb")
+            fst_char = f.read(1);
+        f.close()
+        # Assess the first character
+        if fst_char == b"@":
+            file_format.append("fastq")
+        elif fst_char == b">":
+            file_format.append("fasta")
+        else:
+            invalid_files.append("other")
+    if len(set(file_format)) != 1:
+        return "mixed"
+    return ",".join(set(file_format))
+
+def import_profile(database, scheme, loci_list):
+    """Import all possible allele profiles with corresponding st's
+    for the scheme into a dict. The profiles are stored in a dict 
+    of dicts, to easily look up what st types are accosiated with 
+    a specific allele number of each loci. 
+    """
+    # Open allele profile file from databaseloci
+    profile_file = open("{0}/{1}.txt.clean".format(database, scheme), "r")
+    profile_header = profile_file.readline().strip().split("\t")[1:len(loci_list)+1]
+    
+    # Create dict for looking up st-types with locus/allele combinations
+    st_profiles = {}
+    # For each locus initate make an inner dict to store allele and st's
+    for locus in loci_list:
+        st_profiles[locus] = {} 
+
+    # Fill inner dict with allele no as key and st-types seen with the allele as value   
+    for line in profile_file:
+        profile = line.strip().split("\t")
+        st_name = profile[0]
+        allele_list = profile[1:len(loci_list)+1]
+
+        # Go through all allele profiles. Save locus-allele combination with the st-type
+        for i in range(len(allele_list)):
+            allele = allele_list[i]
+            locus = profile_header[i]
+            if allele in st_profiles[locus]:
+                st_profiles[locus][allele] += [st_name]
+            else:
+                st_profiles[locus][allele] = [st_name]
+    profile_file.close()
+
+    return st_profiles
+
+def st_typing(st_profiles, allele_matches, loci_list):
+    """
+    Takes the path to a dictionary, the inp list of the allele 
+    number that each loci has been assigned, and an output file string
+    where the found st type and similaity is written into it.  
+    """
+
+    # Find best ST type for all allele profiles
+    st_output = ""
+    note = ""
+
+    # First line contains matrix column headers, which are the specific loci
+    st_hits = []
+    st_marks = []
+    note = ""
+
+    # Check the quality of the alle hits
+    for locus in allele_matches:
+        allele = allele_matches[locus]["allele"]
+ 
+        # Check if allele is marked as a non-perfect match. Save mark and write note.
+        if "?*" in allele:
+            note += "?* {}: Imperfect hit, ST can not be trusted!\n".format(locus)
+            st_marks = ["?","*"]
+        elif "?" in allele:
+            note += "? {}: Uncertain hit, ST can not be trusted.\n".format(locus)
+            st_marks.append("?")
+        elif "*" in allele:
+            note += "* {}: Novel allele, ST may indicate nearest ST.\n".format(locus)
+            st_marks.append("*")
+
+        # Remove mark from allele so it can be used to look up nearest st types
+        allele = allele.rstrip("*?!")
+
+        # Get all st's that have the alleles in it's allele profile
+        st_hits += st_profiles[locus].get(allele, ["None"])
+        if "alternative_hit" in allele_matches[locus] and allele_matches[locus]["alternative_hit"] != {}:
+            note += "! {}: Multiple perfect hits found\n".format(locus)
+            st_marks.append("!")
+            for allele_name, hit_info in allele_matches[locus]["alternative_hit"].items():
+                allele = hit_info["allele"].rstrip("!")
+                st_hits += st_profiles[locus].get(allele, ["None"])
+
+    # Save allele marks to be transfered to the ST
+    st_mark = "".join(set(st_marks))
+    notes = st_mark
+    # Add marks information to notes
+    if "!" in st_mark:
+        notes += " alleles with multiple perfect hits found, multiple STs might be found\n"
+    if "*" in st_mark and "?" in st_mark:
+        notes += " alleles with less than 100% identity and 100% coverages found\n"
+    elif st_mark == "*":
+        notes = st_mark + " alleles with less than 100% identity found\n"
+    elif st_mark == "?":
+        notes = st_mark + " alleles with less than 100% coverage found\n"
+    notes += note
+
+    # Find most frequent st in st_hits
+    st_hits_counter = {}
+    max_count = 0
+    best_hit = ""
+    for hit in st_hits:
+        if hit is not "None":
+            if hit in st_hits_counter:
+                st_hits_counter[hit] += 1
+            else:
+               st_hits_counter[hit] = 1
+            if max_count < st_hits_counter[hit]:
+                max_count = st_hits_counter[hit]
+                best_hit = hit
+
+    # Check if allele profile match found st 100 %
+    similarity = round(float(max_count)/len(loci_list)*100, 2)
+
+    if similarity != 100:
+        st = "Unknown"
+        nearest_sts = []
+        # If st is not perfect find nearest st's
+        for st_hit, allele_score in sorted(st_hits_counter.items(), key=lambda x: x[1], reverse=True):
+            if allele_score < max_count:
+                break
+            nearest_sts.append(st_hit)
+        nearest_sts = ",".join(nearest_sts) #+ st_mark
+    else:
+        # allele profile has a perfect ST hit but the st marks given to the alleles might indicate imperfect hits
+        sts = [st for st, no in st_hits_counter.items() if no == max_count]
+        #if len(sts) > 1:
+        st = "{},".format(st_mark).join(sts) + st_mark 
+        #st = best_hit + st_mark
+        nearest_sts = ""
+
+    return st, notes, nearest_sts
+
+def make_aln(scheme, file_handle, allele_matches, query_aligns, homol_aligns, sbjct_aligns):
+    for locus, locus_info in allele_matches.items():        
+        allele_name = locus_info["allele_name"]
+        if allele_name == "No hit found":
+            continue
+        hit_name = locus_info["hit_name"]
+
+        seqs = ["","",""]
+        seqs[0] = sbjct_aligns[scheme][hit_name]    
+        seqs[1] = homol_aligns[scheme][hit_name]    
+        seqs[2] = query_aligns[scheme][hit_name]
+
+        write_align(seqs, allele_name, file_handle)
+
+
+        # write alternative seq
+        if "alternative_hit" in locus_info:
+            for allele_name in locus_info["alternative_hit"]:
+                hit_name = locus_info["alternative_hit"][allele_name]["hit_name"]
+                seqs = ["","",""]
+                seqs[0] = sbjct_aligns[scheme][hit_name]    
+                seqs[1] = homol_aligns[scheme][hit_name]    
+                seqs[2] = query_aligns[scheme][hit_name]
+
+                write_align(seqs, allele_name, file_handle)
+
+def write_align(seq, seq_name, file_handle):
+    file_handle.write("# {}".format(seq_name) + "\n")
+    sbjct_seq = seq[0]
+    homol_seq = seq[1]
+    query_seq = seq[2]
+    for i in range(0,len(sbjct_seq),60):
+        file_handle.write("%-10s\t%s\n"%("template:", sbjct_seq[i:i+60]))
+        file_handle.write("%-10s\t%s\n"%("", homol_seq[i:i+60]))
+        file_handle.write("%-10s\t%s\n\n"%("query:", query_seq[i:i+60]))
+
+def text_table(headers, rows, empty_replace='-'):
+   ''' Create text table
+   
+   USAGE:
+      >>> from tabulate import tabulate
+      >>> headers = ['A','B']
+      >>> rows = [[1,2],[3,4]]
+      >>> print(text_table(headers, rows))
+      **********
+        A    B
+      **********
+        1    2
+        3    4
+      ==========
+   '''
+   # Replace empty cells with placeholder
+   rows = map(lambda row: map(lambda x: x if x else empty_replace, row), rows)
+   # Create table
+   table = tabulate(rows, headers, tablefmt='simple').split('\n')
+   # Prepare title injection
+   width = len(table[0])
+   # Switch horisontal line
+   table[1] = '*'*(width+2)
+   # Update table with title
+   table = ("%s\n"*3)%('*'*(width+2), '\n'.join(table), '='*(width+2))
+   return table
+
+
+parser = argparse.ArgumentParser(description="")
+# Arguments
+parser.add_argument("-i", "--infile",
+                    help="FASTA or FASTQ files to do pMLST on.",
+                    nargs="+", required=True)
+parser.add_argument("-o", "--outdir",
+                    help="Output directory.",
+                    default=".")
+parser.add_argument("-s", "--scheme",
+                    help="scheme database used for pMLST prediction", required=True)
+parser.add_argument("-p", "--database",
+                    help="Directory containing the databases.", default="/database")
+parser.add_argument("-t", "--tmp_dir",
+                    help="Temporary directory for storage of the results\
+                          from the external software.",
+                    default="tmp_pMLST")
+parser.add_argument("-mp", "--method_path",
+                    help="Path to the method to use (kma or blastn)\
+                          if assembled contigs are inputted the path to executable blastn should be given,\
+                          if fastq files are given path to executable kma should be given")
+parser.add_argument("-x", "--extented_output",
+                    help="Give extented output with allignment files, template and query hits in fasta and\
+                          a tab seperated file with allele profile results", action="store_true")
+parser.add_argument("-q", "--quiet", action="store_true")
+
+
+#parser.add_argument("-c", "--coverage",
+#                    help="Minimum template coverage required", default = 0.6)
+#parser.add_argument("-i", "--identity",
+#                    help="Minimum template identity required", default = 0.9)
+args = parser.parse_args()
+
+if args.quiet:
+    f = open(os.devnull, 'w')
+    sys.stdout = f
+
+
+#TODO what are the clonal complex data used for??
+
+# TODO error handling
+infile = args.infile
+# Check that outdir is an existing dir...
+outdir = os.path.abspath(args.outdir)
+scheme = args.scheme
+database = os.path.abspath(args.database)
+tmp_dir = os.path.abspath(args.tmp_dir)
+# Check if method path is executable
+method_path = args.method_path
+extented_output = args.extented_output
+
+min_cov = 0.6	   # args.coverage
+threshold = 0.95 # args.identity
+
+# Check file format (fasta, fastq or other format)
+file_format = get_file_format(infile)
+
+db_path = "{}/".format(database, scheme)
+
+config_file = open(database + "/config","r")
+
+# Get profile_name from config file
+scheme_list = []
+for line in config_file:
+    if line.startswith("#"):
+        continue
+    line = line.split("\t")
+    scheme_list.append(line[0])
+    if line[0] == scheme:
+        profile_name = line[1]
+
+config_file.close()
+        
+if scheme not in scheme_list:
+    sys.exit("{}, is not a valid scheme. \n\nPlease choose a scheme available in the database:\n{}".format(scheme, ", ".join(scheme_list)))
+
+# Get loci list from allele profile file
+with open("{0}/{1}.txt.clean".format(database, scheme), "r") as st_file:
+    file_header = st_file.readline().strip().split("\t")
+    loci_list = file_header[1:]
+
+# Call appropriate method (kma or blastn) based on file format 
+if file_format == "fastq":
+    if not method_path:
+        method_path = "kma"
+        if shutil.which(method_path) == None:
+            sys.exit("No valid path to a kma program was provided. Use the -mp flag to provide the path.")
+    # Check the number of files
+    if len(infile) == 1:
+        infile_1 = infile[0]
+        infile_2 = None
+    elif len(infile) == 2:
+        infile_1 = infile[0]
+        infile_2 = infile[1]
+    else:
+        sys.exit("Only 2 input file accepted for raw read data,\
+                  if data from more runs is avaliable for the same\
+                  sample, please concatinate the reads into two files")
+    
+    sample_name = get_read_filename(infile)
+    method = "kma"
+
+    # Call KMA
+    method_obj = CGEFinder.kma(infile_1, outdir, [scheme], db_path, min_cov=min_cov,
+                                threshold=threshold, kma_path=method_path, sample_name=sample_name,
+                                inputfile_2=infile_2, kma_mrs=0.75, kma_gapopen=-5,
+                                kma_gapextend=-1, kma_penalty=-3, kma_reward=1)
+
+elif file_format == "fasta":
+    if not method_path:
+        method_path = "blastn"
+        if shutil.which(method_path) == None:
+            sys.exit("No valid path to a blastn program was provided. Use the -mp flag to provide the path.")
+    # Assert that only one fasta file is inputted
+    assert len(infile) == 1, "Only one input file accepted for assembled data."
+    infile = infile[0]
+    method = "blast"
+
+    # Call BLASTn
+    method_obj = Blaster(infile, [scheme], db_path, tmp_dir, 
+                         min_cov, threshold, method_path, cut_off=False)
+                         #allewed_overlap=50)
+else:
+    sys.exit("Input file must be fastq or fasta format, not "+ file_format)
+
+results      = method_obj.results
+query_aligns = method_obj.gene_align_query
+homol_aligns = method_obj.gene_align_homo
+sbjct_aligns = method_obj.gene_align_sbjct
+
+# Check that the results dict is not empty
+warning = ""
+if results[scheme] == "No hit found":
+    results[scheme] = {}
+    warning = ("No MLST loci was found in the input data, "
+               "make sure that the correct pMLST scheme was chosen.")
+
+
+allele_matches = {}
+
+# Get the found allele profile contained in the results dict
+for hit, locus_hit in results[scheme].items():
+
+    # Get allele number for locus
+    allele_name = locus_hit["sbjct_header"]
+    allele_obj  = re.search("(\w+)[_|-](\w+$)", allele_name)
+
+    # Get variable to later storage in the results dict
+    locus     = allele_obj.group(1)
+    allele    = allele_obj.group(2)
+    coverage  = float(locus_hit["perc_coverage"])
+    identity  = float(locus_hit["perc_ident"])
+    score     = float(locus_hit["cal_score"])
+    gaps      = int(locus_hit["gaps"])    
+    align_len = locus_hit["HSP_length"]
+    sbj_len   = int(locus_hit["sbjct_length"])
+    sbjct_seq = locus_hit["sbjct_string"]
+    query_seq = locus_hit["query_string"] 
+    homol_seq = locus_hit["homo_string"]
+    cigar     = extended_cigar(sbjct_aligns[scheme][hit], query_aligns[scheme][hit]) 
+
+    # Check for perfect hits
+    if coverage == 100 and identity == 100:
+        # If a perfect hit was already found the list more_perfect hits will exist this new hit is appended to this list
+        try:
+            allele_matches[locus]["alternative_hit"][allele_name] = {"allele":allele+"!", "align_len":align_len, "sbj_len":sbj_len, 
+                                                                 "coverage":coverage, "identity":identity, "hit_name":hit}
+            if allele_matches[locus]["allele"][-1] != "!":
+                allele_matches[locus]["allele"] += "!"
+        except KeyError:
+            # Overwrite alleles already saved, save the perfect match and break to go to next locus
+            allele_matches[locus] = {"score":score, "allele":allele, "coverage":coverage,
+                                     "identity":identity, "match_priority": 1, "align_len":align_len,
+                                     "gaps":gaps, "sbj_len":sbj_len, "allele_name":allele_name,
+                                     "sbjct_seq":sbjct_seq, "query_seq":query_seq, "homol_seq":homol_seq, 
+                                     "hit_name":hit, "cigar":cigar, "alternative_hit":{}} 
+    else:
+        # If no hit has yet been stored initialize dict variables that are looked up below
+        if locus not in allele_matches:
+            allele_matches[locus] = {"score":0, "match_priority": 4}
+
+        # We weight full coverage higher than perfect identity match
+        if coverage == 100 and identity != 100:
+            # Check that better (higher prioritized) 100% coverage hit has not been stored yet
+            if allele_matches[locus]["match_priority"] > 2 or (allele_matches[locus]["match_priority"] == 2 and score > allele_matches[locus]["score"]):
+                allele_matches[locus] = {"score":score, "allele":allele+"*", "coverage":coverage,
+                                         "identity":identity, "match_priority": 2, "align_len":align_len,
+                                         "gaps":gaps, "sbj_len":sbj_len, "allele_name":allele_name,
+                                         "sbjct_seq":sbjct_seq, "query_seq":query_seq, "homol_seq":homol_seq,
+                                         "hit_name":hit, "cigar":cigar}
+        elif coverage != 100 and identity == 100:
+            # Check that higher prioritized hit was not already stored
+            if allele_matches[locus]["match_priority"] > 3 or (allele_matches[locus]["match_priority"] == 3 and score > allele_matches[locus]["score"]):
+                allele_matches[locus] = {"score":score, "allele":allele + "?", "coverage":coverage,
+                                         "identity":identity, "match_priority": 3, "align_len":align_len,
+                                         "gaps":gaps, "sbj_len":sbj_len, "allele_name":allele_name,
+                                         "sbjct_seq":sbjct_seq, "query_seq":query_seq, "homol_seq":homol_seq,
+                                         "hit_name":hit, "cigar":cigar}
+        else: # coverage != 100 and identity != 100:
+            if allele_matches[locus]["match_priority"] == 4 and score > allele_matches[locus]["score"]:
+                allele_matches[locus] = {"score":score, "allele":allele + "?*", "coverage":coverage,
+                                         "identity":identity, "match_priority": 4, "align_len":align_len,
+                                         "gaps":gaps, "sbj_len":sbj_len, "allele_name":allele_name,
+                                         "sbjct_seq":sbjct_seq, "query_seq":query_seq, "homol_seq":homol_seq,
+                                         "hit_name":hit, "cigar":cigar}
+for locus in loci_list:
+    if locus not in allele_matches:
+        allele_matches[locus] = {"identity":"", "coverage":"", "allele":"", "allele_name":"No hit found", "align_len":"", "gaps":"", "sbj_len":""}
+
+# Import all possible st profiles into dict
+st_profiles = import_profile(database, scheme,loci_list)
+
+# Find st or neatest sts
+st, note, nearest_sts = st_typing(st_profiles, allele_matches, loci_list)
+
+# Give warning of mlst schene if no loci were found
+if note == "" and warning != "":
+    note = warning
+
+# Set ST for incF
+if scheme.lower() == "incf":
+    st = ["F","A", "B"]
+    if "FII" in allele_matches and allele_matches["FII"]["identity"] == 100.0:
+        st[0] += allele_matches["FII"]["allele_name"].split("_")[-1]
+    elif "FIC" in allele_matches and allele_matches["FIC"]["identity"] == 100.0:
+        st[0] = "C" + allele_matches["FIC"]["allele_name"].split("_")[-1]
+    elif "FIIK" in allele_matches and allele_matches["FIIK"]["identity"] == 100.0:
+        st[0] = "K" + allele_matches["FIIK"]["allele_name"].split("_")[-1]
+    elif "FIIS" in allele_matches and allele_matches["FIIS"]["identity"] == 100.0:
+        st[0] = "S" + allele_matches["FIIS"]["allele_name"].split("_")[-1]
+    elif "FIIY" in allele_matches and allele_matches["FIIY"]["identity"] == 100.0:
+        st[0] = "Y" + allele_matches["FIIY"]["allele_name"].split("_")[-1]
+    else:
+        st[0] += "-"
+
+    if "FIA" in allele_matches and allele_matches["FIA"]["identity"] == 100.0:
+        st[1] += allele_matches["FIA"]["allele_name"].split("_")[-1]
+    else:
+        st[1] += "-"
+
+    if "FIB" in allele_matches and allele_matches["FIB"]["identity"] == 100.0:
+        st[2] += allele_matches["FIB"]["allele_name"].split("_")[-1]
+    else:
+        st[2] += "-"
+  
+    st = "["+":".join(st)+"]"
+
+
+# Check if ST is associated with a clonal complex.
+clpx = ""
+if st != "Unknown" or nearest_sts != "":
+    with open("{0}/{1}.clpx".format(database,scheme), "r") as clpx_file:
+        for line in clpx_file:
+            line = line.split("\t")
+            if st[0] == line[0] or nearest_sts == line[0]:
+                clpx = line[1].strip()
+
+    
+# Get run info for JSON file
+service = os.path.basename(__file__).replace(".py", "")
+date = time.strftime("%d.%m.%Y")
+time = time.strftime("%H:%M:%S")
+
+# TODO find a system to show the database and service version using git
+
+# Make JSON output file
+data = {service:{}}
+allele_results = {}
+for locus, locus_info in allele_matches.items():
+    allele_results[locus] = {"identity":0, "coverage":0, "allele":[], "allele_name":[], "align_len":[], "gaps":0, "sbj_len":[]}
+    for (key, value) in locus_info.items():
+        if key in allele_results[locus] or (key == "alternative_hit" and value != {}):
+            allele_results[locus][key] = value
+ 
+userinput = {"filename":args.infile, "scheme":args.scheme, "profile":profile_name,"file_format":file_format}
+run_info = {"date":date, "time":time}#, "database":{"remote_db":remote_db, "last_commit_hash":head_hash}}
+server_results = {"sequence_type":st, "allele_profile": allele_results,
+                             "nearest_sts":nearest_sts,"clonal_complex":clpx, "notes":note}
+
+data[service]["user_input"] = userinput
+data[service]["run_info"] = run_info
+data[service]["results"] = server_results
+
+pprint.pprint(data)
+
+# Save json output
+result_file = "{}/data.json".format(outdir) 
+with open(result_file, "w") as outfile:  
+    json.dump(data, outfile)
+
+if extented_output:
+    # Define extented output 
+    table_filename  = "{}/results_tab.tsv".format(outdir)
+    query_filename  = "{}/Hit_in_genome_seq.fsa".format(outdir)
+    sbjct_filename  = "{}/pMLST_allele_seq.fsa".format(outdir)
+    result_filename = "{}/results.txt".format(outdir)
+    table_file  = open(table_filename, "w")
+    query_file  = open(query_filename, "w")
+    sbjct_file  = open(sbjct_filename, "w")
+    result_file = open(result_filename, "w")
+
+    # Make results file
+    result_file.write("{0} Results\n\n".format(service))
+    result_file.write("pMLST profile: {}\n\nSequence Type: {}\n".format(profile_name, st))
+    # If ST is unknown report nearest ST
+    if st == "Unknown" and nearest_sts != "":
+        if len(nearest_sts.split(",")) == 1:
+            result_file.write("Nearest ST: {}\n".format(nearest_sts))
+        else:
+            result_file.write("Nearest STs: {}\n".format(nearest_sts))
+
+    # Report clonal complex if one was associated with ST:
+    if clpx != "":
+        result_file.write("Clonal complex: {}\n".format(clpx))
+
+    # Write tsv table header
+    table_header = ["Locus", "Identity", "Coverage", "Alignment Length", "Allele Length", "Gaps", "Allele"]
+    table_file.write("\t".join(table_header) + "\n")
+    rows = []
+    for locus, allele_info in allele_matches.items():
+
+        identity = str(allele_info["identity"])
+        coverage = str(allele_info["coverage"])
+        allele = allele_info["allele"]
+        allele_name = allele_info["allele_name"]
+        align_len = str(allele_info["align_len"])
+        sbj_len = str(allele_info["sbj_len"])
+        gaps = str(allele_info["gaps"])
+
+        # Write alleles names with indications of imperfect hits
+        if allele_name != "No hit found":
+            allele_name_w_mark = locus + "_" + allele
+        else:
+            allele_name_w_mark = allele_name          
+        
+        # Write allele results to tsv table
+        row = [locus, identity, coverage, align_len, sbj_len, gaps, allele_name_w_mark]
+        rows.append(row)
+        if "alternative_hit" in allele_info:
+            for allele_name, dic in allele_info["alternative_hit"].items():
+                row = [locus, identity, coverage, str(dic["align_len"]), str(dic["sbj_len"]), "0", allele_name + "!"]
+                rows.append(row)                
+        #
+
+        if allele_name == "No hit found":
+            continue
+
+        # Write query fasta output
+        hit_name = allele_info["hit_name"]
+        query_seq = query_aligns[scheme][hit_name]
+        sbjct_seq = sbjct_aligns[scheme][hit_name] 
+        homol_seq = homol_aligns[scheme][hit_name]
+
+        if allele_info["match_priority"] == 1:
+            match = "PERFECT MATCH"
+        else:
+            match = "WARNING"
+        header = ">{}:{} ID:{}% COV:{}% Best_match:{}\n".format(locus, match, allele_info["identity"], 
+                                                  allele_info["coverage"], allele_info["allele_name"])
+        query_file.write(header)
+        for i in range(0,len(query_seq),60):
+            query_file.write(query_seq[i:i+60] + "\n")
+
+        # Write template fasta output
+        header = ">{}\n".format(allele_info["allele_name"])
+        sbjct_file.write(header)
+        for i in range(0,len(sbjct_seq),60):
+            sbjct_file.write(sbjct_seq[i:i+60] + "\n")
+
+        if "alternative_hit" in allele_info:
+            for allele_name in allele_info["alternative_hit"]:
+                header = ">{}:{} ID:{}% COV:{}% Best_match:{}\n".format(locus, "PERFECT MATCH", 100, 
+                                                                        100, allele_name)
+                hit_name = allele_info["alternative_hit"][allele_name]["hit_name"]
+                query_seq = query_aligns[scheme][hit_name]
+                sbjct_seq = sbjct_aligns[scheme][hit_name] 
+                homol_seq = homol_aligns[scheme][hit_name]
+                query_file.write(header)
+                for i in range(0,len(query_seq),60):
+                    query_file.write(query_seq[i:i+60] + "\n")
+
+                # Write template fasta output
+                header = ">{}\n".format(allele_name)
+                sbjct_file.write(header)
+                for i in range(0,len(sbjct_seq),60):
+                    sbjct_file.write(sbjct_seq[i:i+60] + "\n")
+
+    # Write Allele profile results tables in results file and table file
+    rows.sort(key=lambda x: x[0])
+    result_file.write(text_table(table_header, rows))
+    for row in rows:
+        table_file.write("\t".join(row) + "\n")
+    # Write any notes
+    if note != "":
+       result_file.write("\nNotes: {}\n\n".format(note))
+
+    # Write allignment output
+    result_file.write("\n\nExtended Output:\n\n")
+    make_aln(scheme, result_file, allele_matches, query_aligns, homol_aligns, sbjct_aligns)
+
+    # Close all files
+    query_file.close()
+    sbjct_file.close()
+    table_file.close()
+    result_file.close()
+
+if args.quiet:
+    f.close()
author	dcouvin
date	Mon, 06 Sep 2021 18:27:45 +0000
parents
children