comparison srst2.py @ 0:6f870ed59b6e draft

Uploaded

| author | nml |
|---|---|
| date | Mon, 06 Feb 2017 12:31:04 -0500 |
| parents | |
| children | |
| parent revision | this revision |
|---|---|
| -1:000000000000 | 0:6f870ed59b6e |
```python
#!/usr/bin/env python

# SRST2 - Short Read Sequence Typer (v2)
# Python Version 2.7.5
#
# Authors - Michael Inouye (minouye@unimelb.edu.au), Harriet Dashnow (h.dashnow@gmail.com),
#           Kathryn Holt (kholt@unimelb.edu.au), Bernie Pope (bjpope@unimelb.edu.au)
#
# see LICENSE.txt for the license
#
# Dependencies:
#    bowtie2   http://bowtie-bio.sourceforge.net/bowtie2/index.shtml   version 2.1.0
#    SAMtools  http://samtools.sourceforge.net   Version: 0.1.18 (Version: 0.1.19 DOES NOT WORK - loss of edge coverage)
#    SciPy     http://www.scipy.org/install.html
#
# Git repository: https://github.com/katholt/srst2/
# README: https://github.com/katholt/srst2/blob/master/README.md
# Questions or feature requests: https://github.com/katholt/srst2/issues
# Manuscript: http://biorxiv.org/content/early/2014/06/26/006627


from argparse import (ArgumentParser, FileType)
import logging
from subprocess import call, check_output, CalledProcessError, STDOUT
import os, sys, re, collections, operator
from scipy.stats import binom_test, linregress
from math import log
from itertools import groupby
from operator import itemgetter
from collections import OrderedDict
try:
    from version import srst2_version
except:
    srst2_version = "version unknown"

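# Number of bases at each end of an allele over which "edge depth" is
# averaged when scoring (see read_pileup_data / score_alleles below).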
edge_a = edge_z = 2


def parse_args():
    "Parse the input arguments, use '-h' for help."

    parser = ArgumentParser(description='SRST2 - Short Read Sequence Typer (v2)')

    # version number of srst2, print and then exit
    parser.add_argument('--version', action='version', version='%(prog)s ' + srst2_version)

    # Read inputs
    parser.add_argument(
        '--input_se', nargs='+', type=str, required=False,
        help='Single end read file(s) for analysing (may be gzipped)')
    parser.add_argument(
        '--input_pe', nargs='+', type=str, required=False,
        help='Paired end read files for analysing (may be gzipped)')
    parser.add_argument(
        '--forward', type=str, required=False, default="_1",
        help='Designator for forward reads (only used if NOT in MiSeq format sample_S1_L001_R1_001.fastq.gz; otherwise default is _1, i.e. expect forward reads as sample_1.fastq.gz)')
    parser.add_argument(
        '--reverse', type=str, required=False, default="_2",
        help='Designator for reverse reads (only used if NOT in MiSeq format sample_S1_L001_R2_001.fastq.gz; otherwise default is _2, i.e. expect reverse reads as sample_2.fastq.gz)')
    parser.add_argument('--read_type', type=str, choices=['q', 'qseq', 'f'], default='q',
        help='Read file type (for bowtie2; default is q=fastq; other options: qseq=solexa, f=fasta).')

    # MLST parameters
    parser.add_argument('--mlst_db', type=str, required=False, nargs=1, help='Fasta file of MLST alleles (optional)')
    parser.add_argument('--mlst_delimiter', type=str, required=False,
        help='Character(s) separating gene name from allele number in MLST database (default "-", as in arcc-1)', default="-")
    parser.add_argument('--mlst_definitions', type=str, required=False,
        help='ST definitions for MLST scheme (required if mlst_db supplied and you want to calculate STs)')
    parser.add_argument('--mlst_max_mismatch', type=str, required=False, default="10",
        help='Maximum number of mismatches per read for MLST allele calling (default 10)')

    # Gene database parameters
    parser.add_argument('--gene_db', type=str, required=False, nargs='+', help='Fasta file/s for gene databases (optional)')
    parser.add_argument('--no_gene_details', action="store_false", required=False, help='Switch OFF verbose reporting of gene typing')
    parser.add_argument('--gene_max_mismatch', type=str, required=False, default="10",
        help='Maximum number of mismatches per read for gene detection and allele calling (default 10)')

    # Cutoffs for scoring/heuristics
    parser.add_argument('--min_coverage', type=float, required=False, help='Minimum %%coverage cutoff for gene reporting (default 90)', default=90)
    parser.add_argument('--max_divergence', type=float, required=False, help='Maximum %%divergence cutoff for gene reporting (default 10)', default=10)
    parser.add_argument('--min_depth', type=float, required=False, help='Minimum mean depth to flag as dubious allele call (default 5)', default=5)
    parser.add_argument('--min_edge_depth', type=float, required=False, help='Minimum edge depth to flag as dubious allele call (default 2)', default=2)
    parser.add_argument('--prob_err', type=float, default=0.01, help='Probability of sequencing error (default 0.01)')

    # Mapping parameters for bowtie2
    parser.add_argument('--stop_after', type=str, required=False, help='Stop mapping after this number of reads have been mapped (otherwise map all)')
    parser.add_argument('--other', type=str, help='Other arguments to pass to bowtie2.', required=False)

    # Samtools parameters
    parser.add_argument('--mapq', type=int, default=1, help='Samtools -q parameter (default 1)')
    parser.add_argument('--baseq', type=int, default=20, help='Samtools -Q parameter (default 20)')

    # Reporting options
    parser.add_argument('--output', type=str, required=True, help='Prefix for srst2 output files')
    parser.add_argument('--log', action="store_true", required=False, help='Switch ON logging to file (otherwise log to stdout)')
    parser.add_argument('--save_scores', action="store_true", required=False, help='Switch ON verbose reporting of all scores')
    parser.add_argument('--report_new_consensus', action="store_true", required=False, help='If a matching allele is not found, report the consensus allele. Note, only SNP differences are considered, not indels.')
    parser.add_argument('--report_all_consensus', action="store_true", required=False, help='Report the consensus allele for the most likely allele. Note, only SNP differences are considered, not indels.')

    # Run options
    parser.add_argument('--use_existing_pileup', action="store_true", required=False,
        help='Use existing pileups if available, otherwise they will be generated')  # to facilitate testing of rescoring from pileups
    parser.add_argument('--use_existing_scores', action="store_true", required=False,
        help='Use existing scores files if available, otherwise they will be generated')  # to facilitate testing of reporting from scores
    parser.add_argument('--keep_interim_alignment', action="store_true", required=False, default=False,
        help='Keep interim files (sam & unsorted bam), otherwise they will be deleted after sorted bam is created')  # to facilitate testing of sam processing
    # parser.add_argument('--keep_final_alignment', action="store_true", required=False, default=False,
    #     help='Keep interim files (sam & unsorted bam), otherwise they will be deleted after sorted bam is created')  # to facilitate testing of sam processing

    # Compile previous output files
    parser.add_argument('--prev_output', nargs='+', type=str, required=False,
        help='SRST2 results files to compile (any new results from this run will also be incorporated)')

    return parser.parse_args()


# Exception to raise if the command we try to run fails for some reason
class CommandError(Exception):
    pass


def run_command(command, **kwargs):
    'Execute a shell command and check the exit status and any O/S exceptions'
    command_str = ' '.join(command)
    logging.info('Running: {}'.format(command_str))
    try:
        exit_status = call(command, **kwargs)
    except OSError as e:
        message = "Command '{}' failed due to O/S error: {}".format(command_str, str(e))
        raise CommandError({"message": message})
    if exit_status != 0:
        message = "Command '{}' failed with non-zero exit status: {}".format(command_str, exit_status)
        raise CommandError({"message": message})


def bowtie_index(fasta_files):
    'Build a bowtie2 index from the given input fasta(s)'

    # check that bowtie has the right version
    check_command_version(['bowtie2', '--version'],
                'bowtie2-align version 2.1.0',
                'bowtie',
                '2.1.0')

    for fasta in fasta_files:
        built_index = fasta + '.1.bt2'
        if os.path.exists(built_index):
            logging.info('Index for {} is already built...'.format(fasta))
        else:
            logging.info('Building bowtie2 index for {}...'.format(fasta))
            run_command(['bowtie2-build', fasta, fasta])


def modify_bowtie_sam(raw_bowtie_sam, max_mismatch):
    # fix sam flags for comprehensive pileup
    with open(raw_bowtie_sam) as sam, open(raw_bowtie_sam + '.mod', 'w') as sam_mod:
        for line in sam:
            if not line.startswith('@'):
                fields = line.split('\t')
                flag = int(fields[1])
                flag = (flag - 256) if (flag & 256) else flag
                m = re.search("NM:i:(\d+)\s", line)
                if m != None:
                    num_mismatch = m.group(1)
                    if int(num_mismatch) <= int(max_mismatch):
                        sam_mod.write('\t'.join([fields[0], str(flag)] + fields[2:]))
                else:
                    logging.info('Excluding read from SAM file due to missing NM (num mismatches) field: ' + fields[0])
                    num_mismatch = 0
            else:
                sam_mod.write(line)
    return(raw_bowtie_sam, raw_bowtie_sam + '.mod')
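
# Illustrative note (not part of the original source): SAM flag bit 256 marks
# "secondary" alignments; modify_bowtie_sam() clears it so that samtools will
# count every alignment bowtie2 reported with -a in the pileup, e.g.:
#   >>> flag = 272          # 256 (secondary) + 16 (reverse strand)
#   >>> (flag - 256) if (flag & 256) else flag
#   16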


def parse_fai(fai_file, db_type, delimiter):
    '''Get sequence lengths for reference alleles - important for scoring.
    Also get gene names, required if no MLST definitions provided.'''
    size = {}
    gene_clusters = []  # for gene DBs, this is cluster ID
    allele_symbols = []
    gene_cluster_symbols = {}  # key = cluster ID, value = gene symbol (for gene DBs)
    unique_allele_symbols = True
    unique_gene_symbols = True
    delimiter_check = []  # list of names that may violate the MLST delimiter supplied
    with open(fai_file) as fai:
        for line in fai:
            fields = line.split('\t')
            name = fields[0]  # full allele name
            size[name] = int(fields[1])  # store length
            if db_type != "mlst":
                allele_info = name.split()[0].split("__")
                if len(allele_info) > 2:
                    gene_cluster = allele_info[0]  # ID number for the cluster
                    cluster_symbol = allele_info[1]  # gene name for the cluster
                    name = allele_info[2]  # specific allele name
                    if gene_cluster in gene_cluster_symbols:
                        if gene_cluster_symbols[gene_cluster] != cluster_symbol:
                            unique_gene_symbols = False  # already seen this cluster symbol
                            logging.info("Non-unique: " + gene_cluster + ", " + cluster_symbol)
                    else:
                        gene_cluster_symbols[gene_cluster] = cluster_symbol
                else:
                    # treat as unclustered database, use whole header
                    gene_cluster = cluster_symbol = name
            else:
                gene_cluster = name.split(delimiter)[0]  # accept gene clusters raw for mlst
                # check if the delimiter makes sense
                parts = name.split(delimiter)
                if len(parts) != 2:
                    delimiter_check.append(name)
                else:
                    try:
                        x = int(parts[1])
                    except:
                        delimiter_check.append(name)

            # check if we have seen this allele name before
            if name in allele_symbols:
                unique_allele_symbols = False  # already seen this allele name
            allele_symbols.append(name)

            # record gene (cluster):
            if gene_cluster not in gene_clusters:
                gene_clusters.append(gene_cluster)

    if len(delimiter_check) > 0:
        print "Warning! MLST delimiter is " + delimiter + " but these genes may violate the pattern and cause problems:"
        print ",".join(delimiter_check)

    return size, gene_clusters, unique_gene_symbols, unique_allele_symbols, gene_cluster_symbols
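
# Illustrative note (hypothetical header, not from the original source): for a
# clustered gene DB, a fasta name like "173__blaTEM__blaTEM-1__1" splits on
# "__" into cluster ID "173", cluster symbol "blaTEM", allele "blaTEM-1" and
# unique sequence ID "1":
#   >>> "173__blaTEM__blaTEM-1__1".split("__")
#   ['173', 'blaTEM', 'blaTEM-1', '1']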


def read_pileup_data(pileup_file, size, prob_err, consensus_file=""):
    with open(pileup_file) as pileup:
        prob_success = 1 - prob_err  # Set by user, default is prob_err = 0.01
        hash_alignment = {}
        hash_max_depth = {}
        hash_edge_depth = {}
        max_depth = 1
        avg_depth_allele = {}
        next_to_del_depth_allele = {}
        coverage_allele = {}
        mismatch_allele = {}
        indel_allele = {}
        missing_allele = {}
        size_allele = {}

        # Split all lines in the pileup by whitespace
        pileup_split = (x.split() for x in pileup)
        # Group the split lines based on the first field (allele)
        for allele, lines in groupby(pileup_split, itemgetter(0)):

            # Reset variables for new allele
            allele_line = 1  # Keep track of line for this allele
            exp_nuc_num = 0  # Expected position in ref allele
            allele_size = size[allele]
            total_depth = 0
            depth_a = depth_z = 0
            position_depths = [0] * allele_size  # store depths in case required for penalties; then we don't need to track total_missing_bases
            hash_alignment[allele] = []
            total_missing_bases = 0
            total_mismatch = 0
            ins_poscount = 0
            del_poscount = 0
            next_to_del_depth = 99999
            consensus_seq = ""

            for fields in lines:
                # Parse this line and store details required for scoring
                nuc_num = int(fields[1])  # Actual position in ref allele
                exp_nuc_num += 1
                allele_line += 1
                nuc = fields[2]
                nuc_depth = int(fields[3])
                position_depths[nuc_num-1] = nuc_depth
                if len(fields) <= 5:
                    aligned_bases = ''
                else:
                    aligned_bases = fields[4]

                # Missing bases (pileup skips basepairs)
                if nuc_num > exp_nuc_num:
                    total_missing_bases += abs(exp_nuc_num - nuc_num)
                exp_nuc_num = nuc_num
                if nuc_depth == 0:
                    total_missing_bases += 1

                # Calculate depths for this position
                if nuc_num <= edge_a:
                    depth_a += nuc_depth
                if abs(nuc_num - allele_size) < edge_z:
                    depth_z += nuc_depth
                if nuc_depth > max_depth:
                    hash_max_depth[allele] = nuc_depth
                    max_depth = nuc_depth

                total_depth = total_depth + nuc_depth

                # Parse aligned bases list for this position in the pileup
                num_match = 0
                ins_readcount = 0
                del_readcount = 0
                nuc_counts = {}

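                # Background (not part of the original source): mpileup's base
                # string (column 5) encodes one symbol per covering read:
                #   "." / ","      match on forward / reverse strand
                #   A,C,G,T        mismatch (the substituted base)
                #   "^X"           read start; X is mapping quality, skipped below
                #   "+n.."/"-n.."  insertion/deletion of n bases after this
                #                  position (this parser assumes single-digit n)
                #   "*"            base deleted in this read
                #   "$"            read end (ignored here)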
                i = 0
                while i < len(aligned_bases):

                    if aligned_bases[i] == "^":
                        # Signifies start of a read, next char is mapping quality (skip it)
                        i += 2
                        continue

                    if aligned_bases[i] == "+":
                        i += int(aligned_bases[i+1]) + 2  # skip to next read
                        ins_readcount += 1
                        continue

                    if aligned_bases[i] == "-":
                        i += int(aligned_bases[i+1]) + 2  # skip to next read
                        continue

                    if aligned_bases[i] == "*":
                        i += 1  # skip to next read
                        del_readcount += 1
                        continue

                    if aligned_bases[i] == "." or aligned_bases[i] == ",":
                        num_match += 1
                        i += 1
                        continue

                    elif aligned_bases[i].upper() in "ATCG":
                        this_nuc = aligned_bases[i].upper()
                        if this_nuc not in nuc_counts:
                            nuc_counts[this_nuc] = 0
                        nuc_counts[this_nuc] += 1

                    i += 1

                # Save the most common nucleotide at this position
                consensus_nuc = nuc  # by default use reference nucleotide
                max_freq = num_match  # Number of bases matching the reference
                for nucleotide in nuc_counts:
                    if nuc_counts[nucleotide] > max_freq:
                        consensus_nuc = nucleotide
                        max_freq = nuc_counts[nucleotide]
                consensus_seq += (consensus_nuc)

                # Calculate details of this position for scoring and reporting

                # mismatches and indels
                num_mismatch = nuc_depth - num_match
                if num_mismatch > num_match:
                    total_mismatch += 1  # record as mismatch (could be a snp or deletion)
                if del_readcount > num_match:
                    del_poscount += 1
                if ins_readcount > nuc_depth / 2:
                    ins_poscount += 1

                # Hash for later processing
                hash_alignment[allele].append((num_match, num_mismatch, prob_success))  # snp or deletion
                if ins_readcount > 0:
                    hash_alignment[allele].append((nuc_depth - ins_readcount, ins_readcount, prob_success))  # penalize for any insertion calls at this position

            # Determine the consensus sequence if required
            if consensus_file != "":
                if consensus_file.split(".")[-2] == "new_consensus_alleles":
                    consensus_type = "variant"
                elif consensus_file.split(".")[-2] == "all_consensus_alleles":
                    consensus_type = "consensus"
                with open(consensus_file, "a") as consensus_outfile:
                    consensus_outfile.write(">{0}.{1} {2}\n".format(allele, consensus_type, pileup_file.split(".")[1].split("__")[1]))
                    outstring = consensus_seq + "\n"
                    consensus_outfile.write(outstring)

            # Finished reading pileup for this allele

            # Check for missing bases at the end of the allele
            if nuc_num < allele_size:
                total_missing_bases += abs(allele_size - nuc_num)
                # determine penalty based on coverage of last 2 bases
                penalty = float(position_depths[nuc_num-1] + position_depths[nuc_num-2]) / 2
                m = min(position_depths[nuc_num-1], position_depths[nuc_num-2])
                hash_alignment[allele].append((0, penalty, prob_success))
                if next_to_del_depth > m:
                    next_to_del_depth = m  # keep track of lowest near-del depth for reporting

            # Calculate allele summary stats and save
            avg_depth = round(total_depth / float(allele_line), 3)
            avg_a = depth_a / float(edge_a)  # Avg depth at 5' end, num basepairs determined by edge_a
            avg_z = depth_z / float(edge_z)  # 3'
            hash_max_depth[allele] = max_depth
            hash_edge_depth[allele] = (avg_a, avg_z)
            min_penalty = max(5, int(avg_depth))
            coverage_allele[allele] = 100 * (allele_size - total_missing_bases - del_poscount) / float(allele_size)  # includes in-read deletions
            mismatch_allele[allele] = total_mismatch - del_poscount  # snps only
            indel_allele[allele] = del_poscount + ins_poscount  # insertions or deletions
            missing_allele[allele] = total_missing_bases  # truncated bases
            size_allele[allele] = allele_size

            # Penalize truncations or large deletions (i.e. positions not covered in pileup)
            j = 0
            while j < (len(position_depths) - 2):
                # (note end-of-seq truncations are dealt with above)
                if position_depths[j] == 0 and position_depths[j+1] != 0:
                    penalty = float(position_depths[j+1] + position_depths[j+2]) / 2  # mean of next 2 bases
                    hash_alignment[allele].append((0, penalty, prob_success))
                    m = min(position_depths[j+1], position_depths[j+2])
                    if next_to_del_depth > m:
                        next_to_del_depth = m  # keep track of lowest near-del depth for reporting
                j += 1

            # Store depth info for reporting
            avg_depth_allele[allele] = avg_depth
            if next_to_del_depth == 99999:
                next_to_del_depth = "NA"
            next_to_del_depth_allele[allele] = next_to_del_depth

    return hash_alignment, hash_max_depth, hash_edge_depth, avg_depth_allele, coverage_allele, mismatch_allele, indel_allele, missing_allele, size_allele, next_to_del_depth_allele


def score_alleles(args, mapping_files_pre, hash_alignment, hash_max_depth, hash_edge_depth,
        avg_depth_allele, coverage_allele, mismatch_allele, indel_allele, missing_allele,
        size_allele, next_to_del_depth_allele, run_type):

    if args.save_scores:
        scores_output = file(mapping_files_pre + '.scores', 'w')
        scores_output.write("Allele\tScore\tAvg_depth\tEdge1_depth\tEdge2_depth\tPercent_coverage\tSize\tMismatches\tIndels\tTruncated_bases\tDepthNeighbouringTruncation\tmaxMAF\tLeastConfident_Rate\tLeastConfident_Mismatches\tLeastConfident_Depth\tLeastConfident_Pvalue\n")

    scores = {}  # key = allele, value = score
    mix_rates = {}  # key = allele, value = highest minor allele frequency, 0 -> 0.5

    for allele in hash_alignment:
        if (run_type == "mlst") or (coverage_allele[allele] > args.min_coverage):
            pvals = []
            min_pval = 1.0
            min_pval_data = (999, 999)  # (mismatch, depth) for position with lowest p-value
            mix_rate = 0  # highest minor allele frequency 0 -> 0.5
            for nuc_info in hash_alignment[allele]:
                if nuc_info is not None:
                    match, mismatch, prob_success = nuc_info
                    if match > 0 or mismatch > 0:
                        if mismatch == 0:
                            p_value = 1.0
                        else:
                            p_value = binom_test([match, mismatch], None, prob_success)
                        # Weight pvalue by (depth/max_depth)
                        max_depth = hash_max_depth[allele]
                        weight = (match + mismatch) / float(max_depth)
                        p_value *= weight
                        if p_value < min_pval:
                            min_pval = p_value
                            min_pval_data = (mismatch, match + mismatch)
                        if p_value > 0:
                            p_value = -log(p_value, 10)
                        else:
                            p_value = 1000
                        pvals.append(p_value)
                        mismatch_prop = float(match) / float(match + mismatch)
                        if min(mismatch_prop, 1 - mismatch_prop) > mix_rate:
                            mix_rate = min(mismatch_prop, 1 - mismatch_prop)
            # Fit linear model to observed Pval distribution vs expected Pval distribution (QQ plot)
            pvals.sort(reverse=True)
            len_obs_pvals = len(pvals)
            exp_pvals = range(1, len_obs_pvals + 1)
            exp_pvals2 = [-log(float(ep) / (len_obs_pvals + 1), 10) for ep in exp_pvals]

            # Slope is score
            slope, _intercept, _r_value, _p_value, _std_err = linregress(exp_pvals2, pvals)

            # Store all scores for later processing
            scores[allele] = slope
            mix_rates[allele] = mix_rate

            # print scores for each allele, if requested
            if args.save_scores:
                if allele in hash_edge_depth:
                    start_depth, end_depth = hash_edge_depth[allele]
                    edge_depth_str = str(start_depth) + '\t' + str(end_depth)
                else:
                    edge_depth_str = "NA\tNA"
                this_depth = avg_depth_allele.get(allele, "NA")
                this_coverage = coverage_allele.get(allele, "NA")
                this_mismatch = mismatch_allele.get(allele, "NA")
                this_indel = indel_allele.get(allele, "NA")
                this_missing = missing_allele.get(allele, "NA")
                this_size = size_allele.get(allele, "NA")
                this_next_to_del_depth = next_to_del_depth_allele.get(allele, "NA")
                scores_output.write('\t'.join([allele, str(slope), str(this_depth), edge_depth_str,
                        str(this_coverage), str(this_size), str(this_mismatch), str(this_indel), str(this_missing), str(this_next_to_del_depth), str(mix_rate), str(float(min_pval_data[0]) / min_pval_data[1]), str(min_pval_data[0]), str(min_pval_data[1]), str(min_pval)]) + '\n')

    if args.save_scores:
        scores_output.close()

    return(scores, mix_rates)
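
# Illustrative note (not part of the original source): each per-position tuple
# is tested against the expected sequencing-error rate, e.g.
# binom_test([48, 2], None, 0.99) asks how surprising 2 mismatching reads out
# of 50 are if bases match the allele with probability 0.99. Positions far
# less likely than chance inflate the QQ-plot slope, so lower slopes indicate
# better-supported alleles (parse_scores picks the lowest-scoring allele).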


# Check that an acceptable version of a command is installed
# Exits the program if it can't be found.
# - command_list is the command to run to determine the version.
# - version_identifier is the unique string we look for in the stdout of the program.
# - command_name is the name of the command to show in error messages.
# - required_version is the version number to show in error messages.
def check_command_version(command_list, version_identifier, command_name, required_version):
    try:
        command_stdout = check_output(command_list, stderr=STDOUT)
    except OSError as e:
        logging.error("Failed command: {}".format(' '.join(command_list)))
        logging.error(str(e))
        logging.error("Could not determine the version of {}.".format(command_name))
        logging.error("Do you have {} installed in your PATH?".format(command_name))
        exit(-1)
    except CalledProcessError as e:
        # some programs such as samtools return a non-zero exit status
        # when you ask for the version (sigh). We ignore it here.
        command_stdout = e.output

    if version_identifier not in command_stdout:
        logging.error("Incorrect version of {} installed.".format(command_name))
        logging.error("{} version {} is required by SRST2.".format(command_name, required_version))
        exit(-1)


def run_bowtie(mapping_files_pre, sample_name, fastqs, args, db_name, db_full_path):

    print "Starting mapping with bowtie2"

    # check that both bowtie and samtools have the right versions
    check_command_version(['bowtie2', '--version'],
                'bowtie2-align version 2.1.0',
                'bowtie',
                '2.1.0')

    check_command_version(['samtools'],
                'Version: 0.1.18',
                'samtools',
                '0.1.18')

    command = ['bowtie2']

    if len(fastqs) == 1:
        # single end
        command += ['-U', fastqs[0]]
    elif len(fastqs) == 2:
        # paired end
        command += ['-1', fastqs[0], '-2', fastqs[1]]

    sam = mapping_files_pre + ".sam"
    logging.info('Output prefix set to: ' + mapping_files_pre)

    command += ['-S', sam,
        '-' + args.read_type,  # add a dash to the front of the option
        '--very-sensitive-local',
        '--no-unal',
        '-a',  # Search for and report all alignments
        '-x', db_full_path  # The index to be aligned to
    ]

    if args.stop_after:
        try:
            command += ['-u', str(int(args.stop_after))]
        except ValueError:
| 561 print "WARNING. You asked to stop after mapping '" + args.stop_after + "' reads. I don't understand this, and will map all reads. Please speficy an integer with --stop_after or leave this as default to map 1 million reads." | |

    if args.other:
        command += args.other.split()

    logging.info('Aligning reads to index {} using bowtie2...'.format(db_full_path))

    run_command(command)

    return(sam)
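
# Illustrative note (hypothetical file names, not from the original source):
# for a paired-end sample with default options, the assembled command is e.g.
#   bowtie2 -1 sample_1.fastq.gz -2 sample_2.fastq.gz \
#       -S out__sample.resistance.sam -q --very-sensitive-local --no-unal -a \
#       -x resistance.fasta
# -a makes bowtie2 report ALL alignments, which is what lets SRST2 score every
# candidate allele rather than just the single best hit per read.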

def get_pileup(args, mapping_files_pre, raw_bowtie_sam, bowtie_sam_mod, fasta, pileup_file):
    # Analyse output with SAMtools
    logging.info('Processing Bowtie2 output with SAMtools...')
    logging.info('Generate and sort BAM file...')
    out_file_bam = mapping_files_pre + ".unsorted.bam"
    run_command(['samtools', 'view', '-b', '-o', out_file_bam,
                 '-q', str(args.mapq), '-S', bowtie_sam_mod])
    out_file_bam_sorted = mapping_files_pre + ".sorted"
    run_command(['samtools', 'sort', out_file_bam, out_file_bam_sorted])

    # Delete interim files (sam, modified sam, unsorted bam) unless otherwise specified.
    # Note users may also want to delete final sorted bam and pileup on completion to save space.
    if not args.keep_interim_alignment:
        logging.info('Deleting sam and bam files that are no longer needed...')
        del_filenames = [raw_bowtie_sam, bowtie_sam_mod, out_file_bam]
        for f in del_filenames:
            logging.info('Deleting ' + f)
            os.remove(f)

    logging.info('Generate pileup...')
    with open(pileup_file, 'w') as sam_pileup:
        run_command(['samtools', 'mpileup', '-L', '1000', '-f', fasta,
                     '-Q', str(args.baseq), '-q', str(args.mapq), out_file_bam_sorted + '.bam'],
                    stdout=sam_pileup)
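
# Illustrative note (hypothetical file names, not from the original source):
# with the default --mapq 1 / --baseq 20 this is equivalent to the following
# samtools 0.1.18 pipeline, where "sort" takes an output PREFIX, not a name:
#   samtools view -b -o out.unsorted.bam -q 1 -S out.sam.mod
#   samtools sort out.unsorted.bam out.sorted
#   samtools mpileup -L 1000 -f ref.fasta -Q 20 -q 1 out.sorted.bam > out.pileup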

def calculate_ST(allele_scores, ST_db, gene_names, sample_name, mlst_delimiter, avg_depth_allele, mix_rates):
    allele_numbers = []  # clean allele calls for determining ST. order is taken from gene names, as in ST definitions file
    alleles_with_flags = []  # flagged alleles for printing (* if mismatches, ? if depth issues)
    mismatch_flags = []  # allele/diffs
    uncertainty_flags = []  # allele/uncertainty
    # st_flags = []  # (* if mismatches, ? if depth issues)
    depths = []  # depths for each typed locus
    mafs = []  # minor allele frequencies for each typed locus

    # get allele numbers & info
    for gene in gene_names:
        if gene in allele_scores:
            (allele, diffs, depth_problem, divergence) = allele_scores[gene]
            allele_number = allele.split(mlst_delimiter)[-1]
            depths.append(avg_depth_allele[allele])
            mix_rate = mix_rates[allele]
            mafs.append(mix_rate)
        else:
            allele_number = "-"
            diffs = ""
            depth_problem = ""
            mix_rate = ""
        allele_numbers.append(allele_number)

        allele_with_flags = allele_number
        if diffs != "":
            if diffs != "trun":
                allele_with_flags += "*"  # trun indicates only that a truncated form had lower score, which isn't a mismatch
            mismatch_flags.append(allele + "/" + diffs)
        if depth_problem != "":
            allele_with_flags += "?"
            uncertainty_flags.append(allele + "/" + depth_problem)
        alleles_with_flags.append(allele_with_flags)

    # calculate ST (no flags)
    if ST_db:
        allele_string = " ".join(allele_numbers)  # for determining ST
        try:
            clean_st = ST_db[allele_string]
        except KeyError:
            print "This combination of alleles was not found in the sequence type database:",
            print sample_name,
            for gene in allele_scores:
                (allele, diffs, depth_problems, divergence) = allele_scores[gene]
                print allele,
            print
            clean_st = "NF"
    else:
        clean_st = "ND"

    # add flags for reporting
    st = clean_st
    if len(mismatch_flags) > 0:
        if mismatch_flags != ["trun"]:
            st += "*"  # trun indicates only that a truncated form had lower score, which isn't a mismatch
    else:
        mismatch_flags = ['0']  # record no mismatches
    if len(uncertainty_flags) > 0:
        st += "?"
    else:
        uncertainty_flags = ['-']

    # mean depth across loci
    if len(depths) > 0:
        mean_depth = float(sum(depths)) / len(depths)
    else:
        mean_depth = 0

    # maximum maf across loci
    if len(mafs) > 0:
        max_maf = max(mafs)
    else:
        max_maf = 0

    return (st, clean_st, alleles_with_flags, mismatch_flags, uncertainty_flags, mean_depth, max_maf)


def parse_ST_database(ST_filename, gene_names_from_fai):
    # Read ST definitions
    ST_db = {}  # key = allele string, value = ST
    gene_names = []
    num_gene_cols_expected = len(gene_names_from_fai)
    print "Attempting to read " + str(num_gene_cols_expected) + " loci from ST database " + ST_filename
    with open(ST_filename) as f:
        count = 0
        for line in f:
            count += 1
            line_split = line.rstrip().split("\t")
            if count == 1:  # Header
                gene_names = line_split[1:min(num_gene_cols_expected + 1, len(line_split))]
                for g in gene_names_from_fai:
                    if g not in gene_names:
                        print "Warning: gene " + g + " in database file isn't among the columns in the ST definitions: " + ",".join(gene_names)
| 689 print " Any sequences with this gene identifer from the database will not be included in typing." | |
                        if len(line_split) == num_gene_cols_expected + 1:
                            gene_names.pop()  # we read too many columns
                            num_gene_cols_expected -= 1
                for g in gene_names:
                    if g not in gene_names_from_fai:
                        print "Warning: gene " + g + " in ST definitions file isn't among those in the database " + ",".join(gene_names_from_fai)
                        print " This will result in all STs being called as unknown (but allele calls will be accurate for other loci)."
            else:
                ST = line_split[0]
                if ST not in ST_db.values():
                    ST_string = " ".join(line_split[1:num_gene_cols_expected + 1])
                    ST_db[ST_string] = ST
                else:
                    print "Warning: this ST is not unique in the ST definitions file: " + ST
    print "Read ST database " + ST_filename + " successfully"
    return (ST_db, gene_names)
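
# Illustrative note (hypothetical scheme, not from the original source): for a
# 7-locus scheme, a definitions row "5\t1\t4\t1\t4\t2\t7\t1" is stored as
#   ST_db["1 4 1 4 2 7 1"] = "5"
# and calculate_ST() rebuilds the same space-joined key from the allele calls.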


def get_allele_name_from_db(allele, unique_allele_symbols, unique_cluster_symbols, run_type, args):

    if run_type != "mlst":
        # header format: >[cluster]__[gene]__[allele]__[uniqueID] [info]
        allele_parts = allele.split()
        allele_detail = allele_parts.pop(0)
        allele_info = allele_detail.split("__")

        if len(allele_info) > 2:
            cluster_id = allele_info[0]  # ID number for the cluster
            gene_name = allele_info[1]  # gene name/symbol for the cluster
            allele_name = allele_info[2]  # specific allele name
            seqid = allele_info[3]  # unique identifier for this seq
        else:
            cluster_id = gene_name = allele_name = seqid = allele_parts[0]

        if not unique_allele_symbols:
            allele_name += "_" + seqid

    else:
        gene_name = allele.split(args.mlst_delimiter)
        allele_name = allele
        seqid = None
        cluster_id = None

    return gene_name, allele_name, cluster_id, seqid


def create_allele_pileup(allele_name, all_pileup_file):
    outpileup = allele_name + "." + all_pileup_file
    with open(outpileup, 'w') as allele_pileup:
        with open(all_pileup_file) as all_pileup:
            for line in all_pileup:
                if line.split()[0] == allele_name:
                    allele_pileup.write(line)
    return outpileup


def parse_scores(run_type, args, scores, hash_edge_depth,
        avg_depth_allele, coverage_allele, mismatch_allele, indel_allele,
        missing_allele, size_allele, next_to_del_depth_allele,
        unique_cluster_symbols, unique_allele_symbols, pileup_file):

    # sort into hash for each gene locus
    scores_by_gene = collections.defaultdict(dict)  # key1 = gene, key2 = allele, value = score

    if run_type == "mlst":
        for allele in scores:
            if coverage_allele[allele] > args.min_coverage:
                allele_info = allele.split(args.mlst_delimiter)
                scores_by_gene[allele_info[0]][allele] = scores[allele]
    else:
        for allele in scores:
            if coverage_allele[allele] > args.min_coverage:
                gene_name = get_allele_name_from_db(allele, unique_allele_symbols, unique_cluster_symbols, run_type, args)[2]  # cluster ID
                scores_by_gene[gene_name][allele] = scores[allele]

    # determine best allele for each gene locus/cluster
    results = {}  # key = gene, value = (allele,diffs,depth)

    for gene in scores_by_gene:

        gene_hash = scores_by_gene[gene]
        scores_sorted = sorted(gene_hash.iteritems(), key=operator.itemgetter(1))  # sort by score
        (top_allele, top_score) = scores_sorted[0]

        # check if depth is adequate for confident call
        adequate_depth = False
        depth_problem = ""
        if hash_edge_depth[top_allele][0] > args.min_edge_depth and hash_edge_depth[top_allele][1] > args.min_edge_depth:
            if next_to_del_depth_allele[top_allele] != "NA":
                if float(next_to_del_depth_allele[top_allele]) > args.min_edge_depth:
                    if avg_depth_allele[top_allele] > args.min_depth:
                        adequate_depth = True
                    else:
                        depth_problem = "depth" + str(avg_depth_allele[top_allele])
                else:
                    depth_problem = "del" + str(next_to_del_depth_allele[top_allele])
            elif avg_depth_allele[top_allele] > args.min_depth:
                adequate_depth = True
            else:
                depth_problem = "depth" + str(avg_depth_allele[top_allele])
        else:
            depth_problem = "edge" + str(min(hash_edge_depth[top_allele][0], hash_edge_depth[top_allele][1]))

        # check if there are confident differences against this allele
        differences = ""
        if mismatch_allele[top_allele] > 0:
            differences += str(mismatch_allele[top_allele]) + "snp"
        if indel_allele[top_allele] > 0:
            differences += str(indel_allele[top_allele]) + "indel"
        if missing_allele[top_allele] > 0:
            differences += str(missing_allele[top_allele]) + "holes"

        divergence = float(mismatch_allele[top_allele]) / float(size_allele[top_allele] - missing_allele[top_allele])

        # check for truncated
        if differences != "" or not adequate_depth:
            # if there are SNPs or not enough depth to trust the result, no need to screen next best match
            results[gene] = (top_allele, differences, depth_problem, divergence)
        else:
            # looks good but this could be a truncated version of the real allele; check for longer versions
            truncation_override = False
            if len(scores_sorted) > 1:
                (next_best_allele, next_best_score) = scores_sorted[1]
                if size_allele[next_best_allele] > size_allele[top_allele]:
                    # next best is longer, top allele could be a truncation?
                    if (mismatch_allele[next_best_allele] + indel_allele[next_best_allele] + missing_allele[next_best_allele]) == 0:
                        # next best also has no mismatches
                        if (next_best_score - top_score) / top_score < 0.1:
                            # next best has score within 10% of this one
                            truncation_override = True
            if truncation_override:
                results[gene] = (next_best_allele, "trun", "", divergence)  # no diffs but report this call is based on truncation test
            else:
                results[gene] = (top_allele, "", "", divergence)  # no caveats to report

        # Check if there are any potential new alleles
        #depth_problem = results[gene][2]
        #divergence = results[gene][3]
        if depth_problem == "" and divergence > 0:
            new_allele = True
            # Get the consensus for this new allele and write it to file
            if args.report_new_consensus or args.report_all_consensus:
                new_alleles_filename = args.output + ".new_consensus_alleles.fasta"
                allele_pileup_file = create_allele_pileup(top_allele, pileup_file)  # XXX Creates a new pileup file for that allele. Not currently cleaned up
                read_pileup_data(allele_pileup_file, size_allele, args.prob_err, consensus_file=new_alleles_filename)
        if args.report_all_consensus:
            new_alleles_filename = args.output + ".all_consensus_alleles.fasta"
            allele_pileup_file = create_allele_pileup(top_allele, pileup_file)
            read_pileup_data(allele_pileup_file, size_allele, args.prob_err, consensus_file=new_alleles_filename)

    return results  # (allele, diffs, depth_problem, divergence)


def get_readFile_components(full_file_path):
    (file_path, file_name) = os.path.split(full_file_path)
    m1 = re.match("(.*).gz", file_name)
    ext = ""
    if m1 != None:
        # gzipped
        ext = ".gz"
        file_name = m1.groups()[0]
    (file_name_before_ext, ext2) = os.path.splitext(file_name)
    full_ext = ext2 + ext
    return(file_path, file_name_before_ext, full_ext)
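
# Illustrative note (hypothetical path, not from the original source):
#   >>> get_readFile_components("/data/sample_1.fastq.gz")
#   ('/data', 'sample_1', '.fastq.gz')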


def read_file_sets(args):

    fileSets = {}  # key = id, value = list of files for that sample
    num_single_readsets = 0
    num_paired_readsets = 0

    if args.input_se:
        # single end
        for fastq in args.input_se:
            (file_path, file_name_before_ext, full_ext) = get_readFile_components(fastq)
            m = re.match("(.*)(_S.*)(_L.*)(_R.*)(_.*)", file_name_before_ext)
            if m == None:
                fileSets[file_name_before_ext] = [fastq]
            else:
                fileSets[m.groups()[0]] = [fastq]  # Illumina names
            num_single_readsets += 1

    elif args.input_pe:
        # paired end
        forward_reads = {}  # key = sample, value = full path to file
        reverse_reads = {}  # key = sample, value = full path to file
        num_paired_readsets = 0
        num_single_readsets = 0
        for fastq in args.input_pe:
            (file_path, file_name_before_ext, full_ext) = get_readFile_components(fastq)
            # try to match to MiSeq format:
            m = re.match("(.*)(_S.*)(_L.*)(_R.*)(_.*)", file_name_before_ext)
            if m == None:
                # not default Illumina file naming format, expect simple/ENA format
                m = re.match("(.*)(" + args.forward + ")$", file_name_before_ext)
                if m != None:
                    # store as forward read
                    (baseName, read) = m.groups()
                    forward_reads[baseName] = fastq
                else:
                    m = re.match("(.*)(" + args.reverse + ")$", file_name_before_ext)
                    if m != None:
                        # store as reverse read
                        (baseName, read) = m.groups()
                        reverse_reads[baseName] = fastq
                    else:
                        logging.info("Could not determine forward/reverse read status for input file " + fastq)
            else:
                # matches default Illumina file naming format, e.g. m.groups() = ('samplename', '_S1', '_L001', '_R1', '_001')
                baseName, read = m.groups()[0], m.groups()[3]
                if read == "_R1":
                    forward_reads[baseName] = fastq
                elif read == "_R2":
                    reverse_reads[baseName] = fastq
                else:
                    logging.info("Could not determine forward/reverse read status for input file " + fastq)
                    logging.info("  this file appears to match the MiSeq file naming convention (samplename_S1_L001_[R1]_001), but we were expecting [R1] or [R2] to designate read as forward or reverse?")
                    fileSets[file_name_before_ext] = fastq
                    num_single_readsets += 1
        # store in pairs
        for sample in forward_reads:
            if sample in reverse_reads:
                fileSets[sample] = [forward_reads[sample], reverse_reads[sample]]  # store pair
                num_paired_readsets += 1
            else:
                fileSets[sample] = [forward_reads[sample]]  # no reverse found
                num_single_readsets += 1
                logging.info('Warning, could not find pair for read:' + forward_reads[sample])
        for sample in reverse_reads:
            if sample not in fileSets:
                fileSets[sample] = reverse_reads[sample]  # no forward found
                num_single_readsets += 1
                logging.info('Warning, could not find pair for read:' + reverse_reads[sample])

    if num_paired_readsets > 0:
        logging.info('Total paired readsets found:' + str(num_paired_readsets))
    if num_single_readsets > 0:
        logging.info('Total single reads found:' + str(num_single_readsets))

    return fileSets
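
# Illustrative note (hypothetical sample name, not from the original source):
# the MiSeq-format regex above captures five groups, e.g.
#   >>> re.match("(.*)(_S.*)(_L.*)(_R.*)(_.*)", "sampleA_S1_L001_R1_001").groups()
#   ('sampleA', '_S1', '_L001', '_R1', '_001')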


def read_results_from_file(infile):

    if os.stat(infile).st_size == 0:
        logging.info("WARNING: Results file provided is empty: " + infile)
        return False, False, False

    results_info = infile.split("__")
    if len(results_info) > 1:

        if re.search("compiledResults", infile) != None:
            dbtype = "compiled"
            dbname = results_info[0]  # output identifier
        else:
            dbtype = results_info[1]  # mlst or genes
            dbname = results_info[2]  # database

        logging.info("Processing " + dbtype + " results from file " + infile)

        if dbtype == "genes":
            results = collections.defaultdict(dict)  # key1 = sample, key2 = gene, value = allele
            with open(infile) as f:
                header = []
                for line in f:
                    line_split = line.rstrip().split("\t")
                    if len(header) == 0:
                        header = line_split
                    else:
                        sample = line_split[0]
                        for i in range(1, len(line_split)):
                            gene = header[i]  # cluster_id
                            results[sample][gene] = line_split[i]

        elif dbtype == "mlst":
            results = {}  # key = sample, value = MLST string
            with open(infile) as f:
                header = 0
                for line in f:
                    if header > 0:
                        results[line.split("\t")[0]] = line.rstrip()
                        if "maxMAF" not in header:
                            results[line.split("\t")[0]] += "\tNC"  # empty column for maxMAF
                    else:
                        header = line.rstrip()
                        results[line.split("\t")[0]] = line.rstrip()  # store header line too (index "Sample")
                        if "maxMAF" not in header:
                            results[line.split("\t")[0]] += "\tmaxMAF"  # add column for maxMAF

        elif dbtype == "compiled":
            results = collections.defaultdict(dict)  # key1 = sample, key2 = gene, value = allele
            with open(infile) as f:
                header = []
                mlst_cols = 0  # INDEX of the last mlst column
                n_cols = 0
                for line in f:
                    line_split = line.rstrip().split("\t")
                    if len(header) == 0:
                        header = line_split
                        n_cols = len(header)
                        if n_cols > 1:
                            if header[1] == "ST":
                                # there is mlst data reported
                                mlst_cols = 2  # first locus column
                                while header[mlst_cols] != "depth":
                                    mlst_cols += 1
                                results["Sample"]["mlst"] = "\t".join(line_split[0:(mlst_cols + 1)])
                                results["Sample"]["mlst"] += "\tmaxMAF"  # add to mlst header even if not encountered in this file, as it may be in others
                                if header[mlst_cols + 1] == "maxMAF":
                                    mlst_cols += 1  # record maxMAF column within MLST data, if present
                            else:
                                # no mlst data reported
                                dbtype = "genes"
                                logging.info("No MLST data in compiled results file " + infile)
                        else:
                            # no mlst data reported
                            dbtype = "genes"
                            logging.info("No MLST data in compiled results file " + infile)

                    else:
                        sample = line_split[0]
                        if mlst_cols > 0:
                            results[sample]["mlst"] = "\t".join(line_split[0:(mlst_cols + 1)])
                            if "maxMAF" not in header:
                                results[sample]["mlst"] += "\t"  # add to mlst section even if not encountered in this file, as it may be in others
                        if n_cols > mlst_cols:
                            # read genes component
                            for i in range(mlst_cols + 1, n_cols):
                                # note i=1 if mlst_cols==0, ie we are reading all
                                gene = header[i]
                                if len(line_split) > i:
                                    results[sample][gene] = line_split[i]
                                else:
                                    results[sample][gene] = "-"
        else:
            results = False
            dbtype = False
            dbname = False
            logging.info("Couldn't decide what to do with the results file provided: " + infile)

    else:
        results = False
        dbtype = False
        dbname = False
        logging.info("Couldn't decide what to do with the results file provided: " + infile)

    return results, dbtype, dbname


def read_scores_file(scores_file):
    hash_edge_depth = {}
    avg_depth_allele = {}
    coverage_allele = {}
    mismatch_allele = {}
    indel_allele = {}
    missing_allele = {}
    size_allele = {}
    next_to_del_depth_allele = {}
    mix_rates = {}
    scores = {}

    f = file(scores_file, "r")

    for line in f:
        line_split = line.rstrip().split("\t")
        allele = line_split[0]
        if allele != "Allele":  # skip header row
            scores[allele] = float(line_split[1])
            mix_rates[allele] = float(line_split[11])
            avg_depth_allele[allele] = float(line_split[2])
            hash_edge_depth[allele] = (float(line_split[3]), float(line_split[4]))
            coverage_allele[allele] = float(line_split[5])
            size_allele[allele] = int(line_split[6])
            mismatch_allele[allele] = int(line_split[7])
            indel_allele[allele] = int(line_split[8])
            missing_allele[allele] = int(line_split[9])
            next_to_del_depth = line_split[10]
            next_to_del_depth_allele[allele] = line_split[10]

    return hash_edge_depth, avg_depth_allele, coverage_allele, mismatch_allele, indel_allele, \
        missing_allele, size_allele, next_to_del_depth_allele, scores, mix_rates


def run_srst2(args, fileSets, dbs, run_type):

    db_reports = []  # list of db-specific output files to return
    db_results_list = []  # list of results hashes, one per db

    for fasta in dbs:
        db_reports, db_results_list = process_fasta_db(args, fileSets, run_type, db_reports, db_results_list, fasta)

    return db_reports, db_results_list


def process_fasta_db(args, fileSets, run_type, db_reports, db_results_list, fasta):

    check_command_version(['samtools'],
                'Version: 0.1.18',
                'samtools',
                '0.1.18')

    logging.info('Processing database ' + fasta)

    db_path, db_name = os.path.split(fasta)  # database
    (db_name, db_ext) = os.path.splitext(db_name)
    db_results = "__".join([args.output, run_type, db_name, "results.txt"])
    db_report = file(db_results, "w")
    db_reports.append(db_results)

    # Get sequence lengths and gene names
    # lengths are needed for MLST heuristic to distinguish alleles from their truncated forms
    # gene names read from here are needed for non-MLST dbs
    fai_file = fasta + '.fai'
    if not os.path.exists(fai_file):
        run_command(['samtools', 'faidx', fasta])
    size, gene_names, unique_gene_symbols, unique_allele_symbols, cluster_symbols = \
        parse_fai(fai_file, run_type, args.mlst_delimiter)

    # Prepare for MLST reporting
    ST_db = False
    if run_type == "mlst":
        results = {}  # key = sample, value = ST string for printing
        if args.mlst_definitions:
            # store MLST profiles, replace gene names (we want the order as they appear in this file)
            ST_db, gene_names = parse_ST_database(args.mlst_definitions, gene_names)
        db_report.write("\t".join(["Sample", "ST"] + gene_names + ["mismatches", "uncertainty", "depth", "maxMAF"]) + "\n")
        results["Sample"] = "\t".join(["Sample", "ST"] + gene_names + ["mismatches", "uncertainty", "depth", "maxMAF"])

    else:
        # store final results for later tabulation
        results = collections.defaultdict(dict)  # key1 = sample, key2 = gene, value = allele

    gene_list = []  # start with empty gene list; will add genes from each genedb test

    # determine maximum mismatches per read to use for pileup
    if run_type == "mlst":
        max_mismatch = args.mlst_max_mismatch
    else:
        max_mismatch = args.gene_max_mismatch

    # Align and score each read set against this DB
    for sample_name in fileSets:
        logging.info('Processing sample ' + sample_name)
        fastq_inputs = fileSets[sample_name]  # reads

        try:
            # try mapping and scoring this fileset against the current database
            # update the gene_list list and results dict with data from this strain
            # __mlst__ will be printed during this routine if this is a mlst run
            # __fullgenes__ will be printed during this routine if requested and this is a gene_db run
            gene_list, results = \
                map_fileSet_to_db(args, sample_name, fastq_inputs, db_name, fasta, size, gene_names,
                    unique_gene_symbols, unique_allele_symbols, run_type, ST_db, results, gene_list, db_report, cluster_symbols, max_mismatch)
        # if we get an error from one of the commands we called
        # log the error message and continue onto the next fasta db
        except CommandError as e:
            logging.error(e.message)
            # record results as unknown, so we know that we did attempt to analyse this readset
            if run_type == "mlst":
                st_result_string = "\t".join([sample_name, "-"] + ["-"] * (len(gene_names) + 3))  # record missing results
                db_report.write(st_result_string + "\n")
                logging.info(" " + st_result_string)
                results[sample_name] = st_result_string
            else:
                logging.info(" failed gene detection")
                results[sample_name]["failed"] = True  # so we know that we tried this strain

    if run_type != "mlst":
        # tabulate results across samples for this gene db (i.e. __genes__ file)
        logging.info('Tabulating results for database {} ...'.format(fasta))
        gene_list.sort()
        db_report.write("\t".join(["Sample"] + gene_list) + "\n")  # report header row
        for sample_name in fileSets:
            db_report.write(sample_name)
            if sample_name in results:
                # print results
                if "failed" not in results[sample_name]:
                    for cluster_id in gene_list:
                        if cluster_id in results[sample_name]:
                            db_report.write("\t" + results[sample_name][cluster_id])  # print full allele name
                        else:
                            db_report.write("\t-")  # no hits for this gene cluster
                else:
                    # no data on this, as the sample failed mapping
                    for cluster_id in gene_list:
                        db_report.write("\t?")
                        results[sample_name][cluster_id] = "?"  # record as unknown
            else:
                # no data on this because genes were not found (but no mapping errors)
                for cluster_id in gene_list:
                    db_report.write("\t?")
                    results[sample_name][cluster_id] = "-"  # record as absent
            db_report.write("\n")

    # Finished with this database
    logging.info('Finished processing for database {} ...'.format(fasta))
    db_report.close()
    db_results_list.append(results)

    return db_reports, db_results_list
| 1183 | |
| 1184 def map_fileSet_to_db(args,sample_name,fastq_inputs,db_name,fasta,size,gene_names,\ | |
| 1185 unique_gene_symbols, unique_allele_symbols,run_type,ST_db,results,gene_list,db_report,cluster_symbols,max_mismatch): | |
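| """Map one read set against a single reference database: align with bowtie2 | |
| (or reuse an existing pileup/scores file), score all alleles, then report | |
| MLST or gene results. Returns the updated (gene_list, results).""" | |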
| 1186 | |
| 1187 mapping_files_pre = args.output + '__' + sample_name + '.' + db_name | |
| 1188 pileup_file = mapping_files_pre + '.pileup' | |
| 1189 scores_file = mapping_files_pre + '.scores' | |
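| # e.g. (illustrative names) with --output out, sample "sampleA" and database "resistance", | |
| # these become out__sampleA.resistance.pileup and out__sampleA.resistance.scores | |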
| 1190 | |
| 1191 # Get or read scores | |
| 1192 | |
| 1193 if args.use_existing_scores and os.path.exists(scores_file): | |
| 1194 | |
| 1195 logging.info(' Using existing scores in ' + scores_file) | |
| 1196 | |
| 1197 # read in scores and info from existing scores file | |
| 1198 hash_edge_depth, avg_depth_allele, coverage_allele, \ | |
| 1199 mismatch_allele, indel_allele, missing_allele, size_allele, \ | |
| 1200 next_to_del_depth_allele, scores, mix_rates = read_scores_file(scores_file) | |
| 1201 | |
| 1202 else: | |
| 1203 | |
| 1204 # Get or read pileup | |
| 1205 | |
| 1206 if args.use_existing_pileup and os.path.exists(pileup_file): | |
| 1207 logging.info(' Using existing pileup in ' + pileup_file) | |
| 1208 | |
| 1209 else: | |
| 1210 | |
| 1211 # run bowtie against this db | |
| 1212 bowtie_sam = run_bowtie(mapping_files_pre,sample_name,fastq_inputs,args,db_name,fasta) | |
| 1213 | |
| 1214 # Modify Bowtie's SAM formatted output so that we get secondary | |
| 1215 # alignments in downstream pileup | |
| 1216 (raw_bowtie_sam,bowtie_sam_mod) = modify_bowtie_sam(bowtie_sam,max_mismatch) | |
| 1217 | |
| 1218 # generate pileup from sam (via sorted bam) | |
| 1219 get_pileup(args,mapping_files_pre,raw_bowtie_sam,bowtie_sam_mod,fasta,pileup_file) | |
| 1220 | |
| 1221 # Get scores | |
| 1222 | |
| 1223 # Process the pileup and extract info for scoring and reporting on each allele | |
| 1224 logging.info(' Processing SAMtools pileup...') | |
| 1225 hash_alignment, hash_max_depth, hash_edge_depth, avg_depth_allele, coverage_allele, \ | |
| 1226 mismatch_allele, indel_allele, missing_allele, size_allele, next_to_del_depth_allele= \ | |
| 1227 read_pileup_data(pileup_file, size, args.prob_err) | |
| 1228 | |
| 1229 # Generate scores for all alleles (prints these and associated info if verbose) | |
| 1230 # result = dict, with key=allele, value=score | |
| 1231 logging.info(' Scoring alleles...') | |
| 1232 scores, mix_rates = score_alleles(args, mapping_files_pre, hash_alignment, hash_max_depth, hash_edge_depth, \ | |
| 1233 avg_depth_allele, coverage_allele, mismatch_allele, indel_allele, missing_allele, \ | |
| 1234 size_allele, next_to_del_depth_allele, run_type) | |
| 1235 | |
| 1236 # GET BEST SCORE for each gene/cluster | |
| 1237 # result = dict, with key = gene, value = (allele,diffs,depth_problem) | |
| 1238 # for MLST DBs, key = gene = locus, allele = gene-number | |
| 1239 # for gene DBs, key = gene = cluster ID, allele = cluster__gene__allele__id | |
| 1240 # for gene DBs, only those alleles passing the coverage cutoff are returned | |
| 1241 | |
| 1242 allele_scores = parse_scores(run_type, args, scores, \ | |
| 1243 hash_edge_depth, avg_depth_allele, coverage_allele, mismatch_allele, \ | |
| 1244 indel_allele, missing_allele, size_allele, next_to_del_depth_allele, | |
| 1245 unique_gene_symbols, unique_allele_symbols, pileup_file) | |
| 1246 | |
| 1247 # REPORT/RECORD RESULTS | |
| 1248 | |
| 1249 # Report MLST results to __mlst__ file | |
| 1250 if run_type == "mlst" and len(allele_scores) > 0: | |
| 1251 | |
| 1252 # Calculate ST and get info for reporting | |
| 1253 (st,clean_st,alleles_with_flags,mismatch_flags,uncertainty_flags,mean_depth,max_maf) = \ | |
| 1254 calculate_ST(allele_scores, ST_db, gene_names, sample_name, args.mlst_delimiter, avg_depth_allele, mix_rates) | |
| 1255 | |
| 1256 # Print to MLST report, log and save the result | |
| 1257 st_result_string = "\t".join([sample_name,st]+alleles_with_flags+[";".join(mismatch_flags),";".join(uncertainty_flags),str(mean_depth),str(max_maf)]) | |
| 1258 db_report.write( st_result_string + "\n") | |
| 1259 logging.info(" " + st_result_string) | |
| 1260 results[sample_name] = st_result_string | |
| 1261 | |
| 1262 # Make sure scores are printed if there was uncertainty in the call | |
| 1263 scores_output_file = mapping_files_pre + '.scores' | |
| 1264 if uncertainty_flags != ["-"] and not args.save_scores and not os.path.exists(scores_output_file): | |
| 1265 # print full score set | |
| 1266 logging.info("Printing all MLST scores to " + scores_output_file) | |
| 1267 scores_output = open(scores_output_file, 'w') | |
| 1268 scores_output.write("Allele\tScore\tAvg_depth\tEdge1_depth\tEdge2_depth\tPercent_coverage\tSize\tMismatches\tIndels\tTruncated_bases\tDepthNeighbouringTruncation\tMmaxMAF\n") | |
| 1269 for allele in scores.keys(): | |
| 1270 score = scores[allele] | |
| 1271 scores_output.write('\t'.join([allele, str(score), str(avg_depth_allele[allele]), \ | |
| 1272 str(hash_edge_depth[allele][0]), str(hash_edge_depth[allele][1]), \ | |
| 1273 str(coverage_allele[allele]), str(size_allele[allele]), str(mismatch_allele[allele]), \ | |
| 1274 str(indel_allele[allele]), str(missing_allele[allele]), str(next_to_del_depth_allele[allele]), str(round(mix_rates[allele],3))]) + '\n') | |
| 1275 scores_output.close() | |
| 1276 | |
| 1277 # Record gene results for later processing and optionally print detailed gene results to __fullgenes__ file | |
| 1278 elif run_type == "genes" and len(allele_scores) > 0: | |
| 1279 if args.no_gene_details: # despite the name, this defaults to True; the --no_gene_details switch turns verbose reporting off | |
| 1280 full_results = "__".join([args.output,"full"+run_type,db_name,"results.txt"]) | |
| 1281 logging.info("Printing verbose gene detection results to " + full_results) | |
| 1282 f = open(full_results,"w") | |
| 1283 f.write("\t".join(["Sample","DB","gene","allele","coverage","depth","diffs","uncertainty","divergence","length", "maxMAF","clusterid","seqid","annotation"])+"\n") | |
| 1284 for gene in allele_scores: | |
| 1285 (allele,diffs,depth_problem,divergence) = allele_scores[gene] # gene = top scoring alleles for each cluster | |
| 1286 gene_name, allele_name, cluster_id, seqid = \ | |
| 1287 get_allele_name_from_db(allele,unique_allele_symbols,unique_gene_symbols,run_type,args) | |
| 1288 | |
| 1289 # store for gene result table only if divergence passes minimum threshold: | |
| 1290 if divergence*100 <= float(args.max_divergence): | |
| 1291 column_header = cluster_symbols[cluster_id] | |
| 1292 results[sample_name][column_header] = allele_name | |
| 1293 if diffs != "": | |
| 1294 results[sample_name][column_header] += "*" | |
| 1295 if depth_problem != "": | |
| 1296 results[sample_name][column_header] += "?" | |
| 1297 if column_header not in gene_list: # gene_list stores column headers (cluster symbols) | |
| 1298 gene_list.append(column_header) | |
| 1299 | |
| 1300 # write details to full genes report | |
| 1301 if args.no_gene_details: | |
| 1302 | |
| 1303 # get annotation info by grepping this allele's name out of the fasta headers | |
| 1304 header_string = os.popen(" ".join(["grep",allele,fasta])) | |
| 1305 try: | |
| 1306 header = header_string.read().rstrip().split() | |
| 1307 header.pop(0) # remove allele name | |
| 1308 if len(header) > 0: | |
| 1309 annotation = " ".join(header) # put back the spaces | |
| 1310 else: | |
| 1311 annotation = "" | |
| 1312 | |
| 1313 except Exception: # header line could not be parsed; leave annotation empty | |
| 1314 annotation = "" | |
| 1315 | |
| 1316 f.write("\t".join([sample_name,db_name,gene_name,allele_name,str(round(coverage_allele[allele],3)),str(avg_depth_allele[allele]),diffs,depth_problem,str(round(divergence*100,3)),str(size_allele[allele]),str(round(mix_rates[allele],3)),cluster_id,seqid,annotation])+"\n") | |
| 1317 | |
| 1318 # log the gene detection result | |
| 1319 logging.info(" " + str(len(allele_scores)) + " genes identified in " + sample_name) | |
| 1320 | |
| 1321 # Finished with this read set | |
| 1322 logging.info(' Finished processing for read set {} ...'.format(sample_name)) | |
| 1323 | |
| 1324 return gene_list, results | |
| 1325 | |
| 1326 def compile_results(args,mlst_results,db_results,compiled_output_file): | |
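| """Merge MLST and gene-detection results from one or more runs into a single | |
| table (one row per sample) and write it to compiled_output_file.""" | |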
| 1327 | |
| 1328 o = open(compiled_output_file,"w") | |
| 1329 | |
| 1330 # get list of all samples and genes present in these datasets | |
| 1331 sample_list = [] # each entry is a sample present in at least one db | |
| 1332 gene_list = [] | |
| 1333 mlst_cols = 0 | |
| 1334 mlst_header_string = "" | |
| 1335 blank_mlst_section = "" | |
| 1336 | |
| 1337 mlst_results_master = {} # compilation of all MLST results | |
| 1338 db_results_master = collections.defaultdict(dict) # compilation of all gene results | |
| 1339 st_counts = {} # key = ST, value = count | |
| 1340 | |
| 1341 if len(mlst_results) > 0: | |
| 1342 | |
| 1343 for mlst_result in mlst_results: | |
| 1344 | |
| 1345 # check length of the mlst string | |
| 1346 if "Sample" in mlst_result: | |
| 1347 test_string = mlst_result["Sample"] | |
| 1348 if mlst_cols == 0: | |
| 1349 mlst_header_string = test_string | |
| 1350 else: | |
| 1351 test_string = mlst_result[mlst_result.keys()[0]] # no header line; sample an arbitrary entry to count columns | |
| 1352 test_string_split = test_string.split("\t") | |
| 1353 this_mlst_cols = len(test_string_split) | |
| 1354 | |
| 1355 if (mlst_cols == 0) or (mlst_cols == this_mlst_cols): | |
| 1356 mlst_cols = this_mlst_cols | |
| 1357 blank_mlst_section = "\t" * (mlst_cols-1) # blank MLST string in case some samples missing | |
| 1358 # use this data | |
| 1359 for sample in mlst_result: | |
| 1360 mlst_results_master[sample] = mlst_result[sample] | |
| 1361 if sample not in sample_list: | |
| 1362 sample_list.append(sample) | |
| 1363 elif mlst_cols != this_mlst_cols: | |
| 1364 # don't process this data further | |
| 1365 logging.info("Problem reconciling MLST data from two files, first MLST results encountered had " + str(mlst_cols) + " columns, this one has " + str(this_mlst_cols) + " columns?") | |
| 1366 if args.mlst_db: | |
| 1367 logging.info("Compiled report will contain only the MLST data from this run, not previous outputs") | |
| 1368 else: | |
| 1369 logging.info("Compiled report will contain only the data from the first MLST result set provided") | |
| 1370 | |
| 1371 if len(db_results) > 0: | |
| 1372 for results in db_results: | |
| 1373 for sample in results: | |
| 1374 if sample not in sample_list: | |
| 1375 sample_list.append(sample) | |
| 1376 for gene in results[sample]: | |
| 1377 if gene != "failed": | |
| 1378 db_results_master[sample][gene] = results[sample][gene] | |
| 1379 if gene not in gene_list: | |
| 1380 gene_list.append(gene) | |
| 1381 | |
| 1382 if "Sample" in sample_list: | |
| 1383 sample_list.remove("Sample") | |
| 1384 sample_list.sort() | |
| 1385 gene_list.sort() | |
| 1386 | |
| 1387 # print header | |
| 1388 header_elements = [] | |
| 1389 if len(mlst_results) > 0: | |
| 1390 header_elements.append(mlst_header_string) | |
| 1391 else: | |
| 1392 header_elements.append("Sample") | |
| 1393 if len(gene_list) > 0: | |
| 1394 header_elements += gene_list | |
| 1395 o.write("\t".join(header_elements)+"\n") | |
| 1396 | |
| 1397 # print results for all samples | |
| 1398 for sample in sample_list: | |
| 1399 | |
| 1400 sample_info = [] # first entry is mlst string OR sample name, rest are genes | |
| 1401 | |
| 1402 # print mlst if provided, otherwise just print sample name | |
| 1403 if len(mlst_results_master) > 0: | |
| 1404 if sample in mlst_results_master: | |
| 1405 sample_info.append(mlst_results_master[sample]) | |
| 1406 this_st = mlst_results_master[sample].split("\t")[1] | |
| 1407 else: | |
| 1408 sample_info.append(sample+blank_mlst_section) | |
| 1409 this_st = "unknown" | |
| 1410 # record the MLST result | |
| 1411 if this_st in st_counts: | |
| 1412 st_counts[this_st] += 1 | |
| 1413 else: | |
| 1414 st_counts[this_st] = 1 | |
| 1415 else: | |
| 1416 sample_info.append(sample) | |
| 1417 | |
| 1418 # get gene info if provided | |
| 1419 if sample in db_results_master: | |
| 1420 for gene in gene_list: | |
| 1421 if gene in db_results_master[sample]: | |
| 1422 sample_info.append(db_results_master[sample][gene]) | |
| 1423 else: | |
| 1424 sample_info.append("-") | |
| 1425 else: | |
| 1426 for gene in gene_list: | |
| 1427 sample_info.append("?") # record no gene data on this strain | |
| 1428 | |
| 1429 o.write("\t".join(sample_info)+"\n") | |
| 1430 | |
| 1431 o.close() | |
| 1432 | |
| 1433 logging.info("Compiled data on " + str(len(sample_list)) + " samples printed to: " + compiled_output_file) | |
| 1434 | |
| 1435 # log ST counts | |
| 1436 if len(mlst_results_master) > 0: | |
| 1437 logging.info("Detected " + str(len(st_counts.keys())) + " STs: ") | |
| 1438 sts = st_counts.keys() | |
| 1439 sts.sort() | |
| 1440 for st in sts: | |
| 1441 logging.info("ST" + st + "\t" + str(st_counts[st])) | |
| 1442 | |
| 1443 return True | |
| 1444 | |
| 1445 | |
| 1446 def main(): | |
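| """Top-level driver: parse arguments, set up logging, run MLST typing and/or | |
| gene detection, fold in any previous outputs, and compile a combined report | |
| when more than one result set is available.""" | |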
| 1447 args = parse_args() | |
| 1448 if args.log: | |
| 1449 logfile = args.output + ".log" | |
| 1450 else: | |
| 1451 logfile = None | |
| 1452 logging.basicConfig( | |
| 1453 filename=logfile, | |
| 1454 level=logging.DEBUG, | |
| 1455 filemode='w', | |
| 1456 format='%(asctime)s %(message)s', | |
| 1457 datefmt='%m/%d/%Y %H:%M:%S') | |
| 1458 logging.info('program started') | |
| 1459 logging.info('command line: {0}'.format(' '.join(sys.argv))) | |
| 1460 | |
| 1461 # Delete the consensus file if it already exists (so functions can open it in append mode) | |
| 1462 if args.report_new_consensus or args.report_all_consensus: | |
| 1463 new_alleles_filename = args.output + ".consensus_alleles.fasta" | |
| 1464 if os.path.exists(new_alleles_filename): | |
| 1465 os.remove(new_alleles_filename) | |
| 1466 | |
| 1467 # vars to store results | |
| 1468 mlst_results_hashes = [] # one dict (sample -> MLST result string) per MLST output file created/read | |
| 1469 gene_result_hashes = [] # one dict (sample -> gene -> result) per gene typing output file created/read | |
| 1470 | |
| 1471 # parse list of file sets to analyse | |
| 1472 fileSets = read_file_sets(args) # get list of files to process | |
| 1473 | |
| 1474 # run MLST scoring | |
| 1475 if fileSets and args.mlst_db: | |
| 1476 | |
| 1477 if not args.mlst_definitions: | |
| 1478 | |
| 1479 # print warning to screen to alert the user, who may want to stop and restart | |
| 1480 print "Warning, MLST allele sequences were provided without ST definitions:" | |
| 1481 print " allele sequences: " + str(args.mlst_db) | |
| 1482 print " these will be mapped and scored, but STs cannot be calculated" | |
| 1483 | |
| 1484 # log | |
| 1485 logging.info("Warning, MLST allele sequences were provided without ST definitions:") | |
| 1486 logging.info(" allele sequences: " + str(args.mlst_db)) | |
| 1487 logging.info(" these will be mapped and scored, but STs can not be calculated") | |
| 1488 | |
| 1489 bowtie_index(args.mlst_db) # index the MLST database | |
| 1490 | |
| 1491 # score file sets against MLST database | |
| 1492 mlst_report, mlst_results = run_srst2(args,fileSets,args.mlst_db,"mlst") | |
| 1493 | |
| 1494 logging.info('MLST output printed to ' + mlst_report[0]) | |
| 1495 | |
| 1497 mlst_results_hashes += mlst_results | |
| 1498 | |
| 1499 # run gene detection | |
| 1500 if fileSets and args.gene_db: | |
| 1501 | |
| 1502 bowtie_index(args.gene_db) # index the gene databases | |
| 1503 | |
| 1504 db_reports, db_results = run_srst2(args,fileSets,args.gene_db,"genes") | |
| 1505 | |
| 1506 for outfile in db_reports: | |
| 1507 logging.info('Gene detection output printed to ' + outfile) | |
| 1508 | |
| 1509 gene_result_hashes += db_results | |
| 1510 | |
| 1511 # process prior results files | |
| 1512 if args.prev_output: | |
| 1513 | |
| 1514 unique_results_files = list(OrderedDict.fromkeys(args.prev_output)) | |
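| # OrderedDict.fromkeys de-duplicates the list of files while preserving the order given | |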
| 1515 | |
| 1516 for results_file in unique_results_files: | |
| 1517 | |
| 1518 results, dbtype, dbname = read_results_from_file(results_file) | |
| 1519 | |
| 1520 if dbtype == "mlst": | |
| 1521 mlst_results_hashes.append(results) | |
| 1522 | |
| 1523 elif dbtype == "genes": | |
| 1524 gene_result_hashes.append(results) | |
| 1525 | |
| 1526 elif dbtype == "compiled": | |
| 1527 # store mlst in its own db | |
| 1528 mlst_results = {} | |
| 1529 for sample in results: | |
| 1530 if "mlst" in results[sample]: | |
| 1531 mlst_results[sample] = results[sample]["mlst"] | |
| 1532 del results[sample]["mlst"] | |
| 1533 mlst_results_hashes.append(mlst_results) | |
| 1534 gene_result_hashes.append(results) | |
| 1535 | |
| 1536 # compile results if multiple databases or datasets provided | |
| 1537 if ( (len(gene_result_hashes) + len(mlst_results_hashes)) > 1 ): | |
| 1538 compiled_output_file = args.output + "__compiledResults.txt" | |
| 1539 compile_results(args,mlst_results_hashes,gene_result_hashes,compiled_output_file) | |
| 1540 | |
| 1541 elif args.prev_output: | |
| 1542 logging.info('One previous output file was provided, but there is no other data to compile with.') | |
| 1543 | |
| 1544 logging.info('SRST2 has finished.') | |
| 1545 | |
| 1546 | |
| 1547 if __name__ == '__main__': | |
| 1548 main() | 
