annotate scoreProfiles.py @ 2:6787d952242e draft

Uploaded
author estrain
date Sun, 19 May 2019 07:36:55 -0400
parents efc202baae1f
children
Ignore whitespace changes - Everywhere: Within whitespace: At end of lines:
rev   line source
0
efc202baae1f planemo upload
estrain
parents:
diff changeset
1 #!/usr/bin/env
efc202baae1f planemo upload
estrain
parents:
diff changeset
2
efc202baae1f planemo upload
estrain
parents:
diff changeset
3 ## Generate basic summary stats for SRST2 (v2) allele score output. Generate summaries for each profile defined in the database
efc202baae1f planemo upload
estrain
parents:
diff changeset
4 ## author: errol strain, estrain@gmail.com
efc202baae1f planemo upload
estrain
parents:
diff changeset
5
efc202baae1f planemo upload
estrain
parents:
diff changeset
6 from argparse import (ArgumentParser, FileType)
efc202baae1f planemo upload
estrain
parents:
diff changeset
7 import sys
efc202baae1f planemo upload
estrain
parents:
diff changeset
8 import glob
efc202baae1f planemo upload
estrain
parents:
diff changeset
9 from decimal import Decimal
efc202baae1f planemo upload
estrain
parents:
diff changeset
10
efc202baae1f planemo upload
estrain
parents:
diff changeset
11 def parse_args():
efc202baae1f planemo upload
estrain
parents:
diff changeset
12 "Parse the input arguments, use '-h' for help."
efc202baae1f planemo upload
estrain
parents:
diff changeset
13
efc202baae1f planemo upload
estrain
parents:
diff changeset
14 parser = ArgumentParser(description='Generate Summary Scores for SRST2 Allele Score Output')
efc202baae1f planemo upload
estrain
parents:
diff changeset
15
efc202baae1f planemo upload
estrain
parents:
diff changeset
16 # Read inputs
efc202baae1f planemo upload
estrain
parents:
diff changeset
17 parser.add_argument('--mlst_definitions', type=str, required=True, nargs=1, help='MLST Definitions')
efc202baae1f planemo upload
estrain
parents:
diff changeset
18 parser.add_argument('--output', type=str, required=True, nargs=1, help='MLST Definitions')
efc202baae1f planemo upload
estrain
parents:
diff changeset
19 parser.add_argument('--profile_cov', type=str, required=False, help='Minimum Average Coverage to Report ST Profile',default="98")
efc202baae1f planemo upload
estrain
parents:
diff changeset
20 parser.add_argument('--profile_max_mismatch', type=str, required=False, help='Maximum Number of Mismatches (SNPs)', default="1")
efc202baae1f planemo upload
estrain
parents:
diff changeset
21
efc202baae1f planemo upload
estrain
parents:
diff changeset
22 return parser.parse_args()
efc202baae1f planemo upload
estrain
parents:
diff changeset
23
efc202baae1f planemo upload
estrain
parents:
diff changeset
24 args =parse_args()
efc202baae1f planemo upload
estrain
parents:
diff changeset
25
efc202baae1f planemo upload
estrain
parents:
diff changeset
26 allHash = {}
efc202baae1f planemo upload
estrain
parents:
diff changeset
27
efc202baae1f planemo upload
estrain
parents:
diff changeset
28 # Read in Profile Database
efc202baae1f planemo upload
estrain
parents:
diff changeset
29 profiles = open(args.mlst_definitions[0],"r")
efc202baae1f planemo upload
estrain
parents:
diff changeset
30 output = open(args.output[0],"w")
efc202baae1f planemo upload
estrain
parents:
diff changeset
31
efc202baae1f planemo upload
estrain
parents:
diff changeset
32 # Minimum mean percent coverage for reporting profile
efc202baae1f planemo upload
estrain
parents:
diff changeset
33 min_per=float(args.profile_cov)
efc202baae1f planemo upload
estrain
parents:
diff changeset
34 # Maximum mean mismatch for reporting profile
efc202baae1f planemo upload
estrain
parents:
diff changeset
35 max_mis=float(args.profile_max_mismatch)
efc202baae1f planemo upload
estrain
parents:
diff changeset
36
efc202baae1f planemo upload
estrain
parents:
diff changeset
37 # Read in Allele Scores
efc202baae1f planemo upload
estrain
parents:
diff changeset
38 # Scores should be in srts2*.scores file
efc202baae1f planemo upload
estrain
parents:
diff changeset
39 # Column 0:Allele, 1:Score, 2:Avg Depth, 5:% Coverage, 7:Mismatches, 8:Indels
efc202baae1f planemo upload
estrain
parents:
diff changeset
40 scoreFile = open(glob.glob("srst2*.scores")[0],"r")
efc202baae1f planemo upload
estrain
parents:
diff changeset
41 scoreFile.readline()
efc202baae1f planemo upload
estrain
parents:
diff changeset
42
efc202baae1f planemo upload
estrain
parents:
diff changeset
43 for line in scoreFile.readlines() :
efc202baae1f planemo upload
estrain
parents:
diff changeset
44 els = line.split("\t")
efc202baae1f planemo upload
estrain
parents:
diff changeset
45 vals = els[0].split("_")
efc202baae1f planemo upload
estrain
parents:
diff changeset
46 allHash.update({els[0]:line})
efc202baae1f planemo upload
estrain
parents:
diff changeset
47
efc202baae1f planemo upload
estrain
parents:
diff changeset
48
efc202baae1f planemo upload
estrain
parents:
diff changeset
49 # Allele names in first row
efc202baae1f planemo upload
estrain
parents:
diff changeset
50 als = profiles.readline().rstrip()
efc202baae1f planemo upload
estrain
parents:
diff changeset
51 filehead = als + str("\tMean_Score\tMean_Depth\tMean_%_Coverage\tTotal_Mismatches\tTotal_Indels\n")
efc202baae1f planemo upload
estrain
parents:
diff changeset
52 output.write(filehead)
efc202baae1f planemo upload
estrain
parents:
diff changeset
53
efc202baae1f planemo upload
estrain
parents:
diff changeset
54 genes = als.split("\t")
efc202baae1f planemo upload
estrain
parents:
diff changeset
55
efc202baae1f planemo upload
estrain
parents:
diff changeset
56 for line in profiles.readlines() :
efc202baae1f planemo upload
estrain
parents:
diff changeset
57 line = line.rstrip()
efc202baae1f planemo upload
estrain
parents:
diff changeset
58 els = line.split("\t")
efc202baae1f planemo upload
estrain
parents:
diff changeset
59 alleles = []
efc202baae1f planemo upload
estrain
parents:
diff changeset
60 for i in range(1,len(genes)) :
efc202baae1f planemo upload
estrain
parents:
diff changeset
61 alleles.append(genes[i] + "_" + els[i])
efc202baae1f planemo upload
estrain
parents:
diff changeset
62 mscore=0
efc202baae1f planemo upload
estrain
parents:
diff changeset
63 mdepth=0
efc202baae1f planemo upload
estrain
parents:
diff changeset
64 mcover=0
efc202baae1f planemo upload
estrain
parents:
diff changeset
65 mmisma=0
efc202baae1f planemo upload
estrain
parents:
diff changeset
66 mindel=0
efc202baae1f planemo upload
estrain
parents:
diff changeset
67 for i in alleles :
efc202baae1f planemo upload
estrain
parents:
diff changeset
68 if i in allHash :
efc202baae1f planemo upload
estrain
parents:
diff changeset
69 vals=str(allHash[i]).split("\t")
efc202baae1f planemo upload
estrain
parents:
diff changeset
70 mscore+=float(vals[1])
efc202baae1f planemo upload
estrain
parents:
diff changeset
71 mdepth+=float(vals[2])
efc202baae1f planemo upload
estrain
parents:
diff changeset
72 mcover+=float(vals[5])
efc202baae1f planemo upload
estrain
parents:
diff changeset
73 mmisma+=int(vals[7])
efc202baae1f planemo upload
estrain
parents:
diff changeset
74 mindel+=int(vals[8])
efc202baae1f planemo upload
estrain
parents:
diff changeset
75
efc202baae1f planemo upload
estrain
parents:
diff changeset
76 mscore=round(Decimal(mscore/(len(genes)-1)),5)
efc202baae1f planemo upload
estrain
parents:
diff changeset
77 mdepth=round(Decimal(mdepth/(len(genes)-1)),2)
efc202baae1f planemo upload
estrain
parents:
diff changeset
78 mcover=round(Decimal(mcover/(len(genes)-1)),2)
efc202baae1f planemo upload
estrain
parents:
diff changeset
79
efc202baae1f planemo upload
estrain
parents:
diff changeset
80 if mmisma<=max_mis and mcover>=min_per :
efc202baae1f planemo upload
estrain
parents:
diff changeset
81 output.write(line + "\t" + str(mscore) + "\t" + str(mdepth) + "\t" + str(mcover) + "\t" + str(mmisma) + "\t" + str(mindel) + "\n")
efc202baae1f planemo upload
estrain
parents:
diff changeset
82