0
|
1 #!/usr/bin/env
|
|
2
|
|
3 ## Generate basic summary stats from SKESA, fastq-scan, and MLST output.
|
|
4 ## author: errol strain, estrain@gmail.com
|
|
5
|
|
6 from argparse import (ArgumentParser, FileType)
|
|
7 import sys
|
|
8 import glob
|
|
9 import subprocess
|
|
10 from decimal import Decimal
|
|
11
|
|
12 def parse_args():
|
|
13 "Parse the input arguments, use '-h' for help."
|
|
14
|
|
15 parser = ArgumentParser(description='Generate Basic Summary Statistics from SKESA assemblies, fastq-scan output, and MLST reports')
|
|
16
|
|
17 # Read inputs
|
|
18 parser.add_argument('--fasta', type=str, required=True, nargs=1, help='SKESA FASTA assembly')
|
|
19 parser.add_argument('--mlst', type=str, required=True, nargs=1, help='MLST output')
|
|
20 parser.add_argument('--fqscan', type=str, required=True, nargs=1, help='fastq-scan output')
|
|
21 parser.add_argument('--med', type=str, required=True, nargs=1, help='Median Insert Size')
|
|
22 parser.add_argument('--output', type=str, required=True, nargs=1, help='Output File')
|
|
23
|
|
24 return parser.parse_args()
|
|
25
|
|
26 args =parse_args()
|
|
27
|
|
28 # FASTA file
|
|
29 fasta = args.fasta[0]
|
|
30
|
|
31 # Get individual and total length of contigs
|
|
32 cmd = ["awk", "/^>/ {if (seqlen){print seqlen}; ;seqlen=0;next; } { seqlen = seqlen +length($0)}END{print seqlen}",fasta]
|
|
33 seqlen = subprocess.Popen(cmd,stdout= subprocess.PIPE).communicate()[0]
|
|
34 intlen = list(map(int,seqlen.splitlines()))
|
|
35 totlen = sum(intlen)
|
|
36 # Count number of contigs
|
|
37 numtigs = len(intlen)
|
|
38
|
|
39 # Get coverage information from skesa fasta header
|
|
40 cmd1 = ["grep",">",fasta]
|
|
41 cmd2 = ["cut","-f","3","-d","_"]
|
|
42 p1 = subprocess.Popen(cmd1, stdout=subprocess.PIPE)
|
|
43 p2 = subprocess.Popen(cmd2, stdin=p1.stdout, stdout=subprocess.PIPE).communicate()[0]
|
|
44 covdep = map(float,p2.splitlines())
|
|
45 covlist = [a*b for a,b in zip([float(i) for i in intlen],covdep)]
|
|
46 covdep = round(sum(covlist)/totlen,1)
|
|
47
|
|
48 # Calculate N50
|
|
49 vals = [int(i) for i in intlen]
|
|
50 vals.sort(reverse=True)
|
|
51 n50=0
|
|
52 for counter in range(0,len(vals)-1):
|
|
53 if sum(vals[0:counter]) > (totlen/2):
|
|
54 n50=vals[counter-1]
|
|
55 break
|
|
56
|
|
57 # Read in MLST output
|
|
58 mlst = open(args.mlst[0],"r")
|
|
59 profile = mlst.readline()
|
|
60 els = profile.split("\t")
|
|
61
|
|
62 # Read in median insert size
|
|
63 medfile = open(args.med[0],"r")
|
|
64 insert = medfile.readline()
|
|
65 insert = insert.rstrip()
|
|
66
|
|
67 # Read in fastq-scan
|
|
68 fqfile = open(args.fqscan[0],"r")
|
|
69 fq = fqfile.readline()
|
|
70 fq = fq.rstrip()
|
|
71
|
|
72 output = open(args.output[0],"w")
|
|
73
|
|
74 filehead = str("File\tContigs\tLength\tEstCov\tN50\tMedianInsert\tMeanLength_R1\tMeanLength_R2\tMeanQ_R1\tMeanQ_R2\tScheme\tST\n")
|
|
75 output.write(filehead)
|
|
76
|
|
77 output.write(str(fasta) + "\t" + str(numtigs) + "\t" + str(totlen) + "\t" + str(covdep) + "\t" + str(n50) +"\t" + str(insert) + "\t" + str(fq))
|
|
78 for counter in range(1,len(els)):
|
|
79 output.write("\t" + str(els[counter]))
|