comparison sum_mlst.py @ 0:a53acd38d77e draft

Uploaded
author estrain
date Tue, 24 Mar 2020 08:54:42 -0400
parents
children
comparison
equal deleted inserted replaced
-1:000000000000 0:a53acd38d77e
1 #!/usr/bin/env python
2
3 ## Generate basic summary stats from SKESA, fastq-scan, and MLST output.
4 ## author: errol strain, estrain@gmail.com
5
6 from argparse import (ArgumentParser, FileType)
7 import sys
8 import glob
9 import subprocess
10 from decimal import Decimal
11
def parse_args():
    """Build the command-line parser and return the parsed arguments.

    Every option is a required string collected with ``nargs=1``, so each
    attribute of the returned namespace is a one-element list (callers read
    element ``[0]``). Run the script with ``-h`` for the generated help.
    """
    cli = ArgumentParser(description='Generate Basic Summary Statistics from SKESA assemblies, fastq-scan output, and MLST reports')

    # (flag, help text) for each required input, in the order shown by -h.
    required_inputs = (
        ('--fasta', 'SKESA FASTA assembly'),
        ('--mlst', 'MLST output'),
        ('--fqscan', 'fastq-scan output'),
        ('--med', 'Median Insert Size'),
        ('--output', 'Output File'),
    )
    for flag, help_text in required_inputs:
        cli.add_argument(flag, type=str, required=True, nargs=1, help=help_text)

    return cli.parse_args()
25
def contig_lengths(fasta_path):
    """Return the length of each contig in *fasta_path*, in file order.

    The counting is delegated to awk: sequence-line lengths are accumulated
    between '>' header lines and one total is printed per contig.
    Raises subprocess.CalledProcessError if awk exits with a non-zero status.
    """
    awk_program = "/^>/ {if (seqlen){print seqlen}; ;seqlen=0;next; } { seqlen = seqlen +length($0)}END{print seqlen}"
    # check_output (rather than Popen) so that a failing awk raises instead
    # of being mistaken for an empty assembly.
    raw = subprocess.check_output(["awk", awk_program, fasta_path])
    # One integer per line; split() also discards a blank line, which is
    # what the END rule emits when no sequence was seen at all.
    return [int(token) for token in raw.split()]


def contig_depths(fasta_path):
    """Return the coverage value taken from each FASTA header line.

    Equivalent to ``grep '>' FILE | cut -f 3 -d _``: the third
    '_'-separated field of every line containing '>' is parsed as a float.
    """
    grep = subprocess.Popen(["grep", ">", fasta_path], stdout=subprocess.PIPE)
    cut = subprocess.Popen(["cut", "-f", "3", "-d", "_"],
                           stdin=grep.stdout, stdout=subprocess.PIPE)
    # Drop the parent's copy of the pipe so only `cut` holds the read end.
    grep.stdout.close()
    raw = cut.communicate()[0]
    grep.wait()
    return [float(token) for token in raw.splitlines()]


def weighted_coverage(lengths, depths):
    """Return the length-weighted mean of *depths*, rounded to one decimal.

    *lengths* and *depths* are paired positionally. An assembly with zero
    total length yields 0.0 instead of dividing by zero.
    """
    total = sum(lengths)
    if total == 0:
        return 0.0
    weighted = sum(length * depth for length, depth in zip(lengths, depths))
    return round(float(weighted) / total, 1)


def compute_n50(lengths):
    """Return the N50 of *lengths* (0 when there are no contigs).

    Contigs are visited from longest to shortest; the result is the length
    of the contig at which the running total first exceeds half of the
    assembly length. The strict '>' comparison is kept from the previous
    implementation so existing results do not shift on exact ties.
    """
    total = sum(lengths)
    running = 0
    for length in sorted(lengths, reverse=True):
        running += length
        # Doubling the running sum keeps the comparison in exact integers.
        if running * 2 > total:
            return length
    return 0


def main():
    """Summarise one sample and write a two-line, tab-separated report.

    Reads the assembly FASTA, the first line of the MLST report, the
    median-insert-size file and the fastq-scan file named on the command
    line, then writes a header line and one data line to the output file.
    """
    args = parse_args()

    # FASTA file
    fasta = args.fasta[0]

    # Per-contig lengths drive the contig count, total length, coverage and N50.
    lengths = contig_lengths(fasta)
    total_length = sum(lengths)
    est_cov = weighted_coverage(lengths, contig_depths(fasta))
    n50 = compute_n50(lengths)

    # Only the first line of each auxiliary input is used.
    with open(args.mlst[0], "r") as handle:
        # Not stripped: the last MLST field carries the line's own newline,
        # which terminates the data row written below.
        mlst_fields = handle.readline().split("\t")
    with open(args.med[0], "r") as handle:
        insert = handle.readline().rstrip()
    with open(args.fqscan[0], "r") as handle:
        fq = handle.readline().rstrip()

    filehead = "File\tContigs\tLength\tEstCov\tN50\tMedianInsert\tMeanLength_R1\tMeanLength_R2\tMeanQ_R1\tMeanQ_R2\tScheme\tST\n"
    row = [fasta, str(len(lengths)), str(total_length), str(est_cov),
           str(n50), insert, fq]
    # Field 0 of the MLST line is skipped; every remaining field is appended.
    row.extend(mlst_fields[1:])

    with open(args.output[0], "w") as output:
        output.write(filehead)
        output.write("\t".join(row))


if __name__ == "__main__":
    main()