annotate sum_mlst.py @ 0:a53acd38d77e draft

Uploaded
author estrain
date Tue, 24 Mar 2020 08:54:42 -0400
parents
children
Ignore whitespace changes - Everywhere: Within whitespace: At end of lines:
rev   line source
0
a53acd38d77e Uploaded
estrain
parents:
diff changeset
1 #!/usr/bin/env
a53acd38d77e Uploaded
estrain
parents:
diff changeset
2
a53acd38d77e Uploaded
estrain
parents:
diff changeset
3 ## Generate basic summary stats from SKESA, fastq-scan, and MLST output.
a53acd38d77e Uploaded
estrain
parents:
diff changeset
4 ## author: errol strain, estrain@gmail.com
a53acd38d77e Uploaded
estrain
parents:
diff changeset
5
a53acd38d77e Uploaded
estrain
parents:
diff changeset
6 from argparse import (ArgumentParser, FileType)
a53acd38d77e Uploaded
estrain
parents:
diff changeset
7 import sys
a53acd38d77e Uploaded
estrain
parents:
diff changeset
8 import glob
a53acd38d77e Uploaded
estrain
parents:
diff changeset
9 import subprocess
a53acd38d77e Uploaded
estrain
parents:
diff changeset
10 from decimal import Decimal
a53acd38d77e Uploaded
estrain
parents:
diff changeset
11
a53acd38d77e Uploaded
estrain
parents:
diff changeset
12 def parse_args():
a53acd38d77e Uploaded
estrain
parents:
diff changeset
13 "Parse the input arguments, use '-h' for help."
a53acd38d77e Uploaded
estrain
parents:
diff changeset
14
a53acd38d77e Uploaded
estrain
parents:
diff changeset
15 parser = ArgumentParser(description='Generate Basic Summary Statistics from SKESA assemblies, fastq-scan output, and MLST reports')
a53acd38d77e Uploaded
estrain
parents:
diff changeset
16
a53acd38d77e Uploaded
estrain
parents:
diff changeset
17 # Read inputs
a53acd38d77e Uploaded
estrain
parents:
diff changeset
18 parser.add_argument('--fasta', type=str, required=True, nargs=1, help='SKESA FASTA assembly')
a53acd38d77e Uploaded
estrain
parents:
diff changeset
19 parser.add_argument('--mlst', type=str, required=True, nargs=1, help='MLST output')
a53acd38d77e Uploaded
estrain
parents:
diff changeset
20 parser.add_argument('--fqscan', type=str, required=True, nargs=1, help='fastq-scan output')
a53acd38d77e Uploaded
estrain
parents:
diff changeset
21 parser.add_argument('--med', type=str, required=True, nargs=1, help='Median Insert Size')
a53acd38d77e Uploaded
estrain
parents:
diff changeset
22 parser.add_argument('--output', type=str, required=True, nargs=1, help='Output File')
a53acd38d77e Uploaded
estrain
parents:
diff changeset
23
a53acd38d77e Uploaded
estrain
parents:
diff changeset
24 return parser.parse_args()
a53acd38d77e Uploaded
estrain
parents:
diff changeset
25
a53acd38d77e Uploaded
estrain
parents:
diff changeset
26 args =parse_args()
a53acd38d77e Uploaded
estrain
parents:
diff changeset
27
a53acd38d77e Uploaded
estrain
parents:
diff changeset
28 # FASTA file
a53acd38d77e Uploaded
estrain
parents:
diff changeset
29 fasta = args.fasta[0]
a53acd38d77e Uploaded
estrain
parents:
diff changeset
30
a53acd38d77e Uploaded
estrain
parents:
diff changeset
31 # Get individual and total length of contigs
a53acd38d77e Uploaded
estrain
parents:
diff changeset
32 cmd = ["awk", "/^>/ {if (seqlen){print seqlen}; ;seqlen=0;next; } { seqlen = seqlen +length($0)}END{print seqlen}",fasta]
a53acd38d77e Uploaded
estrain
parents:
diff changeset
33 seqlen = subprocess.Popen(cmd,stdout= subprocess.PIPE).communicate()[0]
a53acd38d77e Uploaded
estrain
parents:
diff changeset
34 intlen = list(map(int,seqlen.splitlines()))
a53acd38d77e Uploaded
estrain
parents:
diff changeset
35 totlen = sum(intlen)
a53acd38d77e Uploaded
estrain
parents:
diff changeset
36 # Count number of contigs
a53acd38d77e Uploaded
estrain
parents:
diff changeset
37 numtigs = len(intlen)
a53acd38d77e Uploaded
estrain
parents:
diff changeset
38
a53acd38d77e Uploaded
estrain
parents:
diff changeset
39 # Get coverage information from skesa fasta header
a53acd38d77e Uploaded
estrain
parents:
diff changeset
40 cmd1 = ["grep",">",fasta]
a53acd38d77e Uploaded
estrain
parents:
diff changeset
41 cmd2 = ["cut","-f","3","-d","_"]
a53acd38d77e Uploaded
estrain
parents:
diff changeset
42 p1 = subprocess.Popen(cmd1, stdout=subprocess.PIPE)
a53acd38d77e Uploaded
estrain
parents:
diff changeset
43 p2 = subprocess.Popen(cmd2, stdin=p1.stdout, stdout=subprocess.PIPE).communicate()[0]
a53acd38d77e Uploaded
estrain
parents:
diff changeset
44 covdep = map(float,p2.splitlines())
a53acd38d77e Uploaded
estrain
parents:
diff changeset
45 covlist = [a*b for a,b in zip([float(i) for i in intlen],covdep)]
a53acd38d77e Uploaded
estrain
parents:
diff changeset
46 covdep = round(sum(covlist)/totlen,1)
a53acd38d77e Uploaded
estrain
parents:
diff changeset
47
a53acd38d77e Uploaded
estrain
parents:
diff changeset
48 # Calculate N50
a53acd38d77e Uploaded
estrain
parents:
diff changeset
49 vals = [int(i) for i in intlen]
a53acd38d77e Uploaded
estrain
parents:
diff changeset
50 vals.sort(reverse=True)
a53acd38d77e Uploaded
estrain
parents:
diff changeset
51 n50=0
a53acd38d77e Uploaded
estrain
parents:
diff changeset
52 for counter in range(0,len(vals)-1):
a53acd38d77e Uploaded
estrain
parents:
diff changeset
53 if sum(vals[0:counter]) > (totlen/2):
a53acd38d77e Uploaded
estrain
parents:
diff changeset
54 n50=vals[counter-1]
a53acd38d77e Uploaded
estrain
parents:
diff changeset
55 break
a53acd38d77e Uploaded
estrain
parents:
diff changeset
56
a53acd38d77e Uploaded
estrain
parents:
diff changeset
57 # Read in MLST output
a53acd38d77e Uploaded
estrain
parents:
diff changeset
58 mlst = open(args.mlst[0],"r")
a53acd38d77e Uploaded
estrain
parents:
diff changeset
59 profile = mlst.readline()
a53acd38d77e Uploaded
estrain
parents:
diff changeset
60 els = profile.split("\t")
a53acd38d77e Uploaded
estrain
parents:
diff changeset
61
a53acd38d77e Uploaded
estrain
parents:
diff changeset
62 # Read in median insert size
a53acd38d77e Uploaded
estrain
parents:
diff changeset
63 medfile = open(args.med[0],"r")
a53acd38d77e Uploaded
estrain
parents:
diff changeset
64 insert = medfile.readline()
a53acd38d77e Uploaded
estrain
parents:
diff changeset
65 insert = insert.rstrip()
a53acd38d77e Uploaded
estrain
parents:
diff changeset
66
a53acd38d77e Uploaded
estrain
parents:
diff changeset
67 # Read in fastq-scan
a53acd38d77e Uploaded
estrain
parents:
diff changeset
68 fqfile = open(args.fqscan[0],"r")
a53acd38d77e Uploaded
estrain
parents:
diff changeset
69 fq = fqfile.readline()
a53acd38d77e Uploaded
estrain
parents:
diff changeset
70 fq = fq.rstrip()
a53acd38d77e Uploaded
estrain
parents:
diff changeset
71
a53acd38d77e Uploaded
estrain
parents:
diff changeset
72 output = open(args.output[0],"w")
a53acd38d77e Uploaded
estrain
parents:
diff changeset
73
a53acd38d77e Uploaded
estrain
parents:
diff changeset
74 filehead = str("File\tContigs\tLength\tEstCov\tN50\tMedianInsert\tMeanLength_R1\tMeanLength_R2\tMeanQ_R1\tMeanQ_R2\tScheme\tST\n")
a53acd38d77e Uploaded
estrain
parents:
diff changeset
75 output.write(filehead)
a53acd38d77e Uploaded
estrain
parents:
diff changeset
76
a53acd38d77e Uploaded
estrain
parents:
diff changeset
77 output.write(str(fasta) + "\t" + str(numtigs) + "\t" + str(totlen) + "\t" + str(covdep) + "\t" + str(n50) +"\t" + str(insert) + "\t" + str(fq))
a53acd38d77e Uploaded
estrain
parents:
diff changeset
78 for counter in range(1,len(els)):
a53acd38d77e Uploaded
estrain
parents:
diff changeset
79 output.write("\t" + str(els[counter]))