Mercurial > repos > estrain > microrunqc
comparison sum_mlst.py @ 0:a53acd38d77e draft
Uploaded
author | estrain |
---|---|
date | Tue, 24 Mar 2020 08:54:42 -0400 |
parents | |
children |
comparison
equal
deleted
inserted
replaced
-1:000000000000 | 0:a53acd38d77e |
---|---|
1 #!/usr/bin/env | |
2 | |
3 ## Generate basic summary stats from SKESA, fastq-scan, and MLST output. | |
4 ## author: errol strain, estrain@gmail.com | |
5 | |
6 from argparse import (ArgumentParser, FileType) | |
7 import sys | |
8 import glob | |
9 import subprocess | |
10 from decimal import Decimal | |
11 | |
12 def parse_args(): | |
13 "Parse the input arguments, use '-h' for help." | |
14 | |
15 parser = ArgumentParser(description='Generate Basic Summary Statistics from SKESA assemblies, fastq-scan output, and MLST reports') | |
16 | |
17 # Read inputs | |
18 parser.add_argument('--fasta', type=str, required=True, nargs=1, help='SKESA FASTA assembly') | |
19 parser.add_argument('--mlst', type=str, required=True, nargs=1, help='MLST output') | |
20 parser.add_argument('--fqscan', type=str, required=True, nargs=1, help='fastq-scan output') | |
21 parser.add_argument('--med', type=str, required=True, nargs=1, help='Median Insert Size') | |
22 parser.add_argument('--output', type=str, required=True, nargs=1, help='Output File') | |
23 | |
24 return parser.parse_args() | |
25 | |
26 args =parse_args() | |
27 | |
28 # FASTA file | |
29 fasta = args.fasta[0] | |
30 | |
31 # Get individual and total length of contigs | |
32 cmd = ["awk", "/^>/ {if (seqlen){print seqlen}; ;seqlen=0;next; } { seqlen = seqlen +length($0)}END{print seqlen}",fasta] | |
33 seqlen = subprocess.Popen(cmd,stdout= subprocess.PIPE).communicate()[0] | |
34 intlen = list(map(int,seqlen.splitlines())) | |
35 totlen = sum(intlen) | |
36 # Count number of contigs | |
37 numtigs = len(intlen) | |
38 | |
39 # Get coverage information from skesa fasta header | |
40 cmd1 = ["grep",">",fasta] | |
41 cmd2 = ["cut","-f","3","-d","_"] | |
42 p1 = subprocess.Popen(cmd1, stdout=subprocess.PIPE) | |
43 p2 = subprocess.Popen(cmd2, stdin=p1.stdout, stdout=subprocess.PIPE).communicate()[0] | |
44 covdep = map(float,p2.splitlines()) | |
45 covlist = [a*b for a,b in zip([float(i) for i in intlen],covdep)] | |
46 covdep = round(sum(covlist)/totlen,1) | |
47 | |
48 # Calculate N50 | |
49 vals = [int(i) for i in intlen] | |
50 vals.sort(reverse=True) | |
51 n50=0 | |
52 for counter in range(0,len(vals)-1): | |
53 if sum(vals[0:counter]) > (totlen/2): | |
54 n50=vals[counter-1] | |
55 break | |
56 | |
57 # Read in MLST output | |
58 mlst = open(args.mlst[0],"r") | |
59 profile = mlst.readline() | |
60 els = profile.split("\t") | |
61 | |
62 # Read in median insert size | |
63 medfile = open(args.med[0],"r") | |
64 insert = medfile.readline() | |
65 insert = insert.rstrip() | |
66 | |
67 # Read in fastq-scan | |
68 fqfile = open(args.fqscan[0],"r") | |
69 fq = fqfile.readline() | |
70 fq = fq.rstrip() | |
71 | |
72 output = open(args.output[0],"w") | |
73 | |
74 filehead = str("File\tContigs\tLength\tEstCov\tN50\tMedianInsert\tMeanLength_R1\tMeanLength_R2\tMeanQ_R1\tMeanQ_R2\tScheme\tST\n") | |
75 output.write(filehead) | |
76 | |
77 output.write(str(fasta) + "\t" + str(numtigs) + "\t" + str(totlen) + "\t" + str(covdep) + "\t" + str(n50) +"\t" + str(insert) + "\t" + str(fq)) | |
78 for counter in range(1,len(els)): | |
79 output.write("\t" + str(els[counter])) |