# HG changeset patch
# User estrain
# Date 1678454578 0
# Node ID 25a92dfb780a4e7d78e38992b106ff9b6a4f40b7
# Parent 2dc1074c9a6b3ea6dd9f081c09f453ce4cec653a
Uploaded
diff -r 2dc1074c9a6b -r 25a92dfb780a median_size.py
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/median_size.py Fri Mar 10 13:22:58 2023 +0000
@@ -0,0 +1,26 @@
+#!/usr/bin/env python
+
+## Errol Strain (estrain@gmail.com)
+## Calculate the median insert size from a SAM file streamed on stdin.
+
+import sys
+
+import numpy as np
+
+def get_data(infile):
+    """Collect positive template lengths (TLEN, column 9) from SAM records."""
+    lengths = []
+    for line in infile:
+        # Skip SAM header lines.
+        if line.startswith('@'):
+            continue
+        fields = line.split()
+        length = int(fields[8])
+        # Keep only positive values so each read pair is counted once.
+        if length > 0:
+            lengths.append(length)
+    return lengths
+
+if __name__ == "__main__":
+    lengths = get_data(sys.stdin)
+    print(int(np.median(lengths)))
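
A minimal sanity check of median_size.py, assuming the script is importable from the working directory; the SAM records below are synthetic. Column 9 (TLEN) of each non-header line is collected, the negative value on the second mate is skipped, and the median is printed:

    import io
    import numpy as np
    from median_size import get_data   # assumes median_size.py is on the path

    sam = io.StringIO(
        "@HD\tVN:1.6\n"
        "r1\t99\tchr1\t100\t60\t100M\t=\t400\t400\t*\t*\n"
        "r1\t147\tchr1\t400\t60\t100M\t=\t100\t-400\t*\t*\n"
        "r2\t99\tchr1\t200\t60\t100M\t=\t700\t600\t*\t*\n"
    )
    lengths = get_data(sam)            # [400, 600]; the negative TLEN is ignored
    print(int(np.median(lengths)))     # 500
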
diff -r 2dc1074c9a6b -r 25a92dfb780a microrunqc.xml
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/microrunqc.xml Fri Mar 10 13:22:58 2023 +0000
@@ -0,0 +1,192 @@
+
+
+
+ skesa
+ mlst
+ bwa
+ numpy
+ fastq-scan
+
+
+ ${outname}.fasta;
+
+ bwa index ${outname}.fasta;
+ bwa mem -t $num_cores ${outname}.fasta ${bwalist} | python $__tool_directory__/median_size.py > insert.median;
+
+ mlst --nopath --threads $num_cores
+ #if $options.select=="advanced"
+ #if $options.minid
+ --minid $options.minid
+ #end if
+ #if $options.mincov
+ --mincov $options.mincov
+ #end if
+ #if $options.minscore
+ --minscore $options.minscore
+ #end if
+ #end if
+ ${outname}.fasta > ${outname}.mlst.tsv;
+
+ python $__tool_directory__/run_fastq_scan.py --fastq ${bwalist} --out fq_out.tab --type ${fqscan};
+
+ python $__tool_directory__/sum_mlst.py --fasta ${outname}.fasta --mlst ${outname}.mlst.tsv --med insert.median --fqscan fq_out.tab --out sum_qc.txt
+
+ ]]>
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ @misc{pope_dashnow_zobel_holt_raven_schultz_inouye_tomita_2014,
+ title={skesa: SKESA is a de-novo sequence read assembler for cultured single isolate genomes
+ based on DeBruijn graphs. It uses conservative heuristics and is designed to
+ create breaks at repeat regions in the genome. This leads to excellent sequence
+ quality but not necessarily a large N50 statistic. It is a multi-threaded
+ application that scales well with the number of processors. For different runs
+ with the same inputs, including the order of reads, the order and orientation
+ of contigs in the output is deterministic. },
+ url={https://github.com/ncbi/ngs-tools/tree/master/tools/skesa/},
+ author={National Center for Biotechnology Information },
+ }
+
+
+ @UNPUBLISHED{Seemann2016,
+ author = "Seemann T",
+ title = "MLST: Scan contig files against PubMLST typing schemes",
+ year = "2016",
+ url = {https://github.com/tseemann/mlst}
+ }
+
+
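
The command block in microrunqc.xml above (partly mangled in this upload) drives the post-assembly steps: index the SKESA contigs with bwa, estimate the median insert size by piping bwa mem through median_size.py, type the assembly with mlst, scan the reads with fastq-scan, and merge everything with sum_mlst.py. A rough sketch of that sequence outside Galaxy, with hypothetical file names and a fixed thread count (the skesa call itself is omitted because only its output name, ${outname}.fasta, is visible above):

    import subprocess

    prefix = "sample"                                  # hypothetical output name
    reads = ["R1.fastq.gz", "R2.fastq.gz"]             # hypothetical paired reads
    threads = "4"

    # Index the assembly and estimate the median insert size from bwa mem output
    subprocess.run(["bwa", "index", prefix + ".fasta"], check=True)
    with open("insert.median", "w") as out:
        bwa = subprocess.Popen(["bwa", "mem", "-t", threads, prefix + ".fasta"] + reads,
                               stdout=subprocess.PIPE)
        subprocess.run(["python", "median_size.py"], stdin=bwa.stdout, stdout=out, check=True)

    # MLST typing of the contigs
    with open(prefix + ".mlst.tsv", "w") as out:
        subprocess.run(["mlst", "--nopath", "--threads", threads, prefix + ".fasta"],
                       stdout=out, check=True)

    # Read-level stats, then the combined QC table ("--out" prefix-matches "--output")
    subprocess.run(["python", "run_fastq_scan.py", "--fastq"] + reads +
                   ["--out", "fq_out.tab", "--type", "gz"], check=True)
    subprocess.run(["python", "sum_mlst.py", "--fasta", prefix + ".fasta",
                    "--mlst", prefix + ".mlst.tsv", "--med", "insert.median",
                    "--fqscan", "fq_out.tab", "--out", "sum_qc.txt"], check=True)
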
diff -r 2dc1074c9a6b -r 25a92dfb780a microrunqc/median_size.py
--- a/microrunqc/median_size.py Fri Mar 10 13:20:01 2023 +0000
+++ /dev/null Thu Jan 01 00:00:00 1970 +0000
@@ -1,26 +0,0 @@
-#!/usr/bin/env python
-
-## Errol Strain (estrain@gmail.com)
-## Calculate the median insert size from a SAM file streamed on stdin.
-
-import sys
-
-import numpy as np
-
-def get_data(infile):
-    """Collect positive template lengths (TLEN, column 9) from SAM records."""
-    lengths = []
-    for line in infile:
-        # Skip SAM header lines.
-        if line.startswith('@'):
-            continue
-        fields = line.split()
-        length = int(fields[8])
-        # Keep only positive values so each read pair is counted once.
-        if length > 0:
-            lengths.append(length)
-    return lengths
-
-if __name__ == "__main__":
-    lengths = get_data(sys.stdin)
-    print(int(np.median(lengths)))
diff -r 2dc1074c9a6b -r 25a92dfb780a microrunqc/microrunqc.xml
--- a/microrunqc/microrunqc.xml Fri Mar 10 13:20:01 2023 +0000
+++ /dev/null Thu Jan 01 00:00:00 1970 +0000
@@ -1,192 +0,0 @@
-
-
-
- skesa
- mlst
- bwa
- numpy
- fastq-scan
-
-
- ${outname}.fasta;
-
- bwa index ${outname}.fasta;
- bwa mem -t $num_cores ${outname}.fasta ${bwalist} | python $__tool_directory__/median_size.py > insert.median;
-
- mlst --nopath --threads $num_cores
- #if $options.select=="advanced"
- #if $options.minid
- --minid $options.minid
- #end if
- #if $options.mincov
- --mincov $options.mincov
- #end if
- #if $options.minscore
- --minscore $options.minscore
- #end if
- #end if
- ${outname}.fasta > ${outname}.mlst.tsv;
-
- python $__tool_directory__/run_fastq_scan.py --fastq ${bwalist} --out fq_out.tab --type ${fqscan};
-
- python $__tool_directory__/sum_mlst.py --fasta ${outname}.fasta --mlst ${outname}.mlst.tsv --med insert.median --fqscan fq_out.tab --out sum_qc.txt
-
- ]]>
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
- @misc{pope_dashnow_zobel_holt_raven_schultz_inouye_tomita_2014,
- title={skesa: SKESA is a de-novo sequence read assembler for cultured single isolate genomes
- based on DeBruijn graphs. It uses conservative heuristics and is designed to
- create breaks at repeat regions in the genome. This leads to excellent sequence
- quality but not necessarily a large N50 statistic. It is a multi-threaded
- application that scales well with the number of processors. For different runs
- with the same inputs, including the order of reads, the order and orientation
- of contigs in the output is deterministic. },
- url={https://github.com/ncbi/ngs-tools/tree/master/tools/skesa/},
- author={National Center for Biotechnology Information },
- }
-
-
- @UNPUBLISHED{Seemann2016,
- author = "Seemann T",
- title = "MLST: Scan contig files against PubMLST typing schemes",
- year = "2016",
- url = {https://github.com/tseemann/mlst}
- }
-
-
diff -r 2dc1074c9a6b -r 25a92dfb780a microrunqc/run_fastq_scan.py
--- a/microrunqc/run_fastq_scan.py Fri Mar 10 13:20:01 2023 +0000
+++ /dev/null Thu Jan 01 00:00:00 1970 +0000
@@ -1,54 +0,0 @@
-#!/usr/bin/env python
-
-## Run fastq-scan to get mean read length and mean quality score
-## author: errol strain, estrain@gmail.com
-
-from argparse import ArgumentParser
-import subprocess
-import json
-
-
-def parse_args():
-    "Parse the input arguments, use '-h' for help."
-
-    parser = ArgumentParser(description='Run fastq-scan on a pair of gzipped FASTQ files')
-
-    # Read inputs
-    parser.add_argument('--fastq', type=str, required=True, nargs=2, help='FASTQ files')
-    parser.add_argument('--output', type=str, required=True, nargs=1, help='Output File')
-    parser.add_argument('--type', type=str, required=True, nargs=1, help='File Type (text or gz)')
-
-    return parser.parse_args()
-
-
-args = parse_args()
-
-# Paired FASTQ files
-r1 = args.fastq[0]
-r2 = args.fastq[1]
-
-# Read 1: decompress if needed and pipe through fastq-scan
-if str(args.type[0]) == "gz":
-    cmd1 = ["zcat", r1]
-else:
-    cmd1 = ["cat", r1]
-cmd2 = ["fastq-scan"]
-pcmd1 = subprocess.Popen(cmd1, stdout=subprocess.PIPE, shell=False)
-r1json = json.loads(subprocess.Popen(cmd2, stdin=pcmd1.stdout, stdout=subprocess.PIPE, shell=False).communicate()[0])
-r1q = round(r1json["qc_stats"]["qual_mean"], 1)
-r1l = round(r1json["qc_stats"]["read_mean"], 1)
-
-# Read 2: same pipeline for the second FASTQ
-if str(args.type[0]) == "gz":
-    cmd1 = ["zcat", r2]
-else:
-    cmd1 = ["cat", r2]
-cmd2 = ["fastq-scan"]
-pcmd1 = subprocess.Popen(cmd1, stdout=subprocess.PIPE, shell=False)
-r2json = json.loads(subprocess.Popen(cmd2, stdin=pcmd1.stdout, stdout=subprocess.PIPE, shell=False).communicate()[0])
-r2q = round(r2json["qc_stats"]["qual_mean"], 1)
-r2l = round(r2json["qc_stats"]["read_mean"], 1)
-
-# Write output to be used by sum_mlst.py
-with open(args.output[0], "w") as output:
-    output.write(str(r1l) + "\t" + str(r2l) + "\t" + str(r1q) + "\t" + str(r2q))
diff -r 2dc1074c9a6b -r 25a92dfb780a microrunqc/sum_mlst.py
--- a/microrunqc/sum_mlst.py Fri Mar 10 13:20:01 2023 +0000
+++ /dev/null Thu Jan 01 00:00:00 1970 +0000
@@ -1,79 +0,0 @@
-#!/usr/bin/env python
-
-## Generate basic summary stats from SKESA, fastq-scan, and MLST output.
-## author: errol strain, estrain@gmail.com
-
-from argparse import ArgumentParser
-import subprocess
-
-
-def parse_args():
-    "Parse the input arguments, use '-h' for help."
-
-    parser = ArgumentParser(description='Generate Basic Summary Statistics from SKESA assemblies, fastq-scan output, and MLST reports')
-
-    # Read inputs
-    parser.add_argument('--fasta', type=str, required=True, nargs=1, help='SKESA FASTA assembly')
-    parser.add_argument('--mlst', type=str, required=True, nargs=1, help='MLST output')
-    parser.add_argument('--fqscan', type=str, required=True, nargs=1, help='fastq-scan output')
-    parser.add_argument('--med', type=str, required=True, nargs=1, help='Median Insert Size')
-    parser.add_argument('--output', type=str, required=True, nargs=1, help='Output File')
-
-    return parser.parse_args()
-
-
-args = parse_args()
-
-# SKESA FASTA assembly
-fasta = args.fasta[0]
-
-# Get individual and total length of contigs
-cmd = ["awk", "/^>/ {if (seqlen){print seqlen}; seqlen=0; next} {seqlen = seqlen + length($0)} END{print seqlen}", fasta]
-seqlen = subprocess.Popen(cmd, stdout=subprocess.PIPE).communicate()[0]
-intlen = list(map(int, seqlen.splitlines()))
-totlen = sum(intlen)
-# Count number of contigs
-numtigs = len(intlen)
-
-# Get per-contig coverage depth from the SKESA FASTA headers (third '_'-delimited field)
-cmd1 = ["grep", ">", fasta]
-cmd2 = ["cut", "-f", "3", "-d", "_"]
-p1 = subprocess.Popen(cmd1, stdout=subprocess.PIPE)
-p2 = subprocess.Popen(cmd2, stdin=p1.stdout, stdout=subprocess.PIPE).communicate()[0]
-depths = list(map(float, p2.splitlines()))
-covlist = [a * b for a, b in zip([float(i) for i in intlen], depths)]
-covdep = round(sum(covlist) / totlen, 1)
-
-# Calculate N50: the contig length at which the cumulative, descending-sorted length first reaches half the total
-vals = sorted(intlen, reverse=True)
-n50 = 0
-cumsum = 0
-for val in vals:
-    cumsum += val
-    if cumsum >= totlen / 2:
-        n50 = val
-        break
-
-# Read in MLST output
-mlst = open(args.mlst[0], "r")
-profile = mlst.readline()
-els = profile.split("\t")
-
-# Read in median insert size
-medfile = open(args.med[0], "r")
-insert = medfile.readline()
-insert = insert.rstrip()
-
-# Read in fastq-scan
-fqfile = open(args.fqscan[0], "r")
-fq = fqfile.readline()
-fq = fq.rstrip()
-
-output = open(args.output[0], "w")
-
-filehead = "File\tContigs\tLength\tEstCov\tN50\tMedianInsert\tMeanLength_R1\tMeanLength_R2\tMeanQ_R1\tMeanQ_R2\tScheme\tST\n"
-output.write(filehead)
-
-output.write(str(fasta) + "\t" + str(numtigs) + "\t" + str(totlen) + "\t" + str(covdep) + "\t" + str(n50) + "\t" + str(insert) + "\t" + str(fq))
-for counter in range(1, len(els)):
-    output.write("\t" + str(els[counter]))
diff -r 2dc1074c9a6b -r 25a92dfb780a run_fastq_scan.py
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/run_fastq_scan.py Fri Mar 10 13:22:58 2023 +0000
@@ -0,0 +1,54 @@
+#!/usr/bin/env python
+
+## Run fastq-scan to get mean read length and mean quality score
+## author: errol strain, estrain@gmail.com
+
+from argparse import ArgumentParser
+import subprocess
+import json
+
+
+def parse_args():
+    "Parse the input arguments, use '-h' for help."
+
+    parser = ArgumentParser(description='Run fastq-scan on a pair of gzipped FASTQ files')
+
+    # Read inputs
+    parser.add_argument('--fastq', type=str, required=True, nargs=2, help='FASTQ files')
+    parser.add_argument('--output', type=str, required=True, nargs=1, help='Output File')
+    parser.add_argument('--type', type=str, required=True, nargs=1, help='File Type (text or gz)')
+
+    return parser.parse_args()
+
+
+args = parse_args()
+
+# Paired FASTQ files
+r1 = args.fastq[0]
+r2 = args.fastq[1]
+
+# Read 1: decompress if needed and pipe through fastq-scan
+if str(args.type[0]) == "gz":
+    cmd1 = ["zcat", r1]
+else:
+    cmd1 = ["cat", r1]
+cmd2 = ["fastq-scan"]
+pcmd1 = subprocess.Popen(cmd1, stdout=subprocess.PIPE, shell=False)
+r1json = json.loads(subprocess.Popen(cmd2, stdin=pcmd1.stdout, stdout=subprocess.PIPE, shell=False).communicate()[0])
+r1q = round(r1json["qc_stats"]["qual_mean"], 1)
+r1l = round(r1json["qc_stats"]["read_mean"], 1)
+
+# Read 2: same pipeline for the second FASTQ
+if str(args.type[0]) == "gz":
+    cmd1 = ["zcat", r2]
+else:
+    cmd1 = ["cat", r2]
+cmd2 = ["fastq-scan"]
+pcmd1 = subprocess.Popen(cmd1, stdout=subprocess.PIPE, shell=False)
+r2json = json.loads(subprocess.Popen(cmd2, stdin=pcmd1.stdout, stdout=subprocess.PIPE, shell=False).communicate()[0])
+r2q = round(r2json["qc_stats"]["qual_mean"], 1)
+r2l = round(r2json["qc_stats"]["read_mean"], 1)
+
+# Write output to be used by sum_mlst.py
+with open(args.output[0], "w") as output:
+    output.write(str(r1l) + "\t" + str(r2l) + "\t" + str(r1q) + "\t" + str(r2q))
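
Only two fields of fastq-scan's JSON report are used above, both under "qc_stats". A minimal illustration with made-up numbers showing the shape the script expects:

    import json

    # Hypothetical report; only the keys actually read by run_fastq_scan.py are shown.
    report = '{"qc_stats": {"read_mean": 148.3, "qual_mean": 35.7}}'
    stats = json.loads(report)["qc_stats"]
    print(round(stats["read_mean"], 1), round(stats["qual_mean"], 1))   # 148.3 35.7
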
diff -r 2dc1074c9a6b -r 25a92dfb780a sum_mlst.py
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/sum_mlst.py Fri Mar 10 13:22:58 2023 +0000
@@ -0,0 +1,79 @@
+#!/usr/bin/env python
+
+## Generate basic summary stats from SKESA, fastq-scan, and MLST output.
+## author: errol strain, estrain@gmail.com
+
+from argparse import ArgumentParser
+import subprocess
+
+
+def parse_args():
+    "Parse the input arguments, use '-h' for help."
+
+    parser = ArgumentParser(description='Generate Basic Summary Statistics from SKESA assemblies, fastq-scan output, and MLST reports')
+
+    # Read inputs
+    parser.add_argument('--fasta', type=str, required=True, nargs=1, help='SKESA FASTA assembly')
+    parser.add_argument('--mlst', type=str, required=True, nargs=1, help='MLST output')
+    parser.add_argument('--fqscan', type=str, required=True, nargs=1, help='fastq-scan output')
+    parser.add_argument('--med', type=str, required=True, nargs=1, help='Median Insert Size')
+    parser.add_argument('--output', type=str, required=True, nargs=1, help='Output File')
+
+    return parser.parse_args()
+
+
+args = parse_args()
+
+# SKESA FASTA assembly
+fasta = args.fasta[0]
+
+# Get individual and total length of contigs
+cmd = ["awk", "/^>/ {if (seqlen){print seqlen}; seqlen=0; next} {seqlen = seqlen + length($0)} END{print seqlen}", fasta]
+seqlen = subprocess.Popen(cmd, stdout=subprocess.PIPE).communicate()[0]
+intlen = list(map(int, seqlen.splitlines()))
+totlen = sum(intlen)
+# Count number of contigs
+numtigs = len(intlen)
+
+# Get per-contig coverage depth from the SKESA FASTA headers (third '_'-delimited field)
+cmd1 = ["grep", ">", fasta]
+cmd2 = ["cut", "-f", "3", "-d", "_"]
+p1 = subprocess.Popen(cmd1, stdout=subprocess.PIPE)
+p2 = subprocess.Popen(cmd2, stdin=p1.stdout, stdout=subprocess.PIPE).communicate()[0]
+depths = list(map(float, p2.splitlines()))
+covlist = [a * b for a, b in zip([float(i) for i in intlen], depths)]
+covdep = round(sum(covlist) / totlen, 1)
+
+# Calculate N50: the contig length at which the cumulative, descending-sorted length first reaches half the total
+vals = sorted(intlen, reverse=True)
+n50 = 0
+cumsum = 0
+for val in vals:
+    cumsum += val
+    if cumsum >= totlen / 2:
+        n50 = val
+        break
+
+# Read in MLST output
+mlst = open(args.mlst[0], "r")
+profile = mlst.readline()
+els = profile.split("\t")
+
+# Read in median insert size
+medfile = open(args.med[0], "r")
+insert = medfile.readline()
+insert = insert.rstrip()
+
+# Read in fastq-scan
+fqfile = open(args.fqscan[0], "r")
+fq = fqfile.readline()
+fq = fq.rstrip()
+
+output = open(args.output[0], "w")
+
+filehead = "File\tContigs\tLength\tEstCov\tN50\tMedianInsert\tMeanLength_R1\tMeanLength_R2\tMeanQ_R1\tMeanQ_R2\tScheme\tST\n"
+output.write(filehead)
+
+output.write(str(fasta) + "\t" + str(numtigs) + "\t" + str(totlen) + "\t" + str(covdep) + "\t" + str(n50) + "\t" + str(insert) + "\t" + str(fq))
+for counter in range(1, len(els)):
+    output.write("\t" + str(els[counter]))
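
For reference, a worked example of the N50 loop in sum_mlst.py with synthetic contig lengths:

    # Contigs of length 500, 300, 200, 100 (total 1100, half of total = 550).
    # Cumulative sums over the descending sort are 500, then 800 >= 550, so N50 = 300.
    intlen = [500, 300, 200, 100]
    totlen = sum(intlen)
    cumsum = 0
    n50 = 0
    for val in sorted(intlen, reverse=True):
        cumsum += val
        if cumsum >= totlen / 2:
            n50 = val
            break
    print(n50)   # 300
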