# HG changeset patch
# User estrain
# Date 1678454401 0
# Node ID 2dc1074c9a6b3ea6dd9f081c09f453ce4cec653a
# Parent  a53acd38d77e5fc2565cd1a2d5cb3aee8a14c702
Uploaded

diff -r a53acd38d77e -r 2dc1074c9a6b median_size.py
--- a/median_size.py	Tue Mar 24 08:54:42 2020 -0400
+++ /dev/null	Thu Jan 01 00:00:00 1970 +0000
@@ -1,26 +0,0 @@
-#!/usr/bin/env
-
-## Errol Strain (estrain@gmail.com)
-## calculate median insert size from sam file
-
-import numpy as np
-
-def get_data(infile):
-    lengths = []
-    for line in infile:
-        if line.startswith('@'):
-            pass
-        else:
-            line = line.rsplit()
-            length = int(line[8])
-            if length > 0:
-                lengths.append(length)
-            else:
-                pass
-    return lengths
-
-if __name__ == "__main__":
-    import sys
-    lengths = get_data(sys.stdin)
-    md = int(np.median(lengths))
-print(md)
diff -r a53acd38d77e -r 2dc1074c9a6b microrunqc.xml
--- a/microrunqc.xml	Tue Mar 24 08:54:42 2020 -0400
+++ /dev/null	Thu Jan 01 00:00:00 1970 +0000
@@ -1,192 +0,0 @@
-        skesa
-        mlst
-        bwa
-        numpy
-        fastq-scan
-    ${outname}.fasta;
-
-    bwa index ${outname}.fasta;
-    bwa mem -t $num_cores ${outname}.fasta ${bwalist} | python $__tool_directory__/median_size.py > insert.median;
-
-    mlst --nopath --threads $num_cores
-    #if $options.select=="advanced"
-      #if $options.minid
-        --minid $options.minid
-      #end if
-      #if $options.mincov
-        --mincov $options.mincov
-      #end if
-      #if $options.minscore
-        --minscore $options.minscore
-      #end if
-    #end if
-    ${outname}.fasta > ${outname}.mlst.tsv;
-
-    python $__tool_directory__/run_fastq_scan.py --fastq ${bwalist} --out fq_out.tab --type ${fqscan};
-
-    python $__tool_directory__/sum_mlst.py --fasta ${outname}.fasta --mlst ${outname}.mlst.tsv --med insert.median --fqscan fq_out.tab --out sum_qc.txt
-    ]]>
-        @misc{pope_dashnow_zobel_holt_raven_schultz_inouye_tomita_2014,
-          title={skesa: eSKESA is a de-novo sequence read assembler for cultured single isolate genomes
-                 based on DeBruijn graphs. It uses conservative heuristics and is designed to
-                 create breaks at repeat regions in the genome. This leads to excellent sequence
-                 quality but not necessarily a large N50 statistic. It is a multi-threaded
-                 application that scales well with the number of processors. For different runs
-                 with the same inputs, including the order of reads, the order and orientation
-                 of contigs in the output is deterministic. },
-          url={https://github.com/ncbi/ngs-tools/tree/master/tools/skesa/},
-          author={National Center for Biotechnology Information },
-        }
-
-        @UNPUBLISHED{Seemann2016,
-          author = "Seemann T",
-          title = "MLST: Scan contig files against PubMLST typing schemes",
-          year = "2016",
-          url = {https://github.com/tseemann/mlst}
-        }
-
diff -r a53acd38d77e -r 2dc1074c9a6b microrunqc/median_size.py
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/microrunqc/median_size.py	Fri Mar 10 13:20:01 2023 +0000
@@ -0,0 +1,26 @@
+#!/usr/bin/env
+
+## Errol Strain (estrain@gmail.com)
+## calculate median insert size from sam file
+
+import numpy as np
+
+def get_data(infile):
+    lengths = []
+    for line in infile:
+        if line.startswith('@'):
+            pass
+        else:
+            line = line.rsplit()
+            length = int(line[8])
+            if length > 0:
+                lengths.append(length)
+            else:
+                pass
+    return lengths
+
+if __name__ == "__main__":
+    import sys
+    lengths = get_data(sys.stdin)
+    md = int(np.median(lengths))
+print(md)
diff -r a53acd38d77e -r 2dc1074c9a6b microrunqc/microrunqc.xml
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/microrunqc/microrunqc.xml	Fri Mar 10 13:20:01 2023 +0000
@@ -0,0 +1,192 @@
+        skesa
+        mlst
+        bwa
+        numpy
+        fastq-scan
+    ${outname}.fasta;
+
+    bwa index ${outname}.fasta;
+    bwa mem -t $num_cores ${outname}.fasta ${bwalist} | python $__tool_directory__/median_size.py > insert.median;
+
+    mlst --nopath --threads $num_cores
+    #if $options.select=="advanced"
+      #if $options.minid
+        --minid $options.minid
+      #end if
+      #if $options.mincov
+        --mincov $options.mincov
+      #end if
+      #if $options.minscore
+        --minscore $options.minscore
+      #end if
+    #end if
+    ${outname}.fasta > ${outname}.mlst.tsv;
+
+    python $__tool_directory__/run_fastq_scan.py --fastq ${bwalist} --out fq_out.tab --type ${fqscan};
+
+    python $__tool_directory__/sum_mlst.py --fasta ${outname}.fasta --mlst ${outname}.mlst.tsv --med insert.median --fqscan fq_out.tab --out sum_qc.txt
+    ]]>
+        @misc{pope_dashnow_zobel_holt_raven_schultz_inouye_tomita_2014,
+          title={skesa: eSKESA is a de-novo sequence read assembler for cultured single isolate genomes
+                 based on DeBruijn graphs. It uses conservative heuristics and is designed to
+                 create breaks at repeat regions in the genome. This leads to excellent sequence
+                 quality but not necessarily a large N50 statistic. It is a multi-threaded
+                 application that scales well with the number of processors. For different runs
+                 with the same inputs, including the order of reads, the order and orientation
+                 of contigs in the output is deterministic. },
+          url={https://github.com/ncbi/ngs-tools/tree/master/tools/skesa/},
+          author={National Center for Biotechnology Information },
+        }
+
+        @UNPUBLISHED{Seemann2016,
+          author = "Seemann T",
+          title = "MLST: Scan contig files against PubMLST typing schemes",
+          year = "2016",
+          url = {https://github.com/tseemann/mlst}
+        }
+
diff -r a53acd38d77e -r 2dc1074c9a6b microrunqc/run_fastq_scan.py
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/microrunqc/run_fastq_scan.py	Fri Mar 10 13:20:01 2023 +0000
@@ -0,0 +1,54 @@
+#!/usr/bin/env
+
+## Run fastq-scan to get mean read length and mean quality score
+## author: errol strain, estrain@gmail.com
+
+from argparse import (ArgumentParser, FileType)
+import sys
+import glob
+import subprocess
+import json
+
+def parse_args():
+    "Parse the input arguments, use '-h' for help."
+
+    parser = ArgumentParser(description='Run fastq-scan on a pair of gzipped FASTQ files')
+
+    # Read inputs
+    parser.add_argument('--fastq', type=str, required=True, nargs=2, help='FASTQ files')
+    parser.add_argument('--output', type=str, required=True, nargs=1, help='Output File')
+    parser.add_argument('--type', type=str, required=True, nargs=1, help='File Type (text or gz)')
+
+    return parser.parse_args()
+
+args =parse_args()
+
+# FASTA file
+r1 = args.fastq[0]
+r2 = args.fastq[1]
+
+# Read 1
+if str(args.type[0]) == "gz" :
+    cmd1 = ["zcat", r1]
+else :
+    cmd1 = ["cat", r1]
+cmd2 = ["fastq-scan"]
+pcmd1= subprocess.Popen(cmd1,stdout= subprocess.PIPE,shell=False)
+r1json = json.loads(subprocess.Popen(cmd2, stdin=pcmd1.stdout,stdout=subprocess.PIPE,shell=False).communicate()[0])
+r1q = round(r1json["qc_stats"]["qual_mean"],1)
+r1l = round(r1json["qc_stats"]["read_mean"],1)
+
+# Read 2
+if str(args.type[0]) == "gz" :
+    cmd1 = ["zcat", r2]
+else :
+    cmd1 = ["cat", r2]
+cmd2 = ["fastq-scan"]
+pcmd1= subprocess.Popen(cmd1,stdout= subprocess.PIPE,shell=False)
+r2json = json.loads(subprocess.Popen(cmd2, stdin=pcmd1.stdout,stdout=subprocess.PIPE,shell=False).communicate()[0])
+r2q = round(r2json["qc_stats"]["qual_mean"],1)
+r2l = round(r2json["qc_stats"]["read_mean"],1)
+
+# Write output to be used by sum_mlst.py
+output = open(args.output[0],"w")
+output.write(str(r1l) + "\t" + str(r2l) + "\t" + str(r1q) + "\t" + str(r2q))
diff -r a53acd38d77e -r 2dc1074c9a6b microrunqc/sum_mlst.py
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/microrunqc/sum_mlst.py	Fri Mar 10 13:20:01 2023 +0000
@@ -0,0 +1,79 @@
+#!/usr/bin/env
+
+## Generate basic summary stats from SKESA, fastq-scan, and MLST output.
+## author: errol strain, estrain@gmail.com
+
+from argparse import (ArgumentParser, FileType)
+import sys
+import glob
+import subprocess
+from decimal import Decimal
+
+def parse_args():
+    "Parse the input arguments, use '-h' for help."
+
+    parser = ArgumentParser(description='Generate Basic Summary Statistics from SKESA assemblies, fastq-scan output, and MLST reports')
+
+    # Read inputs
+    parser.add_argument('--fasta', type=str, required=True, nargs=1, help='SKESA FASTA assembly')
+    parser.add_argument('--mlst', type=str, required=True, nargs=1, help='MLST output')
+    parser.add_argument('--fqscan', type=str, required=True, nargs=1, help='fastq-scan output')
+    parser.add_argument('--med', type=str, required=True, nargs=1, help='Median Insert Size')
+    parser.add_argument('--output', type=str, required=True, nargs=1, help='Output File')
+
+    return parser.parse_args()
+
+args =parse_args()
+
+# FASTA file
+fasta = args.fasta[0]
+
+# Get individual and total length of contigs
+cmd = ["awk", "/^>/ {if (seqlen){print seqlen}; ;seqlen=0;next; } { seqlen = seqlen +length($0)}END{print seqlen}",fasta]
+seqlen = subprocess.Popen(cmd,stdout= subprocess.PIPE).communicate()[0]
+intlen = list(map(int,seqlen.splitlines()))
+totlen = sum(intlen)
+# Count number of contigs
+numtigs = len(intlen)
+
+# Get coverage information from skesa fasta header
+cmd1 = ["grep",">",fasta]
+cmd2 = ["cut","-f","3","-d","_"]
+p1 = subprocess.Popen(cmd1, stdout=subprocess.PIPE)
+p2 = subprocess.Popen(cmd2, stdin=p1.stdout, stdout=subprocess.PIPE).communicate()[0]
+covdep = map(float,p2.splitlines())
+covlist = [a*b for a,b in zip([float(i) for i in intlen],covdep)]
+covdep = round(sum(covlist)/totlen,1)
+
+# Calculate N50
+vals = [int(i) for i in intlen]
+vals.sort(reverse=True)
+n50=0
+for counter in range(0,len(vals)-1):
+    if sum(vals[0:counter]) > (totlen/2):
+        n50=vals[counter-1]
+        break
+
+# Read in MLST output
+mlst = open(args.mlst[0],"r")
+profile = mlst.readline()
+els = profile.split("\t")
+
+# Read in median insert size
+medfile = open(args.med[0],"r")
+insert = medfile.readline()
+insert = insert.rstrip()
+
+# Read in fastq-scan
+fqfile = open(args.fqscan[0],"r")
+fq = fqfile.readline()
+fq = fq.rstrip()
+
+output = open(args.output[0],"w")
+
+filehead = str("File\tContigs\tLength\tEstCov\tN50\tMedianInsert\tMeanLength_R1\tMeanLength_R2\tMeanQ_R1\tMeanQ_R2\tScheme\tST\n")
+output.write(filehead)
+
+output.write(str(fasta) + "\t" + str(numtigs) + "\t" + str(totlen) + "\t" + str(covdep) + "\t" + str(n50) +"\t" + str(insert) + "\t" + str(fq))
+for counter in range(1,len(els)):
+    output.write("\t" + str(els[counter]))
diff -r a53acd38d77e -r 2dc1074c9a6b run_fastq_scan.py
--- a/run_fastq_scan.py	Tue Mar 24 08:54:42 2020 -0400
+++ /dev/null	Thu Jan 01 00:00:00 1970 +0000
@@ -1,54 +0,0 @@
-#!/usr/bin/env
-
-## Run fastq-scan to get mean read length and mean quality score
-## author: errol strain, estrain@gmail.com
-
-from argparse import (ArgumentParser, FileType)
-import sys
-import glob
-import subprocess
-import json
-
-def parse_args():
-    "Parse the input arguments, use '-h' for help."
-
-    parser = ArgumentParser(description='Run fastq-scan on a pair of gzipped FASTQ files')
-
-    # Read inputs
-    parser.add_argument('--fastq', type=str, required=True, nargs=2, help='FASTQ files')
-    parser.add_argument('--output', type=str, required=True, nargs=1, help='Output File')
-    parser.add_argument('--type', type=str, required=True, nargs=1, help='File Type (text or gz)')
-
-    return parser.parse_args()
-
-args =parse_args()
-
-# FASTA file
-r1 = args.fastq[0]
-r2 = args.fastq[1]
-
-# Read 1
-if str(args.type[0]) == "gz" :
-    cmd1 = ["zcat", r1]
-else :
-    cmd1 = ["cat", r1]
-cmd2 = ["fastq-scan"]
-pcmd1= subprocess.Popen(cmd1,stdout= subprocess.PIPE,shell=False)
-r1json = json.loads(subprocess.Popen(cmd2, stdin=pcmd1.stdout,stdout=subprocess.PIPE,shell=False).communicate()[0])
-r1q = round(r1json["qc_stats"]["qual_mean"],1)
-r1l = round(r1json["qc_stats"]["read_mean"],1)
-
-# Read 2
-if str(args.type[0]) == "gz" :
-    cmd1 = ["zcat", r2]
-else :
-    cmd1 = ["cat", r2]
-cmd2 = ["fastq-scan"]
-pcmd1= subprocess.Popen(cmd1,stdout= subprocess.PIPE,shell=False)
-r2json = json.loads(subprocess.Popen(cmd2, stdin=pcmd1.stdout,stdout=subprocess.PIPE,shell=False).communicate()[0])
-r2q = round(r2json["qc_stats"]["qual_mean"],1)
-r2l = round(r2json["qc_stats"]["read_mean"],1)
-
-# Write output to be used by sum_mlst.py
-output = open(args.output[0],"w")
-output.write(str(r1l) + "\t" + str(r2l) + "\t" + str(r1q) + "\t" + str(r2q))
diff -r a53acd38d77e -r 2dc1074c9a6b sum_mlst.py
--- a/sum_mlst.py	Tue Mar 24 08:54:42 2020 -0400
+++ /dev/null	Thu Jan 01 00:00:00 1970 +0000
@@ -1,79 +0,0 @@
-#!/usr/bin/env
-
-## Generate basic summary stats from SKESA, fastq-scan, and MLST output.
-## author: errol strain, estrain@gmail.com
-
-from argparse import (ArgumentParser, FileType)
-import sys
-import glob
-import subprocess
-from decimal import Decimal
-
-def parse_args():
-    "Parse the input arguments, use '-h' for help."
-
-    parser = ArgumentParser(description='Generate Basic Summary Statistics from SKESA assemblies, fastq-scan output, and MLST reports')
-
-    # Read inputs
-    parser.add_argument('--fasta', type=str, required=True, nargs=1, help='SKESA FASTA assembly')
-    parser.add_argument('--mlst', type=str, required=True, nargs=1, help='MLST output')
-    parser.add_argument('--fqscan', type=str, required=True, nargs=1, help='fastq-scan output')
-    parser.add_argument('--med', type=str, required=True, nargs=1, help='Median Insert Size')
-    parser.add_argument('--output', type=str, required=True, nargs=1, help='Output File')
-
-    return parser.parse_args()
-
-args =parse_args()
-
-# FASTA file
-fasta = args.fasta[0]
-
-# Get individual and total length of contigs
-cmd = ["awk", "/^>/ {if (seqlen){print seqlen}; ;seqlen=0;next; } { seqlen = seqlen +length($0)}END{print seqlen}",fasta]
-seqlen = subprocess.Popen(cmd,stdout= subprocess.PIPE).communicate()[0]
-intlen = list(map(int,seqlen.splitlines()))
-totlen = sum(intlen)
-# Count number of contigs
-numtigs = len(intlen)
-
-# Get coverage information from skesa fasta header
-cmd1 = ["grep",">",fasta]
-cmd2 = ["cut","-f","3","-d","_"]
-p1 = subprocess.Popen(cmd1, stdout=subprocess.PIPE)
-p2 = subprocess.Popen(cmd2, stdin=p1.stdout, stdout=subprocess.PIPE).communicate()[0]
-covdep = map(float,p2.splitlines())
-covlist = [a*b for a,b in zip([float(i) for i in intlen],covdep)]
-covdep = round(sum(covlist)/totlen,1)
-
-# Calculate N50
-vals = [int(i) for i in intlen]
-vals.sort(reverse=True)
-n50=0
-for counter in range(0,len(vals)-1):
-    if sum(vals[0:counter]) > (totlen/2):
-        n50=vals[counter-1]
-        break
-
-# Read in MLST output
-mlst = open(args.mlst[0],"r")
-profile = mlst.readline()
-els = profile.split("\t")
-
-# Read in median insert size
-medfile = open(args.med[0],"r")
-insert = medfile.readline()
-insert = insert.rstrip()
-
-# Read in fastq-scan
-fqfile = open(args.fqscan[0],"r")
-fq = fqfile.readline()
-fq = fq.rstrip()
-
-output = open(args.output[0],"w")
-
-filehead = str("File\tContigs\tLength\tEstCov\tN50\tMedianInsert\tMeanLength_R1\tMeanLength_R2\tMeanQ_R1\tMeanQ_R2\tScheme\tST\n")
-output.write(filehead)
-
-output.write(str(fasta) + "\t" + str(numtigs) + "\t" + str(totlen) + "\t" + str(covdep) + "\t" + str(n50) +"\t" + str(insert) + "\t" + str(fq))
-for counter in range(1,len(els)):
-    output.write("\t" + str(els[counter]))
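
For reference, a minimal sketch of how the scripts relocated into microrunqc/ are chained by the command block of microrunqc/microrunqc.xml. The file names (sample.fasta, reads_1.fastq.gz, reads_2.fastq.gz) and the thread count are placeholders, and the SKESA assembly step is assumed to have already produced sample.fasta, since that part of the command is not visible above:

    # sample.fasta is assumed to be the SKESA assembly of reads_1/reads_2
    bwa index sample.fasta
    bwa mem -t 4 sample.fasta reads_1.fastq.gz reads_2.fastq.gz | python median_size.py > insert.median
    mlst --nopath --threads 4 sample.fasta > sample.mlst.tsv
    python run_fastq_scan.py --fastq reads_1.fastq.gz reads_2.fastq.gz --out fq_out.tab --type gz
    python sum_mlst.py --fasta sample.fasta --mlst sample.mlst.tsv --med insert.median --fqscan fq_out.tab --out sum_qc.txt

Note that the tool XML passes --out while both scripts define --output; argparse accepts --out as an unambiguous abbreviation of --output, so the calls resolve as written.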