Mercurial > repos > nml > assemblystats
changeset 1:7556309ffbaf draft default tip
"planemo upload for repository https://github.com/phac-nml/galaxy_tools commit fb4c29f720748f46ff501140f2cd306bab6614f9"
author | nml |
---|---|
date | Fri, 29 May 2020 13:51:50 -0400 |
parents | ad2b274663f8 |
children | |
files | README.rst README_ASSEMBLY_STATS assembly_stats_txt.py assembly_stats_txt.xml |
diffstat | 4 files changed, 157 insertions(+), 116 deletions(-) [+] |
line wrap: on
line diff
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/README.rst Fri May 29 13:51:50 2020 -0400 @@ -0,0 +1,16 @@ +#Created 07/01/2011 +#Konrad Paszkiewicz, University of Exeter + +#Modified by Mariam Iskander and Matthew Gopez, October 13th, 2017 + +Assembly stats + +This series of scripts calculates various metrics on an input FASTA file. Typically this is most useful on either denovo genomic or transcriptomic data. + +Prerequisites: + +1. The bundled perl script fasta_summary.pl + +Limitations: + +Ideally this should output a composite dataset of some sort
--- a/README_ASSEMBLY_STATS Tue Nov 07 12:28:31 2017 -0500 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,16 +0,0 @@ -#Created 07/01/2011 -#Konrad Paszkiewicz, University of Exeter - -#Modified by Mariam Iskander and Matthew Gopez, October 13th, 2017 - -Assembly stats - -This series of scripts calculates various metrics on an input FASTA file. Typically this is most useful on either denovo genomic or transcriptomic data. - -Prerequisites: - -1. The bundled perl script fasta_summary.pl - -Limitations: - -Ideally this should output a composite dataset of some sort
--- a/assembly_stats_txt.py Tue Nov 07 12:28:31 2017 -0500 +++ b/assembly_stats_txt.py Fri May 29 13:51:50 2020 -0400 @@ -3,106 +3,143 @@ # Version 1.01 - bugs kindly corrected by Jan van Haarst # Modified by Matthew Gopez October 13th, 2017 +# Rewritten by Matthew Gopez May 25th, 2020 -import logging +import argparse import os +import shutil import subprocess -import sys +from pathlib import Path -log = logging.getLogger(__name__) - -assert sys.version_info[:2] >= (2, 4) - - -def stop_err(msg): - sys.stderr.write('%s\n' % msg) - sys.exit() +PERL_OUT_FILES = ['stats.txt', 'sorted_contigs.fa', 'histogram_bins.dat.png', + 'summed_contig_lengths.dat.png', 'histogram_bins.dat', + 'summed_contig_lengths.dat'] -def __main__(): +def init_parser(): + """Create argument parser and return parser obj.""" + parser = argparse.ArgumentParser(description="usage: %prog [options]") - # Parse Command Line + parser.add_argument( + "-d", + "--working-dir", + dest="working_dir", + required=True) - working_dir = sys.argv[2] - type = sys.argv[3] - bucket = sys.argv[4] - input = sys.argv[5] - stats = sys.argv[6] - sortedcontigs = sys.argv[7] - histogrampng = sys.argv[8] - summedcontigspng = sys.argv[9] - histogramdata = sys.argv[10] - summedcontigdata = sys.argv[11] - try: # for test - needs this done - os.makedirs(working_dir) - except Exception, e: - stop_err('Error running assembly_stats_txt.py ' + str(e)) + parser.add_argument( + "-t", + "--type", + dest="file_type", + required=True) - cmdline = '%s/fasta_summary.pl -i %s -t %s %s -o %s > /dev/null' \ - % (os.path.dirname(sys.argv[0]), input, type, bucket, - working_dir) - try: - proc = subprocess.Popen(args=cmdline, shell=True, - stderr=subprocess.PIPE) - returncode = proc.wait() + parser.add_argument( + "-b", + "--bucket", + dest="bucket", + action='store_true') - # get stderr, allowing for case where it's very large + parser.add_argument( + "-i", + "--input", + dest="input", + required=True) - stderr = '' - buffsize = 1048576 - try: - while True: - stderr += proc.stderr.read(buffsize) - if not stderr or len(stderr) % buffsize != 0: - break - except OverflowError: - pass - if returncode != 0: - raise Exception - except Exception, e: - stop_err('Error running assembly_stats.py ' + str(e)) + parser.add_argument( + "-s", + "--stats", + dest="stats", + required=True) - stats_path = os.path.join(working_dir, 'stats.txt') - sorted_contigs_path = os.path.join(working_dir, 'sorted_contigs.fa') - histogram_png_path = os.path.join(working_dir, - 'histogram_bins.dat.png') - summed_contigs_path = os.path.join(working_dir, - 'summed_contig_lengths.dat.png') - histogram_data_path = os.path.join(working_dir, 'histogram_bins.dat') - summed_contigs_data_path = os.path.join(working_dir, - 'summed_contig_lengths.dat') + parser.add_argument( + "-sc", + "--sorted-contigs", + dest="sorted_contigs", + required=True) + + parser.add_argument( + "-hpng", + "--histogram-png", + dest="histogram_png", + required=True) - out = open(stats, 'w') - for line in open(stats_path): - out.write('%s' % line) - out.close() - - out = open(sortedcontigs, 'w') - for line in open(sorted_contigs_path): - out.write('%s' % line) - out.close() - - out = open(histogrampng, 'w') - for line in open(histogram_png_path): - out.write('%s' % line) - out.close() + parser.add_argument( + "-spng", + "--summed-contigs-png", + dest="summed_contigs_png", + required=True) - out = open(summedcontigspng, 'w') - for line in open(summed_contigs_path): - out.write('%s' % line) - out.close() + parser.add_argument( + "-hd", + "--histogram-data", + dest="histogram_data", + required=True) - out = open(histogramdata, 'w') - for line in open(histogram_data_path): - out.write('%s' % line) - out.close() + parser.add_argument( + "-scd", + "--summed-config-data", + dest="summed_contig_data", + required=True) - out = open(summedcontigdata, 'w') - for line in open(summed_contigs_data_path): - out.write('%s' % line) - out.close() + return parser -if __name__ == '__main__': - __main__() +def exec_fasta_summary(input_data, file_type, bucket, working_dir): + """Execute fasta_summary.pl script with user arguments.""" + script_dir = Path(__file__).parent.absolute() + + if bucket: + bucket_arg = '-b' + else: + bucket_arg = '' + + cli_command = \ + '{}/fasta_summary.pl -i {} -t {} {} -o {} > /dev/null'.format( + script_dir, input_data, file_type, bucket_arg, working_dir) + + try: + subprocess.check_output( + cli_command, + stderr=subprocess.STDOUT, + shell=True, + universal_newlines=True) + except subprocess.CalledProcessError as exc: + raise RuntimeError('Error running assembly_stats.py!\n' + 'Return Code: {}\nOutput: {}'.format( + exc.returncode, exc.output)) + + +def main(): + """This is where the magic happens. (not really) + + 1. Gets command line arguments. + 2. Grabs the user's desired parameters for running the perl script. + 3. Ensures the directories are in place. + 4. Executes fasta_summary.pl + 5. Move the out files from the perl script to the desired + location the user specified. + + """ + parser = init_parser() + args = parser.parse_args() + + working_dir = args.working_dir + + out_file_names = [args.stats, args.sorted_contigs, args.histogram_png, + args.summed_contigs_png, args.histogram_data, + args.summed_contig_data] + + # Ensure working directory is created. + Path(working_dir).mkdir(parents=True, exist_ok=True) + + # Execute Perl Script + exec_fasta_summary(args.input, args.file_type, args.bucket, working_dir) + + # Rename out files to desired file names + for perl_out_file, dest_file in zip(PERL_OUT_FILES, out_file_names): + shutil.move(os.path.join(working_dir, perl_out_file), + dest_file) + + +if __name__ == "__main__": + main()
--- a/assembly_stats_txt.xml Tue Nov 07 12:28:31 2017 -0500 +++ b/assembly_stats_txt.xml Fri May 29 13:51:50 2020 -0400 @@ -1,23 +1,27 @@ -<tool id="assemblystats" name="assemblystats" version="1.0.1"> +<tool id="assemblystats" name="assemblystats" version="1.1.0"> <description>Summarise an assembly (e.g. N50 metrics)</description> <requirements> - <requirement type="package" version="1.6.924">perl-bioperl</requirement> - <requirement type="package" version="5.0.4">gnuplot</requirement> + <requirement type="package" version="1.7.2">perl-bioperl</requirement> + <requirement type="package" version="5.2.7">gnuplot</requirement> + <requirement type="package" version="3.7.6">python</requirement> </requirements> <command detect_errors="exit_code"><![CDATA[ - python $__tool_directory__/assembly_stats_txt.py + python3 $__tool_directory__/assembly_stats_txt.py - '$type' - '$stats.extra_files_path' - '$type' - '$bucket' - '$input' - '$stats' - '$sortedcontigs' - '$histogrampng' - '$summedcontigspng' - '$histogramdata' - '$summedcontigdata' + -d '$stats.extra_files_path' + -t '$type' + -i '$input' + -s '$stats' + -sc '$sortedcontigs' + -hpng '$histogrampng' + -spng '$summedcontigspng' + -hd '$histogramdata' + -scd '$summedcontigdata' + + #if $bucket + -b + #end if + ]]></command> <inputs> <param label="Type of read" name="type" type="select" help="Is this from an genomic (contig) or transcriptomic assembly (isotig) or are these raw reads (read)"> @@ -25,7 +29,7 @@ <option value="isotig">Isotig (if from transcriptomic assembly)</option> <option value="read">Raw reads from sequencer in FASTA format (useful for 454 data)</option> </param> - <param name="bucket" type="boolean" label="Output histogram with bin sizes=1" truevalue="-b" falsevalue="" help="Use this to specify whether or not bin sizes of 1 should be used when plotting histograms"/> + <param name="bucket" type="boolean" label="Output histogram with bin sizes=1" truevalue="true" falsevalue="false" help="Use this to specify whether or not bin sizes of 1 should be used when plotting histograms"/> <param format="fasta" name="input" type="data" label="Source file in FASTA format"/> <param name = "all_outputs" type ="boolean" checked="false" label="Return all output files" help="If checked, all output files will be displayed. If not checked, only the file 'Assembly Statistics' will be provided." /> </inputs>