Mercurial > repos > nml > assemblystats

--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/README.rst	Fri May 29 13:51:50 2020 -0400
@@ -0,0 +1,16 @@
+#Created 07/01/2011
+#Konrad Paszkiewicz, University of Exeter
+
+#Modified by Mariam Iskander and Matthew Gopez, October 13th, 2017
+
+Assembly stats
+
+This series of scripts calculates various metrics on an input FASTA file. It is typically most useful on de novo genomic or transcriptomic assembly data.
+
+Prerequisites:
+
+1. The bundled perl script fasta_summary.pl
+
+Limitations:
+
+Ideally this should output a single composite dataset rather than several separate output files.
--- a/README_ASSEMBLY_STATS	Tue Nov 07 12:28:31 2017 -0500
+++ /dev/null	Thu Jan 01 00:00:00 1970 +0000
@@ -1,16 +0,0 @@
-#Created 07/01/2011
-#Konrad Paszkiewicz, University of Exeter
-
-#Modified by Mariam Iskander and Matthew Gopez, October 13th, 2017
-
-Assembly stats
-
-This series of scripts calculates various metrics on an input FASTA file. Typically this is most useful on either denovo genomic or transcriptomic data.
-
-Prerequisites:
-
-1. The bundled perl script fasta_summary.pl
-
-Limitations:
-
-Ideally this should output a composite dataset of some sort
--- a/assembly_stats_txt.py	Tue Nov 07 12:28:31 2017 -0500
+++ b/assembly_stats_txt.py	Fri May 29 13:51:50 2020 -0400
@@ -3,106 +3,143 @@

 # Version 1.01 - bugs kindly corrected by Jan van Haarst
 # Modified by Matthew Gopez October 13th, 2017
+# Rewritten by Matthew Gopez May 25th, 2020

-import logging
+import argparse
 import os
+import shutil
 import subprocess
-import sys
+from pathlib import Path


-log = logging.getLogger(__name__)
-
-assert sys.version_info[:2] >= (2, 4)
-
-
-def stop_err(msg):
-    sys.stderr.write('%s\n' % msg)
-    sys.exit()
+PERL_OUT_FILES = ['stats.txt', 'sorted_contigs.fa', 'histogram_bins.dat.png',
+                  'summed_contig_lengths.dat.png', 'histogram_bins.dat',
+                  'summed_contig_lengths.dat']


-def __main__():
+def init_parser():
+    """Create argument parser and return parser obj."""
+    parser = argparse.ArgumentParser(description="usage: %prog [options]")

-    # Parse Command Line
+    parser.add_argument(
+        "-d",
+        "--working-dir",
+        dest="working_dir",
+        required=True)

-    working_dir = sys.argv[2]
-    type = sys.argv[3]
-    bucket = sys.argv[4]
-    input = sys.argv[5]
-    stats = sys.argv[6]
-    sortedcontigs = sys.argv[7]
-    histogrampng = sys.argv[8]
-    summedcontigspng = sys.argv[9]
-    histogramdata = sys.argv[10]
-    summedcontigdata = sys.argv[11]
-    try:  # for test - needs this done
-        os.makedirs(working_dir)
-    except Exception, e:
-        stop_err('Error running assembly_stats_txt.py ' + str(e))
+    parser.add_argument(
+        "-t",
+        "--type",
+        dest="file_type",
+        required=True)

-    cmdline = '%s/fasta_summary.pl -i %s -t %s %s -o %s > /dev/null' \
-        % (os.path.dirname(sys.argv[0]), input, type, bucket,
-           working_dir)
-    try:
-        proc = subprocess.Popen(args=cmdline, shell=True,
-                                stderr=subprocess.PIPE)
-        returncode = proc.wait()
+    parser.add_argument(
+        "-b",
+        "--bucket",
+        dest="bucket",
+        action='store_true')

-        # get stderr, allowing for case where it's very large
+    parser.add_argument(
+        "-i",
+        "--input",
+        dest="input",
+        required=True)

-        stderr = ''
-        buffsize = 1048576
-        try:
-            while True:
-                stderr += proc.stderr.read(buffsize)
-                if not stderr or len(stderr) % buffsize != 0:
-                    break
-        except OverflowError:
-            pass
-        if returncode != 0:
-            raise Exception
-    except Exception, e:
-        stop_err('Error running assembly_stats.py ' + str(e))
+    parser.add_argument(
+        "-s",
+        "--stats",
+        dest="stats",
+        required=True)

-    stats_path = os.path.join(working_dir, 'stats.txt')
-    sorted_contigs_path = os.path.join(working_dir, 'sorted_contigs.fa')
-    histogram_png_path = os.path.join(working_dir,
-                                      'histogram_bins.dat.png')
-    summed_contigs_path = os.path.join(working_dir,
-                                       'summed_contig_lengths.dat.png')
-    histogram_data_path = os.path.join(working_dir, 'histogram_bins.dat')
-    summed_contigs_data_path = os.path.join(working_dir,
-                                            'summed_contig_lengths.dat')
+    parser.add_argument(
+        "-sc",
+        "--sorted-contigs",
+        dest="sorted_contigs",
+        required=True)
+
+    parser.add_argument(
+        "-hpng",
+        "--histogram-png",
+        dest="histogram_png",
+        required=True)

-    out = open(stats, 'w')
-    for line in open(stats_path):
-        out.write('%s' % line)
-    out.close()
-
-    out = open(sortedcontigs, 'w')
-    for line in open(sorted_contigs_path):
-        out.write('%s' % line)
-    out.close()
-
-    out = open(histogrampng, 'w')
-    for line in open(histogram_png_path):
-        out.write('%s' % line)
-    out.close()
+    parser.add_argument(
+        "-spng",
+        "--summed-contigs-png",
+        dest="summed_contigs_png",
+        required=True)

-    out = open(summedcontigspng, 'w')
-    for line in open(summed_contigs_path):
-        out.write('%s' % line)
-    out.close()
+    parser.add_argument(
+        "-hd",
+        "--histogram-data",
+        dest="histogram_data",
+        required=True)

-    out = open(histogramdata, 'w')
-    for line in open(histogram_data_path):
-        out.write('%s' % line)
-    out.close()
+    parser.add_argument(
+        "-scd",
+        "--summed-config-data",
+        dest="summed_contig_data",
+        required=True)

-    out = open(summedcontigdata, 'w')
-    for line in open(summed_contigs_data_path):
-        out.write('%s' % line)
-    out.close()
+    return parser


-if __name__ == '__main__':
-    __main__()
+def exec_fasta_summary(input_data, file_type, bucket, working_dir):
+    """Execute fasta_summary.pl script with user arguments."""
+    script_dir = Path(__file__).parent.absolute()
+
+    if bucket:
+        bucket_arg = '-b'
+    else:
+        bucket_arg = ''
+
+    cli_command = \
+        '{}/fasta_summary.pl -i {} -t {} {} -o {} > /dev/null'.format(
+            script_dir, input_data, file_type, bucket_arg, working_dir)
+
+    try:
+        subprocess.check_output(
+            cli_command,
+            stderr=subprocess.STDOUT,
+            shell=True,
+            universal_newlines=True)
+    except subprocess.CalledProcessError as exc:
+        raise RuntimeError('Error running assembly_stats.py!\n'
+                           'Return Code: {}\nOutput: {}'.format(
+                            exc.returncode, exc.output))
+
+
+def main():
+    """This is where the magic happens. (not really)
+
+    1. Gets command line arguments.
+    2. Grabs the user's desired parameters for running the perl script.
+    3. Ensures the directories are in place.
+    4. Executes fasta_summary.pl
+    5. Move the out files from the perl script to the desired
+    location the user specified.
+
+    """
+    parser = init_parser()
+    args = parser.parse_args()
+
+    working_dir = args.working_dir
+
+    out_file_names = [args.stats, args.sorted_contigs, args.histogram_png,
+                      args.summed_contigs_png, args.histogram_data,
+                      args.summed_contig_data]
+
+    # Ensure working directory is created.
+    Path(working_dir).mkdir(parents=True, exist_ok=True)
+
+    # Execute Perl Script
+    exec_fasta_summary(args.input, args.file_type, args.bucket, working_dir)
+
+    # Rename out files to desired file names
+    for perl_out_file, dest_file in zip(PERL_OUT_FILES, out_file_names):
+        shutil.move(os.path.join(working_dir, perl_out_file),
+                    dest_file)
+
+
+if __name__ == "__main__":
+    main()
--- a/assembly_stats_txt.xml	Tue Nov 07 12:28:31 2017 -0500
+++ b/assembly_stats_txt.xml	Fri May 29 13:51:50 2020 -0400
@@ -1,23 +1,27 @@
-<tool id="assemblystats" name="assemblystats" version="1.0.1">
+<tool id="assemblystats" name="assemblystats" version="1.1.0">
 	<description>Summarise an assembly (e.g. N50 metrics)</description>
 	<requirements>
-		<requirement type="package" version="1.6.924">perl-bioperl</requirement>
-		<requirement type="package" version="5.0.4">gnuplot</requirement>
+		<requirement type="package" version="1.7.2">perl-bioperl</requirement>
+		<requirement type="package" version="5.2.7">gnuplot</requirement>
+		<requirement type="package" version="3.7.6">python</requirement>
 	</requirements>
 	<command detect_errors="exit_code"><![CDATA[
-		python $__tool_directory__/assembly_stats_txt.py
+		python3 $__tool_directory__/assembly_stats_txt.py

-		'$type'
-		'$stats.extra_files_path'
-		'$type'
-		'$bucket'
-		'$input'
-		'$stats'
-		'$sortedcontigs'
-		'$histogrampng'
-		'$summedcontigspng'
-		'$histogramdata'
-		'$summedcontigdata'
+		-d '$stats.extra_files_path'
+		-t '$type'
+		-i '$input'
+		-s '$stats'
+		-sc '$sortedcontigs'
+		-hpng '$histogrampng'
+		-spng '$summedcontigspng'
+		-hd '$histogramdata'
+		-scd '$summedcontigdata'
+
+		#if $bucket
+			-b
+		#end if
+
 		]]></command>
 		<inputs>
 			<param label="Type of read" name="type" type="select" help="Is this from an genomic (contig) or transcriptomic assembly (isotig) or are these raw reads (read)">
@@ -25,7 +29,7 @@
 				<option value="isotig">Isotig (if from transcriptomic assembly)</option>
 				<option value="read">Raw reads from sequencer in FASTA format (useful for 454 data)</option>
 			</param>
-			<param name="bucket" type="boolean" label="Output histogram with bin sizes=1" truevalue="-b" falsevalue="" help="Use this to specify whether or not bin sizes of 1 should be used when plotting histograms"/>
+			<param name="bucket" type="boolean" label="Output histogram with bin sizes=1" truevalue="true" falsevalue="false" help="Use this to specify whether or not bin sizes of 1 should be used when plotting histograms"/>
 			<param format="fasta" name="input" type="data" label="Source file in FASTA format"/>
 			<param name = "all_outputs" type ="boolean" checked="false" label="Return all output files" help="If checked, all output files will be displayed. If not checked, only the file 'Assembly Statistics' will be provided." />
 		</inputs>