diff seal-galaxy-cc1b1911/seal/recab_table_galaxy.py @ 0:244073d9abc1 draft default tip

Uploaded
author crs4
date Wed, 15 Oct 2014 09:41:10 -0400
parents
children
line wrap: on
line diff
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/seal-galaxy-cc1b1911/seal/recab_table_galaxy.py	Wed Oct 15 09:41:10 2014 -0400
@@ -0,0 +1,116 @@
+#!/usr/bin/env python
+
+# Copyright (C) 2011-2014 CRS4.
+#
+# This file is part of Seal.
+#
+# Seal is free software: you can redistribute it and/or modify it
+# under the terms of the GNU General Public License as published by the Free
+# Software Foundation, either version 3 of the License, or (at your option)
+# any later version.
+#
+# Seal is distributed in the hope that it will be useful, but
+# WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+# or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+# for more details.
+#
+# You should have received a copy of the GNU General Public License along
+# with Seal.  If not, see <http://www.gnu.org/licenses/>.
+
+
+
+"""
+Calls the Seal RecabTable tool.  Then, it calls recab_table_fetch to
+concatenate all the partial tables and create a single csv file.
+"""
+
+
+# parameters:
+#    INPUT_DATA
+#    OUTPUT
+#    VCF
+#    NUM_REDUCERS
+#    [OTHER]
+
+import os
+import sys
+
+import hadoop_galaxy.pathset as pathset
+import subprocess
+import tempfile
+import pydoop.hdfs as phdfs
+
+# XXX: add --append-python-path to the possible arguments?
+
+def usage_error(msg=None):
+  """
+  Report a command-line usage problem and terminate.
+
+  If msg is given (and non-empty) it is printed to stderr first; the
+  one-line synopsis, prefixed by the script's base name, always follows.
+  The process then exits with status 1.
+  """
+  err_stream = sys.stderr
+  if msg:
+    print >> err_stream, msg
+  synopsis = "INPUT_DATA OUTPUT VCF NUM_REDUCERS [OTHER]"
+  print >> err_stream, os.path.basename(sys.argv[0]), synopsis
+  sys.exit(1)
+
+
+def run_recab(input_path, output_path, vcf, num_red, other_args):
+  """
+  Launch the Seal recab_table job through the hadoop_galaxy wrapper.
+
+  Parameters:
+    input_path   value passed to hadoop_galaxy as --input
+    output_path  value passed to hadoop_galaxy as --output (main() passes
+                 the name of a temporary pathset file here)
+    vcf          value passed to recab_table as --vcf-file
+    num_red      value passed as --num-reducers; converted with str() so
+                 that an integer is accepted as well as a string
+    other_args   sequence of extra arguments appended verbatim to the
+                 command line; may be empty or None
+
+  Raises subprocess.CalledProcessError if the command exits with a
+  non-zero status.
+  """
+  cmd = [
+    'hadoop_galaxy',
+    '--input', input_path,
+    '--output', output_path,
+    '--executable', 'seal',
+    'recab_table',
+    '--vcf-file', vcf,
+    # every element of an argv list must be a string
+    '--num-reducers', str(num_red),
+  ]
+
+  if other_args:
+    cmd.extend(other_args)
+
+  # now execute the hadoop job
+  subprocess.check_call(cmd)
+
+def collect_table(pset, output_path):
+  """
+  Fetch the job results into the final output file.
+
+  Runs `seal recab_table_fetch` with every path returned by
+  pset.get_paths() followed by output_path as the destination.
+
+  Any existing file at output_path is removed first, because Galaxy
+  creates the output file in advance and recab_table_fetch refuses to
+  overwrite it.  A missing file is not an error.
+
+  Raises OSError if the existing file cannot be removed for a reason
+  other than it not existing, and subprocess.CalledProcessError if
+  recab_table_fetch exits with a non-zero status.
+  """
+  # imported locally so this function does not rely on the module's import list
+  import errno
+
+  cmd = ['seal', 'recab_table_fetch']
+  cmd.extend(pset.get_paths())
+  cmd.append(output_path)
+  try:
+    # remove the file that galaxy creates.  recab_table_fetch refuses to
+    # overwrite it
+    os.unlink(output_path)
+  except OSError as e:
+    # os.unlink reports failures as OSError (not IOError).  Only "no such
+    # file" is harmless here; anything else (e.g. permissions) is re-raised.
+    if e.errno != errno.ENOENT:
+      raise
+  subprocess.check_call(cmd)
+
+def cleanup(out_pathset):
+  """
+  Best-effort removal of the job's output paths.
+
+  Each path obtained by iterating out_pathset is announced on stderr and
+  then removed with pydoop.hdfs.rmr.  A StandardError raised while
+  handling one path is reported on stderr and does not prevent the
+  remaining paths from being processed.
+  """
+  for target in out_pathset:
+    try:
+      print >> sys.stderr, "Deleting output path", target
+      phdfs.rmr(target)
+    except StandardError as err:
+      print >> sys.stderr, "Error!", str(err)
+
+def main(args):
+  """
+  Run recab_table, merge its partial tables into one file, then clean up.
+
+  args is the command line without the program name:
+    INPUT_DATA OUTPUT VCF NUM_REDUCERS [OTHER...]
+  Everything after the fourth argument is forwarded unchanged to the
+  recab_table command.  With fewer than four arguments the usage message
+  is printed and the process exits with status 1.
+
+  The job's output paths are deleted whether or not collecting the table
+  succeeds.  If the job fails before its output pathset can be read there
+  is nothing known to delete, and the original error propagates.
+  """
+  # OTHER is optional, so only the first four arguments are mandatory
+  if len(args) < 4:
+    usage_error()
+
+  input_data, final_output, vcf, num_reducers = args[:4]
+  other = args[4:]
+
+  # Create a temporary pathset to reference the recab_table
+  # output directory.  'r+b' is the valid spelling of "read/write, binary";
+  # the previous 'rwb' is not a portable mode string.
+  with tempfile.NamedTemporaryFile(mode='r+b') as tmp_pathset_file:
+    # stays None until the pathset has been read, so that the finally
+    # clause never touches an unbound name and masks the real error
+    out_paths = None
+    try:
+      run_recab(input_data, tmp_pathset_file.name, vcf, num_reducers, other)
+      # the file is handed to run_recab by name (presumably filled in by the
+      # job wrapper); rewind our handle before parsing it
+      tmp_pathset_file.seek(0)
+      out_paths = pathset.FilePathset.from_file(tmp_pathset_file)
+      collect_table(out_paths, final_output)
+    finally:
+      if out_paths is not None:
+        cleanup(out_paths)
+
+# Script entry point: hand the command-line arguments (program name
+# excluded) to main().
+if __name__ == "__main__":
+  main(sys.argv[1:])
+
+# vim: et ai ts=2 sw=2