Mercurial > repos > crs4 > sequel
view sequel_wrapper.py @ 0:58e1eb37fddc draft
Uploaded
author | crs4 |
---|---|
date | Tue, 15 Oct 2013 11:15:28 -0400 |
parents | |
children |
line wrap: on
line source
# -*- coding: utf-8 -*- """ SEQuel version 0.2 (andrea.pinna@crs4.it) """ import optparse import os import shutil import subprocess import sys import tempfile import uuid def external_insert_size(prep_file): """ Retrieve average external insert-size from pre-processing log """ with open(prep_file, 'r') as f: for line in f: if line[0:30] == '\tAverage external insert-size:': ext_int_size = line.split(':')[1][1:-1] break else: sys.exit('Average external insert-size not found in %s' % prep_file) return ext_int_size def __main__(): # load arguments print 'Parsing SEQuel input options...' parser = optparse.OptionParser() parser.add_option('--sequel_jar_path', dest='sequel_jar_path', help='') parser.add_option('--read1', dest='read1', help='') parser.add_option('--read2', dest='read2', help='') parser.add_option('--contigs', dest='contigs', help='') parser.add_option('--bases_length', dest='bases_length', type='int', help='') parser.add_option('-t', dest='n_threads', type='int', help='') parser.add_option('-p', dest='max_threads', type='int', help='') parser.add_option('-u', dest='min_threads', type='int', help='') parser.add_option('--kmer_size', dest='kmer_size', type='int', help='') parser.add_option('--max_positional_error', dest='max_positional_error', type='int', help='') parser.add_option('--min_fraction', dest='min_fraction', type='float', help='') parser.add_option('--min_aln_length', dest='min_aln_length', type='int', help='') parser.add_option('--min_avg_coverage', dest='min_avg_coverage', type='float', help='') parser.add_option('--discard_kmers', dest='discard_kmers', type='int', help='') parser.add_option('--discard_positional', dest='discard_positional', type='int', help='') parser.add_option('--min_aln_score', dest='min_aln_score', type='int', help='') parser.add_option('--single_cell_mode', action='store_true', dest='single_cell_mode', help='') parser.add_option('--report_changes', action='store_true', dest='report_changes', help='') parser.add_option('--extend_contig', action='store_true', dest='extend_contig', help='') parser.add_option('--reference_genome', dest='reference_genome', help='') parser.add_option('--contigs_refined', dest='contigs_refined', help='') parser.add_option('--logprep', dest='logprep', help='') parser.add_option('--logseq', dest='logseq', help='') parser.add_option('--logfile_prep', dest='logfile_prep', help='') parser.add_option('--logfile_seq', dest='logfile_seq', help='') (options, args) = parser.parse_args() if len(args) > 0: parser.error('Wrong number of arguments') prep_directory = os.path.join(tempfile.gettempdir(), str(uuid.uuid4())) # prep.pl dies if the directory already exists, so just define a unique name seq_directory = os.path.join(tempfile.gettempdir(), str(uuid.uuid4())) # SEQuel.jar would work in another directory (with '_1' suffix) if the directory already exists, so just define a unique name # Build SEQuel (pre-processing) command to be executed # basic preprocessing prep_input = "-r1 %s -r2 %s -c %s" % (options.read1, options.read2, options.contigs) bases_length = "-l %d" % (options.bases_length) if options.bases_length is not None else '' n_threads = "-t %d" % (options.n_threads) if options.n_threads is not None else '' cmd_prep_dir = "-o %s" % (prep_directory) cmd_seq_dir = "-o %s" % (seq_directory) # -i INT (external insert size of paired-end reads (from prep.log) # -p INT max_threads = "-p %d" % (options.max_threads) if options.max_threads is not None else '' # -u INT min_threads = "-u %d" % (options.min_threads) if options.min_threads is not None else '' # -A DIR input_directory = "-A %s" % (prep_directory) # -k INT kmer_size = "-k %d" % (options.kmer_size) if options.kmer_size is not None else '' # -d INT max_positional_error = "-d %d" % (options.max_positional_error) if options.max_positional_error is not None else '' # -f FLOAT min_fraction = "-f %s" % (options.min_fraction) if options.min_fraction is not None else '' # -l INT min_aln_length = "-l %d" % (options.min_aln_length) if options.min_aln_length is not None else '' # -v FLOAT min_avg_coverage = "-v %s" % (options.min_avg_coverage) if options.min_avg_coverage is not None else '' # -m INT discard_kmers = "-m %d" % (options.discard_kmers) if options.discard_kmers is not None else '' # -n INT discard_positional = "-n %d" % (options.discard_positional) if options.discard_positional is not None else '' # -q INT min_aln_score = "-q %d" % (options.min_aln_score) if options.min_aln_score is not None else '' # -s single_cell_mode = '-s' if options.single_cell_mode else '' # -r report_changes = '-r' if options.report_changes else '' # -e extend_contig = '-e' if options.extend_contig else '' # -g FILE reference_genome = "-g %s" % (options.reference_genome) if options.reference_genome else '' # contigs_refined.fa contigs_refined = options.contigs_refined # logprep & logseq logprep = options.logprep logseq = options.logseq # x.refined-fa # x.stdout # logfile logfile_prep = options.logfile_prep logfile_seq = options.logfile_seq # Build SEQuel (pre-processing) command cmd1 = ' '.join(['prep.pl', prep_input, bases_length, n_threads, cmd_prep_dir]) print '\nSEQuel (pre-processing) command to be executed:\n %s' % (cmd1) # Execution of SEQuel (pre-processing) print 'Executing SEQuel (pre-processing)...' logfile_prep_file = open(logfile_prep, 'w') if logfile_prep else sys.stdout try: try: subprocess.check_call(cmd1, stdout=logfile_prep_file, stderr=subprocess.STDOUT, shell=True) finally: if logfile_prep_file != sys.stdout: logfile_prep_file.close() print 'SEQuel (pre-processing) executed!' prep_file = os.path.join(prep_directory, 'prep.log') ext_ins_size = external_insert_size(prep_file) # Build SEQuel command cmd2 = 'java -Xmx12g -jar %s %s -i %s %s %s %s %s %s %s %s %s %s %s %s %s %s %s %s' % (os.path.join(options.sequel_jar_path, 'SEQuel.jar'), input_directory, ext_ins_size, max_threads, min_threads, kmer_size, max_positional_error, min_fraction, min_aln_length, min_avg_coverage, discard_kmers, discard_positional, min_aln_score, single_cell_mode, report_changes, extend_contig, reference_genome, cmd_seq_dir) print '\nSEQuel command to be executed:\n %s' % (cmd2) # Execution of SEQuel print 'Executing SEQuel...' logfile_seq_file = open(logfile_seq, 'w') if logfile_seq else sys.stdout try: subprocess.check_call(cmd2, stdout=logfile_seq_file, stderr=subprocess.STDOUT, shell=True) except subprocess.CalledProcessError, e: if e.retcode == 24: sys.exit("Too many contigs in the assembly!") else: raise finally: if logfile_seq_file != sys.stdout: logfile_seq_file.close() print 'SEQuel executed!' shutil.move(prep_file, logprep) shutil.move(os.path.join(seq_directory, 'Log'), logseq) shutil.move(os.path.join(seq_directory, 'contigs_refined.fa'), contigs_refined) finally: shutil.rmtree(prep_directory, ignore_errors=True) shutil.rmtree(seq_directory, ignore_errors=True) if __name__ == "__main__": __main__()