annotate sequel_wrapper.py @ 0:58e1eb37fddc draft

Uploaded
author crs4
date Tue, 15 Oct 2013 11:15:28 -0400
parents
children
Ignore whitespace changes - Everywhere: Within whitespace: At end of lines:
rev   line source
0
58e1eb37fddc Uploaded
crs4
parents:
diff changeset
1 # -*- coding: utf-8 -*-
58e1eb37fddc Uploaded
crs4
parents:
diff changeset
2 """
58e1eb37fddc Uploaded
crs4
parents:
diff changeset
3 SEQuel
58e1eb37fddc Uploaded
crs4
parents:
diff changeset
4 version 0.2 (andrea.pinna@crs4.it)
58e1eb37fddc Uploaded
crs4
parents:
diff changeset
5 """
58e1eb37fddc Uploaded
crs4
parents:
diff changeset
6
58e1eb37fddc Uploaded
crs4
parents:
diff changeset
7 import optparse
58e1eb37fddc Uploaded
crs4
parents:
diff changeset
8 import os
58e1eb37fddc Uploaded
crs4
parents:
diff changeset
9 import shutil
58e1eb37fddc Uploaded
crs4
parents:
diff changeset
10 import subprocess
58e1eb37fddc Uploaded
crs4
parents:
diff changeset
11 import sys
58e1eb37fddc Uploaded
crs4
parents:
diff changeset
12 import tempfile
58e1eb37fddc Uploaded
crs4
parents:
diff changeset
13 import uuid
58e1eb37fddc Uploaded
crs4
parents:
diff changeset
14
58e1eb37fddc Uploaded
crs4
parents:
diff changeset
15 def external_insert_size(prep_file):
58e1eb37fddc Uploaded
crs4
parents:
diff changeset
16 """ Retrieve average external insert-size from pre-processing log """
58e1eb37fddc Uploaded
crs4
parents:
diff changeset
17 with open(prep_file, 'r') as f:
58e1eb37fddc Uploaded
crs4
parents:
diff changeset
18 for line in f:
58e1eb37fddc Uploaded
crs4
parents:
diff changeset
19 if line[0:30] == '\tAverage external insert-size:':
58e1eb37fddc Uploaded
crs4
parents:
diff changeset
20 ext_int_size = line.split(':')[1][1:-1]
58e1eb37fddc Uploaded
crs4
parents:
diff changeset
21 break
58e1eb37fddc Uploaded
crs4
parents:
diff changeset
22 else:
58e1eb37fddc Uploaded
crs4
parents:
diff changeset
23 sys.exit('Average external insert-size not found in %s' % prep_file)
58e1eb37fddc Uploaded
crs4
parents:
diff changeset
24 return ext_int_size
58e1eb37fddc Uploaded
crs4
parents:
diff changeset
25
58e1eb37fddc Uploaded
crs4
parents:
diff changeset
26
58e1eb37fddc Uploaded
crs4
parents:
diff changeset
27 def __main__():
58e1eb37fddc Uploaded
crs4
parents:
diff changeset
28 # load arguments
58e1eb37fddc Uploaded
crs4
parents:
diff changeset
29 print 'Parsing SEQuel input options...'
58e1eb37fddc Uploaded
crs4
parents:
diff changeset
30 parser = optparse.OptionParser()
58e1eb37fddc Uploaded
crs4
parents:
diff changeset
31 parser.add_option('--sequel_jar_path', dest='sequel_jar_path', help='')
58e1eb37fddc Uploaded
crs4
parents:
diff changeset
32 parser.add_option('--read1', dest='read1', help='')
58e1eb37fddc Uploaded
crs4
parents:
diff changeset
33 parser.add_option('--read2', dest='read2', help='')
58e1eb37fddc Uploaded
crs4
parents:
diff changeset
34 parser.add_option('--contigs', dest='contigs', help='')
58e1eb37fddc Uploaded
crs4
parents:
diff changeset
35 parser.add_option('--bases_length', dest='bases_length', type='int', help='')
58e1eb37fddc Uploaded
crs4
parents:
diff changeset
36 parser.add_option('-t', dest='n_threads', type='int', help='')
58e1eb37fddc Uploaded
crs4
parents:
diff changeset
37 parser.add_option('-p', dest='max_threads', type='int', help='')
58e1eb37fddc Uploaded
crs4
parents:
diff changeset
38 parser.add_option('-u', dest='min_threads', type='int', help='')
58e1eb37fddc Uploaded
crs4
parents:
diff changeset
39 parser.add_option('--kmer_size', dest='kmer_size', type='int', help='')
58e1eb37fddc Uploaded
crs4
parents:
diff changeset
40 parser.add_option('--max_positional_error', dest='max_positional_error', type='int', help='')
58e1eb37fddc Uploaded
crs4
parents:
diff changeset
41 parser.add_option('--min_fraction', dest='min_fraction', type='float', help='')
58e1eb37fddc Uploaded
crs4
parents:
diff changeset
42 parser.add_option('--min_aln_length', dest='min_aln_length', type='int', help='')
58e1eb37fddc Uploaded
crs4
parents:
diff changeset
43 parser.add_option('--min_avg_coverage', dest='min_avg_coverage', type='float', help='')
58e1eb37fddc Uploaded
crs4
parents:
diff changeset
44 parser.add_option('--discard_kmers', dest='discard_kmers', type='int', help='')
58e1eb37fddc Uploaded
crs4
parents:
diff changeset
45 parser.add_option('--discard_positional', dest='discard_positional', type='int', help='')
58e1eb37fddc Uploaded
crs4
parents:
diff changeset
46 parser.add_option('--min_aln_score', dest='min_aln_score', type='int', help='')
58e1eb37fddc Uploaded
crs4
parents:
diff changeset
47 parser.add_option('--single_cell_mode', action='store_true', dest='single_cell_mode', help='')
58e1eb37fddc Uploaded
crs4
parents:
diff changeset
48 parser.add_option('--report_changes', action='store_true', dest='report_changes', help='')
58e1eb37fddc Uploaded
crs4
parents:
diff changeset
49 parser.add_option('--extend_contig', action='store_true', dest='extend_contig', help='')
58e1eb37fddc Uploaded
crs4
parents:
diff changeset
50 parser.add_option('--reference_genome', dest='reference_genome', help='')
58e1eb37fddc Uploaded
crs4
parents:
diff changeset
51 parser.add_option('--contigs_refined', dest='contigs_refined', help='')
58e1eb37fddc Uploaded
crs4
parents:
diff changeset
52 parser.add_option('--logprep', dest='logprep', help='')
58e1eb37fddc Uploaded
crs4
parents:
diff changeset
53 parser.add_option('--logseq', dest='logseq', help='')
58e1eb37fddc Uploaded
crs4
parents:
diff changeset
54 parser.add_option('--logfile_prep', dest='logfile_prep', help='')
58e1eb37fddc Uploaded
crs4
parents:
diff changeset
55 parser.add_option('--logfile_seq', dest='logfile_seq', help='')
58e1eb37fddc Uploaded
crs4
parents:
diff changeset
56 (options, args) = parser.parse_args()
58e1eb37fddc Uploaded
crs4
parents:
diff changeset
57 if len(args) > 0:
58e1eb37fddc Uploaded
crs4
parents:
diff changeset
58 parser.error('Wrong number of arguments')
58e1eb37fddc Uploaded
crs4
parents:
diff changeset
59
58e1eb37fddc Uploaded
crs4
parents:
diff changeset
60 prep_directory = os.path.join(tempfile.gettempdir(), str(uuid.uuid4())) # prep.pl dies if the directory already exists, so just define a unique name
58e1eb37fddc Uploaded
crs4
parents:
diff changeset
61 seq_directory = os.path.join(tempfile.gettempdir(), str(uuid.uuid4())) # SEQuel.jar would work in another directory (with '_1' suffix) if the directory already exists, so just define a unique name
58e1eb37fddc Uploaded
crs4
parents:
diff changeset
62
58e1eb37fddc Uploaded
crs4
parents:
diff changeset
63 # Build SEQuel (pre-processing) command to be executed
58e1eb37fddc Uploaded
crs4
parents:
diff changeset
64 # basic preprocessing
58e1eb37fddc Uploaded
crs4
parents:
diff changeset
65 prep_input = "-r1 %s -r2 %s -c %s" % (options.read1, options.read2, options.contigs)
58e1eb37fddc Uploaded
crs4
parents:
diff changeset
66 bases_length = "-l %d" % (options.bases_length) if options.bases_length is not None else ''
58e1eb37fddc Uploaded
crs4
parents:
diff changeset
67 n_threads = "-t %d" % (options.n_threads) if options.n_threads is not None else ''
58e1eb37fddc Uploaded
crs4
parents:
diff changeset
68 cmd_prep_dir = "-o %s" % (prep_directory)
58e1eb37fddc Uploaded
crs4
parents:
diff changeset
69 cmd_seq_dir = "-o %s" % (seq_directory)
58e1eb37fddc Uploaded
crs4
parents:
diff changeset
70 # -i INT (external insert size of paired-end reads (from prep.log)
58e1eb37fddc Uploaded
crs4
parents:
diff changeset
71 # -p INT
58e1eb37fddc Uploaded
crs4
parents:
diff changeset
72 max_threads = "-p %d" % (options.max_threads) if options.max_threads is not None else ''
58e1eb37fddc Uploaded
crs4
parents:
diff changeset
73 # -u INT
58e1eb37fddc Uploaded
crs4
parents:
diff changeset
74 min_threads = "-u %d" % (options.min_threads) if options.min_threads is not None else ''
58e1eb37fddc Uploaded
crs4
parents:
diff changeset
75 # -A DIR
58e1eb37fddc Uploaded
crs4
parents:
diff changeset
76 input_directory = "-A %s" % (prep_directory)
58e1eb37fddc Uploaded
crs4
parents:
diff changeset
77 # -k INT
58e1eb37fddc Uploaded
crs4
parents:
diff changeset
78 kmer_size = "-k %d" % (options.kmer_size) if options.kmer_size is not None else ''
58e1eb37fddc Uploaded
crs4
parents:
diff changeset
79 # -d INT
58e1eb37fddc Uploaded
crs4
parents:
diff changeset
80 max_positional_error = "-d %d" % (options.max_positional_error) if options.max_positional_error is not None else ''
58e1eb37fddc Uploaded
crs4
parents:
diff changeset
81 # -f FLOAT
58e1eb37fddc Uploaded
crs4
parents:
diff changeset
82 min_fraction = "-f %s" % (options.min_fraction) if options.min_fraction is not None else ''
58e1eb37fddc Uploaded
crs4
parents:
diff changeset
83 # -l INT
58e1eb37fddc Uploaded
crs4
parents:
diff changeset
84 min_aln_length = "-l %d" % (options.min_aln_length) if options.min_aln_length is not None else ''
58e1eb37fddc Uploaded
crs4
parents:
diff changeset
85 # -v FLOAT
58e1eb37fddc Uploaded
crs4
parents:
diff changeset
86 min_avg_coverage = "-v %s" % (options.min_avg_coverage) if options.min_avg_coverage is not None else ''
58e1eb37fddc Uploaded
crs4
parents:
diff changeset
87 # -m INT
58e1eb37fddc Uploaded
crs4
parents:
diff changeset
88 discard_kmers = "-m %d" % (options.discard_kmers) if options.discard_kmers is not None else ''
58e1eb37fddc Uploaded
crs4
parents:
diff changeset
89 # -n INT
58e1eb37fddc Uploaded
crs4
parents:
diff changeset
90 discard_positional = "-n %d" % (options.discard_positional) if options.discard_positional is not None else ''
58e1eb37fddc Uploaded
crs4
parents:
diff changeset
91 # -q INT
58e1eb37fddc Uploaded
crs4
parents:
diff changeset
92 min_aln_score = "-q %d" % (options.min_aln_score) if options.min_aln_score is not None else ''
58e1eb37fddc Uploaded
crs4
parents:
diff changeset
93 # -s
58e1eb37fddc Uploaded
crs4
parents:
diff changeset
94 single_cell_mode = '-s' if options.single_cell_mode else ''
58e1eb37fddc Uploaded
crs4
parents:
diff changeset
95 # -r
58e1eb37fddc Uploaded
crs4
parents:
diff changeset
96 report_changes = '-r' if options.report_changes else ''
58e1eb37fddc Uploaded
crs4
parents:
diff changeset
97 # -e
58e1eb37fddc Uploaded
crs4
parents:
diff changeset
98 extend_contig = '-e' if options.extend_contig else ''
58e1eb37fddc Uploaded
crs4
parents:
diff changeset
99 # -g FILE
58e1eb37fddc Uploaded
crs4
parents:
diff changeset
100 reference_genome = "-g %s" % (options.reference_genome) if options.reference_genome else ''
58e1eb37fddc Uploaded
crs4
parents:
diff changeset
101 # contigs_refined.fa
58e1eb37fddc Uploaded
crs4
parents:
diff changeset
102 contigs_refined = options.contigs_refined
58e1eb37fddc Uploaded
crs4
parents:
diff changeset
103 # logprep & logseq
58e1eb37fddc Uploaded
crs4
parents:
diff changeset
104 logprep = options.logprep
58e1eb37fddc Uploaded
crs4
parents:
diff changeset
105 logseq = options.logseq
58e1eb37fddc Uploaded
crs4
parents:
diff changeset
106 # x.refined-fa
58e1eb37fddc Uploaded
crs4
parents:
diff changeset
107 # x.stdout
58e1eb37fddc Uploaded
crs4
parents:
diff changeset
108 # logfile
58e1eb37fddc Uploaded
crs4
parents:
diff changeset
109 logfile_prep = options.logfile_prep
58e1eb37fddc Uploaded
crs4
parents:
diff changeset
110 logfile_seq = options.logfile_seq
58e1eb37fddc Uploaded
crs4
parents:
diff changeset
111
58e1eb37fddc Uploaded
crs4
parents:
diff changeset
112 # Build SEQuel (pre-processing) command
58e1eb37fddc Uploaded
crs4
parents:
diff changeset
113 cmd1 = ' '.join(['prep.pl', prep_input, bases_length, n_threads, cmd_prep_dir])
58e1eb37fddc Uploaded
crs4
parents:
diff changeset
114 print '\nSEQuel (pre-processing) command to be executed:\n %s' % (cmd1)
58e1eb37fddc Uploaded
crs4
parents:
diff changeset
115
58e1eb37fddc Uploaded
crs4
parents:
diff changeset
116 # Execution of SEQuel (pre-processing)
58e1eb37fddc Uploaded
crs4
parents:
diff changeset
117 print 'Executing SEQuel (pre-processing)...'
58e1eb37fddc Uploaded
crs4
parents:
diff changeset
118 logfile_prep_file = open(logfile_prep, 'w') if logfile_prep else sys.stdout
58e1eb37fddc Uploaded
crs4
parents:
diff changeset
119 try:
58e1eb37fddc Uploaded
crs4
parents:
diff changeset
120 try:
58e1eb37fddc Uploaded
crs4
parents:
diff changeset
121 subprocess.check_call(cmd1, stdout=logfile_prep_file, stderr=subprocess.STDOUT, shell=True)
58e1eb37fddc Uploaded
crs4
parents:
diff changeset
122 finally:
58e1eb37fddc Uploaded
crs4
parents:
diff changeset
123 if logfile_prep_file != sys.stdout:
58e1eb37fddc Uploaded
crs4
parents:
diff changeset
124 logfile_prep_file.close()
58e1eb37fddc Uploaded
crs4
parents:
diff changeset
125 print 'SEQuel (pre-processing) executed!'
58e1eb37fddc Uploaded
crs4
parents:
diff changeset
126
58e1eb37fddc Uploaded
crs4
parents:
diff changeset
127 prep_file = os.path.join(prep_directory, 'prep.log')
58e1eb37fddc Uploaded
crs4
parents:
diff changeset
128 ext_ins_size = external_insert_size(prep_file)
58e1eb37fddc Uploaded
crs4
parents:
diff changeset
129
58e1eb37fddc Uploaded
crs4
parents:
diff changeset
130 # Build SEQuel command
58e1eb37fddc Uploaded
crs4
parents:
diff changeset
131 cmd2 = 'java -Xmx12g -jar %s %s -i %s %s %s %s %s %s %s %s %s %s %s %s %s %s %s %s' % (os.path.join(options.sequel_jar_path, 'SEQuel.jar'), input_directory, ext_ins_size, max_threads, min_threads, kmer_size, max_positional_error, min_fraction, min_aln_length, min_avg_coverage, discard_kmers, discard_positional, min_aln_score, single_cell_mode, report_changes, extend_contig, reference_genome, cmd_seq_dir)
58e1eb37fddc Uploaded
crs4
parents:
diff changeset
132 print '\nSEQuel command to be executed:\n %s' % (cmd2)
58e1eb37fddc Uploaded
crs4
parents:
diff changeset
133
58e1eb37fddc Uploaded
crs4
parents:
diff changeset
134 # Execution of SEQuel
58e1eb37fddc Uploaded
crs4
parents:
diff changeset
135 print 'Executing SEQuel...'
58e1eb37fddc Uploaded
crs4
parents:
diff changeset
136 logfile_seq_file = open(logfile_seq, 'w') if logfile_seq else sys.stdout
58e1eb37fddc Uploaded
crs4
parents:
diff changeset
137 try:
58e1eb37fddc Uploaded
crs4
parents:
diff changeset
138 subprocess.check_call(cmd2, stdout=logfile_seq_file, stderr=subprocess.STDOUT, shell=True)
58e1eb37fddc Uploaded
crs4
parents:
diff changeset
139 except subprocess.CalledProcessError, e:
58e1eb37fddc Uploaded
crs4
parents:
diff changeset
140 if e.retcode == 24:
58e1eb37fddc Uploaded
crs4
parents:
diff changeset
141 sys.exit("Too many contigs in the assembly!")
58e1eb37fddc Uploaded
crs4
parents:
diff changeset
142 else:
58e1eb37fddc Uploaded
crs4
parents:
diff changeset
143 raise
58e1eb37fddc Uploaded
crs4
parents:
diff changeset
144 finally:
58e1eb37fddc Uploaded
crs4
parents:
diff changeset
145 if logfile_seq_file != sys.stdout:
58e1eb37fddc Uploaded
crs4
parents:
diff changeset
146 logfile_seq_file.close()
58e1eb37fddc Uploaded
crs4
parents:
diff changeset
147 print 'SEQuel executed!'
58e1eb37fddc Uploaded
crs4
parents:
diff changeset
148
58e1eb37fddc Uploaded
crs4
parents:
diff changeset
149 shutil.move(prep_file, logprep)
58e1eb37fddc Uploaded
crs4
parents:
diff changeset
150 shutil.move(os.path.join(seq_directory, 'Log'), logseq)
58e1eb37fddc Uploaded
crs4
parents:
diff changeset
151 shutil.move(os.path.join(seq_directory, 'contigs_refined.fa'), contigs_refined)
58e1eb37fddc Uploaded
crs4
parents:
diff changeset
152 finally:
58e1eb37fddc Uploaded
crs4
parents:
diff changeset
153 shutil.rmtree(prep_directory, ignore_errors=True)
58e1eb37fddc Uploaded
crs4
parents:
diff changeset
154 shutil.rmtree(seq_directory, ignore_errors=True)
58e1eb37fddc Uploaded
crs4
parents:
diff changeset
155
58e1eb37fddc Uploaded
crs4
parents:
diff changeset
156
58e1eb37fddc Uploaded
crs4
parents:
diff changeset
157 if __name__ == "__main__":
58e1eb37fddc Uploaded
crs4
parents:
diff changeset
158 __main__()