annotate sopra_wpc.py @ 2:87ffe493b6c1 draft default tip

Use GALAXY_SLOTS for multithreading in Bowtie. Create symlinks instead of copying files. Specify in help that Bowtie is used to align the reads to the contigs. Add readme.rst .
author crs4
date Mon, 03 Mar 2014 11:28:41 -0500
parents 988d5a82291a
children
Ignore whitespace changes - Everywhere: Within whitespace: At end of lines:
rev   line source
0
988d5a82291a Uploaded
crs4
parents:
diff changeset
1 # -*- coding: utf-8 -*-
988d5a82291a Uploaded
crs4
parents:
diff changeset
2 """
988d5a82291a Uploaded
crs4
parents:
diff changeset
3 SOPRA with prebuilt contigs workflow runner
988d5a82291a Uploaded
crs4
parents:
diff changeset
4 """
988d5a82291a Uploaded
crs4
parents:
diff changeset
5
988d5a82291a Uploaded
crs4
parents:
diff changeset
6 import optparse
988d5a82291a Uploaded
crs4
parents:
diff changeset
7 import os
988d5a82291a Uploaded
crs4
parents:
diff changeset
8 import tempfile
988d5a82291a Uploaded
crs4
parents:
diff changeset
9 import shutil
988d5a82291a Uploaded
crs4
parents:
diff changeset
10 import subprocess
988d5a82291a Uploaded
crs4
parents:
diff changeset
11 import sys
988d5a82291a Uploaded
crs4
parents:
diff changeset
12
988d5a82291a Uploaded
crs4
parents:
diff changeset
13
988d5a82291a Uploaded
crs4
parents:
diff changeset
14 # Copyright (c) Twisted Matrix Laboratories.
988d5a82291a Uploaded
crs4
parents:
diff changeset
15 def which(name, flags=os.X_OK):
988d5a82291a Uploaded
crs4
parents:
diff changeset
16 """ Search PATH for executable files with the given name. """
988d5a82291a Uploaded
crs4
parents:
diff changeset
17 result = []
988d5a82291a Uploaded
crs4
parents:
diff changeset
18 exts = filter(None, os.environ.get('PATHEXT', '').split(os.pathsep))
988d5a82291a Uploaded
crs4
parents:
diff changeset
19 path = os.environ.get('PATH', None)
988d5a82291a Uploaded
crs4
parents:
diff changeset
20 if path is None:
988d5a82291a Uploaded
crs4
parents:
diff changeset
21 return []
988d5a82291a Uploaded
crs4
parents:
diff changeset
22 for p in os.environ.get('PATH', '').split(os.pathsep):
988d5a82291a Uploaded
crs4
parents:
diff changeset
23 p = os.path.join(p, name)
988d5a82291a Uploaded
crs4
parents:
diff changeset
24 if os.access(p, flags):
988d5a82291a Uploaded
crs4
parents:
diff changeset
25 result.append(p)
988d5a82291a Uploaded
crs4
parents:
diff changeset
26 for e in exts:
988d5a82291a Uploaded
crs4
parents:
diff changeset
27 pext = p + e
988d5a82291a Uploaded
crs4
parents:
diff changeset
28 if os.access(pext, flags):
988d5a82291a Uploaded
crs4
parents:
diff changeset
29 result.append(pext)
988d5a82291a Uploaded
crs4
parents:
diff changeset
30 return result
988d5a82291a Uploaded
crs4
parents:
diff changeset
31
988d5a82291a Uploaded
crs4
parents:
diff changeset
32
988d5a82291a Uploaded
crs4
parents:
diff changeset
33 def __main__():
988d5a82291a Uploaded
crs4
parents:
diff changeset
34 parser = optparse.OptionParser(description='SOPRA with prebuilt contigs')
2
87ffe493b6c1 Use GALAXY_SLOTS for multithreading in Bowtie. Create symlinks instead of copying files. Specify in help that Bowtie is used to align the reads to the contigs. Add readme.rst .
crs4
parents: 0
diff changeset
35 parser.add_option('-p', dest='num_threads', type='int', help='Number of threads for Bowtie')
0
988d5a82291a Uploaded
crs4
parents:
diff changeset
36 parser.add_option('--contigs', action='append', dest='contigs', help='Contigs FASTA files, at least 1')
988d5a82291a Uploaded
crs4
parents:
diff changeset
37 parser.add_option('--mate', action='append', dest='mates', help='Paired-end Illumina libraries, at least 1 FASTA file')
988d5a82291a Uploaded
crs4
parents:
diff changeset
38 parser.add_option('-d', action='append', dest='insert_sizes', type='int', help='List of insert sizes for the corresponding mate pair libraries')
988d5a82291a Uploaded
crs4
parents:
diff changeset
39 parser.add_option('-v', dest='max_mismatches', type='int', help='Maximum number of mismatches when aligning reads on contigs with Bowtie')
988d5a82291a Uploaded
crs4
parents:
diff changeset
40 parser.add_option('-c', dest='c_option', type='int', help='If the number of times a read and its reverse complement appear in the library is equal to or more than this value, the pairing information from that read will be disregarded')
988d5a82291a Uploaded
crs4
parents:
diff changeset
41 parser.add_option('-w', dest='w_option', type='int', help='Minimum number of links between two contigs')
988d5a82291a Uploaded
crs4
parents:
diff changeset
42 parser.add_option('-L', dest='L_option', type='int', help='Minimum length of contigs to be used in scaffold assembly')
988d5a82291a Uploaded
crs4
parents:
diff changeset
43 parser.add_option('--h_option', dest='h_option', type='float', help='High coverage contigs (above mean coverage + h x std coverage) are not considered in the scaffold assembly mainly to exclude reads from repetitive regions')
988d5a82291a Uploaded
crs4
parents:
diff changeset
44 parser.add_option('--scaffolds', dest='scaffolds', help='scaffolds fasta file mandatory')
988d5a82291a Uploaded
crs4
parents:
diff changeset
45 parser.add_option('-l', '--logfile', dest='logfile', help='log file (default=stdout)')
988d5a82291a Uploaded
crs4
parents:
diff changeset
46 (options, args) = parser.parse_args()
988d5a82291a Uploaded
crs4
parents:
diff changeset
47 if len(args) > 0:
988d5a82291a Uploaded
crs4
parents:
diff changeset
48 parser.error('Wrong number of arguments')
988d5a82291a Uploaded
crs4
parents:
diff changeset
49
988d5a82291a Uploaded
crs4
parents:
diff changeset
50 contigs = options.contigs # a list of file paths
988d5a82291a Uploaded
crs4
parents:
diff changeset
51 mates = options.mates # a list of file paths
988d5a82291a Uploaded
crs4
parents:
diff changeset
52 insert_sizes = options.insert_sizes # a list of integers
988d5a82291a Uploaded
crs4
parents:
diff changeset
53 c_option = options.c_option
988d5a82291a Uploaded
crs4
parents:
diff changeset
54 w_option = options.w_option
988d5a82291a Uploaded
crs4
parents:
diff changeset
55 L_option = options.L_option
988d5a82291a Uploaded
crs4
parents:
diff changeset
56 h_option = options.h_option
988d5a82291a Uploaded
crs4
parents:
diff changeset
57 scaffolds = options.scaffolds
988d5a82291a Uploaded
crs4
parents:
diff changeset
58 logfile = options.logfile
988d5a82291a Uploaded
crs4
parents:
diff changeset
59
988d5a82291a Uploaded
crs4
parents:
diff changeset
60 s_scaf_path = which('s_scaf_v1.4.6.pl').pop()
2
87ffe493b6c1 Use GALAXY_SLOTS for multithreading in Bowtie. Create symlinks instead of copying files. Specify in help that Bowtie is used to align the reads to the contigs. Add readme.rst .
crs4
parents: 0
diff changeset
61 print 'Creating temporary directory'
0
988d5a82291a Uploaded
crs4
parents:
diff changeset
62 wd = tempfile.mkdtemp()
988d5a82291a Uploaded
crs4
parents:
diff changeset
63 try:
988d5a82291a Uploaded
crs4
parents:
diff changeset
64 fake_mates = [os.path.join(wd, os.path.basename(mate) + '.fasta') for mate in mates] # s_prep_contigAseq_v1.4.6.pl wants a mate file with extension [Ff][Aa][Ss][Tt][Aa] or [Ff][Aa]
988d5a82291a Uploaded
crs4
parents:
diff changeset
65 contigs_sopra = os.path.join(wd, 'contigs_sopra.fasta') # s_prep_contigAseq_v1.4.6.pl always writes all the prepared contigs to this file
988d5a82291a Uploaded
crs4
parents:
diff changeset
66 bowtie_build = os.path.join(wd, 'bowtie_build') # arbitrary basename for bowtie-build output files
988d5a82291a Uploaded
crs4
parents:
diff changeset
67 mate_sopras = [os.path.splitext(fake_mate)[0] + '_sopra.fasta' for fake_mate in fake_mates] # s_prep_contigAseq_v1.4.6.pl writes the prepared paired reads to these files
988d5a82291a Uploaded
crs4
parents:
diff changeset
68 mysam_mates = [mate_sopra + '.sam' for mate_sopra in mate_sopras] # arbitrary filenames for bowtie output in SAM format
988d5a82291a Uploaded
crs4
parents:
diff changeset
69 mysam_mates_parsed = [mysam_mate + '_parsed' for mysam_mate in mysam_mates] # s_parse_sam_v1.4.6.pl writes its output to these files
2
87ffe493b6c1 Use GALAXY_SLOTS for multithreading in Bowtie. Create symlinks instead of copying files. Specify in help that Bowtie is used to align the reads to the contigs. Add readme.rst .
crs4
parents: 0
diff changeset
70 orientdistinfo = os.path.join(wd, "orientdistinfo_c%d" % c_option) # s_read_parsed_sam_v1.4.6.pl writes its output to this file
0
988d5a82291a Uploaded
crs4
parents:
diff changeset
71 scaffolds_file = os.path.join(wd, "scaffolds_h%s_L%d_w%d.fasta" % (h_option, L_option, w_option)) # s_scaf_v1.4.6.pl writes its output to this file
988d5a82291a Uploaded
crs4
parents:
diff changeset
72
988d5a82291a Uploaded
crs4
parents:
diff changeset
73 for i in range(len(mates)):
2
87ffe493b6c1 Use GALAXY_SLOTS for multithreading in Bowtie. Create symlinks instead of copying files. Specify in help that Bowtie is used to align the reads to the contigs. Add readme.rst .
crs4
parents: 0
diff changeset
74 print "Creating symbolic link %s pointing to %s" % (fake_mates[i], mates[i])
87ffe493b6c1 Use GALAXY_SLOTS for multithreading in Bowtie. Create symlinks instead of copying files. Specify in help that Bowtie is used to align the reads to the contigs. Add readme.rst .
crs4
parents: 0
diff changeset
75 os.symlink(mates[i], fake_mates[i])
0
988d5a82291a Uploaded
crs4
parents:
diff changeset
76
988d5a82291a Uploaded
crs4
parents:
diff changeset
77 log = open(logfile, 'w') if logfile else sys.stdout
988d5a82291a Uploaded
crs4
parents:
diff changeset
78 try:
988d5a82291a Uploaded
crs4
parents:
diff changeset
79 cmd_step1 = "s_prep_contigAseq_v1.4.6.pl -contig %s -mate %s -a %s" % (" ".join(contigs), " ".join(fake_mates), wd)
988d5a82291a Uploaded
crs4
parents:
diff changeset
80 print "SOPRA with prebuilt contigs (preparation) command to be executed:\n %s" % cmd_step1
988d5a82291a Uploaded
crs4
parents:
diff changeset
81 subprocess.check_call(args=cmd_step1, stdout=log, shell=True)
988d5a82291a Uploaded
crs4
parents:
diff changeset
82
988d5a82291a Uploaded
crs4
parents:
diff changeset
83 cmd_step2 = "bowtie-build %s %s" % (contigs_sopra, bowtie_build)
988d5a82291a Uploaded
crs4
parents:
diff changeset
84 print "SOPRA with prebuilt contigs (Bowtie building index) command to be executed:\n %s" % cmd_step2
988d5a82291a Uploaded
crs4
parents:
diff changeset
85 subprocess.check_call(args=cmd_step2, stdout=log, shell=True)
988d5a82291a Uploaded
crs4
parents:
diff changeset
86
988d5a82291a Uploaded
crs4
parents:
diff changeset
87 for i in range(len(mate_sopras)):
2
87ffe493b6c1 Use GALAXY_SLOTS for multithreading in Bowtie. Create symlinks instead of copying files. Specify in help that Bowtie is used to align the reads to the contigs. Add readme.rst .
crs4
parents: 0
diff changeset
88 cmd_step3 = "bowtie -p %d -v %d -m 1 -f --sam %s %s %s" % (options.num_threads, options.max_mismatches, bowtie_build, mate_sopras[i], mysam_mates[i])
0
988d5a82291a Uploaded
crs4
parents:
diff changeset
89 print "SOPRA with prebuilt contigs (Bowtie alignment of library %d) command to be executed:\n %s" % (i+1, cmd_step3)
988d5a82291a Uploaded
crs4
parents:
diff changeset
90 subprocess.check_call(args=cmd_step3, stdout=log, stderr=subprocess.STDOUT, shell=True) # need to redirect stderr because bowtie writes some logging info there
988d5a82291a Uploaded
crs4
parents:
diff changeset
91
988d5a82291a Uploaded
crs4
parents:
diff changeset
92 cmd_step4 = "s_parse_sam_v1.4.6.pl -sam %s -a %s" % (' '.join(mysam_mates), wd)
988d5a82291a Uploaded
crs4
parents:
diff changeset
93 print "SOPRA with prebuilt contigs (removing reads not mapped in a proper pair) command to be executed:\n %s" % cmd_step4
988d5a82291a Uploaded
crs4
parents:
diff changeset
94 subprocess.check_call(args=cmd_step4, stdout=log, shell=True)
988d5a82291a Uploaded
crs4
parents:
diff changeset
95
988d5a82291a Uploaded
crs4
parents:
diff changeset
96 cmd_step5 = "s_read_parsed_sam_v1.4.6.pl -c %d -a %s" % (c_option, wd)
988d5a82291a Uploaded
crs4
parents:
diff changeset
97 for i in range(len(mysam_mates_parsed)):
988d5a82291a Uploaded
crs4
parents:
diff changeset
98 cmd_step5 += " -parsed %s -d %d" % (mysam_mates_parsed[i], insert_sizes[i])
988d5a82291a Uploaded
crs4
parents:
diff changeset
99 print "SOPRA with prebuilt contigs (read parsed SAM) command to be executed:\n %s" % cmd_step5
988d5a82291a Uploaded
crs4
parents:
diff changeset
100 subprocess.check_call(args=cmd_step5, stdout=log, shell=True)
988d5a82291a Uploaded
crs4
parents:
diff changeset
101
988d5a82291a Uploaded
crs4
parents:
diff changeset
102 cmd_step6 = "perl -X %s -w %d -L %d -h %s -o %s -a %s" % (s_scaf_path, w_option, L_option, h_option, orientdistinfo, wd) # need to call with perl -X because: 1) otherwise some Perl warnings are written on stderr; 2) simply redirecting stderr would hide real errors since it always returns exit status 0
988d5a82291a Uploaded
crs4
parents:
diff changeset
103 print "SOPRA with prebuilt contigs (scaffold assembly) command to be executed:\n %s" % cmd_step6
988d5a82291a Uploaded
crs4
parents:
diff changeset
104 subprocess.check_call(args=cmd_step6, stdout=log, shell=True)
988d5a82291a Uploaded
crs4
parents:
diff changeset
105 finally:
988d5a82291a Uploaded
crs4
parents:
diff changeset
106 if log != sys.stdout:
988d5a82291a Uploaded
crs4
parents:
diff changeset
107 log.close()
988d5a82291a Uploaded
crs4
parents:
diff changeset
108
2
87ffe493b6c1 Use GALAXY_SLOTS for multithreading in Bowtie. Create symlinks instead of copying files. Specify in help that Bowtie is used to align the reads to the contigs. Add readme.rst .
crs4
parents: 0
diff changeset
109 print "Moving result file %s to %s" % (scaffolds_file, scaffolds)
0
988d5a82291a Uploaded
crs4
parents:
diff changeset
110 shutil.move(scaffolds_file, scaffolds)
988d5a82291a Uploaded
crs4
parents:
diff changeset
111 finally:
988d5a82291a Uploaded
crs4
parents:
diff changeset
112 shutil.rmtree(wd)
988d5a82291a Uploaded
crs4
parents:
diff changeset
113
988d5a82291a Uploaded
crs4
parents:
diff changeset
114
988d5a82291a Uploaded
crs4
parents:
diff changeset
115 if __name__ == "__main__":
988d5a82291a Uploaded
crs4
parents:
diff changeset
116 __main__()