annotate ssake.py @ 1:386166019772 draft

Uploaded
author crs4
date Tue, 07 Jan 2014 04:48:50 -0500
parents 0ec408bcfc80
children
Ignore whitespace changes - Everywhere: Within whitespace: At end of lines:
rev   line source
0
0ec408bcfc80 Uploaded
crs4
parents:
diff changeset
1 # -*- coding: utf-8 -*-
0ec408bcfc80 Uploaded
crs4
parents:
diff changeset
2 """
0ec408bcfc80 Uploaded
crs4
parents:
diff changeset
3 SSAKE wrapper
0ec408bcfc80 Uploaded
crs4
parents:
diff changeset
4 """
0ec408bcfc80 Uploaded
crs4
parents:
diff changeset
5
0ec408bcfc80 Uploaded
crs4
parents:
diff changeset
6 import logging
0ec408bcfc80 Uploaded
crs4
parents:
diff changeset
7 import optparse
0ec408bcfc80 Uploaded
crs4
parents:
diff changeset
8 import os
0ec408bcfc80 Uploaded
crs4
parents:
diff changeset
9 import shutil
0ec408bcfc80 Uploaded
crs4
parents:
diff changeset
10 import subprocess
0ec408bcfc80 Uploaded
crs4
parents:
diff changeset
11 import tempfile
0ec408bcfc80 Uploaded
crs4
parents:
diff changeset
12
0ec408bcfc80 Uploaded
crs4
parents:
diff changeset
13 def execute(cmd):
0ec408bcfc80 Uploaded
crs4
parents:
diff changeset
14 """ """
0ec408bcfc80 Uploaded
crs4
parents:
diff changeset
15 subprocess.check_call(args=cmd, stdout=open(os.devnull, 'w'), shell=True)
0ec408bcfc80 Uploaded
crs4
parents:
diff changeset
16
0ec408bcfc80 Uploaded
crs4
parents:
diff changeset
17
0ec408bcfc80 Uploaded
crs4
parents:
diff changeset
18 def which(name, flags=os.X_OK):
0ec408bcfc80 Uploaded
crs4
parents:
diff changeset
19 """
0ec408bcfc80 Uploaded
crs4
parents:
diff changeset
20 Search PATH for executable files with the given name.
0ec408bcfc80 Uploaded
crs4
parents:
diff changeset
21 """
0ec408bcfc80 Uploaded
crs4
parents:
diff changeset
22 result = []
0ec408bcfc80 Uploaded
crs4
parents:
diff changeset
23 exts = filter(None, os.environ.get('PATHEXT', '').split(os.pathsep))
0ec408bcfc80 Uploaded
crs4
parents:
diff changeset
24 path = os.environ.get('PATH', None)
0ec408bcfc80 Uploaded
crs4
parents:
diff changeset
25 if path is None:
0ec408bcfc80 Uploaded
crs4
parents:
diff changeset
26 return []
0ec408bcfc80 Uploaded
crs4
parents:
diff changeset
27 for p in os.environ.get('PATH', '').split(os.pathsep):
0ec408bcfc80 Uploaded
crs4
parents:
diff changeset
28 p = os.path.join(p, str(name))
0ec408bcfc80 Uploaded
crs4
parents:
diff changeset
29 if os.access(p, flags):
0ec408bcfc80 Uploaded
crs4
parents:
diff changeset
30 result.append(p)
0ec408bcfc80 Uploaded
crs4
parents:
diff changeset
31 for e in exts:
0ec408bcfc80 Uploaded
crs4
parents:
diff changeset
32 pext = p + e
0ec408bcfc80 Uploaded
crs4
parents:
diff changeset
33 if os.access(pext, flags):
0ec408bcfc80 Uploaded
crs4
parents:
diff changeset
34 result.append(pext)
0ec408bcfc80 Uploaded
crs4
parents:
diff changeset
35 return result
0ec408bcfc80 Uploaded
crs4
parents:
diff changeset
36
0ec408bcfc80 Uploaded
crs4
parents:
diff changeset
37
0ec408bcfc80 Uploaded
crs4
parents:
diff changeset
38 LOG_FORMAT = '%(asctime)s|%(levelname)-8s|%(message)s'
0ec408bcfc80 Uploaded
crs4
parents:
diff changeset
39 LOG_DATEFMT = '%Y-%m-%d %H:%M:%S'
0ec408bcfc80 Uploaded
crs4
parents:
diff changeset
40 LOG_LEVELS = ['DEBUG', 'INFO', 'WARNING', 'ERROR', 'CRITICAL']
0ec408bcfc80 Uploaded
crs4
parents:
diff changeset
41
0ec408bcfc80 Uploaded
crs4
parents:
diff changeset
42
0ec408bcfc80 Uploaded
crs4
parents:
diff changeset
43 def __main__():
0ec408bcfc80 Uploaded
crs4
parents:
diff changeset
44 parser = optparse.OptionParser()
0ec408bcfc80 Uploaded
crs4
parents:
diff changeset
45 parser.add_option('--if_unpaired', dest='if_unpaired', help='Unpaired FASTA input file name')
0ec408bcfc80 Uploaded
crs4
parents:
diff changeset
46 parser.add_option('--if_paired_r1', dest='if_paired_r1', help='Paired FASTA reads 1 input file name')
0ec408bcfc80 Uploaded
crs4
parents:
diff changeset
47 parser.add_option('--if_paired_r2', dest='if_paired_r2', help='Paired FASTA reads 2 input file name')
0ec408bcfc80 Uploaded
crs4
parents:
diff changeset
48 parser.add_option('-s', dest='seeds_file', help='FASTA as seeds, input file name')
0ec408bcfc80 Uploaded
crs4
parents:
diff changeset
49 parser.add_option('-w', dest='mindepthofcoverage', type='int', help='minimum depth of coverage allowed for contigs')
0ec408bcfc80 Uploaded
crs4
parents:
diff changeset
50 parser.add_option('-m', dest='minoverlappingbases', type='int', default=20, help='Minimum number of overlapping bases with the seed/contig during overhang consensus build up (default -m 20)')
0ec408bcfc80 Uploaded
crs4
parents:
diff changeset
51 parser.add_option('-o', dest='mincall', type='int', default=2, help='mincall -o ')
0ec408bcfc80 Uploaded
crs4
parents:
diff changeset
52 parser.add_option('-r', dest='baseratio', type='float', default=0.7, help='baseratio -r')
0ec408bcfc80 Uploaded
crs4
parents:
diff changeset
53 parser.add_option('-k', dest='minnumlinks', type='int', default=4, help='Minimum number of links (read pairs) to compute scaffold -k')
0ec408bcfc80 Uploaded
crs4
parents:
diff changeset
54 parser.add_option('-e', dest='error', type='float', default=0.75, help='Error (%) allowed on mean distance -e')
0ec408bcfc80 Uploaded
crs4
parents:
diff changeset
55 parser.add_option('-a', dest='maxlinkratio', type='float', default=0.5, help='Maximum link ratio between two best contig pairs -a')
0ec408bcfc80 Uploaded
crs4
parents:
diff changeset
56 parser.add_option('-x', dest='minoverlap', type='int', default=20, help='Minimum overlap required between contigs to merge adjacent contigs in a scaffold -x')
0ec408bcfc80 Uploaded
crs4
parents:
diff changeset
57 parser.add_option('--ignore_header', dest='ignore_header', choices=['0', '1'], default='1', help='Ignore read name/header *will use less RAM if set to 1* -h')
0ec408bcfc80 Uploaded
crs4
parents:
diff changeset
58 parser.add_option('--kind_of_reads', dest='kind_of_reads', choices=['0', '1', '2'], help='Kind of reads (-p)')
0ec408bcfc80 Uploaded
crs4
parents:
diff changeset
59 parser.add_option('--iz', dest='insert_size', type='int', help='Library insert size')
0ec408bcfc80 Uploaded
crs4
parents:
diff changeset
60 parser.add_option('--prefix', dest='prefix', default='ssake_pre', help='prefix')
0ec408bcfc80 Uploaded
crs4
parents:
diff changeset
61 parser.add_option('--out1', dest='contigs', help='contig file')
0ec408bcfc80 Uploaded
crs4
parents:
diff changeset
62 parser.add_option('--out2', dest='short', help='short file')
0ec408bcfc80 Uploaded
crs4
parents:
diff changeset
63 parser.add_option('--out3', dest='singlets', help='singlets file')
0ec408bcfc80 Uploaded
crs4
parents:
diff changeset
64 parser.add_option('--loglevel', choices=LOG_LEVELS, default='INFO', help='logging level (default: INFO)')
0ec408bcfc80 Uploaded
crs4
parents:
diff changeset
65 parser.add_option('--logfile', help='log file (default=stderr)')
0ec408bcfc80 Uploaded
crs4
parents:
diff changeset
66 (options, args) = parser.parse_args()
0ec408bcfc80 Uploaded
crs4
parents:
diff changeset
67 if len(args) > 0:
0ec408bcfc80 Uploaded
crs4
parents:
diff changeset
68 parser.error('Wrong number of arguments')
0ec408bcfc80 Uploaded
crs4
parents:
diff changeset
69
0ec408bcfc80 Uploaded
crs4
parents:
diff changeset
70 log_level = getattr(logging, options.loglevel)
0ec408bcfc80 Uploaded
crs4
parents:
diff changeset
71 kwargs = {'format': LOG_FORMAT,
0ec408bcfc80 Uploaded
crs4
parents:
diff changeset
72 'datefmt': LOG_DATEFMT,
0ec408bcfc80 Uploaded
crs4
parents:
diff changeset
73 'level': log_level}
0ec408bcfc80 Uploaded
crs4
parents:
diff changeset
74 if options.logfile:
0ec408bcfc80 Uploaded
crs4
parents:
diff changeset
75 kwargs['filename'] = options.logfile
0ec408bcfc80 Uploaded
crs4
parents:
diff changeset
76 logging.basicConfig(**kwargs)
0ec408bcfc80 Uploaded
crs4
parents:
diff changeset
77 logger = logging.getLogger('SSAKE scaffold assembly')
0ec408bcfc80 Uploaded
crs4
parents:
diff changeset
78
1
386166019772 Uploaded
crs4
parents: 0
diff changeset
79 executables = ('SSAKE', 'makePairedOutput2EQUALfiles.pl', 'makePairedOutput2UNEQUALfiles.pl')
386166019772 Uploaded
crs4
parents: 0
diff changeset
80 logger.debug(which(executables[0]))
386166019772 Uploaded
crs4
parents: 0
diff changeset
81 logger.debug(which(executables[1]))
386166019772 Uploaded
crs4
parents: 0
diff changeset
82 logger.debug(which(executables[2]))
386166019772 Uploaded
crs4
parents: 0
diff changeset
83 logger.debug('Creating temp dir')
386166019772 Uploaded
crs4
parents: 0
diff changeset
84 kind_of_reads = int(options.kind_of_reads)
386166019772 Uploaded
crs4
parents: 0
diff changeset
85 if not (kind_of_reads):
386166019772 Uploaded
crs4
parents: 0
diff changeset
86 infile = options.if_unpaired
386166019772 Uploaded
crs4
parents: 0
diff changeset
87 paired = 0
386166019772 Uploaded
crs4
parents: 0
diff changeset
88 else:
386166019772 Uploaded
crs4
parents: 0
diff changeset
89 infile_r1 = options.if_paired_r1
386166019772 Uploaded
crs4
parents: 0
diff changeset
90 infile_r2 = options.if_paired_r2
386166019772 Uploaded
crs4
parents: 0
diff changeset
91 paired = 1
386166019772 Uploaded
crs4
parents: 0
diff changeset
92 insert_size = options.insert_size
386166019772 Uploaded
crs4
parents: 0
diff changeset
93 minnumlinks = options.minnumlinks
386166019772 Uploaded
crs4
parents: 0
diff changeset
94 error = options.error
386166019772 Uploaded
crs4
parents: 0
diff changeset
95 maxlinkratio = options.maxlinkratio
386166019772 Uploaded
crs4
parents: 0
diff changeset
96 minoverlap = options.minoverlap
386166019772 Uploaded
crs4
parents: 0
diff changeset
97 mindepthofcoverage = options.mindepthofcoverage
386166019772 Uploaded
crs4
parents: 0
diff changeset
98 minoverlappingbases = options.minoverlappingbases
386166019772 Uploaded
crs4
parents: 0
diff changeset
99 mincall = options.mincall
386166019772 Uploaded
crs4
parents: 0
diff changeset
100 baseratio = options.baseratio
386166019772 Uploaded
crs4
parents: 0
diff changeset
101 ignore_header = options.ignore_header
386166019772 Uploaded
crs4
parents: 0
diff changeset
102 prefix = options.prefix
386166019772 Uploaded
crs4
parents: 0
diff changeset
103 contigs = options.contigs
386166019772 Uploaded
crs4
parents: 0
diff changeset
104 short = options.short
386166019772 Uploaded
crs4
parents: 0
diff changeset
105 singlets = options.singlets
386166019772 Uploaded
crs4
parents: 0
diff changeset
106 seeds = " -s %s" % options.seeds_file if options.seeds_file else ''
386166019772 Uploaded
crs4
parents: 0
diff changeset
107 wd = tempfile.mkdtemp()
386166019772 Uploaded
crs4
parents: 0
diff changeset
108 try:
386166019772 Uploaded
crs4
parents: 0
diff changeset
109 os.chdir(wd)
386166019772 Uploaded
crs4
parents: 0
diff changeset
110 if kind_of_reads == 1:
386166019772 Uploaded
crs4
parents: 0
diff changeset
111 cmd = "%s %s %s %d" % (
386166019772 Uploaded
crs4
parents: 0
diff changeset
112 executables[1], infile_r1, infile_r2,
386166019772 Uploaded
crs4
parents: 0
diff changeset
113 insert_size)
386166019772 Uploaded
crs4
parents: 0
diff changeset
114 logger.info("Preparing data")
386166019772 Uploaded
crs4
parents: 0
diff changeset
115 execute(cmd)
386166019772 Uploaded
crs4
parents: 0
diff changeset
116 paired_file = "%s/paired.fa" % wd
386166019772 Uploaded
crs4
parents: 0
diff changeset
117 command = "%s -f %s -k %d -e %s -a %s -x %d" % (executables[0], paired_file, minnumlinks, error, maxlinkratio, minoverlap)
386166019772 Uploaded
crs4
parents: 0
diff changeset
118 elif kind_of_reads == 2:
386166019772 Uploaded
crs4
parents: 0
diff changeset
119 cmd = "%s %s %s %d" % (
386166019772 Uploaded
crs4
parents: 0
diff changeset
120 executables[2], infile_r1, infile_r2,
386166019772 Uploaded
crs4
parents: 0
diff changeset
121 insert_size)
386166019772 Uploaded
crs4
parents: 0
diff changeset
122 logger.info("Preparing data")
386166019772 Uploaded
crs4
parents: 0
diff changeset
123 execute(cmd)
386166019772 Uploaded
crs4
parents: 0
diff changeset
124 paired_file = "%s/paired.fa" % wd
386166019772 Uploaded
crs4
parents: 0
diff changeset
125 unpaired_file = "%s/unpaired.fa" % wd
386166019772 Uploaded
crs4
parents: 0
diff changeset
126 command = "%s -f %s -g %s -k %d -e %s -a %s -x %d" % (executables[0], paired_file, unpaired_file, minnumlinks, error, maxlinkratio, minoverlap)
386166019772 Uploaded
crs4
parents: 0
diff changeset
127 else:
386166019772 Uploaded
crs4
parents: 0
diff changeset
128 command = "%s -f %s" % (executables[0], infile)
386166019772 Uploaded
crs4
parents: 0
diff changeset
129 command += " %s -w %d -m %d -o %d -r %s -h %s -b %s -p %s" % (seeds, mindepthofcoverage, minoverlappingbases, mincall, baseratio, ignore_header, prefix, paired)
386166019772 Uploaded
crs4
parents: 0
diff changeset
130 logger.debug(command)
386166019772 Uploaded
crs4
parents: 0
diff changeset
131 logger.info("Executing SSAKE")
386166019772 Uploaded
crs4
parents: 0
diff changeset
132 execute(command)
386166019772 Uploaded
crs4
parents: 0
diff changeset
133
386166019772 Uploaded
crs4
parents: 0
diff changeset
134 with open("%s.log" % os.path.join(wd, prefix), 'rb') as ssake_log_file:
386166019772 Uploaded
crs4
parents: 0
diff changeset
135 logger.info("\n".join(["Log from SSAKE", ssake_log_file.read()]))
386166019772 Uploaded
crs4
parents: 0
diff changeset
136 logger.info("Moving result files")
386166019772 Uploaded
crs4
parents: 0
diff changeset
137 shutil.move("%s.contigs" % os.path.join(wd, prefix), contigs)
386166019772 Uploaded
crs4
parents: 0
diff changeset
138 shutil.move("%s.short" % os.path.join(wd, prefix), short)
386166019772 Uploaded
crs4
parents: 0
diff changeset
139 shutil.move("%s.singlets" % os.path.join(wd, prefix), singlets)
386166019772 Uploaded
crs4
parents: 0
diff changeset
140 finally:
386166019772 Uploaded
crs4
parents: 0
diff changeset
141 shutil.rmtree(wd)
386166019772 Uploaded
crs4
parents: 0
diff changeset
142
0
0ec408bcfc80 Uploaded
crs4
parents:
diff changeset
143
0ec408bcfc80 Uploaded
crs4
parents:
diff changeset
144 if __name__ == "__main__":
0ec408bcfc80 Uploaded
crs4
parents:
diff changeset
145 __main__()