annotate macs2_wrapper.py @ 0:642c0da30ca6 draft

Initial upload.
author stemcellcommons
date Thu, 17 Oct 2013 12:47:49 -0400
parents
children c05f607d116c
Ignore whitespace changes - Everywhere: Within whitespace: At end of lines:
rev   line source
0
642c0da30ca6 Initial upload.
stemcellcommons
parents:
diff changeset
1 #purpose: macs2 python wrapper
642c0da30ca6 Initial upload.
stemcellcommons
parents:
diff changeset
2 #author: Ziru Zhou
642c0da30ca6 Initial upload.
stemcellcommons
parents:
diff changeset
3 #date: November, 2012
642c0da30ca6 Initial upload.
stemcellcommons
parents:
diff changeset
4
642c0da30ca6 Initial upload.
stemcellcommons
parents:
diff changeset
5 import sys, subprocess, tempfile, shutil, glob, os, os.path, gzip
642c0da30ca6 Initial upload.
stemcellcommons
parents:
diff changeset
6 from galaxy import eggs
642c0da30ca6 Initial upload.
stemcellcommons
parents:
diff changeset
7 import pkg_resources
642c0da30ca6 Initial upload.
stemcellcommons
parents:
diff changeset
8 pkg_resources.require( "simplejson" )
642c0da30ca6 Initial upload.
stemcellcommons
parents:
diff changeset
9 import simplejson
642c0da30ca6 Initial upload.
stemcellcommons
parents:
diff changeset
10
642c0da30ca6 Initial upload.
stemcellcommons
parents:
diff changeset
11 CHUNK_SIZE = 1024
642c0da30ca6 Initial upload.
stemcellcommons
parents:
diff changeset
12
642c0da30ca6 Initial upload.
stemcellcommons
parents:
diff changeset
13 #==========================================================================================
642c0da30ca6 Initial upload.
stemcellcommons
parents:
diff changeset
14 #functions
642c0da30ca6 Initial upload.
stemcellcommons
parents:
diff changeset
15 #==========================================================================================
642c0da30ca6 Initial upload.
stemcellcommons
parents:
diff changeset
16 def gunzip_cat_glob_path( glob_path, target_filename, delete = False ):
642c0da30ca6 Initial upload.
stemcellcommons
parents:
diff changeset
17 out = open( target_filename, 'wb' )
642c0da30ca6 Initial upload.
stemcellcommons
parents:
diff changeset
18 for filename in glob.glob( glob_path ):
642c0da30ca6 Initial upload.
stemcellcommons
parents:
diff changeset
19 fh = gzip.open( filename, 'rb' )
642c0da30ca6 Initial upload.
stemcellcommons
parents:
diff changeset
20 while True:
642c0da30ca6 Initial upload.
stemcellcommons
parents:
diff changeset
21 data = fh.read( CHUNK_SIZE )
642c0da30ca6 Initial upload.
stemcellcommons
parents:
diff changeset
22 if data:
642c0da30ca6 Initial upload.
stemcellcommons
parents:
diff changeset
23 out.write( data )
642c0da30ca6 Initial upload.
stemcellcommons
parents:
diff changeset
24 else:
642c0da30ca6 Initial upload.
stemcellcommons
parents:
diff changeset
25 break
642c0da30ca6 Initial upload.
stemcellcommons
parents:
diff changeset
26 fh.close()
642c0da30ca6 Initial upload.
stemcellcommons
parents:
diff changeset
27 if delete:
642c0da30ca6 Initial upload.
stemcellcommons
parents:
diff changeset
28 os.unlink( filename )
642c0da30ca6 Initial upload.
stemcellcommons
parents:
diff changeset
29 out.close()
642c0da30ca6 Initial upload.
stemcellcommons
parents:
diff changeset
30
642c0da30ca6 Initial upload.
stemcellcommons
parents:
diff changeset
31 def xls_to_interval( xls_file, interval_file, header = None ):
642c0da30ca6 Initial upload.
stemcellcommons
parents:
diff changeset
32 out = open( interval_file, 'wb' )
642c0da30ca6 Initial upload.
stemcellcommons
parents:
diff changeset
33 if header:
642c0da30ca6 Initial upload.
stemcellcommons
parents:
diff changeset
34 out.write( '#%s\n' % header )
642c0da30ca6 Initial upload.
stemcellcommons
parents:
diff changeset
35 wrote_header = False
642c0da30ca6 Initial upload.
stemcellcommons
parents:
diff changeset
36 #From macs readme: Coordinates in XLS is 1-based which is different with BED format.
642c0da30ca6 Initial upload.
stemcellcommons
parents:
diff changeset
37 for line in open( xls_file ):
642c0da30ca6 Initial upload.
stemcellcommons
parents:
diff changeset
38 #keep all existing comment lines
642c0da30ca6 Initial upload.
stemcellcommons
parents:
diff changeset
39 if line.startswith( '#' ):
642c0da30ca6 Initial upload.
stemcellcommons
parents:
diff changeset
40 out.write( line )
642c0da30ca6 Initial upload.
stemcellcommons
parents:
diff changeset
41 #added for macs2 since there is an extra newline
642c0da30ca6 Initial upload.
stemcellcommons
parents:
diff changeset
42 elif line.startswith( '\n' ):
642c0da30ca6 Initial upload.
stemcellcommons
parents:
diff changeset
43 out.write( line )
642c0da30ca6 Initial upload.
stemcellcommons
parents:
diff changeset
44 elif not wrote_header:
642c0da30ca6 Initial upload.
stemcellcommons
parents:
diff changeset
45 out.write( '#%s' % line )
642c0da30ca6 Initial upload.
stemcellcommons
parents:
diff changeset
46 print line
642c0da30ca6 Initial upload.
stemcellcommons
parents:
diff changeset
47 wrote_header = True
642c0da30ca6 Initial upload.
stemcellcommons
parents:
diff changeset
48 else:
642c0da30ca6 Initial upload.
stemcellcommons
parents:
diff changeset
49 fields = line.split( '\t' )
642c0da30ca6 Initial upload.
stemcellcommons
parents:
diff changeset
50 if len( fields ) > 1:
642c0da30ca6 Initial upload.
stemcellcommons
parents:
diff changeset
51 fields[1] = str( int( fields[1] ) - 1 )
642c0da30ca6 Initial upload.
stemcellcommons
parents:
diff changeset
52 out.write( '\t'.join( fields ) )
642c0da30ca6 Initial upload.
stemcellcommons
parents:
diff changeset
53 out.close()
642c0da30ca6 Initial upload.
stemcellcommons
parents:
diff changeset
54
642c0da30ca6 Initial upload.
stemcellcommons
parents:
diff changeset
55 #==========================================================================================
642c0da30ca6 Initial upload.
stemcellcommons
parents:
diff changeset
56 #main
642c0da30ca6 Initial upload.
stemcellcommons
parents:
diff changeset
57 #==========================================================================================
642c0da30ca6 Initial upload.
stemcellcommons
parents:
diff changeset
58 def main():
642c0da30ca6 Initial upload.
stemcellcommons
parents:
diff changeset
59 #take in options file and output file names
642c0da30ca6 Initial upload.
stemcellcommons
parents:
diff changeset
60 options = simplejson.load( open( sys.argv[1] ) )
642c0da30ca6 Initial upload.
stemcellcommons
parents:
diff changeset
61 outputs = simplejson.load( open( sys.argv[2] ) )
642c0da30ca6 Initial upload.
stemcellcommons
parents:
diff changeset
62
642c0da30ca6 Initial upload.
stemcellcommons
parents:
diff changeset
63 #=================================================================================
642c0da30ca6 Initial upload.
stemcellcommons
parents:
diff changeset
64 #parse options and execute macs2
642c0da30ca6 Initial upload.
stemcellcommons
parents:
diff changeset
65 #=================================================================================
642c0da30ca6 Initial upload.
stemcellcommons
parents:
diff changeset
66 #default inputs that are in every major command
642c0da30ca6 Initial upload.
stemcellcommons
parents:
diff changeset
67 experiment_name = '_'.join( options['experiment_name'].split() ) #save experiment name here, it will be used by macs for some file names
642c0da30ca6 Initial upload.
stemcellcommons
parents:
diff changeset
68 cmdline = "macs2 %s -t %s" % ( options['command'], ",".join( options['input_chipseq'] ) )
642c0da30ca6 Initial upload.
stemcellcommons
parents:
diff changeset
69 if options['input_control']:
642c0da30ca6 Initial upload.
stemcellcommons
parents:
diff changeset
70 cmdline = "%s -c %s" % ( cmdline, ",".join( options['input_control'] ) )
642c0da30ca6 Initial upload.
stemcellcommons
parents:
diff changeset
71
642c0da30ca6 Initial upload.
stemcellcommons
parents:
diff changeset
72 #=================================================================================
642c0da30ca6 Initial upload.
stemcellcommons
parents:
diff changeset
73 if (options['command'] == "callpeak"):
642c0da30ca6 Initial upload.
stemcellcommons
parents:
diff changeset
74 output_bed = outputs['output_bed_file']
642c0da30ca6 Initial upload.
stemcellcommons
parents:
diff changeset
75 output_extra_html = outputs['output_extra_file']
642c0da30ca6 Initial upload.
stemcellcommons
parents:
diff changeset
76 output_extra_path = outputs['output_extra_file_path']
642c0da30ca6 Initial upload.
stemcellcommons
parents:
diff changeset
77 output_peaks = outputs['output_peaks_file']
642c0da30ca6 Initial upload.
stemcellcommons
parents:
diff changeset
78 output_narrowpeaks = outputs['output_narrowpeaks_file']
642c0da30ca6 Initial upload.
stemcellcommons
parents:
diff changeset
79 output_xls_to_interval_peaks_file = outputs['output_xls_to_interval_peaks_file']
642c0da30ca6 Initial upload.
stemcellcommons
parents:
diff changeset
80 output_xls_to_interval_negative_peaks_file = outputs['output_xls_to_interval_negative_peaks_file']
642c0da30ca6 Initial upload.
stemcellcommons
parents:
diff changeset
81
642c0da30ca6 Initial upload.
stemcellcommons
parents:
diff changeset
82 if 'pvalue' in options:
642c0da30ca6 Initial upload.
stemcellcommons
parents:
diff changeset
83 cmdline = "%s --format='%s' --name='%s' --gsize='%s' --bw='%s' --pvalue='%s' --mfold %s %s %s %s" % ( cmdline, options['format'], experiment_name, options['gsize'], options['bw'], options['pvalue'], options['mfoldlo'], options['mfoldhi'], options['nolambda'], options['bdg'] )
642c0da30ca6 Initial upload.
stemcellcommons
parents:
diff changeset
84 elif 'qvalue' in options:
642c0da30ca6 Initial upload.
stemcellcommons
parents:
diff changeset
85 cmdline = "%s --format='%s' --name='%s' --gsize='%s' --bw='%s' --qvalue='%s' --mfold %s %s %s %s" % ( cmdline, options['format'], experiment_name, options['gsize'], options['bw'], options['qvalue'], options['mfoldlo'], options['mfoldhi'], options['nolambda'], options['bdg'] )
642c0da30ca6 Initial upload.
stemcellcommons
parents:
diff changeset
86
642c0da30ca6 Initial upload.
stemcellcommons
parents:
diff changeset
87 if 'nomodel' in options:
642c0da30ca6 Initial upload.
stemcellcommons
parents:
diff changeset
88 cmdline = "%s --nomodel --shiftsize='%s'" % ( cmdline, options['nomodel'] )
642c0da30ca6 Initial upload.
stemcellcommons
parents:
diff changeset
89 #=================================================================================
642c0da30ca6 Initial upload.
stemcellcommons
parents:
diff changeset
90 if (options['command'] == "bdgcmp"):
642c0da30ca6 Initial upload.
stemcellcommons
parents:
diff changeset
91 output_bdgcmp = outputs['output_bdgcmp_file']
642c0da30ca6 Initial upload.
stemcellcommons
parents:
diff changeset
92
642c0da30ca6 Initial upload.
stemcellcommons
parents:
diff changeset
93 cmdline = "%s -m %s -p %s -o bdgcmp_out.bdg" % ( cmdline, options['m'], options['pseudocount'] )
642c0da30ca6 Initial upload.
stemcellcommons
parents:
diff changeset
94 #=================================================================================
642c0da30ca6 Initial upload.
stemcellcommons
parents:
diff changeset
95
642c0da30ca6 Initial upload.
stemcellcommons
parents:
diff changeset
96 tmp_dir = tempfile.mkdtemp() #macs makes very messy output, need to contain it into a temp dir, then provide to user
642c0da30ca6 Initial upload.
stemcellcommons
parents:
diff changeset
97 stderr_name = tempfile.NamedTemporaryFile().name # redirect stderr here, macs provides lots of info via stderr, make it into a report
642c0da30ca6 Initial upload.
stemcellcommons
parents:
diff changeset
98 proc = subprocess.Popen( args=cmdline, shell=True, cwd=tmp_dir, stderr=open( stderr_name, 'wb' ) )
642c0da30ca6 Initial upload.
stemcellcommons
parents:
diff changeset
99 proc.wait()
642c0da30ca6 Initial upload.
stemcellcommons
parents:
diff changeset
100 #We don't want to set tool run to error state if only warnings or info, e.g. mfold could be decreased to improve model, but let user view macs log
642c0da30ca6 Initial upload.
stemcellcommons
parents:
diff changeset
101 #Do not terminate if error code, allow dataset (e.g. log) creation and cleanup
642c0da30ca6 Initial upload.
stemcellcommons
parents:
diff changeset
102 if proc.returncode:
642c0da30ca6 Initial upload.
stemcellcommons
parents:
diff changeset
103 stderr_f = open( stderr_name )
642c0da30ca6 Initial upload.
stemcellcommons
parents:
diff changeset
104 while True:
642c0da30ca6 Initial upload.
stemcellcommons
parents:
diff changeset
105 chunk = stderr_f.read( CHUNK_SIZE )
642c0da30ca6 Initial upload.
stemcellcommons
parents:
diff changeset
106 if not chunk:
642c0da30ca6 Initial upload.
stemcellcommons
parents:
diff changeset
107 stderr_f.close()
642c0da30ca6 Initial upload.
stemcellcommons
parents:
diff changeset
108 break
642c0da30ca6 Initial upload.
stemcellcommons
parents:
diff changeset
109 sys.stderr.write( chunk )
642c0da30ca6 Initial upload.
stemcellcommons
parents:
diff changeset
110
642c0da30ca6 Initial upload.
stemcellcommons
parents:
diff changeset
111 #=================================================================================
642c0da30ca6 Initial upload.
stemcellcommons
parents:
diff changeset
112 #copy files created by macs2 to appripriate directory with the provided names
642c0da30ca6 Initial upload.
stemcellcommons
parents:
diff changeset
113 #=================================================================================
642c0da30ca6 Initial upload.
stemcellcommons
parents:
diff changeset
114
642c0da30ca6 Initial upload.
stemcellcommons
parents:
diff changeset
115 #=================================================================================
642c0da30ca6 Initial upload.
stemcellcommons
parents:
diff changeset
116 #move files generated by callpeak command
642c0da30ca6 Initial upload.
stemcellcommons
parents:
diff changeset
117 if (options['command'] == "callpeak"):
642c0da30ca6 Initial upload.
stemcellcommons
parents:
diff changeset
118 #run R to create pdf from model script
642c0da30ca6 Initial upload.
stemcellcommons
parents:
diff changeset
119 if os.path.exists( os.path.join( tmp_dir, "%s_model.r" % experiment_name ) ):
642c0da30ca6 Initial upload.
stemcellcommons
parents:
diff changeset
120 cmdline = 'R --vanilla --slave < "%s_model.r" > "%s_model.r.log"' % ( experiment_name, experiment_name )
642c0da30ca6 Initial upload.
stemcellcommons
parents:
diff changeset
121 proc = subprocess.Popen( args=cmdline, shell=True, cwd=tmp_dir )
642c0da30ca6 Initial upload.
stemcellcommons
parents:
diff changeset
122 proc.wait()
642c0da30ca6 Initial upload.
stemcellcommons
parents:
diff changeset
123
642c0da30ca6 Initial upload.
stemcellcommons
parents:
diff changeset
124 #move bed out to proper output file
642c0da30ca6 Initial upload.
stemcellcommons
parents:
diff changeset
125 created_bed_name = os.path.join( tmp_dir, "%s_peaks.bed" % experiment_name )
642c0da30ca6 Initial upload.
stemcellcommons
parents:
diff changeset
126 if os.path.exists( created_bed_name ):
642c0da30ca6 Initial upload.
stemcellcommons
parents:
diff changeset
127 shutil.move( created_bed_name, output_bed )
642c0da30ca6 Initial upload.
stemcellcommons
parents:
diff changeset
128
642c0da30ca6 Initial upload.
stemcellcommons
parents:
diff changeset
129 #OICR peak_xls file
642c0da30ca6 Initial upload.
stemcellcommons
parents:
diff changeset
130 created_peak_xls_file = os.path.join( tmp_dir, "%s_peaks.xls" % experiment_name )
642c0da30ca6 Initial upload.
stemcellcommons
parents:
diff changeset
131 if os.path.exists( created_peak_xls_file ):
642c0da30ca6 Initial upload.
stemcellcommons
parents:
diff changeset
132 # shutil.copy( created_peak_xls_file, os.path.join ( "/mnt/galaxyData/tmp/", "%s_peaks.xls" % ( os.path.basename(output_extra_path) )))
642c0da30ca6 Initial upload.
stemcellcommons
parents:
diff changeset
133 shutil.copyfile( created_peak_xls_file, output_peaks )
642c0da30ca6 Initial upload.
stemcellcommons
parents:
diff changeset
134
642c0da30ca6 Initial upload.
stemcellcommons
parents:
diff changeset
135 #peaks.encodepeaks (narrowpeaks) file
642c0da30ca6 Initial upload.
stemcellcommons
parents:
diff changeset
136 created_narrowpeak_file = os.path.join (tmp_dir, "%s_peaks.encodePeak" % experiment_name )
642c0da30ca6 Initial upload.
stemcellcommons
parents:
diff changeset
137 if os.path.exists( created_narrowpeak_file ):
642c0da30ca6 Initial upload.
stemcellcommons
parents:
diff changeset
138 shutil.move (created_narrowpeak_file, output_narrowpeaks )
642c0da30ca6 Initial upload.
stemcellcommons
parents:
diff changeset
139
642c0da30ca6 Initial upload.
stemcellcommons
parents:
diff changeset
140 #parse xls files to interval files as needed
642c0da30ca6 Initial upload.
stemcellcommons
parents:
diff changeset
141 #if 'xls_to_interval' in options:
642c0da30ca6 Initial upload.
stemcellcommons
parents:
diff changeset
142 if (options['xls_to_interval'] == "True"):
642c0da30ca6 Initial upload.
stemcellcommons
parents:
diff changeset
143 create_peak_xls_file = os.path.join( tmp_dir, '%s_peaks.xls' % experiment_name )
642c0da30ca6 Initial upload.
stemcellcommons
parents:
diff changeset
144 if os.path.exists( create_peak_xls_file ):
642c0da30ca6 Initial upload.
stemcellcommons
parents:
diff changeset
145 xls_to_interval( create_peak_xls_file, output_xls_to_interval_peaks_file, header = 'peaks file' )
642c0da30ca6 Initial upload.
stemcellcommons
parents:
diff changeset
146 create_peak_xls_file = os.path.join( tmp_dir, '%s_negative_peaks.xls' % experiment_name )
642c0da30ca6 Initial upload.
stemcellcommons
parents:
diff changeset
147 if os.path.exists( create_peak_xls_file ):
642c0da30ca6 Initial upload.
stemcellcommons
parents:
diff changeset
148 print "negative file exists"
642c0da30ca6 Initial upload.
stemcellcommons
parents:
diff changeset
149 xls_to_interval( create_peak_xls_file, output_xls_to_interval_negative_peaks_file, header = 'negative peaks file' )
642c0da30ca6 Initial upload.
stemcellcommons
parents:
diff changeset
150
642c0da30ca6 Initial upload.
stemcellcommons
parents:
diff changeset
151 #move all remaining files to extra files path of html file output to allow user download
642c0da30ca6 Initial upload.
stemcellcommons
parents:
diff changeset
152 out_html = open( output_extra_html, 'wb' )
642c0da30ca6 Initial upload.
stemcellcommons
parents:
diff changeset
153 out_html.write( '<html><head><title>Additional output created by MACS (%s)</title></head><body><h3>Additional Files:</h3><p><ul>\n' % experiment_name )
642c0da30ca6 Initial upload.
stemcellcommons
parents:
diff changeset
154 os.mkdir( output_extra_path )
642c0da30ca6 Initial upload.
stemcellcommons
parents:
diff changeset
155 for filename in sorted( os.listdir( tmp_dir ) ):
642c0da30ca6 Initial upload.
stemcellcommons
parents:
diff changeset
156 shutil.move( os.path.join( tmp_dir, filename ), os.path.join( output_extra_path, filename ) )
642c0da30ca6 Initial upload.
stemcellcommons
parents:
diff changeset
157 out_html.write( '<li><a href="%s">%s</a></li>\n' % ( filename, filename ) )
642c0da30ca6 Initial upload.
stemcellcommons
parents:
diff changeset
158 #out_html.write( '<li><a href="%s">%s</a>peakxls %s SomethingDifferent tmp_dir %s path %s exp_name %s</li>\n' % ( created_peak_xls_file, filename, filename, tmp_dir, output_extra_path, experiment_name ) )
642c0da30ca6 Initial upload.
stemcellcommons
parents:
diff changeset
159 out_html.write( '</ul></p>\n' )
642c0da30ca6 Initial upload.
stemcellcommons
parents:
diff changeset
160 out_html.write( '<h3>Messages from MACS:</h3>\n<p><pre>%s</pre></p>\n' % open( stderr_name, 'rb' ).read() )
642c0da30ca6 Initial upload.
stemcellcommons
parents:
diff changeset
161 out_html.write( '</body></html>\n' )
642c0da30ca6 Initial upload.
stemcellcommons
parents:
diff changeset
162 out_html.close()
642c0da30ca6 Initial upload.
stemcellcommons
parents:
diff changeset
163
642c0da30ca6 Initial upload.
stemcellcommons
parents:
diff changeset
164 #=================================================================================
642c0da30ca6 Initial upload.
stemcellcommons
parents:
diff changeset
165 #move files generated by bdgcmp command
642c0da30ca6 Initial upload.
stemcellcommons
parents:
diff changeset
166 if (options['command'] == "bdgcmp"):
642c0da30ca6 Initial upload.
stemcellcommons
parents:
diff changeset
167 created_bdgcmp_file = os.path.join (tmp_dir, "bdgcmp_out.bdg" )
642c0da30ca6 Initial upload.
stemcellcommons
parents:
diff changeset
168 if os.path.exists( created_bdgcmp_file ):
642c0da30ca6 Initial upload.
stemcellcommons
parents:
diff changeset
169 shutil.move (created_bdgcmp_file, output_bdgcmp )
642c0da30ca6 Initial upload.
stemcellcommons
parents:
diff changeset
170
642c0da30ca6 Initial upload.
stemcellcommons
parents:
diff changeset
171 #=================================================================================
642c0da30ca6 Initial upload.
stemcellcommons
parents:
diff changeset
172 #cleanup
642c0da30ca6 Initial upload.
stemcellcommons
parents:
diff changeset
173 #=================================================================================
642c0da30ca6 Initial upload.
stemcellcommons
parents:
diff changeset
174 os.unlink( stderr_name )
642c0da30ca6 Initial upload.
stemcellcommons
parents:
diff changeset
175 os.rmdir( tmp_dir )
642c0da30ca6 Initial upload.
stemcellcommons
parents:
diff changeset
176
642c0da30ca6 Initial upload.
stemcellcommons
parents:
diff changeset
177 if __name__ == "__main__": main()