0
|
1 #!/usr/bin/env python
|
|
2 # Jeremy Liu
|
|
3 # February 2015
|
|
# Adapted from Dan Blankenberg's sample data manager
|
|
5
|
|
6 import sys
|
|
7 import os
|
|
8 import tempfile
|
|
9 import shutil
|
|
10 import optparse
|
|
11 import urllib2
|
|
12 #import uuid
|
|
13 from ftplib import FTP
|
|
14 import tarfile
|
|
15 import zipfile
|
|
16 import gzip
|
|
17 import bz2
|
|
18
|
|
19 from galaxy.util.json import from_json_string, to_json_string
|
|
20
|
|
# Number of bytes requested per read while copying a download to disk (1 MiB).
CHUNK_SIZE = 1 << 20
|
|
22
|
|
def download_motif_databases( data_manager_dict, params, target_directory, motif_db ):
    """Download the three files of one motif database and register them.

    For the selected database a BGZ file, its TBI file and a PWM (MEME text)
    file are fetched, in that order, written into ``target_directory`` and
    appended to the ``motif_databases`` data table in ``data_manager_dict``.

    ``motif_db`` selects the database: "encode", "jaspar" or "mouse".  Any
    other value (including ``None``) selects the small test set.

    ``params`` is not inspected here; it is only passed through to
    ``_stream_fasta_to_file``.
    """
    # Each source is (download URL, local file name, table value, display name).
    test_sources = (
        ( 'http://compbio.med.harvard.edu/motif-enrichment/pouya_test_motifs.bed.bgz',
          "pouya_test_motifs.bed.bgz", "test_bgz", "Test Encode Motifs (hg19) BGZ" ),
        ( 'http://compbio.med.harvard.edu/motif-enrichment/pouya_test_motifs.bed.bgz.tbi',
          "pouya_test_motifs.bed.bgz.tbi", "test_tbi", "Test Encode Motifs (hg19) TBI" ),
        ( 'http://compbio.med.harvard.edu/motif-enrichment/pwms/pouya.pwms.from.seq.meme.txt',
          "pouya.pwms.from.seq.meme.txt", "test_pwm", "Test Encode Motifs (hg19) PWM MEME" ),
    )
    sources_by_db = {
        "encode": (
            ( 'http://compbio.med.harvard.edu/motif-enrichment/pouya_motifs.bed.bgz',
              "pouya_motifs.bed.bgz", "encode_bgz", "Encode Motifs (hg19) BGZ" ),
            ( 'http://compbio.med.harvard.edu/motif-enrichment/pouya_motifs.bed.bgz.tbi',
              "pouya_motifs.bed.bgz.tbi", "encode_tbi", "Encode Motifs (hg19) TBI" ),
            ( 'http://compbio.med.harvard.edu/motif-enrichment/pwms/pouya.pwms.from.seq.meme.txt',
              "pouya.pwms.from.seq.meme.txt", "encode_pwm", "Encode Motifs (hg19) PWM MEME" ),
        ),
        "jaspar": (
            ( 'http://compbio.med.harvard.edu/motif-enrichment/jaspar_jolma_motifs.bed.bgz',
              "jaspar_jolma_motifs.bed.bgz", "jaspar_bgz", "Jaspar and Jolma Motifs (hg19) BGZ" ),
            ( 'http://compbio.med.harvard.edu/motif-enrichment/jaspar_jolma_motifs.bed.bgz.tbi',
              "jaspar_jolma_motifs.bed.bgz.tbi", "jaspar_tbi", "Jaspar and Jolma Motifs (hg19) TBI" ),
            ( 'http://compbio.med.harvard.edu/motif-enrichment/pwms/jaspar.jolma.pwms.from.seq.meme.txt',
              "jaspar.jolma.pwms.from.seq.meme.txt", "jaspar_pwm", "Jaspar and Jolma Motifs (hg19) PWM MEME" ),
        ),
        "mouse": (
            ( 'http://compbio.med.harvard.edu/motif-enrichment/mm9_motifs_split.bed.bgz',
              "mm9_motifs_split.bed.bgz", "mouse_bgz", "Mouse Motifs (mm9) BGZ" ),
            ( 'http://compbio.med.harvard.edu/motif-enrichment/mm9_motifs_split.bed.bgz.tbi',
              "mm9_motifs_split.bed.bgz.tbi", "mouse_tbi", "Mouse Motifs (mm9) TBI" ),
            ( 'http://compbio.med.harvard.edu/motif-enrichment/pwms/mm9.pwms.from.seq.meme.txt',
              "mm9.pwms.from.seq.meme.txt", "mouse_pwm", "Mouse Motifs (mm9) PWM MEME" ),
        ),
    }

    # Unrecognised selector values deliberately fall back to the test set.
    selected_sources = sources_by_db.get( motif_db, test_sources )

    # Fetch, save and register each file in turn: BGZ, then TBI, then PWM.
    for url, base_filename, value, name in selected_sources:
        reader = urllib2.urlopen( url )
        table_entry = _stream_fasta_to_file( reader, target_directory, params,
                                             base_filename, value, name )
        _add_data_table_entry( data_manager_dict, 'motif_databases', table_entry )
|
|
73
|
|
def _add_data_table_entry( data_manager_dict, data_table, data_table_entry ):
    """Append ``data_table_entry`` to the list stored for ``data_table``.

    The ``'data_tables'`` mapping and the per-table list are created inside
    ``data_manager_dict`` when they do not exist yet.  The dictionary is
    modified in place and also returned.
    """
    tables = data_manager_dict.setdefault( 'data_tables', {} )
    entries = tables.setdefault( data_table, [] )
    entries.append( data_table_entry )
    return data_manager_dict
|
|
79
|
|
def _stream_fasta_to_file( fasta_stream, target_directory, params,
                           fasta_base_filename, value, name, close_stream=True ):
    """Copy a readable binary stream into a file and describe it as a table entry.

    Despite the name, the content is copied verbatim as bytes; nothing here
    is specific to FASTA.

    Arguments:
        fasta_stream: object with ``read(size)`` (and ``close()`` when
            ``close_stream`` is true), e.g. the result of ``urlopen``.
        target_directory: existing directory the file is written into.
        params: unused; kept so existing callers keep working.
        fasta_base_filename: name of the file created in ``target_directory``.
        value, name: copied unchanged into the returned entry.
        close_stream: when true (the default) the input stream is closed
            before returning, even if the copy fails.  When false the stream
            is left open for the caller.

    Returns:
        ``dict( value=value, name=name, path=fasta_base_filename )`` -- the
        path is the base file name, not the full path.
    """
    fasta_filename = os.path.join( target_directory, fasta_base_filename )

    try:
        # The with-block guarantees the output file is flushed and closed even
        # when a read or write fails part-way through.
        with open( fasta_filename, 'wb' ) as fasta_writer:
            # Copies in CHUNK_SIZE pieces so large downloads are never held
            # in memory as a whole.
            shutil.copyfileobj( fasta_stream, fasta_writer, CHUNK_SIZE )
    finally:
        if close_stream:
            fasta_stream.close()

    return dict( value=value, name=name, path=fasta_base_filename )
|
|
96
|
|
def main():
    """Command-line entry point of the motif database data manager.

    Usage: ``<script> [-m MOTIF_DB] PARAMS_JSON``

    The positional argument is a JSON file that is read for the output
    directory (``params['output_data'][0]['extra_files_path']``) and then
    overwritten with the resulting data manager dictionary.  The selected
    database is downloaded into that directory, which is created here.

    Exits with a usage error when the positional argument is missing.
    """
    # Parse Command Line
    parser = optparse.OptionParser()
    parser.add_option( '-m', '--motif_db', dest='motif_db', action='store', type="string", default=None, help='motif_db' )
    (options, args) = parser.parse_args()

    # Report a proper usage error instead of an IndexError traceback.
    if not args:
        parser.error( 'missing path to the JSON parameter file' )
    filename = args[0]

    # Read the parameters and close the file before it is rewritten below.
    with open( filename ) as params_file:
        params = from_json_string( params_file.read() )
    target_directory = params[ 'output_data' ][0]['extra_files_path']
    # os.mkdir raises OSError if the directory already exists or its parent
    # is missing; that behaviour is intentionally left unchanged.
    os.mkdir( target_directory )
    data_manager_dict = {}

    # Fetch the Motif Database
    download_motif_databases( data_manager_dict, params, target_directory, options.motif_db )

    # Save info to the JSON file; the with-block makes sure it is flushed and
    # closed before the process exits.
    with open( filename, 'wb' ) as result_file:
        result_file.write( to_json_string( data_manager_dict ) )
|
|
115
|
|
# Run the data manager only when executed as a script, not when imported.
if __name__ == "__main__":
    main()
|