Mercurial > repos > iuc > meme_psp_gen
comparison fimo_wrapper.py @ 0:a0fa4efeeee3 draft
planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/meme commit 3f116ddc83447056068573320c148a9bfca9aa2e
author | iuc |
---|---|
date | Wed, 23 Aug 2017 20:57:34 -0400 |
parents | |
children |
comparison
equal
deleted
inserted
replaced
-1:000000000000 | 0:a0fa4efeeee3 |
---|---|
1 #!/usr/bin/env python | |
2 import argparse | |
3 import os | |
4 import shutil | |
5 import string | |
6 import subprocess | |
7 import sys | |
8 import tempfile | |
9 | |
# Chunk size (1 MiB) used when reading captured stderr back from a file.
BUFFSIZE = 1048576
# Translation table for reverse complement, with ambiguity codes.
# ``string.maketrans`` only exists on Python 2; on Python 3 the equivalent
# is ``str.maketrans``, so pick whichever one the interpreter provides.
_maketrans = getattr(str, 'maketrans', None) or string.maketrans
DNA_COMPLEMENT = _maketrans("ACGTRYKMBDHVacgtrykmbdhv", "TGCAYRMKVHDBtgcayrmkvhdb")
13 | |
14 | |
def get_stderr(tmp_stderr):
    """Return the full contents of ``tmp_stderr`` as a text string.

    The file is rewound and read in ``BUFFSIZE``-sized chunks until
    end-of-file.  Chunks that are not ``str`` (bytes read from a binary-mode
    file on Python 3) are decoded as UTF-8 with undecodable bytes replaced.

    :param tmp_stderr: seekable, readable file object holding captured
        standard error output.
    :returns: everything read from the file, or ``''`` if it is empty.
    """
    tmp_stderr.seek(0)
    chunks = []
    try:
        while True:
            chunk = tmp_stderr.read(BUFFSIZE)
            # An empty read is the only reliable end-of-file signal; testing
            # the accumulated length instead never terminates when the total
            # size is a non-zero exact multiple of BUFFSIZE.
            if not chunk:
                break
            chunks.append(chunk)
    except OverflowError:
        # Best effort: report whatever was read before the overflow.
        pass
    if not chunks:
        return ''
    # Join with an empty value of the same type as the chunks (bytes or str).
    stderr = chunks[0][:0].join(chunks)
    if not isinstance(stderr, str):
        stderr = stderr.decode('utf-8', 'replace')
    return stderr
26 | |
27 | |
def reverse(sequence):
    """Return ``sequence`` with its elements in the opposite order."""
    backwards = slice(None, None, -1)
    return sequence[backwards]
31 | |
32 | |
def dna_complement(sequence):
    """Return the complement of a DNA sequence string.

    Each character is mapped through the module-level ``DNA_COMPLEMENT``
    translation table; the order of the characters is unchanged.
    """
    complemented = sequence.translate(DNA_COMPLEMENT)
    return complemented
36 | |
37 | |
def dna_reverse_complement(sequence):
    """Return the reverse complement of a DNA sequence string."""
    # Reverse the sequence first, then complement the reversed string.
    return dna_complement(reverse(sequence))
42 | |
43 | |
def stop_err(msg):
    """Write ``msg`` to standard error and terminate with exit status 1."""
    sys.stderr.write(msg)
    raise SystemExit(1)
47 | |
48 | |
# Command line interface.  Destinations are derived by argparse from the
# option names (``--input_motifs`` -> ``args.input_motifs``); only ``--motif``
# needs an explicit one because its values are collected in ``args.motifs``.
# Options without an explicit default are ``None`` when omitted.
parser = argparse.ArgumentParser()
add_option = parser.add_argument
add_option('--input_motifs',
           help='MEME output formatted files for input to fimo')
add_option('--input_fasta',
           help='Fassta sequence file')
add_option('--options_type',
           help='Basic or Advance options')
add_option('--input_psp',
           help='File containing position specific priors')
add_option('--input_prior_dist',
           help='File containing binned distribution of priors')
add_option('--alpha', type=float, default=1.0,
           help='The alpha parameter for calculating position specific priors')
add_option('--bgfile',
           help='Background file type, used only if not "default"')
add_option('--max_strand', action='store_true',
           help='If matches on both strands at a given position satisfy the '
                'output threshold, only report the match for the strand with '
                'the higher score')
add_option('--max_stored_scores', type=int,
           help='Maximum score count to store')
add_option('--motif', dest='motifs', action='append', default=[],
           help='Specify motif by id')
add_option('--output_separate_motifs', default='no',
           help='Output one dataset per motif')
add_option('--motif_pseudo', type=float, default=0.1,
           help='Pseudocount to add to counts in motif matrix')
add_option('--no_qvalue', action='store_true',
           help='Do not compute a q-value for each p-value')
add_option('--norc', action='store_true',
           help='Do not score the reverse complement DNA strand')
add_option('--output_path',
           help='Output files directory')
add_option('--parse_genomic_coord', default='no',
           help='Check each sequence header for UCSC style genomic coordinates')
add_option('--remove_duplicate_coords', default='no',
           help='Remove duplicate entries in unique GFF coordinates')
add_option('--qv_thresh', action='store_true',
           help='Use q-values for the output threshold')
add_option('--thresh', type=float,
           help='p-value threshold')
add_option('--gff_output',
           help='Gff output file')
add_option('--html_output',
           help='HTML output file')
add_option('--interval_output',
           help='Interval output file')
add_option('--txt_output',
           help='Text output file')
add_option('--xml_output',
           help='XML output file')
args = parser.parse_args()
75 | |
# Assemble the fimo invocation as an argument vector (one list element per
# argument) so it can be executed without a shell: paths and motif ids that
# contain spaces, quotes or shell metacharacters are passed through intact.
# NOTE(review): the indentation of the original listing was lost; all tuning
# options are assumed to belong to the 'advanced' branch - confirm.
fimo_cmd_list = ['fimo']
if args.options_type == 'advanced':
    # repr() keeps the full precision of a float; a fixed '%4f' format would
    # round small values (e.g. a 1e-7 threshold) to 0.000000.
    fimo_cmd_list.extend(['--alpha', repr(args.alpha)])
    if args.bgfile is not None:
        fimo_cmd_list.extend(['--bgfile', args.bgfile])
    if args.max_strand:
        fimo_cmd_list.append('--max-strand')
    # Omit the option when it was not supplied instead of failing on None.
    if args.max_stored_scores is not None:
        fimo_cmd_list.extend(['--max-stored-scores', '%d' % args.max_stored_scores])
    for motif in args.motifs:
        fimo_cmd_list.extend(['--motif', motif])
    fimo_cmd_list.extend(['--motif-pseudo', repr(args.motif_pseudo)])
    if args.no_qvalue:
        fimo_cmd_list.append('--no-qvalue')
    if args.norc:
        fimo_cmd_list.append('--norc')
    if args.parse_genomic_coord == 'yes':
        fimo_cmd_list.append('--parse-genomic-coord')
    if args.qv_thresh:
        fimo_cmd_list.append('--qv-thresh')
    if args.thresh is not None:
        fimo_cmd_list.extend(['--thresh', repr(args.thresh)])
    if args.input_psp is not None:
        fimo_cmd_list.extend(['--psp', args.input_psp])
    if args.input_prior_dist is not None:
        fimo_cmd_list.extend(['--prior-dist', args.input_prior_dist])
fimo_cmd_list.extend(['--o', args.output_path])
fimo_cmd_list.extend(['--verbosity', '1'])
fimo_cmd_list.append(args.input_motifs)
fimo_cmd_list.append(args.input_fasta)

# Space-joined form of the command, for display only; it is not executed.
fimo_cmd = ' '.join(str(part) for part in fimo_cmd_list)

try:
    # The context manager closes (and thereby deletes) the temporary file
    # even when stop_err() exits the interpreter.
    with tempfile.NamedTemporaryFile() as tmp_stderr:
        proc = subprocess.Popen(args=fimo_cmd_list, stderr=tmp_stderr)
        returncode = proc.wait()
        if returncode != 0:
            # Report whatever fimo wrote to stderr and exit with status 1.
            stop_err(get_stderr(tmp_stderr))
except Exception as e:
    # SystemExit raised by stop_err() is not an Exception subclass, so it
    # passes through; this only catches failures to launch or monitor fimo.
    stop_err('Error running FIMO:\n%s' % e)
117 | |
def _run_sort(sort_args):
    """Run the external ``sort`` utility; exit with its stderr on failure.

    :param sort_args: list of command line arguments placed after ``sort``.
    """
    try:
        with tempfile.NamedTemporaryFile() as sort_stderr:
            # Argument list + no shell: file names are passed verbatim.
            proc = subprocess.Popen(args=['sort'] + list(sort_args), stderr=sort_stderr)
            if proc.wait() != 0:
                stop_err(get_stderr(sort_stderr))
    except OSError as e:
        stop_err('Error running sort:\n%s' % e)


def _split_gff_by_motif(gff_path, collection_dir):
    """Write one ``MOTIF<id>.gff`` file per motif id found in ``gff_path``.

    The first '#' line of the input, if any has been seen by the time a
    motif first occurs, is copied to the top of that motif's file.

    :param gff_path: path of the GFF file to split.
    :param collection_dir: directory receiving the per-motif files; it is
        created when missing.
    """
    if not os.path.isdir(collection_dir):
        os.makedirs(collection_dir)
    header_line = None
    # Maps motif id -> open output handle for that motif.
    handles = {}
    try:
        with open(gff_path, 'r') as gff_in:
            for line in gff_in:
                if line.startswith('#'):
                    if header_line is None:
                        header_line = line
                    continue
                if not line.strip():
                    continue
                items = line.split('\t')
                # Column 9 holds ';'-separated attributes; the first one is
                # expected to be of the form key=<motif id>.
                first_attribute = items[8].split(';')[0]
                motif_id = first_attribute.split('=', 1)[1]
                fh = handles.get(motif_id)
                if fh is None:
                    file_name = os.path.join(collection_dir, 'MOTIF%s.gff' % motif_id)
                    # Text mode: the lines written are str, not bytes.
                    fh = open(file_name, 'w')
                    handles[motif_id] = fh
                    if header_line is not None:
                        fh.write(header_line)
                fh.write(line)
    finally:
        for fh in handles.values():
            fh.close()


shutil.move(os.path.join(args.output_path, 'fimo.txt'), args.txt_output)

gff_file = os.path.join(args.output_path, 'fimo.gff')
if args.remove_duplicate_coords == 'yes':
    # Identify and eliminate identical motif occurrences.  These
    # are identical if the combination of chrom, start, end and
    # motif id (first six characters of column 9) are identical.
    _run_sort(['-k1,1', '-k4,4n', '-k5,5n', '-k9.1,9.6', '-u', '-o', gff_file, gff_file])
    # Sort GFF output by a combination of chrom, start, score.
    _run_sort(['-k1,1', '-k4,4n', '-k6,6n', '-o', gff_file, gff_file])
if args.output_separate_motifs == 'yes':
    # One GFF file per motif goes into the 'output' directory under the
    # current working directory; the single GFF output is not produced.
    collection_path = os.path.join(os.getcwd(), 'output')
    _split_gff_by_motif(gff_file, collection_path)
else:
    shutil.move(gff_file, args.gff_output)
shutil.move(os.path.join(args.output_path, 'fimo.xml'), args.xml_output)
shutil.move(os.path.join(args.output_path, 'fimo.html'), args.html_output)
174 | |
# Convert the FIMO text output into a tab-separated interval file.  Both
# files are opened in text mode (the rows written are str) and are closed
# by the context manager even if a row fails to parse.
with open(args.interval_output, 'w') as out_file, open(args.txt_output) as txt_file:
    out_file.write("#%s\n" % "\t".join(("chr", "start", "end", "pattern name", "score", "strand", "matched sequence", "p-value", "q-value")))
    for line in txt_file:
        if line.startswith('#'):
            continue
        line = line.rstrip("\n\r")
        if not line:
            # Tolerate blank lines instead of failing on a missing column.
            continue
        fields = line.split("\t")
        start, end = int(fields[2]), int(fields[3])
        sequence = fields[7]
        if start > end:
            # Flip start and end and set strand.
            start, end = end, start
            strand = "-"
            # We want sequences relative to strand; FIMO always provides + stranded sequence.
            sequence = dna_reverse_complement(sequence)
        else:
            strand = "+"
        # Make 0-based start position.
        start -= 1
        # Column order matches the header written above.
        out_file.write("%s\n" % "\t".join([fields[1], str(start), str(end), fields[0], fields[4], strand, sequence, fields[5], fields[6]]))