Mercurial > repos > iuc > meme_fimo
comparison fimo_wrapper.py @ 5:eca84de658b0 draft
Uploaded
| author | iuc |
|---|---|
| date | Fri, 17 Jun 2016 13:15:48 -0400 |
| parents | fd522a964017 |
| children | cdcc4bb60bc3 |
comparison
equal
deleted
inserted
replaced
| 4:cd54079f0f72 | 5:eca84de658b0 |
|---|---|
| 8 import tempfile | 8 import tempfile |
| 9 | 9 |
| 10 BUFFSIZE = 1048576 | 10 BUFFSIZE = 1048576 |
| 11 # Translation table for reverse Complement, with ambiguity codes. | 11 # Translation table for reverse Complement, with ambiguity codes. |
| 12 DNA_COMPLEMENT = string.maketrans("ACGTRYKMBDHVacgtrykmbdhv", "TGCAYRMKVHDBtgcayrmkvhdb") | 12 DNA_COMPLEMENT = string.maketrans("ACGTRYKMBDHVacgtrykmbdhv", "TGCAYRMKVHDBtgcayrmkvhdb") |
| 13 | |
| 14 | |
| 15 def get_stderr(tmp_stderr): | |
| 16 tmp_stderr.seek(0) | |
| 17 stderr = '' | |
| 18 try: | |
| 19 while True: | |
| 20 stderr += tmp_stderr.read(BUFFSIZE) | |
| 21 if not stderr or len(stderr) % BUFFSIZE != 0: | |
| 22 break | |
| 23 except OverflowError: | |
| 24 pass | |
| 25 return stderr | |
| 13 | 26 |
| 14 | 27 |
| 15 def reverse(sequence): | 28 def reverse(sequence): |
| 16 # Reverse sequence string. | 29 # Reverse sequence string. |
| 17 return sequence[::-1] | 30 return sequence[::-1] |
| 41 parser.add_argument('--alpha', dest='alpha', type=float, default=1.0, help='The alpha parameter for calculating position specific priors') | 54 parser.add_argument('--alpha', dest='alpha', type=float, default=1.0, help='The alpha parameter for calculating position specific priors') |
| 42 parser.add_argument('--bgfile', dest='bgfile', default=None, help='Background file type, used only if not "default"') | 55 parser.add_argument('--bgfile', dest='bgfile', default=None, help='Background file type, used only if not "default"') |
| 43 parser.add_argument('--max_strand', action='store_true', help='If matches on both strands at a given position satisfy the output threshold, only report the match for the strand with the higher score') | 56 parser.add_argument('--max_strand', action='store_true', help='If matches on both strands at a given position satisfy the output threshold, only report the match for the strand with the higher score') |
| 44 parser.add_argument('--max_stored_scores', dest='max_stored_scores', type=int, help='Maximum score count to store') | 57 parser.add_argument('--max_stored_scores', dest='max_stored_scores', type=int, help='Maximum score count to store') |
| 45 parser.add_argument('--motif', dest='motifs', action='append', default=[], help='Specify motif by id') | 58 parser.add_argument('--motif', dest='motifs', action='append', default=[], help='Specify motif by id') |
| 59 parser.add_argument('--output_separate_motifs', default="no", help='Output one dataset per motif') | |
| 46 parser.add_argument('--motif_pseudo', dest='motif_pseudo', type=float, default=0.1, help='Pseudocount to add to counts in motif matrix') | 60 parser.add_argument('--motif_pseudo', dest='motif_pseudo', type=float, default=0.1, help='Pseudocount to add to counts in motif matrix') |
| 47 parser.add_argument('--no_qvalue', action='store_true', help='Do not compute a q-value for each p-value') | 61 parser.add_argument('--no_qvalue', action='store_true', help='Do not compute a q-value for each p-value') |
| 48 parser.add_argument('--norc', action='store_true', help='Do not score the reverse complement DNA strand') | 62 parser.add_argument('--norc', action='store_true', help='Do not score the reverse complement DNA strand') |
| 49 parser.add_argument('--output_path', dest='output_path', help='Output files directory') | 63 parser.add_argument('--output_path', dest='output_path', help='Output files directory') |
| 50 parser.add_argument('--parse_genomic_coord', action='store_true', help='Check each sequence header for UCSC style genomic coordinates') | 64 parser.add_argument('--parse_genomic_coord', default='no', help='Check each sequence header for UCSC style genomic coordinates') |
| 65 parser.add_argument('--remove_duplicate_coords', default='no', help='Remove duplicate entries in unique GFF coordinates') | |
| 51 parser.add_argument('--qv_thresh', action='store_true', help='Use q-values for the output threshold') | 66 parser.add_argument('--qv_thresh', action='store_true', help='Use q-values for the output threshold') |
| 52 parser.add_argument('--thresh', dest='thresh', type=float, help='p-value threshold') | 67 parser.add_argument('--thresh', dest='thresh', type=float, help='p-value threshold') |
| 53 parser.add_argument('--gff_output', dest='gff_output', help='Gff output file') | 68 parser.add_argument('--gff_output', dest='gff_output', help='Gff output file') |
| 54 parser.add_argument('--html_output', dest='html_output', help='HTML output file') | 69 parser.add_argument('--html_output', dest='html_output', help='HTML output file') |
| 55 parser.add_argument('--interval_output', dest='interval_output', help='Interval output file') | 70 parser.add_argument('--interval_output', dest='interval_output', help='Interval output file') |
| 71 fimo_cmd_list.append('--motif-pseudo %4f' % args.motif_pseudo) | 86 fimo_cmd_list.append('--motif-pseudo %4f' % args.motif_pseudo) |
| 72 if args.no_qvalue: | 87 if args.no_qvalue: |
| 73 fimo_cmd_list.append('--no-qvalue') | 88 fimo_cmd_list.append('--no-qvalue') |
| 74 if args.norc: | 89 if args.norc: |
| 75 fimo_cmd_list.append('--norc') | 90 fimo_cmd_list.append('--norc') |
| 76 if args.parse_genomic_coord: | 91 if args.parse_genomic_coord == 'yes': |
| 77 fimo_cmd_list.append('--parse-genomic-coord') | 92 fimo_cmd_list.append('--parse-genomic-coord') |
| 78 if args.qv_thresh: | 93 if args.qv_thresh: |
| 79 fimo_cmd_list.append('--qv-thresh') | 94 fimo_cmd_list.append('--qv-thresh') |
| 80 fimo_cmd_list.append('--thresh %4f' % args.thresh) | 95 fimo_cmd_list.append('--thresh %4f' % args.thresh) |
| 81 if args.input_psp is not None: | 96 if args.input_psp is not None: |
| 91 | 106 |
| 92 try: | 107 try: |
| 93 tmp_stderr = tempfile.NamedTemporaryFile() | 108 tmp_stderr = tempfile.NamedTemporaryFile() |
| 94 proc = subprocess.Popen(args=fimo_cmd, shell=True, stderr=tmp_stderr) | 109 proc = subprocess.Popen(args=fimo_cmd, shell=True, stderr=tmp_stderr) |
| 95 returncode = proc.wait() | 110 returncode = proc.wait() |
| 96 tmp_stderr.seek(0) | |
| 97 stderr = '' | |
| 98 try: | |
| 99 while True: | |
| 100 stderr += tmp_stderr.read(BUFFSIZE) | |
| 101 if not stderr or len(stderr) % BUFFSIZE != 0: | |
| 102 break | |
| 103 except OverflowError: | |
| 104 pass | |
| 105 if returncode != 0: | 111 if returncode != 0: |
| 112 stderr = get_stderr(tmp_stderr) | |
| 106 stop_err(stderr) | 113 stop_err(stderr) |
| 107 except Exception, e: | 114 except Exception, e: |
| 108 stop_err('Error running FIMO:\n%s' % str(e)) | 115 stop_err('Error running FIMO:\n%s' % str(e)) |
| 109 | 116 |
| 110 shutil.move(os.path.join(args.output_path, 'fimo.txt'), args.txt_output) | 117 shutil.move(os.path.join(args.output_path, 'fimo.txt'), args.txt_output) |
| 111 shutil.move(os.path.join(args.output_path, 'fimo.gff'), args.gff_output) | 118 |
| 119 gff_file = os.path.join(args.output_path, 'fimo.gff') | |
| 120 if args.remove_duplicate_coords == 'yes': | |
| 121 tmp_stderr = tempfile.NamedTemporaryFile() | |
| 122 # Identify and eliminating identical motif occurrences. These | |
| 123 # are identical if the combination of chrom, start, end and | |
| 124 # motif id are identical. | |
| 125 cmd = 'sort -k1,1 -k4,4n -k5,5n -k9.1,9.6 -u -o %s %s' % (gff_file, gff_file) | |
| 126 proc = subprocess.Popen(args=cmd, stderr=tmp_stderr, shell=True) | |
| 127 returncode = proc.wait() | |
| 128 if returncode != 0: | |
| 129 stderr = get_stderr(tmp_stderr) | |
| 130 stop_err(stderr) | |
| 131 # Sort GFF output by a combination of chrom, score, start. | |
| 132 cmd = 'sort -k1,1 -k4,4n -k6,6n -o %s %s' % (gff_file, gff_file) | |
| 133 proc = subprocess.Popen(args=cmd, stderr=tmp_stderr, shell=True) | |
| 134 returncode = proc.wait() | |
| 135 if returncode != 0: | |
| 136 stderr = get_stderr(tmp_stderr) | |
| 137 stop_err(stderr) | |
| 138 if args.output_separate_motifs == 'yes': | |
| 139 # Create the collection output directory. | |
| 140 collection_path = (os.path.join(os.getcwd(), 'output')) | |
| 141 # Keep track of motif occurrences. | |
| 142 header_line = None | |
| 143 motif_ids = [] | |
| 144 file_handles = [] | |
| 145 for line in open(gff_file, 'r'): | |
| 146 if line.startswith('#'): | |
| 147 if header_line is None: | |
| 148 header_line = line | |
| 149 continue | |
| 150 items = line.split('\t') | |
| 151 attribute = items[8] | |
| 152 attributes = attribute.split(';') | |
| 153 name = attributes[0] | |
| 154 motif_id = name.split('=')[1] | |
| 155 file_name = os.path.join(collection_path, 'MOTIF%s.gff' % motif_id) | |
| 156 if motif_id in motif_ids: | |
| 157 i = motif_ids.index(motif_id) | |
| 158 fh = file_handles[i] | |
| 159 fh.write(line) | |
| 160 else: | |
| 161 fh = open(file_name, 'wb') | |
| 162 if header_line is not None: | |
| 163 fh.write(header_line) | |
| 164 fh.write(line) | |
| 165 motif_ids.append(motif_id) | |
| 166 file_handles.append(fh) | |
| 167 for file_handle in file_handles: | |
| 168 file_handle.close() | |
| 169 else: | |
| 170 shutil.move(gff_file, args.gff_output) | |
| 112 shutil.move(os.path.join(args.output_path, 'fimo.xml'), args.xml_output) | 171 shutil.move(os.path.join(args.output_path, 'fimo.xml'), args.xml_output) |
| 113 shutil.move(os.path.join(args.output_path, 'fimo.html'), args.html_output) | 172 shutil.move(os.path.join(args.output_path, 'fimo.html'), args.html_output) |
| 114 | 173 |
| 115 out_file = open(args.interval_output, 'wb') | 174 out_file = open(args.interval_output, 'wb') |
| 116 out_file.write("#%s\n" % "\t".join(("chr", "start", "end", "pattern name", "score", "strand", "matched sequence", "p-value", "q-value"))) | 175 out_file.write("#%s\n" % "\t".join(("chr", "start", "end", "pattern name", "score", "strand", "matched sequence", "p-value", "q-value"))) |
