Mercurial > repos > bgruening > trna_prediction
comparison aragorn_out_to_gff3.py @ 1:d788d1abe238 draft
Uploaded
author | bgruening |
---|---|
date | Thu, 22 Jan 2015 13:15:51 -0500 |
parents | |
children | 358f58401cd6 |
comparison
equal
deleted
inserted
replaced
0:d34f31cbc9dd | 1:d788d1abe238 |
---|---|
1 #!/usr/bin/env python | |
2 import re | |
3 | |
4 def start_pattern(string): | |
5 return re.match(r'^[0-9]+\.$', string) \ | |
6 or string.startswith('Number of possible') \ | |
7 or string.startswith('Searching for') | |
8 | |
9 def blank_line(string): | |
10 return re.match(r'^\s*$', string) | |
11 | |
12 def blocks(iterable): | |
13 accumulator = [] | |
14 run_of_blanklines = 0 | |
15 for line in iterable: | |
16 # Count blank lines | |
17 if blank_line(line): | |
18 run_of_blanklines += 1 | |
19 else: | |
20 run_of_blanklines = 0 | |
21 | |
22 if start_pattern(line) or run_of_blanklines > 2 or 'Mean G+C' in line: | |
23 if accumulator: | |
24 yield accumulator | |
25 accumulator = [line] | |
26 else: | |
27 accumulator.append(line) | |
28 if accumulator: | |
29 yield accumulator | |
30 | |
31 IMPORTANT_INFO = { | |
32 'trna': re.compile(r'tRNA-(?P<codon>[A-Za-z]{3})\((?P<anticodon>[A-Za-z]{3})\)'), | |
33 'trna-alt': re.compile(r'tRNA-\?\((?P<codon>[^\)]+)\)\((?P<anticodon>[A-Za-z]{2,})\)'), | |
34 'bases': re.compile(r'(?P<bases>[0-9]+) bases, %GC = (?P<gc>[0-9.]+)'), | |
35 'sequence': re.compile(r'Sequence (?P<complement>[c]{0,1})\[(?P<start>\d+),(?P<end>\d+)\]'), | |
36 'possible_pseudogene': re.compile(r'(?P<pseudo>Possible Pseudogene)'), | |
37 } | |
38 INFO_GROUPS = ('codon', 'anticodon', 'bases', 'gc', 'complement', 'start', 'end', 'pseudo') | |
39 | |
40 def important_info(block): | |
41 info = {} | |
42 for line in block: | |
43 for matcher in IMPORTANT_INFO: | |
44 matches = IMPORTANT_INFO[matcher].search(line) | |
45 if matches: | |
46 for group in INFO_GROUPS: | |
47 try: | |
48 info[group] = matches.group(group) | |
49 except: | |
50 pass | |
51 return info | |
52 | |
53 IMPORTANT_INFO_TMRNA = { | |
54 'tag_peptide': re.compile(r'Tag peptide:\s+(?P<pep>[A-Z*]*)'), | |
55 'location': re.compile(r'Location (?P<complement>[c]{0,1})\[(?P<start>\d+),(?P<end>\d+)\]'), | |
56 } | |
57 INFO_GROUPS_TMRNA = ('start', 'end', 'pep') | |
58 | |
59 def important_info_tmrna(block): | |
60 info = {} | |
61 for line in block: | |
62 for matcher in IMPORTANT_INFO_TMRNA: | |
63 matches = IMPORTANT_INFO_TMRNA[matcher].search(line) | |
64 if matches: | |
65 for group in INFO_GROUPS_TMRNA: | |
66 try: | |
67 info[group] = matches.group(group) | |
68 except: | |
69 pass | |
70 return info | |
71 | |
72 import fileinput | |
73 stdin_data = [] | |
74 for line in fileinput.input(): | |
75 stdin_data.append(line) | |
76 | |
77 possible_blocks = [line for line in blocks(stdin_data)] | |
78 | |
79 seqid = None | |
80 print '##gff-version-3' | |
81 # We're off to a GREAT start, if I'm accessing by index you just know that I'm going to do terrible | |
82 # awful things | |
83 for block_idx in range(len(possible_blocks)): | |
84 block = possible_blocks[block_idx] | |
85 data = None | |
86 fasta_defline = None | |
87 | |
88 if block[0].startswith('Searching for') or 'nucleotides in sequence' in block[-1]: | |
89 # Try and get a sequence ID out of it | |
90 try: | |
91 fasta_defline = block[-2].strip() | |
92 except: | |
93 # Failing that, ignore it. | |
94 pass | |
95 else: | |
96 # They DUPLICATE results in multiple places, including a fasta version | |
97 # in the 'full report'. | |
98 possible_ugliness = [x for x in block if x.startswith('>t')] | |
99 if len(possible_ugliness) > 0: | |
100 continue | |
101 | |
102 # However, if it didn't have one of those all important pieces of | |
103 # information, then it's either a different important piece of | |
104 # information, or complete junk | |
105 data = important_info(block) | |
106 | |
107 # I am not proud of any of this. We essentially say "if that block | |
108 # didn't come up with useful info, then try making it a tmrna" | |
109 if len(data.keys()) == 0: | |
110 data = important_info_tmrna(block) | |
111 # And if that fails, just none it. | |
112 if len(data.keys()) == 0: | |
113 data = None | |
114 else: | |
115 # But if it didn't, confirm that we're a tmRNA | |
116 data['type'] = 'tmRNA' | |
117 else: | |
118 # If we did have keys, and didn't pass through any of the tmRNA | |
119 # checks, we're tRNA | |
120 data['type'] = 'tRNA' | |
121 | |
122 # If we got a sequence ID in this block, set the defline | |
123 if 'nucleotides in sequence' in block[-1]: | |
124 try: | |
125 fasta_defline = block[-2].strip() | |
126 except: | |
127 pass | |
128 | |
129 # if a defline is available, try and extract the fasta header ID | |
130 if fasta_defline is not None: | |
131 try: | |
132 seqid = fasta_defline[0:fasta_defline.index(' ')] | |
133 except: | |
134 seqid = fasta_defline | |
135 | |
136 # If there's data | |
137 if data is not None and len(data.keys()) > 1: | |
138 | |
139 # Deal with our flags/notes. | |
140 if data['type'] == 'tRNA': | |
141 # Are these acceptable GFF3 tags? | |
142 notes = { | |
143 'Codon': data['codon'], | |
144 'Anticodon': data['anticodon'], | |
145 } | |
146 if 'pseudo' in data: | |
147 notes['Note'] = 'Possible pseudogene' | |
148 else: | |
149 notes = { | |
150 'Note': 'Tag peptide: ' + data['pep'] + '' | |
151 } | |
152 | |
153 notestr = ';'.join(['%s="%s"' % (k,v) for k,v in notes.iteritems()]) | |
154 | |
155 print '\t'.join([ | |
156 seqid, | |
157 'aragorn', | |
158 data['type'], | |
159 data['start'], | |
160 data['end'], | |
161 '.', | |
162 '.', | |
163 '.', | |
164 notestr | |
165 ]) |