1
|
1 #!/usr/bin/env python
|
|
2 import re
|
|
3
|
|
4 def start_pattern(string):
|
|
5 return re.match(r'^[0-9]+\.$', string) \
|
|
6 or string.startswith('Number of possible') \
|
|
7 or string.startswith('Searching for')
|
|
8
|
|
9 def blank_line(string):
|
|
10 return re.match(r'^\s*$', string)
|
|
11
|
|
12 def blocks(iterable):
|
|
13 accumulator = []
|
|
14 run_of_blanklines = 0
|
|
15 for line in iterable:
|
|
16 # Count blank lines
|
|
17 if blank_line(line):
|
|
18 run_of_blanklines += 1
|
|
19 else:
|
|
20 run_of_blanklines = 0
|
|
21
|
|
22 if start_pattern(line) or run_of_blanklines > 2 or 'Mean G+C' in line:
|
|
23 if accumulator:
|
|
24 yield accumulator
|
|
25 accumulator = [line]
|
|
26 else:
|
|
27 accumulator.append(line)
|
|
28 if accumulator:
|
|
29 yield accumulator
|
|
30
|
|
31 IMPORTANT_INFO = {
|
|
32 'trna': re.compile(r'tRNA-(?P<codon>[A-Za-z]{3})\((?P<anticodon>[A-Za-z]{3})\)'),
|
|
33 'trna-alt': re.compile(r'tRNA-\?\((?P<codon>[^\)]+)\)\((?P<anticodon>[A-Za-z]{2,})\)'),
|
|
34 'bases': re.compile(r'(?P<bases>[0-9]+) bases, %GC = (?P<gc>[0-9.]+)'),
|
|
35 'sequence': re.compile(r'Sequence (?P<complement>[c]{0,1})\[(?P<start>\d+),(?P<end>\d+)\]'),
|
|
36 'possible_pseudogene': re.compile(r'(?P<pseudo>Possible Pseudogene)'),
|
|
37 }
|
|
38 INFO_GROUPS = ('codon', 'anticodon', 'bases', 'gc', 'complement', 'start', 'end', 'pseudo')
|
|
39
|
|
40 def important_info(block):
|
|
41 info = {}
|
|
42 for line in block:
|
|
43 for matcher in IMPORTANT_INFO:
|
|
44 matches = IMPORTANT_INFO[matcher].search(line)
|
|
45 if matches:
|
|
46 for group in INFO_GROUPS:
|
|
47 try:
|
|
48 info[group] = matches.group(group)
|
|
49 except:
|
|
50 pass
|
|
51 return info
|
|
52
|
|
53 IMPORTANT_INFO_TMRNA = {
|
|
54 'tag_peptide': re.compile(r'Tag peptide:\s+(?P<pep>[A-Z*]*)'),
|
|
55 'location': re.compile(r'Location (?P<complement>[c]{0,1})\[(?P<start>\d+),(?P<end>\d+)\]'),
|
|
56 }
|
|
57 INFO_GROUPS_TMRNA = ('start', 'end', 'pep')
|
|
58
|
|
59 def important_info_tmrna(block):
|
|
60 info = {}
|
|
61 for line in block:
|
|
62 for matcher in IMPORTANT_INFO_TMRNA:
|
|
63 matches = IMPORTANT_INFO_TMRNA[matcher].search(line)
|
|
64 if matches:
|
|
65 for group in INFO_GROUPS_TMRNA:
|
|
66 try:
|
|
67 info[group] = matches.group(group)
|
|
68 except:
|
|
69 pass
|
|
70 return info
|
|
71
|
|
72 import fileinput
|
|
73 stdin_data = []
|
|
74 for line in fileinput.input():
|
|
75 stdin_data.append(line)
|
|
76
|
|
77 possible_blocks = [line for line in blocks(stdin_data)]
|
|
78
|
|
79 seqid = None
|
|
80 print '##gff-version-3'
|
|
81 # We're off to a GREAT start, if I'm accessing by index you just know that I'm going to do terrible
|
|
82 # awful things
|
|
83 for block_idx in range(len(possible_blocks)):
|
|
84 block = possible_blocks[block_idx]
|
|
85 data = None
|
|
86 fasta_defline = None
|
|
87
|
|
88 if block[0].startswith('Searching for') or 'nucleotides in sequence' in block[-1]:
|
|
89 # Try and get a sequence ID out of it
|
|
90 try:
|
|
91 fasta_defline = block[-2].strip()
|
|
92 except:
|
|
93 # Failing that, ignore it.
|
|
94 pass
|
|
95 else:
|
|
96 # They DUPLICATE results in multiple places, including a fasta version
|
|
97 # in the 'full report'.
|
|
98 possible_ugliness = [x for x in block if x.startswith('>t')]
|
|
99 if len(possible_ugliness) > 0:
|
|
100 continue
|
|
101
|
|
102 # However, if it didn't have one of those all important pieces of
|
|
103 # information, then it's either a different important piece of
|
|
104 # information, or complete junk
|
|
105 data = important_info(block)
|
|
106
|
|
107 # I am not proud of any of this. We essentially say "if that block
|
|
108 # didn't come up with useful info, then try making it a tmrna"
|
|
109 if len(data.keys()) == 0:
|
|
110 data = important_info_tmrna(block)
|
|
111 # And if that fails, just none it.
|
|
112 if len(data.keys()) == 0:
|
|
113 data = None
|
|
114 else:
|
|
115 # But if it didn't, confirm that we're a tmRNA
|
|
116 data['type'] = 'tmRNA'
|
|
117 else:
|
|
118 # If we did have keys, and didn't pass through any of the tmRNA
|
|
119 # checks, we're tRNA
|
|
120 data['type'] = 'tRNA'
|
|
121
|
|
122 # If we got a sequence ID in this block, set the defline
|
|
123 if 'nucleotides in sequence' in block[-1]:
|
|
124 try:
|
|
125 fasta_defline = block[-2].strip()
|
|
126 except:
|
|
127 pass
|
|
128
|
|
129 # if a defline is available, try and extract the fasta header ID
|
|
130 if fasta_defline is not None:
|
|
131 try:
|
|
132 seqid = fasta_defline[0:fasta_defline.index(' ')]
|
|
133 except:
|
|
134 seqid = fasta_defline
|
|
135
|
|
136 # If there's data
|
|
137 if data is not None and len(data.keys()) > 1:
|
|
138
|
|
139 # Deal with our flags/notes.
|
|
140 if data['type'] == 'tRNA':
|
|
141 # Are these acceptable GFF3 tags?
|
|
142 notes = {
|
|
143 'Codon': data['codon'],
|
|
144 'Anticodon': data['anticodon'],
|
|
145 }
|
|
146 if 'pseudo' in data:
|
|
147 notes['Note'] = 'Possible pseudogene'
|
|
148 else:
|
|
149 notes = {
|
|
150 'Note': 'Tag peptide: ' + data['pep'] + ''
|
|
151 }
|
|
152
|
|
153 notestr = ';'.join(['%s="%s"' % (k,v) for k,v in notes.iteritems()])
|
|
154
|
|
155 print '\t'.join([
|
|
156 seqid,
|
|
157 'aragorn',
|
|
158 data['type'],
|
|
159 data['start'],
|
|
160 data['end'],
|
|
161 '.',
|
|
162 '.',
|
|
163 '.',
|
|
164 notestr
|
|
165 ])
|