comparison aragorn_out_to_gff3.py @ 1:d788d1abe238 draft

Uploaded
author bgruening
date Thu, 22 Jan 2015 13:15:51 -0500
parents
children 358f58401cd6
comparison
equal deleted inserted replaced
0:d34f31cbc9dd 1:d788d1abe238
1 #!/usr/bin/env python
2 import re
3
4 def start_pattern(string):
5 return re.match(r'^[0-9]+\.$', string) \
6 or string.startswith('Number of possible') \
7 or string.startswith('Searching for')
8
9 def blank_line(string):
10 return re.match(r'^\s*$', string)
11
12 def blocks(iterable):
13 accumulator = []
14 run_of_blanklines = 0
15 for line in iterable:
16 # Count blank lines
17 if blank_line(line):
18 run_of_blanklines += 1
19 else:
20 run_of_blanklines = 0
21
22 if start_pattern(line) or run_of_blanklines > 2 or 'Mean G+C' in line:
23 if accumulator:
24 yield accumulator
25 accumulator = [line]
26 else:
27 accumulator.append(line)
28 if accumulator:
29 yield accumulator
30
31 IMPORTANT_INFO = {
32 'trna': re.compile(r'tRNA-(?P<codon>[A-Za-z]{3})\((?P<anticodon>[A-Za-z]{3})\)'),
33 'trna-alt': re.compile(r'tRNA-\?\((?P<codon>[^\)]+)\)\((?P<anticodon>[A-Za-z]{2,})\)'),
34 'bases': re.compile(r'(?P<bases>[0-9]+) bases, %GC = (?P<gc>[0-9.]+)'),
35 'sequence': re.compile(r'Sequence (?P<complement>[c]{0,1})\[(?P<start>\d+),(?P<end>\d+)\]'),
36 'possible_pseudogene': re.compile(r'(?P<pseudo>Possible Pseudogene)'),
37 }
38 INFO_GROUPS = ('codon', 'anticodon', 'bases', 'gc', 'complement', 'start', 'end', 'pseudo')
39
40 def important_info(block):
41 info = {}
42 for line in block:
43 for matcher in IMPORTANT_INFO:
44 matches = IMPORTANT_INFO[matcher].search(line)
45 if matches:
46 for group in INFO_GROUPS:
47 try:
48 info[group] = matches.group(group)
49 except:
50 pass
51 return info
52
53 IMPORTANT_INFO_TMRNA = {
54 'tag_peptide': re.compile(r'Tag peptide:\s+(?P<pep>[A-Z*]*)'),
55 'location': re.compile(r'Location (?P<complement>[c]{0,1})\[(?P<start>\d+),(?P<end>\d+)\]'),
56 }
57 INFO_GROUPS_TMRNA = ('start', 'end', 'pep')
58
59 def important_info_tmrna(block):
60 info = {}
61 for line in block:
62 for matcher in IMPORTANT_INFO_TMRNA:
63 matches = IMPORTANT_INFO_TMRNA[matcher].search(line)
64 if matches:
65 for group in INFO_GROUPS_TMRNA:
66 try:
67 info[group] = matches.group(group)
68 except:
69 pass
70 return info
71
72 import fileinput
73 stdin_data = []
74 for line in fileinput.input():
75 stdin_data.append(line)
76
77 possible_blocks = [line for line in blocks(stdin_data)]
78
79 seqid = None
80 print '##gff-version-3'
81 # We're off to a GREAT start, if I'm accessing by index you just know that I'm going to do terrible
82 # awful things
83 for block_idx in range(len(possible_blocks)):
84 block = possible_blocks[block_idx]
85 data = None
86 fasta_defline = None
87
88 if block[0].startswith('Searching for') or 'nucleotides in sequence' in block[-1]:
89 # Try and get a sequence ID out of it
90 try:
91 fasta_defline = block[-2].strip()
92 except:
93 # Failing that, ignore it.
94 pass
95 else:
96 # They DUPLICATE results in multiple places, including a fasta version
97 # in the 'full report'.
98 possible_ugliness = [x for x in block if x.startswith('>t')]
99 if len(possible_ugliness) > 0:
100 continue
101
102 # However, if it didn't have one of those all important pieces of
103 # information, then it's either a different important piece of
104 # information, or complete junk
105 data = important_info(block)
106
107 # I am not proud of any of this. We essentially say "if that block
108 # didn't come up with useful info, then try making it a tmrna"
109 if len(data.keys()) == 0:
110 data = important_info_tmrna(block)
111 # And if that fails, just none it.
112 if len(data.keys()) == 0:
113 data = None
114 else:
115 # But if it didn't, confirm that we're a tmRNA
116 data['type'] = 'tmRNA'
117 else:
118 # If we did have keys, and didn't pass through any of the tmRNA
119 # checks, we're tRNA
120 data['type'] = 'tRNA'
121
122 # If we got a sequence ID in this block, set the defline
123 if 'nucleotides in sequence' in block[-1]:
124 try:
125 fasta_defline = block[-2].strip()
126 except:
127 pass
128
129 # if a defline is available, try and extract the fasta header ID
130 if fasta_defline is not None:
131 try:
132 seqid = fasta_defline[0:fasta_defline.index(' ')]
133 except:
134 seqid = fasta_defline
135
136 # If there's data
137 if data is not None and len(data.keys()) > 1:
138
139 # Deal with our flags/notes.
140 if data['type'] == 'tRNA':
141 # Are these acceptable GFF3 tags?
142 notes = {
143 'Codon': data['codon'],
144 'Anticodon': data['anticodon'],
145 }
146 if 'pseudo' in data:
147 notes['Note'] = 'Possible pseudogene'
148 else:
149 notes = {
150 'Note': 'Tag peptide: ' + data['pep'] + ''
151 }
152
153 notestr = ';'.join(['%s="%s"' % (k,v) for k,v in notes.iteritems()])
154
155 print '\t'.join([
156 seqid,
157 'aragorn',
158 data['type'],
159 data['start'],
160 data['end'],
161 '.',
162 '.',
163 '.',
164 notestr
165 ])