annotate aragorn_out_to_gff3.py @ 1:d788d1abe238 draft

Uploaded
author bgruening
date Thu, 22 Jan 2015 13:15:51 -0500
parents
children 358f58401cd6
Ignore whitespace changes - Everywhere: Within whitespace: At end of lines:
rev   line source
1
d788d1abe238 Uploaded
bgruening
parents:
diff changeset
1 #!/usr/bin/env python
d788d1abe238 Uploaded
bgruening
parents:
diff changeset
2 import re
d788d1abe238 Uploaded
bgruening
parents:
diff changeset
3
d788d1abe238 Uploaded
bgruening
parents:
diff changeset
4 def start_pattern(string):
d788d1abe238 Uploaded
bgruening
parents:
diff changeset
5 return re.match(r'^[0-9]+\.$', string) \
d788d1abe238 Uploaded
bgruening
parents:
diff changeset
6 or string.startswith('Number of possible') \
d788d1abe238 Uploaded
bgruening
parents:
diff changeset
7 or string.startswith('Searching for')
d788d1abe238 Uploaded
bgruening
parents:
diff changeset
8
d788d1abe238 Uploaded
bgruening
parents:
diff changeset
9 def blank_line(string):
d788d1abe238 Uploaded
bgruening
parents:
diff changeset
10 return re.match(r'^\s*$', string)
d788d1abe238 Uploaded
bgruening
parents:
diff changeset
11
d788d1abe238 Uploaded
bgruening
parents:
diff changeset
12 def blocks(iterable):
d788d1abe238 Uploaded
bgruening
parents:
diff changeset
13 accumulator = []
d788d1abe238 Uploaded
bgruening
parents:
diff changeset
14 run_of_blanklines = 0
d788d1abe238 Uploaded
bgruening
parents:
diff changeset
15 for line in iterable:
d788d1abe238 Uploaded
bgruening
parents:
diff changeset
16 # Count blank lines
d788d1abe238 Uploaded
bgruening
parents:
diff changeset
17 if blank_line(line):
d788d1abe238 Uploaded
bgruening
parents:
diff changeset
18 run_of_blanklines += 1
d788d1abe238 Uploaded
bgruening
parents:
diff changeset
19 else:
d788d1abe238 Uploaded
bgruening
parents:
diff changeset
20 run_of_blanklines = 0
d788d1abe238 Uploaded
bgruening
parents:
diff changeset
21
d788d1abe238 Uploaded
bgruening
parents:
diff changeset
22 if start_pattern(line) or run_of_blanklines > 2 or 'Mean G+C' in line:
d788d1abe238 Uploaded
bgruening
parents:
diff changeset
23 if accumulator:
d788d1abe238 Uploaded
bgruening
parents:
diff changeset
24 yield accumulator
d788d1abe238 Uploaded
bgruening
parents:
diff changeset
25 accumulator = [line]
d788d1abe238 Uploaded
bgruening
parents:
diff changeset
26 else:
d788d1abe238 Uploaded
bgruening
parents:
diff changeset
27 accumulator.append(line)
d788d1abe238 Uploaded
bgruening
parents:
diff changeset
28 if accumulator:
d788d1abe238 Uploaded
bgruening
parents:
diff changeset
29 yield accumulator
d788d1abe238 Uploaded
bgruening
parents:
diff changeset
30
d788d1abe238 Uploaded
bgruening
parents:
diff changeset
31 IMPORTANT_INFO = {
d788d1abe238 Uploaded
bgruening
parents:
diff changeset
32 'trna': re.compile(r'tRNA-(?P<codon>[A-Za-z]{3})\((?P<anticodon>[A-Za-z]{3})\)'),
d788d1abe238 Uploaded
bgruening
parents:
diff changeset
33 'trna-alt': re.compile(r'tRNA-\?\((?P<codon>[^\)]+)\)\((?P<anticodon>[A-Za-z]{2,})\)'),
d788d1abe238 Uploaded
bgruening
parents:
diff changeset
34 'bases': re.compile(r'(?P<bases>[0-9]+) bases, %GC = (?P<gc>[0-9.]+)'),
d788d1abe238 Uploaded
bgruening
parents:
diff changeset
35 'sequence': re.compile(r'Sequence (?P<complement>[c]{0,1})\[(?P<start>\d+),(?P<end>\d+)\]'),
d788d1abe238 Uploaded
bgruening
parents:
diff changeset
36 'possible_pseudogene': re.compile(r'(?P<pseudo>Possible Pseudogene)'),
d788d1abe238 Uploaded
bgruening
parents:
diff changeset
37 }
d788d1abe238 Uploaded
bgruening
parents:
diff changeset
38 INFO_GROUPS = ('codon', 'anticodon', 'bases', 'gc', 'complement', 'start', 'end', 'pseudo')
d788d1abe238 Uploaded
bgruening
parents:
diff changeset
39
d788d1abe238 Uploaded
bgruening
parents:
diff changeset
40 def important_info(block):
d788d1abe238 Uploaded
bgruening
parents:
diff changeset
41 info = {}
d788d1abe238 Uploaded
bgruening
parents:
diff changeset
42 for line in block:
d788d1abe238 Uploaded
bgruening
parents:
diff changeset
43 for matcher in IMPORTANT_INFO:
d788d1abe238 Uploaded
bgruening
parents:
diff changeset
44 matches = IMPORTANT_INFO[matcher].search(line)
d788d1abe238 Uploaded
bgruening
parents:
diff changeset
45 if matches:
d788d1abe238 Uploaded
bgruening
parents:
diff changeset
46 for group in INFO_GROUPS:
d788d1abe238 Uploaded
bgruening
parents:
diff changeset
47 try:
d788d1abe238 Uploaded
bgruening
parents:
diff changeset
48 info[group] = matches.group(group)
d788d1abe238 Uploaded
bgruening
parents:
diff changeset
49 except:
d788d1abe238 Uploaded
bgruening
parents:
diff changeset
50 pass
d788d1abe238 Uploaded
bgruening
parents:
diff changeset
51 return info
d788d1abe238 Uploaded
bgruening
parents:
diff changeset
52
d788d1abe238 Uploaded
bgruening
parents:
diff changeset
53 IMPORTANT_INFO_TMRNA = {
d788d1abe238 Uploaded
bgruening
parents:
diff changeset
54 'tag_peptide': re.compile(r'Tag peptide:\s+(?P<pep>[A-Z*]*)'),
d788d1abe238 Uploaded
bgruening
parents:
diff changeset
55 'location': re.compile(r'Location (?P<complement>[c]{0,1})\[(?P<start>\d+),(?P<end>\d+)\]'),
d788d1abe238 Uploaded
bgruening
parents:
diff changeset
56 }
d788d1abe238 Uploaded
bgruening
parents:
diff changeset
57 INFO_GROUPS_TMRNA = ('start', 'end', 'pep')
d788d1abe238 Uploaded
bgruening
parents:
diff changeset
58
d788d1abe238 Uploaded
bgruening
parents:
diff changeset
59 def important_info_tmrna(block):
d788d1abe238 Uploaded
bgruening
parents:
diff changeset
60 info = {}
d788d1abe238 Uploaded
bgruening
parents:
diff changeset
61 for line in block:
d788d1abe238 Uploaded
bgruening
parents:
diff changeset
62 for matcher in IMPORTANT_INFO_TMRNA:
d788d1abe238 Uploaded
bgruening
parents:
diff changeset
63 matches = IMPORTANT_INFO_TMRNA[matcher].search(line)
d788d1abe238 Uploaded
bgruening
parents:
diff changeset
64 if matches:
d788d1abe238 Uploaded
bgruening
parents:
diff changeset
65 for group in INFO_GROUPS_TMRNA:
d788d1abe238 Uploaded
bgruening
parents:
diff changeset
66 try:
d788d1abe238 Uploaded
bgruening
parents:
diff changeset
67 info[group] = matches.group(group)
d788d1abe238 Uploaded
bgruening
parents:
diff changeset
68 except:
d788d1abe238 Uploaded
bgruening
parents:
diff changeset
69 pass
d788d1abe238 Uploaded
bgruening
parents:
diff changeset
70 return info
d788d1abe238 Uploaded
bgruening
parents:
diff changeset
71
d788d1abe238 Uploaded
bgruening
parents:
diff changeset
72 import fileinput
d788d1abe238 Uploaded
bgruening
parents:
diff changeset
73 stdin_data = []
d788d1abe238 Uploaded
bgruening
parents:
diff changeset
74 for line in fileinput.input():
d788d1abe238 Uploaded
bgruening
parents:
diff changeset
75 stdin_data.append(line)
d788d1abe238 Uploaded
bgruening
parents:
diff changeset
76
d788d1abe238 Uploaded
bgruening
parents:
diff changeset
77 possible_blocks = [line for line in blocks(stdin_data)]
d788d1abe238 Uploaded
bgruening
parents:
diff changeset
78
d788d1abe238 Uploaded
bgruening
parents:
diff changeset
79 seqid = None
d788d1abe238 Uploaded
bgruening
parents:
diff changeset
80 print '##gff-version-3'
d788d1abe238 Uploaded
bgruening
parents:
diff changeset
81 # We're off to a GREAT start, if I'm accessing by index you just know that I'm going to do terrible
d788d1abe238 Uploaded
bgruening
parents:
diff changeset
82 # awful things
d788d1abe238 Uploaded
bgruening
parents:
diff changeset
83 for block_idx in range(len(possible_blocks)):
d788d1abe238 Uploaded
bgruening
parents:
diff changeset
84 block = possible_blocks[block_idx]
d788d1abe238 Uploaded
bgruening
parents:
diff changeset
85 data = None
d788d1abe238 Uploaded
bgruening
parents:
diff changeset
86 fasta_defline = None
d788d1abe238 Uploaded
bgruening
parents:
diff changeset
87
d788d1abe238 Uploaded
bgruening
parents:
diff changeset
88 if block[0].startswith('Searching for') or 'nucleotides in sequence' in block[-1]:
d788d1abe238 Uploaded
bgruening
parents:
diff changeset
89 # Try and get a sequence ID out of it
d788d1abe238 Uploaded
bgruening
parents:
diff changeset
90 try:
d788d1abe238 Uploaded
bgruening
parents:
diff changeset
91 fasta_defline = block[-2].strip()
d788d1abe238 Uploaded
bgruening
parents:
diff changeset
92 except:
d788d1abe238 Uploaded
bgruening
parents:
diff changeset
93 # Failing that, ignore it.
d788d1abe238 Uploaded
bgruening
parents:
diff changeset
94 pass
d788d1abe238 Uploaded
bgruening
parents:
diff changeset
95 else:
d788d1abe238 Uploaded
bgruening
parents:
diff changeset
96 # They DUPLICATE results in multiple places, including a fasta version
d788d1abe238 Uploaded
bgruening
parents:
diff changeset
97 # in the 'full report'.
d788d1abe238 Uploaded
bgruening
parents:
diff changeset
98 possible_ugliness = [x for x in block if x.startswith('>t')]
d788d1abe238 Uploaded
bgruening
parents:
diff changeset
99 if len(possible_ugliness) > 0:
d788d1abe238 Uploaded
bgruening
parents:
diff changeset
100 continue
d788d1abe238 Uploaded
bgruening
parents:
diff changeset
101
d788d1abe238 Uploaded
bgruening
parents:
diff changeset
102 # However, if it didn't have one of those all important pieces of
d788d1abe238 Uploaded
bgruening
parents:
diff changeset
103 # information, then it's either a different important piece of
d788d1abe238 Uploaded
bgruening
parents:
diff changeset
104 # information, or complete junk
d788d1abe238 Uploaded
bgruening
parents:
diff changeset
105 data = important_info(block)
d788d1abe238 Uploaded
bgruening
parents:
diff changeset
106
d788d1abe238 Uploaded
bgruening
parents:
diff changeset
107 # I am not proud of any of this. We essentially say "if that block
d788d1abe238 Uploaded
bgruening
parents:
diff changeset
108 # didn't come up with useful info, then try making it a tmrna"
d788d1abe238 Uploaded
bgruening
parents:
diff changeset
109 if len(data.keys()) == 0:
d788d1abe238 Uploaded
bgruening
parents:
diff changeset
110 data = important_info_tmrna(block)
d788d1abe238 Uploaded
bgruening
parents:
diff changeset
111 # And if that fails, just none it.
d788d1abe238 Uploaded
bgruening
parents:
diff changeset
112 if len(data.keys()) == 0:
d788d1abe238 Uploaded
bgruening
parents:
diff changeset
113 data = None
d788d1abe238 Uploaded
bgruening
parents:
diff changeset
114 else:
d788d1abe238 Uploaded
bgruening
parents:
diff changeset
115 # But if it didn't, confirm that we're a tmRNA
d788d1abe238 Uploaded
bgruening
parents:
diff changeset
116 data['type'] = 'tmRNA'
d788d1abe238 Uploaded
bgruening
parents:
diff changeset
117 else:
d788d1abe238 Uploaded
bgruening
parents:
diff changeset
118 # If we did have keys, and didn't pass through any of the tmRNA
d788d1abe238 Uploaded
bgruening
parents:
diff changeset
119 # checks, we're tRNA
d788d1abe238 Uploaded
bgruening
parents:
diff changeset
120 data['type'] = 'tRNA'
d788d1abe238 Uploaded
bgruening
parents:
diff changeset
121
d788d1abe238 Uploaded
bgruening
parents:
diff changeset
122 # If we got a sequence ID in this block, set the defline
d788d1abe238 Uploaded
bgruening
parents:
diff changeset
123 if 'nucleotides in sequence' in block[-1]:
d788d1abe238 Uploaded
bgruening
parents:
diff changeset
124 try:
d788d1abe238 Uploaded
bgruening
parents:
diff changeset
125 fasta_defline = block[-2].strip()
d788d1abe238 Uploaded
bgruening
parents:
diff changeset
126 except:
d788d1abe238 Uploaded
bgruening
parents:
diff changeset
127 pass
d788d1abe238 Uploaded
bgruening
parents:
diff changeset
128
d788d1abe238 Uploaded
bgruening
parents:
diff changeset
129 # if a defline is available, try and extract the fasta header ID
d788d1abe238 Uploaded
bgruening
parents:
diff changeset
130 if fasta_defline is not None:
d788d1abe238 Uploaded
bgruening
parents:
diff changeset
131 try:
d788d1abe238 Uploaded
bgruening
parents:
diff changeset
132 seqid = fasta_defline[0:fasta_defline.index(' ')]
d788d1abe238 Uploaded
bgruening
parents:
diff changeset
133 except:
d788d1abe238 Uploaded
bgruening
parents:
diff changeset
134 seqid = fasta_defline
d788d1abe238 Uploaded
bgruening
parents:
diff changeset
135
d788d1abe238 Uploaded
bgruening
parents:
diff changeset
136 # If there's data
d788d1abe238 Uploaded
bgruening
parents:
diff changeset
137 if data is not None and len(data.keys()) > 1:
d788d1abe238 Uploaded
bgruening
parents:
diff changeset
138
d788d1abe238 Uploaded
bgruening
parents:
diff changeset
139 # Deal with our flags/notes.
d788d1abe238 Uploaded
bgruening
parents:
diff changeset
140 if data['type'] == 'tRNA':
d788d1abe238 Uploaded
bgruening
parents:
diff changeset
141 # Are these acceptable GFF3 tags?
d788d1abe238 Uploaded
bgruening
parents:
diff changeset
142 notes = {
d788d1abe238 Uploaded
bgruening
parents:
diff changeset
143 'Codon': data['codon'],
d788d1abe238 Uploaded
bgruening
parents:
diff changeset
144 'Anticodon': data['anticodon'],
d788d1abe238 Uploaded
bgruening
parents:
diff changeset
145 }
d788d1abe238 Uploaded
bgruening
parents:
diff changeset
146 if 'pseudo' in data:
d788d1abe238 Uploaded
bgruening
parents:
diff changeset
147 notes['Note'] = 'Possible pseudogene'
d788d1abe238 Uploaded
bgruening
parents:
diff changeset
148 else:
d788d1abe238 Uploaded
bgruening
parents:
diff changeset
149 notes = {
d788d1abe238 Uploaded
bgruening
parents:
diff changeset
150 'Note': 'Tag peptide: ' + data['pep'] + ''
d788d1abe238 Uploaded
bgruening
parents:
diff changeset
151 }
d788d1abe238 Uploaded
bgruening
parents:
diff changeset
152
d788d1abe238 Uploaded
bgruening
parents:
diff changeset
153 notestr = ';'.join(['%s="%s"' % (k,v) for k,v in notes.iteritems()])
d788d1abe238 Uploaded
bgruening
parents:
diff changeset
154
d788d1abe238 Uploaded
bgruening
parents:
diff changeset
155 print '\t'.join([
d788d1abe238 Uploaded
bgruening
parents:
diff changeset
156 seqid,
d788d1abe238 Uploaded
bgruening
parents:
diff changeset
157 'aragorn',
d788d1abe238 Uploaded
bgruening
parents:
diff changeset
158 data['type'],
d788d1abe238 Uploaded
bgruening
parents:
diff changeset
159 data['start'],
d788d1abe238 Uploaded
bgruening
parents:
diff changeset
160 data['end'],
d788d1abe238 Uploaded
bgruening
parents:
diff changeset
161 '.',
d788d1abe238 Uploaded
bgruening
parents:
diff changeset
162 '.',
d788d1abe238 Uploaded
bgruening
parents:
diff changeset
163 '.',
d788d1abe238 Uploaded
bgruening
parents:
diff changeset
164 notestr
d788d1abe238 Uploaded
bgruening
parents:
diff changeset
165 ])