Mercurial > repos > bgruening > trna_prediction
comparison aragorn_out_to_gff3.py @ 2:358f58401cd6 draft default tip
planemo upload for repository https://github.com/bgruening/galaxytools/tree/master/tools/rna_tools/trna_prediction commit cfb19d75629f02e0dea4475c16c016ed5510eb44
author | bgruening |
---|---|
date | Wed, 26 Jul 2017 10:14:05 -0400 |
parents | d788d1abe238 |
children |
comparison
equal
deleted
inserted
replaced
1:d788d1abe238 | 2:358f58401cd6 |
---|---|
1 #!/usr/bin/env python | 1 #!/usr/bin/env python |
2 import re | 2 import sys |
3 | 3 |
4 def start_pattern(string): | 4 full_gene_model = False |
5 return re.match(r'^[0-9]+\.$', string) \ | 5 if '--full' in sys.argv: |
6 or string.startswith('Number of possible') \ | 6 full_gene_model = True |
7 or string.startswith('Searching for') | |
8 | 7 |
9 def blank_line(string): | 8 genome_id = None |
10 return re.match(r'^\s*$', string) | 9 stdin_data = [] |
10 KEY_ORDER = ('parent', 'source', 'type', 'start', 'end', 'score', 'strand', | |
11 '8', 'quals') | |
11 | 12 |
12 def blocks(iterable): | 13 # Table of amino acids |
13 accumulator = [] | 14 aa_table = { |
14 run_of_blanklines = 0 | 15 'Ala' : 'A', |
15 for line in iterable: | 16 'Arg' : 'R', |
16 # Count blank lines | 17 'Asn' : 'N', |
17 if blank_line(line): | 18 'Asp' : 'D', |
18 run_of_blanklines += 1 | 19 'Cys' : 'C', |
19 else: | 20 'Gln' : 'Q', |
20 run_of_blanklines = 0 | 21 'Glu' : 'E', |
22 'Gly' : 'G', | |
23 'His' : 'H', | |
24 'Ile' : 'I', | |
25 'Leu' : 'L', | |
26 'Lys' : 'K', | |
27 'Met' : 'M', | |
28 'Phe' : 'F', | |
29 'Pro' : 'P', | |
30 'Ser' : 'S', | |
31 'Thr' : 'T', | |
32 'Trp' : 'W', | |
33 'Tyr' : 'Y', | |
34 'Val' : 'V', | |
35 'Pyl' : 'O', | |
36 'seC' : 'U', | |
37 '???' : 'X' } | |
21 | 38 |
22 if start_pattern(line) or run_of_blanklines > 2 or 'Mean G+C' in line: | 39 def output_line(gff3): |
23 if accumulator: | 40 print '\t'.join(str(gff3[x]) for x in KEY_ORDER) |
24 yield accumulator | |
25 accumulator = [line] | |
26 else: | |
27 accumulator.append(line) | |
28 if accumulator: | |
29 yield accumulator | |
30 | 41 |
31 IMPORTANT_INFO = { | 42 print '##gff-version 3' |
32 'trna': re.compile(r'tRNA-(?P<codon>[A-Za-z]{3})\((?P<anticodon>[A-Za-z]{3})\)'), | 43 for line in sys.stdin: |
33 'trna-alt': re.compile(r'tRNA-\?\((?P<codon>[^\)]+)\)\((?P<anticodon>[A-Za-z]{2,})\)'), | 44 if line.startswith('>'): |
34 'bases': re.compile(r'(?P<bases>[0-9]+) bases, %GC = (?P<gc>[0-9.]+)'), | 45 genome_id = line[1:].strip() |
35 'sequence': re.compile(r'Sequence (?P<complement>[c]{0,1})\[(?P<start>\d+),(?P<end>\d+)\]'), | 46 if ' ' in genome_id: |
36 'possible_pseudogene': re.compile(r'(?P<pseudo>Possible Pseudogene)'), | 47 genome_id = genome_id[0:genome_id.index(' ')] |
37 } | 48 else: |
38 INFO_GROUPS = ('codon', 'anticodon', 'bases', 'gc', 'complement', 'start', 'end', 'pseudo') | 49 data = line.split() |
50 if len(data) == 5: | |
51 # Parse data | |
52 strand = '-' if data[2].startswith('c') else '+' | |
53 start, end = data[2][data[2].index('[') + 1:-1].split(',') | |
39 | 54 |
40 def important_info(block): | 55 gff3 = { |
41 info = {} | 56 'parent': genome_id, |
42 for line in block: | 57 'source': 'aragorn', |
43 for matcher in IMPORTANT_INFO: | 58 'start': int(start), |
44 matches = IMPORTANT_INFO[matcher].search(line) | 59 'end': int(end), |
45 if matches: | 60 'strand': strand, |
46 for group in INFO_GROUPS: | 61 'score': '.', |
47 try: | 62 '8': '.', |
48 info[group] = matches.group(group) | |
49 except: | |
50 pass | |
51 return info | |
52 | |
53 IMPORTANT_INFO_TMRNA = { | |
54 'tag_peptide': re.compile(r'Tag peptide:\s+(?P<pep>[A-Z*]*)'), | |
55 'location': re.compile(r'Location (?P<complement>[c]{0,1})\[(?P<start>\d+),(?P<end>\d+)\]'), | |
56 } | |
57 INFO_GROUPS_TMRNA = ('start', 'end', 'pep') | |
58 | |
59 def important_info_tmrna(block): | |
60 info = {} | |
61 for line in block: | |
62 for matcher in IMPORTANT_INFO_TMRNA: | |
63 matches = IMPORTANT_INFO_TMRNA[matcher].search(line) | |
64 if matches: | |
65 for group in INFO_GROUPS_TMRNA: | |
66 try: | |
67 info[group] = matches.group(group) | |
68 except: | |
69 pass | |
70 return info | |
71 | |
72 import fileinput | |
73 stdin_data = [] | |
74 for line in fileinput.input(): | |
75 stdin_data.append(line) | |
76 | |
77 possible_blocks = [line for line in blocks(stdin_data)] | |
78 | |
79 seqid = None | |
80 print '##gff-version-3' | |
81 # We're off to a GREAT start, if I'm accessing by index you just know that I'm going to do terrible | |
82 # awful things | |
83 for block_idx in range(len(possible_blocks)): | |
84 block = possible_blocks[block_idx] | |
85 data = None | |
86 fasta_defline = None | |
87 | |
88 if block[0].startswith('Searching for') or 'nucleotides in sequence' in block[-1]: | |
89 # Try and get a sequence ID out of it | |
90 try: | |
91 fasta_defline = block[-2].strip() | |
92 except: | |
93 # Failing that, ignore it. | |
94 pass | |
95 else: | |
96 # They DUPLICATE results in multiple places, including a fasta version | |
97 # in the 'full report'. | |
98 possible_ugliness = [x for x in block if x.startswith('>t')] | |
99 if len(possible_ugliness) > 0: | |
100 continue | |
101 | |
102 # However, if it didn't have one of those all important pieces of | |
103 # information, then it's either a different important piece of | |
104 # information, or complete junk | |
105 data = important_info(block) | |
106 | |
107 # I am not proud of any of this. We essentially say "if that block | |
108 # didn't come up with useful info, then try making it a tmrna" | |
109 if len(data.keys()) == 0: | |
110 data = important_info_tmrna(block) | |
111 # And if that fails, just none it. | |
112 if len(data.keys()) == 0: | |
113 data = None | |
114 else: | |
115 # But if it didn't, confirm that we're a tmRNA | |
116 data['type'] = 'tmRNA' | |
117 else: | |
118 # If we did have keys, and didn't pass through any of the tmRNA | |
119 # checks, we're tRNA | |
120 data['type'] = 'tRNA' | |
121 | |
122 # If we got a sequence ID in this block, set the defline | |
123 if 'nucleotides in sequence' in block[-1]: | |
124 try: | |
125 fasta_defline = block[-2].strip() | |
126 except: | |
127 pass | |
128 | |
129 # if a defline is available, try and extract the fasta header ID | |
130 if fasta_defline is not None: | |
131 try: | |
132 seqid = fasta_defline[0:fasta_defline.index(' ')] | |
133 except: | |
134 seqid = fasta_defline | |
135 | |
136 # If there's data | |
137 if data is not None and len(data.keys()) > 1: | |
138 | |
139 # Deal with our flags/notes. | |
140 if data['type'] == 'tRNA': | |
141 # Are these acceptable GFF3 tags? | |
142 notes = { | |
143 'Codon': data['codon'], | |
144 'Anticodon': data['anticodon'], | |
145 } | |
146 if 'pseudo' in data: | |
147 notes['Note'] = 'Possible pseudogene' | |
148 else: | |
149 notes = { | |
150 'Note': 'Tag peptide: ' + data['pep'] + '' | |
151 } | 63 } |
152 | 64 |
153 notestr = ';'.join(['%s="%s"' % (k,v) for k,v in notes.iteritems()]) | 65 aa_long = data[1][5:] |
66 aa_short = aa_table[aa_long] | |
67 anticodon = data[4][1:data[4].index(")")].upper().replace("T", "U") | |
68 name = 'trn{}-{}'.format(aa_short, anticodon) | |
154 | 69 |
155 print '\t'.join([ | 70 if not full_gene_model: |
156 seqid, | 71 gff3.update({ |
157 'aragorn', | 72 'type': 'tRNA', |
158 data['type'], | 73 'quals': 'ID=tRNA{0}.{1};Name={name};product={2}'.format(genome_id, *data, name = name), |
159 data['start'], | 74 }) |
160 data['end'], | 75 output_line(gff3) |
161 '.', | 76 else: |
162 '.', | 77 gff3.update({ |
163 '.', | 78 'type': 'gene', |
164 notestr | 79 'quals': 'ID=gene{0}.{1};Name={name};product={2}'.format(genome_id, *data, name = name), |
165 ]) | 80 }) |
81 output_line(gff3) | |
82 gff3.update({ | |
83 'type': 'tRNA', | |
84 'quals': 'ID=tRNA{0}.{1};Parent=gene{0}.{1};Name={name};product={2}'.format(genome_id, *data, name = name), | |
85 }) | |
86 output_line(gff3) | |
87 | |
88 # If no introns | |
89 if ')i(' not in data[4]: | |
90 gff3['type'] = 'exon' | |
91 gff3['quals'] = 'Parent=tRNA{0}.{1}'.format(genome_id, *data) | |
92 output_line(gff3) | |
93 else: | |
94 intron_location = data[4][data[4].rindex('(') + 1:-1].split(',') | |
95 intron_start, intron_length = map(int, intron_location) | |
96 if strand == '+': | |
97 original_end = gff3['end'] | |
98 else: | |
99 original_end = gff3['start'] | |
100 | |
101 # EXON | |
102 gff3.update({ | |
103 'type': 'exon', | |
104 'quals': 'Parent=tRNA{0}.{1}'.format(genome_id, *data), | |
105 }) | |
106 if strand == '+': | |
107 gff3['end'] = gff3['start'] + intron_start - 2 | |
108 else: | |
109 gff3['start'] = gff3['end'] - intron_start + 2 | |
110 | |
111 output_line(gff3) | |
112 | |
113 # INTRON | |
114 gff3.update({ | |
115 'type': 'intron', | |
116 'quals': 'Parent=tRNA{0}.{1}'.format(genome_id, *data), | |
117 }) | |
118 if strand == '+': | |
119 gff3['start'] = gff3['end'] + 1 | |
120 gff3['end'] = gff3['start'] + intron_length + 2 | |
121 else: | |
122 gff3['end'] = gff3['start'] - 1 | |
123 gff3['start'] = gff3['end'] - intron_length + 1 | |
124 | |
125 output_line(gff3) | |
126 | |
127 # EXON | |
128 gff3.update({ | |
129 'type': 'exon', | |
130 'quals': 'Parent=tRNA{0}.{1}'.format(genome_id, *data), | |
131 }) | |
132 if strand == '+': | |
133 gff3['start'] = gff3['end'] + 1 | |
134 gff3['end'] = original_end | |
135 else: | |
136 gff3['end'] = gff3['start'] - 1 | |
137 gff3['start'] = original_end | |
138 | |
139 output_line(gff3) |