comparison aragorn_out_to_gff3.py @ 2:358f58401cd6 draft default tip

planemo upload for repository https://github.com/bgruening/galaxytools/tree/master/tools/rna_tools/trna_prediction commit cfb19d75629f02e0dea4475c16c016ed5510eb44
author bgruening
date Wed, 26 Jul 2017 10:14:05 -0400
parents d788d1abe238
children
comparison
equal deleted inserted replaced
1:d788d1abe238 2:358f58401cd6
1 #!/usr/bin/env python 1 #!/usr/bin/env python
2 import re 2 import sys
3 3
4 def start_pattern(string): 4 full_gene_model = False
5 return re.match(r'^[0-9]+\.$', string) \ 5 if '--full' in sys.argv:
6 or string.startswith('Number of possible') \ 6 full_gene_model = True
7 or string.startswith('Searching for')
8 7
9 def blank_line(string): 8 genome_id = None
10 return re.match(r'^\s*$', string) 9 stdin_data = []
10 KEY_ORDER = ('parent', 'source', 'type', 'start', 'end', 'score', 'strand',
11 '8', 'quals')
11 12
12 def blocks(iterable): 13 # Table of amino acids
13 accumulator = [] 14 aa_table = {
14 run_of_blanklines = 0 15 'Ala' : 'A',
15 for line in iterable: 16 'Arg' : 'R',
16 # Count blank lines 17 'Asn' : 'N',
17 if blank_line(line): 18 'Asp' : 'D',
18 run_of_blanklines += 1 19 'Cys' : 'C',
19 else: 20 'Gln' : 'Q',
20 run_of_blanklines = 0 21 'Glu' : 'E',
22 'Gly' : 'G',
23 'His' : 'H',
24 'Ile' : 'I',
25 'Leu' : 'L',
26 'Lys' : 'K',
27 'Met' : 'M',
28 'Phe' : 'F',
29 'Pro' : 'P',
30 'Ser' : 'S',
31 'Thr' : 'T',
32 'Trp' : 'W',
33 'Tyr' : 'Y',
34 'Val' : 'V',
35 'Pyl' : 'O',
36 'seC' : 'U',
37 '???' : 'X' }
21 38
22 if start_pattern(line) or run_of_blanklines > 2 or 'Mean G+C' in line: 39 def output_line(gff3):
23 if accumulator: 40 print '\t'.join(str(gff3[x]) for x in KEY_ORDER)
24 yield accumulator
25 accumulator = [line]
26 else:
27 accumulator.append(line)
28 if accumulator:
29 yield accumulator
30 41
31 IMPORTANT_INFO = { 42 print '##gff-version 3'
32 'trna': re.compile(r'tRNA-(?P<codon>[A-Za-z]{3})\((?P<anticodon>[A-Za-z]{3})\)'), 43 for line in sys.stdin:
33 'trna-alt': re.compile(r'tRNA-\?\((?P<codon>[^\)]+)\)\((?P<anticodon>[A-Za-z]{2,})\)'), 44 if line.startswith('>'):
34 'bases': re.compile(r'(?P<bases>[0-9]+) bases, %GC = (?P<gc>[0-9.]+)'), 45 genome_id = line[1:].strip()
35 'sequence': re.compile(r'Sequence (?P<complement>[c]{0,1})\[(?P<start>\d+),(?P<end>\d+)\]'), 46 if ' ' in genome_id:
36 'possible_pseudogene': re.compile(r'(?P<pseudo>Possible Pseudogene)'), 47 genome_id = genome_id[0:genome_id.index(' ')]
37 } 48 else:
38 INFO_GROUPS = ('codon', 'anticodon', 'bases', 'gc', 'complement', 'start', 'end', 'pseudo') 49 data = line.split()
50 if len(data) == 5:
51 # Parse data
52 strand = '-' if data[2].startswith('c') else '+'
53 start, end = data[2][data[2].index('[') + 1:-1].split(',')
39 54
40 def important_info(block): 55 gff3 = {
41 info = {} 56 'parent': genome_id,
42 for line in block: 57 'source': 'aragorn',
43 for matcher in IMPORTANT_INFO: 58 'start': int(start),
44 matches = IMPORTANT_INFO[matcher].search(line) 59 'end': int(end),
45 if matches: 60 'strand': strand,
46 for group in INFO_GROUPS: 61 'score': '.',
47 try: 62 '8': '.',
48 info[group] = matches.group(group)
49 except:
50 pass
51 return info
52
53 IMPORTANT_INFO_TMRNA = {
54 'tag_peptide': re.compile(r'Tag peptide:\s+(?P<pep>[A-Z*]*)'),
55 'location': re.compile(r'Location (?P<complement>[c]{0,1})\[(?P<start>\d+),(?P<end>\d+)\]'),
56 }
57 INFO_GROUPS_TMRNA = ('start', 'end', 'pep')
58
59 def important_info_tmrna(block):
60 info = {}
61 for line in block:
62 for matcher in IMPORTANT_INFO_TMRNA:
63 matches = IMPORTANT_INFO_TMRNA[matcher].search(line)
64 if matches:
65 for group in INFO_GROUPS_TMRNA:
66 try:
67 info[group] = matches.group(group)
68 except:
69 pass
70 return info
71
72 import fileinput
73 stdin_data = []
74 for line in fileinput.input():
75 stdin_data.append(line)
76
77 possible_blocks = [line for line in blocks(stdin_data)]
78
79 seqid = None
80 print '##gff-version-3'
81 # We're off to a GREAT start, if I'm accessing by index you just know that I'm going to do terrible
82 # awful things
83 for block_idx in range(len(possible_blocks)):
84 block = possible_blocks[block_idx]
85 data = None
86 fasta_defline = None
87
88 if block[0].startswith('Searching for') or 'nucleotides in sequence' in block[-1]:
89 # Try and get a sequence ID out of it
90 try:
91 fasta_defline = block[-2].strip()
92 except:
93 # Failing that, ignore it.
94 pass
95 else:
96 # They DUPLICATE results in multiple places, including a fasta version
97 # in the 'full report'.
98 possible_ugliness = [x for x in block if x.startswith('>t')]
99 if len(possible_ugliness) > 0:
100 continue
101
102 # However, if it didn't have one of those all important pieces of
103 # information, then it's either a different important piece of
104 # information, or complete junk
105 data = important_info(block)
106
107 # I am not proud of any of this. We essentially say "if that block
108 # didn't come up with useful info, then try making it a tmrna"
109 if len(data.keys()) == 0:
110 data = important_info_tmrna(block)
111 # And if that fails, just none it.
112 if len(data.keys()) == 0:
113 data = None
114 else:
115 # But if it didn't, confirm that we're a tmRNA
116 data['type'] = 'tmRNA'
117 else:
118 # If we did have keys, and didn't pass through any of the tmRNA
119 # checks, we're tRNA
120 data['type'] = 'tRNA'
121
122 # If we got a sequence ID in this block, set the defline
123 if 'nucleotides in sequence' in block[-1]:
124 try:
125 fasta_defline = block[-2].strip()
126 except:
127 pass
128
129 # if a defline is available, try and extract the fasta header ID
130 if fasta_defline is not None:
131 try:
132 seqid = fasta_defline[0:fasta_defline.index(' ')]
133 except:
134 seqid = fasta_defline
135
136 # If there's data
137 if data is not None and len(data.keys()) > 1:
138
139 # Deal with our flags/notes.
140 if data['type'] == 'tRNA':
141 # Are these acceptable GFF3 tags?
142 notes = {
143 'Codon': data['codon'],
144 'Anticodon': data['anticodon'],
145 }
146 if 'pseudo' in data:
147 notes['Note'] = 'Possible pseudogene'
148 else:
149 notes = {
150 'Note': 'Tag peptide: ' + data['pep'] + ''
151 } 63 }
152 64
153 notestr = ';'.join(['%s="%s"' % (k,v) for k,v in notes.iteritems()]) 65 aa_long = data[1][5:]
66 aa_short = aa_table[aa_long]
67 anticodon = data[4][1:data[4].index(")")].upper().replace("T", "U")
68 name = 'trn{}-{}'.format(aa_short, anticodon)
154 69
155 print '\t'.join([ 70 if not full_gene_model:
156 seqid, 71 gff3.update({
157 'aragorn', 72 'type': 'tRNA',
158 data['type'], 73 'quals': 'ID=tRNA{0}.{1};Name={name};product={2}'.format(genome_id, *data, name = name),
159 data['start'], 74 })
160 data['end'], 75 output_line(gff3)
161 '.', 76 else:
162 '.', 77 gff3.update({
163 '.', 78 'type': 'gene',
164 notestr 79 'quals': 'ID=gene{0}.{1};Name={name};product={2}'.format(genome_id, *data, name = name),
165 ]) 80 })
81 output_line(gff3)
82 gff3.update({
83 'type': 'tRNA',
84 'quals': 'ID=tRNA{0}.{1};Parent=gene{0}.{1};Name={name};product={2}'.format(genome_id, *data, name = name),
85 })
86 output_line(gff3)
87
88 # If no introns
89 if ')i(' not in data[4]:
90 gff3['type'] = 'exon'
91 gff3['quals'] = 'Parent=tRNA{0}.{1}'.format(genome_id, *data)
92 output_line(gff3)
93 else:
94 intron_location = data[4][data[4].rindex('(') + 1:-1].split(',')
95 intron_start, intron_length = map(int, intron_location)
96 if strand == '+':
97 original_end = gff3['end']
98 else:
99 original_end = gff3['start']
100
101 # EXON
102 gff3.update({
103 'type': 'exon',
104 'quals': 'Parent=tRNA{0}.{1}'.format(genome_id, *data),
105 })
106 if strand == '+':
107 gff3['end'] = gff3['start'] + intron_start - 2
108 else:
109 gff3['start'] = gff3['end'] - intron_start + 2
110
111 output_line(gff3)
112
113 # INTRON
114 gff3.update({
115 'type': 'intron',
116 'quals': 'Parent=tRNA{0}.{1}'.format(genome_id, *data),
117 })
118 if strand == '+':
119 gff3['start'] = gff3['end'] + 1
120 gff3['end'] = gff3['start'] + intron_length + 2
121 else:
122 gff3['end'] = gff3['start'] - 1
123 gff3['start'] = gff3['end'] - intron_length + 1
124
125 output_line(gff3)
126
127 # EXON
128 gff3.update({
129 'type': 'exon',
130 'quals': 'Parent=tRNA{0}.{1}'.format(genome_id, *data),
131 })
132 if strand == '+':
133 gff3['start'] = gff3['end'] + 1
134 gff3['end'] = original_end
135 else:
136 gff3['end'] = gff3['start'] - 1
137 gff3['start'] = original_end
138
139 output_line(gff3)