Mercurial > repos > bgruening > trna_prediction
diff aragorn_out_to_gff3.py @ 2:358f58401cd6 draft default tip
planemo upload for repository https://github.com/bgruening/galaxytools/tree/master/tools/rna_tools/trna_prediction commit cfb19d75629f02e0dea4475c16c016ed5510eb44
author | bgruening |
---|---|
date | Wed, 26 Jul 2017 10:14:05 -0400 |
parents | d788d1abe238 |
children |
line wrap: on
line diff
--- a/aragorn_out_to_gff3.py Thu Jan 22 13:15:51 2015 -0500 +++ b/aragorn_out_to_gff3.py Wed Jul 26 10:14:05 2017 -0400 @@ -1,165 +1,139 @@ #!/usr/bin/env python -import re - -def start_pattern(string): - return re.match(r'^[0-9]+\.$', string) \ - or string.startswith('Number of possible') \ - or string.startswith('Searching for') - -def blank_line(string): - return re.match(r'^\s*$', string) - -def blocks(iterable): - accumulator = [] - run_of_blanklines = 0 - for line in iterable: - # Count blank lines - if blank_line(line): - run_of_blanklines += 1 - else: - run_of_blanklines = 0 - - if start_pattern(line) or run_of_blanklines > 2 or 'Mean G+C' in line: - if accumulator: - yield accumulator - accumulator = [line] - else: - accumulator.append(line) - if accumulator: - yield accumulator +import sys -IMPORTANT_INFO = { - 'trna': re.compile(r'tRNA-(?P<codon>[A-Za-z]{3})\((?P<anticodon>[A-Za-z]{3})\)'), - 'trna-alt': re.compile(r'tRNA-\?\((?P<codon>[^\)]+)\)\((?P<anticodon>[A-Za-z]{2,})\)'), - 'bases': re.compile(r'(?P<bases>[0-9]+) bases, %GC = (?P<gc>[0-9.]+)'), - 'sequence': re.compile(r'Sequence (?P<complement>[c]{0,1})\[(?P<start>\d+),(?P<end>\d+)\]'), - 'possible_pseudogene': re.compile(r'(?P<pseudo>Possible Pseudogene)'), -} -INFO_GROUPS = ('codon', 'anticodon', 'bases', 'gc', 'complement', 'start', 'end', 'pseudo') +full_gene_model = False +if '--full' in sys.argv: + full_gene_model = True -def important_info(block): - info = {} - for line in block: - for matcher in IMPORTANT_INFO: - matches = IMPORTANT_INFO[matcher].search(line) - if matches: - for group in INFO_GROUPS: - try: - info[group] = matches.group(group) - except: - pass - return info - -IMPORTANT_INFO_TMRNA = { - 'tag_peptide': re.compile(r'Tag peptide:\s+(?P<pep>[A-Z*]*)'), - 'location': re.compile(r'Location (?P<complement>[c]{0,1})\[(?P<start>\d+),(?P<end>\d+)\]'), -} -INFO_GROUPS_TMRNA = ('start', 'end', 'pep') - -def important_info_tmrna(block): - info = {} - for line in block: - for matcher in IMPORTANT_INFO_TMRNA: - matches = IMPORTANT_INFO_TMRNA[matcher].search(line) - if matches: - for group in INFO_GROUPS_TMRNA: - try: - info[group] = matches.group(group) - except: - pass - return info - -import fileinput +genome_id = None stdin_data = [] -for line in fileinput.input(): - stdin_data.append(line) - -possible_blocks = [line for line in blocks(stdin_data)] - -seqid = None -print '##gff-version-3' -# We're off to a GREAT start, if I'm accessing by index you just know that I'm going to do terrible -# awful things -for block_idx in range(len(possible_blocks)): - block = possible_blocks[block_idx] - data = None - fasta_defline = None - - if block[0].startswith('Searching for') or 'nucleotides in sequence' in block[-1]: - # Try and get a sequence ID out of it - try: - fasta_defline = block[-2].strip() - except: - # Failing that, ignore it. - pass - else: - # They DUPLICATE results in multiple places, including a fasta version - # in the 'full report'. - possible_ugliness = [x for x in block if x.startswith('>t')] - if len(possible_ugliness) > 0: - continue - - # However, if it didn't have one of those all important pieces of - # information, then it's either a different important piece of - # information, or complete junk - data = important_info(block) +KEY_ORDER = ('parent', 'source', 'type', 'start', 'end', 'score', 'strand', + '8', 'quals') - # I am not proud of any of this. We essentially say "if that block - # didn't come up with useful info, then try making it a tmrna" - if len(data.keys()) == 0: - data = important_info_tmrna(block) - # And if that fails, just none it. - if len(data.keys()) == 0: - data = None - else: - # But if it didn't, confirm that we're a tmRNA - data['type'] = 'tmRNA' - else: - # If we did have keys, and didn't pass through any of the tmRNA - # checks, we're tRNA - data['type'] = 'tRNA' - - # If we got a sequence ID in this block, set the defline - if 'nucleotides in sequence' in block[-1]: - try: - fasta_defline = block[-2].strip() - except: - pass +# Table of amino acids +aa_table = { + 'Ala' : 'A', + 'Arg' : 'R', + 'Asn' : 'N', + 'Asp' : 'D', + 'Cys' : 'C', + 'Gln' : 'Q', + 'Glu' : 'E', + 'Gly' : 'G', + 'His' : 'H', + 'Ile' : 'I', + 'Leu' : 'L', + 'Lys' : 'K', + 'Met' : 'M', + 'Phe' : 'F', + 'Pro' : 'P', + 'Ser' : 'S', + 'Thr' : 'T', + 'Trp' : 'W', + 'Tyr' : 'Y', + 'Val' : 'V', + 'Pyl' : 'O', + 'seC' : 'U', + '???' : 'X' } - # if a defline is available, try and extract the fasta header ID - if fasta_defline is not None: - try: - seqid = fasta_defline[0:fasta_defline.index(' ')] - except: - seqid = fasta_defline - - # If there's data - if data is not None and len(data.keys()) > 1: +def output_line(gff3): + print '\t'.join(str(gff3[x]) for x in KEY_ORDER) - # Deal with our flags/notes. - if data['type'] == 'tRNA': - # Are these acceptable GFF3 tags? - notes = { - 'Codon': data['codon'], - 'Anticodon': data['anticodon'], - } - if 'pseudo' in data: - notes['Note'] = 'Possible pseudogene' - else: - notes = { - 'Note': 'Tag peptide: ' + data['pep'] + '' +print '##gff-version 3' +for line in sys.stdin: + if line.startswith('>'): + genome_id = line[1:].strip() + if ' ' in genome_id: + genome_id = genome_id[0:genome_id.index(' ')] + else: + data = line.split() + if len(data) == 5: + # Parse data + strand = '-' if data[2].startswith('c') else '+' + start, end = data[2][data[2].index('[') + 1:-1].split(',') + + gff3 = { + 'parent': genome_id, + 'source': 'aragorn', + 'start': int(start), + 'end': int(end), + 'strand': strand, + 'score': '.', + '8': '.', } - notestr = ';'.join(['%s="%s"' % (k,v) for k,v in notes.iteritems()]) + aa_long = data[1][5:] + aa_short = aa_table[aa_long] + anticodon = data[4][1:data[4].index(")")].upper().replace("T", "U") + name = 'trn{}-{}'.format(aa_short, anticodon) + + if not full_gene_model: + gff3.update({ + 'type': 'tRNA', + 'quals': 'ID=tRNA{0}.{1};Name={name};product={2}'.format(genome_id, *data, name = name), + }) + output_line(gff3) + else: + gff3.update({ + 'type': 'gene', + 'quals': 'ID=gene{0}.{1};Name={name};product={2}'.format(genome_id, *data, name = name), + }) + output_line(gff3) + gff3.update({ + 'type': 'tRNA', + 'quals': 'ID=tRNA{0}.{1};Parent=gene{0}.{1};Name={name};product={2}'.format(genome_id, *data, name = name), + }) + output_line(gff3) + + # If no introns + if ')i(' not in data[4]: + gff3['type'] = 'exon' + gff3['quals'] = 'Parent=tRNA{0}.{1}'.format(genome_id, *data) + output_line(gff3) + else: + intron_location = data[4][data[4].rindex('(') + 1:-1].split(',') + intron_start, intron_length = map(int, intron_location) + if strand == '+': + original_end = gff3['end'] + else: + original_end = gff3['start'] - print '\t'.join([ - seqid, - 'aragorn', - data['type'], - data['start'], - data['end'], - '.', - '.', - '.', - notestr - ]) + # EXON + gff3.update({ + 'type': 'exon', + 'quals': 'Parent=tRNA{0}.{1}'.format(genome_id, *data), + }) + if strand == '+': + gff3['end'] = gff3['start'] + intron_start - 2 + else: + gff3['start'] = gff3['end'] - intron_start + 2 + + output_line(gff3) + + # INTRON + gff3.update({ + 'type': 'intron', + 'quals': 'Parent=tRNA{0}.{1}'.format(genome_id, *data), + }) + if strand == '+': + gff3['start'] = gff3['end'] + 1 + gff3['end'] = gff3['start'] + intron_length + 2 + else: + gff3['end'] = gff3['start'] - 1 + gff3['start'] = gff3['end'] - intron_length + 1 + + output_line(gff3) + + # EXON + gff3.update({ + 'type': 'exon', + 'quals': 'Parent=tRNA{0}.{1}'.format(genome_id, *data), + }) + if strand == '+': + gff3['start'] = gff3['end'] + 1 + gff3['end'] = original_end + else: + gff3['end'] = gff3['start'] - 1 + gff3['start'] = original_end + + output_line(gff3)