trna_prediction: aragorn_out_to

comparison aragorn_out_to_gff3.py @ 2:358f58401cd6 draft default tip

planemo upload for repository https://github.com/bgruening/galaxytools/tree/master/tools/rna_tools/trna_prediction commit cfb19d75629f02e0dea4475c16c016ed5510eb44

author	bgruening
date	Wed, 26 Jul 2017 10:14:05 -0400
parents	d788d1abe238
children

comparison

equal deleted inserted replaced

-:d788d1abe238
+:358f58401cd6
 #!/usr/bin/env python
-import re
+import sys
-def start_pattern(string):
+full_gene_model = False
-return re.match(r'^[0-9]+\.$', string) \
+if '--full' in sys.argv:
-or string.startswith('Number of possible') \
+full_gene_model = True
-or string.startswith('Searching for')
-def blank_line(string):
+genome_id = None
-return re.match(r'^\s*$', string)
+stdin_data = []
+KEY_ORDER = ('parent', 'source', 'type', 'start', 'end', 'score', 'strand',
+'8', 'quals')
-def blocks(iterable):
+# Table of amino acids
-accumulator = []
+aa_table = {
-run_of_blanklines = 0
+'Ala' : 'A',
-for line in iterable:
+'Arg' : 'R',
-# Count blank lines
+'Asn' : 'N',
-if blank_line(line):
+'Asp' : 'D',
-run_of_blanklines += 1
+'Cys' : 'C',
-else:
+'Gln' : 'Q',
-run_of_blanklines = 0
+'Glu' : 'E',
+'Gly' : 'G',
+'His' : 'H',
+'Ile' : 'I',
+'Leu' : 'L',
+'Lys' : 'K',
+'Met' : 'M',
+'Phe' : 'F',
+'Pro' : 'P',
+'Ser' : 'S',
+'Thr' : 'T',
+'Trp' : 'W',
+'Tyr' : 'Y',
+'Val' : 'V',
+'Pyl' : 'O',
+'seC' : 'U',
+'???' : 'X' }
-if start_pattern(line) or run_of_blanklines > 2 or 'Mean G+C' in line:
+def output_line(gff3):
-if accumulator:
+print '\t'.join(str(gff3[x]) for x in KEY_ORDER)
-yield accumulator
-accumulator = [line]
-else:
-accumulator.append(line)
-if accumulator:
-yield accumulator
-IMPORTANT_INFO = {
+print '##gff-version 3'
-'trna': re.compile(r'tRNA-(?P<codon>[A-Za-z]{3})\((?P<anticodon>[A-Za-z]{3})\)'),
+for line in sys.stdin:
-'trna-alt': re.compile(r'tRNA-\?\((?P<codon>[^\)]+)\)\((?P<anticodon>[A-Za-z]{2,})\)'),
+if line.startswith('>'):
-'bases': re.compile(r'(?P<bases>[0-9]+) bases, %GC = (?P<gc>[0-9.]+)'),
+genome_id = line[1:].strip()
-'sequence': re.compile(r'Sequence (?P<complement>[c]{0,1})\[(?P<start>\d+),(?P<end>\d+)\]'),
+if ' ' in genome_id:
-'possible_pseudogene': re.compile(r'(?P<pseudo>Possible Pseudogene)'),
+genome_id = genome_id[0:genome_id.index(' ')]
-}
+else:
-INFO_GROUPS = ('codon', 'anticodon', 'bases', 'gc', 'complement', 'start', 'end', 'pseudo')
+data = line.split()
+if len(data) == 5:
+# Parse data
+strand = '-' if data[2].startswith('c') else '+'
+start, end = data[2][data[2].index('[') + 1:-1].split(',')
-def important_info(block):
+gff3 = {
-info = {}
+'parent': genome_id,
-for line in block:
+'source': 'aragorn',
-for matcher in IMPORTANT_INFO:
+'start': int(start),
-matches = IMPORTANT_INFO[matcher].search(line)
+'end': int(end),
-if matches:
+'strand': strand,
-for group in INFO_GROUPS:
+'score': '.',
-try:
+'8': '.',
-info[group] = matches.group(group)
-except:
-pass
-return info
-IMPORTANT_INFO_TMRNA = {
-'tag_peptide': re.compile(r'Tag peptide:\s+(?P<pep>[A-Z*]*)'),
-'location': re.compile(r'Location (?P<complement>[c]{0,1})\[(?P<start>\d+),(?P<end>\d+)\]'),
-}
-INFO_GROUPS_TMRNA = ('start', 'end', 'pep')
-def important_info_tmrna(block):
-info = {}
-for line in block:
-for matcher in IMPORTANT_INFO_TMRNA:
-matches = IMPORTANT_INFO_TMRNA[matcher].search(line)
-if matches:
-for group in INFO_GROUPS_TMRNA:
-try:
-info[group] = matches.group(group)
-except:
-pass
-return info
-import fileinput
-stdin_data = []
-for line in fileinput.input():
-stdin_data.append(line)
-possible_blocks = [line for line in blocks(stdin_data)]
-seqid = None
-print '##gff-version-3'
-# We're off to a GREAT start, if I'm accessing by index you just know that I'm going to do terrible
-# awful things
-for block_idx in range(len(possible_blocks)):
-block = possible_blocks[block_idx]
-data = None
-fasta_defline = None
-if block[0].startswith('Searching for') or 'nucleotides in sequence' in block[-1]:
-# Try and get a sequence ID out of it
-try:
-fasta_defline = block[-2].strip()
-except:
-# Failing that, ignore it.
-pass
-else:
-# They DUPLICATE results in multiple places, including a fasta version
-# in the 'full report'.
-possible_ugliness = [x for x in block if x.startswith('>t')]
-if len(possible_ugliness) > 0:
-continue
-# However, if it didn't have one of those all important pieces of
-# information, then it's either a different important piece of
-# information, or complete junk
-data = important_info(block)
-# I am not proud of any of this. We essentially say "if that block
-# didn't come up with useful info, then try making it a tmrna"
-if len(data.keys()) == 0:
-data = important_info_tmrna(block)
-# And if that fails, just none it.
-if len(data.keys()) == 0:
-data = None
-else:
-# But if it didn't, confirm that we're a tmRNA
-data['type'] = 'tmRNA'
-else:
-# If we did have keys, and didn't pass through any of the tmRNA
-# checks, we're tRNA
-data['type'] = 'tRNA'
-# If we got a sequence ID in this block, set the defline
-if 'nucleotides in sequence' in block[-1]:
-try:
-fasta_defline = block[-2].strip()
-except:
-pass
-# if a defline is available, try and extract the fasta header ID
-if fasta_defline is not None:
-try:
-seqid = fasta_defline[0:fasta_defline.index(' ')]
-except:
-seqid = fasta_defline
-# If there's data
-if data is not None and len(data.keys()) > 1:
-# Deal with our flags/notes.
-if data['type'] == 'tRNA':
-# Are these acceptable GFF3 tags?
-notes = {
-'Codon': data['codon'],
-'Anticodon': data['anticodon'],
-}
-if 'pseudo' in data:
-notes['Note'] = 'Possible pseudogene'
-else:
-notes = {
-'Note': 'Tag peptide: ' + data['pep'] + ''
 }
-notestr = ';'.join(['%s="%s"' % (k,v) for k,v in notes.iteritems()])
+aa_long = data[1][5:]
+aa_short = aa_table[aa_long]
+anticodon = data[4][1:data[4].index(")")].upper().replace("T", "U")
+name = 'trn{}-{}'.format(aa_short, anticodon)
-print '\t'.join([
+if not full_gene_model:
-seqid,
+gff3.update({
-'aragorn',
+'type': 'tRNA',
-data['type'],
+'quals': 'ID=tRNA{0}.{1};Name={name};product={2}'.format(genome_id, *data, name = name),
-data['start'],
+})
-data['end'],
+output_line(gff3)
-'.',
+else:
-'.',
+gff3.update({
-'.',
+'type': 'gene',
-notestr
+'quals': 'ID=gene{0}.{1};Name={name};product={2}'.format(genome_id, *data, name = name),
-])
+})
+output_line(gff3)
+gff3.update({
+'type': 'tRNA',
+'quals': 'ID=tRNA{0}.{1};Parent=gene{0}.{1};Name={name};product={2}'.format(genome_id, *data, name = name),
+})
+output_line(gff3)
+# If no introns
+if ')i(' not in data[4]:
+gff3['type'] = 'exon'
+gff3['quals'] = 'Parent=tRNA{0}.{1}'.format(genome_id, *data)
+output_line(gff3)
+else:
+intron_location = data[4][data[4].rindex('(') + 1:-1].split(',')
+intron_start, intron_length = map(int, intron_location)
+if strand == '+':
+original_end = gff3['end']
+else:
+original_end = gff3['start']
+# EXON
+gff3.update({
+'type': 'exon',
+'quals': 'Parent=tRNA{0}.{1}'.format(genome_id, *data),
+})
+if strand == '+':
+gff3['end'] = gff3['start'] + intron_start - 2
+else:
+gff3['start'] = gff3['end'] - intron_start + 2
+output_line(gff3)
+# INTRON
+gff3.update({
+'type': 'intron',
+'quals': 'Parent=tRNA{0}.{1}'.format(genome_id, *data),
+})
+if strand == '+':
+gff3['start'] = gff3['end'] + 1
+gff3['end'] = gff3['start'] + intron_length + 2
+else:
+gff3['end'] = gff3['start'] - 1
+gff3['start'] = gff3['end'] - intron_length + 1
+output_line(gff3)
+# EXON
+gff3.update({
+'type': 'exon',
+'quals': 'Parent=tRNA{0}.{1}'.format(genome_id, *data),
+})
+if strand == '+':
+gff3['start'] = gff3['end'] + 1
+gff3['end'] = original_end
+else:
+gff3['end'] = gff3['start'] - 1
+gff3['start'] = original_end
+output_line(gff3)

Mercurial > repos > bgruening > trna_prediction

comparison aragorn_out_to_gff3.py @ 2:358f58401cd6 draft default tip