# HG changeset patch # User bgruening # Date 1421950551 18000 # Node ID d788d1abe23817a8c9135129079d6c74a7a6bbf6 # Parent d34f31cbc9ddba7c9bdbe9253ec644a4c62bd9e5 Uploaded diff -r d34f31cbc9dd -r d788d1abe238 aragorn.xml --- a/aragorn.xml Sat Jul 06 10:37:13 2013 -0400 +++ b/aragorn.xml Thu Jan 22 13:15:51 2015 -0500 @@ -1,11 +1,13 @@ - - prediction (Aragon) + + prediction (Aragorn) aragorn + TRNAPRED_SCRIPT_PATH - aragorn - $input + $gff3_output_file; +#end if +]]> @@ -48,6 +64,7 @@ + @@ -55,6 +72,9 @@ + + gff3_output + @@ -65,12 +85,15 @@ - + + + + diff -r d34f31cbc9dd -r d788d1abe238 aragorn_out_to_gff3.py --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/aragorn_out_to_gff3.py Thu Jan 22 13:15:51 2015 -0500 @@ -0,0 +1,165 @@ +#!/usr/bin/env python +import re + +def start_pattern(string): + return re.match(r'^[0-9]+\.$', string) \ + or string.startswith('Number of possible') \ + or string.startswith('Searching for') + +def blank_line(string): + return re.match(r'^\s*$', string) + +def blocks(iterable): + accumulator = [] + run_of_blanklines = 0 + for line in iterable: + # Count blank lines + if blank_line(line): + run_of_blanklines += 1 + else: + run_of_blanklines = 0 + + if start_pattern(line) or run_of_blanklines > 2 or 'Mean G+C' in line: + if accumulator: + yield accumulator + accumulator = [line] + else: + accumulator.append(line) + if accumulator: + yield accumulator + +IMPORTANT_INFO = { + 'trna': re.compile(r'tRNA-(?P[A-Za-z]{3})\((?P[A-Za-z]{3})\)'), + 'trna-alt': re.compile(r'tRNA-\?\((?P[^\)]+)\)\((?P[A-Za-z]{2,})\)'), + 'bases': re.compile(r'(?P[0-9]+) bases, %GC = (?P[0-9.]+)'), + 'sequence': re.compile(r'Sequence (?P[c]{0,1})\[(?P\d+),(?P\d+)\]'), + 'possible_pseudogene': re.compile(r'(?PPossible Pseudogene)'), +} +INFO_GROUPS = ('codon', 'anticodon', 'bases', 'gc', 'complement', 'start', 'end', 'pseudo') + +def important_info(block): + info = {} + for line in block: + for matcher in IMPORTANT_INFO: + matches = IMPORTANT_INFO[matcher].search(line) + if matches: + for group in INFO_GROUPS: + try: + info[group] = matches.group(group) + except: + pass + return info + +IMPORTANT_INFO_TMRNA = { + 'tag_peptide': re.compile(r'Tag peptide:\s+(?P[A-Z*]*)'), + 'location': re.compile(r'Location (?P[c]{0,1})\[(?P\d+),(?P\d+)\]'), +} +INFO_GROUPS_TMRNA = ('start', 'end', 'pep') + +def important_info_tmrna(block): + info = {} + for line in block: + for matcher in IMPORTANT_INFO_TMRNA: + matches = IMPORTANT_INFO_TMRNA[matcher].search(line) + if matches: + for group in INFO_GROUPS_TMRNA: + try: + info[group] = matches.group(group) + except: + pass + return info + +import fileinput +stdin_data = [] +for line in fileinput.input(): + stdin_data.append(line) + +possible_blocks = [line for line in blocks(stdin_data)] + +seqid = None +print '##gff-version-3' +# We're off to a GREAT start, if I'm accessing by index you just know that I'm going to do terrible +# awful things +for block_idx in range(len(possible_blocks)): + block = possible_blocks[block_idx] + data = None + fasta_defline = None + + if block[0].startswith('Searching for') or 'nucleotides in sequence' in block[-1]: + # Try and get a sequence ID out of it + try: + fasta_defline = block[-2].strip() + except: + # Failing that, ignore it. + pass + else: + # They DUPLICATE results in multiple places, including a fasta version + # in the 'full report'. + possible_ugliness = [x for x in block if x.startswith('>t')] + if len(possible_ugliness) > 0: + continue + + # However, if it didn't have one of those all important pieces of + # information, then it's either a different important piece of + # information, or complete junk + data = important_info(block) + + # I am not proud of any of this. We essentially say "if that block + # didn't come up with useful info, then try making it a tmrna" + if len(data.keys()) == 0: + data = important_info_tmrna(block) + # And if that fails, just none it. + if len(data.keys()) == 0: + data = None + else: + # But if it didn't, confirm that we're a tmRNA + data['type'] = 'tmRNA' + else: + # If we did have keys, and didn't pass through any of the tmRNA + # checks, we're tRNA + data['type'] = 'tRNA' + + # If we got a sequence ID in this block, set the defline + if 'nucleotides in sequence' in block[-1]: + try: + fasta_defline = block[-2].strip() + except: + pass + + # if a defline is available, try and extract the fasta header ID + if fasta_defline is not None: + try: + seqid = fasta_defline[0:fasta_defline.index(' ')] + except: + seqid = fasta_defline + + # If there's data + if data is not None and len(data.keys()) > 1: + + # Deal with our flags/notes. + if data['type'] == 'tRNA': + # Are these acceptable GFF3 tags? + notes = { + 'Codon': data['codon'], + 'Anticodon': data['anticodon'], + } + if 'pseudo' in data: + notes['Note'] = 'Possible pseudogene' + else: + notes = { + 'Note': 'Tag peptide: ' + data['pep'] + '' + } + + notestr = ';'.join(['%s="%s"' % (k,v) for k,v in notes.iteritems()]) + + print '\t'.join([ + seqid, + 'aragorn', + data['type'], + data['start'], + data['end'], + '.', + '.', + '.', + notestr + ]) diff -r d34f31cbc9dd -r d788d1abe238 tRNAscan.xml --- a/tRNAscan.xml Sat Jul 06 10:37:13 2013 -0400 +++ b/tRNAscan.xml Thu Jan 22 13:15:51 2015 -0500 @@ -5,6 +5,7 @@ biopython + @@ -54,6 +56,7 @@ + diff -r d34f31cbc9dd -r d788d1abe238 test-data/aragorn_tansl-table-1_tmRNA_tRNA.gff3 --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/test-data/aragorn_tansl-table-1_tmRNA_tRNA.gff3 Thu Jan 22 13:15:51 2015 -0500 @@ -0,0 +1,2 @@ +##gff-version-3 +gi|240255695:23036500-23037000 aragorn tRNA 381 453 . . . Anticodon=tgc;Codon=Ala diff -r d34f31cbc9dd -r d788d1abe238 tool_dependencies.xml --- a/tool_dependencies.xml Sat Jul 06 10:37:13 2013 -0400 +++ b/tool_dependencies.xml Thu Jan 22 13:15:51 2015 -0500 @@ -1,47 +1,15 @@ - + - - - http://mbio-serv2.mbioekol.lu.se/ARAGORN/Downloads/aragorn1.2.36.tgz - $INSTALL_DIR/bin/ - gcc -O3 -ffast-math -finline-functions -o aragorn aragorn1.2.36.c - - aragorn - $INSTALL_DIR/bin - - - $INSTALL_DIR/bin - - - - Compiling ARAGORN requires gcc. + - - - http://lowelab.ucsc.edu/software/tRNAscan-SE.tar.gz - $INSTALL_DIR/bin/ - $INSTALL_DIR/lib/tRNAscan-SE/ - $INSTALL_DIR/man/ - - cd ./tRNAscan-SE-1.3.1 && sed 's%^BINDIR = .*%BINDIR = $INSTALL_DIR/bin/%' Makefile | sed 's%^LIBDIR = .*%LIBDIR = $INSTALL_DIR/lib/tRNAscan-SE/%' | sed 's%^MANDIR = .*%MANDIR = $INSTALL_DIR/man%' > Makefile_new - cd ./tRNAscan-SE-1.3.1 && rm Makefile && mv Makefile_new Makefile - cd ./tRNAscan-SE-1.3.1 && make && make install - - - wget ftp://selab.janelia.org/pub/software/infernal/infernal-1.0.2.tar.gz - tar xfvz infernal-1.0.2.tar.gz - cd infernal-1.0.2 && ./configure --prefix=$INSTALL_DIR && make && make install - - $INSTALL_DIR/bin - $INSTALL_DIR/bin/ - - - - Compiling and running tRNAScan-SE requires gcc a PERL environment. + + + $REPOSITORY_INSTALL_DIR +