comparison extract_features.py @ 0:86c89c3bd99d draft

planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/augustus commit 2896dcfd180800d00ea413a59264ef8b11788b8e
author bgruening
date Fri, 20 Oct 2017 03:55:35 -0400
parents
children 7be22100e5e1
comparison
equal deleted inserted replaced
-1:000000000000 0:86c89c3bd99d
1 #!/usr/bin/env python
2
3 import argparse
4 import sys
5 import textwrap
6
7
def main( args ):
    """
    Extract the protein and coding sequences from an augustus gff/gtf file.

    The augustus output is read from standard input. Sequences are stored
    there as comment lines (``# protein sequence = [...]`` and
    ``# coding sequence = [...]``), possibly spanning several comment lines,
    and are written as FASTA records named after the most recent
    ``# start gene <name>`` line, wrapped at 80 columns.

    Args:
        args: object with the attributes ``protein`` and ``codingseq``; each
            is either a path to the FASTA file to write or a false value to
            skip that kind of sequence.

    Raises:
        ValueError: if a sequence is completed before any ``# start gene``
            line was seen.

    Example file:
    HS04636 AUGUSTUS stop_codon 6901 6903 . + 0 Parent=g1.t1
    HS04636 AUGUSTUS transcription_end_site 8857 8857 . + . Parent=g1.t1
    # protein sequence = [MLARALLLCAVLALSHTANPCCSHPCQNRGVCMSVGFDQYKCDCTRTGFYGENCSTPEFLTRIKLFLKPTPNTVHYIL
    # THFKGFWNVVNNIPFLRNAIMSYVLTSRSHLIDSPPTYNADYGYKSWEAFSNLSYYTRALPPVPDDCPTPLGVKGKKQLPDSNEIVEKLLLRRKFIPD
    # PQGSNMMFAFFAQHFTHQFFKTDHKRGPAFTNGLGHGVDLNHIYGETLARQRKLRLFKDGKMKYQIIDGEMYPPTVKDTQAEMIYPPQVPEHLRFAVG
    # QEVFGLVPGLMMYATIWLREHNRVCDVLKQEHPEWGDEQLFQTSRLILIGETIKIVIEDYVQHLSGYHFKLKFDPELLFNKQFQYQNRIAAEFNTLYH
    # WHPLLPDTFQIHDQKYNYQQFIYNNSILLEHGITQFVESFTRQIAGRVAGGRNVPPAVQKVSQASIDQSRQMKYQSFNEYRKRFMLKPYESFEELTGE
    # KEMSAELEALYGDIDAVELYPALLVEKPRPDAIFGETMVEVGAPFSLKGLMGNVICSPAYWKPSTFGGEVGFQIINTASIQSLICNNVKGCPFTSFSV
    # PDPELIKTVTINASSSRSGLDDINPTVLLKERSTEL]
    # end gene g1
    ###
    #
    # ----- prediction on sequence number 2 (length = 2344, name = HS08198) -----
    #
    # Predicted genes for sequence number 2 on both strands
    # start gene g2
    HS08198 AUGUSTUS gene 86 2344 1 + . ID=g2
    HS08198 AUGUSTUS transcript 86 2344 . + . ID=g2.t1;Parent=g2
    HS08198 AUGUSTUS transcription_start_site 86 86 . + . Parent=g2.t1
    HS08198 AUGUSTUS exon 86 582 . + . Parent=g2.t1
    HS08198 AUGUSTUS start_codon 445 447 . + 0 Parent=g2.t1
    """
    po = co = None
    try:
        # One collector per requested sequence kind. 'parts' is None while
        # idle and a list of chunks while a (multi-line) sequence is being
        # read, so an empty first chunk still counts as "in progress".
        collectors = []
        if args.protein:
            po = open( args.protein, 'w' )
            collectors.append( {'prefix': 'protein sequence = [', 'out': po, 'parts': None} )
        if args.codingseq:
            co = open( args.codingseq, 'w' )
            collectors.append( {'prefix': 'coding sequence = [', 'out': co, 'parts': None} )

        gene_name = None
        for line in sys.stdin:
            # protein- and coding-sequence are stored as comments
            if not line.startswith('#'):
                continue
            # drop the leading '# ' marker and surrounding whitespace
            line = line[2:].strip()
            if line.startswith('start gene'):
                gene_name = line[11:].strip()

            for collector in collectors:
                if collector['parts'] is not None:
                    # continuation line of a sequence that is already open
                    chunk = line
                elif line.startswith( collector['prefix'] ):
                    # opening line: the sequence starts right after the '['
                    collector['parts'] = []
                    chunk = line[len( collector['prefix'] ):]
                else:
                    continue

                if chunk.endswith(']'):
                    # closing bracket reached: emit the record and go idle
                    collector['parts'].append( chunk[:-1] )
                    if gene_name is None:
                        raise ValueError( 'Found a sequence before any "# start gene" line.' )
                    _write_fasta( collector['out'], gene_name, ''.join( collector['parts'] ) )
                    collector['parts'] = None
                else:
                    collector['parts'].append( chunk )
    finally:
        # always release the output files, even if parsing failed
        for handle in (co, po):
            if handle is not None:
                handle.close()


def _write_fasta( handle, gene_name, sequence ):
    """Write one FASTA record to handle, wrapping the sequence at 80 columns."""
    handle.write( '>%s\n%s\n' % (gene_name, '\n'.join( textwrap.wrap( sequence, 80 ) ) ) )
86
87
if __name__ == '__main__':
    # Command-line entry point: both output paths are optional; main() only
    # writes the files whose path was supplied.
    parser = argparse.ArgumentParser()
    for short_flag, long_flag, description in (
        ('-p', '--protein', 'Path to the protein file.'),
        ('-c', '--codingseq', 'Path to the coding file.'),
    ):
        parser.add_argument(short_flag, long_flag, help=description)

    args = parser.parse_args()
    main( args )