5
|
1 #!/usr/bin/env python
|
|
2
|
|
3 import os
|
|
4 import sys
|
|
5 import argparse
|
|
6 import textwrap
|
|
7
|
|
8 def main( args ):
|
|
9 """
|
|
10 Extract the protein and coding section from an augustus gff, gtf file
|
|
11 Example file:
|
|
12 HS04636 AUGUSTUS stop_codon 6901 6903 . + 0 Parent=g1.t1
|
|
13 HS04636 AUGUSTUS transcription_end_site 8857 8857 . + . Parent=g1.t1
|
|
14 # protein sequence = [MLARALLLCAVLALSHTANPCCSHPCQNRGVCMSVGFDQYKCDCTRTGFYGENCSTPEFLTRIKLFLKPTPNTVHYIL
|
|
15 # THFKGFWNVVNNIPFLRNAIMSYVLTSRSHLIDSPPTYNADYGYKSWEAFSNLSYYTRALPPVPDDCPTPLGVKGKKQLPDSNEIVEKLLLRRKFIPD
|
|
16 # PQGSNMMFAFFAQHFTHQFFKTDHKRGPAFTNGLGHGVDLNHIYGETLARQRKLRLFKDGKMKYQIIDGEMYPPTVKDTQAEMIYPPQVPEHLRFAVG
|
|
17 # QEVFGLVPGLMMYATIWLREHNRVCDVLKQEHPEWGDEQLFQTSRLILIGETIKIVIEDYVQHLSGYHFKLKFDPELLFNKQFQYQNRIAAEFNTLYH
|
|
18 # WHPLLPDTFQIHDQKYNYQQFIYNNSILLEHGITQFVESFTRQIAGRVAGGRNVPPAVQKVSQASIDQSRQMKYQSFNEYRKRFMLKPYESFEELTGE
|
|
19 # KEMSAELEALYGDIDAVELYPALLVEKPRPDAIFGETMVEVGAPFSLKGLMGNVICSPAYWKPSTFGGEVGFQIINTASIQSLICNNVKGCPFTSFSV
|
|
20 # PDPELIKTVTINASSSRSGLDDINPTVLLKERSTEL]
|
|
21 # end gene g1
|
|
22 ###
|
|
23 #
|
|
24 # ----- prediction on sequence number 2 (length = 2344, name = HS08198) -----
|
|
25 #
|
|
26 # Predicted genes for sequence number 2 on both strands
|
|
27 # start gene g2
|
|
28 HS08198 AUGUSTUS gene 86 2344 1 + . ID=g2
|
|
29 HS08198 AUGUSTUS transcript 86 2344 . + . ID=g2.t1;Parent=g2
|
|
30 HS08198 AUGUSTUS transcription_start_site 86 86 . + . Parent=g2.t1
|
|
31 HS08198 AUGUSTUS exon 86 582 . + . Parent=g2.t1
|
|
32 HS08198 AUGUSTUS start_codon 445 447 . + 0 Parent=g2.t1
|
|
33 """
|
|
34 protein_seq = ''
|
|
35 coding_seq = ''
|
|
36 if args.protein:
|
|
37 po = open( args.protein, 'w+' )
|
|
38 if args.codingseq:
|
|
39 co = open( args.codingseq, 'w+' )
|
|
40
|
|
41 for line in sys.stdin:
|
|
42 # protein- and coding-sequence are stored as comments
|
|
43 if line.startswith('#'):
|
|
44 line = line[2:].strip()
|
|
45 if line.startswith('start gene'):
|
|
46 gene_name = line[11:].strip()
|
|
47
|
|
48 if args.protein and line.startswith('protein sequence = ['):
|
|
49 if line.endswith(']'):
|
|
50 protein_seq = line[20:-1]
|
|
51 po.write( '>%s\n%s\n' % (gene_name, '\n'.join( textwrap.wrap( protein_seq, 80 ) ) ) )
|
|
52 protein_seq = ''
|
|
53 else:
|
|
54 line = line[20:]
|
|
55 protein_seq = line
|
|
56
|
|
57 if args.codingseq and line.startswith('coding sequence = ['):
|
|
58 if line.endswith(']'):
|
|
59 coding_seq = line[19:-1]
|
|
60 co.write( '>%s\n%s\n' % (gene_name, '\n'.join( textwrap.wrap( coding_seq, 80 ) ) ) )
|
|
61 coding_seq = ''
|
|
62 else:
|
|
63 line = line[19:]
|
|
64 coding_seq = line
|
|
65
|
|
66 if protein_seq:
|
|
67 if line.endswith(']'):
|
|
68 protein_seq += line[:-1]
|
|
69 po.write( '>%s\n%s\n' % (gene_name, '\n'.join( textwrap.wrap( protein_seq, 80 ) ) ) )
|
|
70 protein_seq = ''
|
|
71 else:
|
|
72 protein_seq += line
|
|
73
|
|
74 if coding_seq:
|
|
75 if line.endswith(']'):
|
|
76 coding_seq += line[:-1]
|
|
77 co.write( '>%s\n%s\n' % (gene_name, '\n'.join( textwrap.wrap( coding_seq, 80 ) ) ) )
|
|
78 coding_seq = ''
|
|
79 else:
|
|
80 coding_seq += line
|
|
81 if args.codingseq:
|
|
82 co.close()
|
|
83 if args.protein:
|
|
84 po.close()
|
|
85
|
|
86 if __name__ == '__main__':
|
|
87 parser = argparse.ArgumentParser()
|
|
88 parser.add_argument('-p', '--protein', help='Path to the protein file.')
|
|
89 parser.add_argument('-c', '--codingseq', help='Path to the coding file.')
|
|
90
|
|
91 args = parser.parse_args()
|
|
92 main( args )
|
|
93
|