Mercurial > repos > bgruening > augustus
comparison extract_features.py @ 0:af307d3285c5 draft
Uploaded
author | bgruening |
---|---|
date | Sat, 06 Jul 2013 10:07:41 -0400 |
parents | |
children | 4de31938431b |
comparison
equal
deleted
inserted
replaced
-1:000000000000 | 0:af307d3285c5 |
---|---|
1 #!/usr/bin/env python | |
2 | |
3 import os | |
4 import sys | |
5 import argparse | |
6 import textwrap | |
7 | |
8 def main( args ): | |
9 """ | |
10 Extract the protein and coding section from an augustus gff, gtf file | |
11 Example file: | |
12 HS04636 AUGUSTUS stop_codon 6901 6903 . + 0 Parent=g1.t1 | |
13 HS04636 AUGUSTUS transcription_end_site 8857 8857 . + . Parent=g1.t1 | |
14 # protein sequence = [MLARALLLCAVLALSHTANPCCSHPCQNRGVCMSVGFDQYKCDCTRTGFYGENCSTPEFLTRIKLFLKPTPNTVHYIL | |
15 # THFKGFWNVVNNIPFLRNAIMSYVLTSRSHLIDSPPTYNADYGYKSWEAFSNLSYYTRALPPVPDDCPTPLGVKGKKQLPDSNEIVEKLLLRRKFIPD | |
16 # PQGSNMMFAFFAQHFTHQFFKTDHKRGPAFTNGLGHGVDLNHIYGETLARQRKLRLFKDGKMKYQIIDGEMYPPTVKDTQAEMIYPPQVPEHLRFAVG | |
17 # QEVFGLVPGLMMYATIWLREHNRVCDVLKQEHPEWGDEQLFQTSRLILIGETIKIVIEDYVQHLSGYHFKLKFDPELLFNKQFQYQNRIAAEFNTLYH | |
18 # WHPLLPDTFQIHDQKYNYQQFIYNNSILLEHGITQFVESFTRQIAGRVAGGRNVPPAVQKVSQASIDQSRQMKYQSFNEYRKRFMLKPYESFEELTGE | |
19 # KEMSAELEALYGDIDAVELYPALLVEKPRPDAIFGETMVEVGAPFSLKGLMGNVICSPAYWKPSTFGGEVGFQIINTASIQSLICNNVKGCPFTSFSV | |
20 # PDPELIKTVTINASSSRSGLDDINPTVLLKERSTEL] | |
21 # end gene g1 | |
22 ### | |
23 # | |
24 # ----- prediction on sequence number 2 (length = 2344, name = HS08198) ----- | |
25 # | |
26 # Predicted genes for sequence number 2 on both strands | |
27 # start gene g2 | |
28 HS08198 AUGUSTUS gene 86 2344 1 + . ID=g2 | |
29 HS08198 AUGUSTUS transcript 86 2344 . + . ID=g2.t1;Parent=g2 | |
30 HS08198 AUGUSTUS transcription_start_site 86 86 . + . Parent=g2.t1 | |
31 HS08198 AUGUSTUS exon 86 582 . + . Parent=g2.t1 | |
32 HS08198 AUGUSTUS start_codon 445 447 . + 0 Parent=g2.t1 | |
33 """ | |
34 protein_seq = '' | |
35 coding_seq = '' | |
36 if args.protein: | |
37 po = open( args.protein, 'w+' ) | |
38 if args.codingseq: | |
39 co = open( args.codingseq, 'w+' ) | |
40 | |
41 for line in sys.stdin: | |
42 # protein- and coding-sequence are stored as comments | |
43 if line.startswith('#'): | |
44 line = line[2:].strip() | |
45 if line.startswith('start gene'): | |
46 gene_name = line[11:].strip() | |
47 | |
48 if args.protein and line.startswith('protein sequence = ['): | |
49 if line.endswith(']'): | |
50 protein_seq = line[20:-1] | |
51 po.write( '>%s\n%s\n' % (gene_name, '\n'.join( textwrap.wrap( protein_seq, 80 ) ) ) ) | |
52 protein_seq = '' | |
53 else: | |
54 line = line[20:] | |
55 protein_seq = line | |
56 | |
57 if args.codingseq and line.startswith('coding sequence = ['): | |
58 if line.endswith(']'): | |
59 coding_seq = line[19:-1] | |
60 co.write( '>%s\n%s\n' % (gene_name, '\n'.join( textwrap.wrap( coding_seq, 80 ) ) ) ) | |
61 coding_seq = '' | |
62 else: | |
63 line = line[19:] | |
64 coding_seq = line | |
65 | |
66 if protein_seq: | |
67 if line.endswith(']'): | |
68 protein_seq += line[:-1] | |
69 po.write( '>%s\n%s\n' % (gene_name, '\n'.join( textwrap.wrap( protein_seq, 80 ) ) ) ) | |
70 protein_seq = '' | |
71 else: | |
72 protein_seq += line | |
73 | |
74 if coding_seq: | |
75 if line.endswith(']'): | |
76 coding_seq += line[:-1] | |
77 co.write( '>%s\n%s\n' % (gene_name, '\n'.join( textwrap.wrap( coding_seq, 80 ) ) ) ) | |
78 coding_seq = '' | |
79 else: | |
80 coding_seq += line | |
81 if args.codingseq: | |
82 co.close() | |
83 if args.protein: | |
84 po.close() | |
85 | |
86 if __name__ == '__main__': | |
87 parser = argparse.ArgumentParser() | |
88 parser.add_argument('-p', '--protein', help='Path to the protein file.') | |
89 parser.add_argument('-c', '--codingseq', help='Path to the coding file.') | |
90 | |
91 args = parser.parse_args() | |
92 main( args ) | |
93 |