comparison extract_features.py @ 5:7be22100e5e1 draft

"planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/augustus commit bba7f5df059fcbeb06e89cf689e9a04d4f22cb76"
author iuc
date Thu, 15 Jul 2021 17:16:33 +0000
parents 86c89c3bd99d
children
comparison
equal deleted inserted replaced
4:6519ebe25019 5:7be22100e5e1
3 import argparse 3 import argparse
4 import sys 4 import sys
5 import textwrap 5 import textwrap
6 6
7 7
8 def main( args ): 8 def main(args):
9 """ 9 """
10 Extract the protein and coding section from an augustus gff, gtf file 10 Extract the protein and coding section from an augustus gff, gtf file
11 Example file: 11 Example file:
12 HS04636 AUGUSTUS stop_codon 6901 6903 . + 0 Parent=g1.t1 12 HS04636 AUGUSTUS stop_codon 6901 6903 . + 0 Parent=g1.t1
13 HS04636 AUGUSTUS transcription_end_site 8857 8857 . + . Parent=g1.t1 13 HS04636 AUGUSTUS transcription_end_site 8857 8857 . + . Parent=g1.t1
14 # protein sequence = [MLARALLLCAVLALSHTANPCCSHPCQNRGVCMSVGFDQYKCDCTRTGFYGENCSTPEFLTRIKLFLKPTPNTVHYIL 14 # protein sequence = [MLARALLLCAVLALSHTANPCCSHPCQNRGVCMSVGFDQYKCDCTRTGFYGENCSTPEFLTRIKLFLKPTPNTVHYIL
15 # THFKGFWNVVNNIPFLRNAIMSYVLTSRSHLIDSPPTYNADYGYKSWEAFSNLSYYTRALPPVPDDCPTPLGVKGKKQLPDSNEIVEKLLLRRKFIPD 15 # THFKGFWNVVNNIPFLRNAIMSYVLTSRSHLIDSPPTYNADYGYKSWEAFSNLSYYTRALPPVPDDCPTPLGVKGKKQLPDSNEIVEKLLLRRKFIPD
16 # PQGSNMMFAFFAQHFTHQFFKTDHKRGPAFTNGLGHGVDLNHIYGETLARQRKLRLFKDGKMKYQIIDGEMYPPTVKDTQAEMIYPPQVPEHLRFAVG 16 # PQGSNMMFAFFAQHFTHQFFKTDHKRGPAFTNGLGHGVDLNHIYGETLARQRKLRLFKDGKMKYQIIDGEMYPPTVKDTQAEMIYPPQVPEHLRFAVG
17 # QEVFGLVPGLMMYATIWLREHNRVCDVLKQEHPEWGDEQLFQTSRLILIGETIKIVIEDYVQHLSGYHFKLKFDPELLFNKQFQYQNRIAAEFNTLYH 17 # QEVFGLVPGLMMYATIWLREHNRVCDVLKQEHPEWGDEQLFQTSRLILIGETIKIVIEDYVQHLSGYHFKLKFDPELLFNKQFQYQNRIAAEFNTLYH
18 # WHPLLPDTFQIHDQKYNYQQFIYNNSILLEHGITQFVESFTRQIAGRVAGGRNVPPAVQKVSQASIDQSRQMKYQSFNEYRKRFMLKPYESFEELTGE 18 # WHPLLPDTFQIHDQKYNYQQFIYNNSILLEHGITQFVESFTRQIAGRVAGGRNVPPAVQKVSQASIDQSRQMKYQSFNEYRKRFMLKPYESFEELTGE
19 # KEMSAELEALYGDIDAVELYPALLVEKPRPDAIFGETMVEVGAPFSLKGLMGNVICSPAYWKPSTFGGEVGFQIINTASIQSLICNNVKGCPFTSFSV 19 # KEMSAELEALYGDIDAVELYPALLVEKPRPDAIFGETMVEVGAPFSLKGLMGNVICSPAYWKPSTFGGEVGFQIINTASIQSLICNNVKGCPFTSFSV
20 # PDPELIKTVTINASSSRSGLDDINPTVLLKERSTEL] 20 # PDPELIKTVTINASSSRSGLDDINPTVLLKERSTEL]
21 # end gene g1 21 # end gene g1
22 ### 22 ###
23 # 23 #
24 # ----- prediction on sequence number 2 (length = 2344, name = HS08198) ----- 24 # ----- prediction on sequence number 2 (length = 2344, name = HS08198) -----
25 # 25 #
26 # Predicted genes for sequence number 2 on both strands 26 # Predicted genes for sequence number 2 on both strands
27 # start gene g2 27 # start gene g2
28 HS08198 AUGUSTUS gene 86 2344 1 + . ID=g2 28 HS08198 AUGUSTUS gene 86 2344 1 + . ID=g2
29 HS08198 AUGUSTUS transcript 86 2344 . + . ID=g2.t1;Parent=g2 29 HS08198 AUGUSTUS transcript 86 2344 . + . ID=g2.t1;Parent=g2
30 HS08198 AUGUSTUS transcription_start_site 86 86 . + . Parent=g2.t1 30 HS08198 AUGUSTUS transcription_start_site 86 86 . + . Parent=g2.t1
31 HS08198 AUGUSTUS exon 86 582 . + . Parent=g2.t1 31 HS08198 AUGUSTUS exon 86 582 . + . Parent=g2.t1
32 HS08198 AUGUSTUS start_codon 445 447 . + 0 Parent=g2.t1 32 HS08198 AUGUSTUS start_codon 445 447 . + 0 Parent=g2.t1
33 """ 33 """
34 protein_seq = '' 34 protein_seq = ""
35 coding_seq = '' 35 coding_seq = ""
36 if args.protein: 36 if args.protein:
37 po = open( args.protein, 'w+' ) 37 po = open(args.protein, "w+")
38 if args.codingseq: 38 if args.codingseq:
39 co = open( args.codingseq, 'w+' ) 39 co = open(args.codingseq, "w+")
40 40
41 for line in sys.stdin: 41 for line in sys.stdin:
42 # protein- and coding-sequence are stored as comments 42 # protein- and coding-sequence are stored as comments
43 if line.startswith('#'): 43 if line.startswith("#"):
44 line = line[2:].strip() 44 line = line[2:].strip()
45 if line.startswith('start gene'): 45 if line.startswith("start gene"):
46 gene_name = line[11:].strip() 46 gene_name = line[11:].strip()
47 47
48 if protein_seq: 48 if protein_seq:
49 if line.endswith(']'): 49 if line.endswith("]"):
50 protein_seq += line[:-1] 50 protein_seq += line[:-1]
51 po.write( '>%s\n%s\n' % (gene_name, '\n'.join( textwrap.wrap( protein_seq, 80 ) ) ) ) 51 po.write(
52 protein_seq = '' 52 ">%s\n%s\n"
53 % (gene_name, "\n".join(textwrap.wrap(protein_seq, 80)))
54 )
55 protein_seq = ""
53 else: 56 else:
54 protein_seq += line 57 protein_seq += line
55 58
56 if coding_seq: 59 if coding_seq:
57 if line.endswith(']'): 60 if line.endswith("]"):
58 coding_seq += line[:-1] 61 coding_seq += line[:-1]
59 co.write( '>%s\n%s\n' % (gene_name, '\n'.join( textwrap.wrap( coding_seq, 80 ) ) ) ) 62 co.write(
60 coding_seq = '' 63 ">%s\n%s\n"
64 % (gene_name, "\n".join(textwrap.wrap(coding_seq, 80)))
65 )
66 coding_seq = ""
61 else: 67 else:
62 coding_seq += line 68 coding_seq += line
63 69
64 if args.protein and line.startswith('protein sequence = ['): 70 if args.protein and line.startswith("protein sequence = ["):
65 if line.endswith(']'): 71 if line.endswith("]"):
66 protein_seq = line[20:-1] 72 protein_seq = line[20:-1]
67 po.write( '>%s\n%s\n' % (gene_name, '\n'.join( textwrap.wrap( protein_seq, 80 ) ) ) ) 73 po.write(
68 protein_seq = '' 74 ">%s\n%s\n"
75 % (gene_name, "\n".join(textwrap.wrap(protein_seq, 80)))
76 )
77 protein_seq = ""
69 else: 78 else:
70 line = line[20:] 79 line = line[20:]
71 protein_seq = line 80 protein_seq = line
72 81
73 if args.codingseq and line.startswith('coding sequence = ['): 82 if args.codingseq and line.startswith("coding sequence = ["):
74 if line.endswith(']'): 83 if line.endswith("]"):
75 coding_seq = line[19:-1] 84 coding_seq = line[19:-1]
76 co.write( '>%s\n%s\n' % (gene_name, '\n'.join( textwrap.wrap( coding_seq, 80 ) ) ) ) 85 co.write(
77 coding_seq = '' 86 ">%s\n%s\n"
87 % (gene_name, "\n".join(textwrap.wrap(coding_seq, 80)))
88 )
89 coding_seq = ""
78 else: 90 else:
79 line = line[19:] 91 line = line[19:]
80 coding_seq = line 92 coding_seq = line
81 93
82 if args.codingseq: 94 if args.codingseq:
83 co.close() 95 co.close()
84 if args.protein: 96 if args.protein:
85 po.close() 97 po.close()
86 98
87 99
88 if __name__ == '__main__': 100 if __name__ == "__main__":
89 parser = argparse.ArgumentParser() 101 parser = argparse.ArgumentParser()
90 parser.add_argument('-p', '--protein', help='Path to the protein file.') 102 parser.add_argument("-p", "--protein", help="Path to the protein file.")
91 parser.add_argument('-c', '--codingseq', help='Path to the coding file.') 103 parser.add_argument("-c", "--codingseq", help="Path to the coding file.")
92 104
93 args = parser.parse_args() 105 args = parser.parse_args()
94 main( args ) 106 main(args)