annotate multi_antiSMASH_wrapper.py @ 0:6a37d0a4510a default tip

initial uploaded
author bjoern-gruening
date Thu, 15 Mar 2012 05:23:03 -0400
parents
children
Ignore whitespace changes - Everywhere: Within whitespace: At end of lines:
rev   line source
0
6a37d0a4510a initial uploaded
bjoern-gruening
parents:
diff changeset
1 #!/usr/bin/env python
6a37d0a4510a initial uploaded
bjoern-gruening
parents:
diff changeset
2 # -*- coding: UTF-8 -*-
6a37d0a4510a initial uploaded
bjoern-gruening
parents:
diff changeset
3
6a37d0a4510a initial uploaded
bjoern-gruening
parents:
diff changeset
4 import os, sys, subprocess, commands
6a37d0a4510a initial uploaded
bjoern-gruening
parents:
diff changeset
5 import random, shutil
6a37d0a4510a initial uploaded
bjoern-gruening
parents:
diff changeset
6 import zipfile
6a37d0a4510a initial uploaded
bjoern-gruening
parents:
diff changeset
7 from Bio import SeqIO
6a37d0a4510a initial uploaded
bjoern-gruening
parents:
diff changeset
8 import tempfile
6a37d0a4510a initial uploaded
bjoern-gruening
parents:
diff changeset
9
6a37d0a4510a initial uploaded
bjoern-gruening
parents:
diff changeset
# Installation-specific locations of the antiSMASH 1.1.0 distribution.
# Adjust these three paths when the tool lives somewhere else on the host.

# The antiSMASH entry script that this wrapper executes.
antismash_path = '/home/galaxy/bin/antismash-1.1.0/antismash.py'

# Directory handed to antiSMASH as --blastdbpath.
blastdbpath = '/home/galaxy/bin/antismash-1.1.0/db'

# Directory handed to antiSMASH as --pfamdbpath.
pfamdbpath = '/home/galaxy/bin/antismash-1.1.0/db'
6a37d0a4510a initial uploaded
bjoern-gruening
parents:
diff changeset
13
6a37d0a4510a initial uploaded
bjoern-gruening
parents:
diff changeset
14
6a37d0a4510a initial uploaded
bjoern-gruening
parents:
diff changeset
# GFF line template: seqid, source, type, start, end, score, strand, phase,
# attributes.  The attribute string passed in already ends with a newline.
_GFF_LINE = "%s\tantismash\t%s\t%s\t%s\t.\t%s\t.\t%s"


def _genecluster_gff(cluster):
    """Return the GFF text of one gene cluster: parent line plus gene lines.

    The parent feature spans the smallest gene start to the largest gene end
    seen in the cluster.  Its seqname, strand, Name and Note are taken from
    the last gene that was added to the cluster.
    """
    attributes = 'ID=%s;Name=%s;Clustertype=%s;Note=%s\n' % (
        cluster['name'], cluster['gene_name'], cluster['type'], cluster['note'])
    parent_line = _GFF_LINE % (
        cluster['seqname'], 'genecluster',
        min(cluster['starts']), max(cluster['ends']),
        cluster['strand'], attributes)
    return parent_line + ''.join(cluster['children'])


def _convert_cluster_proteins(prots_path, cluster_types, fasta_out, gff_out):
    """Copy the cluster protein FASTA to fasta_out and write GFF to gff_out.

    Header lines are split on '|' and read as
    seqname|cluster name|start-end|strand|gene name|note.  Each header is
    written to fasta_out with the cluster type inserted after the cluster
    name; every other line is copied unchanged.  Consecutive headers sharing
    a cluster name form one cluster, which is written to gff_out as a
    'genecluster' parent line followed by one 'gene' line per header.

    Raises KeyError when a cluster name is missing from cluster_types.
    """
    cluster = None
    with open(prots_path) as handle:
        for line in handle:
            if not line.startswith('>'):
                fasta_out.write(line)
                continue

            columns = line[1:].strip().split('|')
            seqname, cluster_name, location, strand, gene_name, note = columns[:6]
            cluster_type = cluster_types[cluster_name]
            fasta_out.write(line.replace(
                '|%s|' % cluster_name,
                '|%s|%s|' % (cluster_name, cluster_type)))

            if cluster is None or cluster['name'] != cluster_name:
                # A new cluster starts here.  Flush the finished one *before*
                # recording anything about the current gene, so that neither
                # cluster picks up coordinates or attributes of the other.
                if cluster is not None:
                    gff_out.write(_genecluster_gff(cluster))
                cluster = {
                    'name': cluster_name,
                    'type': cluster_type,
                    'starts': [],
                    'ends': [],
                    'children': [],
                }

            start, end = location.split('-')
            cluster['starts'].append(int(start))
            cluster['ends'].append(int(end))
            cluster['seqname'] = seqname
            cluster['strand'] = strand
            cluster['gene_name'] = gene_name
            cluster['note'] = note

            attributes = 'ID=%s;Parent=%s;Name=%s;Clustertype=%s;Note=%s\n' % (
                gene_name, cluster_name, gene_name, cluster_type, note)
            cluster['children'].append(_GFF_LINE % (
                seqname, 'gene', start, end, strand, attributes))

    # The last cluster has no following header that would trigger its flush.
    if cluster is not None:
        gff_out.write(_genecluster_gff(cluster))


def anitSMASH(args, sequence_path, glimmer_path):
    """Run antiSMASH on one sequence file and collect its results.

    The sequence file is copied into a fresh scratch directory, antiSMASH is
    executed there, and the results are appended to the open output handles
    stored on args.  The scratch directory is always removed afterwards.

    Example of the underlying call:
    ./antismash.py Tue6071_genome.fasta --geneclustertypes 1 --fullhmm y

    args          -- parsed options; reads iformat, geneclustertypes,
                     eukaryotic, clusterblast, smcogs, fullhmm, fullblast and
                     the writable handles embl_path (optional),
                     geneclusterprots and geneclusterprots_gff.
    sequence_path -- path to a file holding a single sequence record.
    glimmer_path  -- path to the gene prediction for that record; only passed
                     to antiSMASH when args.iformat is 'fasta'.

    Returns None.  The exit status of antiSMASH is not inspected; result
    files that do not exist are skipped.
    """
    print(sequence_path)
    # mkdtemp guarantees a unique, private directory; a random number under
    # /tmp could collide with a concurrent or earlier run.
    tmp_dir = tempfile.mkdtemp(prefix='galaxy_')
    try:
        # NOTE(review): presumably antiSMASH expects this sub-directory in
        # its working directory -- confirm against antismash.py.
        os.mkdir(os.path.join(tmp_dir, 'geneprediction'))
        print('IFORMAT: %s' % args.iformat)

        # antiSMASH gets a file with an extension matching the input format.
        extension = {'fasta': '.fasta', 'genbank': '.gbk'}.get(args.iformat, '.embl')
        new_input_path = os.path.join(
            tmp_dir, os.path.basename(sequence_path) + extension)

        # try to generate the same name as in antismash.py
        genomename = ".".join(
            (os.path.basename(sequence_path) + '.' + args.iformat).split(".")[:-1])
        for i in """!"#$%&()*+,./:;=>?@[]^`{|}'""":
            genomename = genomename.replace(i, "")
        result_path = os.path.join(tmp_dir, genomename)

        shutil.copy(sequence_path, new_input_path)

        def yes_no(flag):
            return 'y' if flag else 'n'

        # An argument list (no shell) keeps file names containing characters
        # such as '|', ';' or spaces from being interpreted by a shell.
        command = [
            antismash_path, new_input_path,
            '--geneclustertypes', str(args.geneclustertypes),
            '--taxon', 'e' if args.eukaryotic else 'p',
            '--clusterblast', yes_no(args.clusterblast),
            '--smcogs', yes_no(args.smcogs),
            '--fullhmm', yes_no(args.fullhmm),
            '--fullblast', yes_no(args.fullblast),
            '--blastdbpath', blastdbpath,
            '--pfamdbpath', pfamdbpath,
            '--cores', '10',
        ]
        if args.iformat == 'fasta':
            command.extend(['--glimmer_prediction', glimmer_path])

        # Run inside the scratch directory without changing the working
        # directory of this process, so relative paths used by the caller
        # stay valid after the scratch directory has been deleted.
        subprocess.call(command, cwd=tmp_dir)
        print(' '.join(command))

        embl_path = os.path.join(result_path, '%s.final.embl' % genomename)
        if os.path.exists(embl_path) and args.embl_path:
            with open(embl_path) as handle:
                args.embl_path.write(handle.read())

        # Map cluster name (third column) to cluster type (fourth column).
        clustername_mapping = {}
        genecluster_path = os.path.join(result_path, 'clusterblast/geneclusters.txt')
        if os.path.exists(genecluster_path):
            with open(genecluster_path) as handle:
                for line in handle:
                    token = line.split('\t')
                    if len(token) > 3:
                        clustername_mapping[token[2]] = token[3]

        # Like the other result files, the protein FASTA is optional.
        prots_path = os.path.join(result_path, 'clusterblast/geneclusterprots.fasta')
        if os.path.exists(prots_path):
            _convert_cluster_proteins(
                prots_path, clustername_mapping,
                args.geneclusterprots, args.geneclusterprots_gff)
    finally:
        # remove tmp directory, also when something above failed
        shutil.rmtree(tmp_dir)
6a37d0a4510a initial uploaded
bjoern-gruening
parents:
diff changeset
158
6a37d0a4510a initial uploaded
bjoern-gruening
parents:
diff changeset
159
6a37d0a4510a initial uploaded
bjoern-gruening
parents:
diff changeset
def arg_parse():
    """Define the command line of the antiSMASH wrapper and parse sys.argv.

    Returns the argparse namespace.  The value of --input_format is stored
    as ``iformat``; the three output options --embl_path, --geneclusterprots
    and --geneclusterprots_gff are opened for writing by argparse.
    """
    import argparse

    cli = argparse.ArgumentParser(prog = 'antiSMASH-Wrapper')
    cli.add_argument('--version', action='version', version='%(prog)s 0.01')
    cli.add_argument('--geneclustertypes')

    # Plain on/off switches, all False unless given.
    for switch in ('--clusterblast', '--eukaryotic', '--fullhmm', '--smcogs', '--fullblast'):
        cli.add_argument(switch, action='store_true')

    # Inputs.
    cli.add_argument('--input', '-i', help='FASTA Sequence File')
    cli.add_argument('--glimmer_prediction', help='Glimmer Prediction File')
    cli.add_argument(
        '--input_format', '-f',
        dest="iformat",
        help='input format, if the input is a fasta file you need a glimmer file as additional input',
    )

    # Outputs that are kept as plain strings.
    for option, description in (
        ('--zip', 'output: all files as zip file'),
        ('--html_file', 'output: the path to the index html file'),
        ('--html_path', 'output: the path to the output html dir'),
    ):
        cli.add_argument(option, help=description)

    # Outputs that argparse opens as writable file handles.
    for option, description in (
        ('--embl_path', 'output: the path to the embl output file'),
        ('--geneclusterprots', 'output: Genecluster Fasta File'),
        ('--geneclusterprots_gff', 'output: Genecluster GFF'),
    ):
        cli.add_argument(option, help=description, type=argparse.FileType('w'))

    return cli.parse_args()
6a37d0a4510a initial uploaded
bjoern-gruening
parents:
diff changeset
186
6a37d0a4510a initial uploaded
bjoern-gruening
parents:
diff changeset
187
6a37d0a4510a initial uploaded
bjoern-gruening
parents:
diff changeset
def extract_glimmerHMM_results(sequence_id, glimmerHMM):
    """
    Returns the annotations of one sequence as a complete GFF3 formatted string.

    sequence_id -- identifier to look for; a line belongs to the sequence
                   when it starts with the identifier followed by a tab.
    glimmerHMM  -- path to the file holding the predictions of all sequences.

    The result starts with the '##gff-version 3' header followed by the
    matching lines in file order.  Returns False when no line matches.
    """
    prefix = sequence_id + '\t'
    # The with-block closes the prediction file as soon as it has been read.
    with open(glimmerHMM) as handle:
        matches = [line for line in handle if line.startswith(prefix)]
    if not matches:
        return False
    return "##gff-version 3\n" + ''.join(matches)
6a37d0a4510a initial uploaded
bjoern-gruening
parents:
diff changeset
202
6a37d0a4510a initial uploaded
bjoern-gruening
parents:
diff changeset
def extract_glimmer_results(sequence_id, glimmer):
    """
    Returns the annotations of one sequence in the format of the glimmer file.

    sequence_id -- identifier to look for; a line belongs to the sequence
                   when its first whitespace-separated field starts with
                   the identifier.
    glimmer     -- path to the file holding the predictions of all sequences.

    Returns the matching lines in file order as one string, or False when
    no line matches.  Blank lines are ignored.
    """
    matches = []
    # The with-block closes the prediction file as soon as it has been read.
    with open(glimmer) as handle:
        for line in handle:
            fields = line.split()
            # Blank or whitespace-only lines have no first field to compare.
            if not fields:
                continue
            # that construct matches the pseudo fasta header and the corresponding orfs
            if fields[0].startswith(sequence_id):
                matches.append(line)
    if not matches:
        return False
    return ''.join(matches)
6a37d0a4510a initial uploaded
bjoern-gruening
parents:
diff changeset
218
6a37d0a4510a initial uploaded
bjoern-gruening
parents:
diff changeset
219
6a37d0a4510a initial uploaded
bjoern-gruening
parents:
diff changeset
if __name__ == '__main__':
    # Split a multi-record input into single records and run antiSMASH once
    # per record.  All results are appended to the output handles on args.
    args = arg_parse()

    # Both handles are written to unconditionally further down; fail with a
    # clear message instead of an AttributeError on None.
    if args.geneclusterprots is None or args.geneclusterprots_gff is None:
        sys.exit('--geneclusterprots and --geneclusterprots_gff are required')

    # anitSMASH() may change the working directory, so a relative prediction
    # path has to be resolved before the first record is processed.
    glimmer_prediction = args.glimmer_prediction
    if glimmer_prediction:
        glimmer_prediction = os.path.abspath(glimmer_prediction)

    temp_dir = tempfile.mkdtemp()
    try:
        args.geneclusterprots_gff.write("##gff-version 3\n")

        with open(args.input, "rU") as input_handle:
            for record in SeqIO.parse(input_handle, args.iformat):
                # create two temp files and fill them with the information
                # from one sequence
                record.id = record.id or record.name
                # A path separator inside the id would point into a
                # directory that does not exist.
                file_stem = record.id.replace(os.sep, '_')

                tmp_sequence = os.path.join(temp_dir, file_stem)
                with open(tmp_sequence, 'w') as handle:
                    SeqIO.write(record, handle, args.iformat)

                tmp_glimmer = ''
                if args.iformat == 'fasta':
                    # FASTA carries no annotation, so the gene prediction of
                    # this record is taken from the supplied prediction file.
                    if args.eukaryotic:
                        tmp_glimmer = os.path.join(temp_dir, file_stem + '.gff')
                        annotations = extract_glimmerHMM_results(record.id, glimmer_prediction)
                    else:
                        tmp_glimmer = os.path.join(temp_dir, file_stem + '.tsv')
                        annotations = extract_glimmer_results(record.id, glimmer_prediction)
                    # Records without any predicted gene are skipped.
                    if not annotations:
                        continue
                    with open(tmp_glimmer, 'w') as handle:
                        handle.write(annotations)

                anitSMASH(args, tmp_sequence, tmp_glimmer)
    finally:
        # Remove the per-record files even when a record failed.
        shutil.rmtree(temp_dir)
6a37d0a4510a initial uploaded
bjoern-gruening
parents:
diff changeset
254
6a37d0a4510a initial uploaded
bjoern-gruening
parents:
diff changeset
255
6a37d0a4510a initial uploaded
bjoern-gruening
parents:
diff changeset
256
6a37d0a4510a initial uploaded
bjoern-gruening
parents:
diff changeset
257
6a37d0a4510a initial uploaded
bjoern-gruening
parents:
diff changeset
258
6a37d0a4510a initial uploaded
bjoern-gruening
parents:
diff changeset
259
6a37d0a4510a initial uploaded
bjoern-gruening
parents:
diff changeset
260
6a37d0a4510a initial uploaded
bjoern-gruening
parents:
diff changeset
261
6a37d0a4510a initial uploaded
bjoern-gruening
parents:
diff changeset
262
6a37d0a4510a initial uploaded
bjoern-gruening
parents:
diff changeset
263
6a37d0a4510a initial uploaded
bjoern-gruening
parents:
diff changeset
264
6a37d0a4510a initial uploaded
bjoern-gruening
parents:
diff changeset
265