0
+ − 1 #!/usr/bin/env python
+ − 2 # -*- coding: UTF-8 -*-
+ − 3
+ − 4 import os, sys, subprocess, commands
+ − 5 import random, shutil
+ − 6 import zipfile
+ − 7 from Bio import SeqIO
+ − 8 import tempfile
+ − 9
+ − 10 blastdbpath = '/home/galaxy/bin/antismash-1.1.0/db'
+ − 11 pfamdbpath = '/home/galaxy/bin/antismash-1.1.0/db'
+ − 12 antismash_path = '/home/galaxy/bin/antismash-1.1.0/antismash.py'
+ − 13
+ − 14
+ − 15 def anitSMASH(args, sequence_path, glimmer_path):
+ − 16 #./antismash.py Tue6071_genome.fasta --geneclustertypes 1 --fullhmm y
+ − 17 print sequence_path
+ − 18 rint = random.randint(1,10000000)
+ − 19 tmp_dir = '/tmp/galaxy_%s' % rint
+ − 20 os.mkdir(tmp_dir)
+ − 21 os.mkdir(os.path.join( tmp_dir, 'geneprediction' ))
+ − 22 os.chdir(tmp_dir)
+ − 23 print 'IFORMAT:',args.iformat
+ − 24 if args.iformat == 'fasta':
+ − 25 new_input_path = os.path.join(tmp_dir, os.path.basename(sequence_path) + '.fasta')
+ − 26 elif args.iformat == 'genbank':
+ − 27 new_input_path = os.path.join(tmp_dir, os.path.basename(sequence_path) + '.gbk')
+ − 28 else:
+ − 29 new_input_path = os.path.join(tmp_dir, os.path.basename(sequence_path) + '.embl')
+ − 30
+ − 31
+ − 32 # try to generate the same name as in antismash.py
+ − 33 genomename = ".".join( (os.path.basename(sequence_path) + '.'+ args.iformat).split(".")[:-1] )
+ − 34 for i in """!"#$%&()*+,./:;=>?@[]^`{|}'""":
+ − 35 genomename = genomename.replace(i,"")
+ − 36 result_path = os.path.join( tmp_dir, genomename )
+ − 37
+ − 38 shutil.copy(sequence_path, new_input_path )
+ − 39
+ − 40 if args.eukaryotic:
+ − 41 taxon = '--taxon e'
+ − 42 else:
+ − 43 taxon = '--taxon p'
+ − 44
+ − 45 if args.clusterblast:
+ − 46 clusterblast = '--clusterblast y'
+ − 47 else:
+ − 48 clusterblast = '--clusterblast n'
+ − 49
+ − 50 if args.smcogs:
+ − 51 smcogs = '--smcogs y'
+ − 52 else:
+ − 53 smcogs = '--smcogs n'
+ − 54
+ − 55 if args.fullhmm:
+ − 56 fullhmm = '--fullhmm y'
+ − 57 else:
+ − 58 fullhmm = '--fullhmm n'
+ − 59
+ − 60 if args.fullblast:
+ − 61 fullblast = '--fullblast y'
+ − 62 else:
+ − 63 fullblast = '--fullblast n'
+ − 64
+ − 65 h = [antismash_path, new_input_path,
+ − 66 '--geneclustertypes %s' % args.geneclustertypes,
+ − 67 taxon,
+ − 68 clusterblast,
+ − 69 smcogs,
+ − 70 fullhmm,
+ − 71 fullblast,
+ − 72 '--blastdbpath %s' % blastdbpath,
+ − 73 '--pfamdbpath %s' % pfamdbpath,
+ − 74 '--cores 10',
+ − 75 ]
+ − 76 if args.iformat == 'fasta':
+ − 77 h.append('--glimmer_prediction %s' % glimmer_path)
+ − 78
+ − 79 a = ' '.join(h)
+ − 80
+ − 81 subprocess.call(a, shell=True)
+ − 82 print a
+ − 83 embl_path = os.path.join(result_path, '%s.final.embl' % genomename)
+ − 84 if os.path.exists(embl_path) and args.embl_path:
+ − 85 args.embl_path.write( open(embl_path).read() )
+ − 86
+ − 87 gff_line = "%s\tantismash\t%s\t%s\t%s\t.\t%s\t.\t%s"
+ − 88
+ − 89 clustername_mapping = {}
+ − 90 genecluster_path = os.path.join(result_path, 'clusterblast/geneclusters.txt')
+ − 91 if os.path.exists(genecluster_path):
+ − 92 for line in open( genecluster_path):
+ − 93 token = line.split('\t')
+ − 94 clustername_mapping[token[2]] = token[3]
+ − 95 old_cluster_name = False
+ − 96 parent_cluster_name = ""
+ − 97 child_lines = ""
+ − 98 starts = []
+ − 99 ends = []
+ − 100 for line in open( os.path.join(result_path, 'clusterblast/geneclusterprots.fasta') ):
+ − 101 if line.startswith('>'):
+ − 102 columns = line[1:].strip().split('|')
+ − 103 args.geneclusterprots.write( line.replace('|%s|' % columns[1], '|%s|%s|' % (columns[1],clustername_mapping[columns[1]])) )
+ − 104
+ − 105 """
+ − 106 "Handlampe" kann man zu einer Person sagen wenn es nicht mehr zum Armleuchter reicht. Es ist also ein abgeschwächte Form für "Depp".
+ − 107 """
+ − 108 if old_cluster_name == False:
+ − 109 # inital clustername setting
+ − 110 # try to catch the line in which the name changes
+ − 111 old_cluster_name = columns[1]
+ − 112 parent_cluster_name = columns[1]
+ − 113
+ − 114 seqname = columns[0]
+ − 115 strand = columns[3]
+ − 116 (start, end) = columns[2].split('-')
+ − 117 starts.append(int(start))
+ − 118 ends.append(int(end))
+ − 119
+ − 120 if old_cluster_name != columns[1]:
+ − 121 # begin of a new cluster, caclulate the parent line for the previous clusterproteins
+ − 122 # TODO: maybe the assumption is not correct, to take the smallest start and the largest end
+ − 123 starts.sort()
+ − 124 ends.sort()
+ − 125 genecluster_start = starts[0]
+ − 126 genecluster_end = ends[-1]
+ − 127 group = 'ID=%s;Name=%s;Clustertype=%s;Note=%s\n' % (parent_cluster_name, columns[4], clustername_mapping[columns[1]], columns[5])
+ − 128 oline = gff_line % (
+ − 129 seqname, 'genecluster', genecluster_start, genecluster_end, strand, group
+ − 130 )
+ − 131 args.geneclusterprots_gff.write(oline + child_lines)
+ − 132 starts = []
+ − 133 ends = []
+ − 134 old_cluster_name = columns[1]
+ − 135 parent_cluster_name = columns[1]
+ − 136 child_lines = ''
+ − 137
+ − 138 group = 'ID=%s;Parent=%s;Name=%s;Clustertype=%s;Note=%s\n' % (columns[4], columns[1], columns[4], clustername_mapping[columns[1]], columns[5])
+ − 139 child_lines += gff_line % (
+ − 140 seqname, 'gene', start, end, strand, group
+ − 141 )
+ − 142
+ − 143 else:
+ − 144 args.geneclusterprots.write( line )
+ − 145
+ − 146 starts.sort()
+ − 147 ends.sort()
+ − 148 genecluster_start = starts[0]
+ − 149 genecluster_end = ends[-1]
+ − 150 group = 'ID=%s;Name=%s;Clustertype=%s;Note=%s\n' % (parent_cluster_name, columns[4], clustername_mapping[columns[1]], columns[5])
+ − 151 oline = gff_line % (
+ − 152 seqname, 'genecluster', genecluster_start, genecluster_end, strand, group
+ − 153 )
+ − 154 args.geneclusterprots_gff.write(oline + child_lines)
+ − 155
+ − 156 # remove tmp directory
+ − 157 shutil.rmtree(tmp_dir)
+ − 158
+ − 159
+ − 160 def arg_parse():
+ − 161 import argparse
+ − 162 parser = argparse.ArgumentParser(prog = 'antiSMASH-Wrapper')
+ − 163 parser.add_argument('--version', action='version', version='%(prog)s 0.01')
+ − 164 parser.add_argument('--geneclustertypes')
+ − 165 parser.add_argument('--clusterblast', action='store_true')
+ − 166 parser.add_argument('--eukaryotic', action='store_true')
+ − 167 parser.add_argument('--fullhmm', action='store_true')
+ − 168 parser.add_argument('--smcogs', action='store_true')
+ − 169 parser.add_argument('--fullblast', action='store_true')
+ − 170
+ − 171 parser.add_argument('--input', '-i', help='FASTA Sequence File')
+ − 172 parser.add_argument('--glimmer_prediction', help='Glimmer Prediction File')
+ − 173
+ − 174 parser.add_argument('--input_format', '-f', dest="iformat", help='input format, if the input is a fasta file you need a glimmer file as additional input')
+ − 175
+ − 176 parser.add_argument('--zip', help='output: all files as zip file')
+ − 177 parser.add_argument('--html_file', help='output: the path to the index html file')
+ − 178 parser.add_argument('--html_path', help='output: the path to the output html dir')
+ − 179 parser.add_argument('--embl_path', help='output: the path to the embl output file', type=argparse.FileType('w'))
+ − 180 parser.add_argument('--geneclusterprots', help='output: Genecluster Fasta File', type=argparse.FileType('w'))
+ − 181 parser.add_argument('--geneclusterprots_gff', help='output: Genecluster GFF', type=argparse.FileType('w'))
+ − 182
+ − 183
+ − 184 args = parser.parse_args()
+ − 185 return args
+ − 186
+ − 187
+ − 188 def extract_glimmerHMM_results(sequence_id, glimmerHMM):
+ − 189 """
+ − 190 Returns the given annoations from a sequence_id as complete GFF3 formated string.
+ − 191 """
+ − 192 result = "##gff-version 3\n"
+ − 193 found_annoations = False
+ − 194 for line in open(glimmerHMM):
+ − 195 if line.startswith(sequence_id + '\t'):
+ − 196 result += line
+ − 197 found_annoations = True
+ − 198 if found_annoations:
+ − 199 return result
+ − 200 else:
+ − 201 return False
+ − 202
+ − 203 def extract_glimmer_results(sequence_id, glimmer):
+ − 204 """
+ − 205 Returns the given annoations from a sequence_id as glimmer result file.
+ − 206 """
+ − 207 result = ''
+ − 208 found_annoations = False
+ − 209 for line in open(glimmer):
+ − 210 # that construct matches the pseudo fasta header and the corresponding orfs
+ − 211 if line.split()[0].startswith(sequence_id):
+ − 212 result += line
+ − 213 found_annoations = True
+ − 214 if found_annoations:
+ − 215 return result
+ − 216 else:
+ − 217 return False
+ − 218
+ − 219
+ − 220 if __name__ == '__main__':
+ − 221 args = arg_parse()
+ − 222 geneclusterprots = ""
+ − 223 temp_dir = tempfile.mkdtemp()
+ − 224 args.geneclusterprots_gff.write("##gff-version 3\n")
+ − 225
+ − 226
+ − 227
+ − 228 for record in SeqIO.parse(open(args.input, "rU"), args.iformat) :
+ − 229 # create two tem files and fill it with the information from one sequence
+ − 230 record.id = record.id or record.name
+ − 231 tmp_sequence = os.path.join(temp_dir, record.id) #fasta -file
+ − 232 SeqIO.write(record, open(tmp_sequence, 'w+'), args.iformat)
+ − 233
+ − 234 tmp_glimmer = ''
+ − 235 if args.iformat == 'fasta':
+ − 236 # We need to annotate the sequences manually
+ − 237 if args.eukaryotic:
+ − 238 #print os.path.join(temp_dir, record.id + '.gff')
+ − 239 tmp_glimmer = os.path.join(temp_dir, record.id + '.gff')
+ − 240 annotations = extract_glimmerHMM_results(record.id, args.glimmer_prediction)
+ − 241 #print annotations, ':', record.id, args.glimmer_prediction
+ − 242 if not annotations:
+ − 243 continue
+ − 244 open(tmp_glimmer, 'w+').write( annotations )
+ − 245 else:
+ − 246 tmp_glimmer = os.path.join(temp_dir, record.id + '.tsv')
+ − 247 annotations = extract_glimmer_results(record.id, args.glimmer_prediction)
+ − 248 if not annotations:
+ − 249 continue
+ − 250 open(tmp_glimmer, 'w+').write( annotations )
+ − 251
+ − 252 anitSMASH(args, tmp_sequence, tmp_glimmer)
+ − 253 shutil.rmtree(temp_dir)
+ − 254
+ − 255
+ − 256
+ − 257
+ − 258
+ − 259
+ − 260
+ − 261
+ − 262
+ − 263
+ − 264
+ − 265