Mercurial > repos > bjoern-gruening > antismash
comparison multi_antiSMASH_wrapper.py @ 0:6a37d0a4510a default tip
initial uploaded
| author | bjoern-gruening |
|---|---|
| date | Thu, 15 Mar 2012 05:23:03 -0400 |
| parents | |
| children |
comparison
equal
deleted
inserted
replaced
| -1:000000000000 | 0:6a37d0a4510a |
|---|---|
| 1 #!/usr/bin/env python | |
| 2 # -*- coding: UTF-8 -*- | |
| 3 | |
| 4 import os, sys, subprocess, commands | |
| 5 import random, shutil | |
| 6 import zipfile | |
| 7 from Bio import SeqIO | |
| 8 import tempfile | |
| 9 | |
| 10 blastdbpath = '/home/galaxy/bin/antismash-1.1.0/db' | |
| 11 pfamdbpath = '/home/galaxy/bin/antismash-1.1.0/db' | |
| 12 antismash_path = '/home/galaxy/bin/antismash-1.1.0/antismash.py' | |
| 13 | |
| 14 | |
| 15 def anitSMASH(args, sequence_path, glimmer_path): | |
| 16 #./antismash.py Tue6071_genome.fasta --geneclustertypes 1 --fullhmm y | |
| 17 print sequence_path | |
| 18 rint = random.randint(1,10000000) | |
| 19 tmp_dir = '/tmp/galaxy_%s' % rint | |
| 20 os.mkdir(tmp_dir) | |
| 21 os.mkdir(os.path.join( tmp_dir, 'geneprediction' )) | |
| 22 os.chdir(tmp_dir) | |
| 23 print 'IFORMAT:',args.iformat | |
| 24 if args.iformat == 'fasta': | |
| 25 new_input_path = os.path.join(tmp_dir, os.path.basename(sequence_path) + '.fasta') | |
| 26 elif args.iformat == 'genbank': | |
| 27 new_input_path = os.path.join(tmp_dir, os.path.basename(sequence_path) + '.gbk') | |
| 28 else: | |
| 29 new_input_path = os.path.join(tmp_dir, os.path.basename(sequence_path) + '.embl') | |
| 30 | |
| 31 | |
| 32 # try to generate the same name as in antismash.py | |
| 33 genomename = ".".join( (os.path.basename(sequence_path) + '.'+ args.iformat).split(".")[:-1] ) | |
| 34 for i in """!"#$%&()*+,./:;=>?@[]^`{|}'""": | |
| 35 genomename = genomename.replace(i,"") | |
| 36 result_path = os.path.join( tmp_dir, genomename ) | |
| 37 | |
| 38 shutil.copy(sequence_path, new_input_path ) | |
| 39 | |
| 40 if args.eukaryotic: | |
| 41 taxon = '--taxon e' | |
| 42 else: | |
| 43 taxon = '--taxon p' | |
| 44 | |
| 45 if args.clusterblast: | |
| 46 clusterblast = '--clusterblast y' | |
| 47 else: | |
| 48 clusterblast = '--clusterblast n' | |
| 49 | |
| 50 if args.smcogs: | |
| 51 smcogs = '--smcogs y' | |
| 52 else: | |
| 53 smcogs = '--smcogs n' | |
| 54 | |
| 55 if args.fullhmm: | |
| 56 fullhmm = '--fullhmm y' | |
| 57 else: | |
| 58 fullhmm = '--fullhmm n' | |
| 59 | |
| 60 if args.fullblast: | |
| 61 fullblast = '--fullblast y' | |
| 62 else: | |
| 63 fullblast = '--fullblast n' | |
| 64 | |
| 65 h = [antismash_path, new_input_path, | |
| 66 '--geneclustertypes %s' % args.geneclustertypes, | |
| 67 taxon, | |
| 68 clusterblast, | |
| 69 smcogs, | |
| 70 fullhmm, | |
| 71 fullblast, | |
| 72 '--blastdbpath %s' % blastdbpath, | |
| 73 '--pfamdbpath %s' % pfamdbpath, | |
| 74 '--cores 10', | |
| 75 ] | |
| 76 if args.iformat == 'fasta': | |
| 77 h.append('--glimmer_prediction %s' % glimmer_path) | |
| 78 | |
| 79 a = ' '.join(h) | |
| 80 | |
| 81 subprocess.call(a, shell=True) | |
| 82 print a | |
| 83 embl_path = os.path.join(result_path, '%s.final.embl' % genomename) | |
| 84 if os.path.exists(embl_path) and args.embl_path: | |
| 85 args.embl_path.write( open(embl_path).read() ) | |
| 86 | |
| 87 gff_line = "%s\tantismash\t%s\t%s\t%s\t.\t%s\t.\t%s" | |
| 88 | |
| 89 clustername_mapping = {} | |
| 90 genecluster_path = os.path.join(result_path, 'clusterblast/geneclusters.txt') | |
| 91 if os.path.exists(genecluster_path): | |
| 92 for line in open( genecluster_path): | |
| 93 token = line.split('\t') | |
| 94 clustername_mapping[token[2]] = token[3] | |
| 95 old_cluster_name = False | |
| 96 parent_cluster_name = "" | |
| 97 child_lines = "" | |
| 98 starts = [] | |
| 99 ends = [] | |
| 100 for line in open( os.path.join(result_path, 'clusterblast/geneclusterprots.fasta') ): | |
| 101 if line.startswith('>'): | |
| 102 columns = line[1:].strip().split('|') | |
| 103 args.geneclusterprots.write( line.replace('|%s|' % columns[1], '|%s|%s|' % (columns[1],clustername_mapping[columns[1]])) ) | |
| 104 | |
| 105 """ | |
| 106 "Handlampe" kann man zu einer Person sagen wenn es nicht mehr zum Armleuchter reicht. Es ist also ein abgeschwächte Form für "Depp". | |
| 107 """ | |
| 108 if old_cluster_name == False: | |
| 109 # inital clustername setting | |
| 110 # try to catch the line in which the name changes | |
| 111 old_cluster_name = columns[1] | |
| 112 parent_cluster_name = columns[1] | |
| 113 | |
| 114 seqname = columns[0] | |
| 115 strand = columns[3] | |
| 116 (start, end) = columns[2].split('-') | |
| 117 starts.append(int(start)) | |
| 118 ends.append(int(end)) | |
| 119 | |
| 120 if old_cluster_name != columns[1]: | |
| 121 # begin of a new cluster, caclulate the parent line for the previous clusterproteins | |
| 122 # TODO: maybe the assumption is not correct, to take the smallest start and the largest end | |
| 123 starts.sort() | |
| 124 ends.sort() | |
| 125 genecluster_start = starts[0] | |
| 126 genecluster_end = ends[-1] | |
| 127 group = 'ID=%s;Name=%s;Clustertype=%s;Note=%s\n' % (parent_cluster_name, columns[4], clustername_mapping[columns[1]], columns[5]) | |
| 128 oline = gff_line % ( | |
| 129 seqname, 'genecluster', genecluster_start, genecluster_end, strand, group | |
| 130 ) | |
| 131 args.geneclusterprots_gff.write(oline + child_lines) | |
| 132 starts = [] | |
| 133 ends = [] | |
| 134 old_cluster_name = columns[1] | |
| 135 parent_cluster_name = columns[1] | |
| 136 child_lines = '' | |
| 137 | |
| 138 group = 'ID=%s;Parent=%s;Name=%s;Clustertype=%s;Note=%s\n' % (columns[4], columns[1], columns[4], clustername_mapping[columns[1]], columns[5]) | |
| 139 child_lines += gff_line % ( | |
| 140 seqname, 'gene', start, end, strand, group | |
| 141 ) | |
| 142 | |
| 143 else: | |
| 144 args.geneclusterprots.write( line ) | |
| 145 | |
| 146 starts.sort() | |
| 147 ends.sort() | |
| 148 genecluster_start = starts[0] | |
| 149 genecluster_end = ends[-1] | |
| 150 group = 'ID=%s;Name=%s;Clustertype=%s;Note=%s\n' % (parent_cluster_name, columns[4], clustername_mapping[columns[1]], columns[5]) | |
| 151 oline = gff_line % ( | |
| 152 seqname, 'genecluster', genecluster_start, genecluster_end, strand, group | |
| 153 ) | |
| 154 args.geneclusterprots_gff.write(oline + child_lines) | |
| 155 | |
| 156 # remove tmp directory | |
| 157 shutil.rmtree(tmp_dir) | |
| 158 | |
| 159 | |
| 160 def arg_parse(): | |
| 161 import argparse | |
| 162 parser = argparse.ArgumentParser(prog = 'antiSMASH-Wrapper') | |
| 163 parser.add_argument('--version', action='version', version='%(prog)s 0.01') | |
| 164 parser.add_argument('--geneclustertypes') | |
| 165 parser.add_argument('--clusterblast', action='store_true') | |
| 166 parser.add_argument('--eukaryotic', action='store_true') | |
| 167 parser.add_argument('--fullhmm', action='store_true') | |
| 168 parser.add_argument('--smcogs', action='store_true') | |
| 169 parser.add_argument('--fullblast', action='store_true') | |
| 170 | |
| 171 parser.add_argument('--input', '-i', help='FASTA Sequence File') | |
| 172 parser.add_argument('--glimmer_prediction', help='Glimmer Prediction File') | |
| 173 | |
| 174 parser.add_argument('--input_format', '-f', dest="iformat", help='input format, if the input is a fasta file you need a glimmer file as additional input') | |
| 175 | |
| 176 parser.add_argument('--zip', help='output: all files as zip file') | |
| 177 parser.add_argument('--html_file', help='output: the path to the index html file') | |
| 178 parser.add_argument('--html_path', help='output: the path to the output html dir') | |
| 179 parser.add_argument('--embl_path', help='output: the path to the embl output file', type=argparse.FileType('w')) | |
| 180 parser.add_argument('--geneclusterprots', help='output: Genecluster Fasta File', type=argparse.FileType('w')) | |
| 181 parser.add_argument('--geneclusterprots_gff', help='output: Genecluster GFF', type=argparse.FileType('w')) | |
| 182 | |
| 183 | |
| 184 args = parser.parse_args() | |
| 185 return args | |
| 186 | |
| 187 | |
| 188 def extract_glimmerHMM_results(sequence_id, glimmerHMM): | |
| 189 """ | |
| 190 Returns the given annoations from a sequence_id as complete GFF3 formated string. | |
| 191 """ | |
| 192 result = "##gff-version 3\n" | |
| 193 found_annoations = False | |
| 194 for line in open(glimmerHMM): | |
| 195 if line.startswith(sequence_id + '\t'): | |
| 196 result += line | |
| 197 found_annoations = True | |
| 198 if found_annoations: | |
| 199 return result | |
| 200 else: | |
| 201 return False | |
| 202 | |
| 203 def extract_glimmer_results(sequence_id, glimmer): | |
| 204 """ | |
| 205 Returns the given annoations from a sequence_id as glimmer result file. | |
| 206 """ | |
| 207 result = '' | |
| 208 found_annoations = False | |
| 209 for line in open(glimmer): | |
| 210 # that construct matches the pseudo fasta header and the corresponding orfs | |
| 211 if line.split()[0].startswith(sequence_id): | |
| 212 result += line | |
| 213 found_annoations = True | |
| 214 if found_annoations: | |
| 215 return result | |
| 216 else: | |
| 217 return False | |
| 218 | |
| 219 | |
| 220 if __name__ == '__main__': | |
| 221 args = arg_parse() | |
| 222 geneclusterprots = "" | |
| 223 temp_dir = tempfile.mkdtemp() | |
| 224 args.geneclusterprots_gff.write("##gff-version 3\n") | |
| 225 | |
| 226 | |
| 227 | |
| 228 for record in SeqIO.parse(open(args.input, "rU"), args.iformat) : | |
| 229 # create two tem files and fill it with the information from one sequence | |
| 230 record.id = record.id or record.name | |
| 231 tmp_sequence = os.path.join(temp_dir, record.id) #fasta -file | |
| 232 SeqIO.write(record, open(tmp_sequence, 'w+'), args.iformat) | |
| 233 | |
| 234 tmp_glimmer = '' | |
| 235 if args.iformat == 'fasta': | |
| 236 # We need to annotate the sequences manually | |
| 237 if args.eukaryotic: | |
| 238 #print os.path.join(temp_dir, record.id + '.gff') | |
| 239 tmp_glimmer = os.path.join(temp_dir, record.id + '.gff') | |
| 240 annotations = extract_glimmerHMM_results(record.id, args.glimmer_prediction) | |
| 241 #print annotations, ':', record.id, args.glimmer_prediction | |
| 242 if not annotations: | |
| 243 continue | |
| 244 open(tmp_glimmer, 'w+').write( annotations ) | |
| 245 else: | |
| 246 tmp_glimmer = os.path.join(temp_dir, record.id + '.tsv') | |
| 247 annotations = extract_glimmer_results(record.id, args.glimmer_prediction) | |
| 248 if not annotations: | |
| 249 continue | |
| 250 open(tmp_glimmer, 'w+').write( annotations ) | |
| 251 | |
| 252 anitSMASH(args, tmp_sequence, tmp_glimmer) | |
| 253 shutil.rmtree(temp_dir) | |
| 254 | |
| 255 | |
| 256 | |
| 257 | |
| 258 | |
| 259 | |
| 260 | |
| 261 | |
| 262 | |
| 263 | |
| 264 | |
| 265 |
