Mercurial > repos > bjoern-gruening > antismash
diff multi_antiSMASH_wrapper.py @ 0:6a37d0a4510a default tip
initial uploaded
author | bjoern-gruening |
---|---|
date | Thu, 15 Mar 2012 05:23:03 -0400 |
parents | |
children |
line wrap: on
line diff
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/multi_antiSMASH_wrapper.py Thu Mar 15 05:23:03 2012 -0400 @@ -0,0 +1,265 @@ +#!/usr/bin/env python +# -*- coding: UTF-8 -*- + +import os, sys, subprocess, commands +import random, shutil +import zipfile +from Bio import SeqIO +import tempfile + +blastdbpath = '/home/galaxy/bin/antismash-1.1.0/db' +pfamdbpath = '/home/galaxy/bin/antismash-1.1.0/db' +antismash_path = '/home/galaxy/bin/antismash-1.1.0/antismash.py' + + +def anitSMASH(args, sequence_path, glimmer_path): + #./antismash.py Tue6071_genome.fasta --geneclustertypes 1 --fullhmm y + print sequence_path + rint = random.randint(1,10000000) + tmp_dir = '/tmp/galaxy_%s' % rint + os.mkdir(tmp_dir) + os.mkdir(os.path.join( tmp_dir, 'geneprediction' )) + os.chdir(tmp_dir) + print 'IFORMAT:',args.iformat + if args.iformat == 'fasta': + new_input_path = os.path.join(tmp_dir, os.path.basename(sequence_path) + '.fasta') + elif args.iformat == 'genbank': + new_input_path = os.path.join(tmp_dir, os.path.basename(sequence_path) + '.gbk') + else: + new_input_path = os.path.join(tmp_dir, os.path.basename(sequence_path) + '.embl') + + + # try to generate the same name as in antismash.py + genomename = ".".join( (os.path.basename(sequence_path) + '.'+ args.iformat).split(".")[:-1] ) + for i in """!"#$%&()*+,./:;=>?@[]^`{|}'""": + genomename = genomename.replace(i,"") + result_path = os.path.join( tmp_dir, genomename ) + + shutil.copy(sequence_path, new_input_path ) + + if args.eukaryotic: + taxon = '--taxon e' + else: + taxon = '--taxon p' + + if args.clusterblast: + clusterblast = '--clusterblast y' + else: + clusterblast = '--clusterblast n' + + if args.smcogs: + smcogs = '--smcogs y' + else: + smcogs = '--smcogs n' + + if args.fullhmm: + fullhmm = '--fullhmm y' + else: + fullhmm = '--fullhmm n' + + if args.fullblast: + fullblast = '--fullblast y' + else: + fullblast = '--fullblast n' + + h = [antismash_path, new_input_path, + '--geneclustertypes %s' % args.geneclustertypes, + taxon, + clusterblast, + smcogs, + fullhmm, + fullblast, + '--blastdbpath %s' % blastdbpath, + '--pfamdbpath %s' % pfamdbpath, + '--cores 10', + ] + if args.iformat == 'fasta': + h.append('--glimmer_prediction %s' % glimmer_path) + + a = ' '.join(h) + + subprocess.call(a, shell=True) + print a + embl_path = os.path.join(result_path, '%s.final.embl' % genomename) + if os.path.exists(embl_path) and args.embl_path: + args.embl_path.write( open(embl_path).read() ) + + gff_line = "%s\tantismash\t%s\t%s\t%s\t.\t%s\t.\t%s" + + clustername_mapping = {} + genecluster_path = os.path.join(result_path, 'clusterblast/geneclusters.txt') + if os.path.exists(genecluster_path): + for line in open( genecluster_path): + token = line.split('\t') + clustername_mapping[token[2]] = token[3] + old_cluster_name = False + parent_cluster_name = "" + child_lines = "" + starts = [] + ends = [] + for line in open( os.path.join(result_path, 'clusterblast/geneclusterprots.fasta') ): + if line.startswith('>'): + columns = line[1:].strip().split('|') + args.geneclusterprots.write( line.replace('|%s|' % columns[1], '|%s|%s|' % (columns[1],clustername_mapping[columns[1]])) ) + + """ + "Handlampe" kann man zu einer Person sagen wenn es nicht mehr zum Armleuchter reicht. Es ist also ein abgeschwächte Form für "Depp". + """ + if old_cluster_name == False: + # inital clustername setting + # try to catch the line in which the name changes + old_cluster_name = columns[1] + parent_cluster_name = columns[1] + + seqname = columns[0] + strand = columns[3] + (start, end) = columns[2].split('-') + starts.append(int(start)) + ends.append(int(end)) + + if old_cluster_name != columns[1]: + # begin of a new cluster, caclulate the parent line for the previous clusterproteins + # TODO: maybe the assumption is not correct, to take the smallest start and the largest end + starts.sort() + ends.sort() + genecluster_start = starts[0] + genecluster_end = ends[-1] + group = 'ID=%s;Name=%s;Clustertype=%s;Note=%s\n' % (parent_cluster_name, columns[4], clustername_mapping[columns[1]], columns[5]) + oline = gff_line % ( + seqname, 'genecluster', genecluster_start, genecluster_end, strand, group + ) + args.geneclusterprots_gff.write(oline + child_lines) + starts = [] + ends = [] + old_cluster_name = columns[1] + parent_cluster_name = columns[1] + child_lines = '' + + group = 'ID=%s;Parent=%s;Name=%s;Clustertype=%s;Note=%s\n' % (columns[4], columns[1], columns[4], clustername_mapping[columns[1]], columns[5]) + child_lines += gff_line % ( + seqname, 'gene', start, end, strand, group + ) + + else: + args.geneclusterprots.write( line ) + + starts.sort() + ends.sort() + genecluster_start = starts[0] + genecluster_end = ends[-1] + group = 'ID=%s;Name=%s;Clustertype=%s;Note=%s\n' % (parent_cluster_name, columns[4], clustername_mapping[columns[1]], columns[5]) + oline = gff_line % ( + seqname, 'genecluster', genecluster_start, genecluster_end, strand, group + ) + args.geneclusterprots_gff.write(oline + child_lines) + + # remove tmp directory + shutil.rmtree(tmp_dir) + + +def arg_parse(): + import argparse + parser = argparse.ArgumentParser(prog = 'antiSMASH-Wrapper') + parser.add_argument('--version', action='version', version='%(prog)s 0.01') + parser.add_argument('--geneclustertypes') + parser.add_argument('--clusterblast', action='store_true') + parser.add_argument('--eukaryotic', action='store_true') + parser.add_argument('--fullhmm', action='store_true') + parser.add_argument('--smcogs', action='store_true') + parser.add_argument('--fullblast', action='store_true') + + parser.add_argument('--input', '-i', help='FASTA Sequence File') + parser.add_argument('--glimmer_prediction', help='Glimmer Prediction File') + + parser.add_argument('--input_format', '-f', dest="iformat", help='input format, if the input is a fasta file you need a glimmer file as additional input') + + parser.add_argument('--zip', help='output: all files as zip file') + parser.add_argument('--html_file', help='output: the path to the index html file') + parser.add_argument('--html_path', help='output: the path to the output html dir') + parser.add_argument('--embl_path', help='output: the path to the embl output file', type=argparse.FileType('w')) + parser.add_argument('--geneclusterprots', help='output: Genecluster Fasta File', type=argparse.FileType('w')) + parser.add_argument('--geneclusterprots_gff', help='output: Genecluster GFF', type=argparse.FileType('w')) + + + args = parser.parse_args() + return args + + +def extract_glimmerHMM_results(sequence_id, glimmerHMM): + """ + Returns the given annoations from a sequence_id as complete GFF3 formated string. + """ + result = "##gff-version 3\n" + found_annoations = False + for line in open(glimmerHMM): + if line.startswith(sequence_id + '\t'): + result += line + found_annoations = True + if found_annoations: + return result + else: + return False + +def extract_glimmer_results(sequence_id, glimmer): + """ + Returns the given annoations from a sequence_id as glimmer result file. + """ + result = '' + found_annoations = False + for line in open(glimmer): + # that construct matches the pseudo fasta header and the corresponding orfs + if line.split()[0].startswith(sequence_id): + result += line + found_annoations = True + if found_annoations: + return result + else: + return False + + +if __name__ == '__main__': + args = arg_parse() + geneclusterprots = "" + temp_dir = tempfile.mkdtemp() + args.geneclusterprots_gff.write("##gff-version 3\n") + + + + for record in SeqIO.parse(open(args.input, "rU"), args.iformat) : + # create two tem files and fill it with the information from one sequence + record.id = record.id or record.name + tmp_sequence = os.path.join(temp_dir, record.id) #fasta -file + SeqIO.write(record, open(tmp_sequence, 'w+'), args.iformat) + + tmp_glimmer = '' + if args.iformat == 'fasta': + # We need to annotate the sequences manually + if args.eukaryotic: + #print os.path.join(temp_dir, record.id + '.gff') + tmp_glimmer = os.path.join(temp_dir, record.id + '.gff') + annotations = extract_glimmerHMM_results(record.id, args.glimmer_prediction) + #print annotations, ':', record.id, args.glimmer_prediction + if not annotations: + continue + open(tmp_glimmer, 'w+').write( annotations ) + else: + tmp_glimmer = os.path.join(temp_dir, record.id + '.tsv') + annotations = extract_glimmer_results(record.id, args.glimmer_prediction) + if not annotations: + continue + open(tmp_glimmer, 'w+').write( annotations ) + + anitSMASH(args, tmp_sequence, tmp_glimmer) + shutil.rmtree(temp_dir) + + + + + + + + + + + +