view multi_antiSMASH_wrapper.py @ 0:6a37d0a4510a default tip

initial uploaded
author bjoern-gruening
date Thu, 15 Mar 2012 05:23:03 -0400
parents
children
line wrap: on
line source

#!/usr/bin/env python
# -*- coding: UTF-8 -*-

import os, sys, subprocess, commands
import random, shutil
import zipfile
from Bio import SeqIO
import tempfile

blastdbpath = '/home/galaxy/bin/antismash-1.1.0/db'
pfamdbpath = '/home/galaxy/bin/antismash-1.1.0/db'
antismash_path = '/home/galaxy/bin/antismash-1.1.0/antismash.py'


def anitSMASH(args, sequence_path, glimmer_path):
    #./antismash.py Tue6071_genome.fasta --geneclustertypes 1 --fullhmm y
    print sequence_path
    rint = random.randint(1,10000000)
    tmp_dir = '/tmp/galaxy_%s' % rint
    os.mkdir(tmp_dir)
    os.mkdir(os.path.join( tmp_dir, 'geneprediction' ))
    os.chdir(tmp_dir)
    print 'IFORMAT:',args.iformat
    if args.iformat == 'fasta':
        new_input_path = os.path.join(tmp_dir, os.path.basename(sequence_path) + '.fasta')
    elif args.iformat == 'genbank':
        new_input_path = os.path.join(tmp_dir, os.path.basename(sequence_path) + '.gbk')
    else:
        new_input_path = os.path.join(tmp_dir, os.path.basename(sequence_path) + '.embl')


    # try to generate the same name as in antismash.py
    genomename = ".".join( (os.path.basename(sequence_path) + '.'+ args.iformat).split(".")[:-1] )
    for i in """!"#$%&()*+,./:;=>?@[]^`{|}'""":
        genomename = genomename.replace(i,"")
    result_path = os.path.join( tmp_dir, genomename )

    shutil.copy(sequence_path, new_input_path )

    if args.eukaryotic:
        taxon = '--taxon e'
    else:
        taxon = '--taxon p'

    if args.clusterblast:
        clusterblast = '--clusterblast y'
    else:
        clusterblast = '--clusterblast n'

    if args.smcogs:
        smcogs = '--smcogs y'
    else:
        smcogs = '--smcogs n'

    if args.fullhmm:
        fullhmm = '--fullhmm y'
    else:
        fullhmm = '--fullhmm n'

    if args.fullblast:
        fullblast = '--fullblast y'
    else:
        fullblast = '--fullblast n'

    h = [antismash_path, new_input_path, 
        '--geneclustertypes %s' % args.geneclustertypes, 
        taxon, 
        clusterblast, 
        smcogs, 
        fullhmm,
        fullblast,
        '--blastdbpath %s' % blastdbpath,
        '--pfamdbpath %s' % pfamdbpath,
        '--cores 10',
        ]
    if args.iformat == 'fasta':
        h.append('--glimmer_prediction %s' % glimmer_path)

    a = ' '.join(h)

    subprocess.call(a, shell=True)
    print a
    embl_path = os.path.join(result_path, '%s.final.embl' % genomename)
    if os.path.exists(embl_path) and args.embl_path:
        args.embl_path.write( open(embl_path).read() )

    gff_line = "%s\tantismash\t%s\t%s\t%s\t.\t%s\t.\t%s"

    clustername_mapping = {}
    genecluster_path = os.path.join(result_path, 'clusterblast/geneclusters.txt')
    if os.path.exists(genecluster_path):
        for line in open( genecluster_path):
            token = line.split('\t')
            clustername_mapping[token[2]] = token[3]
        old_cluster_name = False
        parent_cluster_name = ""
        child_lines = ""
        starts = []
        ends = []
        for line in open( os.path.join(result_path, 'clusterblast/geneclusterprots.fasta') ):
            if line.startswith('>'):
                columns = line[1:].strip().split('|')
                args.geneclusterprots.write( line.replace('|%s|' % columns[1], '|%s|%s|' % (columns[1],clustername_mapping[columns[1]])) )

                """
                "Handlampe" kann man zu einer Person sagen wenn es nicht mehr zum Armleuchter reicht. Es ist also ein abgeschwächte Form für "Depp".
                """
                if old_cluster_name == False:
                    # inital clustername setting
                    # try to catch the line in which the name changes
                    old_cluster_name = columns[1]
                    parent_cluster_name = columns[1]

                seqname = columns[0]
                strand = columns[3]
                (start, end) = columns[2].split('-')
                starts.append(int(start))
                ends.append(int(end))

                if old_cluster_name != columns[1]:
                    # begin of a new cluster, caclulate the parent line for the previous clusterproteins
                    # TODO: maybe the assumption is not correct, to take the smallest start and the largest end
                    starts.sort()
                    ends.sort()
                    genecluster_start = starts[0]
                    genecluster_end = ends[-1]
                    group = 'ID=%s;Name=%s;Clustertype=%s;Note=%s\n' % (parent_cluster_name, columns[4], clustername_mapping[columns[1]], columns[5])
                    oline = gff_line % (
                            seqname, 'genecluster', genecluster_start, genecluster_end, strand, group
                    )
                    args.geneclusterprots_gff.write(oline + child_lines)
                    starts = []
                    ends = []
                    old_cluster_name = columns[1]
                    parent_cluster_name = columns[1]
                    child_lines = ''

                group = 'ID=%s;Parent=%s;Name=%s;Clustertype=%s;Note=%s\n' % (columns[4], columns[1], columns[4], clustername_mapping[columns[1]], columns[5])
                child_lines += gff_line % (
                    seqname, 'gene', start, end, strand, group
                )

            else:
                args.geneclusterprots.write( line )

        starts.sort()
        ends.sort()
        genecluster_start = starts[0]
        genecluster_end = ends[-1]
        group = 'ID=%s;Name=%s;Clustertype=%s;Note=%s\n' % (parent_cluster_name, columns[4], clustername_mapping[columns[1]], columns[5])
        oline = gff_line % (
                seqname, 'genecluster', genecluster_start, genecluster_end, strand, group
        )
        args.geneclusterprots_gff.write(oline + child_lines)

    # remove tmp directory
    shutil.rmtree(tmp_dir)


def arg_parse():
    import argparse
    parser = argparse.ArgumentParser(prog = 'antiSMASH-Wrapper')
    parser.add_argument('--version', action='version', version='%(prog)s 0.01')
    parser.add_argument('--geneclustertypes')
    parser.add_argument('--clusterblast', action='store_true')
    parser.add_argument('--eukaryotic', action='store_true')
    parser.add_argument('--fullhmm', action='store_true')
    parser.add_argument('--smcogs', action='store_true')
    parser.add_argument('--fullblast', action='store_true')

    parser.add_argument('--input', '-i', help='FASTA Sequence File')
    parser.add_argument('--glimmer_prediction', help='Glimmer Prediction File')

    parser.add_argument('--input_format', '-f', dest="iformat", help='input format, if the input is a fasta file you need a glimmer file as additional input')

    parser.add_argument('--zip', help='output: all files as zip file')
    parser.add_argument('--html_file', help='output: the path to the index html file')
    parser.add_argument('--html_path', help='output: the path to the output html dir')
    parser.add_argument('--embl_path', help='output: the path to the embl output file', type=argparse.FileType('w'))
    parser.add_argument('--geneclusterprots', help='output: Genecluster Fasta File', type=argparse.FileType('w'))
    parser.add_argument('--geneclusterprots_gff', help='output: Genecluster GFF', type=argparse.FileType('w'))


    args = parser.parse_args()
    return args


def extract_glimmerHMM_results(sequence_id, glimmerHMM):
    """
        Returns the given annoations from a sequence_id as complete GFF3 formated string.
    """
    result = "##gff-version 3\n"
    found_annoations = False
    for line in open(glimmerHMM):
        if line.startswith(sequence_id + '\t'):
            result += line
            found_annoations = True
    if found_annoations:
        return result
    else:
        return False

def extract_glimmer_results(sequence_id, glimmer):
    """
        Returns the given annoations from a sequence_id as glimmer result file.
    """
    result = ''
    found_annoations = False
    for line in open(glimmer):
        # that construct matches the pseudo fasta header and the corresponding orfs
        if line.split()[0].startswith(sequence_id):
            result += line
            found_annoations = True
    if found_annoations:
        return result
    else:
        return False


if __name__ == '__main__':
    args = arg_parse()
    geneclusterprots = ""
    temp_dir = tempfile.mkdtemp()
    args.geneclusterprots_gff.write("##gff-version 3\n")



    for record in SeqIO.parse(open(args.input, "rU"), args.iformat) :
        # create two tem files and fill it with the information from one sequence
        record.id = record.id or record.name
        tmp_sequence = os.path.join(temp_dir, record.id) #fasta -file
        SeqIO.write(record, open(tmp_sequence, 'w+'), args.iformat)

        tmp_glimmer = ''
        if args.iformat == 'fasta':
            # We need to annotate the sequences manually
            if args.eukaryotic:
                #print os.path.join(temp_dir, record.id + '.gff')
                tmp_glimmer = os.path.join(temp_dir, record.id + '.gff')
                annotations = extract_glimmerHMM_results(record.id, args.glimmer_prediction)
                #print annotations, ':', record.id, args.glimmer_prediction
                if not annotations:
                    continue
                open(tmp_glimmer, 'w+').write( annotations )
            else:
                tmp_glimmer = os.path.join(temp_dir, record.id + '.tsv')
                annotations = extract_glimmer_results(record.id, args.glimmer_prediction)
                if not annotations:
                    continue
                open(tmp_glimmer, 'w+').write( annotations )

        anitSMASH(args, tmp_sequence, tmp_glimmer)
    shutil.rmtree(temp_dir)