view blastx_to_scaffold.py @ 1:940c0c669e96 draft default tip

planemo upload for repository https://github.com/ARTbio/tools-artbio/tree/master/tools/blastx_to_scaffold commit b6de14061c479f0418cd89e26d6f5ac26e565a07
author drosofff
date Wed, 09 Nov 2016 11:39:20 -0500
parents a2e034f1638e
children
line wrap: on
line source

#!/usr/bin/python
import sys
import argparse

def insert_newlines(string, every=60):
    lines = []
    for i in xrange(0, len(string), every):
        lines.append(string[i:i+every])
    return '\n'.join(lines)
    
def Parser():
    the_parser = argparse.ArgumentParser(
        description="Generate DNA scaffold from blastx alignment of Contigs")
    the_parser.add_argument(
        '--sequences', action="store", type=str, help="input sequence file in fasta format")
    the_parser.add_argument(
        '--blastx-tab', dest="blastx_tab", action="store", type=str, help="13-columns tabular blastx output")
    the_parser.add_argument(
        '--output', action="store", type=str, help="output file path, fasta format")
    args = the_parser.parse_args()
    return args

def __main__():
    args = Parser()
    protLenght = int (open (args.blastx_tab, "r").readline().split("\t")[12])
    BlastxOutput = open (args.blastx_tab, "r")
    Contigs = open (args.sequences, "r")
    ContigsDict = {}
    protScaffold = {}
    
    for line in Contigs:
        if line[0] == ">":
            header = line[1:-1]
            ContigsDict[header] = ""
        else:
            ContigsDict[header] += line[:-1]
            
    protScaffold = dict ( [(i,"NNN") for i in range (1, protLenght+1)] )
        
    for line in BlastxOutput:
        fields = line[:-1].split("\t")
        queryStart = int(fields[6])
        queryStop = int(fields[7])
        subjectStart = int(fields[8])
        subjectStop = int(fields[9])
        seqHeader = fields[0]
        sequence = ContigsDict[seqHeader]
        for i in range (subjectStart, subjectStop):
            del protScaffold[i]
        protScaffold[subjectStop] = ContigsDict[seqHeader][queryStart -1: queryStop]
        
    finalSeqList = []
    for i in sorted (protScaffold):
        finalSeqList.append(protScaffold[i])
    finalSequence = insert_newlines("".join(finalSeqList))
     
    Out = open (args.output, "w")
    print >> Out, ">Scaffold"
    print >> Out, finalSequence
            
    BlastxOutput.close()
    Contigs.close()
    Out.close()
        
if __name__ == "__main__":
    __main__()