view commons/core/parsing/PalsToAlign.py @ 53:47310c4fb725

Uploaded
author m-zytnicki
date Fri, 10 Jan 2014 08:57:02 -0500
parents 769e306b7933
children
line wrap: on
line source

import time
import os

class PalsToAlign(object):
    """
    Convert the output from PALS (GFF2 format) into the 'align' format.

    Each input line must hold at least nine tab-separated columns. The
    columns used are: query name (1), query start (4), query end (5),
    score (6), strand (7) and the attributes (9). The attributes column is
    split on whitespace: the second, third and fourth tokens are the subject
    name, start and end (a trailing ';' on the end is dropped) and the last
    token is a fraction turned into an identity percentage with
    (1 - value) * 100.
    NOTE(review): this matches attributes shaped like
    'Target <name> <start> <end>; maxe <value>' -- confirm against PALS.

    Each output line holds nine tab-separated columns: query name, query
    start, query end, subject name, subject start, subject end, the constant
    "0.0", score and identity percentage.
    """

    def __init__(self, inputPalsFileName="", outputAlignFileName="", removeSameSequences=False):
        """
        @param inputPalsFileName: path of the PALS file to read
        @param outputAlignFileName: path of the sorted 'align' file to write
        @param removeSameSequences: if True, drop the matches between two
            'chunk' sequences that both start at position 1 with 100% identity
        """
        self._removeSameSequences = removeSameSequences
        self._inputPalsFileName = inputPalsFileName
        self._outputAlignFileName = outputAlignFileName

    def run(self):
        """
        Convert the input file, then sort the result into the output file.

        The converted lines are first written to a temporary file named
        'PalsToAlign<pid>' in the current working directory. That file is
        sorted with the external 'sort' program and is always removed
        afterwards, even when the conversion or the sort fails.

        @raise IOError: if a file cannot be opened
        @raise IndexError: if a non-blank line has fewer columns than expected
        @raise ValueError: if a coordinate or the last attribute is not a number
        @raise subprocess.CalledProcessError: if 'sort' exits with an error
        """
        # Imported here so that the class does not depend on a file-level import.
        import subprocess

        tmpFileName = "PalsToAlign%s" % str(os.getpid())
        try:
            with open(self._inputPalsFileName, "r") as palsFile:
                with open(tmpFileName, "w") as tmpFile:
                    for line in palsFile:
                        alignLine = self._convertLine(line)
                        if alignLine is not None:
                            tmpFile.write(alignLine)

            # The file names are passed as arguments (no shell involved), so
            # names with spaces or shell metacharacters are handled safely.
            sortCommand = ["sort",
                           "-k", "1,1", "-k", "4,4",
                           "-k", "2,2n", "-k", "3,3n",
                           "-k", "5,5n", "-k", "6,6n",
                           "-k", "8,8n",
                           tmpFileName]
            with open(self._outputAlignFileName, "w") as alignFile:
                subprocess.check_call(sortCommand, stdout=alignFile)
        finally:
            if os.path.exists(tmpFileName):
                os.remove(tmpFileName)

    def _convertLine(self, line):
        """
        Convert one PALS line into one 'align' line.

        @param line: a line of the PALS file, with or without its end of line
        @return: the 'align' line (ending with a newline), or None if the line
            is blank or is filtered out by the 'removeSameSequences' option
        """
        # Strip only the end of line: the last input line may have none, and
        # cutting one character blindly would damage its last value.
        line = line.rstrip("\r\n")
        if not line.strip():
            return None

        data = line.split("\t")
        qryName = data[0]
        qryStart = data[3]
        qryEnd = data[4]
        score = data[5]
        strand = data[6]
        attributes = data[8].split()

        sbjName = attributes[1]
        sbjStart = attributes[2]
        sbjEnd = attributes[3].rstrip(";")
        percId = (1 - float(attributes[-1])) * 100.0

        # Any strand other than "+" is written with reversed subject coordinates.
        if strand != "+":
            sbjStart, sbjEnd = sbjEnd, sbjStart

        # Only the names ('chunk' in both), the lowest coordinates (1 on both
        # sides) and the identity are tested here: the two names themselves
        # are not compared.
        if self._removeSameSequences \
        and "chunk" in qryName and "chunk" in sbjName \
        and min(int(qryStart), int(qryEnd)) == 1 \
        and min(int(sbjStart), int(sbjEnd)) == 1 \
        and percId == 100.0:
            return None

        # The coordinates are compared as numbers: as strings, "99" would be
        # greater than "100". If the query is not in increasing order, both
        # the query and the subject coordinates are exchanged.
        if int(qryStart) >= int(qryEnd):
            qryStart, qryEnd = qryEnd, qryStart
            sbjStart, sbjEnd = sbjEnd, sbjStart

        return "%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\n" % (qryName, qryStart, qryEnd, sbjName, sbjStart, sbjEnd, "0.0", score, percId)