view tools/regVariation/substitutions.py @ 0:9071e359b9a3

Uploaded
author xuebing
date Fri, 09 Mar 2012 19:37:19 -0500
parents
children
line wrap: on
line source

#!/usr/bin/env python
#Guruprasad ANanda
"""
Fetches substitutions from pairwise alignments.
"""

from galaxy import eggs

from galaxy.tools.util import maf_utilities

import bx.align.maf
import sys
import os, fileinput
def stop_err(msg):
    sys.stderr.write(msg)
    sys.exit()

if len(sys.argv) < 3:
        stop_err("Incorrect number of arguments.")    
    
inp_file = sys.argv[1]
out_file = sys.argv[2]
fout = open(out_file, 'w')

def fetchSubs(block):
    
    src1 = block.components[0].src
    sequence1 = block.components[0].text
    start1 = block.components[0].start
    end1 = block.components[0].end
    len1 = int(end1)-int(start1)
    len1_withgap = len(sequence1)
    
    for seq in range (1,len(block.components)):
        src2 = block.components[seq].src
        sequence2 = block.components[seq].text
        start2 = block.components[seq].start
        end2 = block.components[seq].end
        len2 = int(end2)-int(start2)
        sub_begin = None
        sub_end = None
        begin = False
        
        for nt in range(len1_withgap):
            if sequence1[nt] not in '-#$^*?' and sequence2[nt] not in '-#$^*?': #Not a gap or masked character
                if sequence1[nt].upper() != sequence2[nt].upper():
                    if not(begin):
                        sub_begin = nt
                        begin = True
                    sub_end = nt
                else:
                    if begin:
                        print >>fout, "%s\t%s\t%s" %(src1,start1+sub_begin-sequence1[0:sub_begin].count('-'),start1+sub_end-sequence1[0:sub_end].count('-'))
                        print >>fout, "%s\t%s\t%s" %(src2,start2+sub_begin-sequence2[0:sub_begin].count('-'),start2+sub_end-sequence2[0:sub_end].count('-'))    
                        begin = False

            else:
                if begin:
                    print >>fout, "%s\t%s\t%s" %(src1,start1+sub_begin-sequence1[0:sub_begin].count('-'),end1+sub_end-sequence1[0:sub_end].count('-'))
                    print >>fout, "%s\t%s\t%s" %(src2,start2+sub_begin-sequence2[0:sub_begin].count('-'),end2+sub_end-sequence2[0:sub_end].count('-'))    
                    begin = False
                    ended = False
    
              
def main():
    skipped = 0
    not_pairwise = 0
    try:
        maf_reader = bx.align.maf.Reader( open(inp_file, 'r') )
    except:
        stop_err("Your MAF file appears to be malformed.")
    print >>fout, "#Chr\tStart\tEnd"
    for block in maf_reader:
        if len(block.components) != 2:
            not_pairwise += 1
            continue
        try:
            fetchSubs(block)
        except:
            skipped += 1
    
    if not_pairwise:
        print "Skipped %d non-pairwise blocks" %(not_pairwise)
    if skipped:
        print "Skipped %d blocks" %(skipped)
if __name__ == "__main__":
    main()