view snpsplit.py @ 5:b6786c2247b1 draft

Uploaded
author rreumerman
date Fri, 05 Apr 2013 05:05:30 -0400
parents bd5692103d5b
children
line wrap: on
line source

'''This script takes a tab-delimited file containing position, ref base, mut base and splits any multicharacter ref or mut base entries into separate lines, calculating the new positions'''

import sys

# Validate the command line: exactly one input path and one output path.
# sys.exit() is used rather than the bare exit() helper, which is injected by
# the `site` module for interactive sessions and is not guaranteed to exist.
if len(sys.argv) != 3:
    sys.exit("snpsplit takes exactly two arguments (input and output file), no more and no less")

input_name = sys.argv[1]
output_name = sys.argv[2]

# Open the input for reading; on failure exit with a short message (written
# to stderr, exit status 1) instead of a traceback.
try:
    in_file = open(input_name)
except IOError as e:
    sys.exit("Error trying to open '{0}': {1}".format(input_name, e.strerror))

# Open the output for writing; release the already-open input handle before
# bailing out if this fails.
try:
    out_file = open(output_name, 'w')
except IOError as e:
    in_file.close()
    sys.exit("Error trying to open '{0}': {1}".format(output_name, e.strerror))

def splitter(cells):
    """Write one output row per differing base of a multi-base entry.

    cells is a sequence whose first three items are the start position
    (a string holding an integer), the reference bases and the mutated
    bases. For every index at which the two base strings differ, a
    tab-separated row "position+index, ref base, mut base" is written to
    the module-level out_file and the module-level out_lines counter is
    incremented. Indexes where the bases are equal produce no output.
    """
    global out_lines
    for offset, ref_base in enumerate(cells[1]):
        mut_base = cells[2][offset]
        if ref_base != mut_base:
            position = int(cells[0]) + offset
            row = '\t'.join((str(position), ref_base, mut_base))
            out_file.write(row + '\n')
            out_lines += 1

# Counters for the end-of-run summary. out_lines stays at module level because
# splitter() increments it through a `global` declaration.
in_lines = out_lines = 0
try:
    out_file.write("Position\tRef\tMut\n")
    for line in in_file:
        in_lines += 1
        # Lines that do not start with a digit (e.g. headers, blank lines)
        # are copied to the output unchanged.
        if not line[0].isdigit():
            out_file.write(line)
            continue

        cells = line.rstrip().split('\t')
        # A data line needs position, ref and mut columns; report and skip
        # anything shorter instead of failing with an IndexError.
        if len(cells) < 3:
            sys.stderr.write("Skipping line %d: expected position, ref and mut columns\n" % in_lines)
            continue

        # Can only deal with SNPs/MNPs, not indels.
        if len(cells[1]) != len(cells[2]):
            continue
        splitter(cells)
finally:
    # Close both handles even if a line could not be processed.
    in_file.close()
    out_file.close()

# Parenthesised single-argument form prints identically on Python 2 and 3.
print("Lines read: %s" % in_lines)
print("Lines printed: %s" % out_lines)