annotate snpsplit.py @ 4:bd5692103d5b draft

Uploaded
author rreumerman
date Fri, 05 Apr 2013 05:00:40 -0400
parents
children
Ignore whitespace changes - Everywhere: Within whitespace: At end of lines:
rev   line source
4
bd5692103d5b Uploaded
rreumerman
parents:
diff changeset
1 '''This script takes a tab-delimited file containing position, ref base and mut base, splits any multi-character ref or mut base entries into separate lines, and calculates the new positions.'''
bd5692103d5b Uploaded
rreumerman
parents:
diff changeset
2
bd5692103d5b Uploaded
rreumerman
parents:
diff changeset
3 import sys
bd5692103d5b Uploaded
rreumerman
parents:
diff changeset
4
bd5692103d5b Uploaded
rreumerman
parents:
diff changeset
# Command-line handling: the script needs exactly an input path and an
# output path (sys.argv[0] is the script name itself).
if len(sys.argv) != 3:
    # sys.exit is used instead of the bare `exit` helper, which is injected
    # by the `site` module and is not guaranteed to exist (e.g. `python -S`).
    sys.exit("snpsplit takes exactly two arguments (input and output file), no more and no less")

input_name = sys.argv[1]
output_name = sys.argv[2]

# Open both files up front so that a bad path is reported before any
# processing starts. The handles are module-level names used by the rest
# of the script.
try:
    in_file = open(input_name)
except IOError as e:
    sys.exit("Error trying to open '%s': %s" % (input_name, e.strerror))

try:
    out_file = open(output_name, 'w')
except IOError as e:
    # Do not leave the already-opened input handle dangling on the way out.
    in_file.close()
    sys.exit("Error trying to open '%s': %s" % (output_name, e.strerror))
bd5692103d5b Uploaded
rreumerman
parents:
diff changeset
20
bd5692103d5b Uploaded
rreumerman
parents:
diff changeset
def splitter(cells):
    """Write one output row per differing base of a SNP/MNP record.

    ``cells`` is a sequence of strings: ``cells[0]`` is the position of the
    first base, ``cells[1]`` the reference bases and ``cells[2]`` the mutated
    bases. The two base strings are walked in lockstep; for every index where
    they differ, a ``position<TAB>ref<TAB>mut`` line is written to the
    module-level ``out_file``, with the position advanced by that index.
    Bases that are identical in both strings produce no output.

    Side effects: writes to the global ``out_file`` and increments the global
    ``out_lines`` counter once per line written.

    Raises ``ValueError`` if ``cells[0]`` is not an integer string and at
    least one differing base has to be written.
    """
    global out_lines
    # zip() pairs the bases up and stops at the shorter string, so a mut
    # entry shorter than the ref entry can no longer raise IndexError.
    for offset, (ref_base, mut_base) in enumerate(zip(cells[1], cells[2])):
        if ref_base == mut_base:
            continue
        out_file.write('%d\t%s\t%s\n' % (int(cells[0]) + offset, ref_base, mut_base))
        out_lines += 1
bd5692103d5b Uploaded
rreumerman
parents:
diff changeset
27
bd5692103d5b Uploaded
rreumerman
parents:
diff changeset
# Counters reported at the end: in_lines counts every line read from the
# input; out_lines is incremented by splitter() for every row it writes.
in_lines = out_lines = 0

try:
    out_file.write("Position\tRef\tMut\n")
    for line in in_file:
        in_lines += 1

        # Lines that do not start with a digit are copied through verbatim
        # and are not counted in out_lines.
        # NOTE(review): if the input already carries a header row, the output
        # ends up with two header lines - confirm whether that is intended.
        if not line[:1].isdigit():
            out_file.write(line)
            continue

        cells = line.rstrip().split('\t')

        # A data row needs position, ref and mut. rstrip() above also removes
        # trailing tabs, so a row whose last field is empty arrives here with
        # fewer than three cells; skip it instead of failing with IndexError.
        if len(cells) < 3:
            continue

        # Can only deal with SNPs/MNPs, not indels.
        if len(cells[1]) != len(cells[2]):
            continue
        splitter(cells)
finally:
    # Close both handles even when a malformed row raises part-way through.
    in_file.close()
    out_file.close()

# Parenthesised single-argument form prints the same text under both
# Python 2 (print statement) and Python 3 (print function).
print("Lines read: %s" % in_lines)
print("Lines printed: %s" % out_lines)