Mercurial > repos > rreumerman > snptools
comparison tablemerger.py @ 4:bd5692103d5b draft
Uploaded
author | rreumerman |
---|---|
date | Fri, 05 Apr 2013 05:00:40 -0400 |
parents | |
children | 8de0ffc2166f |
comparison
equal
deleted
inserted
replaced
3:a808647d7312 | 4:bd5692103d5b |
---|---|
1 '''Takes tab-delimited SNP tables from user input and merges them into one.''' | |
2 | |
3 import sys | |
4 files = [] | |
5 filenames = [] | |
6 | |
7 try: | |
8 output = open(sys.argv[1], "w") | |
9 output.write('Position\tReference') | |
10 except: | |
11 exit("No output file given or unable to open output file.") | |
12 for name in sys.argv[2:]: | |
13 try: | |
14 files.append(open(name, "rU")) | |
15 except: | |
16 continue | |
17 | |
18 # Fetch headers and print them to output file; | |
19 headers = [header.readline()[:-1].split('\t')[2:] for header in files] | |
20 columns = [len(strains) for strains in headers] | |
21 for strain in [a for b in headers for a in b]: | |
22 output.write('\t'+strain) | |
23 output.flush() | |
24 | |
25 file_active = [True]*len(files) | |
26 snps = [row.readline()[:-1].split('\t') for row in files] | |
27 | |
28 while True in file_active: | |
29 for h in range(0,len(snps)): | |
30 if file_active[h]: | |
31 cur_pos = [h] | |
32 lowest = int(snps[h][0]) | |
33 break | |
34 i = 1 | |
35 | |
36 # Determine lowest position | |
37 while i < len(snps): | |
38 if int(snps[i][0]) < lowest and file_active[i]: | |
39 lowest = int(snps[i][0]) | |
40 cur_pos = [i] | |
41 elif int(snps[i][0]) == lowest: | |
42 cur_pos.append(i) | |
43 i+=1 | |
44 | |
45 # Check if all SNPs sharing a position have the same reference base, exit with message otherwise; | |
46 if len(cur_pos) > 1: | |
47 ref_base = snps[cur_pos[0]][1].lower() | |
48 for j in cur_pos[1:]: | |
49 if snps[j][1].lower() != ref_base: | |
50 error = '\nError: Reference bases not the same for position %s, present in following files:' % lowest | |
51 for k in cur_pos: | |
52 error += ' '+filenames[k] | |
53 exit(error+'.') | |
54 | |
55 # Write line to output file containing position, ref base and snps/empty cells; | |
56 output.write('\n'+snps[cur_pos[0]][0]+'\t'+snps[cur_pos[0]][1].lower()) | |
57 for l,row in enumerate(snps): | |
58 if l in cur_pos: | |
59 for snp in row[2:]: | |
60 output.write('\t'+snp) | |
61 else: | |
62 output.write('\t'*columns[l]) | |
63 | |
64 # Read new line in files that had snp at current position; | |
65 for m in cur_pos: | |
66 line = files[m].readline() | |
67 if line == '': file_active[m] = False | |
68 else: | |
69 snps[m] = line.split('\t') | |
70 snps[m][-1] = snps[m][-1].rstrip()# Remove newline character at end of line; | |
71 | |
72 for it in files: it.close() | |
73 output.close() |