4
|
1 '''Takes tab-delimited SNP tables from user input and merges them into one.'''
|
|
2
|
|
3 import sys
|
|
4 files = []
|
|
5 filenames = []
|
|
6
|
|
7 try:
|
|
8 output = open(sys.argv[1], "w")
|
|
9 output.write('Position\tReference')
|
|
10 except:
|
|
11 exit("No output file given or unable to open output file.")
|
|
12 for name in sys.argv[2:]:
|
|
13 try:
|
|
14 files.append(open(name, "rU"))
|
|
15 except:
|
|
16 continue
|
|
17
|
|
18 # Fetch headers and print them to output file;
|
|
19 headers = [header.readline()[:-1].split('\t')[2:] for header in files]
|
|
20 columns = [len(strains) for strains in headers]
|
|
21 for strain in [a for b in headers for a in b]:
|
|
22 output.write('\t'+strain)
|
|
23 output.flush()
|
|
24
|
|
25 file_active = [True]*len(files)
|
|
26 snps = [row.readline()[:-1].split('\t') for row in files]
|
|
27
|
|
28 while True in file_active:
|
|
29 for h in range(0,len(snps)):
|
|
30 if file_active[h]:
|
|
31 cur_pos = [h]
|
|
32 lowest = int(snps[h][0])
|
|
33 break
|
|
34 i = 1
|
|
35
|
|
36 # Determine lowest position
|
|
37 while i < len(snps):
|
|
38 if int(snps[i][0]) < lowest and file_active[i]:
|
|
39 lowest = int(snps[i][0])
|
|
40 cur_pos = [i]
|
|
41 elif int(snps[i][0]) == lowest:
|
|
42 cur_pos.append(i)
|
|
43 i+=1
|
|
44
|
|
45 # Check if all SNPs sharing a position have the same reference base, exit with message otherwise;
|
|
46 if len(cur_pos) > 1:
|
|
47 ref_base = snps[cur_pos[0]][1].lower()
|
|
48 for j in cur_pos[1:]:
|
|
49 if snps[j][1].lower() != ref_base:
|
|
50 error = '\nError: Reference bases not the same for position %s, present in following files:' % lowest
|
|
51 for k in cur_pos:
|
|
52 error += ' '+filenames[k]
|
|
53 exit(error+'.')
|
|
54
|
|
55 # Write line to output file containing position, ref base and snps/empty cells;
|
|
56 output.write('\n'+snps[cur_pos[0]][0]+'\t'+snps[cur_pos[0]][1].lower())
|
|
57 for l,row in enumerate(snps):
|
|
58 if l in cur_pos:
|
|
59 for snp in row[2:]:
|
|
60 output.write('\t'+snp)
|
|
61 else:
|
|
62 output.write('\t'*columns[l])
|
|
63
|
|
64 # Read new line in files that had snp at current position;
|
|
65 for m in cur_pos:
|
|
66 line = files[m].readline()
|
|
67 if line == '': file_active[m] = False
|
|
68 else:
|
|
69 snps[m] = line.split('\t')
|
|
70 snps[m][-1] = snps[m][-1].rstrip()# Remove newline character at end of line;
|
|
71
|
|
72 for it in files: it.close()
|
|
73 output.close()
|