annotate tools/filters/joinWrapper.py @ 1:cdcb0ce84a1b

Uploaded
author xuebing
date Fri, 09 Mar 2012 19:45:15 -0500
parents 9071e359b9a3
children
Ignore whitespace changes - Everywhere: Within whitespace: At end of lines:
rev   line source
0
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
1 #!/usr/bin/env python
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
2 #Guruprasad Ananda
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
3 """
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
4 This tool provides the UNIX "join" functionality.
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
5 """
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
6 import sys, os, tempfile, subprocess
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
7
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
def stop_err(msg):
    """Write *msg* to standard error and terminate the script.

    The process exits with status 1 so that callers inspecting the exit
    code (and not only stderr) see the run as failed.

    Raises:
        SystemExit: always.
    """
    sys.stderr.write(msg)
    sys.exit(1)
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
11
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
def _sort_by_field(infile, field, sorted_path):
    """Sort tab-separated *infile* on column *field* into *sorted_path*.

    Returns the exit status of the ``sort`` command.
    """
    key = "%d,%d" % (field, field)
    # Argument list (no shell): paths with spaces or metacharacters are safe.
    return subprocess.call(
        ["sort", "-t", "\t", "-k", key, "-o", sorted_path, infile]
    )


def _output_columns(sorted_path):
    """Build join's ``-o`` spec ("1.1,1.2,...") from the first non-blank row.

    Returns an empty string when the file has no non-blank row.
    """
    with open(sorted_path, "rb") as handle:
        for raw in handle:
            if not raw.strip():
                continue
            # Strip only the line terminator: stripping all whitespace would
            # drop empty leading/trailing columns and undercount the fields.
            ncols = len(raw.rstrip(b"\r\n").split(b"\t"))
            return ",".join("1.%d" % col for col in range(1, ncols + 1))
    return ""


def _join_flags():
    """Return the extra flags to pass to ``join``.

    BSD join does not support ``--version`` (non-zero exit status), while GNU
    join does (exit status 0). For GNU join version 7 or later the order
    check is disabled, since join would otherwise raise an error on
    duplicated items in the two files being joined.
    """
    try:
        proc = subprocess.Popen(
            ["join", "--version"],
            stdout=subprocess.PIPE,
            stderr=subprocess.PIPE,
        )
        stdout, _ = proc.communicate()
    except OSError:
        return []
    if proc.returncode != 0:
        return []
    lines = stdout.decode("utf-8", "replace").splitlines()
    if not lines or not lines[0].split():
        return []
    # Only the major component matters; versions may have 2, 3 or more parts.
    major = lines[0].split()[-1].split(".")[0]
    try:
        if int(major) >= 7:
            return ["--nocheck-order"]
    except ValueError:
        pass
    return []


def main():
    """Join two tab-separated files on the given columns using UNIX ``join``.

    Command-line arguments (``sys.argv``):
        1: path of the first input file
        2: path of the second input file
        3: 1-based join column in the first file
        4: 1-based join column in the second file
        5: mode; "V" passes ``-v 1`` to join (unpairable lines of the first
           file), any other value performs a regular join
        6: path of the output file

    Only the columns of the first file are written to the output. Failures
    of ``sort`` or ``join`` are reported through ``stop_err``.
    """
    infile1 = sys.argv[1]
    infile2 = sys.argv[2]
    field1 = int(sys.argv[3])
    field2 = int(sys.argv[4])
    mode = sys.argv[5]
    outfile = sys.argv[6]

    tmpfile1 = tempfile.NamedTemporaryFile()
    tmpfile2 = tempfile.NamedTemporaryFile()

    try:
        # Sort the two files based on the specified fields.
        try:
            jobs = (
                (infile1, field1, tmpfile1.name),
                (infile2, field2, tmpfile2.name),
            )
            for src, field, dest in jobs:
                status = _sort_by_field(src, field, dest)
                if status != 0:
                    stop_err(
                        'Initialization error -> sort exited with status %d'
                        % status
                    )
        except OSError as exc:
            stop_err('Initialization error -> %s' % str(exc))

        option = _output_columns(tmpfile1.name)

        cmd = ["join"] + _join_flags() + ["-t", "\t"]
        if mode == "V":
            cmd += ["-v", "1"]
        if option:
            # An empty spec would make join consume the next flag as the
            # format, so -o is only passed when there are columns to list.
            cmd += ["-o", option]
        cmd += [
            "-1", str(field1),
            "-2", str(field2),
            tmpfile1.name,
            tmpfile2.name,
        ]

        try:
            with open(outfile, "wb") as out:
                status = subprocess.call(cmd, stdout=out)
        except (OSError, IOError) as exj:
            stop_err('Error joining the two datasets -> %s' % str(exj))
        else:
            if status != 0:
                stop_err(
                    'Error joining the two datasets -> join exited with '
                    'status %d' % status
                )
    finally:
        # Closing a NamedTemporaryFile also removes it from disk.
        tmpfile1.close()
        tmpfile2.close()
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
69
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
# Run the join only when executed as a script, not when imported.
if __name__ == "__main__":
    main()