#!/usr/bin/env python
# Guruprasad Ananda
"""
This tool provides the UNIX "join" functionality.
"""
|
|
6 import sys, os, tempfile, subprocess
|
|
7
|
|
def stop_err(msg):
    """Write *msg* to standard error and terminate the script.

    The process exits with status 1 so that callers (shells, workflow
    managers) can detect the failure from the exit code as well as from
    the message on stderr.
    """
    sys.stderr.write(msg)
    sys.exit(1)
|
|
11
|
|
def _sort_by_field(src, dest, field):
    """Sort tab-delimited file *src* on column *field* (1-based) into *dest*.

    Raises OSError if ``sort`` cannot be started and
    subprocess.CalledProcessError if it exits with a non-zero status.
    """
    key = "%d,%d" % (field, field)
    # An argument list (no shell) keeps paths containing spaces or shell
    # metacharacters intact.
    subprocess.check_call(["sort", "-t", "\t", "-k", key, "-o", dest, src])


def _first_file_format(path):
    """Return the ``join -o`` spec ("1.1,1.2,...") covering every column of *path*.

    The column count is taken from the first non-blank line. An empty
    string is returned when the file contains no non-blank line.
    """
    # Binary mode: only tab bytes are counted, so the data's text encoding
    # is irrelevant.
    with open(path, "rb") as handle:
        for line in handle:
            if not line.strip():
                continue
            # Remove only the line terminator; stripping all whitespace
            # would discard leading/trailing empty columns.
            ncols = line.rstrip(b"\r\n").count(b"\t") + 1
            return ",".join("1.%d" % idx for idx in range(1, ncols + 1))
    return ""


def _join_order_flags():
    """Return ["--nocheck-order"] when ``join --version`` reports major >= 7.

    ``join`` implementations without ``--version`` (e.g. BSD join) exit
    non-zero for that option; in that case, or when the version cannot be
    parsed, an empty list is returned.
    """
    try:
        proc = subprocess.Popen(
            ["join", "--version"],
            stdout=subprocess.PIPE,
            stderr=subprocess.PIPE,
            universal_newlines=True,
        )
        stdout, _ = proc.communicate()
    except OSError:
        return []
    if proc.returncode != 0:
        return []
    try:
        # First line ends with the version, e.g. "... 8.32"; only the major
        # component matters, so versions like "5.2.1" are handled too.
        major = int(stdout.splitlines()[0].split()[-1].split(".")[0])
    except (IndexError, ValueError):
        return []
    if major >= 7:
        # Newer joins check input order and complain about duplicated
        # keys; skip that check.
        return ["--nocheck-order"]
    return []


def main():
    """Join two tab-delimited files on the given columns using sort/join.

    Command line arguments (sys.argv[1:7]):
        infile1, infile2  -- paths of the two input files
        field1, field2    -- 1-based join column in each input
        mode              -- "V" passes ``-v 1`` to join; anything else
                             performs a regular join
        outfile           -- path the join output is written to

    Only the columns of the first input are written (``-o 1.1,1.2,...``).
    Errors are reported through stop_err().
    """
    try:
        infile1 = sys.argv[1]
        infile2 = sys.argv[2]
        field1 = int(sys.argv[3])
        field2 = int(sys.argv[4])
        mode = sys.argv[5]
        outfile = sys.argv[6]
    except (IndexError, ValueError) as exc:
        stop_err('Invalid command line arguments -> %s' % str(exc))

    tmpfile1 = tempfile.NamedTemporaryFile()
    tmpfile2 = tempfile.NamedTemporaryFile()
    try:
        try:
            # Sort the two files based on the specified fields.
            _sort_by_field(infile1, tmpfile1.name, field1)
            _sort_by_field(infile2, tmpfile2.name, field2)
            option = _first_file_format(tmpfile1.name)
        except (EnvironmentError, subprocess.CalledProcessError) as exc:
            stop_err('Initialization error -> %s' % str(exc))

        # The separator is a tab, consistent with the tab-based column
        # count used to build the -o spec.
        cmd = ["join"] + _join_order_flags() + ["-t", "\t"]
        if mode == "V":
            cmd += ["-v", "1"]
        if option:
            # An empty first file yields no spec; omit -o rather than pass
            # it an empty argument.
            cmd += ["-o", option]
        cmd += ["-1", str(field1), "-2", str(field2),
                tmpfile1.name, tmpfile2.name]

        try:
            with open(outfile, "w") as out:
                subprocess.check_call(cmd, stdout=out)
        except (EnvironmentError, subprocess.CalledProcessError) as exj:
            stop_err('Error joining the two datasets -> %s' % str(exj))
    finally:
        # Closing removes the temporary files.
        tmpfile1.close()
        tmpfile2.close()
|
|
69
|
|
# Run the join only when this file is executed as a script, not on import.
if __name__ == '__main__':
    main()
|