comparison tools/venn_list/venn_list.py @ 6:ea68a1a4c1d9 draft

v0.0.10 explicit galaxy_sequence_utils dependency etc
author peterjc
date Thu, 02 Feb 2017 11:17:31 -0500
parents 991342eca214
children ee50d9ef9d69
comparison
equal deleted inserted replaced
5:26e35d5133a1 6:ea68a1a4c1d9
9 """ 9 """
10 10
11 11
12 import sys 12 import sys
13 13
14 def sys_exit(msg, error_level=1):
15 """Print error message to stdout and quit with given error level."""
16 sys.stderr.write("%s\n" % msg)
17 sys.exit(error_level)
18
19 try: 14 try:
20 import rpy 15 import rpy
21 except ImportError: 16 except ImportError:
22 sys_exit("Requires the Python library rpy (to call R)") 17 sys.exit("Requires the Python library rpy (to call R)")
23 except RuntimeError, e: 18 except RuntimeError, e:
24 sys_exit("The Python library rpy is not availble for the current R version\n\n%s" % e) 19 sys.exit("The Python library rpy is not availble for the current R version\n\n%s" % e)
25 20
26 try: 21 try:
27 rpy.r.library("limma") 22 rpy.r.library("limma")
28 except: 23 except Exception:
29 sys_exit("Requires the R library limma (for vennDiagram function)") 24 sys.exit("Requires the R library limma (for vennDiagram function)")
30 25
31 26
32 if len(sys.argv)-1 not in [7, 10, 13]: 27 if len(sys.argv) - 1 not in [7, 10, 13]:
33 sys_exit("Expected 7, 10 or 13 arguments (for 1, 2 or 3 sets), not %i" % (len(sys.argv)-1)) 28 sys.exit("Expected 7, 10 or 13 arguments (for 1, 2 or 3 sets), not %i" % (len(sys.argv) - 1))
34 29
35 all_file, all_type, all_label = sys.argv[1:4] 30 all_file, all_type, all_label = sys.argv[1:4]
36 set_data = [] 31 set_data = []
37 if len(sys.argv)-1 >= 7: 32 if len(sys.argv) - 1 >= 7:
38 set_data.append(tuple(sys.argv[4:7])) 33 set_data.append(tuple(sys.argv[4:7]))
39 if len(sys.argv)-1 >= 10: 34 if len(sys.argv) - 1 >= 10:
40 set_data.append(tuple(sys.argv[7:10])) 35 set_data.append(tuple(sys.argv[7:10]))
41 if len(sys.argv)-1 >= 13: 36 if len(sys.argv) - 1 >= 13:
42 set_data.append(tuple(sys.argv[10:13])) 37 set_data.append(tuple(sys.argv[10:13]))
43 pdf_file = sys.argv[-1] 38 pdf_file = sys.argv[-1]
44 n = len(set_data) 39 n = len(set_data)
45 print "Doing %i-way Venn Diagram" % n 40 print "Doing %i-way Venn Diagram" % n
46 41
42
47 def load_ids(filename, filetype): 43 def load_ids(filename, filetype):
48 if filetype=="tabular": 44 if filetype == "tabular":
49 for line in open(filename): 45 for line in open(filename):
50 line = line.rstrip("\n") 46 line = line.rstrip("\n")
51 if line and not line.startswith("#"): 47 if line and not line.startswith("#"):
52 yield line.split("\t",1)[0] 48 yield line.split("\t", 1)[0]
53 elif filetype=="fasta": 49 elif filetype == "fasta":
54 for line in open(filename): 50 for line in open(filename):
55 if line.startswith(">"): 51 if line.startswith(">"):
56 yield line[1:].rstrip("\n").split(None,1)[0] 52 yield line[1:].rstrip("\n").split(None, 1)[0]
57 elif filetype.startswith("fastq"): 53 elif filetype.startswith("fastq"):
58 #Use the Galaxy library not Biopython to cope with CS 54 # Use the Galaxy library not Biopython to cope with CS
59 from galaxy_utils.sequence.fastq import fastqReader 55 from galaxy_utils.sequence.fastq import fastqReader
60 handle = open(filename, "rU") 56 handle = open(filename, "rU")
61 for record in fastqReader(handle): 57 for record in fastqReader(handle):
62 #The [1:] is because the fastaReader leaves the @ on the identifer. 58 # The [1:] is because the fastaReader leaves the @ on the identifer.
63 yield record.identifier.split()[0][1:] 59 yield record.identifier.split()[0][1:]
64 handle.close() 60 handle.close()
65 elif filetype=="sff": 61 elif filetype == "sff":
66 try: 62 try:
67 from Bio.SeqIO import index 63 from Bio.SeqIO import index
68 except ImportError: 64 except ImportError:
69 sys_exit("Require Biopython 1.54 or later (to read SFF files)") 65 sys.exit("Require Biopython 1.54 or later (to read SFF files)")
70 #This will read the SFF index block if present (very fast) 66 # This will read the SFF index block if present (very fast)
71 for name in index(filename, "sff"): 67 for name in index(filename, "sff"):
72 yield name 68 yield name
73 else: 69 else:
74 sys_exit("Unexpected file type %s" % filetype) 70 sys.exit("Unexpected file type %s" % filetype)
71
75 72
76 def load_ids_whitelist(filename, filetype, whitelist): 73 def load_ids_whitelist(filename, filetype, whitelist):
77 for name in load_ids(filename, filetype): 74 for name in load_ids(filename, filetype):
78 if name in whitelist: 75 if name in whitelist:
79 yield name 76 yield name
80 else: 77 else:
81 sys_exit("Unexpected ID %s in %s file %s" % (name, filetype, filename)) 78 sys.exit("Unexpected ID %s in %s file %s" % (name, filetype, filename))
82 79
83 if all_file in ["", "-", '""', '"-"']: 80 if all_file in ["", "-", '""', '"-"']:
84 #Load without white list 81 # Load without white list
85 sets = [set(load_ids(f,t)) for (f,t,c) in set_data] 82 sets = [set(load_ids(f, t)) for (f, t, c) in set_data]
86 #Take union 83 # Take union
87 all = set() 84 all_ids = set()
88 for s in sets: 85 for s in sets:
89 all.update(s) 86 all_ids.update(s)
90 print "Inferred total of %i IDs" % len(all) 87 print "Inferred total of %i IDs" % len(all_ids)
91 else: 88 else:
92 all = set(load_ids(all_file, all_type)) 89 all_ids = set(load_ids(all_file, all_type))
93 print "Total of %i IDs" % len(all) 90 print "Total of %i IDs" % len(all_ids)
94 sets = [set(load_ids_whitelist(f,t,all)) for (f,t,c) in set_data] 91 sets = [set(load_ids_whitelist(f, t, all_ids)) for (f, t, c) in set_data]
95 92
96 for s, (f,t,c) in zip(sets, set_data): 93 for s, (f, t, c) in zip(sets, set_data):
97 print "%i in %s" % (len(s), c) 94 print "%i in %s" % (len(s), c)
98 95
99 #Now call R library to draw simple Venn diagram 96 # Now call R library to draw simple Venn diagram
100 try: 97 try:
101 #Create dummy Venn diagram counts object for three groups 98 # Create dummy Venn diagram counts object for three groups
102 cols = 'c("%s")' % '","'.join("Set%i" % (i+1) for i in range(n)) 99 cols = 'c("%s")' % '","'.join("Set%i" % (i + 1) for i in range(n))
103 rpy.r('groups <- cbind(%s)' % ','.join(['1']*n)) 100 rpy.r('groups <- cbind(%s)' % ','.join(['1'] * n))
104 rpy.r('colnames(groups) <- %s' % cols) 101 rpy.r('colnames(groups) <- %s' % cols)
105 rpy.r('vc <- vennCounts(groups)') 102 rpy.r('vc <- vennCounts(groups)')
106 #Populate the 2^n classes with real counts 103 # Populate the 2^n classes with real counts
107 #Don't make any assumptions about the class order 104 # Don't make any assumptions about the class order
108 #print rpy.r('vc') 105 # print rpy.r('vc')
109 for index, row in enumerate(rpy.r('vc[,%s]' % cols)): 106 for index, row in enumerate(rpy.r('vc[,%s]' % cols)):
110 if isinstance(row, int) or isinstance(row, float): 107 if isinstance(row, int) or isinstance(row, float):
111 #Hack for rpy being too clever for single element row 108 # Hack for rpy being too clever for single element row
112 row = [row] 109 row = [row]
113 names = all 110 names = all_ids
114 for wanted, s in zip(row, sets): 111 for wanted, s in zip(row, sets):
115 if wanted: 112 if wanted:
116 names = names.intersection(s) 113 names = names.intersection(s)
117 else: 114 else:
118 names = names.difference(s) 115 names = names.difference(s)
119 rpy.r('vc[%i,"Counts"] <- %i' % (index+1, len(names))) 116 rpy.r('vc[%i,"Counts"] <- %i' % (index + 1, len(names)))
120 #print rpy.r('vc') 117 # print rpy.r('vc')
121 if n == 1: 118 if n == 1:
122 #Single circle, don't need to add (Total XXX) line 119 # Single circle, don't need to add (Total XXX) line
123 names = [c for (t,f,c) in set_data] 120 names = [c for (t, f, c) in set_data]
124 else: 121 else:
125 names = ["%s\n(Total %i)" % (c, len(s)) for s, (f,t,c) in zip(sets, set_data)] 122 names = ["%s\n(Total %i)" % (c, len(s)) for s, (f, t, c) in zip(sets, set_data)]
126 rpy.r.assign("names", names) 123 rpy.r.assign("names", names)
127 rpy.r.assign("colors", ["red","green","blue"][:n]) 124 rpy.r.assign("colors", ["red", "green", "blue"][:n])
128 rpy.r.pdf(pdf_file, 8, 8) 125 rpy.r.pdf(pdf_file, 8, 8)
129 rpy.r("""vennDiagram(vc, include="both", names=names, 126 rpy.r("""vennDiagram(vc, include="both", names=names,
130 main="%s", sub="(Total %i)", 127 main="%s", sub="(Total %i)",
131 circle.col=colors) 128 circle.col=colors)
132 """ % (all_label, len(all))) 129 """ % (all_label, len(all_ids)))
133 rpy.r.dev_off() 130 rpy.r.dev_off()
134 except Exception, exc: 131 except Exception, exc:
135 sys_exit( "%s" %str( exc ) ) 132 sys.exit("%s" % str(exc))
136 rpy.r.quit( save="no" ) 133 rpy.r.quit(save="no")
137 print "Done" 134 print "Done"