comparison tools/plotting/venn_list.py @ 2:c96bef0643dc draft

Uploaded v0.0.3 again to revert upload of wrong file. Sigh.
author peterjc
date Mon, 06 May 2013 14:05:13 -0400
parents baf7031d470e
children 6aae6bc0802d
comparison
equal deleted inserted replaced
1:f1323a651777 2:c96bef0643dc
1 #!/usr/bin/env python
2 """Plot up to 3-way Venn Diagram using R limma vennDiagram (via rpy)
3
4 This script is copyright 2010 by Peter Cock, The James Hutton Institute
5 (formerly SCRI), UK. All rights reserved.
6 See accompanying text file for licence details (MIT/BSD style).
7
8 This is version 0.0.3 of the script.
9 """
10
11
12 import sys
13 import rpy
14
15 def stop_err(msg, error_level=1):
16 """Print error message to stdout and quit with given error level."""
17 sys.stderr.write("%s\n" % msg)
18 sys.exit(error_level)
19
20 try:
21 import rpy
22 except ImportError:
23 stop_err("Requires the Python library rpy (to call R)")
24
25 try:
26 rpy.r.library("limma")
27 except:
28 stop_err("Requires the R library limma (for vennDiagram function)")
29
30
31 if len(sys.argv)-1 not in [7, 10, 13]:
32 stop_err("Expected 7, 10 or 13 arguments (for 1, 2 or 3 sets), not %i" % (len(sys.argv)-1))
33
34 all_file, all_type, all_label = sys.argv[1:4]
35 set_data = []
36 if len(sys.argv)-1 >= 7:
37 set_data.append(tuple(sys.argv[4:7]))
38 if len(sys.argv)-1 >= 10:
39 set_data.append(tuple(sys.argv[7:10]))
40 if len(sys.argv)-1 >= 13:
41 set_data.append(tuple(sys.argv[10:13]))
42 pdf_file = sys.argv[-1]
43 n = len(set_data)
44 print "Doing %i-way Venn Diagram" % n
45
46 def load_ids(filename, filetype):
47 if filetype=="tabular":
48 for line in open(filename):
49 if not line.startswith("#"):
50 yield line.rstrip("\n").split("\t",1)[0]
51 elif filetype=="fasta":
52 for line in open(filename):
53 if line.startswith(">"):
54 yield line[1:].rstrip("\n").split(None,1)[0]
55 elif filetype.startswith("fastq"):
56 #Use the Galaxy library not Biopython to cope with CS
57 from galaxy_utils.sequence.fastq import fastqReader
58 handle = open(filename, "rU")
59 for record in fastqReader(handle):
60 #The [1:] is because the fastaReader leaves the @ on the identifer.
61 yield record.identifier.split()[0][1:]
62 handle.close()
63 elif filetype=="sff":
64 try:
65 from Bio.SeqIO import index
66 except ImportError:
67 stop_err("Require Biopython 1.54 or later (to read SFF files)")
68 #This will read the SFF index block if present (very fast)
69 for name in index(filename, "sff"):
70 yield name
71 else:
72 stop_err("Unexpected file type %s" % filetype)
73
74 def load_ids_whitelist(filename, filetype, whitelist):
75 for name in load_ids(filename, filetype):
76 if name in whitelist:
77 yield name
78 else:
79 stop_err("Unexpected ID %s in %s file %s" % (name, filetype, filename))
80
81 if all_file in ["", "-", '""', '"-"']:
82 #Load without white list
83 sets = [set(load_ids(f,t)) for (f,t,c) in set_data]
84 #Take union
85 all = set()
86 for s in sets:
87 all.update(s)
88 print "Inferred total of %i IDs" % len(all)
89 else:
90 all = set(load_ids(all_file, all_type))
91 print "Total of %i IDs" % len(all)
92 sets = [set(load_ids_whitelist(f,t,all)) for (f,t,c) in set_data]
93
94 for s, (f,t,c) in zip(sets, set_data):
95 print "%i in %s" % (len(s), c)
96
97 #Now call R library to draw simple Venn diagram
98 try:
99 #Create dummy Venn diagram counts object for three groups
100 cols = 'c("%s")' % '","'.join("Set%i" % (i+1) for i in range(n))
101 rpy.r('groups <- cbind(%s)' % ','.join(['1']*n))
102 rpy.r('colnames(groups) <- %s' % cols)
103 rpy.r('vc <- vennCounts(groups)')
104 #Populate the 2^n classes with real counts
105 #Don't make any assumptions about the class order
106 #print rpy.r('vc')
107 for index, row in enumerate(rpy.r('vc[,%s]' % cols)):
108 if isinstance(row, int) or isinstance(row, float):
109 #Hack for rpy being too clever for single element row
110 row = [row]
111 names = all
112 for wanted, s in zip(row, sets):
113 if wanted:
114 names = names.intersection(s)
115 else:
116 names = names.difference(s)
117 rpy.r('vc[%i,"Counts"] <- %i' % (index+1, len(names)))
118 #print rpy.r('vc')
119 if n == 1:
120 #Single circle, don't need to add (Total XXX) line
121 names = [c for (t,f,c) in set_data]
122 else:
123 names = ["%s\n(Total %i)" % (c, len(s)) for s, (f,t,c) in zip(sets, set_data)]
124 rpy.r.assign("names", names)
125 rpy.r.assign("colors", ["red","green","blue"][:n])
126 rpy.r.pdf(pdf_file, 8, 8)
127 rpy.r("""vennDiagram(vc, include="both", names=names,
128 main="%s", sub="(Total %i)",
129 circle.col=colors)
130 """ % (all_label, len(all)))
131 rpy.r.dev_off()
132 except Exception, exc:
133 stop_err( "%s" %str( exc ) )
134 rpy.r.quit( save="no" )
135 print "Done"