Mercurial > repos > peterjc > venn_list
comparison tools/venn_list/venn_list.py @ 6:ea68a1a4c1d9 draft
v0.0.10 explicit galaxy_sequence_utils dependency etc
author | peterjc |
---|---|
date | Thu, 02 Feb 2017 11:17:31 -0500 |
parents | 991342eca214 |
children | ee50d9ef9d69 |
comparison
equal
deleted
inserted
replaced
5:26e35d5133a1 | 6:ea68a1a4c1d9 |
---|---|
9 """ | 9 """ |
10 | 10 |
11 | 11 |
12 import sys | 12 import sys |
13 | 13 |
14 def sys_exit(msg, error_level=1): | |
15 """Print error message to stdout and quit with given error level.""" | |
16 sys.stderr.write("%s\n" % msg) | |
17 sys.exit(error_level) | |
18 | |
19 try: | 14 try: |
20 import rpy | 15 import rpy |
21 except ImportError: | 16 except ImportError: |
22 sys_exit("Requires the Python library rpy (to call R)") | 17 sys.exit("Requires the Python library rpy (to call R)") |
23 except RuntimeError, e: | 18 except RuntimeError, e: |
24 sys_exit("The Python library rpy is not availble for the current R version\n\n%s" % e) | 19 sys.exit("The Python library rpy is not availble for the current R version\n\n%s" % e) |
25 | 20 |
26 try: | 21 try: |
27 rpy.r.library("limma") | 22 rpy.r.library("limma") |
28 except: | 23 except Exception: |
29 sys_exit("Requires the R library limma (for vennDiagram function)") | 24 sys.exit("Requires the R library limma (for vennDiagram function)") |
30 | 25 |
31 | 26 |
32 if len(sys.argv)-1 not in [7, 10, 13]: | 27 if len(sys.argv) - 1 not in [7, 10, 13]: |
33 sys_exit("Expected 7, 10 or 13 arguments (for 1, 2 or 3 sets), not %i" % (len(sys.argv)-1)) | 28 sys.exit("Expected 7, 10 or 13 arguments (for 1, 2 or 3 sets), not %i" % (len(sys.argv) - 1)) |
34 | 29 |
35 all_file, all_type, all_label = sys.argv[1:4] | 30 all_file, all_type, all_label = sys.argv[1:4] |
36 set_data = [] | 31 set_data = [] |
37 if len(sys.argv)-1 >= 7: | 32 if len(sys.argv) - 1 >= 7: |
38 set_data.append(tuple(sys.argv[4:7])) | 33 set_data.append(tuple(sys.argv[4:7])) |
39 if len(sys.argv)-1 >= 10: | 34 if len(sys.argv) - 1 >= 10: |
40 set_data.append(tuple(sys.argv[7:10])) | 35 set_data.append(tuple(sys.argv[7:10])) |
41 if len(sys.argv)-1 >= 13: | 36 if len(sys.argv) - 1 >= 13: |
42 set_data.append(tuple(sys.argv[10:13])) | 37 set_data.append(tuple(sys.argv[10:13])) |
43 pdf_file = sys.argv[-1] | 38 pdf_file = sys.argv[-1] |
44 n = len(set_data) | 39 n = len(set_data) |
45 print "Doing %i-way Venn Diagram" % n | 40 print "Doing %i-way Venn Diagram" % n |
46 | 41 |
42 | |
47 def load_ids(filename, filetype): | 43 def load_ids(filename, filetype): |
48 if filetype=="tabular": | 44 if filetype == "tabular": |
49 for line in open(filename): | 45 for line in open(filename): |
50 line = line.rstrip("\n") | 46 line = line.rstrip("\n") |
51 if line and not line.startswith("#"): | 47 if line and not line.startswith("#"): |
52 yield line.split("\t",1)[0] | 48 yield line.split("\t", 1)[0] |
53 elif filetype=="fasta": | 49 elif filetype == "fasta": |
54 for line in open(filename): | 50 for line in open(filename): |
55 if line.startswith(">"): | 51 if line.startswith(">"): |
56 yield line[1:].rstrip("\n").split(None,1)[0] | 52 yield line[1:].rstrip("\n").split(None, 1)[0] |
57 elif filetype.startswith("fastq"): | 53 elif filetype.startswith("fastq"): |
58 #Use the Galaxy library not Biopython to cope with CS | 54 # Use the Galaxy library not Biopython to cope with CS |
59 from galaxy_utils.sequence.fastq import fastqReader | 55 from galaxy_utils.sequence.fastq import fastqReader |
60 handle = open(filename, "rU") | 56 handle = open(filename, "rU") |
61 for record in fastqReader(handle): | 57 for record in fastqReader(handle): |
62 #The [1:] is because the fastaReader leaves the @ on the identifer. | 58 # The [1:] is because the fastaReader leaves the @ on the identifer. |
63 yield record.identifier.split()[0][1:] | 59 yield record.identifier.split()[0][1:] |
64 handle.close() | 60 handle.close() |
65 elif filetype=="sff": | 61 elif filetype == "sff": |
66 try: | 62 try: |
67 from Bio.SeqIO import index | 63 from Bio.SeqIO import index |
68 except ImportError: | 64 except ImportError: |
69 sys_exit("Require Biopython 1.54 or later (to read SFF files)") | 65 sys.exit("Require Biopython 1.54 or later (to read SFF files)") |
70 #This will read the SFF index block if present (very fast) | 66 # This will read the SFF index block if present (very fast) |
71 for name in index(filename, "sff"): | 67 for name in index(filename, "sff"): |
72 yield name | 68 yield name |
73 else: | 69 else: |
74 sys_exit("Unexpected file type %s" % filetype) | 70 sys.exit("Unexpected file type %s" % filetype) |
71 | |
75 | 72 |
76 def load_ids_whitelist(filename, filetype, whitelist): | 73 def load_ids_whitelist(filename, filetype, whitelist): |
77 for name in load_ids(filename, filetype): | 74 for name in load_ids(filename, filetype): |
78 if name in whitelist: | 75 if name in whitelist: |
79 yield name | 76 yield name |
80 else: | 77 else: |
81 sys_exit("Unexpected ID %s in %s file %s" % (name, filetype, filename)) | 78 sys.exit("Unexpected ID %s in %s file %s" % (name, filetype, filename)) |
82 | 79 |
83 if all_file in ["", "-", '""', '"-"']: | 80 if all_file in ["", "-", '""', '"-"']: |
84 #Load without white list | 81 # Load without white list |
85 sets = [set(load_ids(f,t)) for (f,t,c) in set_data] | 82 sets = [set(load_ids(f, t)) for (f, t, c) in set_data] |
86 #Take union | 83 # Take union |
87 all = set() | 84 all_ids = set() |
88 for s in sets: | 85 for s in sets: |
89 all.update(s) | 86 all_ids.update(s) |
90 print "Inferred total of %i IDs" % len(all) | 87 print "Inferred total of %i IDs" % len(all_ids) |
91 else: | 88 else: |
92 all = set(load_ids(all_file, all_type)) | 89 all_ids = set(load_ids(all_file, all_type)) |
93 print "Total of %i IDs" % len(all) | 90 print "Total of %i IDs" % len(all_ids) |
94 sets = [set(load_ids_whitelist(f,t,all)) for (f,t,c) in set_data] | 91 sets = [set(load_ids_whitelist(f, t, all_ids)) for (f, t, c) in set_data] |
95 | 92 |
96 for s, (f,t,c) in zip(sets, set_data): | 93 for s, (f, t, c) in zip(sets, set_data): |
97 print "%i in %s" % (len(s), c) | 94 print "%i in %s" % (len(s), c) |
98 | 95 |
99 #Now call R library to draw simple Venn diagram | 96 # Now call R library to draw simple Venn diagram |
100 try: | 97 try: |
101 #Create dummy Venn diagram counts object for three groups | 98 # Create dummy Venn diagram counts object for three groups |
102 cols = 'c("%s")' % '","'.join("Set%i" % (i+1) for i in range(n)) | 99 cols = 'c("%s")' % '","'.join("Set%i" % (i + 1) for i in range(n)) |
103 rpy.r('groups <- cbind(%s)' % ','.join(['1']*n)) | 100 rpy.r('groups <- cbind(%s)' % ','.join(['1'] * n)) |
104 rpy.r('colnames(groups) <- %s' % cols) | 101 rpy.r('colnames(groups) <- %s' % cols) |
105 rpy.r('vc <- vennCounts(groups)') | 102 rpy.r('vc <- vennCounts(groups)') |
106 #Populate the 2^n classes with real counts | 103 # Populate the 2^n classes with real counts |
107 #Don't make any assumptions about the class order | 104 # Don't make any assumptions about the class order |
108 #print rpy.r('vc') | 105 # print rpy.r('vc') |
109 for index, row in enumerate(rpy.r('vc[,%s]' % cols)): | 106 for index, row in enumerate(rpy.r('vc[,%s]' % cols)): |
110 if isinstance(row, int) or isinstance(row, float): | 107 if isinstance(row, int) or isinstance(row, float): |
111 #Hack for rpy being too clever for single element row | 108 # Hack for rpy being too clever for single element row |
112 row = [row] | 109 row = [row] |
113 names = all | 110 names = all_ids |
114 for wanted, s in zip(row, sets): | 111 for wanted, s in zip(row, sets): |
115 if wanted: | 112 if wanted: |
116 names = names.intersection(s) | 113 names = names.intersection(s) |
117 else: | 114 else: |
118 names = names.difference(s) | 115 names = names.difference(s) |
119 rpy.r('vc[%i,"Counts"] <- %i' % (index+1, len(names))) | 116 rpy.r('vc[%i,"Counts"] <- %i' % (index + 1, len(names))) |
120 #print rpy.r('vc') | 117 # print rpy.r('vc') |
121 if n == 1: | 118 if n == 1: |
122 #Single circle, don't need to add (Total XXX) line | 119 # Single circle, don't need to add (Total XXX) line |
123 names = [c for (t,f,c) in set_data] | 120 names = [c for (t, f, c) in set_data] |
124 else: | 121 else: |
125 names = ["%s\n(Total %i)" % (c, len(s)) for s, (f,t,c) in zip(sets, set_data)] | 122 names = ["%s\n(Total %i)" % (c, len(s)) for s, (f, t, c) in zip(sets, set_data)] |
126 rpy.r.assign("names", names) | 123 rpy.r.assign("names", names) |
127 rpy.r.assign("colors", ["red","green","blue"][:n]) | 124 rpy.r.assign("colors", ["red", "green", "blue"][:n]) |
128 rpy.r.pdf(pdf_file, 8, 8) | 125 rpy.r.pdf(pdf_file, 8, 8) |
129 rpy.r("""vennDiagram(vc, include="both", names=names, | 126 rpy.r("""vennDiagram(vc, include="both", names=names, |
130 main="%s", sub="(Total %i)", | 127 main="%s", sub="(Total %i)", |
131 circle.col=colors) | 128 circle.col=colors) |
132 """ % (all_label, len(all))) | 129 """ % (all_label, len(all_ids))) |
133 rpy.r.dev_off() | 130 rpy.r.dev_off() |
134 except Exception, exc: | 131 except Exception, exc: |
135 sys_exit( "%s" %str( exc ) ) | 132 sys.exit("%s" % str(exc)) |
136 rpy.r.quit( save="no" ) | 133 rpy.r.quit(save="no") |
137 print "Done" | 134 print "Done" |