Mercurial > repos > peterjc > venn_list
diff tools/plotting/venn_list.py @ 2:c96bef0643dc draft
Uploaded v0.0.3 again to revert upload of wrong file. Sigh.
author | peterjc |
---|---|
date | Mon, 06 May 2013 14:05:13 -0400 |
parents | baf7031d470e |
children | 6aae6bc0802d |
line wrap: on
line diff
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/tools/plotting/venn_list.py Mon May 06 14:05:13 2013 -0400 @@ -0,0 +1,135 @@ +#!/usr/bin/env python +"""Plot up to 3-way Venn Diagram using R limma vennDiagram (via rpy) + +This script is copyright 2010 by Peter Cock, The James Hutton Institute +(formerly SCRI), UK. All rights reserved. +See accompanying text file for licence details (MIT/BSD style). + +This is version 0.0.3 of the script. +""" + + +import sys +import rpy + +def stop_err(msg, error_level=1): + """Print error message to stdout and quit with given error level.""" + sys.stderr.write("%s\n" % msg) + sys.exit(error_level) + +try: + import rpy +except ImportError: + stop_err("Requires the Python library rpy (to call R)") + +try: + rpy.r.library("limma") +except: + stop_err("Requires the R library limma (for vennDiagram function)") + + +if len(sys.argv)-1 not in [7, 10, 13]: + stop_err("Expected 7, 10 or 13 arguments (for 1, 2 or 3 sets), not %i" % (len(sys.argv)-1)) + +all_file, all_type, all_label = sys.argv[1:4] +set_data = [] +if len(sys.argv)-1 >= 7: + set_data.append(tuple(sys.argv[4:7])) +if len(sys.argv)-1 >= 10: + set_data.append(tuple(sys.argv[7:10])) +if len(sys.argv)-1 >= 13: + set_data.append(tuple(sys.argv[10:13])) +pdf_file = sys.argv[-1] +n = len(set_data) +print "Doing %i-way Venn Diagram" % n + +def load_ids(filename, filetype): + if filetype=="tabular": + for line in open(filename): + if not line.startswith("#"): + yield line.rstrip("\n").split("\t",1)[0] + elif filetype=="fasta": + for line in open(filename): + if line.startswith(">"): + yield line[1:].rstrip("\n").split(None,1)[0] + elif filetype.startswith("fastq"): + #Use the Galaxy library not Biopython to cope with CS + from galaxy_utils.sequence.fastq import fastqReader + handle = open(filename, "rU") + for record in fastqReader(handle): + #The [1:] is because the fastaReader leaves the @ on the identifer. + yield record.identifier.split()[0][1:] + handle.close() + elif filetype=="sff": + try: + from Bio.SeqIO import index + except ImportError: + stop_err("Require Biopython 1.54 or later (to read SFF files)") + #This will read the SFF index block if present (very fast) + for name in index(filename, "sff"): + yield name + else: + stop_err("Unexpected file type %s" % filetype) + +def load_ids_whitelist(filename, filetype, whitelist): + for name in load_ids(filename, filetype): + if name in whitelist: + yield name + else: + stop_err("Unexpected ID %s in %s file %s" % (name, filetype, filename)) + +if all_file in ["", "-", '""', '"-"']: + #Load without white list + sets = [set(load_ids(f,t)) for (f,t,c) in set_data] + #Take union + all = set() + for s in sets: + all.update(s) + print "Inferred total of %i IDs" % len(all) +else: + all = set(load_ids(all_file, all_type)) + print "Total of %i IDs" % len(all) + sets = [set(load_ids_whitelist(f,t,all)) for (f,t,c) in set_data] + +for s, (f,t,c) in zip(sets, set_data): + print "%i in %s" % (len(s), c) + +#Now call R library to draw simple Venn diagram +try: + #Create dummy Venn diagram counts object for three groups + cols = 'c("%s")' % '","'.join("Set%i" % (i+1) for i in range(n)) + rpy.r('groups <- cbind(%s)' % ','.join(['1']*n)) + rpy.r('colnames(groups) <- %s' % cols) + rpy.r('vc <- vennCounts(groups)') + #Populate the 2^n classes with real counts + #Don't make any assumptions about the class order + #print rpy.r('vc') + for index, row in enumerate(rpy.r('vc[,%s]' % cols)): + if isinstance(row, int) or isinstance(row, float): + #Hack for rpy being too clever for single element row + row = [row] + names = all + for wanted, s in zip(row, sets): + if wanted: + names = names.intersection(s) + else: + names = names.difference(s) + rpy.r('vc[%i,"Counts"] <- %i' % (index+1, len(names))) + #print rpy.r('vc') + if n == 1: + #Single circle, don't need to add (Total XXX) line + names = [c for (t,f,c) in set_data] + else: + names = ["%s\n(Total %i)" % (c, len(s)) for s, (f,t,c) in zip(sets, set_data)] + rpy.r.assign("names", names) + rpy.r.assign("colors", ["red","green","blue"][:n]) + rpy.r.pdf(pdf_file, 8, 8) + rpy.r("""vennDiagram(vc, include="both", names=names, + main="%s", sub="(Total %i)", + circle.col=colors) + """ % (all_label, len(all))) + rpy.r.dev_off() +except Exception, exc: + stop_err( "%s" %str( exc ) ) +rpy.r.quit( save="no" ) +print "Done"