Mercurial > repos > boris > hetbox
comparison hetbox.py @ 0:8d2c4c11fd8f draft default tip
Uploaded repo.tar.gz
| author | boris | 
|---|---|
| date | Mon, 03 Feb 2014 13:15:10 -0500 | 
| parents | |
| children | 
   comparison
  equal
  deleted
  inserted
  replaced
| -1:000000000000 | 0:8d2c4c11fd8f | 
|---|---|
| 1 #!/usr/bin/env python | |
| 2 | |
| 3 # Code by Boris Rebolledo-Jaramillo | |
| 4 # (boris-at-bx.psu.edu) | |
| 5 # Edited by Nick Stoler | |
| 6 # (nick-at-bx.psu.edu) | |
| 7 # New in this version: | |
| 8 # - Add in proper header line if not present | |
| 9 | |
| 10 | |
| 11 import os | |
| 12 import sys | |
| 13 import array | |
| 14 import numpy | |
| 15 from rpy2.robjects import Formula | |
| 16 from rpy2.robjects.packages import importr | |
| 17 from rpy2 import robjects | |
| 18 | |
| 19 def fail(message): | |
| 20 sys.stderr.write(message+'\n') | |
| 21 sys.exit(1) | |
| 22 | |
| 23 COLUMN_LABELS = ['SAMPLE', 'CHR', 'POS', 'A', 'C', 'G', 'T', 'CVRG', | |
| 24 'ALLELES', 'MAJOR', 'MINOR', 'MINOR.FREQ.PERC.'] #, 'STRAND.BIAS'] | |
| 25 | |
| 26 COLUMN_LABELS_STRANDED= ['SAMPLE', 'CHR', 'POS', '+A', '+C', '+G', '+T', '-A', '-C', '-G', '-T', | |
| 27 'CVRG','ALLELES', 'MAJOR', 'MINOR', 'MINOR.FREQ.PERC.'] | |
| 28 | |
| 29 args = sys.argv[1:] | |
| 30 if len(args) >= 1: | |
| 31 infile = args[0] | |
| 32 else: | |
| 33 fail('Error: No input filename provided (as argument 1).') | |
| 34 if len(args) >= 2: | |
| 35 outfile = args[1] | |
| 36 else: | |
| 37 fail('Error: No output filename provided (as argument 2).') | |
| 38 if len(args) >= 3: | |
| 39 report = args[2] | |
| 40 else: | |
| 41 report = '' | |
| 42 | |
| 43 # Check input file | |
| 44 add_header = False | |
| 45 if not os.path.exists(infile): | |
| 46 fail('Error: Input file '+infile+' could not be found.') | |
| 47 with open(infile, 'r') as lines: | |
| 48 line = lines.readline() | |
| 49 if not line: | |
| 50 fail('Error: Input file seems to be empty') | |
| 51 line = line.strip().lstrip('#') # rm whitespace, comment chars | |
| 52 labels = line.split("\t") | |
| 53 if 'SAMPLE' not in labels: | |
| 54 sys.stderr.write("Error: Input file does not seem to have a proper header " | |
| 55 +"line.\nAdding an artificial header..") | |
| 56 add_header = True | |
| 57 | |
| 58 r = robjects.r | |
| 59 base = importr('base') | |
| 60 utils = importr('utils') | |
| 61 stats = importr('stats') | |
| 62 rprint = robjects.globalenv.get("print") | |
| 63 graphics = importr('graphics') | |
| 64 grdevices = importr('grDevices') | |
| 65 grdevices.png(file=outfile, width=1024, height=768, type="cairo") | |
| 66 | |
| 67 # Read file into a data frame | |
| 68 if add_header: | |
| 69 # add header line manually if not present | |
| 70 DATA = utils.read_delim(infile, header=False) | |
| 71 labels = robjects.r.names(DATA) | |
| 72 for i in range(len(labels)): | |
| 73 try: | |
| 74 labels[i] = COLUMN_LABELS[i] | |
| 75 except IndexError, e: | |
| 76 try: | |
| 77 labels[i] = COLUMN_LABELS_EXTENDED[i] | |
| 78 except: | |
| 79 fail("Error in input file: Too many columns (does not match hardcoded " | |
| 80 +"column labels).") | |
| 81 else: | |
| 82 DATA = utils.read_delim(infile) | |
| 83 # Remove comment from header, if present | |
| 84 labels = robjects.r.names(DATA) | |
| 85 if labels[0][0:2] == 'X.': | |
| 86 labels[0] = labels[0][2:] | |
| 87 | |
| 88 # Multiply minor allele frequencies by 100 to get percentage | |
| 89 # .rx2() looks up a column by its label and returns it as a vector | |
| 90 # .ro turns the returned object into one that can be operated on per-element | |
| 91 minor_freq = DATA.rx2('MINOR.FREQ.PERC.').ro * 100 | |
| 92 samples = DATA.rx2('SAMPLE') | |
| 93 | |
| 94 # Formula() creates a Python object representing the R object returned by x ~ y | |
| 95 formula = Formula('minor_freq ~ samples') | |
| 96 # The "environment" in .getenvironment() is the entire R workspace in which the | |
| 97 # Formula object exists. The R workspace meaning all the defined variables. | |
| 98 # Here, the .getenvironment() method is being used to set some variables in the | |
| 99 # R workspace | |
| 100 | |
| 101 formula.getenvironment()['minor_freq'] = minor_freq | |
| 102 formula.getenvironment()['samples'] = samples | |
| 103 | |
| 104 | |
| 105 r.par(oma=array.array('i', [0,0,0,0])) | |
| 106 r.par(mar=array.array('i', [10,4,4,2])) | |
| 107 ylimit = array.array('i',[-5,50]) | |
| 108 | |
| 109 # create boxplot - fill kwargs1 with the options for the boxplot function | |
| 110 kwargs1 = {'ylab':"Minor allele frequency (%)", 'col':"gray", 'xaxt':"n", | |
| 111 'outpch':"*",'main':"Distribution of minor allele frequencies", | |
| 112 'cex.lab':"1.5"} | |
| 113 p = graphics.boxplot(formula, axes=0,ylim=ylimit, lty=1,**kwargs1) | |
| 114 | |
| 115 table = base.table(DATA.rx2('SAMPLE')) | |
| 116 graphics.text(0, -1, 'N:', font=2) | |
| 117 for i in range(1, base.length(table)[0]+1, 1): | |
| 118 graphics.text(i, -1, table[i-1], font=2) | |
| 119 | |
| 120 graphlabels = base.names(table) | |
| 121 kwargs3 = {'pos':"-2", 'las':"2", 'cex.axis':"1"} | |
| 122 graphics.axis(1, at=range(1, len(graphlabels)+1, 1),labels=graphlabels, **kwargs3) | |
| 123 graphics.axis(2,at=(range(0,60,10)),pos=0,font=2) | |
| 124 grdevices.dev_off() | |
| 125 | |
| 126 if not report: | |
| 127 sys.exit(0) | |
| 128 | |
| 129 | |
| 130 SAMPLES=[] | |
| 131 for i in range(len(table)): | |
| 132 SAMPLES.append(base.names(table)[i]) | |
| 133 | |
| 134 def boxstats(data,sample): | |
| 135 VALUES = [100*float(x.strip().split('\t')[11]) for x in list(open(data)) if x.strip().split('\t')[0]==sample] | |
| 136 NoHET = len(VALUES) | |
| 137 MEDIAN = numpy.median(VALUES) | |
| 138 MAD = numpy.median([abs(i - MEDIAN) for i in VALUES]) # Median absolute distance (robust spread statistic) | |
| 139 return [NoHET,MEDIAN, MAD] | |
| 140 | |
| 141 boxreport = open(report, "w+") | |
| 142 boxreport.write("#sample\tNo.sites\tmedian.freq\tMAD.freq\n") | |
| 143 | |
| 144 for sample in SAMPLES: | |
| 145 ENTRY = [sample] + boxstats(infile,sample) | |
| 146 boxreport.write ('%s\t%d\t%.1f\t%.1f\n' % tuple([ENTRY[i] for i in [0,1,2,3]])) | |
| 147 boxreport.close() | |
| 148 | |
| 149 | |
| 150 | |
| 151 | 
