| 
0
 | 
     1 #!/usr/bin/env python
 | 
| 
 | 
     2 
 | 
| 
 | 
     3 # Code by Boris Rebolledo-Jaramillo
 | 
| 
 | 
     4 # (boris-at-bx.psu.edu)
 | 
| 
 | 
     5 # Edited by Nick Stoler
 | 
| 
 | 
     6 # (nick-at-bx.psu.edu)
 | 
| 
 | 
     7 # New in this version:
 | 
| 
 | 
     8 # - Add in proper header line if not present
 | 
| 
 | 
     9 
 | 
| 
 | 
    10 
 | 
| 
 | 
    11 import os
 | 
| 
 | 
    12 import sys
 | 
| 
 | 
    13 import array
 | 
| 
 | 
    14 import numpy
 | 
| 
 | 
    15 from rpy2.robjects import Formula
 | 
| 
 | 
    16 from rpy2.robjects.packages import importr
 | 
| 
 | 
    17 from rpy2 import robjects
 | 
| 
 | 
    18 
 | 
| 
 | 
    19 def fail(message):
 | 
| 
 | 
    20   sys.stderr.write(message+'\n')
 | 
| 
 | 
    21   sys.exit(1)
 | 
| 
 | 
    22 
 | 
| 
 | 
    23 COLUMN_LABELS = ['SAMPLE', 'CHR',  'POS', 'A', 'C', 'G', 'T', 'CVRG',
 | 
| 
 | 
    24                  'ALLELES', 'MAJOR', 'MINOR', 'MINOR.FREQ.PERC.'] #, 'STRAND.BIAS']
 | 
| 
 | 
    25 
 | 
| 
 | 
    26 COLUMN_LABELS_STRANDED= ['SAMPLE', 'CHR',  'POS', '+A', '+C', '+G', '+T', '-A', '-C', '-G', '-T',
 | 
| 
 | 
    27                          'CVRG','ALLELES', 'MAJOR', 'MINOR', 'MINOR.FREQ.PERC.']
 | 
| 
 | 
    28 
 | 
| 
 | 
    29 args = sys.argv[1:]
 | 
| 
 | 
    30 if len(args) >= 1:
 | 
| 
 | 
    31   infile = args[0]
 | 
| 
 | 
    32 else:
 | 
| 
 | 
    33   fail('Error: No input filename provided (as argument 1).')
 | 
| 
 | 
    34 if len(args) >= 2:
 | 
| 
 | 
    35   outfile = args[1]
 | 
| 
 | 
    36 else:
 | 
| 
 | 
    37   fail('Error: No output filename provided (as argument 2).')
 | 
| 
 | 
    38 if len(args) >= 3:
 | 
| 
 | 
    39   report = args[2]
 | 
| 
 | 
    40 else:
 | 
| 
 | 
    41   report = ''
 | 
| 
 | 
    42 
 | 
| 
 | 
    43 # Check input file
 | 
| 
 | 
    44 add_header = False
 | 
| 
 | 
    45 if not os.path.exists(infile):
 | 
| 
 | 
    46   fail('Error: Input file '+infile+' could not be found.')
 | 
| 
 | 
    47 with open(infile, 'r') as lines:
 | 
| 
 | 
    48   line = lines.readline()
 | 
| 
 | 
    49   if not line:
 | 
| 
 | 
    50     fail('Error: Input file seems to be empty')
 | 
| 
 | 
    51   line = line.strip().lstrip('#') # rm whitespace, comment chars
 | 
| 
 | 
    52   labels = line.split("\t")
 | 
| 
 | 
    53   if 'SAMPLE' not in labels:
 | 
| 
 | 
    54     sys.stderr.write("Error: Input file does not seem to have a proper header "
 | 
| 
 | 
    55       +"line.\nAdding an artificial header..")
 | 
| 
 | 
    56     add_header = True
 | 
| 
 | 
    57 
 | 
| 
 | 
    58 r = robjects.r
 | 
| 
 | 
    59 base = importr('base')
 | 
| 
 | 
    60 utils = importr('utils')
 | 
| 
 | 
    61 stats = importr('stats')
 | 
| 
 | 
    62 rprint = robjects.globalenv.get("print")
 | 
| 
 | 
    63 graphics = importr('graphics')
 | 
| 
 | 
    64 grdevices = importr('grDevices')
 | 
| 
 | 
    65 grdevices.png(file=outfile, width=1024, height=768, type="cairo")
 | 
| 
 | 
    66 
 | 
| 
 | 
    67 # Read file into a data frame
 | 
| 
 | 
    68 if add_header:
 | 
| 
 | 
    69     # add header line manually if not present
 | 
| 
 | 
    70     DATA = utils.read_delim(infile, header=False)
 | 
| 
 | 
    71     labels = robjects.r.names(DATA)
 | 
| 
 | 
    72     for i in range(len(labels)):
 | 
| 
 | 
    73         try:
 | 
| 
 | 
    74             labels[i] = COLUMN_LABELS[i]
 | 
| 
 | 
    75         except IndexError, e:
 | 
| 
 | 
    76             try:
 | 
| 
 | 
    77                 labels[i] = COLUMN_LABELS_EXTENDED[i]
 | 
| 
 | 
    78             except:
 | 
| 
 | 
    79                 fail("Error in input file: Too many columns (does not match hardcoded "
 | 
| 
 | 
    80                 +"column labels).")
 | 
| 
 | 
    81 else:
 | 
| 
 | 
    82   DATA = utils.read_delim(infile)
 | 
| 
 | 
    83   # Remove comment from header, if present
 | 
| 
 | 
    84   labels = robjects.r.names(DATA)
 | 
| 
 | 
    85   if labels[0][0:2] == 'X.':
 | 
| 
 | 
    86     labels[0] = labels[0][2:]
 | 
| 
 | 
    87 
 | 
| 
 | 
    88 # Multiply minor allele frequencies by 100 to get percentage
 | 
| 
 | 
    89 #  .rx2() looks up a column by its label and returns it as a vector
 | 
| 
 | 
    90 #  .ro turns the returned object into one that can be operated on per-element
 | 
| 
 | 
    91 minor_freq = DATA.rx2('MINOR.FREQ.PERC.').ro * 100
 | 
| 
 | 
    92 samples    = DATA.rx2('SAMPLE')
 | 
| 
 | 
    93 
 | 
| 
 | 
    94 # Formula() creates a Python object representing the R object returned by x ~ y
 | 
| 
 | 
    95 formula = Formula('minor_freq ~ samples')
 | 
| 
 | 
    96 # The "environment" in .getenvironment() is the entire R workspace in which the
 | 
| 
 | 
    97 # Formula object exists. The R workspace meaning all the defined variables.
 | 
| 
 | 
    98 # Here, the .getenvironment() method is being used to set some variables in the
 | 
| 
 | 
    99 # R workspace
 | 
| 
 | 
   100 
 | 
| 
 | 
   101 formula.getenvironment()['minor_freq'] = minor_freq
 | 
| 
 | 
   102 formula.getenvironment()['samples']    = samples
 | 
| 
 | 
   103 
 | 
| 
 | 
   104 
 | 
| 
 | 
   105 r.par(oma=array.array('i', [0,0,0,0]))
 | 
| 
 | 
   106 r.par(mar=array.array('i', [10,4,4,2]))
 | 
| 
 | 
   107 ylimit = array.array('i',[-5,50])
 | 
| 
 | 
   108 
 | 
| 
 | 
   109 # create boxplot - fill kwargs1 with the options for the boxplot function
 | 
| 
 | 
   110 kwargs1 = {'ylab':"Minor allele frequency (%)", 'col':"gray", 'xaxt':"n",
 | 
| 
 | 
   111            'outpch':"*",'main':"Distribution of minor allele frequencies",
 | 
| 
 | 
   112            'cex.lab':"1.5"}
 | 
| 
 | 
   113 p = graphics.boxplot(formula, axes=0,ylim=ylimit, lty=1,**kwargs1)
 | 
| 
 | 
   114 
 | 
| 
 | 
   115 table = base.table(DATA.rx2('SAMPLE'))
 | 
| 
 | 
   116 graphics.text(0, -1, 'N:', font=2)
 | 
| 
 | 
   117 for i in range(1, base.length(table)[0]+1, 1):
 | 
| 
 | 
   118     graphics.text(i, -1, table[i-1], font=2)
 | 
| 
 | 
   119 
 | 
| 
 | 
   120 graphlabels = base.names(table)
 | 
| 
 | 
   121 kwargs3 = {'pos':"-2", 'las':"2", 'cex.axis':"1"}
 | 
| 
 | 
   122 graphics.axis(1, at=range(1, len(graphlabels)+1, 1),labels=graphlabels, **kwargs3)
 | 
| 
 | 
   123 graphics.axis(2,at=(range(0,60,10)),pos=0,font=2)
 | 
| 
 | 
   124 grdevices.dev_off()
 | 
| 
 | 
   125 
 | 
| 
 | 
   126 if not report:
 | 
| 
 | 
   127     sys.exit(0)
 | 
| 
 | 
   128 
 | 
| 
 | 
   129 
 | 
| 
 | 
   130 SAMPLES=[]
 | 
| 
 | 
   131 for i in range(len(table)):
 | 
| 
 | 
   132     SAMPLES.append(base.names(table)[i])
 | 
| 
 | 
   133 
 | 
| 
 | 
   134 def boxstats(data,sample):
 | 
| 
 | 
   135     VALUES = [100*float(x.strip().split('\t')[11]) for x in list(open(data)) if x.strip().split('\t')[0]==sample]
 | 
| 
 | 
   136     NoHET  = len(VALUES)
 | 
| 
 | 
   137     MEDIAN = numpy.median(VALUES)
 | 
| 
 | 
   138     MAD    = numpy.median([abs(i - MEDIAN) for i in VALUES]) # Median absolute distance (robust spread statistic)
 | 
| 
 | 
   139     return [NoHET,MEDIAN, MAD]
 | 
| 
 | 
   140 
 | 
| 
 | 
   141 boxreport = open(report, "w+")
 | 
| 
 | 
   142 boxreport.write("#sample\tNo.sites\tmedian.freq\tMAD.freq\n")
 | 
| 
 | 
   143 
 | 
| 
 | 
   144 for sample in SAMPLES:
 | 
| 
 | 
   145     ENTRY = [sample] + boxstats(infile,sample)
 | 
| 
 | 
   146     boxreport.write ('%s\t%d\t%.1f\t%.1f\n' % tuple([ENTRY[i] for i in [0,1,2,3]]))
 | 
| 
 | 
   147 boxreport.close()
 | 
| 
 | 
   148 
 | 
| 
 | 
   149 
 | 
| 
 | 
   150 
 | 
| 
 | 
   151 
 |