0
|
1 #!/usr/bin/env python
|
|
2
|
|
3 # Code by Boris Rebolledo-Jaramillo
|
|
4 # (boris-at-bx.psu.edu)
|
|
5 # Edited by Nick Stoler
|
|
6 # (nick-at-bx.psu.edu)
|
|
7 # New in this version:
|
|
8 # - Add in proper header line if not present
|
|
9
|
|
10
|
|
11 import os
|
|
12 import sys
|
|
13 import array
|
|
14 import numpy
|
|
15 from rpy2.robjects import Formula
|
|
16 from rpy2.robjects.packages import importr
|
|
17 from rpy2 import robjects
|
|
18
|
|
19 def fail(message):
|
|
20 sys.stderr.write(message+'\n')
|
|
21 sys.exit(1)
|
|
22
|
|
23 COLUMN_LABELS = ['SAMPLE', 'CHR', 'POS', 'A', 'C', 'G', 'T', 'CVRG',
|
|
24 'ALLELES', 'MAJOR', 'MINOR', 'MINOR.FREQ.PERC.'] #, 'STRAND.BIAS']
|
|
25
|
|
26 COLUMN_LABELS_STRANDED= ['SAMPLE', 'CHR', 'POS', '+A', '+C', '+G', '+T', '-A', '-C', '-G', '-T',
|
|
27 'CVRG','ALLELES', 'MAJOR', 'MINOR', 'MINOR.FREQ.PERC.']
|
|
28
|
|
29 args = sys.argv[1:]
|
|
30 if len(args) >= 1:
|
|
31 infile = args[0]
|
|
32 else:
|
|
33 fail('Error: No input filename provided (as argument 1).')
|
|
34 if len(args) >= 2:
|
|
35 outfile = args[1]
|
|
36 else:
|
|
37 fail('Error: No output filename provided (as argument 2).')
|
|
38 if len(args) >= 3:
|
|
39 report = args[2]
|
|
40 else:
|
|
41 report = ''
|
|
42
|
|
43 # Check input file
|
|
44 add_header = False
|
|
45 if not os.path.exists(infile):
|
|
46 fail('Error: Input file '+infile+' could not be found.')
|
|
47 with open(infile, 'r') as lines:
|
|
48 line = lines.readline()
|
|
49 if not line:
|
|
50 fail('Error: Input file seems to be empty')
|
|
51 line = line.strip().lstrip('#') # rm whitespace, comment chars
|
|
52 labels = line.split("\t")
|
|
53 if 'SAMPLE' not in labels:
|
|
54 sys.stderr.write("Error: Input file does not seem to have a proper header "
|
|
55 +"line.\nAdding an artificial header..")
|
|
56 add_header = True
|
|
57
|
|
58 r = robjects.r
|
|
59 base = importr('base')
|
|
60 utils = importr('utils')
|
|
61 stats = importr('stats')
|
|
62 rprint = robjects.globalenv.get("print")
|
|
63 graphics = importr('graphics')
|
|
64 grdevices = importr('grDevices')
|
|
65 grdevices.png(file=outfile, width=1024, height=768, type="cairo")
|
|
66
|
|
67 # Read file into a data frame
|
|
68 if add_header:
|
|
69 # add header line manually if not present
|
|
70 DATA = utils.read_delim(infile, header=False)
|
|
71 labels = robjects.r.names(DATA)
|
|
72 for i in range(len(labels)):
|
|
73 try:
|
|
74 labels[i] = COLUMN_LABELS[i]
|
|
75 except IndexError, e:
|
|
76 try:
|
|
77 labels[i] = COLUMN_LABELS_EXTENDED[i]
|
|
78 except:
|
|
79 fail("Error in input file: Too many columns (does not match hardcoded "
|
|
80 +"column labels).")
|
|
81 else:
|
|
82 DATA = utils.read_delim(infile)
|
|
83 # Remove comment from header, if present
|
|
84 labels = robjects.r.names(DATA)
|
|
85 if labels[0][0:2] == 'X.':
|
|
86 labels[0] = labels[0][2:]
|
|
87
|
|
88 # Multiply minor allele frequencies by 100 to get percentage
|
|
89 # .rx2() looks up a column by its label and returns it as a vector
|
|
90 # .ro turns the returned object into one that can be operated on per-element
|
|
91 minor_freq = DATA.rx2('MINOR.FREQ.PERC.').ro * 100
|
|
92 samples = DATA.rx2('SAMPLE')
|
|
93
|
|
94 # Formula() creates a Python object representing the R object returned by x ~ y
|
|
95 formula = Formula('minor_freq ~ samples')
|
|
96 # The "environment" in .getenvironment() is the entire R workspace in which the
|
|
97 # Formula object exists. The R workspace meaning all the defined variables.
|
|
98 # Here, the .getenvironment() method is being used to set some variables in the
|
|
99 # R workspace
|
|
100
|
|
101 formula.getenvironment()['minor_freq'] = minor_freq
|
|
102 formula.getenvironment()['samples'] = samples
|
|
103
|
|
104
|
|
105 r.par(oma=array.array('i', [0,0,0,0]))
|
|
106 r.par(mar=array.array('i', [10,4,4,2]))
|
|
107 ylimit = array.array('i',[-5,50])
|
|
108
|
|
109 # create boxplot - fill kwargs1 with the options for the boxplot function
|
|
110 kwargs1 = {'ylab':"Minor allele frequency (%)", 'col':"gray", 'xaxt':"n",
|
|
111 'outpch':"*",'main':"Distribution of minor allele frequencies",
|
|
112 'cex.lab':"1.5"}
|
|
113 p = graphics.boxplot(formula, axes=0,ylim=ylimit, lty=1,**kwargs1)
|
|
114
|
|
115 table = base.table(DATA.rx2('SAMPLE'))
|
|
116 graphics.text(0, -1, 'N:', font=2)
|
|
117 for i in range(1, base.length(table)[0]+1, 1):
|
|
118 graphics.text(i, -1, table[i-1], font=2)
|
|
119
|
|
120 graphlabels = base.names(table)
|
|
121 kwargs3 = {'pos':"-2", 'las':"2", 'cex.axis':"1"}
|
|
122 graphics.axis(1, at=range(1, len(graphlabels)+1, 1),labels=graphlabels, **kwargs3)
|
|
123 graphics.axis(2,at=(range(0,60,10)),pos=0,font=2)
|
|
124 grdevices.dev_off()
|
|
125
|
|
126 if not report:
|
|
127 sys.exit(0)
|
|
128
|
|
129
|
|
130 SAMPLES=[]
|
|
131 for i in range(len(table)):
|
|
132 SAMPLES.append(base.names(table)[i])
|
|
133
|
|
134 def boxstats(data,sample):
|
|
135 VALUES = [100*float(x.strip().split('\t')[11]) for x in list(open(data)) if x.strip().split('\t')[0]==sample]
|
|
136 NoHET = len(VALUES)
|
|
137 MEDIAN = numpy.median(VALUES)
|
|
138 MAD = numpy.median([abs(i - MEDIAN) for i in VALUES]) # Median absolute distance (robust spread statistic)
|
|
139 return [NoHET,MEDIAN, MAD]
|
|
140
|
|
141 boxreport = open(report, "w+")
|
|
142 boxreport.write("#sample\tNo.sites\tmedian.freq\tMAD.freq\n")
|
|
143
|
|
144 for sample in SAMPLES:
|
|
145 ENTRY = [sample] + boxstats(infile,sample)
|
|
146 boxreport.write ('%s\t%d\t%.1f\t%.1f\n' % tuple([ENTRY[i] for i in [0,1,2,3]]))
|
|
147 boxreport.close()
|
|
148
|
|
149
|
|
150
|
|
151
|