comparison iterative_pca.py @ 0:64e75e21466e draft default tip

Uploaded
author pmac
date Wed, 01 Jun 2016 03:38:39 -0400
parents
children
comparison
equal deleted inserted replaced
-1:000000000000 0:64e75e21466e
1 import os
2 import subprocess
3 import sys
4 import argparse
5 import datetime
6 import csv
7
8 import jinja2
9
10 import pedify
11
# Template engine rooted at this script's directory; autoescaping is on
# because the rendered template is an html report.
JINJA_ENVIRONMENT = jinja2.Environment(
    loader=jinja2.FileSystemLoader(os.path.dirname(__file__)),
    extensions=['jinja2.ext.autoescape'],
    autoescape=True)
# Accepted values for the CLI 'choices' arguments parsed in main()
DATA_TYPES = ["variant_data", "rds", "rdata", "numeric_ped"]
CLUSTERING_METHODS = ["dbscan", "hclust", "kmeans"]
CLUSTER_TRIMMING = ["sd", "mcd", "dbscan_outliers_only"]
# Name of the jinja2 template file looked up next to this script
HTML_TEMPLATE_NAME = "pca_report.html"
# Default number of standard deviations used to flag outliers
DEFAULT_SD_CUTOFF = 2
# Directory of helper R functions passed through to the R script
PATH_TO_R_FUNCTIONS = "{}/R_functions".format(os.path.dirname(os.path.abspath(__file__)))
22
23
def main():
    """Command-line entry point for the iterative PCA pipeline.

    Parses arguments, optionally converts variant text data to a numeric
    ped file, runs the iterative PCA R script, then renders an html report
    summarising each iteration.
    """
    ###===ARGUMENT PARSING===###
    parser = argparse.ArgumentParser()
    parser.add_argument("datafile_name", help="name of data file")
    parser.add_argument("data_type", type=str.lower, choices=DATA_TYPES, help="type of data file")
    parser.add_argument("iterations", help="max number of iterations to run",
                        type=int)
    parser.add_argument("--control_tag", help="substring present in the ids of all control samples, e.g. LP")
    parser.add_argument("--cases_tag", help="substring present in the ids of all case samples, e.g. HAPS")
    parser.add_argument("--config_file", help="name of file containing config info")
    parser.add_argument("--clustering_flag", help="Flag to indicate whether to do clustering", action="store_true")
    parser.add_argument("--clustering_method", type=str.lower, choices=CLUSTERING_METHODS,
                        help="algorithm used to find clusters")
    parser.add_argument("--cluster_trimming", type=str.lower, choices=CLUSTER_TRIMMING,
                        help="method used to identify and trim outlier clusters")
    parser.add_argument("-s", "--sd_cutoff", help="number of standard deviations at which to cut outliers")
    parser.add_argument("-o", "--out", help="base name of output files and directories; defaults to datafile name")
    group = parser.add_mutually_exclusive_group()
    group.add_argument("-r", "--relative_prefix", help="RELATIVE path CONTAINING the output directory")
    group.add_argument("-a", "--absolute_prefix", help="ABSOLUTE path CONTAINING the output directory")
    parser.add_argument("--html_file", help="ABSOLUTE PATH of output html file")
    parser.add_argument("--ethnicity_file", help="File containing data about the ethnicity of the samples")
    parser.add_argument("--galaxy", help="Flag to indicate we are running this script in Galaxy",
                        action="store_true")
    parser.add_argument("--reject_snps", help="file containing SNP ids to ignore/remove")
    parser.add_argument("--reject_samples", help="file containing sample ids to ignore/remove")

    args = parser.parse_args()

    # enforce conditionally required args
    if args.data_type == "variant_data" and args.config_file is None:
        parser.error("Input variant data files require a complementary config file "
                     "specified with '--config_file <filename>' flag")

    if args.config_file and args.data_type != "variant_data":
        print("Warning! Your config file will be ignored as the input data is not a "
              "text file containing variant data")

    if args.clustering_flag and (args.clustering_method is None or args.cluster_trimming is None):
        parser.error("If --clustering_flag is set, "
                     "--clustering_method and --cluster_trimming options must also be specified")

    if (args.clustering_method or args.cluster_trimming) and not args.clustering_flag:
        parser.error("--clustering_method and --cluster_trimming "
                     "options only valid if --clustering_flag is also set")

    # dbscan outliers only valid if cluster method == "DBSCAN"
    if args.cluster_trimming == "dbscan_outliers_only" and args.clustering_method != "dbscan":
        parser.error("dbscan_outliers_only cluster trimming method only "
                     "valid if --clustering_method == dbscan")

    dfname = args.datafile_name
    data_type = args.data_type
    n = args.iterations
    control_tag = args.control_tag
    cases_tag = args.cases_tag
    cfname = args.config_file
    cflag = args.clustering_flag
    # fall back to sensible defaults for the optional settings
    cmethod = args.clustering_method if args.clustering_method else "none"
    tmethod = args.cluster_trimming if args.cluster_trimming else "none"
    sd_cutoff = args.sd_cutoff if args.sd_cutoff else DEFAULT_SD_CUTOFF
    # default basename: datafile name stripped of directories and extension
    basename = args.out if args.out else dfname.split('/')[-1].split('.')[0]
    hfname = args.html_file
    galaxy_flag = args.galaxy
    ethfname = args.ethnicity_file
    xsnps_filename = args.reject_snps
    xsamples_filename = args.reject_samples

    # function used to handle strange galaxy behaviour...
    if galaxy_flag and cfname:
        rc = format_galaxy_config_file(cfname)
        if rc != 0:
            print("Bad config file")
            # was sys.exit() -> exit status 0, which falsely reported success
            sys.exit(1)

    path_here = os.path.dirname(os.path.abspath(__file__))
    # determine output directory
    if args.relative_prefix:
        outdir = "{}/{}/full_output_{}".format(path_here, args.relative_prefix, basename)
    elif args.absolute_prefix:
        outdir = "{}/full_output_{}".format(args.absolute_prefix, basename)
    else:
        outdir = "{}/full_output_{}".format(path_here, basename)
    if hfname is None:
        hfname = "{}/{}.html".format(outdir, basename)

    ####===BEGIN PIPELINE===####
    # create a log file
    lfname = "{}.log".format(basename)
    logfile = open(lfname, 'w')
    logfile.write("##Iterative PCA commenced at {}##\n".format(datetime.datetime.now()))

    # create output directory
    prepare_directory(outdir, logfile)

    # do pre-processing if necessary, else go straight to R
    if data_type == "variant_data":
        stage1_info = stage_text_to_ped(dfname, cfname, xsamples_filename,
                                        xsnps_filename, basename, outdir, logfile, True)
        data_source_name = "{}/{}".format(outdir, stage1_info['pfname'])
        data_type_out = "numeric_ped"
        xsamples_filename = "{}/{}".format(outdir, stage1_info['xsname'])
    elif data_type in ["rds", "rdata", "numeric_ped"]:
        data_source_name = dfname
        data_type_out = data_type
    else:
        # unreachable in practice: argparse 'choices' restricts data_type
        print("Unrecognised input data type, exiting...")
        # was sys.exit(0); an error path should not report success
        sys.exit(1)

    # do iterative pca in R (much faster)
    # NOTE(review): arguments are single-quoted but shell=True remains
    # vulnerable to quoting problems in user-supplied paths
    rCommand = ("Rscript {}/iterative_pca_plot.R '{}' '{}' '{}' '{}' "
                "'{}' '{}' '{}' '{}' '{}' "
                "'{}' '{}' '{}' '{}' '{}'".format(
                    path_here,
                    n, outdir, basename, data_source_name,
                    data_type_out, ethfname, control_tag, cases_tag, sd_cutoff,
                    cmethod, tmethod, PATH_TO_R_FUNCTIONS, xsamples_filename, xsnps_filename))

    subprocess.call(rCommand, shell=True)
    # generate dictionary of output file names
    # NOTE(review): plot-count bookkeeping reconstructed from a dump with
    # stripped indentation; confirm whether the ethnicity/tag extras also
    # apply when clustering is enabled
    if cflag:
        num_plots = 1
    else:
        num_plots = 2
    if ethfname:
        num_plots += 1
    if cases_tag or control_tag:
        num_plots += 1
    iteration_data = create_output_dict(outdir, basename, n, num_plots)

    # clean up logfile
    logfile.write("##Iterative PCA completed at {}##\n".format(datetime.datetime.now()))
    logfile.close()
    subprocess.call("mv '{}' '{}'".format(lfname, outdir), shell=True)
    lfname = "{}/{}".format(outdir, lfname)
    if cfname:
        # copy and rename config file into outdir (if provided)
        subprocess.call("cp '{}' '{}/{}.config'".format(cfname, outdir, basename), shell=True)
    # output results summary in a nice html format
    html_output = produce_report(hfname, dfname, lfname, cflag,
                                 cmethod, tmethod, sd_cutoff, iteration_data, galaxy_flag)
170 ##############################################################################
171
172 """
173 Function to convert a text file containing variant data into a numeric or genomic ped file
174 Inputs:
175 - dfname: The name of the text file to convert
176 - cfname: Configuration file containing info on which columns to find required information in,
177 as well as filtering options
178 - xsamples_filename: A file containg a list of exact sample ids to exclude
179 - xsnps_filename: A file containing a list of exact snp ids to exclude
180 - basename: Filename without the extension
181 - outdir: Full path of output directory
182 - logfile: A file to keep track of program status. Any shell commands executed will be written to this file
183 - numeric_flag: If true, ped file will be numeric.
184 If false, ped file will be genomic
185 Outputs:
186 - ped file, either numeric or genomic
187 - map file
188 - excluded samples file, containing the ids of all samples which did not pass the filters and were rejected
189 Returns:
190 a dict containing the PATHS to important output files
191 """
192 def stage_text_to_ped(dfname, cfname, xsamples_filename, xsnps_filename, basename, outdir, logfile, numeric_flag):
193 # Create ped and map files
194 logfile.write("##STAGE 1: Create .ped and .map files from input text file##\n")
195
196 pfname = "{}/{}.ped".format(outdir, basename)
197 mfname = "{}/{}.map".format(outdir, basename)
198 xsname = "{}/{}_xsamples.txt".format(outdir, basename)
199
200 pc = pedify.PedConverter()
201 # read in config file
202 rc = pc.read_config_file(cfname)
203 # Add in string filters for excluded samples and snps
204 if xsamples_filename:
205 xsamples = open(xsamples_filename, 'r').read().split('\n')
206 pc.cfg.add_string_filter("comp_generated_sample_cuts", pc.cfg.col_names['sample_id_column'],
207 "exact", "reject", xsamples)
208 if xsnps_filename:
209 xsnps = open(xsnps_filename, 'r').read().split('\n')
210 pc.cfg.add_string_filter("comp_generated_snp_cuts", pc.cfg.col_names['variant_id_column'],
211 "exact", "reject", xsnps)
212 if rc == 0:
213 print 'config file read successfully'
214 else:
215 print 'failed to read in config file successfully. Error code: {}'.format(rc)
216
217 # read in data file
218 rc = pc.read_data_file(dfname)
219 if rc == 0:
220 print 'data file read successfully'
221 else:
222 print 'failed to read in data file successfully. Error code: {}'.format(rc)
223
224 # create output
225 pc.create_ped_file(pfname, numeric=numeric_flag)
226 pc.create_map_file(mfname)
227 pc.create_excluded_samples_file(xsname)
228 outdict = {
229 "pfname": "{}.ped".format(basename),
230 "mfname": "{}.map".format(basename),
231 "xsname": "{}_xsamples.txt".format(basename),
232 "control_tag": pc.cfg.control_info['control_tag']['tag'],
233 "cases_tag": pc.cfg.control_info['cases_tag']['tag']
234 }
235
236 return outdict
237
238 """
239 Function to produce the html report of all the data
240 Inputs:
241 hfname: The ABSOLUTE PATH (including the filename) of the html file to output
242 dfname: path to datafile
243 lfname: path to logfile
244 sd_cutoff: multipler used to determine outliers
245 iterationList: a list of dicts, each containing AT LEAST:
246 - outdir: ABSOLUTE PATH to parent directory containing all the output from the iteration
247 - count: which iteration this was
248 - plotpaths: A list of paths to output plots
249 - xsname: name of file containing ids of samples excluded from this iteration
250 - ofname: name of file containing ids of outliers detected in this iteration
251 returns:
252 The html output that was written to the html file
253 """
254 def produce_report(hfname, dfname, lfname, cflag, cmethod, tmethod, sd_cutoff, iteration_list, galaxy_flag):
255 hfdir = os.path.dirname(hfname)
256 htmldoc = open(hfname, 'w')
257 iteration_data = []
258 for it in iteration_list:
259 outdir = it['outdir']
260 if galaxy_flag:
261 dirs = outdir.split("/")
262 relpath = "{}/{}".format(dirs[-2], dirs[-1])
263 else:
264 relpath = os.path.relpath(outdir, hfdir)
265 it_dict = {}
266
267 ofname = "{}/{}".format(outdir, it['ofname'])
268 xsname = "{}/{}".format(outdir, it['xsname'])
269 of = open(ofname, 'r')
270 xf = open(xsname, 'r')
271 ol = []
272 xl = []
273 for line in of:
274 ol.append(line.split(','))
275 for line in xf:
276 xl.append(line.split(','))
277 of.close()
278 xf.close()
279 # fix the paths to the images
280 rel_plot_paths = []
281 for pp in it['plot_paths']:
282 rel_plot_paths.append("{}/{}".format(relpath, pp))
283
284 it_dict['count'] = it['count']
285 it_dict['ol'] = ol
286 it_dict['xl'] = xl
287 it_dict['rel_plot_paths']= rel_plot_paths
288 iteration_data.append(it_dict)
289
290 logcontent = open(lfname, 'r').read()
291 template_values = {
292 "date": datetime.datetime.now(),
293 "dfname": dfname,
294 "lfname": lfname,
295 "logcontent": logcontent,
296 "iteration_data": iteration_data,
297 "cflag": cflag,
298 "cmethod": cmethod,
299 "tmethod": tmethod,
300 "sd_cutoff": sd_cutoff,
301 }
302 template = JINJA_ENVIRONMENT.get_template(HTML_TEMPLATE_NAME)
303 rendered_template = template.render(template_values)
304 htmldoc.write(rendered_template)
305 htmldoc.close()
306 return rendered_template
307
308 """
309 Generate the expected directory structure of the root output directory,
310 given the basname, number of iterations, and number of expected plots output by
311 the R script
312 """
313 def create_output_dict(outdir, basename, n, num_plots):
314 iteration_list = []
315 for i in range(n):
316 it_dict = {}
317 i_outdir = "{}/output_{}_iteration_{}".format(outdir, basename, i)
318 it_dict['outdir'] = i_outdir
319 it_dict['ofname'] = "{}_outliers.txt".format(basename)
320 it_dict['xsname'] = "{}_xfile.txt".format(basename)
321 it_dict['count'] = i
322 it_dict['plot_paths'] = []
323 for j in range(num_plots):
324 it_dict['plot_paths'].append("{}_plot_number_{}.png".format(basename, j+1))
325 iteration_list.append(it_dict)
326 return iteration_list
327
328
329 ###########################UTILITY########################################
def prepare_directory(outdir, logfile):
    """Delete and recreate *outdir* as an empty directory, logging the actions.

    The log records the equivalent shell commands, but the work is done with
    shutil/os so there are no shell-quoting pitfalls and failures raise in
    Python instead of being silently ignored.
    """
    # set up output directory
    logfile.write("##Setting up output directory:{}##\n".format(outdir))
    shutil.rmtree(outdir, ignore_errors=True)
    logfile.write("rm -rf '{}'\n".format(outdir))
    if not os.path.isdir(outdir):
        os.makedirs(outdir)
    logfile.write("mkdir -p '{}'\n".format(outdir))
337
def merge_dicts(dictList):
    """Merge a list of dicts into a single dict.

    Later entries win on key clashes; non-dict items are skipped.
    Uses isinstance rather than type comparison so dict subclasses merge too.
    """
    result = {}
    for d in dictList:
        if isinstance(d, dict):
            result.update(d)
    return result
345
# Some odd rules regarding character escaping and such in galaxy: fix those here
def format_galaxy_config_file(cfname):
    """Rewrite *cfname* in place, undoing galaxy's operator/character escaping.

    In the #numeric_filters section the third comma-separated token is a
    comparison operator escaped as a short code; it is mapped back to the
    real operator. '__pd__' is restored to '#' on every line.

    Returns 0 on success, 1 on a malformed filter line or unknown op code
    (in which case the file is left untouched).
    """
    CHAR_CONVERSION = {
        'g': '>',
        'l': '<',
        'e': '==',
        # BUGFIX: these two were swapped ('ge' mapped to '<=' and 'le' to
        # '>='); 'g'+'e' must mean >= and 'l'+'e' must mean <=
        'ge': '>=',
        'le': '<=',
    }
    with open(cfname, 'r') as cfile:
        lines = cfile.readlines()

    newlines = []
    section = None
    for line in lines:
        nl = line
        # startswith is safe on empty lines, unlike the original line[0]
        if line.startswith('#'):
            section = "numeric_filters" if line.strip() == "#numeric_filters" else None
        elif section:
            tokens = line.split(',')
            if len(tokens) < 3:
                # malformed filter line: treat like a bad op code
                return 1
            op = tokens[2]
            if op in CHAR_CONVERSION:
                tokens[2] = CHAR_CONVERSION[op]
            elif op not in CHAR_CONVERSION.values():
                # error code 1; bad op codes
                return 1
            nl = ','.join(tokens)

        nl = nl.replace("__pd__", '#')
        newlines.append(nl)

    with open(cfname, 'w') as cfile:
        cfile.writelines(newlines)
    return 0
390 ##############################################################################
391
# Script entry point: run the pipeline only when executed directly,
# not when imported as a module.
if __name__ == "__main__":
    main()