import os
import subprocess
import sys
import argparse
import datetime
import csv

import jinja2

import pedify

JINJA_ENVIRONMENT = jinja2.Environment(
    loader=jinja2.FileSystemLoader(os.path.dirname(__file__)),
    extensions=['jinja2.ext.autoescape'],
    autoescape=True)
DATA_TYPES = ["variant_data", "rds", "rdata", "numeric_ped"]
CLUSTERING_METHODS = ["dbscan", "hclust", "kmeans"]
CLUSTER_TRIMMING = ["sd", "mcd", "dbscan_outliers_only"]
HTML_TEMPLATE_NAME = "pca_report.html"
DEFAULT_SD_CUTOFF = 2
PATH_TO_R_FUNCTIONS = "{}/R_functions".format(os.path.dirname(os.path.abspath(__file__)))

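# Example invocation (script, data file and config file names are illustrative only):
#   python <this_script.py> variants.txt variant_data 5 --config_file variants.config \
#       --clustering_flag --clustering_method dbscan --cluster_trimming dbscan_outliers_only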

def main():
	###===ARGUMENT PARSING===###
	parser = argparse.ArgumentParser()
	parser.add_argument("datafile_name", help="name of data file")
	parser.add_argument("data_type", type=str.lower, choices=DATA_TYPES, help="type of data file")
	parser.add_argument("iterations", help="max number of iterations to run",
						type=int)
	parser.add_argument("--control_tag", help="substring present in the ids of all control samples, e.g. LP")
	parser.add_argument("--cases_tag", help="substring present in the ids of all case samples, e.g. HAPS")
	parser.add_argument("--config_file", help="name of file containing config info")
	parser.add_argument("--clustering_flag", help="Flag to indicate whether to do clustering", action="store_true")
	parser.add_argument("--clustering_method", type=str.lower, choices=CLUSTERING_METHODS,
						help="algorithm used to find clusters")
	parser.add_argument("--cluster_trimming", type=str.lower, choices=CLUSTER_TRIMMING,
						help="method used to identify and trim outlier clusters")
	parser.add_argument("-s", "--sd_cutoff", help="number of standard deviations at which to cut outliers")
	parser.add_argument("-o", "--out", help="base name of output files and directories; defaults to datafile name")
	group = parser.add_mutually_exclusive_group()
	group.add_argument("-r", "--relative_prefix", help="RELATIVE path CONTAINING the output directory")
	group.add_argument("-a", "--absolute_prefix", help="ABSOLUTE path CONTAINING the output directory")
	parser.add_argument("--html_file", help="ABSOLUTE PATH of output html file")
	parser.add_argument("--ethnicity_file", help="File containing data about the ethnicity of the samples")
	parser.add_argument("--galaxy", help="Flag to indicate we are running this script in Galaxy",
						action="store_true")
	parser.add_argument("--reject_snps", help="file containing SNP ids to ignore/remove")
	parser.add_argument("--reject_samples", help="file containing sample ids to ignore/remove")

	args = parser.parse_args()

	# enforce conditionally required args
	if args.data_type == "variant_data" and args.config_file is None:
		parser.error("Input variant data files require a complementary config file "
			"specified with the '--config_file <filename>' flag")

	if args.config_file and args.data_type != "variant_data":
		print ("Warning! Your config file will be ignored as the input data is not "
			"a text file containing variant data")

	if args.clustering_flag:
		if args.clustering_method is None or args.cluster_trimming is None:
			parser.error("If --clustering_flag is set, --clustering_method and "
				"--cluster_trimming must also be specified")

	if (args.clustering_method or args.cluster_trimming) and not args.clustering_flag:
		parser.error("--clustering_method and --cluster_trimming are only valid "
			"if --clustering_flag is also set")

	# dbscan_outliers_only trimming is only valid when the clustering method is dbscan
	if args.cluster_trimming == "dbscan_outliers_only" and args.clustering_method != "dbscan":
		parser.error("The dbscan_outliers_only cluster trimming method is only "
			"valid if --clustering_method == dbscan")

	dfname = args.datafile_name
	data_type = args.data_type
	n = args.iterations
	control_tag = args.control_tag
	cases_tag = args.cases_tag
	cfname = args.config_file
	cflag = args.clustering_flag
	cmethod = (args.clustering_method if args.clustering_method else "none")
	tmethod = (args.cluster_trimming if args.cluster_trimming else "none")
	sd_cutoff = (args.sd_cutoff if args.sd_cutoff else DEFAULT_SD_CUTOFF)
	basename = (args.out if args.out else dfname.split('/')[-1].split('.')[0])
	hfname = args.html_file
	galaxy_flag = args.galaxy
	ethfname = args.ethnicity_file
	xsnps_filename = args.reject_snps
	xsamples_filename = args.reject_samples

	# work around Galaxy's odd handling of the config file before using it
	if galaxy_flag and cfname:
		rc = format_galaxy_config_file(cfname)
		if rc != 0:
			print "Bad config file"
			sys.exit(1)

	path_here = os.path.dirname(os.path.abspath(__file__))
	# determine output directory
	if args.relative_prefix:
		outdir = "{}/{}/full_output_{}".format(path_here, args.relative_prefix, basename)
	elif args.absolute_prefix:
		outdir = "{}/full_output_{}".format(args.absolute_prefix, basename)
	else:
		outdir = "{}/full_output_{}".format(path_here, basename)
	if hfname is None:
		hfname = "{}/{}.html".format(outdir, basename)

	####===BEGIN PIPELINE===####
	# create a log file
	lfname = "{}.log".format(basename)
	logfile = open(lfname, 'w')
	logfile.write("##Iterative PCA commenced at {}##\n".format(datetime.datetime.now()))

	# create output directory
	prepare_directory(outdir, logfile)

	# do pre-processing if necessary, else go straight to R
	if data_type == "variant_data":
		stage1_info = stage_text_to_ped(dfname, cfname, xsamples_filename,
			xsnps_filename, basename, outdir, logfile, True)
		data_source_name = "{}/{}".format(outdir, stage1_info['pfname'])
		data_type_out = "numeric_ped"
		xsamples_filename = "{}/{}".format(outdir, stage1_info['xsname'])
	elif data_type in ["rds", "rdata", "numeric_ped"]:
		data_source_name = dfname
		data_type_out = data_type
	else:
		print "Unrecognised input data type, exiting..."
		sys.exit(1)

	# do iterative pca in R (much faster)
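	# iterative_pca_plot.R receives 14 positional arguments, in this order:
	#   iterations, output dir, basename, data file, data type, ethnicity file,
	#   control tag, cases tag, sd cutoff, clustering method, trimming method,
	#   path to the R_functions directory, excluded-samples file, excluded-SNPs file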
	rCommand = ("Rscript {}/iterative_pca_plot.R "
				"'{}' '{}' '{}' '{}' '{}' '{}' '{}' "
				"'{}' '{}' '{}' '{}' '{}' '{}' '{}'").format(
		path_here,
		n, outdir, basename, data_source_name,
		data_type_out, ethfname, control_tag, cases_tag, sd_cutoff,
		cmethod, tmethod, PATH_TO_R_FUNCTIONS, xsamples_filename, xsnps_filename)

	subprocess.call(rCommand, shell=True)
	# generate dictionary of output file names
	if cflag:
		num_plots = 1
	else:
		num_plots = 2
	if ethfname:
		num_plots += 1
	if cases_tag or control_tag:
		num_plots += 1
	iteration_data = create_output_dict(outdir, basename, n, num_plots)

	# clean up logfile
	logfile.write("##Iterative PCA completed at {}##\n".format(datetime.datetime.now()))
	logfile.close()
	subprocess.call("mv '{}' '{}'".format(lfname, outdir), shell=True)
	lfname = "{}/{}".format(outdir, lfname)
	if cfname:
		# copy and rename config file into outdir (if provided)
		subprocess.call("cp '{}' '{}/{}.config'".format(cfname, outdir, basename), shell=True)
	# output results summary in a nice html format
	html_output = produce_report(hfname, dfname, lfname, cflag,
		cmethod, tmethod, sd_cutoff, iteration_data, galaxy_flag)
##############################################################################

"""
Function to convert a text file containing variant data into a numeric or genomic ped file
Inputs:
- dfname: The name of the text file to convert
- cfname: Configuration file specifying which columns hold the required information,
			as well as filtering options
- xsamples_filename: A file containing a list of exact sample ids to exclude
- xsnps_filename: A file containing a list of exact snp ids to exclude
- basename: Filename without the extension
- outdir: Full path of output directory
- logfile: A file to keep track of program status. Any shell commands executed will be written to this file
- numeric_flag: If true, ped file will be numeric.
				If false, ped file will be genomic
Outputs:
- ped file, either numeric or genomic
- map file
- excluded samples file, containing the ids of all samples which did not pass the filters and were rejected
Returns:
	a dict containing the names (relative to outdir) of the important output files,
	plus the control and cases tags read from the config file
"""
def stage_text_to_ped(dfname, cfname, xsamples_filename, xsnps_filename, basename, outdir, logfile, numeric_flag):
	# Create ped and map files
	logfile.write("##STAGE 1: Create .ped and .map files from input text file##\n")

	pfname = "{}/{}.ped".format(outdir, basename)
	mfname = "{}/{}.map".format(outdir, basename)
	xsname = "{}/{}_xsamples.txt".format(outdir, basename)

	pc = pedify.PedConverter()
	# read in config file
	rc = pc.read_config_file(cfname)
	if rc == 0:
		print 'config file read successfully'
	else:
		print 'failed to read config file. Error code: {}'.format(rc)
	# add string filters for excluded samples and snps
	if xsamples_filename:
		xsamples = open(xsamples_filename, 'r').read().split('\n')
		pc.cfg.add_string_filter("comp_generated_sample_cuts", pc.cfg.col_names['sample_id_column'],
			"exact", "reject", xsamples)
	if xsnps_filename:
		xsnps = open(xsnps_filename, 'r').read().split('\n')
		pc.cfg.add_string_filter("comp_generated_snp_cuts", pc.cfg.col_names['variant_id_column'],
			"exact", "reject", xsnps)

	# read in data file
	rc = pc.read_data_file(dfname)
	if rc == 0:
		print 'data file read successfully'
	else:
		print 'failed to read data file. Error code: {}'.format(rc)

	# create output
	pc.create_ped_file(pfname, numeric=numeric_flag)
	pc.create_map_file(mfname)
	pc.create_excluded_samples_file(xsname)
	outdict = {
		"pfname": "{}.ped".format(basename),
		"mfname": "{}.map".format(basename),
		"xsname": "{}_xsamples.txt".format(basename),
		"control_tag": pc.cfg.control_info['control_tag']['tag'],
		"cases_tag": pc.cfg.control_info['cases_tag']['tag']
	}

	return outdict
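
# Illustrative call (file names here are hypothetical):
#   info = stage_text_to_ped("variants.txt", "variants.config", None, None,
#                            "variants", "/path/to/full_output_variants", logfile, True)
#   # info["pfname"] == "variants.ped" and info["xsname"] == "variants_xsamples.txt",
#   # both relative to outdir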

"""
Function to produce the html report of all the data
Inputs:
hfname: The ABSOLUTE PATH (including the filename) of the html file to output
dfname: path to datafile
lfname: path to logfile
cflag: whether clustering was performed
cmethod: clustering method used ("none" if clustering was disabled)
tmethod: cluster trimming method used ("none" if clustering was disabled)
sd_cutoff: multiplier used to determine outliers
galaxy_flag: whether the script is being run from Galaxy (affects how relative plot paths are built)
iteration_list: a list of dicts, each containing AT LEAST:
	- outdir: ABSOLUTE PATH to parent directory containing all the output from the iteration
	- count: which iteration this was
	- plot_paths: A list of paths to output plots
	- xsname: name of file containing ids of samples excluded from this iteration
	- ofname: name of file containing ids of outliers detected in this iteration
Returns:
	The html output that was written to the html file
"""
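
# A single iteration_list entry might look like (illustrative values):
#   {"outdir": "/path/to/full_output_study/output_study_iteration_0", "count": 0,
#    "ofname": "study_outliers.txt", "xsname": "study_xfile.txt",
#    "plot_paths": ["study_plot_number_1.png", "study_plot_number_2.png"]}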
def produce_report(hfname, dfname, lfname, cflag, cmethod, tmethod, sd_cutoff, iteration_list, galaxy_flag):
	hfdir = os.path.dirname(hfname)
	htmldoc = open(hfname, 'w')
	iteration_data = []
	for it in iteration_list:
		outdir = it['outdir']
		if galaxy_flag:
			dirs = outdir.split("/")
			relpath = "{}/{}".format(dirs[-2], dirs[-1])
		else:
			relpath = os.path.relpath(outdir, hfdir)
		it_dict = {}

		ofname = "{}/{}".format(outdir, it['ofname'])
		xsname = "{}/{}".format(outdir, it['xsname'])
		of = open(ofname, 'r')
		xf = open(xsname, 'r')
		ol = []
		xl = []
		for line in of:
			ol.append(line.split(','))
		for line in xf:
			xl.append(line.split(','))
		of.close()
		xf.close()
		# fix the paths to the images
		rel_plot_paths = []
		for pp in it['plot_paths']:
			rel_plot_paths.append("{}/{}".format(relpath, pp))

		it_dict['count'] = it['count']
		it_dict['ol'] = ol
		it_dict['xl'] = xl
		it_dict['rel_plot_paths'] = rel_plot_paths
		iteration_data.append(it_dict)

	logcontent = open(lfname, 'r').read()
	template_values = {
		"date": datetime.datetime.now(),
		"dfname": dfname,
		"lfname": lfname,
		"logcontent": logcontent,
		"iteration_data": iteration_data,
		"cflag": cflag,
		"cmethod": cmethod,
		"tmethod": tmethod,
		"sd_cutoff": sd_cutoff,
	}
	template = JINJA_ENVIRONMENT.get_template(HTML_TEMPLATE_NAME)
	rendered_template = template.render(template_values)
	htmldoc.write(rendered_template)
	htmldoc.close()
	return rendered_template

"""
Build a list of dicts describing the expected directory structure of the root output
directory, given the basename, the number of iterations, and the number of plots the
R script is expected to produce per iteration
"""
def create_output_dict(outdir, basename, n, num_plots):
	iteration_list = []
	for i in range(n):
		it_dict = {}
		i_outdir = "{}/output_{}_iteration_{}".format(outdir, basename, i)
		it_dict['outdir'] = i_outdir
		it_dict['ofname'] = "{}_outliers.txt".format(basename)
		it_dict['xsname'] = "{}_xfile.txt".format(basename)
		it_dict['count'] = i
		it_dict['plot_paths'] = []
		for j in range(num_plots):
			it_dict['plot_paths'].append("{}_plot_number_{}.png".format(basename, j+1))
		iteration_list.append(it_dict)
	return iteration_list
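
# For example, with basename "study", n == 2 and num_plots == 2 (illustrative values),
# the layout expected under outdir is:
#   output_study_iteration_0/
#       study_outliers.txt, study_xfile.txt, study_plot_number_1.png, study_plot_number_2.png
#   output_study_iteration_1/
#       (same file names)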


###########################UTILITY########################################
def prepare_directory(outdir, logfile):
	# set up a clean output directory, removing any previous output of the same name
	logfile.write("##Setting up output directory:{}##\n".format(outdir))
	subprocess.call("rm -rf '{}'".format(outdir), shell=True)
	logfile.write("rm -rf '{}'\n".format(outdir))
	subprocess.call("mkdir -p '{}'".format(outdir), shell=True)
	logfile.write("mkdir -p '{}'\n".format(outdir))

# takes a list of dicts and merges them into a single dict; non-dict entries are skipped
def merge_dicts(dictList):
	result = {}
	for d in dictList:
		if isinstance(d, dict):
			result.update(d)
	return result
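
# e.g. merge_dicts([{'a': 1}, {'b': 2}, None]) == {'a': 1, 'b': 2}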

# Some odd rules regarding character escaping and such in galaxy: fix those here
def format_galaxy_config_file(cfname):
	# letter codes the Galaxy wrapper uses in place of comparison operators
	CHAR_CONVERSION = {
		'g': '>',
		'l': '<',
		'e': '==',
		'ge': '>=',
		'le': '<='
	}
	cfile = open(cfname, 'r')
	lines = cfile.readlines()
	cfile.close()

	newlines = []
	section = None
	for l in lines:
		nl = l
		if l.startswith('#'):
			if l.strip() == "#numeric_filters":
				section = "numeric_filters"
			else:
				section = None
		else:
			if section:
				tokens = l.split(',')
				op = tokens[2]
				if op in CHAR_CONVERSION:
					newop = CHAR_CONVERSION[op]
				elif op in CHAR_CONVERSION.values():
					newop = op
				else:
					# error code 1: unrecognised comparison operator
					return 1
				tokens[2] = newop
				nl = ','.join(tokens)

		# Galaxy substitutes '__pd__' for the '#' character; restore it
		nl = nl.replace("__pd__", '#')
		newlines.append(nl)

	cfile = open(cfname, 'w')
	cfile.writelines(newlines)
	cfile.close()
	return 0
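
# For example, a line in the #numeric_filters section such as
#   "my_filter,QUAL,g,30"        (all fields other than the operator are hypothetical)
# would have its third field converted, giving "my_filter,QUAL,>,30".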
##############################################################################

if __name__ == "__main__":
	main()