diff iterative_pca.py @ 0:64e75e21466e draft default tip

Uploaded
author pmac
date Wed, 01 Jun 2016 03:38:39 -0400
parents
children
line wrap: on
line diff
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/iterative_pca.py	Wed Jun 01 03:38:39 2016 -0400
@@ -0,0 +1,393 @@
import argparse
import csv
import datetime
import os
import shutil
import subprocess
import sys
+
+import jinja2
+
+import pedify
+
# Jinja2 environment used to render the HTML report; templates are looked up
# in this script's own directory, with autoescaping enabled.
JINJA_ENVIRONMENT = jinja2.Environment(
    loader=jinja2.FileSystemLoader(os.path.dirname(__file__)),
    extensions=['jinja2.ext.autoescape'],
    autoescape=True)
# Accepted values for the positional data_type argument.
DATA_TYPES = ["variant_data", "rds", "rdata", "numeric_ped"]
# Accepted values for --clustering_method.
CLUSTERING_METHODS = ["dbscan", "hclust", "kmeans"]
# Accepted values for --cluster_trimming.
CLUSTER_TRIMMING = ["sd", "mcd", "dbscan_outliers_only"]
HTML_TEMPLATE_NAME = "pca_report.html"  # template rendered by produce_report
DEFAULT_SD_CUTOFF = 2  # default outlier cutoff (in SDs) when -s is not given
# Directory holding the R helper functions, resolved relative to this file.
PATH_TO_R_FUNCTIONS = "{}/R_functions".format(os.path.dirname(os.path.abspath(__file__)))
+
+
def main():
	"""Entry point: parse CLI arguments, run the iterative PCA pipeline and
	write an HTML report.

	Pipeline stages, in order:
	1. argument parsing and cross-option validation,
	2. optional conversion of raw variant text data to a numeric ped file,
	3. the iterative PCA itself, delegated to an external R script,
	4. collection of per-iteration output names and HTML report rendering.
	"""
	###===ARGUMENT PARSING===###
	parser = argparse.ArgumentParser()
	parser.add_argument("datafile_name", help="name of data file")
	parser.add_argument("data_type", type=str.lower, choices=DATA_TYPES, help="type of data file")
	parser.add_argument("iterations", help="max number of iterations to run",
						type=int)
	parser.add_argument("--control_tag", help="substring present in the ids of all control samples, e.g. LP")
	parser.add_argument("--cases_tag", help="substring present in the ids of all case samples, e.g. HAPS")
	parser.add_argument("--config_file", help="name of file containing config info")
	parser.add_argument("--clustering_flag", help="Flag to indicate whether to do clustering", action="store_true")
	parser.add_argument("--clustering_method", type=str.lower, choices=CLUSTERING_METHODS,
						help="algorithm used to find clusters")
	parser.add_argument("--cluster_trimming", type=str.lower, choices=CLUSTER_TRIMMING, 
						help="method used to identify and trim outlier clusters")
	parser.add_argument("-s", "--sd_cutoff", help="number of standard deviations at which to cut outliers")
	parser.add_argument("-o", "--out", help="base name of output files and directories; defaults to datafile name")
	group = parser.add_mutually_exclusive_group()
	group.add_argument("-r", "--relative_prefix", help="RELATIVE path CONTAINING the output directory")
	group.add_argument("-a", "--absolute_prefix", help="ABSOLUTE path CONTAINING the output directory")
	parser.add_argument("--html_file", help="ABSOLUTE PATH of output html file")
	parser.add_argument("--ethnicity_file", help="File containing data about the ethnicity of the samples")
	parser.add_argument("--galaxy", help="Flag to indicate we are running this script in Galaxy", 
						action="store_true")
	parser.add_argument("--reject_snps", help="file containing SNP ids to ignore/remove")
	parser.add_argument("--reject_samples", help="file containing sample ids to ignore/remove")

	args = parser.parse_args()

	# enforce conditionally required args
	# variant text input cannot be parsed without the column layout that the
	# config file describes
	if args.data_type == "variant_data":
		if args.config_file == None:
			parser.error("Input variant data files require a complementary config file\
 specified with '--config_file <filename>' flag")
	
	if args.config_file:
		if args.data_type != "variant_data":
			print "Warning! Your config file will be ignored as the input data is not a\
 text file containing variant data"

	# the clustering options come as a package: the flag requires both
	# sub-options, and the sub-options require the flag
	if args.clustering_flag:
		if (args.clustering_method == None) or (args.cluster_trimming == None):
			parser.error("If --clustering_flag is set,\
 --clustering_method and --cluster_trimming options must also be specified")

	if args.clustering_method or args.cluster_trimming:
		if args.clustering_flag == False:
			parser.error("--clustering_method and --cluster_trimming\
 options only valid if --clustering_flag is also set")

	# dbscan outliers only valid if cluster method == "DBSCAN"
	if args.cluster_trimming == "dbscan_outliers_only":
		if args.clustering_method != "dbscan":
			parser.error("dbscan_outliers_only cluster trimming method only\
 valid if --clustering_method == dbscan")

	# unpack arguments into locals, applying defaults where needed
	dfname = args.datafile_name
	data_type = args.data_type
	n = args.iterations
	control_tag = args.control_tag
	cases_tag = args.cases_tag
	cfname = args.config_file
	cflag = args.clustering_flag
	cmethod = (args.clustering_method if args.clustering_method else "none")
	tmethod = (args.cluster_trimming if args.cluster_trimming else "none")
	sd_cutoff = (args.sd_cutoff if args.sd_cutoff else DEFAULT_SD_CUTOFF)
	# default output basename: datafile name stripped of directory/extension
	basename = (args.out if args.out else dfname.split('/')[-1].split('.')[0])
	hfname = args.html_file
	galaxy_flag = args.galaxy
	ethfname = args.ethnicity_file
	xsnps_filename = args.reject_snps
	xsamples_filename = args.reject_samples

	# function used to handle strange galaxy behaviour...
	if galaxy_flag and cfname:
		rc = format_galaxy_config_file(cfname)
		if rc != 0:
			print "Bad config file"
			sys.exit()

	path_here = os.path.dirname(os.path.abspath(__file__))
	# determine output directory
	if args.relative_prefix:
		outdir = "{}/{}/full_output_{}".format(path_here, args.relative_prefix, basename)
	elif args.absolute_prefix:
		outdir = "{}/full_output_{}".format(args.absolute_prefix, basename)
	else:
		outdir = "{}/full_output_{}".format(path_here, basename)
	# default html report location: inside the output directory itself
	if hfname == None:
		hfname = "{}/{}.html".format(outdir, basename)
	
	####===BEGIN PIPELINE===####
	# create a log file (in the cwd; moved into outdir at the end)
	lfname = "{}.log".format(basename)
	logfile = open(lfname, 'w')
	logfile.write("##Iterative PCA commenced at {}##\n".format(datetime.datetime.now()))
	
	# create output directory
	prepare_directory(outdir, logfile)

	# do pre-processing if necessary, else go straight to R
	if data_type == "variant_data":
		stage1_info = stage_text_to_ped(dfname, cfname, xsamples_filename, 
			xsnps_filename, basename, outdir, logfile, True)
		data_source_name = "{}/{}".format(outdir, stage1_info['pfname'])
		data_type_out = "numeric_ped"
		xsamples_filename = "{}/{}".format(outdir, stage1_info['xsname'])
	elif data_type in ["rds", "rdata", "numeric_ped"]:
		# R-native formats are passed through to the R script untouched
		data_source_name = dfname
		data_type_out = data_type
	else:
		print "Unrecognised input data type, exiting..."
		sys.exit(0)

	# do iterative pca in R (much faster)
	rCommand = "Rscript {}/iterative_pca_plot.R '{}' '{}' '{}' '{}' \
												'{}' '{}' '{}' '{}' '{}'\
												'{}' '{}' '{}' '{}' '{}'".format(
		path_here, 
		n, outdir, basename, data_source_name, 
		data_type_out, ethfname, control_tag, cases_tag, sd_cutoff, 
		cmethod, tmethod, PATH_TO_R_FUNCTIONS, xsamples_filename, xsnps_filename)

	subprocess.call(rCommand, shell=True)
	# generate dictionary of output file names
	# NOTE(review): the R script appears to emit a different number of plots
	# depending on which optional inputs were supplied — confirm against
	# iterative_pca_plot.R before changing these counts
	if cflag:
		num_plots = 1
	else:
		num_plots = 2
	if ethfname:
		num_plots += 1
	if cases_tag or control_tag:
		num_plots += 1
	iteration_data = create_output_dict(outdir, basename, n, num_plots)

	# clean up logfile
	logfile.write("##Iterative PCA completed at {}##\n".format(datetime.datetime.now()))
	logfile.close()
	subprocess.call("mv '{}' '{}'".format(lfname, outdir), shell=True)
	lfname = "{}/{}".format(outdir, lfname)
	if cfname:
		# copy and rename config file into outdir (if provided)
		subprocess.call("cp '{}' '{}/{}.config'".format(cfname, outdir, basename), shell=True)
	# output results summary in a nice html format
	html_output = produce_report(hfname, dfname, lfname, cflag, 
		cmethod, tmethod, sd_cutoff, iteration_data, galaxy_flag)
+##############################################################################
+
+""" 
+Function to convert a text file containing variant data into a numeric or genomic ped file
+Inputs:
+- dfname: The name of the text file to convert
+- cfname: Configuration file containing info on which columns to find required information in,
+			as well as filtering options
+- xsamples_filename: A file containg a list of exact sample ids to exclude
+- xsnps_filename: A file containing a list of exact snp ids to exclude
+- basename: Filename without the extension
+- outdir: Full path of output directory
+- logfile: A file to keep track of program status. Any shell commands executed will be written to this file
+- numeric_flag: If true, ped file will be numeric.
+				If false, ped file will be genomic
+Outputs:
+- ped file, either numeric or genomic
+- map file
+- excluded samples file, containing the ids of all samples which did not pass the filters and were rejected
+Returns:
+	a dict containing the PATHS to important output files 
+"""
+def stage_text_to_ped(dfname, cfname, xsamples_filename, xsnps_filename, basename, outdir, logfile, numeric_flag):
+	# Create ped and map files 
+	logfile.write("##STAGE 1: Create .ped and .map files from input text file##\n")
+
+	pfname = "{}/{}.ped".format(outdir, basename)
+	mfname = "{}/{}.map".format(outdir, basename)
+	xsname = "{}/{}_xsamples.txt".format(outdir, basename)
+
+	pc = pedify.PedConverter()
+	# read in config file
+	rc = pc.read_config_file(cfname)
+	# Add in string filters for excluded samples and snps
+	if xsamples_filename:
+		xsamples = open(xsamples_filename, 'r').read().split('\n')
+		pc.cfg.add_string_filter("comp_generated_sample_cuts", pc.cfg.col_names['sample_id_column'], 
+			"exact", "reject", xsamples)
+	if xsnps_filename:
+		xsnps = open(xsnps_filename, 'r').read().split('\n')
+		pc.cfg.add_string_filter("comp_generated_snp_cuts", pc.cfg.col_names['variant_id_column'], 
+			"exact", "reject", xsnps)
+	if rc == 0:
+		print 'config file read successfully'
+	else:
+		print 'failed to read in config file successfully. Error code: {}'.format(rc)
+	
+	# read in data file
+	rc = pc.read_data_file(dfname)
+	if rc == 0:
+		print 'data file read successfully'
+	else:
+		print 'failed to read in data file successfully. Error code: {}'.format(rc)	
+	
+	# create output
+	pc.create_ped_file(pfname, numeric=numeric_flag)
+	pc.create_map_file(mfname)
+	pc.create_excluded_samples_file(xsname)
+	outdict = {
+		"pfname": "{}.ped".format(basename),
+		"mfname": "{}.map".format(basename),
+		"xsname": "{}_xsamples.txt".format(basename),
+		"control_tag": pc.cfg.control_info['control_tag']['tag'],
+		"cases_tag": pc.cfg.control_info['cases_tag']['tag']
+	}
+
+	return outdict
+
+"""
+Function to produce the html report of all the data
+Inputs:
+hfname: The ABSOLUTE PATH (including the filename) of the html file to output
+dfname: path to datafile
+lfname: path to logfile
+sd_cutoff: multipler used to determine outliers
+iterationList: a list of dicts, each containing AT LEAST:
+	- outdir: ABSOLUTE PATH to parent directory containing all the output from the iteration
+	- count: which iteration this was
+	- plotpaths: A list of paths to output plots
+	- xsname: name of file containing ids of samples excluded from this iteration
+	- ofname: name of file containing ids of outliers detected in this iteration
+returns:
+	The html output that was written to the html file
+"""
+def produce_report(hfname, dfname, lfname, cflag, cmethod, tmethod, sd_cutoff, iteration_list, galaxy_flag):
+	hfdir = os.path.dirname(hfname)
+	htmldoc = open(hfname, 'w')
+	iteration_data = []
+	for it in iteration_list:
+		outdir = it['outdir']
+		if galaxy_flag:
+			dirs = outdir.split("/")
+			relpath = "{}/{}".format(dirs[-2], dirs[-1])
+		else:
+			relpath = os.path.relpath(outdir, hfdir)
+		it_dict = {}
+
+		ofname = "{}/{}".format(outdir, it['ofname'])
+		xsname = "{}/{}".format(outdir, it['xsname'])
+		of = open(ofname, 'r')
+		xf = open(xsname, 'r')
+		ol = []
+		xl = []
+		for line in of:
+			ol.append(line.split(','))
+		for line in xf:
+			xl.append(line.split(','))
+		of.close()
+		xf.close()
+		# fix the paths to the images
+		rel_plot_paths = []
+		for pp in it['plot_paths']:
+			rel_plot_paths.append("{}/{}".format(relpath, pp))
+
+		it_dict['count'] = it['count']
+		it_dict['ol'] = ol
+		it_dict['xl'] = xl
+		it_dict['rel_plot_paths']= rel_plot_paths
+		iteration_data.append(it_dict)
+
+	logcontent = open(lfname, 'r').read()
+	template_values = {
+		"date": datetime.datetime.now(),
+		"dfname": dfname,
+		"lfname": lfname,
+		"logcontent": logcontent,
+		"iteration_data": iteration_data,
+		"cflag": cflag,
+		"cmethod": cmethod,
+		"tmethod": tmethod,
+		"sd_cutoff": sd_cutoff,
+	}
+	template = JINJA_ENVIRONMENT.get_template(HTML_TEMPLATE_NAME)
+	rendered_template = template.render(template_values)
+	htmldoc.write(rendered_template)
+	htmldoc.close()
+	return rendered_template
+
+""" 
+Generate the expected directory structure of the root output directory,
+given the basname, number of iterations, and number of expected plots output by
+the R script
+"""
+def create_output_dict(outdir, basename, n, num_plots):
+	iteration_list = []
+	for i in range(n):
+		it_dict = {}
+		i_outdir = "{}/output_{}_iteration_{}".format(outdir, basename, i)
+		it_dict['outdir'] = i_outdir
+		it_dict['ofname'] = "{}_outliers.txt".format(basename)
+		it_dict['xsname'] = "{}_xfile.txt".format(basename)
+		it_dict['count'] = i
+		it_dict['plot_paths'] = []
+		for j in range(num_plots):
+			it_dict['plot_paths'].append("{}_plot_number_{}.png".format(basename, j+1))
+		iteration_list.append(it_dict)
+	return iteration_list
+
+
+###########################UTILITY########################################
def prepare_directory(outdir, logfile):
	"""Create a fresh, empty output directory at outdir, logging each step.

	Any existing file or directory at that path is removed first.

	Replaces the previous shell-based "rm -rf '...'" / "mkdir -p '...'"
	subprocess calls: shutil/os handle arbitrary path characters (quotes,
	spaces) safely and avoid spawning a shell entirely.
	"""
	logfile.write("##Setting up output directory:{}##\n".format(outdir))
	# equivalent of rm -rf: remove a plain file or a whole tree, tolerating
	# a missing path
	if os.path.isfile(outdir):
		os.remove(outdir)
	shutil.rmtree(outdir, ignore_errors=True)
	logfile.write("rm -rf '{}'\n".format(outdir))
	# equivalent of mkdir -p (parents created as needed)
	os.makedirs(outdir)
	logfile.write("mkdir -p '{}'\n".format(outdir))
+
# takes a list of dicts, merges them into a single big dict
def merge_dicts(dictList):
	"""Merge a list of dicts into one dict; later entries win on key clashes.

	Non-dict items in the list are silently skipped. Uses isinstance rather
	than the original type(...) == dict comparison, so dict subclasses
	(e.g. OrderedDict) are merged too.
	"""
	result = {}
	for d in dictList:
		if isinstance(d, dict):
			result.update(d)
	return result
+
# Some odd rules regarding character escaping and such in galaxy: fix those here
def format_galaxy_config_file(cfname):
	"""Rewrite a galaxy-mangled config file in place so it can be parsed.

	Galaxy passes the comparison operators of numeric filter rows as letter
	codes; translate them back into real operators, and restore '#'
	characters that galaxy encoded as '__pd__'.

	Returns 0 on success, 1 if a numeric filter row is malformed or holds an
	unknown operator code.
	"""
	# letter code -> operator. BUGFIX: the original mapped 'ge' to '<=' and
	# 'le' to '>=', inverting both comparisons; corrected so each code
	# matches its name (ge = greater-or-equal, le = less-or-equal).
	CHAR_CONVERSION = {
		'g': '>',
		'l': '<',
		'e': '==',
		'ge': '>=',
		'le': '<='
	}
	with open(cfname, 'r') as cfile:
		lines = cfile.readlines()

	newlines = []
	section = None
	for l in lines:
		nl = l
		if l[0] == '#':
			# '#' lines are section headers; only rows inside the
			# numeric_filters section need operator translation
			if l.strip() == "#numeric_filters":
				section = "numeric_filters"
			else:
				section = None
		else:
			if section:
				tokens = l.split(',')
				if len(tokens) < 3:
					# error code 1: filter row too short to hold an operator
					return 1
				op = tokens[2]
				if op in CHAR_CONVERSION:
					newop = CHAR_CONVERSION[op]
				elif op in CHAR_CONVERSION.values():
					# already a real operator; leave it untouched
					newop = op
				else:
					# error code 1; bad op codes
					return 1
				tokens[2] = newop
				nl = ','.join(tokens)

		# galaxy encodes '#' as '__pd__'; undo that on every line
		nl = nl.replace("__pd__", '#')
		newlines.append(nl)

	with open(cfname, 'w') as cfile:
		cfile.writelines(newlines)
	return 0
+##############################################################################
+
+if __name__ == "__main__":
+	main()