view iterative_pca.py @ 0:64e75e21466e draft default tip

Uploaded
author pmac
date Wed, 01 Jun 2016 03:38:39 -0400
parents
children
line wrap: on
line source

import argparse
import csv
import datetime
import os
import shutil
import subprocess
import sys

import jinja2

import pedify

# Jinja2 environment used to render the HTML report; templates are looked up
# in the directory containing this script, with autoescaping enabled.
JINJA_ENVIRONMENT = jinja2.Environment(
    loader=jinja2.FileSystemLoader(os.path.dirname(__file__)),
    extensions=['jinja2.ext.autoescape'],
    autoescape=True)
# Accepted values for the positional data_type argument.
DATA_TYPES = ["variant_data", "rds", "rdata", "numeric_ped"]
# Clustering algorithms accepted by --clustering_method (run by the R script).
CLUSTERING_METHODS = ["dbscan", "hclust", "kmeans"]
# Outlier-cluster trimming strategies accepted by --cluster_trimming.
CLUSTER_TRIMMING = ["sd", "mcd", "dbscan_outliers_only"]
# Template file name rendered by produce_report().
HTML_TEMPLATE_NAME = "pca_report.html"
# Default standard-deviation multiplier used when --sd_cutoff is not supplied.
DEFAULT_SD_CUTOFF = 2
# Directory of shared R helper functions shipped alongside this script.
PATH_TO_R_FUNCTIONS = "{}/R_functions".format(os.path.dirname(os.path.abspath(__file__)))


def main():
	###===ARGUMENT PARSING===###
	parser = argparse.ArgumentParser()
	parser.add_argument("datafile_name", help="name of data file")
	parser.add_argument("data_type", type=str.lower, choices=DATA_TYPES, help="type of data file")
	parser.add_argument("iterations", help="max number of iterations to run",
						type=int)
	parser.add_argument("--control_tag", help="substring present in the ids of all control samples, e.g. LP")
	parser.add_argument("--cases_tag", help="substring present in the ids of all case samples, e.g. HAPS")
	parser.add_argument("--config_file", help="name of file containing config info")
	parser.add_argument("--clustering_flag", help="Flag to indicate whether to do clustering", action="store_true")
	parser.add_argument("--clustering_method", type=str.lower, choices=CLUSTERING_METHODS,
						help="algorithm used to find clusters")
	parser.add_argument("--cluster_trimming", type=str.lower, choices=CLUSTER_TRIMMING, 
						help="method used to identify and trim outlier clusters")
	parser.add_argument("-s", "--sd_cutoff", help="number of standard deviations at which to cut outliers")
	parser.add_argument("-o", "--out", help="base name of output files and directories; defaults to datafile name")
	group = parser.add_mutually_exclusive_group()
	group.add_argument("-r", "--relative_prefix", help="RELATIVE path CONTAINING the output directory")
	group.add_argument("-a", "--absolute_prefix", help="ABSOLUTE path CONTAINING the output directory")
	parser.add_argument("--html_file", help="ABSOLUTE PATH of output html file")
	parser.add_argument("--ethnicity_file", help="File containing data about the ethnicity of the samples")
	parser.add_argument("--galaxy", help="Flag to indicate we are running this script in Galaxy", 
						action="store_true")
	parser.add_argument("--reject_snps", help="file containing SNP ids to ignore/remove")
	parser.add_argument("--reject_samples", help="file containing sample ids to ignore/remove")

	args = parser.parse_args()

	# enforce conditionally required args
	if args.data_type == "variant_data":
		if args.config_file == None:
			parser.error("Input variant data files require a complementary config file\
 specified with '--config_file <filename>' flag")
	
	if args.config_file:
		if args.data_type != "variant_data":
			print "Warning! Your config file will be ignored as the input data is not a\
 text file containing variant data"

	if args.clustering_flag:
		if (args.clustering_method == None) or (args.cluster_trimming == None):
			parser.error("If --clustering_flag is set,\
 --clustering_method and --cluster_trimming options must also be specified")

	if args.clustering_method or args.cluster_trimming:
		if args.clustering_flag == False:
			parser.error("--clustering_method and --cluster_trimming\
 options only valid if --clustering_flag is also set")

	# dbscan outliers only valid if cluster method == "DBSCAN"
	if args.cluster_trimming == "dbscan_outliers_only":
		if args.clustering_method != "dbscan":
			parser.error("dbscan_outliers_only cluster trimming method only\
 valid if --clustering_method == dbscan")

	dfname = args.datafile_name
	data_type = args.data_type
	n = args.iterations
	control_tag = args.control_tag
	cases_tag = args.cases_tag
	cfname = args.config_file
	cflag = args.clustering_flag
	cmethod = (args.clustering_method if args.clustering_method else "none")
	tmethod = (args.cluster_trimming if args.cluster_trimming else "none")
	sd_cutoff = (args.sd_cutoff if args.sd_cutoff else DEFAULT_SD_CUTOFF)
	basename = (args.out if args.out else dfname.split('/')[-1].split('.')[0])
	hfname = args.html_file
	galaxy_flag = args.galaxy
	ethfname = args.ethnicity_file
	xsnps_filename = args.reject_snps
	xsamples_filename = args.reject_samples

	# function used to handle strange galaxy behaviour...
	if galaxy_flag and cfname:
		rc = format_galaxy_config_file(cfname)
		if rc != 0:
			print "Bad config file"
			sys.exit()

	path_here = os.path.dirname(os.path.abspath(__file__))
	# determine output directory
	if args.relative_prefix:
		outdir = "{}/{}/full_output_{}".format(path_here, args.relative_prefix, basename)
	elif args.absolute_prefix:
		outdir = "{}/full_output_{}".format(args.absolute_prefix, basename)
	else:
		outdir = "{}/full_output_{}".format(path_here, basename)
	if hfname == None:
		hfname = "{}/{}.html".format(outdir, basename)
	
	####===BEGIN PIPELINE===####
	# create a log file
	lfname = "{}.log".format(basename)
	logfile = open(lfname, 'w')
	logfile.write("##Iterative PCA commenced at {}##\n".format(datetime.datetime.now()))
	
	# create output directory
	prepare_directory(outdir, logfile)

	# do pre-processing if necessary, else go straight to R
	if data_type == "variant_data":
		stage1_info = stage_text_to_ped(dfname, cfname, xsamples_filename, 
			xsnps_filename, basename, outdir, logfile, True)
		data_source_name = "{}/{}".format(outdir, stage1_info['pfname'])
		data_type_out = "numeric_ped"
		xsamples_filename = "{}/{}".format(outdir, stage1_info['xsname'])
	elif data_type in ["rds", "rdata", "numeric_ped"]:
		data_source_name = dfname
		data_type_out = data_type
	else:
		print "Unrecognised input data type, exiting..."
		sys.exit(0)

	# do iterative pca in R (much faster)
	rCommand = "Rscript {}/iterative_pca_plot.R '{}' '{}' '{}' '{}' \
												'{}' '{}' '{}' '{}' '{}'\
												'{}' '{}' '{}' '{}' '{}'".format(
		path_here, 
		n, outdir, basename, data_source_name, 
		data_type_out, ethfname, control_tag, cases_tag, sd_cutoff, 
		cmethod, tmethod, PATH_TO_R_FUNCTIONS, xsamples_filename, xsnps_filename)

	subprocess.call(rCommand, shell=True)
	# generate dictionary of output file names
	if cflag:
		num_plots = 1
	else:
		num_plots = 2
	if ethfname:
		num_plots += 1
	if cases_tag or control_tag:
		num_plots += 1
	iteration_data = create_output_dict(outdir, basename, n, num_plots)

	# clean up logfile
	logfile.write("##Iterative PCA completed at {}##\n".format(datetime.datetime.now()))
	logfile.close()
	subprocess.call("mv '{}' '{}'".format(lfname, outdir), shell=True)
	lfname = "{}/{}".format(outdir, lfname)
	if cfname:
		# copy and rename config file into outdir (if provided)
		subprocess.call("cp '{}' '{}/{}.config'".format(cfname, outdir, basename), shell=True)
	# output results summary in a nice html format
	html_output = produce_report(hfname, dfname, lfname, cflag, 
		cmethod, tmethod, sd_cutoff, iteration_data, galaxy_flag)
##############################################################################

""" 
Function to convert a text file containing variant data into a numeric or genomic ped file
Inputs:
- dfname: The name of the text file to convert
- cfname: Configuration file containing info on which columns to find required information in,
			as well as filtering options
- xsamples_filename: A file containing a list of exact sample ids to exclude
- xsnps_filename: A file containing a list of exact snp ids to exclude
- basename: Filename without the extension
- outdir: Full path of output directory
- logfile: A file to keep track of program status. Any shell commands executed will be written to this file
- numeric_flag: If true, ped file will be numeric.
				If false, ped file will be genomic
Outputs:
- ped file, either numeric or genomic
- map file
- excluded samples file, containing the ids of all samples which did not pass the filters and were rejected
Returns:
	a dict containing the PATHS to important output files 
"""
def stage_text_to_ped(dfname, cfname, xsamples_filename, xsnps_filename, basename, outdir, logfile, numeric_flag):
	# Create ped and map files 
	logfile.write("##STAGE 1: Create .ped and .map files from input text file##\n")

	pfname = "{}/{}.ped".format(outdir, basename)
	mfname = "{}/{}.map".format(outdir, basename)
	xsname = "{}/{}_xsamples.txt".format(outdir, basename)

	pc = pedify.PedConverter()
	# read in config file
	rc = pc.read_config_file(cfname)
	# Add in string filters for excluded samples and snps
	if xsamples_filename:
		xsamples = open(xsamples_filename, 'r').read().split('\n')
		pc.cfg.add_string_filter("comp_generated_sample_cuts", pc.cfg.col_names['sample_id_column'], 
			"exact", "reject", xsamples)
	if xsnps_filename:
		xsnps = open(xsnps_filename, 'r').read().split('\n')
		pc.cfg.add_string_filter("comp_generated_snp_cuts", pc.cfg.col_names['variant_id_column'], 
			"exact", "reject", xsnps)
	if rc == 0:
		print 'config file read successfully'
	else:
		print 'failed to read in config file successfully. Error code: {}'.format(rc)
	
	# read in data file
	rc = pc.read_data_file(dfname)
	if rc == 0:
		print 'data file read successfully'
	else:
		print 'failed to read in data file successfully. Error code: {}'.format(rc)	
	
	# create output
	pc.create_ped_file(pfname, numeric=numeric_flag)
	pc.create_map_file(mfname)
	pc.create_excluded_samples_file(xsname)
	outdict = {
		"pfname": "{}.ped".format(basename),
		"mfname": "{}.map".format(basename),
		"xsname": "{}_xsamples.txt".format(basename),
		"control_tag": pc.cfg.control_info['control_tag']['tag'],
		"cases_tag": pc.cfg.control_info['cases_tag']['tag']
	}

	return outdict

"""
Function to produce the html report of all the data
Inputs:
hfname: The ABSOLUTE PATH (including the filename) of the html file to output
dfname: path to datafile
lfname: path to logfile
sd_cutoff: multiplier used to determine outliers
iterationList: a list of dicts, each containing AT LEAST:
	- outdir: ABSOLUTE PATH to parent directory containing all the output from the iteration
	- count: which iteration this was
	- plotpaths: A list of paths to output plots
	- xsname: name of file containing ids of samples excluded from this iteration
	- ofname: name of file containing ids of outliers detected in this iteration
returns:
	The html output that was written to the html file
"""
def produce_report(hfname, dfname, lfname, cflag, cmethod, tmethod, sd_cutoff, iteration_list, galaxy_flag):
	"""Render the HTML report summarising every PCA iteration.

	For each iteration dict (see create_output_dict) the outliers and
	excluded-samples files are read and plot paths are rewritten relative
	to the report location, then everything is passed to the Jinja2
	template.  Writes the rendered page to hfname and returns it.
	"""
	hfdir = os.path.dirname(hfname)
	iteration_data = []
	for it in iteration_list:
		outdir = it['outdir']
		if galaxy_flag:
			# Galaxy serves output from a fixed location: keep only the last
			# two path components rather than a filesystem-relative path
			dirs = outdir.split("/")
			relpath = "{}/{}".format(dirs[-2], dirs[-1])
		else:
			relpath = os.path.relpath(outdir, hfdir)

		ofname = "{}/{}".format(outdir, it['ofname'])
		xsname = "{}/{}".format(outdir, it['xsname'])
		# BUGFIX: use 'with' blocks so every handle is closed even on error
		# (the original leaked the logfile handle below and left these open
		# if a read raised)
		with open(ofname, 'r') as of:
			ol = [line.split(',') for line in of]
		with open(xsname, 'r') as xf:
			xl = [line.split(',') for line in xf]

		# fix the paths to the images so they resolve from the report's directory
		rel_plot_paths = ["{}/{}".format(relpath, pp) for pp in it['plot_paths']]

		iteration_data.append({
			'count': it['count'],
			'ol': ol,
			'xl': xl,
			'rel_plot_paths': rel_plot_paths,
		})

	with open(lfname, 'r') as lf:
		logcontent = lf.read()
	template_values = {
		"date": datetime.datetime.now(),
		"dfname": dfname,
		"lfname": lfname,
		"logcontent": logcontent,
		"iteration_data": iteration_data,
		"cflag": cflag,
		"cmethod": cmethod,
		"tmethod": tmethod,
		"sd_cutoff": sd_cutoff,
	}
	template = JINJA_ENVIRONMENT.get_template(HTML_TEMPLATE_NAME)
	rendered_template = template.render(template_values)
	# BUGFIX: open the output file only after rendering succeeds, so a template
	# error no longer truncates an existing report
	with open(hfname, 'w') as htmldoc:
		htmldoc.write(rendered_template)
	return rendered_template

""" 
Generate the expected directory structure of the root output directory,
given the basename, number of iterations, and number of expected plots output by
the R script
"""
def create_output_dict(outdir, basename, n, num_plots):
	"""Build the expected layout of the root output directory.

	Returns a list of n iteration dicts, each holding the iteration's
	output directory, the outlier/excluded-sample file names, the
	iteration number, and the expected plot file names (1..num_plots).
	"""
	iteration_list = []
	for count in range(n):
		plot_names = ["{}_plot_number_{}.png".format(basename, p)
					for p in range(1, num_plots + 1)]
		iteration_list.append({
			'outdir': "{}/output_{}_iteration_{}".format(outdir, basename, count),
			'ofname': "{}_outliers.txt".format(basename),
			'xsname': "{}_xfile.txt".format(basename),
			'count': count,
			'plot_paths': plot_names,
		})
	return iteration_list


###########################UTILITY########################################
def prepare_directory(outdir, logfile):
	"""Recreate outdir as an empty directory, logging each action.

	BUGFIX: replaces shell-interpolated 'rm -rf' / 'mkdir -p' calls with
	shutil/os equivalents, so a path containing quotes or other shell
	metacharacters cannot break (or inject into) a shell command.
	"""
	# set up output directory
	logfile.write("##Setting up output directory:{}##\n".format(outdir))
	# remove any previous run's output (ignore_errors covers "did not exist")
	shutil.rmtree(outdir, ignore_errors=True)
	logfile.write("removed directory tree: {}\n".format(outdir))
	# recreate the directory, including any missing parents
	os.makedirs(outdir)
	logfile.write("created directory: {}\n".format(outdir))

# takes a list of dicts, merges them into a single big dict
def merge_dicts(dictList):
	result = {}
	for d in dictList:
		if type(d) == dict:
			result.update(d)
	return result

# Some odd rules regarding character escaping and such in galaxy: fix those here
def format_galaxy_config_file(cfname):
	CHAR_CONVERSION = {
		'g': '>',
		'l': '<',
		'e': '==',
		'ge': '<=',
		'le': '>='
	}
	cfile = open(cfname, 'r')
	lines = cfile.readlines()
	cfile.close()

	newlines = []
	section = None
	for l in lines:
		nl = l
		if l[0] == '#':
			if l.strip() == "#numeric_filters":
				section = "numeric_filters"
			else:
				section = None
		else:
			if section:
				tokens = l.split(',')
				op = tokens[2]
				if op in CHAR_CONVERSION.keys():
					newop = CHAR_CONVERSION[op]
				elif op in CHAR_CONVERSION.values():
					newop = op
				else:
					# error code 1; bad op codes
					return 1
				tokens[2] = newop
				nl = ','.join(tokens)


		nl = nl.replace("__pd__", '#')
		newlines.append(nl)

	cfile = open(cfname, 'w')
	cfile.writelines(newlines)
	cfile.close()
	return 0
##############################################################################

# Script entry point: run the pipeline only when executed directly,
# not when imported as a module.
if __name__ == "__main__":
	main()