Mercurial > repos > pmac > iterativepca
view iterative_pca.py @ 0:64e75e21466e draft default tip
Uploaded
author | pmac |
---|---|
date | Wed, 01 Jun 2016 03:38:39 -0400 |
parents | |
children |
line wrap: on
line source
import os import subprocess import sys import argparse import datetime import csv import jinja2 import pedify JINJA_ENVIRONMENT = jinja2.Environment( loader=jinja2.FileSystemLoader(os.path.dirname(__file__)), extensions=['jinja2.ext.autoescape'], autoescape=True) DATA_TYPES = ["variant_data", "rds", "rdata", "numeric_ped"] CLUSTERING_METHODS = ["dbscan", "hclust", "kmeans"] CLUSTER_TRIMMING = ["sd", "mcd", "dbscan_outliers_only"] HTML_TEMPLATE_NAME = "pca_report.html" DEFAULT_SD_CUTOFF = 2 PATH_TO_R_FUNCTIONS = "{}/R_functions".format(os.path.dirname(os.path.abspath(__file__))) def main(): ###===ARGUMENT PARSING===### parser = argparse.ArgumentParser() parser.add_argument("datafile_name", help="name of data file") parser.add_argument("data_type", type=str.lower, choices=DATA_TYPES, help="type of data file") parser.add_argument("iterations", help="max number of iterations to run", type=int) parser.add_argument("--control_tag", help="substring present in the ids of all control samples, e.g. LP") parser.add_argument("--cases_tag", help="substring present in the ids of all case samples, e.g. HAPS") parser.add_argument("--config_file", help="name of file containing config info") parser.add_argument("--clustering_flag", help="Flag to indicate whether to do clustering", action="store_true") parser.add_argument("--clustering_method", type=str.lower, choices=CLUSTERING_METHODS, help="algorithm used to find clusters") parser.add_argument("--cluster_trimming", type=str.lower, choices=CLUSTER_TRIMMING, help="method used to identify and trim outlier clusters") parser.add_argument("-s", "--sd_cutoff", help="number of standard deviations at which to cut outliers") parser.add_argument("-o", "--out", help="base name of output files and directories; defaults to datafile name") group = parser.add_mutually_exclusive_group() group.add_argument("-r", "--relative_prefix", help="RELATIVE path CONTAINING the output directory") group.add_argument("-a", "--absolute_prefix", help="ABSOLUTE path CONTAINING the output directory") parser.add_argument("--html_file", help="ABSOLUTE PATH of output html file") parser.add_argument("--ethnicity_file", help="File containing data about the ethnicity of the samples") parser.add_argument("--galaxy", help="Flag to indicate we are running this script in Galaxy", action="store_true") parser.add_argument("--reject_snps", help="file containing SNP ids to ignore/remove") parser.add_argument("--reject_samples", help="file containing sample ids to ignore/remove") args = parser.parse_args() # enforce conditionally required args if args.data_type == "variant_data": if args.config_file == None: parser.error("Input variant data files require a complementary config file\ specified with '--config_file <filename>' flag") if args.config_file: if args.data_type != "variant_data": print "Warning! Your config file will be ignored as the input data is not a\ text file containing variant data" if args.clustering_flag: if (args.clustering_method == None) or (args.cluster_trimming == None): parser.error("If --clustering_flag is set,\ --clustering_method and --cluster_trimming options must also be specified") if args.clustering_method or args.cluster_trimming: if args.clustering_flag == False: parser.error("--clustering_method and --cluster_trimming\ options only valid if --clustering_flag is also set") # dbscan outliers only valid if cluster method == "DBSCAN" if args.cluster_trimming == "dbscan_outliers_only": if args.clustering_method != "dbscan": parser.error("dbscan_outliers_only cluster trimming method only\ valid if --clustering_method == dbscan") dfname = args.datafile_name data_type = args.data_type n = args.iterations control_tag = args.control_tag cases_tag = args.cases_tag cfname = args.config_file cflag = args.clustering_flag cmethod = (args.clustering_method if args.clustering_method else "none") tmethod = (args.cluster_trimming if args.cluster_trimming else "none") sd_cutoff = (args.sd_cutoff if args.sd_cutoff else DEFAULT_SD_CUTOFF) basename = (args.out if args.out else dfname.split('/')[-1].split('.')[0]) hfname = args.html_file galaxy_flag = args.galaxy ethfname = args.ethnicity_file xsnps_filename = args.reject_snps xsamples_filename = args.reject_samples # function used to handle strange galaxy behaviour... if galaxy_flag and cfname: rc = format_galaxy_config_file(cfname) if rc != 0: print "Bad config file" sys.exit() path_here = os.path.dirname(os.path.abspath(__file__)) # determine output directory if args.relative_prefix: outdir = "{}/{}/full_output_{}".format(path_here, args.relative_prefix, basename) elif args.absolute_prefix: outdir = "{}/full_output_{}".format(args.absolute_prefix, basename) else: outdir = "{}/full_output_{}".format(path_here, basename) if hfname == None: hfname = "{}/{}.html".format(outdir, basename) ####===BEGIN PIPELINE===#### # create a log file lfname = "{}.log".format(basename) logfile = open(lfname, 'w') logfile.write("##Iterative PCA commenced at {}##\n".format(datetime.datetime.now())) # create output directory prepare_directory(outdir, logfile) # do pre-processing if necessary, else go straight to R if data_type == "variant_data": stage1_info = stage_text_to_ped(dfname, cfname, xsamples_filename, xsnps_filename, basename, outdir, logfile, True) data_source_name = "{}/{}".format(outdir, stage1_info['pfname']) data_type_out = "numeric_ped" xsamples_filename = "{}/{}".format(outdir, stage1_info['xsname']) elif data_type in ["rds", "rdata", "numeric_ped"]: data_source_name = dfname data_type_out = data_type else: print "Unrecognised input data type, exiting..." sys.exit(0) # do iterative pca in R (much faster) rCommand = "Rscript {}/iterative_pca_plot.R '{}' '{}' '{}' '{}' \ '{}' '{}' '{}' '{}' '{}'\ '{}' '{}' '{}' '{}' '{}'".format( path_here, n, outdir, basename, data_source_name, data_type_out, ethfname, control_tag, cases_tag, sd_cutoff, cmethod, tmethod, PATH_TO_R_FUNCTIONS, xsamples_filename, xsnps_filename) subprocess.call(rCommand, shell=True) # generate dictionary of output file names if cflag: num_plots = 1 else: num_plots = 2 if ethfname: num_plots += 1 if cases_tag or control_tag: num_plots += 1 iteration_data = create_output_dict(outdir, basename, n, num_plots) # clean up logfile logfile.write("##Iterative PCA completed at {}##\n".format(datetime.datetime.now())) logfile.close() subprocess.call("mv '{}' '{}'".format(lfname, outdir), shell=True) lfname = "{}/{}".format(outdir, lfname) if cfname: # copy and rename config file into outdir (if provided) subprocess.call("cp '{}' '{}/{}.config'".format(cfname, outdir, basename), shell=True) # output results summary in a nice html format html_output = produce_report(hfname, dfname, lfname, cflag, cmethod, tmethod, sd_cutoff, iteration_data, galaxy_flag) ############################################################################## """ Function to convert a text file containing variant data into a numeric or genomic ped file Inputs: - dfname: The name of the text file to convert - cfname: Configuration file containing info on which columns to find required information in, as well as filtering options - xsamples_filename: A file containg a list of exact sample ids to exclude - xsnps_filename: A file containing a list of exact snp ids to exclude - basename: Filename without the extension - outdir: Full path of output directory - logfile: A file to keep track of program status. Any shell commands executed will be written to this file - numeric_flag: If true, ped file will be numeric. If false, ped file will be genomic Outputs: - ped file, either numeric or genomic - map file - excluded samples file, containing the ids of all samples which did not pass the filters and were rejected Returns: a dict containing the PATHS to important output files """ def stage_text_to_ped(dfname, cfname, xsamples_filename, xsnps_filename, basename, outdir, logfile, numeric_flag): # Create ped and map files logfile.write("##STAGE 1: Create .ped and .map files from input text file##\n") pfname = "{}/{}.ped".format(outdir, basename) mfname = "{}/{}.map".format(outdir, basename) xsname = "{}/{}_xsamples.txt".format(outdir, basename) pc = pedify.PedConverter() # read in config file rc = pc.read_config_file(cfname) # Add in string filters for excluded samples and snps if xsamples_filename: xsamples = open(xsamples_filename, 'r').read().split('\n') pc.cfg.add_string_filter("comp_generated_sample_cuts", pc.cfg.col_names['sample_id_column'], "exact", "reject", xsamples) if xsnps_filename: xsnps = open(xsnps_filename, 'r').read().split('\n') pc.cfg.add_string_filter("comp_generated_snp_cuts", pc.cfg.col_names['variant_id_column'], "exact", "reject", xsnps) if rc == 0: print 'config file read successfully' else: print 'failed to read in config file successfully. Error code: {}'.format(rc) # read in data file rc = pc.read_data_file(dfname) if rc == 0: print 'data file read successfully' else: print 'failed to read in data file successfully. Error code: {}'.format(rc) # create output pc.create_ped_file(pfname, numeric=numeric_flag) pc.create_map_file(mfname) pc.create_excluded_samples_file(xsname) outdict = { "pfname": "{}.ped".format(basename), "mfname": "{}.map".format(basename), "xsname": "{}_xsamples.txt".format(basename), "control_tag": pc.cfg.control_info['control_tag']['tag'], "cases_tag": pc.cfg.control_info['cases_tag']['tag'] } return outdict """ Function to produce the html report of all the data Inputs: hfname: The ABSOLUTE PATH (including the filename) of the html file to output dfname: path to datafile lfname: path to logfile sd_cutoff: multipler used to determine outliers iterationList: a list of dicts, each containing AT LEAST: - outdir: ABSOLUTE PATH to parent directory containing all the output from the iteration - count: which iteration this was - plotpaths: A list of paths to output plots - xsname: name of file containing ids of samples excluded from this iteration - ofname: name of file containing ids of outliers detected in this iteration returns: The html output that was written to the html file """ def produce_report(hfname, dfname, lfname, cflag, cmethod, tmethod, sd_cutoff, iteration_list, galaxy_flag): hfdir = os.path.dirname(hfname) htmldoc = open(hfname, 'w') iteration_data = [] for it in iteration_list: outdir = it['outdir'] if galaxy_flag: dirs = outdir.split("/") relpath = "{}/{}".format(dirs[-2], dirs[-1]) else: relpath = os.path.relpath(outdir, hfdir) it_dict = {} ofname = "{}/{}".format(outdir, it['ofname']) xsname = "{}/{}".format(outdir, it['xsname']) of = open(ofname, 'r') xf = open(xsname, 'r') ol = [] xl = [] for line in of: ol.append(line.split(',')) for line in xf: xl.append(line.split(',')) of.close() xf.close() # fix the paths to the images rel_plot_paths = [] for pp in it['plot_paths']: rel_plot_paths.append("{}/{}".format(relpath, pp)) it_dict['count'] = it['count'] it_dict['ol'] = ol it_dict['xl'] = xl it_dict['rel_plot_paths']= rel_plot_paths iteration_data.append(it_dict) logcontent = open(lfname, 'r').read() template_values = { "date": datetime.datetime.now(), "dfname": dfname, "lfname": lfname, "logcontent": logcontent, "iteration_data": iteration_data, "cflag": cflag, "cmethod": cmethod, "tmethod": tmethod, "sd_cutoff": sd_cutoff, } template = JINJA_ENVIRONMENT.get_template(HTML_TEMPLATE_NAME) rendered_template = template.render(template_values) htmldoc.write(rendered_template) htmldoc.close() return rendered_template """ Generate the expected directory structure of the root output directory, given the basname, number of iterations, and number of expected plots output by the R script """ def create_output_dict(outdir, basename, n, num_plots): iteration_list = [] for i in range(n): it_dict = {} i_outdir = "{}/output_{}_iteration_{}".format(outdir, basename, i) it_dict['outdir'] = i_outdir it_dict['ofname'] = "{}_outliers.txt".format(basename) it_dict['xsname'] = "{}_xfile.txt".format(basename) it_dict['count'] = i it_dict['plot_paths'] = [] for j in range(num_plots): it_dict['plot_paths'].append("{}_plot_number_{}.png".format(basename, j+1)) iteration_list.append(it_dict) return iteration_list ###########################UTILITY######################################## def prepare_directory(outdir, logfile): # set up output directory logfile.write("##Setting up output directory:{}##\n".format(outdir)) subprocess.call("rm -rf '{}'".format(outdir), shell=True) logfile.write("rm -rf '{}'\n".format(outdir)) subprocess.call("mkdir -p '{}'".format(outdir), shell=True) logfile.write("mkdir -p '{}'\n".format(outdir)) # takes a list of dicts, merges them into a single big dict def merge_dicts(dictList): result = {} for d in dictList: if type(d) == dict: result.update(d) return result # Some odd rules regarding character escaping and such in galaxy: fix those here def format_galaxy_config_file(cfname): CHAR_CONVERSION = { 'g': '>', 'l': '<', 'e': '==', 'ge': '<=', 'le': '>=' } cfile = open(cfname, 'r') lines = cfile.readlines() cfile.close() newlines = [] section = None for l in lines: nl = l if l[0] == '#': if l.strip() == "#numeric_filters": section = "numeric_filters" else: section = None else: if section: tokens = l.split(',') op = tokens[2] if op in CHAR_CONVERSION.keys(): newop = CHAR_CONVERSION[op] elif op in CHAR_CONVERSION.values(): newop = op else: # error code 1; bad op codes return 1 tokens[2] = newop nl = ','.join(tokens) nl = nl.replace("__pd__", '#') newlines.append(nl) cfile = open(cfname, 'w') cfile.writelines(newlines) cfile.close() return 0 ############################################################################## if __name__ == "__main__": main()