Mercurial > repos > jason-ellul > iterativepca
comparison iterative_pca.py @ 0:cb54350e76ae draft default tip
Uploaded
| author | jason-ellul |
|---|---|
| date | Wed, 01 Jun 2016 03:24:56 -0400 |
| parents | |
| children |
comparison
equal
deleted
inserted
replaced
| -1:000000000000 | 0:cb54350e76ae |
|---|---|
| 1 import os | |
| 2 import subprocess | |
| 3 import sys | |
| 4 import argparse | |
| 5 import datetime | |
| 6 import csv | |
| 7 | |
| 8 import jinja2 | |
| 9 | |
| 10 import pedify | |
| 11 | |
# Jinja2 environment rooted at this script's directory so that
# pca_report.html (HTML_TEMPLATE_NAME) resolves relative to the script.
JINJA_ENVIRONMENT = jinja2.Environment(
    loader=jinja2.FileSystemLoader(os.path.dirname(__file__)),
    extensions=['jinja2.ext.autoescape'],
    autoescape=True)

# Accepted values for the positional data_type / optional clustering args;
# argparse lower-cases user input before matching against these.
DATA_TYPES = ["variant_data", "rds", "rdata", "numeric_ped"]
CLUSTERING_METHODS = ["dbscan", "hclust", "kmeans"]
CLUSTER_TRIMMING = ["sd", "mcd", "dbscan_outliers_only"]

# Report template filename, default outlier cutoff (in standard deviations),
# and the location of the companion R function library.
HTML_TEMPLATE_NAME = "pca_report.html"
DEFAULT_SD_CUTOFF = 2
PATH_TO_R_FUNCTIONS = "{}/R_functions".format(os.path.dirname(os.path.abspath(__file__)))
| 22 | |
| 23 | |
def main():
    """Run the iterative PCA pipeline end to end.

    Parses command-line arguments, optionally converts a variant text file
    to a numeric ped file via pedify, invokes the R plotting script for the
    requested number of iterations, and renders an HTML report of the
    results.  Exits non-zero on a bad Galaxy config file or an unrecognised
    data type.
    """
    ###===ARGUMENT PARSING===###
    parser = argparse.ArgumentParser()
    parser.add_argument("datafile_name", help="name of data file")
    parser.add_argument("data_type", type=str.lower, choices=DATA_TYPES, help="type of data file")
    parser.add_argument("iterations", help="max number of iterations to run",
                        type=int)
    parser.add_argument("--control_tag", help="substring present in the ids of all control samples, e.g. LP")
    parser.add_argument("--cases_tag", help="substring present in the ids of all case samples, e.g. HAPS")
    parser.add_argument("--config_file", help="name of file containing config info")
    parser.add_argument("--clustering_flag", help="Flag to indicate whether to do clustering", action="store_true")
    parser.add_argument("--clustering_method", type=str.lower, choices=CLUSTERING_METHODS,
                        help="algorithm used to find clusters")
    parser.add_argument("--cluster_trimming", type=str.lower, choices=CLUSTER_TRIMMING,
                        help="method used to identify and trim outlier clusters")
    parser.add_argument("-s", "--sd_cutoff", help="number of standard deviations at which to cut outliers")
    parser.add_argument("-o", "--out", help="base name of output files and directories; defaults to datafile name")
    group = parser.add_mutually_exclusive_group()
    group.add_argument("-r", "--relative_prefix", help="RELATIVE path CONTAINING the output directory")
    group.add_argument("-a", "--absolute_prefix", help="ABSOLUTE path CONTAINING the output directory")
    parser.add_argument("--html_file", help="ABSOLUTE PATH of output html file")
    parser.add_argument("--ethnicity_file", help="File containing data about the ethnicity of the samples")
    parser.add_argument("--galaxy", help="Flag to indicate we are running this script in Galaxy",
                        action="store_true")
    parser.add_argument("--reject_snps", help="file containing SNP ids to ignore/remove")
    parser.add_argument("--reject_samples", help="file containing sample ids to ignore/remove")

    args = parser.parse_args()

    # Enforce conditionally required args.
    if args.data_type == "variant_data" and args.config_file is None:
        parser.error("Input variant data files require a complementary config file "
                     "specified with '--config_file <filename>' flag")

    if args.config_file and args.data_type != "variant_data":
        print("Warning! Your config file will be ignored as the input data is not a "
              "text file containing variant data")

    if args.clustering_flag:
        if args.clustering_method is None or args.cluster_trimming is None:
            parser.error("If --clustering_flag is set, "
                         "--clustering_method and --cluster_trimming options must also be specified")

    if (args.clustering_method or args.cluster_trimming) and not args.clustering_flag:
        parser.error("--clustering_method and --cluster_trimming "
                     "options only valid if --clustering_flag is also set")

    # dbscan outliers only valid if cluster method == "dbscan".
    if args.cluster_trimming == "dbscan_outliers_only" and args.clustering_method != "dbscan":
        parser.error("dbscan_outliers_only cluster trimming method only "
                     "valid if --clustering_method == dbscan")

    dfname = args.datafile_name
    data_type = args.data_type
    n = args.iterations
    control_tag = args.control_tag
    cases_tag = args.cases_tag
    cfname = args.config_file
    cflag = args.clustering_flag
    cmethod = args.clustering_method if args.clustering_method else "none"
    tmethod = args.cluster_trimming if args.cluster_trimming else "none"
    sd_cutoff = args.sd_cutoff if args.sd_cutoff else DEFAULT_SD_CUTOFF
    basename = args.out if args.out else dfname.split('/')[-1].split('.')[0]
    hfname = args.html_file
    galaxy_flag = args.galaxy
    ethfname = args.ethnicity_file
    xsnps_filename = args.reject_snps
    xsamples_filename = args.reject_samples

    # Galaxy escapes characters in uploaded config files; undo that here.
    if galaxy_flag and cfname:
        rc = format_galaxy_config_file(cfname)
        if rc != 0:
            print("Bad config file")
            sys.exit(1)  # was sys.exit(): exited with success code on error

    path_here = os.path.dirname(os.path.abspath(__file__))
    # Determine output directory.
    if args.relative_prefix:
        outdir = "{}/{}/full_output_{}".format(path_here, args.relative_prefix, basename)
    elif args.absolute_prefix:
        outdir = "{}/full_output_{}".format(args.absolute_prefix, basename)
    else:
        outdir = "{}/full_output_{}".format(path_here, basename)
    if hfname is None:
        hfname = "{}/{}.html".format(outdir, basename)

    ####===BEGIN PIPELINE===####
    # Create a log file; it is moved into outdir once the run completes.
    lfname = "{}.log".format(basename)
    logfile = open(lfname, 'w')
    logfile.write("##Iterative PCA commenced at {}##\n".format(datetime.datetime.now()))

    # Create output directory.
    prepare_directory(outdir, logfile)

    # Do pre-processing if necessary, else go straight to R.
    if data_type == "variant_data":
        stage1_info = stage_text_to_ped(dfname, cfname, xsamples_filename,
                                        xsnps_filename, basename, outdir, logfile, True)
        data_source_name = "{}/{}".format(outdir, stage1_info['pfname'])
        data_type_out = "numeric_ped"
        xsamples_filename = "{}/{}".format(outdir, stage1_info['xsname'])
    elif data_type in ["rds", "rdata", "numeric_ped"]:
        data_source_name = dfname
        data_type_out = data_type
    else:
        print("Unrecognised input data type, exiting...")
        sys.exit(1)  # was sys.exit(0): exited with success code on error

    # Do iterative pca in R (much faster).  Arguments are single-quoted for
    # the shell; the R script receives them positionally.
    r_args = [n, outdir, basename, data_source_name,
              data_type_out, ethfname, control_tag, cases_tag, sd_cutoff,
              cmethod, tmethod, PATH_TO_R_FUNCTIONS, xsamples_filename, xsnps_filename]
    rCommand = "Rscript {}/iterative_pca_plot.R ".format(path_here) + \
               " ".join("'{}'".format(a) for a in r_args)
    subprocess.call(rCommand, shell=True)

    # Generate dictionary of output file names.  The R script emits a
    # different number of plots depending on clustering/ethnicity/tag options.
    if cflag:
        num_plots = 1
    else:
        num_plots = 2
        if ethfname:
            num_plots += 1
        if cases_tag or control_tag:
            num_plots += 1
    iteration_data = create_output_dict(outdir, basename, n, num_plots)

    # Clean up logfile and move it into the output directory.
    logfile.write("##Iterative PCA completed at {}##\n".format(datetime.datetime.now()))
    logfile.close()
    subprocess.call(["mv", lfname, outdir])  # argv list: no shell interpolation
    lfname = "{}/{}".format(outdir, lfname)
    if cfname:
        # Copy and rename config file into outdir (if provided).
        subprocess.call(["cp", cfname, "{}/{}.config".format(outdir, basename)])
    # Output results summary in a nice html format.
    produce_report(hfname, dfname, lfname, cflag,
                   cmethod, tmethod, sd_cutoff, iteration_data, galaxy_flag)
| 170 ############################################################################## | |
| 171 | |
| 172 """ | |
| 173 Function to convert a text file containing variant data into a numeric or genomic ped file | |
| 174 Inputs: | |
| 175 - dfname: The name of the text file to convert | |
| 176 - cfname: Configuration file containing info on which columns to find required information in, | |
| 177 as well as filtering options | |
| 178 - xsamples_filename: A file containg a list of exact sample ids to exclude | |
| 179 - xsnps_filename: A file containing a list of exact snp ids to exclude | |
| 180 - basename: Filename without the extension | |
| 181 - outdir: Full path of output directory | |
| 182 - logfile: A file to keep track of program status. Any shell commands executed will be written to this file | |
| 183 - numeric_flag: If true, ped file will be numeric. | |
| 184 If false, ped file will be genomic | |
| 185 Outputs: | |
| 186 - ped file, either numeric or genomic | |
| 187 - map file | |
| 188 - excluded samples file, containing the ids of all samples which did not pass the filters and were rejected | |
| 189 Returns: | |
| 190 a dict containing the PATHS to important output files | |
| 191 """ | |
| 192 def stage_text_to_ped(dfname, cfname, xsamples_filename, xsnps_filename, basename, outdir, logfile, numeric_flag): | |
| 193 # Create ped and map files | |
| 194 logfile.write("##STAGE 1: Create .ped and .map files from input text file##\n") | |
| 195 | |
| 196 pfname = "{}/{}.ped".format(outdir, basename) | |
| 197 mfname = "{}/{}.map".format(outdir, basename) | |
| 198 xsname = "{}/{}_xsamples.txt".format(outdir, basename) | |
| 199 | |
| 200 pc = pedify.PedConverter() | |
| 201 # read in config file | |
| 202 rc = pc.read_config_file(cfname) | |
| 203 # Add in string filters for excluded samples and snps | |
| 204 if xsamples_filename: | |
| 205 xsamples = open(xsamples_filename, 'r').read().split('\n') | |
| 206 pc.cfg.add_string_filter("comp_generated_sample_cuts", pc.cfg.col_names['sample_id_column'], | |
| 207 "exact", "reject", xsamples) | |
| 208 if xsnps_filename: | |
| 209 xsnps = open(xsnps_filename, 'r').read().split('\n') | |
| 210 pc.cfg.add_string_filter("comp_generated_snp_cuts", pc.cfg.col_names['variant_id_column'], | |
| 211 "exact", "reject", xsnps) | |
| 212 if rc == 0: | |
| 213 print 'config file read successfully' | |
| 214 else: | |
| 215 print 'failed to read in config file successfully. Error code: {}'.format(rc) | |
| 216 | |
| 217 # read in data file | |
| 218 rc = pc.read_data_file(dfname) | |
| 219 if rc == 0: | |
| 220 print 'data file read successfully' | |
| 221 else: | |
| 222 print 'failed to read in data file successfully. Error code: {}'.format(rc) | |
| 223 | |
| 224 # create output | |
| 225 pc.create_ped_file(pfname, numeric=numeric_flag) | |
| 226 pc.create_map_file(mfname) | |
| 227 pc.create_excluded_samples_file(xsname) | |
| 228 outdict = { | |
| 229 "pfname": "{}.ped".format(basename), | |
| 230 "mfname": "{}.map".format(basename), | |
| 231 "xsname": "{}_xsamples.txt".format(basename), | |
| 232 "control_tag": pc.cfg.control_info['control_tag']['tag'], | |
| 233 "cases_tag": pc.cfg.control_info['cases_tag']['tag'] | |
| 234 } | |
| 235 | |
| 236 return outdict | |
| 237 | |
| 238 """ | |
| 239 Function to produce the html report of all the data | |
| 240 Inputs: | |
| 241 hfname: The ABSOLUTE PATH (including the filename) of the html file to output | |
| 242 dfname: path to datafile | |
| 243 lfname: path to logfile | |
| 244 sd_cutoff: multipler used to determine outliers | |
| 245 iterationList: a list of dicts, each containing AT LEAST: | |
| 246 - outdir: ABSOLUTE PATH to parent directory containing all the output from the iteration | |
| 247 - count: which iteration this was | |
| 248 - plotpaths: A list of paths to output plots | |
| 249 - xsname: name of file containing ids of samples excluded from this iteration | |
| 250 - ofname: name of file containing ids of outliers detected in this iteration | |
| 251 returns: | |
| 252 The html output that was written to the html file | |
| 253 """ | |
| 254 def produce_report(hfname, dfname, lfname, cflag, cmethod, tmethod, sd_cutoff, iteration_list, galaxy_flag): | |
| 255 hfdir = os.path.dirname(hfname) | |
| 256 htmldoc = open(hfname, 'w') | |
| 257 iteration_data = [] | |
| 258 for it in iteration_list: | |
| 259 outdir = it['outdir'] | |
| 260 if galaxy_flag: | |
| 261 dirs = outdir.split("/") | |
| 262 relpath = "{}/{}".format(dirs[-2], dirs[-1]) | |
| 263 else: | |
| 264 relpath = os.path.relpath(outdir, hfdir) | |
| 265 it_dict = {} | |
| 266 | |
| 267 ofname = "{}/{}".format(outdir, it['ofname']) | |
| 268 xsname = "{}/{}".format(outdir, it['xsname']) | |
| 269 of = open(ofname, 'r') | |
| 270 xf = open(xsname, 'r') | |
| 271 ol = [] | |
| 272 xl = [] | |
| 273 for line in of: | |
| 274 ol.append(line.split(',')) | |
| 275 for line in xf: | |
| 276 xl.append(line.split(',')) | |
| 277 of.close() | |
| 278 xf.close() | |
| 279 # fix the paths to the images | |
| 280 rel_plot_paths = [] | |
| 281 for pp in it['plot_paths']: | |
| 282 rel_plot_paths.append("{}/{}".format(relpath, pp)) | |
| 283 | |
| 284 it_dict['count'] = it['count'] | |
| 285 it_dict['ol'] = ol | |
| 286 it_dict['xl'] = xl | |
| 287 it_dict['rel_plot_paths']= rel_plot_paths | |
| 288 iteration_data.append(it_dict) | |
| 289 | |
| 290 logcontent = open(lfname, 'r').read() | |
| 291 template_values = { | |
| 292 "date": datetime.datetime.now(), | |
| 293 "dfname": dfname, | |
| 294 "lfname": lfname, | |
| 295 "logcontent": logcontent, | |
| 296 "iteration_data": iteration_data, | |
| 297 "cflag": cflag, | |
| 298 "cmethod": cmethod, | |
| 299 "tmethod": tmethod, | |
| 300 "sd_cutoff": sd_cutoff, | |
| 301 } | |
| 302 template = JINJA_ENVIRONMENT.get_template(HTML_TEMPLATE_NAME) | |
| 303 rendered_template = template.render(template_values) | |
| 304 htmldoc.write(rendered_template) | |
| 305 htmldoc.close() | |
| 306 return rendered_template | |
| 307 | |
| 308 """ | |
| 309 Generate the expected directory structure of the root output directory, | |
| 310 given the basname, number of iterations, and number of expected plots output by | |
| 311 the R script | |
| 312 """ | |
| 313 def create_output_dict(outdir, basename, n, num_plots): | |
| 314 iteration_list = [] | |
| 315 for i in range(n): | |
| 316 it_dict = {} | |
| 317 i_outdir = "{}/output_{}_iteration_{}".format(outdir, basename, i) | |
| 318 it_dict['outdir'] = i_outdir | |
| 319 it_dict['ofname'] = "{}_outliers.txt".format(basename) | |
| 320 it_dict['xsname'] = "{}_xfile.txt".format(basename) | |
| 321 it_dict['count'] = i | |
| 322 it_dict['plot_paths'] = [] | |
| 323 for j in range(num_plots): | |
| 324 it_dict['plot_paths'].append("{}_plot_number_{}.png".format(basename, j+1)) | |
| 325 iteration_list.append(it_dict) | |
| 326 return iteration_list | |
| 327 | |
| 328 | |
| 329 ###########################UTILITY######################################## | |
def prepare_directory(outdir, logfile):
    """Wipe and recreate outdir, logging each shell-equivalent command.

    Uses argv-list subprocess calls rather than shell-interpolated strings
    so a path containing quotes or shell metacharacters cannot break out of
    the command (the original built `rm -rf '...'` strings with shell=True).
    The logged text is unchanged.
    """
    logfile.write("##Setting up output directory:{}##\n".format(outdir))
    subprocess.call(["rm", "-rf", outdir])
    logfile.write("rm -rf '{}'\n".format(outdir))
    subprocess.call(["mkdir", "-p", outdir])
    logfile.write("mkdir -p '{}'\n".format(outdir))
| 337 | |
def merge_dicts(dictList):
    """Merge a list of dicts into a single dict.

    Later entries overwrite earlier keys; non-dict entries are skipped.
    Uses isinstance rather than `type(d) == dict` so dict subclasses
    (e.g. OrderedDict) are merged too — a backward-compatible widening.
    """
    result = {}
    for d in dictList:
        if isinstance(d, dict):
            result.update(d)
    return result
| 345 | |
# Some odd rules regarding character escaping and such in galaxy: fix those here.
def format_galaxy_config_file(cfname):
    """Rewrite cfname in place, decoding Galaxy's escaped operators.

    Within the '#numeric_filters' section, the third comma-separated token
    of each line is an operator code ('g', 'l', 'e', 'ge', 'le') which is
    expanded to the real comparison operator; tokens already holding a real
    operator are left alone.  '__pd__' is decoded to '#' on every line.

    Returns 0 on success, 1 if an unknown operator code is found (in which
    case the file is left unmodified).
    """
    CHAR_CONVERSION = {
        'g': '>',
        'l': '<',
        'e': '==',
        # Bug fix: these two were swapped ('ge': '<=', 'le': '>='),
        # inverting every >=/<= numeric filter parsed from Galaxy.
        'ge': '>=',
        'le': '<=',
    }
    with open(cfname, 'r') as cfile:
        lines = cfile.readlines()

    newlines = []
    section = None
    for line in lines:
        nl = line
        if line.startswith('#'):
            # Track whether we are inside the numeric_filters section.
            section = "numeric_filters" if line.strip() == "#numeric_filters" else None
        elif section:
            tokens = line.split(',')
            op = tokens[2]
            if op in CHAR_CONVERSION:
                tokens[2] = CHAR_CONVERSION[op]
            elif op not in CHAR_CONVERSION.values():
                # Error code 1; bad op codes.
                return 1
            nl = ','.join(tokens)

        # Galaxy encodes '#' as '__pd__' in uploaded files.
        nl = nl.replace("__pd__", '#')
        newlines.append(nl)

    with open(cfname, 'w') as cfile:
        cfile.writelines(newlines)
    return 0
| 390 ############################################################################## | |
| 391 | |
| 392 if __name__ == "__main__": | |
| 393 main() |
