# HG changeset patch
# User pmac
# Date 1464766719 14400
# Node ID 64e75e21466e030809123180fdb58d6ae65e9fdc
Uploaded
diff -r 000000000000 -r 64e75e21466e README.rst
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/README.rst Wed Jun 01 03:38:39 2016 -0400
@@ -0,0 +1,331 @@
+.. class:: warningmark
+
+'''WARNING''' This tool requires the 'dbscan' (https://cran.r-project.org/web/packages/dbscan/index.html) and 'flashpcaR' (https://github.com/gabraham/flashpca/releases) R packages to be installed on the Galaxy instance.
+
+======================================
+Principal Component Analysis Pipeline
+======================================
+
+ :Author: Adrian Cheung
+ :Contact: adrian.che0222@gmail.com
+ :Date: 15-01-2015
+
+Contents
+--------
+
+- `Overview`_
+- `Primary Input`_
+- `Primary Output`_
+- `Options/Secondary Inputs`_
+- `Other Output`_
+- `Command Line Interface`_
+- `Implementation Details`_
+
+Overview
+--------
+A tool which performs iterative principal component analysis.
+The general idea is to separate patient samples based on their ethnicity, by performing PCA on the variant data of each sample.
+After each analysis step, outliers are identified. The PCA is then repeated, with the outliers removed.
+This process continues for a set number of iterations specified by the user. After the pipeline completes, the user can see a
+detailed summary, and has access to the outliers identified at each iteration.
+
+Primary Input
+-------------
+As primary input the tool accepts a single file, which may be formatted in the following ways:
+
+- **Variant data file:** This should be a tab-delimited text file, with each row containing data about a single variant site from a single person. If this option is selected, the names of the columns which contain the required information must also be specified, either via a configuration file (see below), or through the tool's form fields.
+- **Numeric ped file:** See http://pngu.mgh.harvard.edu/~purcell/plink/data.shtml for detailed information on PED format. This tool requires the genotype at each site to be coded numerically, i.e.:
+
+  - 0 = homozygous reference
+  - 1 = heterozygous
+  - 2 = homozygous alternate
+
+  rather than consisting of pairs of genotypes for each site.
+- **RData file:** File containing stored data from an R session. For this tool the input must meet certain requirements:
+
+  - The file can only contain a SINGLE R object, which must be a list.
+  - The list must contain a named 'bed' element.
+  - The 'bed' element must be an n x m matrix/data frame, where n = number of samples and m = number of unique SNPs found across all the samples.
+  - The (i, j)th entry of the 'bed' matrix should give the genotype of the ith sample at the jth SNP site, coded as for numeric ped files (see above).
+  - The row names of the 'bed' matrix must contain the ids of the samples.
+  - The column names of the 'bed' matrix must contain the ids of the SNPs.
+
+  If these very specific criteria are not met, the tool WILL fail.
+
+- **RDS file (command line only):** File containing a single R object. The object must follow the same specifications as the RData file.
+
+Primary Output
+---------------
+
+HTML file containing plots of the PCA for each iteration.
+Possible plots, depending on user-specified options:
+
+- **Control vs Cases Plot:** If control and/or cases tags are provided, this plot will be output. ALL samples are plotted, with controls shown in blue, cases in red, and unknown samples in black.
+- **Cluster Plot:** Output if the user opts to do clustering. Samples are plotted, with clusters colour-coded. Outliers as identified by DBSCAN are always red and use an open circle as the icon. Trimmed clusters use a cross for the icon, instead of a circle. Both the outliers (open circles) AND the rejected clusters (crosses) will be dropped in the next iteration.
+- **Outliers Plot:** Output if the user does NOT opt to do clustering. Samples which are considered outliers (as described below under 'Detecting outliers without clustering') are plotted as red open circles; all other samples are plotted as green filled circles.
+- **Standard Deviations Plot:** Samples are colour-coded by standard deviation. Samples which fall within 1 standard deviation of the median are red, <= 2 sds are green, <= 3 sds are blue, and > 3 sds are purple.
+- **Ethnicity Plot:** Each ethnicity uses a specific colour and symbol. Fairly self-explanatory. Plot is only output if an ethnicity data file is provided as input.
+
+Beneath the plots there are also two expandable lists. 'Samples excluded' shows which samples were not part of the PCA for this iteration; this list is cumulative. 'Outliers' shows the outliers detected in THIS iteration. Any available data from the ethnicity file (if provided) is also displayed for each excluded sample.
+
+Options/Secondary Inputs
+------------------------
+- **Type of input data file:** Either a ped file or a text file as specified above.
+- **Number of iterations to complete:** A single iteration involves performing PCA on the input data, then identifying and removing outliers. Two iterations would involve performing PCA again with the outliers identified in the first iteration excluded, three iterations would exclude the outliers from the first two stages, and so on.
+- **Detecting outliers without clustering:** This is done by obtaining the standard deviations of the first two principal components. Any sample whose score for either of these first two components falls more than 'n' standard deviations away from the component median is considered an outlier.
+- **Clustering:** The user may select from a range of algorithms which will try to identify clusters in the data, with each cluster hopefully corresponding to an ethnic group.
+- **Clustering methods:**
+
+  - *DBSCAN (Density based spatial clustering of applications with noise):*
+
+    Forms clusters based on the density of points, and does not require the number of clusters to be specified beforehand. Good for irregularly shaped, non-spherical clusters. Does NOT require all points to be part of clusters, and produces a set of 'outliers', i.e. points which do not belong to any cluster.
+
+  - *Hierarchical Clustering:*
+
+    Forms clusters based on the distance between points. Tends to result in spherical clusters, but is able to handle clusters of varying density. Forces every point to be part of some cluster. The number of clusters is determined separately, using the silhouette scores of all the points as a heuristic.
+
+- **Cluster trimming methods:** All these methods first involve finding the centres of each cluster.
+
+  - *Standard Deviations:*
+
+    If the centroid of a cluster lies more than ‘n’ standard deviations (n is passed in as a parameter by the user) from the centroid of the entire dataset in either the x or y direction, the entire cluster is cut. If DBSCAN is selected, the outliers it identifies are also cut.
+
+  - *Mean Cluster Distance:*
+
+    Obtain the average distance between clusters, computed by taking the distance between all pairs of clusters and taking the mean. For each cluster, we also compute an average “isolation” value, which is the mean of the distances between that particular cluster and all other clusters. If a cluster’s isolation value is larger than the average cluster distance (multiplied by the strictness weighting), then that cluster is considered an outlier and cut from the next iteration. If DBSCAN is selected, the outliers it identifies are also cut.
+
+  - *DBSCAN outliers only:*
+
+    Only cut the points identified by the DBSCAN algorithm as not belonging to any cluster. No entire clusters are cut. This method is only applicable if DBSCAN is selected as the clustering method. THE TOOL WILL NOT RUN IF YOU SELECT THIS OPTION TOGETHER WITH 'Hierarchical Clustering' AS THE CLUSTERING METHOD.
+
+- **Strictness:** A multiplier used to determine how 'strict' the outlier cutting methods are. For example, if strictness = 1 and we are not doing clustering, all points which lie more than 1 sd from the median are cut. If strictness = 2, all points which lie more than 2 sds from the median are cut, etc.
+
+- **Control Tag:** A pattern present in the ids of all the control samples, e.g. "LP"
+
+- **Cases Tag:** A pattern present in the ids of all the cases samples, e.g. "HAPS"
+
+- **Configuration file:** A configuration file to accompany an input variant text file. The config file has a rather specific format; an example is given below::
+
+    #control
+    control_tag,#Sample,HAPS
+    cases_tag,#Sample,LP
+    #column_names
+    genotype_column,GT
+    reference_column,REF
+    alternate_column,ALT
+    sample_id_column,#Sample
+    chromosome_column,CHROM
+    position_column,POS
+    variant_id_column,ID
+    #numeric_filters
+    strand_bias_filter,Fraction_with_strand_bias,<,0.03
+    position_bias_filter,Fraction_with_positional_bias,<,0.03
+    count_filter,Num_samples_variant,>,1
+    pass_filter,Fraction_samples_passed_filter,>,0.9
+    #string_filters
+    variant_type_filter,Variant_Type,exact,accept
+    SNV
+    genotype_filter,GT,exact,accept
+    '0/1,'1/1
+
+  The file consists of up to four sections, the starts of which are marked by lines beginning with an octothorpe.
+
+  - *'#control' section:* Indicates substrings found in the ids of controls and cases.
+  - *'#column_names' section:* This is the only required section. The first column is a fixed key naming a piece of required information; the second column gives the name of the column in the variant text file which holds that information. The same keys (i.e. the left-most column values) as shown in the example must be used, e.g. sample_id_column, and the RHS column names must match the names in the variant data file. If using a generated config file, only modify the RHS column, and DO NOT REMOVE ANY rows from this section.
+  - *'#numeric_filters' section:* Each filter takes up a single line, and is separated into 4 fields by commas.
+
+    - Column 1: Name of the filter, which is arbitrary
+    - Column 2: The name of the column in the variant file to filter on. If this column is not found, a warning is displayed
+    - Column 3: The criterion a row must pass in order to be accepted, e.g. less than, greater than
+    - Column 4: The cutoff value to be compared against.
+
+  - *'#string_filters' section:* Each filter takes up two lines.
+
+    - Line 1, Column 1: Arbitrary filter name
+    - Line 1, Column 2: Column name to filter on
+    - Line 1, Column 3: Do the patterns have to be exact matches, or just substrings? E.g.
+      if pattern = "HAPS" and the string being compared is "HAPS-909090", this would not be a successful match if exact was specified, whereas it would be a match if not_exact was specified.
+    - Line 1, Column 4: What to do with the row if a successful match is detected, e.g. accept or reject
+    - Line 2: A comma-separated list of patterns to match on
+
+
+- **Ethnicity file:** A file containing ethnicity data, and possibly other data, on the samples. Note this data is not used to sort the input and has no effect on the PCA itself; it is used only to label the output.
+
+  Requirements:
+
+  - tab delimited
+  - Must have at least two columns
+  - First column has sample IDs
+  - Second column has ethnicities
+  - First row must be a header
+
+  The first few lines of a correctly formatted ethnicity file are given below::
+
+    IID population Halo1.or.2. BloodAge SalivaAge COB ethnicity
+    LP-10000001 AUSTRALIAN Halo2 - LP-BC 67 NA Australia australian
+    LP-10000003 AUSTRALIAN Halo1 45 NA Australia australian southern_european
+    LP-10000005 AUSTRALIAN Halo1 73 NA Australia australian southern_european
+    LP-10000008 EUROPE Halo1 54 NA South Eastern Europe south_east_european
+    LP-10000009 OTHER Halo1 65 NA Southern & East Africa jewish
+
+- **Exclude samples file:** A text file containing exact ids of samples to exclude from the PCA.
+
+  Requirements:
+
+  - single column
+  - sample ids separated by newlines
+  - one sample id per line
+
+  Example::
+
+    HAPS-90573
+    HAPS-90578R
+    HAPS-110542
+    HAPS-110605
+    HAPS-110620
+    HAPS-110638
+    HAPS-110649
+    HAPS-110668
+    HAPS-110799
+    HAPS-110813
+    HAPS-110959
+    HAPS-111186
+    HAPS-111298
+    HAPS-111404
+    HAPS-111493
+    HAPS-111512
+    HAPS-111538
+
+- **Exclude SNPs file:** A text file containing exact ids of SNPs to exclude from the PCA.
+
+  Requirements:
+
+  - single column
+  - SNP ids separated by newlines
+  - one SNP id per line
+
+  Example::
+
+    rs72896283
+    rs7534447
+    rs4662775
+    rs10932813
+    rs10932816
+    rs12330369
+    rs1802904
+    rs10902762
+    rs9996817
+    rs6446393
+    rs871133
+    rs4301095
+    rs941849
+    rs6917467
+    rs75834296
+    rs142922667
+
+- **Required Column Headers:** If a variant text file is the primary input, the following information MUST be provided, either through the config file or by filling out the corresponding fields in the tool submission form.
+
+  - Sample IDs: Name of the column containing the sample ids
+  - Chromosome: Name of the column indicating which chromosome the SNP is found on
+  - Position: Name of the column indicating at which position on the chromosome the SNP is found
+  - Genotype: Name of the column containing the genotype of the sample for this site
+  - Reference: Name of the column containing the 'normal'/'common' genotype for this site
+  - Alternate: Name of the column containing the alternate genotype for this site
+  - Variant IDs: Name of the column indicating the ID of the SNP
+
+- **Numeric Filters:** See the Configuration file section
+- **String Filters:** See the Configuration file section
+
+Other Output
+-------------
+
+- The tool will output a root folder containing the HTML file and all the plots, placed in directories separated by iteration.
+- If the input data was a variant file, the output folder will also contain a numeric ped file, generated before the first iteration, as well as a config file. The config file is either the exact one passed in by the user, or one automatically generated from the form input, which can be used for future PCA runs.
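+
+For reference, here is a minimal, untested R sketch of how to build a valid RData/RDS
+input object meeting the requirements listed under `Primary Input`_ (the sample and
+SNP ids below are made-up placeholders)::
+
+    # genotype matrix: rows = samples, columns = SNPs,
+    # entries coded 0/1/2 as for numeric ped files
+    bed = matrix(sample(0:2, 6, replace=TRUE), nrow=2, ncol=3)
+    rownames(bed) = c("SAMPLE-1", "SAMPLE-2")        # sample ids
+    colnames(bed) = c("rs0001", "rs0002", "rs0003")  # SNP ids
+
+    input_object = list(bed=bed)
+    saveRDS(input_object, "my_input.rds")            # RDS input (command line only)
+    save(input_object, file="my_input.RData")        # RData input: ONE object only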
+
+
+Command Line Interface
+-----------------------
+To run the tool via the command line, make sure you have the following files/folders::
+
+
+    |-> iterative_pca.py
+    |-> iterative_pca_plot.R
+    |-> pedify.py
+    |-> pca_report.html
+    |-> R_functions
+        |-> plotting_functions.R
+        |-> clustering_functions.R
+        |-> outlier_trimming.R
+        |-> pca_helpers.R
+        |-> pipeline_code.R
+
+
+**Output Directory Structure**
+
+By default, the root output directory will be called "full_output_<basename>", where <basename> is the name of the primary input data file with no extensions. The root output dir will contain the HTML report file and a log file, as well as some other possible outputs. Each iteration is stored in a subdirectory inside the root output dir; these are called "output_<basename>_iteration_<n>", and contain the pngs for the plots, as well as the outliers file and the excluded samples file. The output basename can be set with the --out command line argument. Here is a section of an example output directory tree, if the basename was 'data1'::
+
+    full_output_data1
+    |-> data1.html
+    |-> data1.log
+    |-> output_data1_iteration_0
+        |-> data1_outliers.txt
+        |-> data1_xsamples.txt
+        |-> data1_plot1
+        |-> data1_plot2
+        ...etc
+    |-> output_data1_iteration_1
+        ...etc
+
+**Example Usage**
+
+To see more help text on how to run the program, do::
+
+    python iterative_pca.py -h
+
+If we have a directory containing some valid input files at /input, then here are some example
+use cases:
+
+- *Text file containing variant data input*::
+
+    python iterative_pca.py /input/parsed_all_Halo1_150528_wGT.txt variant_data 10 --config_file /input/halo1_pca.config --ethnicity_file /input/Halo_ethnicity_rf.txt --clustering_flag --clustering_method dbscan --cluster_trimming sd --out halo1_out
+
+  Does 10 iterations on the input file, using the ethnicity data from 'Halo_ethnicity_rf.txt', and outputs data to 'full_output_halo1_out'. Uses the specified config file, DBSCAN for clustering, and trims by standard deviations.
+
+- *Numeric ped file input*::
+
+    python iterative_pca.py /input/halo1_numeric.ped numeric_ped 5
+
+  Does 5 iterations on the input file. Trims outliers purely by standard deviations (no clustering), and outputs data to 'full_output_halo1_numeric'.
+
+- *RDS file input*::
+
+    python iterative_pca.py /input/HapMap3_flashPCA_data.rds rds 20 --ethnicity_file /input/HapMap3_ethnicity_rf.txt --clustering_flag --clustering_method hclust --cluster_trimming mcd --out hapmap3_test1 --control_tag LP --cases_tag HAPS
+
+  Does 20 iterations on the input rds file, using ethnicity data from 'HapMap3_ethnicity_rf.txt'. Uses hierarchical clustering, and trims by mean cluster distance. Indicates that ids of control samples contain the pattern "LP", and ids of case samples contain the pattern "HAPS".
+
+
+Implementation Details
+----------------------
+The program consists of two main scripts: a top-level python script and an R script. The python script prepares the input files so they can be read into the R script. The R script is where most of the heavy lifting is done; the iterative PCA steps as well as the plotting all occur here. After this script finishes, the python script generates the HTML output using the Jinja2 templating engine, before exiting.
+
+Here is a brief overview of each file's function:
+
+Python:
+
+- **iterative_pca.py**: Top-level script which manages the entire pipeline. Its main functions are preparing the input files for the R script, writing to the log file, and creating the HTML file at the end.
+- **pedify.py**: Contains functions for parsing the configuration file and generating ped and map files from an input variant text file. Also responsible for filtering out unwanted samples/SNPs when parsing the text file.
+
+R:
+
+- **iterative_pca_plot.R**: Reads in data, then runs multiple PCA iterations, keeping track of the data. This data is then used to generate the output folders, plots and files.
+- **plotting_functions.R**: Functions used to prepare the different kinds of plots.
+- **clustering_functions.R**: Clustering functions. Contains functions to optimise the DBSCAN and hclust algorithms, including ways to find k, the number of clusters. Also contains different methods for identifying cluster outliers.
+- **outlier_trimming.R**: Functions to trim outliers (no clustering).
+- **pca_helpers.R**: Functions to read in data and perform PCA, along with some other utility functions.
+- **pipeline_code.R**: A function to run a single iteration of the pipeline.
+
+Other:
+
+- **pca_report.html**: HTML template which is filled in with the iteration data after completing a run successfully.
+
+
+
diff -r 000000000000 -r 64e75e21466e R_functions/clustering_functions.R
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/R_functions/clustering_functions.R Wed Jun 01 03:38:39 2016 -0400
@@ -0,0 +1,134 @@
+# Automated clustering suite that makes use of the dbscan and hclust
+# clustering algorithms
+library(dbscan)
+library(cluster)
+
+# Run the OPTICS clustering algorithm on the input data.
+# Computationally determines the 'optimal' value for the epsilon neighbourhood argument.
+# Outputs an integer array indicating which cluster each sample from the input data belongs to.
+# 0 indicates the sample is an outlier and does not belong to any cluster
+automated_dbscan = function(pca_data, emax, mp) {
+    d = dist(pca_data$values[, c(1, 2)])
+    oc = optics(d, emax, mp)
+    eps = compute_eps(oc, 3)
+    oc_clusters = optics_cut(oc, eps)
+    return(oc_clusters$cluster)
+}
+
+# Run the hclust clustering algorithm on the input data, automatically
+# computing the optimal value for 'k', i.e. the number of clusters.
+# Outputs an integer array indicating which cluster each sample from the input data belongs to.
+automated_hclust = function(pca_data) {
+    d = dist(pca_data$values[, c(1, 2)])
+    hc = hclust(d)
+    k = compute_k(hc, d, 10)
+    clusters = cutree(hc, k)
+    return(clusters)
+}
+
+# Determine the optimal epsilon value for dbscan
+# from the input OPTICS data.
+compute_eps = function(oc, weight) {
+    rdist = oc$reachdist
+    iqr = IQR(rdist)
+    uc = median(rdist) + weight*iqr
+    outliers = rdist[which(rdist > uc)]
+    eps = median(outliers)
+    return(eps)
+}
+
+# Determine k using the silhouette method and hclust.
+# Silhouette values near 1 indicate the clustering is appropriate.
+compute_k = function(hc, d, n) {
+    s = 1:n
+    # try cluster counts from 2 to n, evaluating the 'silhouette coefficient'
+    # for each value of k.
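+    # the k whose clustering yields the highest mean silhouette width is returned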
+    for (i in 1:n) {
+        # silhouette method not valid for a single cluster (k = 1)
+        if (i == 1) {
+            # if the silhouette coeff is < 0.5 for all the other values,
+            # we assume there is just one big cluster
+            s[i] = 0.5
+        } else {
+            # get the silhouette coeff for every point, and take the mean
+            clusters = cutree(hc, i)
+            sil = silhouette(clusters, d)
+            sil_coeff = mean(sil[, 'sil_width'])
+            s[i] = sil_coeff
+        }
+    }
+    return(which.max(s))
+}
+
+# Compute the centroids of each cluster
+# returns an n x m matrix, where
+# n = number of clusters
+# m = dimension of data
+find_cluster_centers = function(clusters, x) {
+    cluster_ids = unique(clusters)
+    k = length(cluster_ids)
+    centers = matrix(nrow=k, ncol=ncol(x))
+    rownames(centers) = 1:k
+    for (i in 1:k) {
+        this_id = cluster_ids[i]
+        cluster_points = x[which(clusters==this_id), ]
+        if (is.null(dim(cluster_points))) {
+            l = length(cluster_points)
+            cluster_points = matrix(cluster_points, ncol=l)
+        }
+        centers[i, ] = apply(cluster_points, 2, FUN=mean)
+        rownames(centers)[i] = this_id
+    }
+    # don't keep the centre worked out for the outlier group (cluster id 0)
+    centers = centers[rownames(centers) != 0, , drop=FALSE]
+    return(centers)
+}
+
+# Determine which clusters are outliers based on how far away their cluster
+# centers are from the center of all the data. Returns a list containing the
+# 'ids' of all rejected CLUSTERS (not samples)
+find_cluster_outliers_sd = function(clusters, centers, pca_data, numsds) {
+    if(nrow(centers) <= 1) {
+        print("no centers, or only one center")
+        return(NULL)
+    }
+    # work out the standard deviation of the data in pc1 and pc2 directions
+    pc1_sd = sd(pca_data$values[, 1])
+    pc2_sd = sd(pca_data$values[, 2])
+    # work out centroid of the pca data
+    centroid = apply(pca_data$values, 2, FUN=mean)
+    pc1_bar = centroid[1]
+    pc2_bar = centroid[2]
+    # get pc1 and pc2 hi-lo cutoffs
+    pc1_hi = pc1_bar + pc1_sd*numsds
+    pc1_lo = pc1_bar - pc1_sd*numsds
+    pc2_hi = pc2_bar + pc2_sd*numsds
+    pc2_lo = pc2_bar - pc2_sd*numsds
+    # check which clusters fall outside cutoffs
+    pc1_centers = centers[, 1]
+    pc2_centers = centers[, 2]
+    pc1_ol = which(pc1_centers > pc1_hi | pc1_centers < pc1_lo)
+    pc2_ol = which(pc2_centers > pc2_hi | pc2_centers < pc2_lo)
+    all_ol = union(pc1_ol, pc2_ol)
+    rc = rownames(centers)[all_ol] # ids of rejected clusters
+    return(rc)
+}
+
+# Determine which clusters are outliers based on how far away they are from other clusters
+# Returns a list containing the 'ids' of all rejected CLUSTERS (not samples)
+find_cluster_outliers_mcd = function(clusters, centers, pca_data, multiplier, ndim) {
+    if(nrow(centers) <= 1) {
+        print("no centers, or only one center")
+        return(NULL)
+    }
+    d = dist(centers[, 1:ndim])
+    dm = as.matrix(d)
+    avg_distance = mean(as.numeric(d))
+    # mean distance from each cluster to the others (dividing by ncol-1 ignores the zero diagonal)
+    cluster_distances = rowSums(dm)/(ncol(dm)-1)
+    # trim clusters based on how far apart they are
+    cutoff = multiplier*avg_distance
+    indices = which(cluster_distances > cutoff)
+    rc = names(cluster_distances)[indices]
+    return(rc)
+}
\ No newline at end of file
diff -r 000000000000 -r 64e75e21466e R_functions/outlier_trimming.R
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/R_functions/outlier_trimming.R Wed Jun 01 03:38:39 2016 -0400
@@ -0,0 +1,21 @@
+# Finding outliers by standard deviation
+
+# Get samples whose pc1 OR pc2 values lie more than 'numsds' s.devs
+# away from the sample median for that pc.
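+# Note: the 'xsamples' argument is currently unused here; outliers are computed
+# from 'pca_data' alone.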
+outliers_by_sd = function(pca_data, xsamples, numsds) {
+    pc1_outliers = find_outliers(pca_data$values[, 1], numsds)
+    pc2_outliers = find_outliers(pca_data$values[, 2], numsds)
+    all_outliers = union(pc1_outliers, pc2_outliers)
+    return(all_outliers)
+}
+
+# compute outliers
+# Returns indices of all samples which lie more than
+# 'numsds' s.devs away from the sample median
+find_outliers = function(input_data, numsds) {
+    lower = median(input_data) - numsds*sd(input_data)
+    upper = median(input_data) + numsds*sd(input_data)
+
+    outliers = which(input_data < lower | input_data > upper)
+    return(outliers)
+}
\ No newline at end of file
diff -r 000000000000 -r 64e75e21466e R_functions/pca_helpers.R
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/R_functions/pca_helpers.R Wed Jun 01 03:38:39 2016 -0400
@@ -0,0 +1,114 @@
+library(flashpcaR)
+
+# Read in data from a numeric ped file, an rds file or an RData file.
+# Returns numeric ped data, with the row names set to the ids of the samples
+get_source_data = function(data_source, data_type) {
+    data_type = tolower(data_type)
+    if (data_type == "numeric_ped") {
+        # read in ped file
+        ped_data = read.table(data_source, sep="\t", row.names=1)
+    } else if (data_type == "rds") {
+        hapmap3_object = readRDS(data_source)
+        ped_data = hapmap3_object$bed
+    } else if (data_type == "rdata") {
+        hapmap3_object = load_obj(data_source)
+        ped_data = hapmap3_object$bed
+    } else {
+        print("Unrecognised data type, returning NULL")
+        ped_data = NULL
+    }
+    return(ped_data)
+}
+
+# A function that will read in and return a single object from an RData file.
+# This is a workaround so the program can run without needing to know the name of the object;
+# however the assumption is that the RData file contains only ONE object (the one we want)
+load_obj = function(filename) {
+    # create new environment
+    env = new.env()
+    # load the rdata file into the new environment, and get the NAME
+    # of the first object
+    object_name = load(filename, env)[1]
+    # return the object
+    return(env[[object_name]])
+}
+
+# Remove unwanted rows or columns (samples and snps, respectively) from
+# the ped data
+filter_ped_data = function(ped_data, xsamples, xsnps) {
+    # rows to remove
+    rr = which(rownames(ped_data) %in% xsamples)
+    # remove rejected samples
+    if (length(rr) != 0) {
+        fpd1 = ped_data[-rr, , drop=FALSE]
+    } else {
+        fpd1 = ped_data
+    }
+    # remove all-zero and rejected snp columns
+    snps = which(colnames(ped_data) %in% xsnps)
+    zeros = which(colSums(abs(fpd1)) == 0)
+    cr = union(snps, zeros)
+    if (length(cr) != 0) {
+        fpd2 = fpd1[, -cr, drop=FALSE]
+    } else {
+        fpd2 = fpd1
+    }
+    # remove monomorphic snps
+    snp_sds = apply(fpd2, 2, sd)
+    clean_ped = fpd2[, snp_sds >= 0.01, drop=FALSE]
+    return(clean_ped)
+}
+
+# Ethnicity file requirements:
+# - tab delimited
+# - Must have at least two columns
+# - First column has sample IDs
+# - Second column has ethnicities
+# - First row must be a header
+parse_ethnicity_file = function(eth_filename) {
+    if (file.exists(eth_filename) == FALSE) {
+        print(paste0("Warning: Ethnicity file: '", eth_filename, "' not found"))
+        return(NULL)
+    }
+    if (file.info(eth_filename)$size == 0) {
+        print(paste0("Warning: Ethnicity file: '", eth_filename, "' is empty"))
+        return(NULL)
+    }
+    eth_data = read.table(eth_filename, header=TRUE, sep="\t")
+    rownames(eth_data) = eth_data[, 1]
+    colnames(eth_data)[1] = "IID"
+    colnames(eth_data)[2] = "population"
+    return(eth_data)
+}
+
+# Read in a file and return the first column as a
+# character vector
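+# Returns an empty character vector (with a printed warning) if the file is
+# missing or empty.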
+get_first_column = function(fname) {
+    rv = c()
+    if (file.exists(fname) == FALSE) {
+        print(paste0("Warning: File: '", fname, "' not found"))
+        return(rv)
+    }
+    if (file.info(fname)$size == 0) {
+        print(paste0("Warning: File: '", fname, "' is empty"))
+        return(rv)
+    } else {
+        rv = as.character(read.table(fname)[, 1])
+        return(rv)
+    }
+}
+
+# Do pca using flashpcaR. Returns a 2-element list:
+# values - the sample scores along the pcs.
+#     Will be an n x m matrix, where
+#     - n = number of samples
+#     - m = number of pcs
+# ids - Character array of ids, same length as the number of rows in values
do_pca = function(ped_data) {
+    pca_data = list()
+    pm = data.matrix(ped_data)
+    values = flashpca(pm, ndim=6)$vectors
+    pca_data$values = values
+    pca_data$ids = as.character(rownames(ped_data))
+    return(pca_data)
+}
\ No newline at end of file
diff -r 000000000000 -r 64e75e21466e R_functions/pipeline_code.R
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/R_functions/pipeline_code.R Wed Jun 01 03:38:39 2016 -0400
@@ -0,0 +1,112 @@
+### pipeline ###
+# Complete a single iteration, which consists of:
+# - Doing pca
+# - Clustering, if required
+# - Finding outliers
+# - Setting up plots
+# Outputs a list containing all the data regarding this iteration
+single_iteration = function(outdir, basename, ped_data, xsamples, numsds,
+        cmethod, tmethod, control_tag, cases_tag, ethnicity_data=NULL) {
+    it_data = list()
+    it_data$old_xsamples = xsamples
+    # get data and do pca
+    pca_data = do_pca(ped_data)
+    it_data$pca_data = pca_data
+    it_data$plots = list()
+    plot_number = 1
+
+    # plot controls and cases
+    if (!is.null(control_tag) || !is.null(cases_tag)) {
+        it_data$plots[[plot_number]] = setup_cvc_plot(pca_data, control_tag, cases_tag)
+        plot_number = plot_number + 1
+    }
+
+    # if we have ethnicity data, set up a special plot
+    if (!is.null(ethnicity_data)) {
+        it_data$plots[[plot_number]] = setup_ethnicity_plot(pca_data, ethnicity_data)
+        plot_number = plot_number + 1
+    }
+
+    if (cmethod == "none") {
+        # get outliers by sd
+        all_outliers = outliers_by_sd(pca_data, xsamples, numsds)
+        new_xsamples = union(xsamples, pca_data$ids[all_outliers])
+        it_data$xsamples = new_xsamples
+        # prepare outlier plot
+        it_data$plots[[plot_number]] = setup_ol_plot(pca_data, all_outliers)
+        plot_number = plot_number + 1
+        # prepare sd plot
+        it_data$plots[[plot_number]] = setup_sd_plot(pca_data)
+        plot_number = plot_number + 1
+    } else {
+        # do clustering
+        if (cmethod == "dbscan") {
+            emax = 2
+            mp = 7
+            clusters = automated_dbscan(pca_data, emax, mp)
+        } else if (cmethod == "hclust") {
+            clusters = automated_hclust(pca_data)
+        } else {
+            clusters = automated_hclust(pca_data)
+        }
+
+        # get outliers
+        cluster_outliers = which(clusters == 0)
+        # get rejected clusters
+        centers = find_cluster_centers(clusters, pca_data$values)
+        if (tmethod == "mcd") {
+            rc = find_cluster_outliers_mcd(clusters, centers, pca_data, numsds, 2)
+        } else if (tmethod == "sd") {
+            rc = find_cluster_outliers_sd(clusters, centers, pca_data, numsds)
+        } else if (tmethod == "dbscan_outliers_only") {
+            rc = 0
+        }
+        rc_indices = which(clusters %in% rc)
+        all_ol = union(cluster_outliers, rc_indices)
+        # it is possible that all samples get removed, in which case the program would crash,
+        # so do not remove them in that case
+        if (length(all_ol) == nrow(ped_data)) {
+            new_xsamples = xsamples
+        } else {
+            new_xsamples = union(xsamples, pca_data$ids[all_ol])
+        }
+        it_data$xsamples = new_xsamples
+        # prepare plot
+        it_data$plots[[plot_number]] = setup_cluster_plot(pca_data, clusters, rc=rc)
+        plot_number = plot_number + 1
+    }
+    it_data$outliers = setdiff(new_xsamples, xsamples)
+    it_data$num_plots = plot_number - 1
+    return(it_data)
+}
+
+# basically a left join of a list of ids onto a table of ethnicity data;
+# if eth_data is NULL, then the second column is filled with NAs
+add_ethnicity_data = function(ids, eth_data) {
+    n = length(ids)
+    if (!is.null(eth_data)) {
+        output = matrix(nrow=n, ncol=ncol(eth_data))
+        colnames(output) = colnames(eth_data)
+        if (n > 0) {
+            for (i in 1:n) {
+                this_id = ids[i]
+                if (this_id %in% rownames(eth_data)) {
+                    this_row = unlist(lapply(eth_data[this_id, ], as.character), use.names=FALSE)
+                } else {
+                    this_row = c(this_id, rep(NA, ncol(output)-1))
+                }
+                output[i, ] = this_row
+            }
+        }
+    } else {
+        output = cbind(ids, rep(NA, n))
+        colnames(output) = c("IID", "Population")
+    }
+    return(output)
+}
+
+generate_directory_name = function(outdir, basename, iteration) {
+    newdir = paste0("output_", basename, "_iteration_", iteration - 1)
+    full_path = paste0(outdir, "/", newdir)
+    return(full_path)
+}
\ No newline at end of file
diff -r 000000000000 -r 64e75e21466e R_functions/plotting_functions.R
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/R_functions/plotting_functions.R Wed Jun 01 03:38:39 2016 -0400
@@ -0,0 +1,225 @@
+## Plotting and grouping ##
+# input_data: some number of 2d observations. Each row represents a single observation,
+#     column 1 = variable 1, to be plotted on the x-axis,
+#     column 2 = variable 2, to be plotted on the y-axis
+# groups: Integer vector with the same number of entries as there are rows in the input data,
+#     representing which group each observation belongs to. Negative numbers are not plotted
+# tags: the tag to put on the legend for each group
+# plot_colors: colors to use for each group
+# plot_symbols: symbols to use for each group
+# plot_title: as the name suggests
+# plot_filename: if this is not null, the graph is output to a png with the specified name
+plot_by_groups = function(input_data, groups, tags, plot_colors, plot_symbols, plot_title, plot_filename=NULL) {
+    if(!is.null(plot_filename)) {
+        png(plot_filename)
+    }
+    # leave some extra room on the RHS for the legend
+    par(mar=c(5.1, 4.1, 4.1, 8.1))
+    x = as.numeric(input_data[, 1])
+    y = as.numeric(input_data[, 2])
+    gids = sort(unique(groups[which(groups >= 0)]))
+    n = length(gids)
+
+    # first set up the plot area to the correct dimensions
+    plot(x, y, col="white")
+
+    for (i in 1:n) {
+        gid = gids[i]
+        pts_x = x[which(groups == gid)]
+        pts_y = y[which(groups == gid)]
+        pts_color = plot_colors[i]
+        pts_symbol = plot_symbols[i]
+        points(pts_x, pts_y, col=pts_color, pch=pts_symbol)
+    }
+    legend(x="topright",
+        xpd=TRUE,
+        inset=c(-0.3, 0),
+        col=plot_colors,
+        pch=plot_symbols,
+        legend=tags,
+        text.col=plot_colors)
+    title(main=plot_title)
+    if(!is.null(plot_filename)) {
+        dev.off()
+    }
+}
+
+# Controls vs cases plot. Colour controls blue, cases red,
+# samples which are neither control nor case black.
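+# Returns a 'plot_info' list (groups, tags, plot_colors, plot_symbols, plot_title)
+# in the format expected by plot_by_groups.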
+setup_cvc_plot = function(pca_data, control_tag, cases_tag) { + plot_info = list() + nsamples = length(pca_data$ids) + groups = rep(1, nsamples) + control_legend = paste0("CO: ", control_tag) + cases_legend = paste0("CA: ", cases_tag) + if (!is.null(control_tag)) { + groups[grep(control_tag, pca_data$ids)] = 2 + } + if (!is.null(cases_tag)) { + groups[grep(cases_tag, pca_data$ids)] = 3 + } + res = sort(unique(groups)) + if (length(res) == 1) { + tags = c("UNKNOWN") + plot_colors = c("black") + } else if (length(res) == 3) { + tags = c("UNKNOWN", control_legend, cases_legend) + plot_colors = c("black", "blue", "red") + } else { + if (all(res == c(1, 2))) { + tags = c("UNKNOWN", control_legend) + plot_colors = c("black", "blue") + } else if (all(res == c(1, 3))) { + tags = c("UNKNOWN", cases_legend) + plot_colors = c("black", "red") + } else { + tags = c(control_legend, cases_legend) + plot_colors = c("blue", "red") + } + } + plot_info$groups = groups + plot_info$tags = tags + plot_info$plot_colors = plot_colors + plot_info$plot_symbols = rep(1, length(res)) + plot_info$plot_title = "Control vs Cases Plot" + return(plot_info) +} + +# outliers plot; colour outliers red, non-outliers green +setup_ol_plot = function(pca_data, outliers) { + plot_info = list() + nsamples = dim(pca_data$values)[1] + groups = 1:nsamples + groups[outliers] = 1 + groups[setdiff(1:nsamples, outliers)] = 2 + plot_info$groups = groups + plot_info$tags = c("outliers", "good data") + plot_info$plot_colors = c("red", "green") + plot_info$plot_symbols = c(1, 20) + plot_info$plot_title = "Outliers Plot" + return(plot_info) +} + +# standard deviations plot; colour samples by s.dev +setup_sd_plot = function(pca_data) { + plot_info = list() + nsamples = dim(pca_data$values)[1] + pc1 = as.numeric(pca_data$values[, 1]) + pc2 = as.numeric(pca_data$values[, 2]) + pc1_sds = as.numeric(lapply(pc1, compute_numsds, pc1)) + pc2_sds = as.numeric(lapply(pc2, compute_numsds, pc2)) + + groups = 1:nsamples + groups[get_sdset2d(pc1_sds, pc2_sds, 1)] = 1 + groups[get_sdset2d(pc1_sds, pc2_sds, 2)] = 2 + groups[get_sdset2d(pc1_sds, pc2_sds, 3)] = 3 + groups[union(which(pc1_sds > 3), which(pc2_sds > 3))] = 4 + plot_info$groups = groups + plot_info$tags = c("SD = 1", "SD = 2", "SD = 3", "SD > 3") + plot_info$plot_colors = rainbow(4) + plot_info$plot_symbols = rep(20, 4) + plot_info$plot_title = "Standard Deviations Plot" + return(plot_info) +} + +# Plot samples, with coloured clusters. Rejected clusters use +# a cross symbol instead of a filled circle +setup_cluster_plot = function(pca_data, clusters, rc=NULL) { + plot_info = list() + groups = clusters + ids = sort(unique(groups)) + n = length(ids) + tags = 1:n + for (i in 1:n) { + tags[i] = sprintf("cluster %s", ids[i]) + } + outliers = which(groups == 0) + if (length(outliers) != 0) { + tags[1] = "outliers" + } + plot_colors = rainbow(n) + plot_symbols = rep(20, n) + if (length(outliers) != 0) { + plot_symbols[1] = 1 + } + # labelling for rejected clusters + if(!is.null(rc)) { + for(i in 1:n) { + if((ids[i] != 0) && (ids[i] %in% as.numeric(rc))) { + tags[i] = "rej. clust." + plot_symbols[i] = 4 + } + } + } + plot_info$groups = groups + plot_info$tags = tags + plot_info$plot_colors = plot_colors + plot_info$plot_symbols = plot_symbols + plot_info$plot_title = "Cluster Plot" + return(plot_info) +} + +# Plot samples, colouring by ethnicity. Different ethnicities also +# have different symbols. 
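+# Samples with no entry in the ethnicity table are grouped and plotted as 'UNKNOWN'.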
+setup_ethnicity_plot = function(pca_data, ethnicity_data) {
+    plot_info = list()
+    nsamples = dim(pca_data$values)[1]
+    eth = 1:nsamples
+
+    for (i in 1:nsamples) {
+        sample_id = pca_data$ids[i]
+        eth[i] = as.character(ethnicity_data[sample_id, "population"])
+        if(is.na(eth[i])) {
+            eth[i] = "UNKNOWN"
+        }
+    }
+    n = length(unique(eth))
+    plot_info$groups = as.numeric(as.factor(eth))
+    plot_info$tags = sort(unique(eth))
+    plot_info$plot_colors = rainbow(n)
+    plot_info$plot_symbols = 1:n
+    plot_info$plot_title = "Ethnicity Plot"
+    return(plot_info)
+}
+
+# draw the hi-lo sd cutoff lines for the pcs in columns x and y
+draw_cutoffs = function(input_data, x, y, numsds) {
+    pcx = as.numeric(input_data[, x])
+    pcy = as.numeric(input_data[, y])
+
+    vlines = c(median(pcx) - numsds*sd(pcx),
+        median(pcx) + numsds*sd(pcx))
+    hlines = c(median(pcy) - numsds*sd(pcy),
+        median(pcy) + numsds*sd(pcy))
+    abline(v=vlines)
+    abline(h=hlines)
+}
+
+# Following helper functions are used in the 'setup_sd_plot' function
+# given a list of standard deviations, work out which points are n standard deviations away
+get_sdset2d = function(x1, x2, n) {
+    if (n == 1) {
+        ind = intersect(which(x1 == 1), which(x2 == 1))
+    } else {
+        lower = get_sdset2d(x1, x2, n - 1)
+        upper = union(which(x1 > n), which(x2 > n))
+        xset = union(lower, upper)
+        bigset = union(which(x1 == n), which(x2 == n))
+        ind = setdiff(bigset, xset)
+    }
+    return(ind)
+}
+
+# work out how many standard deviations away from the sample median a single point is
+# accuracy of this decreases for outliers, as the error in the estimated sd is
+# multiplied
+compute_numsds = function(point, x) {
+    x_sd = sd(x)
+    sum = x_sd
+    m = median(x)
+    i = 1
+    while(abs(point - m) > sum) {
+        i = i + 1
+        sum = sum + x_sd
+    }
+    return(i)
+}
\ No newline at end of file
diff -r 000000000000 -r 64e75e21466e iterative_pca.py
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/iterative_pca.py Wed Jun 01 03:38:39 2016 -0400
@@ -0,0 +1,393 @@
+import os
+import subprocess
+import sys
+import argparse
+import datetime
+import csv
+
+import jinja2
+
+import pedify
+
+JINJA_ENVIRONMENT = jinja2.Environment(
+    loader=jinja2.FileSystemLoader(os.path.dirname(__file__)),
+    extensions=['jinja2.ext.autoescape'],
+    autoescape=True)
+DATA_TYPES = ["variant_data", "rds", "rdata", "numeric_ped"]
+CLUSTERING_METHODS = ["dbscan", "hclust", "kmeans"]
+CLUSTER_TRIMMING = ["sd", "mcd", "dbscan_outliers_only"]
+HTML_TEMPLATE_NAME = "pca_report.html"
+DEFAULT_SD_CUTOFF = 2
+PATH_TO_R_FUNCTIONS = "{}/R_functions".format(os.path.dirname(os.path.abspath(__file__)))
+
+
+def main():
+    ###===ARGUMENT PARSING===###
+    parser = argparse.ArgumentParser()
+    parser.add_argument("datafile_name", help="name of data file")
+    parser.add_argument("data_type", type=str.lower, choices=DATA_TYPES, help="type of data file")
+    parser.add_argument("iterations", help="max number of iterations to run",
+        type=int)
+    parser.add_argument("--control_tag", help="substring present in the ids of all control samples, e.g. LP")
+    parser.add_argument("--cases_tag", help="substring present in the ids of all case samples, e.g. HAPS")
+    parser.add_argument("--config_file", help="name of file containing config info")
+    parser.add_argument("--clustering_flag", help="Flag to indicate whether to do clustering", action="store_true")
+    parser.add_argument("--clustering_method", type=str.lower, choices=CLUSTERING_METHODS,
+        help="algorithm used to find clusters")
+    parser.add_argument("--cluster_trimming", type=str.lower, choices=CLUSTER_TRIMMING,
+        help="method used to identify and trim outlier clusters")
+    parser.add_argument("-s", "--sd_cutoff", help="number of standard deviations at which to cut outliers")
+    parser.add_argument("-o", "--out", help="base name of output files and directories; defaults to datafile name")
+    group = parser.add_mutually_exclusive_group()
+    group.add_argument("-r", "--relative_prefix", help="RELATIVE path CONTAINING the output directory")
+    group.add_argument("-a", "--absolute_prefix", help="ABSOLUTE path CONTAINING the output directory")
+    parser.add_argument("--html_file", help="ABSOLUTE PATH of output html file")
+    parser.add_argument("--ethnicity_file", help="File containing data about the ethnicity of the samples")
+    parser.add_argument("--galaxy", help="Flag to indicate we are running this script in Galaxy",
+        action="store_true")
+    parser.add_argument("--reject_snps", help="file containing SNP ids to ignore/remove")
+    parser.add_argument("--reject_samples", help="file containing sample ids to ignore/remove")
+
+    args = parser.parse_args()
+
+    # enforce conditionally required args
+    if args.data_type == "variant_data":
+        if args.config_file is None:
+            parser.error("Input variant data files require a complementary config file, "
+                "specified with the '--config_file <filename>' flag")
+
+    if args.config_file:
+        if args.data_type != "variant_data":
+            print "Warning! Your config file will be ignored, as the input data is not a " \
+                "text file containing variant data"
+
+    if args.clustering_flag:
+        if (args.clustering_method is None) or (args.cluster_trimming is None):
+            parser.error("If --clustering_flag is set, "
+                "--clustering_method and --cluster_trimming options must also be specified")
+
+    if args.clustering_method or args.cluster_trimming:
+        if not args.clustering_flag:
+            parser.error("--clustering_method and --cluster_trimming "
+                "options are only valid if --clustering_flag is also set")
+
+    # dbscan outliers only valid if cluster method == "dbscan"
+    if args.cluster_trimming == "dbscan_outliers_only":
+        if args.clustering_method != "dbscan":
+            parser.error("dbscan_outliers_only cluster trimming method is only "
+                "valid if --clustering_method == dbscan")
+
+    dfname = args.datafile_name
+    data_type = args.data_type
+    n = args.iterations
+    control_tag = args.control_tag
+    cases_tag = args.cases_tag
+    cfname = args.config_file
+    cflag = args.clustering_flag
+    cmethod = (args.clustering_method if args.clustering_method else "none")
+    tmethod = (args.cluster_trimming if args.cluster_trimming else "none")
+    sd_cutoff = (args.sd_cutoff if args.sd_cutoff else DEFAULT_SD_CUTOFF)
+    basename = (args.out if args.out else dfname.split('/')[-1].split('.')[0])
+    hfname = args.html_file
+    galaxy_flag = args.galaxy
+    ethfname = args.ethnicity_file
+    xsnps_filename = args.reject_snps
+    xsamples_filename = args.reject_samples
+
+    # function used to handle strange galaxy behaviour...
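+    # (galaxy form fields cannot safely contain characters like '<', '>' or '#',
+    # so the wrapper passes letter op codes (g/l/e/ge/le) and '__pd__' instead;
+    # format_galaxy_config_file converts these back before the config is parsed)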
+    if galaxy_flag and cfname:
+        rc = format_galaxy_config_file(cfname)
+        if rc != 0:
+            print "Bad config file"
+            sys.exit(1)
+
+    path_here = os.path.dirname(os.path.abspath(__file__))
+    # determine output directory
+    if args.relative_prefix:
+        outdir = "{}/{}/full_output_{}".format(path_here, args.relative_prefix, basename)
+    elif args.absolute_prefix:
+        outdir = "{}/full_output_{}".format(args.absolute_prefix, basename)
+    else:
+        outdir = "{}/full_output_{}".format(path_here, basename)
+    if hfname is None:
+        hfname = "{}/{}.html".format(outdir, basename)
+
+    ####===BEGIN PIPELINE===####
+    # create a log file
+    lfname = "{}.log".format(basename)
+    logfile = open(lfname, 'w')
+    logfile.write("##Iterative PCA commenced at {}##\n".format(datetime.datetime.now()))
+
+    # create output directory
+    prepare_directory(outdir, logfile)
+
+    # do pre-processing if necessary, else go straight to R
+    if data_type == "variant_data":
+        stage1_info = stage_text_to_ped(dfname, cfname, xsamples_filename,
+            xsnps_filename, basename, outdir, logfile, True)
+        data_source_name = "{}/{}".format(outdir, stage1_info['pfname'])
+        data_type_out = "numeric_ped"
+        xsamples_filename = "{}/{}".format(outdir, stage1_info['xsname'])
+    elif data_type in ["rds", "rdata", "numeric_ped"]:
+        data_source_name = dfname
+        data_type_out = data_type
+    else:
+        print "Unrecognised input data type, exiting..."
+        sys.exit(1)
+
+    # do iterative pca in R (much faster)
+    rCommand = "Rscript {}/iterative_pca_plot.R '{}' '{}' '{}' '{}' \
+        '{}' '{}' '{}' '{}' '{}' \
+        '{}' '{}' '{}' '{}' '{}'".format(
+        path_here,
+        n, outdir, basename, data_source_name,
+        data_type_out, ethfname, control_tag, cases_tag, sd_cutoff,
+        cmethod, tmethod, PATH_TO_R_FUNCTIONS, xsamples_filename, xsnps_filename)
+
+    subprocess.call(rCommand, shell=True)
+    # generate dictionary of output file names
+    if cflag:
+        num_plots = 1
+    else:
+        num_plots = 2
+    if ethfname:
+        num_plots += 1
+    if cases_tag or control_tag:
+        num_plots += 1
+    iteration_data = create_output_dict(outdir, basename, n, num_plots)
+
+    # clean up logfile
+    logfile.write("##Iterative PCA completed at {}##\n".format(datetime.datetime.now()))
+    logfile.close()
+    subprocess.call("mv '{}' '{}'".format(lfname, outdir), shell=True)
+    lfname = "{}/{}".format(outdir, lfname)
+    if cfname:
+        # copy and rename config file into outdir (if provided)
+        subprocess.call("cp '{}' '{}/{}.config'".format(cfname, outdir, basename), shell=True)
+    # output results summary in a nice html format
+    html_output = produce_report(hfname, dfname, lfname, cflag,
+        cmethod, tmethod, sd_cutoff, iteration_data, galaxy_flag)
+##############################################################################
+
+"""
+Function to convert a text file containing variant data into a numeric or genomic ped file
+Inputs:
+- dfname: The name of the text file to convert
+- cfname: Configuration file containing info on which columns hold the required information,
+  as well as filtering options
+- xsamples_filename: A file containing a list of exact sample ids to exclude
+- xsnps_filename: A file containing a list of exact snp ids to exclude
+- basename: Filename without the extension
+- outdir: Full path of output directory
+- logfile: A file to keep track of program status. Any shell commands executed will be written to this file
+- numeric_flag: If true, the ped file will be numeric; if false, genomic
+Outputs:
+- ped file, either numeric or genomic
+- map file
+- excluded samples file, containing the ids of all samples which did not pass the filters and were rejected
+Returns:
+    a dict containing the PATHS to important output files
+"""
+def stage_text_to_ped(dfname, cfname, xsamples_filename, xsnps_filename, basename, outdir, logfile, numeric_flag):
+    # Create ped and map files
+    logfile.write("##STAGE 1: Create .ped and .map files from input text file##\n")
+
+    pfname = "{}/{}.ped".format(outdir, basename)
+    mfname = "{}/{}.map".format(outdir, basename)
+    xsname = "{}/{}_xsamples.txt".format(outdir, basename)
+
+    pc = pedify.PedConverter()
+    # read in config file
+    rc = pc.read_config_file(cfname)
+    # add in string filters for excluded samples and snps
+    if xsamples_filename:
+        xsamples = open(xsamples_filename, 'r').read().split('\n')
+        pc.cfg.add_string_filter("comp_generated_sample_cuts", pc.cfg.col_names['sample_id_column'],
+            "exact", "reject", xsamples)
+    if xsnps_filename:
+        xsnps = open(xsnps_filename, 'r').read().split('\n')
+        pc.cfg.add_string_filter("comp_generated_snp_cuts", pc.cfg.col_names['variant_id_column'],
+            "exact", "reject", xsnps)
+    if rc == 0:
+        print 'config file read successfully'
+    else:
+        print 'failed to read in config file successfully. Error code: {}'.format(rc)
+
+    # read in data file
+    rc = pc.read_data_file(dfname)
+    if rc == 0:
+        print 'data file read successfully'
+    else:
+        print 'failed to read in data file successfully. Error code: {}'.format(rc)
+
+    # create output
+    pc.create_ped_file(pfname, numeric=numeric_flag)
+    pc.create_map_file(mfname)
+    pc.create_excluded_samples_file(xsname)
+    outdict = {
+        "pfname": "{}.ped".format(basename),
+        "mfname": "{}.map".format(basename),
+        "xsname": "{}_xsamples.txt".format(basename),
+        "control_tag": pc.cfg.control_info['control_tag']['tag'],
+        "cases_tag": pc.cfg.control_info['cases_tag']['tag']
+    }
+
+    return outdict
+
+"""
+Function to produce the html report of all the data
+Inputs:
+- hfname: The ABSOLUTE PATH (including the filename) of the html file to output
+- dfname: path to datafile
+- lfname: path to logfile
+- sd_cutoff: multiplier used to determine outliers
+- iteration_list: a list of dicts, each containing AT LEAST:
+
+  - outdir: ABSOLUTE PATH to parent directory containing all the output from the iteration
+  - count: which iteration this was
+  - plot_paths: A list of paths to output plots
+  - xsname: name of file containing ids of samples excluded from this iteration
+  - ofname: name of file containing ids of outliers detected in this iteration
+Returns:
+    The html output that was written to the html file
+"""
+def produce_report(hfname, dfname, lfname, cflag, cmethod, tmethod, sd_cutoff, iteration_list, galaxy_flag):
+    hfdir = os.path.dirname(hfname)
+    htmldoc = open(hfname, 'w')
+    iteration_data = []
+    for it in iteration_list:
+        outdir = it['outdir']
+        if galaxy_flag:
+            dirs = outdir.split("/")
+            relpath = "{}/{}".format(dirs[-2], dirs[-1])
+        else:
+            relpath = os.path.relpath(outdir, hfdir)
+        it_dict = {}
+
+        ofname = "{}/{}".format(outdir, it['ofname'])
+        xsname = "{}/{}".format(outdir, it['xsname'])
+        of = open(ofname, 'r')
+        xf = open(xsname, 'r')
+        ol = []
+        xl = []
+        for line in of:
+            ol.append(line.split(','))
+        for line in xf:
+            xl.append(line.split(','))
+        of.close()
+        xf.close()
+        # fix the paths to the images
+        rel_plot_paths = []
+        for pp in it['plot_paths']:
+            rel_plot_paths.append("{}/{}".format(relpath, pp))
+
+        it_dict['count'] = it['count']
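+        # ol/xl: the outlier and excluded-sample rows (comma-split) for this iteration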
+        it_dict['ol'] = ol
+        it_dict['xl'] = xl
+        it_dict['rel_plot_paths'] = rel_plot_paths
+        iteration_data.append(it_dict)
+
+    logcontent = open(lfname, 'r').read()
+    template_values = {
+        "date": datetime.datetime.now(),
+        "dfname": dfname,
+        "lfname": lfname,
+        "logcontent": logcontent,
+        "iteration_data": iteration_data,
+        "cflag": cflag,
+        "cmethod": cmethod,
+        "tmethod": tmethod,
+        "sd_cutoff": sd_cutoff,
+    }
+    template = JINJA_ENVIRONMENT.get_template(HTML_TEMPLATE_NAME)
+    rendered_template = template.render(template_values)
+    htmldoc.write(rendered_template)
+    htmldoc.close()
+    return rendered_template
+
+"""
+Generate the expected directory structure of the root output directory,
+given the basename, number of iterations, and number of expected plots output by
+the R script
+"""
+def create_output_dict(outdir, basename, n, num_plots):
+    iteration_list = []
+    for i in range(n):
+        it_dict = {}
+        i_outdir = "{}/output_{}_iteration_{}".format(outdir, basename, i)
+        it_dict['outdir'] = i_outdir
+        it_dict['ofname'] = "{}_outliers.txt".format(basename)
+        it_dict['xsname'] = "{}_xfile.txt".format(basename)
+        it_dict['count'] = i
+        it_dict['plot_paths'] = []
+        for j in range(num_plots):
+            it_dict['plot_paths'].append("{}_plot_number_{}.png".format(basename, j+1))
+        iteration_list.append(it_dict)
+    return iteration_list
+
+
+###########################UTILITY########################################
+def prepare_directory(outdir, logfile):
+    # set up output directory
+    logfile.write("##Setting up output directory:{}##\n".format(outdir))
+    subprocess.call("rm -rf '{}'".format(outdir), shell=True)
+    logfile.write("rm -rf '{}'\n".format(outdir))
+    subprocess.call("mkdir -p '{}'".format(outdir), shell=True)
+    logfile.write("mkdir -p '{}'\n".format(outdir))
+
+# takes a list of dicts, merges them into a single big dict
+def merge_dicts(dictList):
+    result = {}
+    for d in dictList:
+        if type(d) == dict:
+            result.update(d)
+    return result
+
+# Some odd rules regarding character escaping and such in galaxy: fix those here
+def format_galaxy_config_file(cfname):
+    CHAR_CONVERSION = {
+        'g': '>',
+        'l': '<',
+        'e': '==',
+        'ge': '>=',
+        'le': '<='
+    }
+    cfile = open(cfname, 'r')
+    lines = cfile.readlines()
+    cfile.close()
+
+    newlines = []
+    section = None
+    for l in lines:
+        nl = l
+        if l[0] == '#':
+            if l.strip() == "#numeric_filters":
+                section = "numeric_filters"
+            else:
+                section = None
+        else:
+            if section:
+                tokens = l.split(',')
+                op = tokens[2]
+                if op in CHAR_CONVERSION.keys():
+                    newop = CHAR_CONVERSION[op]
+                elif op in CHAR_CONVERSION.values():
+                    newop = op
+                else:
+                    # error code 1; bad op codes
+                    return 1
+                tokens[2] = newop
+                nl = ','.join(tokens)
+
+        nl = nl.replace("__pd__", '#')
+        newlines.append(nl)
+
+    cfile = open(cfname, 'w')
+    cfile.writelines(newlines)
+    cfile.close()
+    return 0
+##############################################################################
+
+if __name__ == "__main__":
+    main()
diff -r 000000000000 -r 64e75e21466e iterative_pca_plot.R
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/iterative_pca_plot.R Wed Jun 01 03:38:39 2016 -0400
@@ -0,0 +1,97 @@
+library(flashpcaR)
+library(dbscan)
+library(cluster)
+
+### MAIN ###
+# get command line arguments
+CLI_FLAG = 1
+if (CLI_FLAG == 1) {
+    n = commandArgs(trailingOnly=TRUE)[1]
+    outdir = commandArgs(trailingOnly=TRUE)[2]
+    basename = commandArgs(trailingOnly=TRUE)[3]
+    data_source = commandArgs(trailingOnly=TRUE)[4]
+    data_type = commandArgs(trailingOnly=TRUE)[5]
+    eth_filename = commandArgs(trailingOnly=TRUE)[6]
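+    # 'None' sentinels passed by the python wrapper are converted to NULL below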
control_tag = commandArgs(trailingOnly=TRUE)[7] + if (control_tag == "None") {control_tag = NULL} + cases_tag = commandArgs(trailingOnly=TRUE)[8] + if (cases_tag == "None") {cases_tag = NULL} + numsds = as.numeric(commandArgs(trailingOnly=TRUE)[9]) + cmethod = commandArgs(trailingOnly=TRUE)[10] + tmethod = commandArgs(trailingOnly=TRUE)[11] + path_to_r_functions = commandArgs(trailingOnly=TRUE)[12] + xsamples_filename = commandArgs(trailingOnly=TRUE)[13] + xsnps_filename = commandArgs(trailingOnly=TRUE)[14] +} else { + n = 10 + basename = "test_eth2" + data_source = "./data/halo1_numeric.ped" + data_type = "numeric_ped" + outdir = paste0(getwd(), "/full_output_", basename) + #data_source = "./data/HapMap3_flashPCA_data.rds" + #data_type = "rds" + #eth_filename = "./data/HapMap3_ethnicity_rf.txt" + eth_filename = "./data/Halo_ethnicity_rf.txt" + control_tag = "HAPS" + cases_tag = NULL + numsds = 1.1 + cmethod = "hclust" + tmethod = "mcd" + path_to_r_functions = paste0(getwd(), "/R_functions/") + xsamples_filename = "./xfiles/halo1_xsamples.txt" + xsnps_filename = "./xfiles/halo1_xsnps.txt" +} + +# get source code +source(paste0(path_to_r_functions, "/", "plotting_functions.R")) +source(paste0(path_to_r_functions, "/", "pca_helpers.R")) +source(paste0(path_to_r_functions, "/", "pipeline_code.R")) +source(paste0(path_to_r_functions, "/", "clustering_functions.R")) +source(paste0(path_to_r_functions, "/", "outlier_trimming.R")) + +if (CLI_FLAG != 1) { + unlink(paste0(getwd(), "/", "full_output_", basename), recursive=TRUE) +} + +# read in data +ped_data = get_source_data(data_source, data_type) +eth_data = parse_ethnicity_file(eth_filename) +xsamples = get_first_column(xsamples_filename) +xsnps = get_first_column(xsnps_filename) + +# do the pca and prepare plots +iterations = list() +for(i in 1:n) { + fpd = filter_ped_data(ped_data, xsamples, xsnps) + iterations[[i]] = single_iteration(outdir, basename, fpd, xsamples, numsds, + cmethod, tmethod, control_tag, cases_tag, ethnicity_data=eth_data) + iterations[[i]]$dirname = generate_directory_name(outdir, basename, i) + xsamples = iterations[[i]]$xsamples +} + +# create folders and plots +for (i in 1:n) { + titer = iterations[[i]] + dir.create(titer$dirname, recursive=TRUE) + num_plots = titer$num_plots + + for (j in 1:num_plots) { + plot_filename = sprintf("%s/%s_plot_number_%d.png", titer$dirname, basename, j) + plot_by_groups(titer$pca_data$values[, c(1, 2)], + titer$plots[[j]]$groups, + titer$plots[[j]]$tags, + titer$plots[[j]]$plot_colors, + titer$plots[[j]]$plot_symbols, + titer$plots[[j]]$plot_title, + plot_filename=plot_filename + ) + } + + # write outliers to file + xfilename = paste0(titer$dirname, "/", basename, "_xfile.txt") + outliers_filename = paste0(titer$dirname, "/", basename, "_outliers.txt") + xscon = add_ethnicity_data(titer$old_xsamples, eth_data) + olcon = add_ethnicity_data(titer$outliers, eth_data) + write.table(xscon, file=xfilename, row.names=FALSE, col.names=TRUE, sep=",") + write.table(olcon, file=outliers_filename, row.names=FALSE, col.names=TRUE, sep=",") +} \ No newline at end of file diff -r 000000000000 -r 64e75e21466e pca_pipeline_def.xml --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/pca_pipeline_def.xml Wed Jun 01 03:38:39 2016 -0400 @@ -0,0 +1,377 @@ + + Iterative PCA pipeline + + Jinja2 + + + + + + + #control +control_tag,#Sample,$control_tag +cases_tag,#Sample,$cases_tag +#column_names +genotype_column,$genotype_column +reference_column,$reference_column +alternate_column,$alternate_column 
+sample_id_column,$sample_id_column +chromosome_column,$chromosome_column +position_column,$position_column +variant_id_column,$variant_id_column +#numeric_filters +#for $i, $f in enumerate($numeric_filters) +$f.filter_name,$f.column_name,$f.operation,$f.cutoff +#end for +#string_filters +#for $i, $s in enumerate($string_filters) +$s.filter_name,$s.column_name,$s.exact_flag,$s.accept_flag +$(','.join($s.patterns.split('\n'))) +#end for + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + 3 sds are purple. +- **Ethnicity Plot:** Each ethnicity uses a specific colour and symbol. Fairly self-explanotory. Plot is only output if an ethnicity data file is provided as input. + +Beneath the plots there are also two expandable lists. Samples excluded shows which samples were not part of the PCA for this iteration. This is cumulative. Outliers shows the outliers detected in THIS iteration. Any available data from the ethnicity file (if provided) is also displayed for each excluded sample. + +Options/Secondary Inputs +------------------------ +- **Type of input data file:** Either a ped file or a text file as specified above +- **Number of iterations to complete:** A single iteration would involve performing PCA on the input data, then identifying and removing outliers. Two iterations would involve performing PCA again with the outliers identified from the first iteration excluded, three iterations would exclude the outliers from the first 2 stages, and so on and so forth. +- **Detecting outliers without clustering:** This is done by obtaining the standard deviations of the first two principle components. Any samples whose scores for either of these first two components falls more than 'n' number of standard deviations away from the component median are considered outliers. +- **Clustering:** The user may select from a range of algorithms which will try to identify clusters in the data, with each cluster hopefully corresponding to an ethnic group. +- **Clustering methods:** + + - *DBSCAN (Density based spatial clustering of applications with noise):* + + Forms clusters based on density of points, and does not require the number of clusters to be specified beforehand. Good for irregularly shaped, non-spherical clusters. Does NOT require all points to be part of clusters, and produces a set of 'outliers', i.e. points which do not belong to any clusters. + + - *Hierarchical Clustering:* + + Forms clusters based on distance between points. Tends to result in spherical clusters, but able to handle clusters of varying density. Forces all points to be part of a single cluster. The number of clusters is determined seperately, using the silhouette scores of all the points as a heuristic. + +- **Cluster trimming methods:** All these methods first involve finding the centres of each cluster. + + - *Standard Deviations:* + + If the centroid of a cluster lies more than ‘n’ standard deviations (n is passed in as a parameter by the user) from the centroid of the entire dataset in either the x or y directions, the entire cluster is cut. If DBSCAN is selected, the outliers it identifies are also cut. + + - *Mean Cluster Distance:* + + Obtain the average distance between clusters, done by computing the distance between all pairs of clusters and taking the mean. For each cluster, we also compute an average “isolation” value, which is the mean of the distances between that particular cluster and all other clusters. 
If a cluster’s isolation value is larger than the average cluster distance (multiplied by the strictness weighting), then that cluster is considered an outlier and cut from the next iteration. If DBSCAN is selected, the outliers it identifies are also cut. + + - *DBSCAN outliers only:* + + Only cut the points identified by the DBSCAN algorithm as not belonging to any cluster. No entire clusters are cut. Obviously this method is only applicable if DBSCAN is selected as the clustering method. THE TOOL WILL NOT RUN IF YOU SELECT THIS OPTION TOGETHER WITH 'Hierarchical Clustering' AS THE CLUSTERING METHOD. + +- **Strictness:** A multiplier used to determine how 'strict' the outlier cutting methods are. For example, if strictness = 1, and we are not doing clustering, all points which lie more than 1 sd from the median are cut. If strictness = 2, all points which lie more than 2 sd from the median are cut, etc. + +- **Control Tag:** A pattern present in the ids of all the control samples, e.g. "LP" + +- **Cases Tag:** A pattern present in the ids of all the cases samples, e.g. "HAPS" + +- **Configuration file:** A configuration file to accompany an input variant text file. The config file has a rather specific format, an example is given below:: + + #control + control_tag,#Sample,HAPS + cases_tag,#Sample,LP + #column_names + genotype_column,GT + reference_column,REF + alternate_column,ALT + sample_id_column,#Sample + chromosome_column,CHROM + position_column,POS + variant_id_column,ID + #numeric_filters + strand_bias_filter,Fraction_with_strand_bias,<,0.03 + position_bias_filter,Fraction_with_positional_bias,<,0.03 + count_filter,Num_samples_variant,>,1 + pass_filter,Fraction_samples_passed_filter,>,0.9 + #string_filters + variant_type_filter,Variant_Type,exact,accept + SNV + genotype_filter,GT,exact,accept + '0/1,'1/1 + + File consists of up to four sections, the starts of which are marked by lines beginning with an octothorpe. + + - *'#control' section:* Indicates substrings found in ids of controls and cases + - *'#column_names' section:* This is the only required section. First column indicates what column name (in the variant text file) the second column specifies. The same keys i.e. left most column values, as shown in the example must be used, e.g. sample_id_column, the RHS column names must match the names in the variant data file. If using a generated config file, only modify the RHS column, and DO NOT REMOVE ANY rows from this section. + - *'#numeric_filters' section:* Each filter takes up a single line, and is seperated into 4 sections by commas. + + - Column 1: Name of the filter, which is arbitrary + - Column 2: The name of the column in the variant file to filter on. If this column is not found, a warning is displayed + - Column 3: The criteria of the filter which must be passed in order for us to accept a particular row. E.g. less than, greater than + - Column 4: The cutoff value to be compared against. + + - *'#string_filters' section:* Each filter takes up two lines. + + - Line 1, Column 1: Arbitrary filter name + - Line 1, Column 2: Column name to filter on + - Line 1, Column 3: Do the patterns have to be exact matches, or just a substrings? E.g. if pattern = "HAPS" and string being compared = "HAPS-909090", if exact was true this would not be a successfull match, whereas if not_exact was true it would be a match. + - Line 1, Column 4: What to do with the row if a successful match is detected, e.g. 
accept or reject + - Line 2: A comma seperated list of patterns to match on + + +- **Ethnicity file:** An ethnicity file containing ethnicity data, and possible other data, on the samples. Note this data is not used to sort the input and has no effect on the PCA itself. It is used only to label the results of the output. + + Requirements: + + - tab delimited + - Must have at least two columns + - First column has sample ID's + - Second column has ethnicities + - First row must be a header + + First few lines of a correctly formatted ethnicity file given below:: + + IID population Halo1.or.2. BloodAge SalivaAge COB ethnicity + LP-10000001 AUSTRALIAN Halo2 - LP-BC 67 NA Australia australian + LP-10000003 AUSTRALIAN Halo1 45 NA Australia australian southern_european + LP-10000005 AUSTRALIAN Halo1 73 NA Australia australian southern_european + LP-10000008 EUROPE Halo1 54 NA South Eastern Europe south_east_european + LP-10000009 OTHER Halo1 65 NA Southern & East Africa jewish + +- **Exclude samples file:** A text file containing exact ids of samples to exclude from the PCA. + + Requirements: + + - single column + - sample ids seperated by newlines + - one sample id per line + + Example:: + + HAPS-90573 + HAPS-90578R + HAPS-110542 + HAPS-110605 + HAPS-110620 + HAPS-110638 + HAPS-110649 + HAPS-110668 + HAPS-110799 + HAPS-110813 + HAPS-110959 + HAPS-111186 + HAPS-111298 + HAPS-111404 + HAPS-111493 + HAPS-111512 + HAPS-111538 + +- **Exclude SNPS file:** A text file containing exact ids of SNPs to exclude from the PCA. + + Requirements: + + - single column + - snp ids seperated by newlines + - one snp id per line + + Example:: + + rs72896283 + rs7534447 + rs4662775 + rs10932813 + rs10932816 + rs12330369 + rs1802904 + rs10902762 + rs9996817 + rs6446393 + rs871133 + rs4301095 + rs941849 + rs6917467 + rs75834296 + rs142922667 + +- **Required Column Headers:** If a variant text file is the primary input, the following information MUST be provided, either through the config file, or by filling out the corresponding fields in the tool submission form. + + - Sample IDs: Name of the column containing the sample ids + - Chromosome: Name of the column indicating what chromosome the SNP is found on + - Position: Name of the column indicating at which position on the chromosome the SNP is found + - Genotype: The genotype of the sample for this site + - Reference: The 'normal'/'common' genotype for this site + - Alternate: The alternate genotype for this site + - Variant IDs: Name of the column indicating the ID of the SNP + +- **Numeric Filters:** See Configuration file section +- **String Filters:** See Configuration file section + +Other Output +------------- + +- Tool will output a root folder containing the HTML file and all the plots, placed in directories seperated by iteration. +- If the input data was a variant file, the output folder will also contain a numeric ped file, generated before the first iteration, as well as a config file. The config file is either the exact one passed in by the user, or one automatically generated from the form input, which can be used for future PCA runs. + + ]]> + + + + \ No newline at end of file diff -r 000000000000 -r 64e75e21466e pca_report.html --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/pca_report.html Wed Jun 01 03:38:39 2016 -0400 @@ -0,0 +1,78 @@ +{% macro display_iterations(ilist) %} + {% for it in ilist %} + {{ display_single_iteration(it) }} + {% endfor %} +{% endmacro %} + +{% macro display_single_iteration(idata) %} +

Iteration {{ idata.count }}

+ {% for pp in idata.rel_plot_paths %} + + {% endfor %} +
+ {{ idata.xl|length - 1 }} samples were excluded from the PCA +
+ + {% for x in idata.xl %} + + {% if loop.first %} + {% for cell in x %} + + {% endfor %} + {% else %} + {% for cell in x %} + + {% endfor %} + {% endif %} + + {% endfor %} +
{{ cell }}{{ cell }}
+
+
+
+ Detected {{ idata.ol|length - 1}} outliers +
+ + {% for x in idata.ol %} + + {% if loop.first %} + {% for cell in x %} + + {% endfor %} + {% else %} + {% for cell in x %} + + {% endfor %} + {% endif %} + + {% endfor %} +
{{ cell }}{{ cell }}
+
+
+ +{% endmacro %} + + + + + PCA Report + + +

PCA Report

+
Date: {{ date }}
+
Input data file: {{ dfname }}
+
Log File: {{ lfname }} +
{{ logcontent }}
+
+
Standard Deviation Multiplier: {{ sd_cutoff }}
+
Clustering Method: {{ cmethod }}
+
Cluster Trimming: {{ tmethod }}
+ +

Iteration data

+ {{ display_iterations(iteration_data) }} +
+ + + + + diff -r 000000000000 -r 64e75e21466e pedify.py --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/pedify.py Wed Jun 01 03:38:39 2016 -0400 @@ -0,0 +1,395 @@ +import sys +import csv +import argparse + +DEBUG = 0 + +REQ_KEYS = ['genotype_column', 'reference_column', 'alternate_column', + 'sample_id_column', 'chromosome_column', 'position_column', + 'variant_id_column'] + +GENOTYPE_DICT = { + "'0/0": "hom_ref", + "'0/1": "het", + "'1/1": "hom_alt", + "'1/2": "tri_allelic" +} + +GENOTYPE_TO_NUMERIC = { + "'0/0": 0, + "'0/1": 1, + "'1/1": 2, + "'1/2": 2 +} + +class PedConverter: + def __init__(self): + self.cfg = None + self.samples = {} + self.sites = {} + self.xsamples = [] + + def verify_column_names(self, datafile_header): + # check all the config column names actually exist in the data file + for col in self.cfg.col_names.values(): + if col not in datafile_header: + print "The '{}' column was not found in the datafile! Double check your config file is correct. Exiting...".format( + col) + sys.exit(1) + + def verify_filters(self, datafile_header): + # print warning messages if filters invalid + all_filters = self.cfg.nfilters.copy() + all_filters.update(self.cfg.sfilters) + for key in all_filters: + col_name = all_filters[key]["col_name"] + if col_name not in datafile_header: + print "Warning! The '{}' filter was not applied as the datafile does not contain the column '{}'".format( + key, col_name) + + def read_config_file(self, cfname): + self.cfg = ConfigSettings() + rc = self.cfg.parse_config_file(cfname) + return rc + + def read_data_file(self, dfname): + if (self.cfg == None) or (not self.cfg.is_valid()): + return 1 + + datafile = open(dfname, 'r') + dreader = csv.DictReader(datafile, delimiter='\t') + # verify datafile data matches config file + self.verify_column_names(dreader.fieldnames) + self.verify_filters(dreader.fieldnames) + all_sample_ids = set() + i = 0 + + for row in dreader: + failed_filters = self.filter_all(row) + sample_key = row[self.cfg.col_names['sample_id_column']] + all_sample_ids.add(sample_key) + if not failed_filters: + # add to sample dict + # key is a tuple made up of which chromosome the snp is found on + # and the position on the chromosome itself + SNP_key = (row[self.cfg.col_names['chromosome_column']], int(row[self.cfg.col_names['position_column']])) + genotype = row[self.cfg.col_names['genotype_column']] + + # create a dictionary for each sample (person); each person is associated + # with another dictionary of all the SNPs found in that person + if sample_key not in self.samples: + self.samples[sample_key] = {SNP_key: genotype} + else: + self.samples[sample_key][SNP_key] = genotype + + # create a dict of all the sites where SNPs exist + if SNP_key not in self.sites: + # generate arbitrary ID's if there is no pre-existing ID for the SNP + if row[self.cfg.col_names['variant_id_column']] == '.': + SNP_id = "SNP_" + str(i) + else: + SNP_id = row[self.cfg.col_names['variant_id_column']] + + SNP_data = {'ref_col': row[self.cfg.col_names['reference_column']], + 'alt_col': row[self.cfg.col_names['alternate_column']], + 'SNP_id': SNP_id} + self.sites[SNP_key] = SNP_data + i += 1 + + # make sure every sample contains a genotype for every SNP found + for sample_key in self.samples: + this_sample = self.samples[sample_key] + for SNP_key in self.sites: + if SNP_key not in this_sample: + this_sample[SNP_key] = "'0/0" + datafile.close() + + # get list of samples which were filtered out + self.xsamples = 
list(all_sample_ids.difference(set(self.samples.keys()))) + return 0 + + # returns key of the specific filter/s that failed + def filter_numeric(self, row): + failed_filters = set() + for key in self.cfg.nfilters.keys(): + nfilter = self.cfg.nfilters[key] + cutoff = float(nfilter["cutoff"]) + op = nfilter["op"] + col_name = nfilter["col_name"] + if col_name in row.keys(): + cv = float(row[col_name]) + if not string_as_operator(cv, cutoff, op): + failed_filters.add(key) + + return failed_filters + + # returns key of the specific filter/s that failed + def filter_string(self, row): + failed_filters = set() + for key in self.cfg.sfilters.keys(): + sfilter = self.cfg.sfilters[key] + col_name = sfilter["col_name"] + if col_name in row.keys(): + cs = row[col_name] + af = sfilter['accept_flag'] + ef = sfilter['exact_flag'] + patterns = sfilter['patterns'] + if ef: + if af: + passed = False + for p in patterns: + if p == cs: + passed = True + break + if passed == False: + failed_filters.add(key) + else: + for p in patterns: + if p == cs: + failed_filters.add(key) + break + else: + if af: + passed = False + for p in patterns: + if p in cs: + passed = True + break + if passed == False: + failed_filters.add(key) + else: + for p in patterns: + if p in cs: + failed_filters.add(key) + break + + return failed_filters + + def filter_all(self, row): + return self.filter_numeric(row).union(self.filter_string(row)) + + def create_ped_file(self, filename, numeric=False): + output = "" + + sorted_sample_keys = sorted(self.samples.keys()) + for sample_key in sorted_sample_keys: + this_sample = self.samples[sample_key] + sorted_site_keys = sorted(this_sample.keys()) + variants = [] + if numeric: + pef = sample_key + else: + pef = self.create_ped_entry_front(sample_key) + + for SNP_key in sorted_site_keys: + genotype = this_sample[SNP_key] + if numeric == True: + variants.append(str(GENOTYPE_TO_NUMERIC[genotype])) + else: + variants.append(genotype_to_bases(genotype, self.sites[SNP_key])) + + output += "{}\t{}\n".format(pef, '\t'.join(variants)) + + pedfile = open(filename, 'w') + pedfile.write(output) + pedfile.close() + + def create_ped_entry_front(self, sample_id): + if self.cfg.control_info["control_tag"]["tag"] in sample_id: + group = 2 + elif self.cfg.control_info["cases_tag"]["tag"] in sample_id: + group = 1 + else: + group = 1 + + entry = "{0}\t{1}\t{2}\t{3}\t{4}\t{5}".format( + sample_id, + sample_id, + sample_id + "_F", + sample_id + "_M", + 2, + group) + + return entry + + def create_map_file(self, filename): + output = "" + for SNP_key in sorted(self.sites.keys()): + chrom = SNP_key[0] + SNP_id = self.sites[SNP_key]['SNP_id'] + posn = SNP_key[1] + output += "{}\t{}\t{}\n".format(chrom, SNP_id, str(posn)) + + mapfile = open(filename, 'w') + mapfile.write(output) + mapfile.close() + + def create_excluded_samples_file(self, filename): + xsfile = open(filename, 'w') + xsfile.write('\n'.join(self.xsamples)) + xsfile.close() + +class ConfigSettings: + + SECTIONS = [ + "#control", + "#column_names", + "#numeric_filters", + "#string_filters" + ] + + def __init__(self): + self.control_info = {} + self.col_names = {} + self.nfilters = {} + self.sfilters = {} + + def parse_config_file(self, cfname): + cffile = open(cfname, 'r') + section = None + rc = 0 + + for line in cffile: + # clean trailing/leading whitespace/newlines + line = line.strip() + # set section flag + if line[0] == '#': + if line in ConfigSettings.SECTIONS: + section = line + else: + continue + else: + # fill up config dicts + if section == 
"#control": + (key, col_name, tag) = line.split(',') + self.control_info[key] = {'col_name': col_name, 'tag': tag} + elif section == "#column_names": + (key, col_name) = line.split(',') + self.col_names[key] = col_name + elif section == "#numeric_filters": + (key, col_name, op, cutoff) = line.split(',') + self.add_numeric_filter(key, col_name, op, float(cutoff)) + elif section == "#string_filters": + (key, col_name, exact_flag, accept_flag) = line.split(',') + patterns = next(cffile).strip().split(',') + self.add_string_filter(key, col_name, exact_flag, accept_flag, patterns) + else: + rc = 2 + break + + cffile.close() + if rc != 0: + return rc + if not self.is_valid(): + rc = 3 + return rc + + + def is_valid(self): + for k in REQ_KEYS: + if k not in self.col_names.keys(): + return False + return True + + def add_numeric_filter(self, key, col_name, op, cutoff): + self.nfilters[key] = { + 'col_name': col_name, + 'op': op, + 'cutoff': cutoff + } + + def add_string_filter(self, key, col_name, exact_flag, accept_flag, patterns): + if exact_flag == "exact": + ef = True + elif exact_flag == "not_exact": + ef = False + else: + return False + + if accept_flag == "accept": + af = True + elif accept_flag == "reject": + af = False + else: + return False + + self.sfilters[key] = { + 'col_name': col_name, + 'exact_flag': ef, + 'accept_flag': af, + 'patterns': patterns + } + + def __str__(self): + rv = "is Valid: {} || control info: {} || col names: {} || numeric filters: {} || string filters: {}".format( + self.is_valid(), self.control_info, self.col_names, self.nfilters, self.sfilters) + return rv + + +### Utility ### +def string_as_operator(arg1, arg2, op): + if op == "==": + return arg1 == arg2 + elif op == ">": + return arg1 > arg2 + elif op == "<": + return arg1 < arg2 + elif op == "<=": + return arg1 <= arg2 + elif op == ">=": + return arg1 >= arg2 + +def genotype_to_bases(genotype, SNPdata): + bases = "" + if genotype in GENOTYPE_DICT: + gtype = GENOTYPE_DICT[genotype] + if gtype == "hom_ref": + bases = "{} {}".format(SNPdata['ref_col'], SNPdata['ref_col']) + elif gtype == "hom_alt": + bases = "{} {}".format(SNPdata['alt_col'], SNPdata['alt_col']) + elif gtype == "het": + bases = "{} {}".format(SNPdata['ref_col'], SNPdata['alt_col']) + elif gtype == "tri_allelic": + aa_col = SNPdata['alt_col'] + if len(aa_col) > 1: + # arbitrarily choose the first one + alt_allele = aa_col[0] + else: + alt_allele = aa_col + bases = "{} {}".format(alt_allele, alt_allele) + else: + print genotype + print "Unrecognized genotype!" + sys.exit(1) + return bases + +### Main ### +def main(): + # argument parsing + parser = argparse.ArgumentParser() + parser.add_argument("dfname", help="name of input data file") + parser.add_argument("cfname", help="name of input configuration file") + parser.add_argument("pfname", help="name of output ped file") + parser.add_argument("mfname", help="name of output map file") + parser.add_argument("xsname", help="name of output file containing exact IDs of samples who were excluded") + args = parser.parse_args() + + pc = PedConverter() + # read in config file + rc = pc.read_config_file(args.cfname) + if rc == 0: + print 'config file read successfully' + else: + print 'failed to read in config file successfully. Error code: {}'.format(rc) + # read in data file + rc = pc.read_data_file(args.dfname) + if rc == 0: + print 'data file read successfully' + else: + print 'failed to read in data file successfully. 
Error code: {}'.format(rc) + pc.create_ped_file(args.pfname, numeric=True) + pc.create_map_file(args.mfname) + pc.create_excluded_samples_file(args.xsname) + +if __name__ == "__main__": + main() diff -r 000000000000 -r 64e75e21466e tool_dependencies.xml --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/tool_dependencies.xml Wed Jun 01 03:38:39 2016 -0400 @@ -0,0 +1,22 @@ + + + + + + https://pypi.python.org/packages/source/J/Jinja2/Jinja2-2.8.tar.gz#md5=edb51693fe22c53cee5403775c71a99e + $INSTALL_DIR/lib/python + + export PYTHONPATH=$PYTHONPATH:$INSTALL_DIR/lib/python && + python setup.py install --install-lib $INSTALL_DIR/lib/python --install-scripts $INSTALL_DIR/bin + + + $INSTALL_DIR/lib/python + $INSTALL_DIR/bin + + + + + Jinja2 python library for templating + + +