view encode.R @ 67:99e8e055ddd6 draft

Deleted selected files
author nicolas
date Wed, 26 Oct 2016 19:15:52 -0400
parents 89175737f16b
children
line wrap: on
line source

########################################################
#
# creation date : 04/01/16
# last modification : 17/09/16
# author : Dr Nicolas Beaume
# owner : IRRI
#
########################################################

############################ helper functions #######################

# encode one position in one individual
encodeGenotype.position <- function(x, major, code=c(0,1,2), sep=""){
  res <- x
  if(!is.na(x)) {
    if(isHeterozygous(x, sep = sep)) {
      # heterozygous
      res <- code[2]
    } else {
      # determine whether it is the minor or major allele
      x <- unlist(strsplit(x, sep))
      # need to check only one element as we already know it is a homozygous
      if(length(x) > 1) {
        x <- x[1]
      }
      if(x==major) {
        res <- code[3]
      } else {
        res <- code[1]
      }
    }
  } else {
    # keep NA as NA
    res <- NA
  }
  return(res)
}

# rewrite a marker to make an exact count of the allele for the current marker
encodeGenotype.rewrite <- function(x, sep=""){
  res <- x
  if(!is.na(x)) {
    if(length(unlist(strsplit(x,sep)))==1) {
      # in case of homozygous, must be counted 2 times so the caracter is written 2 times
      res <- c(x,x)
    } else {
      # heterozygous
      res <- unlist(strsplit(x, split=sep))
    }
  } else {
    res <- NA
  }
  return(res)
}

# encode one individual
encodeGenotype.vec <- function(indiv, sep="", code=c(0,1,2)){
  # rewrite genotype to make sure everything is written as a double caracter value
  newIndiv <- unlist(lapply(as.character(indiv), encodeGenotype.rewrite, sep))
  # compute the occurcence of each genotype to determine major an minor allele
  stat <- table(as.character(newIndiv))
  major <- names(stat)[which.max(stat)]
  # Encode using the appropriate code
  indiv <- unlist(lapply(indiv, encodeGenotype.position, major, code, sep))
  return(indiv)
}

# determine if the genotype is an homozygous or heterozygous one
# genotype must be written with two characters, even homozygous 
# (see encodeGenotype.rewrite() function )
isHeterozygous <- function(genotype, sep=""){
  bool <- F
  # case of NA, can't be determined
  if(is.na(genotype)){
    bool <- NA
  } else {
    # check whether both element of the genotype are the same or not
    x <- unlist(strsplit(genotype, sep))
    if(length(x) > 1 & !(x[1] %in% x[2])) {
      bool <- T
    }
    
  }
  return(bool)
}

# check if encoding has been made properly. return a boolean vector
# which has the same length that the number of columns of the input matrix
# at marker i, check[i] is true if code[3] is larger than code[1], false otherwise
# please note that this function is not used in the current tool and let
# by convinience for being used outside of galaxy
checkEncoding <- function(encoded, code=c(0,1,2)) {
  check <- NULL
  for(i in 1:ncol(encoded)) {
    # find major an minor allele
    major <- length(which(encoded[,i]==code[3]))
    minor <- length(which(encoded[,i]==code[1]))
    # comaprison
    if(major >= minor) {
      check <- c(check, T)
    } else {
      check <- c(check, F)
    }
  }
  return(check)
}

################################## main function ###########################
# encode all individuals
encodeGenotype <- function(raw, sep="", code=c(0,1,2), outPath){
  # encode genotype
  encoded <- apply(raw, 2, encodeGenotype.vec, sep, code)
  # set all NA to -1 (thus encoding schems with -1 are not allowed)
  encoded[is.na(encoded)] <- -1
  write.table(encoded, file=paste(outPath,".csv", sep=""), row.names = F, sep="\t")
}

############################ main #############################
# load argument from xml file
cmd <- commandArgs(T)
source(cmd[1])
# load genotype
genotype <- read.table(genotype, sep="\t", stringsAsFactors = F, h=T)
code <- c(0,1,2)
encodeGenotype(raw=genotype, sep=sep, code = code, outPath = out)
cat(paste(out,".csv", "\n", sep=""))