Mercurial > repos > davidvanzessen > shm_csr
diff baseline/filter.r @ 63:8728284105ee draft
Uploaded
author | davidvanzessen |
---|---|
date | Wed, 06 Dec 2017 08:04:52 -0500 |
parents | c33d93683a09 |
children | c6dd3215ebe0 |
line wrap: on
line diff
--- a/baseline/filter.r Tue Dec 05 10:57:13 2017 -0500
+++ b/baseline/filter.r Wed Dec 06 08:04:52 2017 -0500
@@ -9,6 +9,20 @@
 summarydat = read.table(summaryfile, header=T, sep="\t", fill=T, stringsAsFactors=F)
 gappeddat = read.table(gappedfile, header=T, sep="\t", fill=T, stringsAsFactors=F)
 
+fix_column_names = function(df){
+    if("V.DOMAIN.Functionality" %in% names(df)){
+        names(df)[names(df) == "V.DOMAIN.Functionality"] = "Functionality"
+        print("found V.DOMAIN.Functionality, changed")
+    }
+    if("V.DOMAIN.Functionality.comment" %in% names(df)){
+        names(df)[names(df) == "V.DOMAIN.Functionality.comment"] = "Functionality.comment"
+        print("found V.DOMAIN.Functionality.comment, changed")
+    }
+    return(df)
+}
+
+gappeddat = fix_column_names(gappeddat)
+
 #dat = data.frame(merge(gappeddat, summarydat, by="Sequence.ID", all.x=T))
 
 dat = cbind(gappeddat, summarydat$AA.JUNCTION)
@@ -24,12 +38,18 @@
 dat$JGene = gsub("^Homsap ", "", dat$J.GENE.and.allele)
 dat$JGene = gsub("[*].*", "", dat$JGene)
 
-#print(str(dat))
+print(str(dat))
 
 dat$past = do.call(paste, c(dat[unlist(strsplit(selection, ","))], sep = ":"))
 
 dat = dat[!duplicated(dat$past), ]
 
+print(paste("Sequences remaining after duplicate filter:", nrow(dat)))
+
 dat = dat[dat$Functionality != "No results" & dat$Functionality != "unproductive",]
 
+print(paste("Sequences remaining after functionality filter:", nrow(dat)))
+
+print(paste("Sequences remaining:", nrow(dat)))
+
 write.table(x=dat, file=output, sep="\t",quote=F,row.names=F,col.names=T)