diff baseline/filter.r @ 63:8728284105ee draft

Uploaded
author davidvanzessen
date Wed, 06 Dec 2017 08:04:52 -0500
parents c33d93683a09
children c6dd3215ebe0
line wrap: on
line diff
--- a/baseline/filter.r	Tue Dec 05 10:57:13 2017 -0500
+++ b/baseline/filter.r	Wed Dec 06 08:04:52 2017 -0500
@@ -9,6 +9,20 @@
 summarydat = read.table(summaryfile, header=T, sep="\t", fill=T, stringsAsFactors=F)
 gappeddat = read.table(gappedfile, header=T, sep="\t", fill=T, stringsAsFactors=F)
 
+fix_column_names = function(df){  # Rename the V.DOMAIN.-prefixed Functionality columns of data frame df to their unprefixed names; returns df.
+    if("V.DOMAIN.Functionality" %in% names(df)){  # only rename when the prefixed column is actually present
+        names(df)[names(df) == "V.DOMAIN.Functionality"] = "Functionality"
+        print("found V.DOMAIN.Functionality, changed")  # report the rename on stdout
+    }
+    if("V.DOMAIN.Functionality.comment" %in% names(df)){  # same treatment for the companion comment column
+        names(df)[names(df) == "V.DOMAIN.Functionality.comment"] = "Functionality.comment"
+        print("found V.DOMAIN.Functionality.comment, changed")
+    }
+    return(df)  # df comes back untouched when neither prefixed column was found
+}
+
+gappeddat = fix_column_names(gappeddat)  # normalise the Functionality column names before gappeddat is combined into dat
+
 #dat = data.frame(merge(gappeddat, summarydat, by="Sequence.ID", all.x=T))
 
 dat = cbind(gappeddat, summarydat$AA.JUNCTION)
@@ -24,12 +38,18 @@
 dat$JGene = gsub("^Homsap ", "", dat$J.GENE.and.allele)
 dat$JGene = gsub("[*].*", "", dat$JGene)
 
-#print(str(dat))
+str(dat)  # debug dump of dat's structure; str() prints by itself and returns NULL, so wrapping it in print() only added a stray NULL line
 
 dat$past = do.call(paste, c(dat[unlist(strsplit(selection, ","))], sep = ":"))
 
 dat = dat[!duplicated(dat$past), ]
 
+print(paste("Sequences remaining after duplicate filter:", nrow(dat)))  # row count after rows with a repeated dat$past key were dropped
+
 dat = dat[dat$Functionality != "No results" & dat$Functionality != "unproductive",]
 
+print(paste("Sequences remaining after functionality filter:", nrow(dat)))  # row count after the Functionality filter above
+
+print(paste("Sequences remaining:", nrow(dat)))  # NOTE(review): dat is not modified between these two prints, so this repeats the same count
+
 write.table(x=dat, file=output, sep="\t",quote=F,row.names=F,col.names=T)