comparison baseline/filter.r @ 63:8728284105ee draft

Uploaded
author davidvanzessen
date Wed, 06 Dec 2017 08:04:52 -0500
parents c33d93683a09
children c6dd3215ebe0
comparison
Legend: equal | deleted | inserted | replaced
62:aa8d37bd1930 63:8728284105ee
6 print(paste("selection = ", selection)) 6 print(paste("selection = ", selection))
7 7
8 8
9 summarydat = read.table(summaryfile, header=T, sep="\t", fill=T, stringsAsFactors=F) 9 summarydat = read.table(summaryfile, header=T, sep="\t", fill=T, stringsAsFactors=F)
10 gappeddat = read.table(gappedfile, header=T, sep="\t", fill=T, stringsAsFactors=F) 10 gappeddat = read.table(gappedfile, header=T, sep="\t", fill=T, stringsAsFactors=F)
11
12 fix_column_names = function(df){
13 if("V.DOMAIN.Functionality" %in% names(df)){
14 names(df)[names(df) == "V.DOMAIN.Functionality"] = "Functionality"
15 print("found V.DOMAIN.Functionality, changed")
16 }
17 if("V.DOMAIN.Functionality.comment" %in% names(df)){
18 names(df)[names(df) == "V.DOMAIN.Functionality.comment"] = "Functionality.comment"
19 print("found V.DOMAIN.Functionality.comment, changed")
20 }
21 return(df)
22 }
23
24 gappeddat = fix_column_names(gappeddat)
11 25
12 #dat = data.frame(merge(gappeddat, summarydat, by="Sequence.ID", all.x=T)) 26 #dat = data.frame(merge(gappeddat, summarydat, by="Sequence.ID", all.x=T))
13 27
14 dat = cbind(gappeddat, summarydat$AA.JUNCTION) 28 dat = cbind(gappeddat, summarydat$AA.JUNCTION)
15 29
22 dat$DGene = gsub("[*].*", "", dat$DGene) 36 dat$DGene = gsub("[*].*", "", dat$DGene)
23 37
24 dat$JGene = gsub("^Homsap ", "", dat$J.GENE.and.allele) 38 dat$JGene = gsub("^Homsap ", "", dat$J.GENE.and.allele)
25 dat$JGene = gsub("[*].*", "", dat$JGene) 39 dat$JGene = gsub("[*].*", "", dat$JGene)
26 40
27 #print(str(dat)) 41 print(str(dat))
28 42
29 dat$past = do.call(paste, c(dat[unlist(strsplit(selection, ","))], sep = ":")) 43 dat$past = do.call(paste, c(dat[unlist(strsplit(selection, ","))], sep = ":"))
30 44
31 dat = dat[!duplicated(dat$past), ] 45 dat = dat[!duplicated(dat$past), ]
32 46
47 print(paste("Sequences remaining after duplicate filter:", nrow(dat)))
48
33 dat = dat[dat$Functionality != "No results" & dat$Functionality != "unproductive",] 49 dat = dat[dat$Functionality != "No results" & dat$Functionality != "unproductive",]
34 50
51 print(paste("Sequences remaining after functionality filter:", nrow(dat)))
52
53 print(paste("Sequences remaining:", nrow(dat)))
54
35 write.table(x=dat, file=output, sep="\t",quote=F,row.names=F,col.names=T) 55 write.table(x=dat, file=output, sep="\t",quote=F,row.names=F,col.names=T)