annotate baseline/filter.r @ 79:98e3fecedd2b draft

Uploaded
author davidvanzessen
date Tue, 01 Sep 2020 16:03:44 -0400
parents c6dd3215ebe0
children
Ignore whitespace changes - Everywhere: Within whitespace: At end of lines:
rev   line source
0
c33d93683a09 Uploaded
davidvanzessen
parents:
diff changeset
# Positional command-line arguments:
#   1. summaryfile - tab-separated table providing the AA.JUNCTION column
#   2. gappedfile  - tab-separated table that forms the bulk of the output
#   3. selection   - comma-separated column names used to build the
#                    duplicate key
#   4. output      - path of the tab-separated file to write
arg <- commandArgs(TRUE)

# Fail early with a readable message; otherwise the missing values become NA
# and only surface later as an obscure read.table/write.table error.
if (length(arg) < 4) {
  stop(
    "expected 4 arguments: <summaryfile> <gappedfile> <selection> <output>",
    call. = FALSE
  )
}

summaryfile <- arg[1]
gappedfile <- arg[2]
selection <- arg[3]
output <- arg[4]
print(paste("selection = ", selection))

# Both inputs are read as tab-separated text with a header row. quote = ""
# disables quote processing, fill = TRUE pads short rows with blank fields,
# and strings are kept as character vectors rather than factors.
summarydat <- read.table(
  summaryfile,
  header = TRUE, sep = "\t", fill = TRUE,
  stringsAsFactors = FALSE, quote = ""
)
gappeddat <- read.table(
  gappedfile,
  header = TRUE, sep = "\t", fill = TRUE,
  stringsAsFactors = FALSE, quote = ""
)
0
c33d93683a09 Uploaded
davidvanzessen
parents:
diff changeset
11
63
8728284105ee Uploaded
davidvanzessen
parents: 0
diff changeset
# Rename the "V.DOMAIN."-prefixed functionality columns to their short names.
#
# df: a data frame (anything supporting names() and names<-()).
# Returns df with "V.DOMAIN.Functionality" renamed to "Functionality" and
# "V.DOMAIN.Functionality.comment" renamed to "Functionality.comment" when
# those columns are present; a note is printed for every rename performed.
# Other columns are left untouched.
fix_column_names <- function(df) {
  rename_rules <- list(
    list(
      old = "V.DOMAIN.Functionality",
      new = "Functionality",
      note = "found V.DOMAIN.Functionality, changed"
    ),
    list(
      old = "V.DOMAIN.Functionality.comment",
      new = "Functionality.comment",
      note = "found V.DOMAIN.Functionality.comment, changed"
    )
  )

  # Rules are applied in order, each one seeing the names left by the
  # previous rule.
  for (rule in rename_rules) {
    if (rule$old %in% names(df)) {
      names(df)[names(df) == rule$old] <- rule$new
      print(rule$note)
    }
  }

  df
}
8728284105ee Uploaded
davidvanzessen
parents: 0
diff changeset
23
8728284105ee Uploaded
davidvanzessen
parents: 0
diff changeset
# Bring the gapped table's functionality column headers to their short names.
gappeddat <- fix_column_names(gappeddat)

# Alternative kept for reference: join the two tables on the sequence id
# instead of relying on row order.
# dat = data.frame(merge(gappeddat, summarydat, by="Sequence.ID", all.x=T))

# AA.JUNCTION is attached position-by-position, so the two tables must
# describe the same rows in the same order.
junction <- summarydat$AA.JUNCTION

# Without this check a missing column makes cbind() add nothing, and the last
# real column of gappeddat would then be mislabelled as AA.JUNCTION.
if (is.null(junction)) {
  stop("summary file has no 'AA.JUNCTION' column", call. = FALSE)
}

# Without this check a shorter column whose length divides the row count is
# silently recycled, pairing junctions with the wrong sequences.
if (length(junction) != nrow(gappeddat)) {
  stop(
    "row count mismatch: summary file has ", length(junction),
    " rows but gapped file has ", nrow(gappeddat),
    call. = FALSE
  )
}

# Naming the argument sets the new column's name directly, so no positional
# rename of the last column is needed afterwards.
dat <- cbind(gappeddat, AA.JUNCTION = junction)
c33d93683a09 Uploaded
davidvanzessen
parents:
diff changeset
31
c33d93683a09 Uploaded
davidvanzessen
parents:
diff changeset
# Derive short V/D/J gene names from the "*.GENE.and.allele" columns:
# the inner gsub() drops a leading "Homsap " prefix, the outer gsub() drops
# everything from the first "*" to the end of the string.
dat$VGene <- gsub("[*].*", "", gsub("^Homsap ", "", dat$V.GENE.and.allele))

dat$DGene <- gsub("[*].*", "", gsub("^Homsap ", "", dat$D.GENE.and.allele))

dat$JGene <- gsub("[*].*", "", gsub("^Homsap ", "", dat$J.GENE.and.allele))
c33d93683a09 Uploaded
davidvanzessen
parents:
diff changeset
40
63
8728284105ee Uploaded
davidvanzessen
parents: 0
diff changeset
# Dump the structure of the combined table to stdout for debugging. str()
# prints by itself and returns NULL, so wrapping it in print() only added a
# stray "NULL" line.
str(dat)

# Build a per-row key by joining the user-selected columns with ":".
key_columns <- unlist(strsplit(selection, ","))
if (length(key_columns) == 0) {
  stop("selection must name at least one column", call. = FALSE)
}
missing_columns <- setdiff(key_columns, names(dat))
if (length(missing_columns) > 0) {
  stop(
    "selection refers to columns not present in the data: ",
    paste(missing_columns, collapse = ", "),
    call. = FALSE
  )
}
dat$past <- do.call(paste, c(dat[key_columns], sep = ":"))

# Keep only the first row for every key.
# NOTE(review): this runs before the functionality filter, so a key whose
# first row is later rejected loses its remaining duplicates too - confirm
# this ordering is intended.
dat <- dat[!duplicated(dat$past), ]

print(paste("Sequences remaining after duplicate filter:", nrow(dat)))

# Drop rows flagged "No results" or "unproductive". %in% never yields NA, so
# a row with a missing Functionality value is kept as-is; a `!=` comparison
# would give an NA index and inject an all-NA row into the result.
excluded_functionality <- c("No results", "unproductive")
dat <- dat[!(dat$Functionality %in% excluded_functionality), ]

print(paste("Sequences remaining after functionality filter:", nrow(dat)))

print(paste("Sequences remaining:", nrow(dat)))

# Write the filtered table (including the helper "past" key column) as
# unquoted tab-separated text with a header and no row names.
write.table(
  x = dat, file = output, sep = "\t",
  quote = FALSE, row.names = FALSE, col.names = TRUE
)