annotate baseline/filter.r @ 82:a103134ee6e0 draft

Uploaded
author davidvanzessen
date Thu, 25 Feb 2021 10:32:32 +0000
parents b6f9a640e098
children
Ignore whitespace changes - Everywhere: Within whitespace: At end of lines:
rev   line source
81
b6f9a640e098 Uploaded
davidvanzessen
parents:
diff changeset
# Command-line interface (positional):
#   1. summaryfile - tab-separated table that supplies the AA.JUNCTION column
#   2. gappedfile  - tab-separated table that supplies every other column
#   3. selection   - comma-separated column names used as the duplicate key
#   4. output      - path of the filtered tab-separated table to write
arg <- commandArgs(TRUE)

# Fail early with a usage message; otherwise a missing argument only surfaces
# later as an obscure error (a missing output path not until the final write).
if (length(arg) < 4) {
  stop(
    "expected 4 arguments: <summaryfile> <gappedfile> <selection> <output>, got ",
    length(arg),
    call. = FALSE
  )
}

summaryfile <- arg[1]
gappedfile <- arg[2]
selection <- arg[3]
output <- arg[4]
print(paste("selection = ", selection))

# Both inputs: header row, tab-separated, short rows padded (fill), strings
# kept as character, and quote characters treated as ordinary text.
# NOTE(review): comment.char keeps its default "#", so anything after a "#"
# in a line is dropped - confirm the inputs never contain "#".
summarydat <- read.table(
  summaryfile,
  header = TRUE, sep = "\t", fill = TRUE, stringsAsFactors = FALSE, quote = ""
)
gappeddat <- read.table(
  gappedfile,
  header = TRUE, sep = "\t", fill = TRUE, stringsAsFactors = FALSE, quote = ""
)
b6f9a640e098 Uploaded
davidvanzessen
parents:
diff changeset
11
b6f9a640e098 Uploaded
davidvanzessen
parents:
diff changeset
# Rename the "V.DOMAIN."-prefixed functionality columns to their short names.
#
# df: a data frame. Columns named "V.DOMAIN.Functionality" and
#     "V.DOMAIN.Functionality.comment" are renamed to "Functionality" and
#     "Functionality.comment"; all other columns are left untouched.
#
# Prints one status line per renamed column and returns the data frame.
fix_column_names <- function(df) {
  # Work on a copy of the names and write them back once at the end.
  column_labels <- names(df)

  is_functionality <- column_labels %in% "V.DOMAIN.Functionality"
  if (any(is_functionality)) {
    column_labels[is_functionality] <- "Functionality"
    print("found V.DOMAIN.Functionality, changed")
  }

  is_comment <- column_labels %in% "V.DOMAIN.Functionality.comment"
  if (any(is_comment)) {
    column_labels[is_comment] <- "Functionality.comment"
    print("found V.DOMAIN.Functionality.comment, changed")
  }

  names(df) <- column_labels
  df
}
b6f9a640e098 Uploaded
davidvanzessen
parents:
diff changeset
23
b6f9a640e098 Uploaded
davidvanzessen
parents:
diff changeset
# Normalise the functionality column names of the gapped table so the
# "Functionality" filter further down can find its column.
gappeddat <- fix_column_names(gappeddat)

#dat = data.frame(merge(gappeddat, summarydat, by="Sequence.ID", all.x=T))

# AA.JUNCTION is attached by row position, not by Sequence.ID, so both tables
# must have the column / the same number of rows. Without these checks a
# missing column made the rename hit an unrelated last column, and a row-count
# mismatch could be silently recycled by cbind().
# NOTE(review): this still assumes both files list sequences in the same
# order - confirm against the producer of the two files.
if (!("AA.JUNCTION" %in% names(summarydat))) {
  stop("summary file has no AA.JUNCTION column", call. = FALSE)
}
if (nrow(summarydat) != nrow(gappeddat)) {
  stop(
    "summary file has ", nrow(summarydat), " rows but gapped file has ",
    nrow(gappeddat), "; cannot combine them by position",
    call. = FALSE
  )
}
dat <- cbind(gappeddat, AA.JUNCTION = summarydat$AA.JUNCTION)

# Derive VGene / DGene / JGene from the "<X>.GENE.and.allele" columns: drop a
# leading "Homsap " and everything from the first "*" onwards.
for (segment in c("V", "D", "J")) {
  gene_names <- gsub("^Homsap ", "", dat[[paste0(segment, ".GENE.and.allele")]])
  dat[[paste0(segment, "Gene")]] <- gsub("[*].*", "", gene_names)
}

# str() prints the structure itself; wrapping it in print() only added a
# stray "NULL" line to the log.
str(dat)

# Build the duplicate key by pasting the selected columns together with ":"
# and keep the first row seen for every key.
selection_columns <- unlist(strsplit(selection, ","))
unknown_columns <- setdiff(selection_columns, names(dat))
if (length(selection_columns) == 0 || length(unknown_columns) > 0) {
  stop(
    "selection must be a comma-separated list of existing columns; not found: ",
    paste(unknown_columns, collapse = ", "),
    call. = FALSE
  )
}
dat$past <- do.call(paste, c(dat[selection_columns], sep = ":"))

dat <- dat[!duplicated(dat$past), ]

print(paste("Sequences remaining after duplicate filter:", nrow(dat)))

# Drop rows whose Functionality is exactly "No results" or "unproductive".
# %in% never yields NA, so a row with a missing Functionality is kept as-is
# instead of being turned into an all-NA row by the `!=` comparison.
# NOTE(review): only exact matches are removed; a value such as
# "unproductive (see comment)" would pass - confirm that is intended.
if (!("Functionality" %in% names(dat))) {
  stop("no Functionality column found in the gapped file", call. = FALSE)
}
dat <- dat[!(dat$Functionality %in% c("No results", "unproductive")), ]

print(paste("Sequences remaining after functionality filter:", nrow(dat)))

print(paste("Sequences remaining:", nrow(dat)))

# Tab-separated output with a header row, no row names and no quoting.
write.table(
  x = dat, file = output, sep = "\t",
  quote = FALSE, row.names = FALSE, col.names = TRUE
)