0
|
# Positional command-line arguments (trailing arguments only):
#   1. summaryfile - path of the tab-separated table that supplies AA.JUNCTION
#   2. gappedfile  - path of the tab-separated table that forms the base rows
#   3. selection   - comma-separated column names used to detect duplicates
#   4. output      - path the filtered table is written to
arg <- commandArgs(trailingOnly = TRUE)

summaryfile <- arg[1]
gappedfile <- arg[2]
selection <- arg[3]
output <- arg[4]

# Echo the duplicate-detection columns into the log.
print(paste("selection = ", selection))
|
|
7
|
|
8
|
64
|
# Load both inputs as tab-separated tables with a header row. Quote handling is
# switched off (quote = "") so quote characters inside fields are read
# literally, short rows are padded (fill = TRUE) and text columns stay
# character vectors rather than factors.
summarydat <- read.table(
  file = summaryfile,
  header = TRUE,
  sep = "\t",
  quote = "",
  fill = TRUE,
  stringsAsFactors = FALSE
)
gappeddat <- read.table(
  file = gappedfile,
  header = TRUE,
  sep = "\t",
  quote = "",
  fill = TRUE,
  stringsAsFactors = FALSE
)
|
0
|
11
|
63
|
# Rename the "V.DOMAIN.Functionality" / "V.DOMAIN.Functionality.comment"
# columns of a data frame to "Functionality" / "Functionality.comment".
#
# df: a data frame (any object with names() works the same way).
#
# Returns df with the matching columns renamed; columns that are not present
# are left alone. A line is printed for every rename that is performed.
fix_column_names <- function(df) {
  col_names <- names(df)

  if ("V.DOMAIN.Functionality" %in% col_names) {
    col_names[col_names == "V.DOMAIN.Functionality"] <- "Functionality"
    print("found V.DOMAIN.Functionality, changed")
  }

  if ("V.DOMAIN.Functionality.comment" %in% col_names) {
    col_names[col_names == "V.DOMAIN.Functionality.comment"] <- "Functionality.comment"
    print("found V.DOMAIN.Functionality.comment, changed")
  }

  names(df) <- col_names
  df
}
|
|
23
|
|
# Give the gapped table the plain "Functionality" column names.
gappeddat <- fix_column_names(gappeddat)

# An earlier, disabled variant joined the two tables on Sequence.ID:
#dat = data.frame(merge(gappeddat, summarydat, by="Sequence.ID", all.x=T))

# AA.JUNCTION is attached by row position, so both tables must hold the same
# rows in the same order. Check that explicitly: without the column, cbind()
# would return gappeddat unchanged and an unrelated last column would end up
# labelled AA.JUNCTION; with a row count that divides evenly, the vector would
# be recycled silently.
if (!("AA.JUNCTION" %in% names(summarydat))) {
  stop("summary file has no AA.JUNCTION column", call. = FALSE)
}
if (nrow(summarydat) != nrow(gappeddat)) {
  stop(
    "summary file has ", nrow(summarydat), " rows but gapped file has ",
    nrow(gappeddat), " rows; cannot attach AA.JUNCTION by position",
    call. = FALSE
  )
}

# Append the junction as the last column, named directly in the call.
dat <- cbind(gappeddat, AA.JUNCTION = summarydat[["AA.JUNCTION"]])

# Reduce a "<X>.GENE.and.allele" value to the bare gene name: drop a leading
# "Homsap " and everything from the first "*" onward,
# e.g. "Homsap IGHV3-23*01 F" -> "IGHV3-23".
strip_gene_name <- function(x) {
  gsub("[*].*", "", gsub("^Homsap ", "", x))
}

dat$VGene <- strip_gene_name(dat$V.GENE.and.allele)
dat$DGene <- strip_gene_name(dat$D.GENE.and.allele)
dat$JGene <- strip_gene_name(dat$J.GENE.and.allele)
|
|
40
|
63
|
# Log the structure of the merged table. str() prints on its own and returns
# NULL, so wrapping it in print() only added a stray "NULL" line.
str(dat)

# Build the duplicate key: the columns named in the comma-separated `selection`
# argument, pasted together row-wise with ":".
selection_columns <- unlist(strsplit(selection, ","))
missing_columns <- setdiff(selection_columns, names(dat))
if (length(missing_columns) > 0) {
  stop(
    "selection contains unknown column(s): ",
    paste(missing_columns, collapse = ", "),
    call. = FALSE
  )
}
# Note: the key column `past` is never removed, so it is part of the output.
dat$past <- do.call(paste, c(dat[selection_columns], sep = ":"))

# Keep only the first row for every key.
dat <- dat[!duplicated(dat$past), ]

print(paste("Sequences remaining after duplicate filter:", nrow(dat)))

# Drop rows flagged "No results" or "unproductive". %in% never yields NA, so a
# row whose Functionality is missing is kept as it is; the previous `!=`
# comparison produced NA there, and an NA row index turns the whole row into
# NAs in the output.
excluded <- dat$Functionality %in% c("No results", "unproductive")
dat <- dat[!excluded, ]

print(paste("Sequences remaining after functionality filter:", nrow(dat)))

print(paste("Sequences remaining:", nrow(dat)))

# Write the result as a tab-separated table with a header row, without quoting
# and without row names.
write.table(
  x = dat,
  file = output,
  sep = "\t",
  quote = FALSE,
  row.names = FALSE,
  col.names = TRUE
)
|