annotate merge_and_filter.r @ 0:c33d93683a09 draft

Uploaded
author davidvanzessen
date Thu, 13 Oct 2016 10:52:24 -0400
parents
children faae21ba5c63
Ignore whitespace changes - Everywhere: Within whitespace: At end of lines:
rev   line source
0
c33d93683a09 Uploaded
davidvanzessen
parents:
diff changeset
# Positional command-line arguments (15 are read, in the order listed below).
args <- commandArgs(trailingOnly = TRUE)

# Paths of the six tab-separated input tables.
summaryfile <- args[1]
sequencesfile <- args[2]
mutationanalysisfile <- args[3]
mutationstatsfile <- args[4]
hotspotsfile <- args[5]
gene_identification_file <- args[6]

# Paths of the output files.
output <- args[7]
before.unique.file <- args[8]
unmatchedfile <- args[9]

# Filter settings; each one is compared against literal option strings
# (or split into numbers) further down in the script.
method <- args[10]
functionality <- args[11]
unique.type <- args[12]
filter.unique <- args[13]
class.filter <- args[14]
empty.region.filter <- args[15]

# Read a tab-separated table with a header row. Quoting is disabled, short
# rows are padded (fill), and strings are kept as character vectors.
read_tab_table <- function(path) {
  read.table(
    path,
    header = TRUE,
    sep = "\t",
    fill = TRUE,
    stringsAsFactors = FALSE,
    quote = ""
  )
}

summ <- read_tab_table(summaryfile)
sequences <- read_tab_table(sequencesfile)
mutationanalysis <- read_tab_table(mutationanalysisfile)
mutationstats <- read_tab_table(mutationstatsfile)
hotspots <- read_tab_table(hotspotsfile)
gene_identification <- read_tab_table(gene_identification_file)

# When the gene identification table holds BLAST hits, reshape it into the
# five columns the rest of the script works with.
if (method == "blastn") {
  # Columns of the BLAST table:
  # qseqid sseqid pident length mismatch gapopen qstart qend sstart send evalue bitscore

  # Keep only the first row seen for every query sequence.
  first_hit <- !duplicated(gene_identification$qseqid)
  gene_identification <- gene_identification[first_hit, ]

  # Length of each reference; used to express the hit length as a percentage.
  ref_length <- data.frame(
    sseqid = c("ca1", "ca2", "cg1", "cg2", "cg3", "cg4", "cm"),
    ref.length = c(81, 81, 141, 141, 141, 141, 52)
  )
  # all.x keeps hits whose sseqid is not in the table above (ref.length is NA).
  gene_identification <- merge(
    gene_identification,
    ref_length,
    by = "sseqid",
    all.x = TRUE
  )

  hit_fraction <- gene_identification$length / gene_identification$ref.length
  gene_identification$chunk_hit_percentage <- hit_fraction * 100

  gene_identification <- gene_identification[
    ,
    c("qseqid", "chunk_hit_percentage", "pident", "qstart", "sseqid")
  ]
  colnames(gene_identification) <- c(
    "Sequence.ID",
    "chunk_hit_percentage",
    "nt_hit_percentage",
    "start_locations",
    "best_match"
  )
}

# Number of rows in the summary table before any filtering; used as the
# denominator for the percentages in the filtering-steps report.
input.sequence.count <- nrow(summ)
print(paste("Number of sequences in summary file:", input.sequence.count))

# Log of (step label, remaining row count), grown by one row per step.
# Both columns are held as character here.
filtering.steps <- data.frame(character(0), numeric(0))
filtering.steps <- rbind(filtering.steps, c("Input", input.sequence.count))
filtering.steps[, 1] <- as.character(filtering.steps[, 1])
filtering.steps[, 2] <- as.character(filtering.steps[, 2])

# Inner join: sequences without a gene identification row are dropped.
summ <- merge(summ, gene_identification, by = "Sequence.ID")

summ <- summ[summ$Functionality != "No results", ]
remaining <- nrow(summ)
print(paste("Number of sequences after 'No results' filter:", remaining))
filtering.steps <- rbind(
  filtering.steps,
  c("After 'No results' filter", remaining)
)

# Optional filter on the Functionality column; any other setting keeps
# every row.
func <- summ$Functionality
if (functionality == "productive") {
  is_productive <- func == "productive (see comment)" | func == "productive"
  summ <- summ[is_productive, ]
} else if (functionality == "unproductive") {
  is_unproductive <- func == "unproductive (see comment)" |
    func == "unproductive"
  summ <- summ[is_unproductive, ]
} else if (functionality == "remove_unknown") {
  is_unknown <- func == "No results" |
    func == "unknown (see comment)" |
    func == "unknown"
  summ <- summ[!is_unknown, ]
}

remaining <- nrow(summ)
print(paste("Number of sequences after productive filter:", remaining))
filtering.steps <- rbind(
  filtering.steps,
  c("After productive filter", remaining)
)

# class.filter has the form "<chunk threshold>_<nt threshold>".
class_filter_parts <- strsplit(class.filter, "_")[[1]]
chunk_hit_threshold <- as.numeric(class_filter_parts[1])
nt_hit_threshold <- as.numeric(class_filter_parts[2])

# TRUE for rows that reach both thresholds.
# NOTE(review): an entry is NA when one percentage is NA and the other
# comparison is TRUE; all(..., na.rm = TRUE) below ignores such entries while
# the row subsetting does not - confirm NA cannot occur here.
higher_than <- (summ$chunk_hit_percentage >= chunk_hit_threshold &
  summ$nt_hit_percentage >= nt_hit_threshold)

unmatched_columns <- c(
  "Sequence.ID",
  "chunk_hit_percentage",
  "nt_hit_percentage",
  "start_locations",
  "best_match"
)

# Start from an empty table so `unmatched` exists even when every row passes.
unmatched <- summ[NULL, unmatched_columns]

if (!all(higher_than, na.rm = TRUE)) {
  # At least one row is below a threshold: copy those rows to `unmatched`
  # and prefix their best_match in `summ`. The rows stay in `summ`.
  below <- !higher_than
  unmatched <- summ[below, unmatched_columns]
  unmatched$best_match <- paste("unmatched,", unmatched$best_match)
  summ[below, "best_match"] <- paste("unmatched,", summ[below, "best_match"])
}

if (nrow(summ) == 0) {
  stop("No data remaining after filter")
}

# Join the per-sequence tables onto the filtered summary, one at a time.
# Before each join, columns of the incoming table whose names already exist
# on the left side (other than the first column, the key) are left out.
# The first three joins are inner joins on Sequence.ID.
already_present <- names(mutationanalysis) %in% names(summ)[-1]
result <- merge(summ, mutationanalysis[, !already_present], by = "Sequence.ID")
print(paste(
  "Number of sequences after merging with mutation analysis file:",
  nrow(result)
))

already_present <- names(mutationstats) %in% names(result)[-1]
result <- merge(result, mutationstats[, !already_present], by = "Sequence.ID")
print(paste(
  "Number of sequences after merging with mutation stats file:",
  nrow(result)
))

already_present <- names(hotspots) %in% names(result)[-1]
result <- merge(result, hotspots[, !already_present], by = "Sequence.ID")
print(paste(
  "Number of sequences after merging with hotspots file:",
  nrow(result)
))

# Region sequences: keep the key plus six region columns and give the
# region columns a ".seq" suffix.
region_columns <- c(
  "FR1.IMGT", "CDR1.IMGT", "FR2.IMGT", "CDR2.IMGT", "FR3.IMGT", "CDR3.IMGT"
)
sequences <- sequences[, c("Sequence.ID", region_columns)]
names(sequences) <- c("Sequence.ID", paste0(region_columns, ".seq"))

# Left join: rows of `result` without a sequences entry are kept (NA regions).
result <- merge(result, sequences, by = "Sequence.ID", all.x = TRUE)
print(paste(
  "Number of sequences in result after merging with sequences:",
  nrow(result)
))

# Remove a leading "Homsap " and everything from the first "*" onwards.
strip_species_and_allele <- function(gene) {
  without_species <- gsub("^Homsap ", "", gene)
  gsub("[*].*", "", without_species)
}

result$VGene <- strip_species_and_allele(result$V.GENE.and.allele)
result$DGene <- strip_species_and_allele(result$D.GENE.and.allele)
result$JGene <- strip_species_and_allele(result$J.GENE.and.allele)

# unique.type is a comma-separated list of column names. Their values are
# joined with ":" into a temporary key column; only the first row per key is
# kept, and the key column is removed again afterwards.
key_columns <- unlist(strsplit(unique.type, ","))
result$past <- do.call(paste, c(result[key_columns], sep = ":"))
result <- result[!duplicated(result$past), ]
result <- result[, names(result) != "past"]

print(paste(
  "Number of sequences in result after", unique.type, "filtering:",
  nrow(result)
))
filtering.steps <- rbind(
  filtering.steps,
  c("After duplicate filter", nrow(result))
)

# Report how many rows have an empty sequence for each region.
print(paste("Number of empty CDR1 sequences:", sum(result$CDR1.IMGT.seq == "")))
print(paste("Number of empty FR2 sequences:", sum(result$FR2.IMGT.seq == "")))
print(paste("Number of empty CDR2 sequences:", sum(result$CDR2.IMGT.seq == "")))
print(paste("Number of empty FR3 sequences:", sum(result$FR3.IMGT.seq == "")))

# TRUE where a sequence contains an "n" or "N".
contains_n <- function(region_seq) {
  grepl("n|N", region_seq)
}

# empty.region.filter selects which regions must be present and free of N.
# For each setting the rows with an empty required region are dropped first
# (and logged as a filtering step), then the rows with an N in a required
# region or in CDR3. Any other setting leaves `result` untouched.
if (empty.region.filter == "FR1") {
  result <- result[
    result$CDR1.IMGT.seq != "" &
      result$FR2.IMGT.seq != "" &
      result$CDR2.IMGT.seq != "" &
      result$FR3.IMGT.seq != "",
  ]
  print(paste(
    "Number of sequences after empty CDR1, FR2, CDR2 and FR3 column filter:",
    nrow(result)
  ))
  filtering.steps <- rbind(
    filtering.steps,
    c("After empty CDR1, FR2, CDR2, FR3 filter", nrow(result))
  )

  has_n <- contains_n(result$FR2.IMGT.seq) |
    contains_n(result$FR3.IMGT.seq) |
    contains_n(result$CDR1.IMGT.seq) |
    contains_n(result$CDR2.IMGT.seq) |
    contains_n(result$CDR3.IMGT.seq)
  result <- result[!has_n, ]
} else if (empty.region.filter == "CDR1") {
  result <- result[
    result$FR2.IMGT.seq != "" &
      result$CDR2.IMGT.seq != "" &
      result$FR3.IMGT.seq != "",
  ]
  print(paste(
    "Number of sequences after empty FR2, CDR2 and FR3 column filter:",
    nrow(result)
  ))
  filtering.steps <- rbind(
    filtering.steps,
    c("After empty FR2, CDR2, FR3 filter", nrow(result))
  )

  has_n <- contains_n(result$FR2.IMGT.seq) |
    contains_n(result$FR3.IMGT.seq) |
    contains_n(result$CDR2.IMGT.seq) |
    contains_n(result$CDR3.IMGT.seq)
  result <- result[!has_n, ]
} else if (empty.region.filter == "FR2") {
  result <- result[
    result$CDR2.IMGT.seq != "" &
      result$FR3.IMGT.seq != "",
  ]
  print(paste(
    "Number of sequences after empty CDR2 and FR3 column filter:",
    nrow(result)
  ))
  filtering.steps <- rbind(
    filtering.steps,
    c("After empty CDR2, FR3 filter", nrow(result))
  )

  # CDR2 is checked here as well: this setting requires CDR2 to be non-empty
  # above and uses CDR2 in its unique-sequence definition, so an N in CDR2
  # must exclude the row just like an N in FR3 or CDR3.
  has_n <- contains_n(result$CDR2.IMGT.seq) |
    contains_n(result$FR3.IMGT.seq) |
    contains_n(result$CDR3.IMGT.seq)
  result <- result[!has_n, ]
}

print(paste("Number of sequences in result after n filtering:", nrow(result)))
filtering.steps <- rbind(filtering.steps, c("After N filter", nrow(result)))

# Mutation-count columns that are converted to plain numbers.
cleanup_columns <- c(
  "FR1.IMGT.Nb.of.mutations",
  "CDR1.IMGT.Nb.of.mutations",
  "FR2.IMGT.Nb.of.mutations",
  "CDR2.IMGT.Nb.of.mutations",
  "FR3.IMGT.Nb.of.mutations"
)

for (col in cleanup_columns) {
  # Strip any parenthesised part, then convert what is left to a number.
  counts <- as.numeric(gsub("\\(.*\\)", "", result[, col]))
  # Values that could not be converted count as 0. Only this column is
  # touched; indexing the rows without naming the column would overwrite
  # every field of the affected rows (including Sequence.ID) with 0.
  counts[is.na(counts)] <- 0
  result[, col] <- counts
}

# Snapshot of the table before the unique-sequence filter is applied.
write.table(
  result,
  before.unique.file,
  sep = "\t",
  quote = FALSE,
  row.names = FALSE,
  col.names = TRUE
)

# Unique-sequence filter; skipped entirely when filter.unique is "no".
if (filter.unique != "no") {
  # Column names before unique.def is added. Currently unused: the statement
  # that would restore them is disabled, so unique.def stays in `result`.
  clmns <- names(result)
  #result = result[,clmns]

  # A sequence is identified by the concatenation of its region sequences;
  # which regions take part depends on empty.region.filter.
  if (empty.region.filter == "FR1") {
    result$unique.def <- paste(
      result$CDR1.IMGT.seq,
      result$FR2.IMGT.seq,
      result$CDR2.IMGT.seq,
      result$FR3.IMGT.seq,
      result$CDR3.IMGT.seq
    )
  } else if (empty.region.filter == "CDR1") {
    # Assigns to `result`; the misspelled target `rresult` used here before
    # does not exist and made this branch fail with "object not found".
    result$unique.def <- paste(
      result$FR2.IMGT.seq,
      result$CDR2.IMGT.seq,
      result$FR3.IMGT.seq,
      result$CDR3.IMGT.seq
    )
  } else if (empty.region.filter == "FR2") {
    result$unique.def <- paste(
      result$CDR2.IMGT.seq,
      result$FR3.IMGT.seq,
      result$CDR3.IMGT.seq
    )
  }

  # With "_c" in the setting, best_match becomes part of the definition.
  if (grepl("_c", filter.unique)) {
    result$unique.def <- paste(result$unique.def, result$best_match)
  }

  # Unless the setting contains "keep", first drop every row whose
  # definition occurs only once.
  if (!grepl("keep", filter.unique)) {
    occurs_more_than_once <- duplicated(result$unique.def) |
      duplicated(result$unique.def, fromLast = TRUE)
    result <- result[occurs_more_than_once, ]
  }

  # Append best_match so identical sequences with a different best_match are
  # kept apart, then keep the first row of each definition.
  result$unique.def <- paste(result$unique.def, result$best_match)
  result <- result[!duplicated(result$unique.def), ]
}

print(paste(
  "Number of sequences in result after CDR/FR filtering:",
  nrow(result)
))
# This count excludes rows whose best_match contains "unmatched" anywhere.
without_unmatched <- result[!grepl("unmatched", result$best_match), ]
print(paste(
  "Number of matched sequences in result after CDR/FR filtering:",
  nrow(without_unmatched)
))

filtering.steps <- rbind(
  filtering.steps,
  c("After unique filter", nrow(result))
)

print(paste("Number of rows in result:", nrow(result)))
print(paste("Number of rows in unmatched:", nrow(unmatched)))

# Rows whose best_match starts with "unmatched" are left out of the
# filtered output file.
is_unmatched <- grepl("^unmatched", result$best_match)
matched.sequences <- result[!is_unmatched, ]

# The filtered file name is derived from the output path.
filtered_file <- gsub("merged.txt$", "filtered.txt", output)
write.table(
  x = matched.sequences,
  file = filtered_file,
  sep = "\t",
  quote = FALSE,
  row.names = FALSE,
  col.names = TRUE
)

matched.sequences.count <- nrow(matched.sequences)
unmatched.sequences.count <- sum(is_unmatched)

filtering.steps <- rbind(
  filtering.steps,
  c("Number of matched sequences", matched.sequences.count)
)
filtering.steps <- rbind(
  filtering.steps,
  c("Number of unmatched sequences", unmatched.sequences.count)
)

# Turn the counts back into numbers and add each step's share of the input.
filtering.steps[, 2] <- as.numeric(filtering.steps[, 2])
filtering.steps$perc <- round(
  filtering.steps[, 2] / input.sequence.count * 100,
  2
)

# The filtering-steps file name is derived from the unmatched path; it is
# written without a header row.
steps_file <- gsub("unmatched", "filtering_steps", unmatchedfile)
write.table(
  x = filtering.steps,
  file = steps_file,
  sep = "\t",
  quote = FALSE,
  row.names = FALSE,
  col.names = FALSE
)

write.table(
  x = result,
  file = output,
  sep = "\t",
  quote = FALSE,
  row.names = FALSE,
  col.names = TRUE
)
write.table(
  x = unmatched,
  file = unmatchedfile,
  sep = "\t",
  quote = FALSE,
  row.names = FALSE,
  col.names = TRUE
)