comparison merge_and_filter.r @ 1:faae21ba5c63 draft

Uploaded
author davidvanzessen
date Tue, 25 Oct 2016 07:28:43 -0400
parents c33d93683a09
children e85fec274cde
comparison
equal deleted inserted replaced
0:c33d93683a09 1:faae21ba5c63
30 ref_length = data.frame(sseqid=c("ca1", "ca2", "cg1", "cg2", "cg3", "cg4", "cm"), ref.length=c(81,81,141,141,141,141,52)) 30 ref_length = data.frame(sseqid=c("ca1", "ca2", "cg1", "cg2", "cg3", "cg4", "cm"), ref.length=c(81,81,141,141,141,141,52))
31 gene_identification = merge(gene_identification, ref_length, by="sseqid", all.x=T) 31 gene_identification = merge(gene_identification, ref_length, by="sseqid", all.x=T)
32 gene_identification$chunk_hit_percentage = (gene_identification$length / gene_identification$ref.length) * 100 32 gene_identification$chunk_hit_percentage = (gene_identification$length / gene_identification$ref.length) * 100
33 gene_identification = gene_identification[,c("qseqid", "chunk_hit_percentage", "pident", "qstart", "sseqid")] 33 gene_identification = gene_identification[,c("qseqid", "chunk_hit_percentage", "pident", "qstart", "sseqid")]
34 colnames(gene_identification) = c("Sequence.ID", "chunk_hit_percentage", "nt_hit_percentage", "start_locations", "best_match") 34 colnames(gene_identification) = c("Sequence.ID", "chunk_hit_percentage", "nt_hit_percentage", "start_locations", "best_match")
35
36 } 35 }
37 36
38 input.sequence.count = nrow(summ) 37 input.sequence.count = nrow(summ)
39 print(paste("Number of sequences in summary file:", input.sequence.count)) 38 print(paste("Number of sequences in summary file:", input.sequence.count))
40 39
60 summ = summ[summ$Functionality == "unproductive (see comment)" | summ$Functionality == "unproductive",] 59 summ = summ[summ$Functionality == "unproductive (see comment)" | summ$Functionality == "unproductive",]
61 } else if (functionality == "remove_unknown"){ 60 } else if (functionality == "remove_unknown"){
62 summ = summ[summ$Functionality != "No results" & summ$Functionality != "unknown (see comment)" & summ$Functionality != "unknown",] 61 summ = summ[summ$Functionality != "No results" & summ$Functionality != "unknown (see comment)" & summ$Functionality != "unknown",]
63 } 62 }
64 63
65 print(paste("Number of sequences after productive filter:", nrow(summ))) 64 print(paste("Number of sequences after functionality filter:", nrow(summ)))
66 65
67 filtering.steps = rbind(filtering.steps, c("After productive filter", nrow(summ))) 66 filtering.steps = rbind(filtering.steps, c("After functionality filter", nrow(summ)))
68
69 splt = strsplit(class.filter, "_")[[1]]
70 chunk_hit_threshold = as.numeric(splt[1])
71 nt_hit_threshold = as.numeric(splt[2])
72
73 higher_than=(summ$chunk_hit_percentage >= chunk_hit_threshold & summ$nt_hit_percentage >= nt_hit_threshold)
74
75 unmatched=summ[NULL,c("Sequence.ID", "chunk_hit_percentage", "nt_hit_percentage", "start_locations", "best_match")]
76
77 if(!all(higher_than, na.rm=T)){ #check for 'not all' because that would mean the unmatched set is empty
78 unmatched = summ[!higher_than,]
79 unmatched = unmatched[,c("Sequence.ID", "chunk_hit_percentage", "nt_hit_percentage", "start_locations", "best_match")]
80 unmatched$best_match = paste("unmatched,", unmatched$best_match)
81 summ[!higher_than,"best_match"] = paste("unmatched,", summ[!higher_than,"best_match"])
82 }
83
84 if(any(higher_than, na.rm=T)){
85 #summ = summ[higher_than,]
86 }
87
88 if(nrow(summ) == 0){
89 stop("No data remaining after filter")
90 }
91 67
92 result = merge(summ, mutationanalysis[,!(names(mutationanalysis) %in% names(summ)[-1])], by="Sequence.ID") 68 result = merge(summ, mutationanalysis[,!(names(mutationanalysis) %in% names(summ)[-1])], by="Sequence.ID")
93 69
94 print(paste("Number of sequences after merging with mutation analysis file:", nrow(result))) 70 print(paste("Number of sequences after merging with mutation analysis file:", nrow(result)))
95 71
112 result$DGene = gsub("^Homsap ", "", result$D.GENE.and.allele) 88 result$DGene = gsub("^Homsap ", "", result$D.GENE.and.allele)
113 result$DGene = gsub("[*].*", "", result$DGene) 89 result$DGene = gsub("[*].*", "", result$DGene)
114 result$JGene = gsub("^Homsap ", "", result$J.GENE.and.allele) 90 result$JGene = gsub("^Homsap ", "", result$J.GENE.and.allele)
115 result$JGene = gsub("[*].*", "", result$JGene) 91 result$JGene = gsub("[*].*", "", result$JGene)
116 92
117 result$past = do.call(paste, c(result[unlist(strsplit(unique.type, ","))], sep = ":"))
118
119 result = result[!(duplicated(result$past)), ]
120
121 result = result[,!(names(result) %in% c("past"))]
122
123 print(paste("Number of sequences in result after", unique.type, "filtering:", nrow(result)))
124
125 filtering.steps = rbind(filtering.steps, c("After duplicate filter", nrow(result)))
126
127 print(paste("Number of empty CDR1 sequences:", sum(result$CDR1.IMGT.seq == ""))) 93 print(paste("Number of empty CDR1 sequences:", sum(result$CDR1.IMGT.seq == "")))
128 print(paste("Number of empty FR2 sequences:", sum(result$FR2.IMGT.seq == ""))) 94 print(paste("Number of empty FR2 sequences:", sum(result$FR2.IMGT.seq == "")))
129 print(paste("Number of empty CDR2 sequences:", sum(result$CDR2.IMGT.seq == ""))) 95 print(paste("Number of empty CDR2 sequences:", sum(result$CDR2.IMGT.seq == "")))
130 print(paste("Number of empty FR3 sequences:", sum(result$FR3.IMGT.seq == ""))) 96 print(paste("Number of empty FR3 sequences:", sum(result$FR3.IMGT.seq == "")))
131 97
132 if(empty.region.filter == "FR1"){ 98 if(empty.region.filter == "leader"){
99 result = result[result$FR1.IMGT.seq != "" & result$CDR1.IMGT.seq != "" & result$FR2.IMGT.seq != "" & result$CDR2.IMGT.seq != "" & result$FR3.IMGT.seq != "", ]
100 print(paste("Number of sequences after empty FR1, CDR1, FR2, CDR2 and FR3 column filter:", nrow(result)))
101 filtering.steps = rbind(filtering.steps, c("After empty FR1, CDR1, FR2, CDR2, FR3 filter", nrow(result)))
102 } else if(empty.region.filter == "FR1"){
133 result = result[result$CDR1.IMGT.seq != "" & result$FR2.IMGT.seq != "" & result$CDR2.IMGT.seq != "" & result$FR3.IMGT.seq != "", ] 103 result = result[result$CDR1.IMGT.seq != "" & result$FR2.IMGT.seq != "" & result$CDR2.IMGT.seq != "" & result$FR3.IMGT.seq != "", ]
134 print(paste("Number of sequences after empty CDR1, FR2, CDR2 and FR3 column filter:", nrow(result))) 104 print(paste("Number of sequences after empty CDR1, FR2, CDR2 and FR3 column filter:", nrow(result)))
135 filtering.steps = rbind(filtering.steps, c("After empty CDR1, FR2, CDR2, FR3 filter", nrow(result))) 105 filtering.steps = rbind(filtering.steps, c("After empty CDR1, FR2, CDR2, FR3 filter", nrow(result)))
136 } else if(empty.region.filter == "CDR1"){ 106 } else if(empty.region.filter == "CDR1"){
137 result = result[result$FR2.IMGT.seq != "" & result$CDR2.IMGT.seq != "" & result$FR3.IMGT.seq != "", ] 107 result = result[result$FR2.IMGT.seq != "" & result$CDR2.IMGT.seq != "" & result$FR3.IMGT.seq != "", ]
141 result = result[result$CDR2.IMGT.seq != "" & result$FR3.IMGT.seq != "", ] 111 result = result[result$CDR2.IMGT.seq != "" & result$FR3.IMGT.seq != "", ]
142 print(paste("Number of sequences after empty CDR2 and FR3 column filter:", nrow(result))) 112 print(paste("Number of sequences after empty CDR2 and FR3 column filter:", nrow(result)))
143 filtering.steps = rbind(filtering.steps, c("After empty CDR2, FR3 filter", nrow(result))) 113 filtering.steps = rbind(filtering.steps, c("After empty CDR2, FR3 filter", nrow(result)))
144 } 114 }
145 115
146 if(empty.region.filter == "FR1"){ 116 print(paste("Number of sequences in result after CDR/FR filtering:", nrow(result)))
117 print(paste("Number of matched sequences in result after CDR/FR filtering:", nrow(result[!grepl("unmatched", result$best_match),])))
118
119 if(empty.region.filter == "leader"){
120 result = result[!(grepl("n|N", result$FR1.IMGT.seq) | grepl("n|N", result$FR2.IMGT.seq) | grepl("n|N", result$FR3.IMGT.seq) | grepl("n|N", result$CDR1.IMGT.seq) | grepl("n|N", result$CDR2.IMGT.seq) | grepl("n|N", result$CDR3.IMGT.seq)),]
121 } else if(empty.region.filter == "FR1"){
147 result = result[!(grepl("n|N", result$FR2.IMGT.seq) | grepl("n|N", result$FR3.IMGT.seq) | grepl("n|N", result$CDR1.IMGT.seq) | grepl("n|N", result$CDR2.IMGT.seq) | grepl("n|N", result$CDR3.IMGT.seq)),] 122 result = result[!(grepl("n|N", result$FR2.IMGT.seq) | grepl("n|N", result$FR3.IMGT.seq) | grepl("n|N", result$CDR1.IMGT.seq) | grepl("n|N", result$CDR2.IMGT.seq) | grepl("n|N", result$CDR3.IMGT.seq)),]
148 } else if(empty.region.filter == "CDR1"){ 123 } else if(empty.region.filter == "CDR1"){
149 result = result[!(grepl("n|N", result$FR2.IMGT.seq) | grepl("n|N", result$FR3.IMGT.seq) | grepl("n|N", result$CDR2.IMGT.seq) | grepl("n|N", result$CDR3.IMGT.seq)),] 124 result = result[!(grepl("n|N", result$FR2.IMGT.seq) | grepl("n|N", result$FR3.IMGT.seq) | grepl("n|N", result$CDR2.IMGT.seq) | grepl("n|N", result$CDR3.IMGT.seq)),]
150 } else if(empty.region.filter == "FR2"){ 125 } else if(empty.region.filter == "FR2"){
151 result = result[!(grepl("n|N", result$FR3.IMGT.seq) | grepl("n|N", result$CDR3.IMGT.seq)),] 126 result = result[!(grepl("n|N", result$FR3.IMGT.seq) | grepl("n|N", result$CDR3.IMGT.seq)),]
169 write.table(result, before.unique.file, sep="\t", quote=F,row.names=F,col.names=T) 144 write.table(result, before.unique.file, sep="\t", quote=F,row.names=F,col.names=T)
170 145
171 if(filter.unique != "no"){ 146 if(filter.unique != "no"){
172 clmns = names(result) 147 clmns = names(result)
173 148
174 if(empty.region.filter == "FR1"){ 149 if(empty.region.filter == "leader"){
150 result$unique.def = paste(result$FR1.IMGT.seq, result$CDR1.IMGT.seq, result$FR2.IMGT.seq, result$CDR2.IMGT.seq, result$FR3.IMGT.seq, result$CDR3.IMGT.seq)
151 } else if(empty.region.filter == "FR1"){
175 result$unique.def = paste(result$CDR1.IMGT.seq, result$FR2.IMGT.seq, result$CDR2.IMGT.seq, result$FR3.IMGT.seq, result$CDR3.IMGT.seq) 152 result$unique.def = paste(result$CDR1.IMGT.seq, result$FR2.IMGT.seq, result$CDR2.IMGT.seq, result$FR3.IMGT.seq, result$CDR3.IMGT.seq)
176 } else if(empty.region.filter == "CDR1"){ 153 } else if(empty.region.filter == "CDR1"){
177 rresult$unique.def = paste(result$FR2.IMGT.seq, result$CDR2.IMGT.seq, result$FR3.IMGT.seq, result$CDR3.IMGT.seq) 154 rresult$unique.def = paste(result$FR2.IMGT.seq, result$CDR2.IMGT.seq, result$FR3.IMGT.seq, result$CDR3.IMGT.seq)
178 } else if(empty.region.filter == "FR2"){ 155 } else if(empty.region.filter == "FR2"){
179 result$unique.def = paste(result$CDR2.IMGT.seq, result$FR3.IMGT.seq, result$CDR3.IMGT.seq) 156 result$unique.def = paste(result$CDR2.IMGT.seq, result$FR3.IMGT.seq, result$CDR3.IMGT.seq)
197 #result = result[,clmns] 174 #result = result[,clmns]
198 175
199 #write.table(inputdata.removed, "unique_removed.csv", sep=",",quote=F,row.names=F,col.names=T) 176 #write.table(inputdata.removed, "unique_removed.csv", sep=",",quote=F,row.names=F,col.names=T)
200 } 177 }
201 178
202 print(paste("Number of sequences in result after CDR/FR filtering:", nrow(result))) 179 filtering.steps = rbind(filtering.steps, c("After filter unique sequences", nrow(result)))
203 print(paste("Number of matched sequences in result after CDR/FR filtering:", nrow(result[!grepl("unmatched", result$best_match),]))) 180
204 181
205 filtering.steps = rbind(filtering.steps, c("After unique filter", nrow(result))) 182 splt = strsplit(class.filter, "_")[[1]]
183 chunk_hit_threshold = as.numeric(splt[1])
184 nt_hit_threshold = as.numeric(splt[2])
185
186 higher_than=(summ$chunk_hit_percentage >= chunk_hit_threshold & summ$nt_hit_percentage >= nt_hit_threshold)
187
188 unmatched=summ[NULL,c("Sequence.ID", "chunk_hit_percentage", "nt_hit_percentage", "start_locations", "best_match")]
189
190 if(!all(higher_than, na.rm=T)){ #check for 'not all' because that would mean the unmatched set is empty
191 unmatched = summ[!higher_than,]
192 unmatched = unmatched[,c("Sequence.ID", "chunk_hit_percentage", "nt_hit_percentage", "start_locations", "best_match")]
193 unmatched$best_match = paste("unmatched,", unmatched$best_match)
194 summ[!higher_than,"best_match"] = paste("unmatched,", summ[!higher_than,"best_match"])
195 }
196
197 if(any(higher_than, na.rm=T)){
198 #summ = summ[higher_than,]
199 }
200
201 if(nrow(summ) == 0){
202 stop("No data remaining after filter")
203 }
204
205 result$past = do.call(paste, c(result[unlist(strsplit(unique.type, ","))], sep = ":"))
206
207 result = result[!(duplicated(result$past)), ]
208
209 result = result[,!(names(result) %in% c("past"))]
210
211 print(paste("Number of sequences in result after", unique.type, "filtering:", nrow(result)))
212
213 filtering.steps = rbind(filtering.steps, c("After remove duplicates based on filter", nrow(result)))
206 214
207 print(paste("Number of rows in result:", nrow(result))) 215 print(paste("Number of rows in result:", nrow(result)))
208 print(paste("Number of rows in unmatched:", nrow(unmatched))) 216 print(paste("Number of rows in unmatched:", nrow(unmatched)))
209 217
210 matched.sequences = result[!grepl("^unmatched", result$best_match),] 218 matched.sequences = result[!grepl("^unmatched", result$best_match),]