Mercurial > repos > davidvanzessen > shm_csr
comparison merge_and_filter.r @ 1:faae21ba5c63 draft
Uploaded
author | davidvanzessen |
---|---|
date | Tue, 25 Oct 2016 07:28:43 -0400 |
parents | c33d93683a09 |
children | e85fec274cde |
comparison
equal
deleted
inserted
replaced
0:c33d93683a09 | 1:faae21ba5c63 |
---|---|
30 ref_length = data.frame(sseqid=c("ca1", "ca2", "cg1", "cg2", "cg3", "cg4", "cm"), ref.length=c(81,81,141,141,141,141,52)) | 30 ref_length = data.frame(sseqid=c("ca1", "ca2", "cg1", "cg2", "cg3", "cg4", "cm"), ref.length=c(81,81,141,141,141,141,52)) |
31 gene_identification = merge(gene_identification, ref_length, by="sseqid", all.x=T) | 31 gene_identification = merge(gene_identification, ref_length, by="sseqid", all.x=T) |
32 gene_identification$chunk_hit_percentage = (gene_identification$length / gene_identification$ref.length) * 100 | 32 gene_identification$chunk_hit_percentage = (gene_identification$length / gene_identification$ref.length) * 100 |
33 gene_identification = gene_identification[,c("qseqid", "chunk_hit_percentage", "pident", "qstart", "sseqid")] | 33 gene_identification = gene_identification[,c("qseqid", "chunk_hit_percentage", "pident", "qstart", "sseqid")] |
34 colnames(gene_identification) = c("Sequence.ID", "chunk_hit_percentage", "nt_hit_percentage", "start_locations", "best_match") | 34 colnames(gene_identification) = c("Sequence.ID", "chunk_hit_percentage", "nt_hit_percentage", "start_locations", "best_match") |
35 | |
36 } | 35 } |
37 | 36 |
38 input.sequence.count = nrow(summ) | 37 input.sequence.count = nrow(summ) |
39 print(paste("Number of sequences in summary file:", input.sequence.count)) | 38 print(paste("Number of sequences in summary file:", input.sequence.count)) |
40 | 39 |
60 summ = summ[summ$Functionality == "unproductive (see comment)" | summ$Functionality == "unproductive",] | 59 summ = summ[summ$Functionality == "unproductive (see comment)" | summ$Functionality == "unproductive",] |
61 } else if (functionality == "remove_unknown"){ | 60 } else if (functionality == "remove_unknown"){ |
62 summ = summ[summ$Functionality != "No results" & summ$Functionality != "unknown (see comment)" & summ$Functionality != "unknown",] | 61 summ = summ[summ$Functionality != "No results" & summ$Functionality != "unknown (see comment)" & summ$Functionality != "unknown",] |
63 } | 62 } |
64 | 63 |
65 print(paste("Number of sequences after productive filter:", nrow(summ))) | 64 print(paste("Number of sequences after functionality filter:", nrow(summ))) |
66 | 65 |
67 filtering.steps = rbind(filtering.steps, c("After productive filter", nrow(summ))) | 66 filtering.steps = rbind(filtering.steps, c("After functionality filter", nrow(summ))) |
68 | |
69 splt = strsplit(class.filter, "_")[[1]] | |
70 chunk_hit_threshold = as.numeric(splt[1]) | |
71 nt_hit_threshold = as.numeric(splt[2]) | |
72 | |
73 higher_than=(summ$chunk_hit_percentage >= chunk_hit_threshold & summ$nt_hit_percentage >= nt_hit_threshold) | |
74 | |
75 unmatched=summ[NULL,c("Sequence.ID", "chunk_hit_percentage", "nt_hit_percentage", "start_locations", "best_match")] | |
76 | |
77 if(!all(higher_than, na.rm=T)){ #check for 'not all' because that would mean the unmatched set is empty | |
78 unmatched = summ[!higher_than,] | |
79 unmatched = unmatched[,c("Sequence.ID", "chunk_hit_percentage", "nt_hit_percentage", "start_locations", "best_match")] | |
80 unmatched$best_match = paste("unmatched,", unmatched$best_match) | |
81 summ[!higher_than,"best_match"] = paste("unmatched,", summ[!higher_than,"best_match"]) | |
82 } | |
83 | |
84 if(any(higher_than, na.rm=T)){ | |
85 #summ = summ[higher_than,] | |
86 } | |
87 | |
88 if(nrow(summ) == 0){ | |
89 stop("No data remaining after filter") | |
90 } | |
91 | 67 |
92 result = merge(summ, mutationanalysis[,!(names(mutationanalysis) %in% names(summ)[-1])], by="Sequence.ID") | 68 result = merge(summ, mutationanalysis[,!(names(mutationanalysis) %in% names(summ)[-1])], by="Sequence.ID") |
93 | 69 |
94 print(paste("Number of sequences after merging with mutation analysis file:", nrow(result))) | 70 print(paste("Number of sequences after merging with mutation analysis file:", nrow(result))) |
95 | 71 |
112 result$DGene = gsub("^Homsap ", "", result$D.GENE.and.allele) | 88 result$DGene = gsub("^Homsap ", "", result$D.GENE.and.allele) |
113 result$DGene = gsub("[*].*", "", result$DGene) | 89 result$DGene = gsub("[*].*", "", result$DGene) |
114 result$JGene = gsub("^Homsap ", "", result$J.GENE.and.allele) | 90 result$JGene = gsub("^Homsap ", "", result$J.GENE.and.allele) |
115 result$JGene = gsub("[*].*", "", result$JGene) | 91 result$JGene = gsub("[*].*", "", result$JGene) |
116 | 92 |
117 result$past = do.call(paste, c(result[unlist(strsplit(unique.type, ","))], sep = ":")) | |
118 | |
119 result = result[!(duplicated(result$past)), ] | |
120 | |
121 result = result[,!(names(result) %in% c("past"))] | |
122 | |
123 print(paste("Number of sequences in result after", unique.type, "filtering:", nrow(result))) | |
124 | |
125 filtering.steps = rbind(filtering.steps, c("After duplicate filter", nrow(result))) | |
126 | |
127 print(paste("Number of empty CDR1 sequences:", sum(result$CDR1.IMGT.seq == ""))) | 93 print(paste("Number of empty CDR1 sequences:", sum(result$CDR1.IMGT.seq == ""))) |
128 print(paste("Number of empty FR2 sequences:", sum(result$FR2.IMGT.seq == ""))) | 94 print(paste("Number of empty FR2 sequences:", sum(result$FR2.IMGT.seq == ""))) |
129 print(paste("Number of empty CDR2 sequences:", sum(result$CDR2.IMGT.seq == ""))) | 95 print(paste("Number of empty CDR2 sequences:", sum(result$CDR2.IMGT.seq == ""))) |
130 print(paste("Number of empty FR3 sequences:", sum(result$FR3.IMGT.seq == ""))) | 96 print(paste("Number of empty FR3 sequences:", sum(result$FR3.IMGT.seq == ""))) |
131 | 97 |
132 if(empty.region.filter == "FR1"){ | 98 if(empty.region.filter == "leader"){ |
99 result = result[result$FR1.IMGT.seq != "" & result$CDR1.IMGT.seq != "" & result$FR2.IMGT.seq != "" & result$CDR2.IMGT.seq != "" & result$FR3.IMGT.seq != "", ] | |
100 print(paste("Number of sequences after empty FR1, CDR1, FR2, CDR2 and FR3 column filter:", nrow(result))) | |
101 filtering.steps = rbind(filtering.steps, c("After empty FR1, CDR1, FR2, CDR2, FR3 filter", nrow(result))) | |
102 } else if(empty.region.filter == "FR1"){ | |
133 result = result[result$CDR1.IMGT.seq != "" & result$FR2.IMGT.seq != "" & result$CDR2.IMGT.seq != "" & result$FR3.IMGT.seq != "", ] | 103 result = result[result$CDR1.IMGT.seq != "" & result$FR2.IMGT.seq != "" & result$CDR2.IMGT.seq != "" & result$FR3.IMGT.seq != "", ] |
134 print(paste("Number of sequences after empty CDR1, FR2, CDR2 and FR3 column filter:", nrow(result))) | 104 print(paste("Number of sequences after empty CDR1, FR2, CDR2 and FR3 column filter:", nrow(result))) |
135 filtering.steps = rbind(filtering.steps, c("After empty CDR1, FR2, CDR2, FR3 filter", nrow(result))) | 105 filtering.steps = rbind(filtering.steps, c("After empty CDR1, FR2, CDR2, FR3 filter", nrow(result))) |
136 } else if(empty.region.filter == "CDR1"){ | 106 } else if(empty.region.filter == "CDR1"){ |
137 result = result[result$FR2.IMGT.seq != "" & result$CDR2.IMGT.seq != "" & result$FR3.IMGT.seq != "", ] | 107 result = result[result$FR2.IMGT.seq != "" & result$CDR2.IMGT.seq != "" & result$FR3.IMGT.seq != "", ] |
141 result = result[result$CDR2.IMGT.seq != "" & result$FR3.IMGT.seq != "", ] | 111 result = result[result$CDR2.IMGT.seq != "" & result$FR3.IMGT.seq != "", ] |
142 print(paste("Number of sequences after empty CDR2 and FR3 column filter:", nrow(result))) | 112 print(paste("Number of sequences after empty CDR2 and FR3 column filter:", nrow(result))) |
143 filtering.steps = rbind(filtering.steps, c("After empty CDR2, FR3 filter", nrow(result))) | 113 filtering.steps = rbind(filtering.steps, c("After empty CDR2, FR3 filter", nrow(result))) |
144 } | 114 } |
145 | 115 |
146 if(empty.region.filter == "FR1"){ | 116 print(paste("Number of sequences in result after CDR/FR filtering:", nrow(result))) |
117 print(paste("Number of matched sequences in result after CDR/FR filtering:", nrow(result[!grepl("unmatched", result$best_match),]))) | |
118 | |
119 if(empty.region.filter == "leader"){ | |
120 result = result[!(grepl("n|N", result$FR1.IMGT.seq) | grepl("n|N", result$FR2.IMGT.seq) | grepl("n|N", result$FR3.IMGT.seq) | grepl("n|N", result$CDR1.IMGT.seq) | grepl("n|N", result$CDR2.IMGT.seq) | grepl("n|N", result$CDR3.IMGT.seq)),] | |
121 } else if(empty.region.filter == "FR1"){ | |
147 result = result[!(grepl("n|N", result$FR2.IMGT.seq) | grepl("n|N", result$FR3.IMGT.seq) | grepl("n|N", result$CDR1.IMGT.seq) | grepl("n|N", result$CDR2.IMGT.seq) | grepl("n|N", result$CDR3.IMGT.seq)),] | 122 result = result[!(grepl("n|N", result$FR2.IMGT.seq) | grepl("n|N", result$FR3.IMGT.seq) | grepl("n|N", result$CDR1.IMGT.seq) | grepl("n|N", result$CDR2.IMGT.seq) | grepl("n|N", result$CDR3.IMGT.seq)),] |
148 } else if(empty.region.filter == "CDR1"){ | 123 } else if(empty.region.filter == "CDR1"){ |
149 result = result[!(grepl("n|N", result$FR2.IMGT.seq) | grepl("n|N", result$FR3.IMGT.seq) | grepl("n|N", result$CDR2.IMGT.seq) | grepl("n|N", result$CDR3.IMGT.seq)),] | 124 result = result[!(grepl("n|N", result$FR2.IMGT.seq) | grepl("n|N", result$FR3.IMGT.seq) | grepl("n|N", result$CDR2.IMGT.seq) | grepl("n|N", result$CDR3.IMGT.seq)),] |
150 } else if(empty.region.filter == "FR2"){ | 125 } else if(empty.region.filter == "FR2"){ |
151 result = result[!(grepl("n|N", result$FR3.IMGT.seq) | grepl("n|N", result$CDR3.IMGT.seq)),] | 126 result = result[!(grepl("n|N", result$FR3.IMGT.seq) | grepl("n|N", result$CDR3.IMGT.seq)),] |
169 write.table(result, before.unique.file, sep="\t", quote=F,row.names=F,col.names=T) | 144 write.table(result, before.unique.file, sep="\t", quote=F,row.names=F,col.names=T) |
170 | 145 |
171 if(filter.unique != "no"){ | 146 if(filter.unique != "no"){ |
172 clmns = names(result) | 147 clmns = names(result) |
173 | 148 |
174 if(empty.region.filter == "FR1"){ | 149 if(empty.region.filter == "leader"){ |
150 result$unique.def = paste(result$FR1.IMGT.seq, result$CDR1.IMGT.seq, result$FR2.IMGT.seq, result$CDR2.IMGT.seq, result$FR3.IMGT.seq, result$CDR3.IMGT.seq) | |
151 } else if(empty.region.filter == "FR1"){ | |
175 result$unique.def = paste(result$CDR1.IMGT.seq, result$FR2.IMGT.seq, result$CDR2.IMGT.seq, result$FR3.IMGT.seq, result$CDR3.IMGT.seq) | 152 result$unique.def = paste(result$CDR1.IMGT.seq, result$FR2.IMGT.seq, result$CDR2.IMGT.seq, result$FR3.IMGT.seq, result$CDR3.IMGT.seq) |
176 } else if(empty.region.filter == "CDR1"){ | 153 } else if(empty.region.filter == "CDR1"){ |
177 rresult$unique.def = paste(result$FR2.IMGT.seq, result$CDR2.IMGT.seq, result$FR3.IMGT.seq, result$CDR3.IMGT.seq) | 154 rresult$unique.def = paste(result$FR2.IMGT.seq, result$CDR2.IMGT.seq, result$FR3.IMGT.seq, result$CDR3.IMGT.seq) |
178 } else if(empty.region.filter == "FR2"){ | 155 } else if(empty.region.filter == "FR2"){ |
179 result$unique.def = paste(result$CDR2.IMGT.seq, result$FR3.IMGT.seq, result$CDR3.IMGT.seq) | 156 result$unique.def = paste(result$CDR2.IMGT.seq, result$FR3.IMGT.seq, result$CDR3.IMGT.seq) |
197 #result = result[,clmns] | 174 #result = result[,clmns] |
198 | 175 |
199 #write.table(inputdata.removed, "unique_removed.csv", sep=",",quote=F,row.names=F,col.names=T) | 176 #write.table(inputdata.removed, "unique_removed.csv", sep=",",quote=F,row.names=F,col.names=T) |
200 } | 177 } |
201 | 178 |
202 print(paste("Number of sequences in result after CDR/FR filtering:", nrow(result))) | 179 filtering.steps = rbind(filtering.steps, c("After filter unique sequences", nrow(result))) |
203 print(paste("Number of matched sequences in result after CDR/FR filtering:", nrow(result[!grepl("unmatched", result$best_match),]))) | 180 |
204 | 181 |
205 filtering.steps = rbind(filtering.steps, c("After unique filter", nrow(result))) | 182 splt = strsplit(class.filter, "_")[[1]] |
183 chunk_hit_threshold = as.numeric(splt[1]) | |
184 nt_hit_threshold = as.numeric(splt[2]) | |
185 | |
186 higher_than=(summ$chunk_hit_percentage >= chunk_hit_threshold & summ$nt_hit_percentage >= nt_hit_threshold) | |
187 | |
188 unmatched=summ[NULL,c("Sequence.ID", "chunk_hit_percentage", "nt_hit_percentage", "start_locations", "best_match")] | |
189 | |
190 if(!all(higher_than, na.rm=T)){ #check for 'not all' because that would mean the unmatched set is empty | |
191 unmatched = summ[!higher_than,] | |
192 unmatched = unmatched[,c("Sequence.ID", "chunk_hit_percentage", "nt_hit_percentage", "start_locations", "best_match")] | |
193 unmatched$best_match = paste("unmatched,", unmatched$best_match) | |
194 summ[!higher_than,"best_match"] = paste("unmatched,", summ[!higher_than,"best_match"]) | |
195 } | |
196 | |
197 if(any(higher_than, na.rm=T)){ | |
198 #summ = summ[higher_than,] | |
199 } | |
200 | |
201 if(nrow(summ) == 0){ | |
202 stop("No data remaining after filter") | |
203 } | |
204 | |
205 result$past = do.call(paste, c(result[unlist(strsplit(unique.type, ","))], sep = ":")) | |
206 | |
207 result = result[!(duplicated(result$past)), ] | |
208 | |
209 result = result[,!(names(result) %in% c("past"))] | |
210 | |
211 print(paste("Number of sequences in result after", unique.type, "filtering:", nrow(result))) | |
212 | |
213 filtering.steps = rbind(filtering.steps, c("After remove duplicates based on filter", nrow(result))) | |
206 | 214 |
207 print(paste("Number of rows in result:", nrow(result))) | 215 print(paste("Number of rows in result:", nrow(result))) |
208 print(paste("Number of rows in unmatched:", nrow(unmatched))) | 216 print(paste("Number of rows in unmatched:", nrow(unmatched))) |
209 | 217 |
210 matched.sequences = result[!grepl("^unmatched", result$best_match),] | 218 matched.sequences = result[!grepl("^unmatched", result$best_match),] |