Mercurial > repos > davidvanzessen > shm_csr
comparison merge_and_filter.r @ 77:58d2377b507d draft
Uploaded
author | davidvanzessen |
---|---|
date | Wed, 19 Jun 2019 04:31:44 -0400 |
parents | ba33b94637ca |
children | aff3ba86ef7a |
comparison
equal
deleted
inserted
replaced
76:a93136637bea | 77:58d2377b507d |
---|---|
51 mutationanalysis = fix_column_names(mutationanalysis) | 51 mutationanalysis = fix_column_names(mutationanalysis) |
52 mutationstats = fix_column_names(mutationstats) | 52 mutationstats = fix_column_names(mutationstats) |
53 hotspots = fix_column_names(hotspots) | 53 hotspots = fix_column_names(hotspots) |
54 AAs = fix_column_names(AAs) | 54 AAs = fix_column_names(AAs) |
55 | 55 |
56 if(!("Sequence.number" %in% names(summ))){ | |
57 summ["Sequence.number"] = 1:nrow(summ) | |
58 } | |
59 | |
56 if(method == "blastn"){ | 60 if(method == "blastn"){ |
57 #"qseqid\tsseqid\tpident\tlength\tmismatch\tgapopen\tqstart\tqend\tsstart\tsend\tevalue\tbitscore" | 61 #"qseqid\tsseqid\tpident\tlength\tmismatch\tgapopen\tqstart\tqend\tsstart\tsend\tevalue\tbitscore" |
58 gene_identification = gene_identification[!duplicated(gene_identification$qseqid),] | 62 gene_identification = gene_identification[!duplicated(gene_identification$qseqid),] |
59 ref_length = data.frame(sseqid=c("ca1", "ca2", "cg1", "cg2", "cg3", "cg4", "cm"), ref.length=c(81,81,141,141,141,141,52)) | 63 ref_length = data.frame(sseqid=c("ca1", "ca2", "cg1", "cg2", "cg3", "cg4", "cm"), ref.length=c(81,81,141,141,141,141,52)) |
60 gene_identification = merge(gene_identification, ref_length, by="sseqid", all.x=T) | 64 gene_identification = merge(gene_identification, ref_length, by="sseqid", all.x=T) |
138 | 142 |
139 sequences = sequences[,c("Sequence.ID", "FR1.IMGT", "CDR1.IMGT", "FR2.IMGT", "CDR2.IMGT", "FR3.IMGT", "CDR3.IMGT")] | 143 sequences = sequences[,c("Sequence.ID", "FR1.IMGT", "CDR1.IMGT", "FR2.IMGT", "CDR2.IMGT", "FR3.IMGT", "CDR3.IMGT")] |
140 names(sequences) = c("Sequence.ID", "FR1.IMGT.seq", "CDR1.IMGT.seq", "FR2.IMGT.seq", "CDR2.IMGT.seq", "FR3.IMGT.seq", "CDR3.IMGT.seq") | 144 names(sequences) = c("Sequence.ID", "FR1.IMGT.seq", "CDR1.IMGT.seq", "FR2.IMGT.seq", "CDR2.IMGT.seq", "FR3.IMGT.seq", "CDR3.IMGT.seq") |
141 result = merge(result, sequences, by="Sequence.ID", all.x=T) | 145 result = merge(result, sequences, by="Sequence.ID", all.x=T) |
142 | 146 |
143 print("sequences files columns") | |
144 print("CDR3.IMGT") | |
145 | |
146 AAs = AAs[,c("Sequence.ID", "CDR3.IMGT")] | 147 AAs = AAs[,c("Sequence.ID", "CDR3.IMGT")] |
147 names(AAs) = c("Sequence.ID", "CDR3.IMGT.AA") | 148 names(AAs) = c("Sequence.ID", "CDR3.IMGT.AA") |
148 result = merge(result, AAs, by="Sequence.ID", all.x=T) | 149 result = merge(result, AAs, by="Sequence.ID", all.x=T) |
149 | 150 |
150 print(paste("Number of sequences in result after merging with sequences:", nrow(result))) | 151 print(paste("Number of sequences in result after merging with sequences:", nrow(result))) |
170 result$best_match = "all" | 171 result$best_match = "all" |
171 } | 172 } |
172 | 173 |
173 write.table(x=result, file=gsub("merged.txt$", "before_filters.txt", output), sep="\t",quote=F,row.names=F,col.names=T) | 174 write.table(x=result, file=gsub("merged.txt$", "before_filters.txt", output), sep="\t",quote=F,row.names=F,col.names=T) |
174 | 175 |
175 print(paste("Number of empty CDR1 sequences:", sum(result$CDR1.IMGT.seq == "", na.rm=T))) | 176 missing.FR1 = result$FR1.IMGT.seq == "" | is.na(result$FR1.IMGT.seq) |
176 print(paste("Number of empty FR2 sequences:", sum(result$FR2.IMGT.seq == "", na.rm=T))) | 177 missing.CDR1 = result$CDR1.IMGT.seq == "" | is.na(result$CDR1.IMGT.seq) |
177 print(paste("Number of empty CDR2 sequences:", sum(result$CDR2.IMGT.seq == "", na.rm=T))) | 178 missing.FR2 = result$FR2.IMGT.seq == "" | is.na(result$FR2.IMGT.seq) |
178 print(paste("Number of empty FR3 sequences:", sum(result$FR3.IMGT.seq == "", na.rm=T))) | 179 missing.CDR2 = result$CDR2.IMGT.seq == "" | is.na(result$CDR2.IMGT.seq) |
180 missing.FR3 = result$FR3.IMGT.seq == "" | is.na(result$FR3.IMGT.seq) | |
181 | |
182 print(paste("Number of empty CDR1 sequences:", sum(missing.FR1))) | |
183 print(paste("Number of empty FR2 sequences:", sum(missing.CDR1))) | |
184 print(paste("Number of empty CDR2 sequences:", sum(missing.FR2))) | |
185 print(paste("Number of empty FR3 sequences:", sum(missing.CDR2))) | |
186 print(paste("Number of empty FR3 sequences:", sum(missing.FR3))) | |
179 | 187 |
180 if(empty.region.filter == "leader"){ | 188 if(empty.region.filter == "leader"){ |
181 result = result[result$FR1.IMGT.seq != "" & result$CDR1.IMGT.seq != "" & result$FR2.IMGT.seq != "" & result$CDR2.IMGT.seq != "" & result$FR3.IMGT.seq != "", ] | 189 result = result[result$FR1.IMGT.seq != "" & result$CDR1.IMGT.seq != "" & result$FR2.IMGT.seq != "" & result$CDR2.IMGT.seq != "" & result$FR3.IMGT.seq != "", ] |
182 } else if(empty.region.filter == "FR1"){ | 190 } else if(empty.region.filter == "FR1"){ |
183 result = result[result$CDR1.IMGT.seq != "" & result$FR2.IMGT.seq != "" & result$CDR2.IMGT.seq != "" & result$FR3.IMGT.seq != "", ] | 191 result = result[result$CDR1.IMGT.seq != "" & result$FR2.IMGT.seq != "" & result$CDR2.IMGT.seq != "" & result$FR3.IMGT.seq != "", ] |
214 result[,col] = as.numeric(result[,col]) | 222 result[,col] = as.numeric(result[,col]) |
215 result[is.na(result[,col]),] = 0 | 223 result[is.na(result[,col]),] = 0 |
216 } | 224 } |
217 | 225 |
218 write.table(result, before.unique.file, sep="\t", quote=F,row.names=F,col.names=T) | 226 write.table(result, before.unique.file, sep="\t", quote=F,row.names=F,col.names=T) |
227 | |
219 | 228 |
220 if(filter.unique != "no"){ | 229 if(filter.unique != "no"){ |
221 clmns = names(result) | 230 clmns = names(result) |
222 if(filter.unique == "remove_vjaa"){ | 231 if(filter.unique == "remove_vjaa"){ |
223 result$unique.def = paste(result$VGene, result$JGene, result$CDR3.IMGT.AA) | 232 result$unique.def = paste(result$VGene, result$JGene, result$CDR3.IMGT.AA) |
289 | 298 |
290 write.table(x=matched.sequences, file=gsub("merged.txt$", "filtered.txt", output), sep="\t",quote=F,row.names=F,col.names=T) | 299 write.table(x=matched.sequences, file=gsub("merged.txt$", "filtered.txt", output), sep="\t",quote=F,row.names=F,col.names=T) |
291 | 300 |
292 matched.sequences.count = nrow(matched.sequences) | 301 matched.sequences.count = nrow(matched.sequences) |
293 unmatched.sequences.count = sum(grepl("^unmatched", result$best_match)) | 302 unmatched.sequences.count = sum(grepl("^unmatched", result$best_match)) |
303 if(matched.sequences.count <= unmatched.sequences.count){ | |
304 print("WARNING NO MATCHED (SUB)CLASS SEQUENCES!!") | |
305 } | |
294 | 306 |
295 filtering.steps = rbind(filtering.steps, c("Number of matched sequences", matched.sequences.count)) | 307 filtering.steps = rbind(filtering.steps, c("Number of matched sequences", matched.sequences.count)) |
296 filtering.steps = rbind(filtering.steps, c("Number of unmatched sequences", unmatched.sequences.count)) | 308 filtering.steps = rbind(filtering.steps, c("Number of unmatched sequences", unmatched.sequences.count)) |
297 filtering.steps[,2] = as.numeric(filtering.steps[,2]) | 309 filtering.steps[,2] = as.numeric(filtering.steps[,2]) |
298 filtering.steps$perc = round(filtering.steps[,2] / input.sequence.count * 100, 2) | 310 filtering.steps$perc = round(filtering.steps[,2] / input.sequence.count * 100, 2) |