comparison merge_and_filter.r @ 77:58d2377b507d draft

Uploaded
author davidvanzessen
date Wed, 19 Jun 2019 04:31:44 -0400
parents ba33b94637ca
children aff3ba86ef7a
comparison
equal deleted inserted replaced
76:a93136637bea 77:58d2377b507d
51 mutationanalysis = fix_column_names(mutationanalysis) 51 mutationanalysis = fix_column_names(mutationanalysis)
52 mutationstats = fix_column_names(mutationstats) 52 mutationstats = fix_column_names(mutationstats)
53 hotspots = fix_column_names(hotspots) 53 hotspots = fix_column_names(hotspots)
54 AAs = fix_column_names(AAs) 54 AAs = fix_column_names(AAs)
55 55
56 if(!("Sequence.number" %in% names(summ))){
57 summ["Sequence.number"] = 1:nrow(summ)
58 }
59
56 if(method == "blastn"){ 60 if(method == "blastn"){
57 #"qseqid\tsseqid\tpident\tlength\tmismatch\tgapopen\tqstart\tqend\tsstart\tsend\tevalue\tbitscore" 61 #"qseqid\tsseqid\tpident\tlength\tmismatch\tgapopen\tqstart\tqend\tsstart\tsend\tevalue\tbitscore"
58 gene_identification = gene_identification[!duplicated(gene_identification$qseqid),] 62 gene_identification = gene_identification[!duplicated(gene_identification$qseqid),]
59 ref_length = data.frame(sseqid=c("ca1", "ca2", "cg1", "cg2", "cg3", "cg4", "cm"), ref.length=c(81,81,141,141,141,141,52)) 63 ref_length = data.frame(sseqid=c("ca1", "ca2", "cg1", "cg2", "cg3", "cg4", "cm"), ref.length=c(81,81,141,141,141,141,52))
60 gene_identification = merge(gene_identification, ref_length, by="sseqid", all.x=T) 64 gene_identification = merge(gene_identification, ref_length, by="sseqid", all.x=T)
138 142
139 sequences = sequences[,c("Sequence.ID", "FR1.IMGT", "CDR1.IMGT", "FR2.IMGT", "CDR2.IMGT", "FR3.IMGT", "CDR3.IMGT")] 143 sequences = sequences[,c("Sequence.ID", "FR1.IMGT", "CDR1.IMGT", "FR2.IMGT", "CDR2.IMGT", "FR3.IMGT", "CDR3.IMGT")]
140 names(sequences) = c("Sequence.ID", "FR1.IMGT.seq", "CDR1.IMGT.seq", "FR2.IMGT.seq", "CDR2.IMGT.seq", "FR3.IMGT.seq", "CDR3.IMGT.seq") 144 names(sequences) = c("Sequence.ID", "FR1.IMGT.seq", "CDR1.IMGT.seq", "FR2.IMGT.seq", "CDR2.IMGT.seq", "FR3.IMGT.seq", "CDR3.IMGT.seq")
141 result = merge(result, sequences, by="Sequence.ID", all.x=T) 145 result = merge(result, sequences, by="Sequence.ID", all.x=T)
142 146
143 print("sequences files columns")
144 print("CDR3.IMGT")
145
146 AAs = AAs[,c("Sequence.ID", "CDR3.IMGT")] 147 AAs = AAs[,c("Sequence.ID", "CDR3.IMGT")]
147 names(AAs) = c("Sequence.ID", "CDR3.IMGT.AA") 148 names(AAs) = c("Sequence.ID", "CDR3.IMGT.AA")
148 result = merge(result, AAs, by="Sequence.ID", all.x=T) 149 result = merge(result, AAs, by="Sequence.ID", all.x=T)
149 150
150 print(paste("Number of sequences in result after merging with sequences:", nrow(result))) 151 print(paste("Number of sequences in result after merging with sequences:", nrow(result)))
170 result$best_match = "all" 171 result$best_match = "all"
171 } 172 }
172 173
173 write.table(x=result, file=gsub("merged.txt$", "before_filters.txt", output), sep="\t",quote=F,row.names=F,col.names=T) 174 write.table(x=result, file=gsub("merged.txt$", "before_filters.txt", output), sep="\t",quote=F,row.names=F,col.names=T)
174 175
175 print(paste("Number of empty CDR1 sequences:", sum(result$CDR1.IMGT.seq == "", na.rm=T))) 176 missing.FR1 = result$FR1.IMGT.seq == "" | is.na(result$FR1.IMGT.seq)
176 print(paste("Number of empty FR2 sequences:", sum(result$FR2.IMGT.seq == "", na.rm=T))) 177 missing.CDR1 = result$CDR1.IMGT.seq == "" | is.na(result$CDR1.IMGT.seq)
177 print(paste("Number of empty CDR2 sequences:", sum(result$CDR2.IMGT.seq == "", na.rm=T))) 178 missing.FR2 = result$FR2.IMGT.seq == "" | is.na(result$FR2.IMGT.seq)
178 print(paste("Number of empty FR3 sequences:", sum(result$FR3.IMGT.seq == "", na.rm=T))) 179 missing.CDR2 = result$CDR2.IMGT.seq == "" | is.na(result$CDR2.IMGT.seq)
180 missing.FR3 = result$FR3.IMGT.seq == "" | is.na(result$FR3.IMGT.seq)
181
182 print(paste("Number of empty CDR1 sequences:", sum(missing.FR1)))
183 print(paste("Number of empty FR2 sequences:", sum(missing.CDR1)))
184 print(paste("Number of empty CDR2 sequences:", sum(missing.FR2)))
185 print(paste("Number of empty FR3 sequences:", sum(missing.CDR2)))
186 print(paste("Number of empty FR3 sequences:", sum(missing.FR3)))
179 187
180 if(empty.region.filter == "leader"){ 188 if(empty.region.filter == "leader"){
181 result = result[result$FR1.IMGT.seq != "" & result$CDR1.IMGT.seq != "" & result$FR2.IMGT.seq != "" & result$CDR2.IMGT.seq != "" & result$FR3.IMGT.seq != "", ] 189 result = result[result$FR1.IMGT.seq != "" & result$CDR1.IMGT.seq != "" & result$FR2.IMGT.seq != "" & result$CDR2.IMGT.seq != "" & result$FR3.IMGT.seq != "", ]
182 } else if(empty.region.filter == "FR1"){ 190 } else if(empty.region.filter == "FR1"){
183 result = result[result$CDR1.IMGT.seq != "" & result$FR2.IMGT.seq != "" & result$CDR2.IMGT.seq != "" & result$FR3.IMGT.seq != "", ] 191 result = result[result$CDR1.IMGT.seq != "" & result$FR2.IMGT.seq != "" & result$CDR2.IMGT.seq != "" & result$FR3.IMGT.seq != "", ]
214 result[,col] = as.numeric(result[,col]) 222 result[,col] = as.numeric(result[,col])
215 result[is.na(result[,col]),] = 0 223 result[is.na(result[,col]),] = 0
216 } 224 }
217 225
218 write.table(result, before.unique.file, sep="\t", quote=F,row.names=F,col.names=T) 226 write.table(result, before.unique.file, sep="\t", quote=F,row.names=F,col.names=T)
227
219 228
220 if(filter.unique != "no"){ 229 if(filter.unique != "no"){
221 clmns = names(result) 230 clmns = names(result)
222 if(filter.unique == "remove_vjaa"){ 231 if(filter.unique == "remove_vjaa"){
223 result$unique.def = paste(result$VGene, result$JGene, result$CDR3.IMGT.AA) 232 result$unique.def = paste(result$VGene, result$JGene, result$CDR3.IMGT.AA)
289 298
290 write.table(x=matched.sequences, file=gsub("merged.txt$", "filtered.txt", output), sep="\t",quote=F,row.names=F,col.names=T) 299 write.table(x=matched.sequences, file=gsub("merged.txt$", "filtered.txt", output), sep="\t",quote=F,row.names=F,col.names=T)
291 300
292 matched.sequences.count = nrow(matched.sequences) 301 matched.sequences.count = nrow(matched.sequences)
293 unmatched.sequences.count = sum(grepl("^unmatched", result$best_match)) 302 unmatched.sequences.count = sum(grepl("^unmatched", result$best_match))
303 if(matched.sequences.count <= unmatched.sequences.count){
304 print("WARNING NO MATCHED (SUB)CLASS SEQUENCES!!")
305 }
294 306
295 filtering.steps = rbind(filtering.steps, c("Number of matched sequences", matched.sequences.count)) 307 filtering.steps = rbind(filtering.steps, c("Number of matched sequences", matched.sequences.count))
296 filtering.steps = rbind(filtering.steps, c("Number of unmatched sequences", unmatched.sequences.count)) 308 filtering.steps = rbind(filtering.steps, c("Number of unmatched sequences", unmatched.sequences.count))
297 filtering.steps[,2] = as.numeric(filtering.steps[,2]) 309 filtering.steps[,2] = as.numeric(filtering.steps[,2])
298 filtering.steps$perc = round(filtering.steps[,2] / input.sequence.count * 100, 2) 310 filtering.steps$perc = round(filtering.steps[,2] / input.sequence.count * 100, 2)