Mercurial > repos > davidvanzessen > shm_csr
diff merge_and_filter.r @ 0:c33d93683a09 draft
Uploaded
author | davidvanzessen |
---|---|
date | Thu, 13 Oct 2016 10:52:24 -0400 |
parents | |
children | faae21ba5c63 |
line wrap: on
line diff
# merge_and_filter.r
# (diff header of this revision: --- /dev/null  +++ b/merge_and_filter.r
#  @@ -0,0 +1,225 @@, i.e. the file is newly added)
#
# Merges the summary table with the gene identification, mutation analysis,
# mutation stats, hotspots and sequences tables (all keyed on "Sequence.ID"),
# applies a series of filters, and writes:
#   * `output`                 - the merged result (matched and unmatched rows)
#   * `output` with "merged.txt" replaced by "filtered.txt"
#                              - only rows whose best_match is not "unmatched"
#   * `before.unique.file`     - the result just before the unique filter
#   * `unmatchedfile`          - rows below the class thresholds
#   * `unmatchedfile` with "unmatched" replaced by "filtering_steps"
#                              - row counts after every filtering step
#
# Positional command-line arguments:
#    1 summaryfile               9 unmatchedfile
#    2 sequencesfile            10 method ("blastn" triggers hit post-processing)
#    3 mutationanalysisfile     11 functionality ("productive", "unproductive",
#    4 mutationstatsfile           "remove_unknown"; anything else = no filter)
#    5 hotspotsfile             12 unique.type (comma separated column names)
#    6 gene_identification_file 13 filter.unique ("no" disables; may contain
#    7 output                      "keep" and/or "_c")
#    8 before.unique.file       14 class.filter ("<chunk pct>_<nt pct>")
#                               15 empty.region.filter ("FR1", "CDR1" or "FR2")

args <- commandArgs(trailingOnly = TRUE)

if (length(args) < 15) {
  stop("Expected 15 arguments, got ", length(args))
}

summaryfile <- args[1]
sequencesfile <- args[2]
mutationanalysisfile <- args[3]
mutationstatsfile <- args[4]
hotspotsfile <- args[5]
gene_identification_file <- args[6]
output <- args[7]
before.unique.file <- args[8]
unmatchedfile <- args[9]
method <- args[10]
functionality <- args[11]
unique.type <- args[12]
filter.unique <- args[13]
class.filter <- args[14]
empty.region.filter <- args[15]

# ---- helpers ---------------------------------------------------------------

# Read a tab separated table with a header, without quote handling.
read_tsv <- function(path) {
  read.table(path, header = TRUE, sep = "\t", fill = TRUE,
             stringsAsFactors = FALSE, quote = "")
}

# Inner-join `extra` onto `base` by Sequence.ID, first dropping every column of
# `extra` that `base` already has (the first column of `base` is the key).
# drop = FALSE keeps `extra` a data frame even if a single column remains.
merge_new_columns <- function(base, extra) {
  keep <- !(names(extra) %in% names(base)[-1])
  merge(base, extra[, keep, drop = FALSE], by = "Sequence.ID")
}

# Append one (label, count) row to the filtering steps table.
add_step <- function(steps, label, count) {
  rbind(steps, data.frame(step = label, count = as.character(count),
                          stringsAsFactors = FALSE))
}

# TRUE for rows where every listed column is a non-empty string.
regions_present <- function(data, columns) {
  Reduce(`&`, lapply(data[columns], function(s) s != ""))
}

# TRUE for rows where any listed column contains an "n" or "N".
contains_n <- function(data, columns) {
  Reduce(`|`, lapply(data[columns], function(s) grepl("n|N", s)))
}

# ---- input -----------------------------------------------------------------

summ <- read_tsv(summaryfile)
sequences <- read_tsv(sequencesfile)
mutationanalysis <- read_tsv(mutationanalysisfile)
mutationstats <- read_tsv(mutationstatsfile)
hotspots <- read_tsv(hotspotsfile)
gene_identification <- read_tsv(gene_identification_file)

if (method == "blastn") {
  # Expected columns of the blastn table:
  # qseqid sseqid pident length mismatch gapopen qstart qend sstart send
  # evalue bitscore
  # Only the first hit per query sequence is kept.
  gene_identification <-
    gene_identification[!duplicated(gene_identification$qseqid), ]
  ref_length <- data.frame(
    sseqid = c("ca1", "ca2", "cg1", "cg2", "cg3", "cg4", "cm"),
    ref.length = c(81, 81, 141, 141, 141, 141, 52),
    stringsAsFactors = FALSE
  )
  # all.x keeps hits against subjects that are not in ref_length; their
  # ref.length (and therefore chunk_hit_percentage) is NA.
  gene_identification <- merge(gene_identification, ref_length,
                               by = "sseqid", all.x = TRUE)
  gene_identification$chunk_hit_percentage <-
    (gene_identification$length / gene_identification$ref.length) * 100
  gene_identification <- gene_identification[, c("qseqid",
                                                 "chunk_hit_percentage",
                                                 "pident", "qstart", "sseqid")]
  colnames(gene_identification) <- c("Sequence.ID", "chunk_hit_percentage",
                                     "nt_hit_percentage", "start_locations",
                                     "best_match")
}
# For any other method the gene identification file is used as read; the code
# below needs it to provide the five columns named above.

input.sequence.count <- nrow(summ)
print(paste("Number of sequences in summary file:", input.sequence.count))

# Two character columns (label, count); the count is converted to numeric and
# a percentage column is added right before the table is written.
filtering.steps <- data.frame(step = "Input",
                              count = as.character(input.sequence.count),
                              stringsAsFactors = FALSE)

# ---- functionality filter --------------------------------------------------

summ <- merge(summ, gene_identification, by = "Sequence.ID")

summ <- summ[summ$Functionality != "No results", ]

print(paste("Number of sequences after 'No results' filter:", nrow(summ)))

filtering.steps <- add_step(filtering.steps, "After 'No results' filter",
                            nrow(summ))

if (functionality == "productive") {
  summ <- summ[summ$Functionality == "productive (see comment)" |
                 summ$Functionality == "productive", ]
} else if (functionality == "unproductive") {
  summ <- summ[summ$Functionality == "unproductive (see comment)" |
                 summ$Functionality == "unproductive", ]
} else if (functionality == "remove_unknown") {
  summ <- summ[summ$Functionality != "No results" &
                 summ$Functionality != "unknown (see comment)" &
                 summ$Functionality != "unknown", ]
}

print(paste("Number of sequences after productive filter:", nrow(summ)))

filtering.steps <- add_step(filtering.steps, "After productive filter",
                            nrow(summ))

# ---- class (hit percentage) thresholds -------------------------------------

splt <- strsplit(class.filter, "_")[[1]]
chunk_hit_threshold <- as.numeric(splt[1])
nt_hit_threshold <- as.numeric(splt[2])

higher_than <- (summ$chunk_hit_percentage >= chunk_hit_threshold &
                  summ$nt_hit_percentage >= nt_hit_threshold)

unmatched.columns <- c("Sequence.ID", "chunk_hit_percentage",
                       "nt_hit_percentage", "start_locations", "best_match")
unmatched <- summ[NULL, unmatched.columns]

# which() drops NA comparisons. A logical index containing NA cannot be used
# for the data frame assignment below, and rows with an unknown percentage are
# not reported as unmatched (same as the previous all(..., na.rm) check).
below.threshold <- which(!higher_than)

if (length(below.threshold) > 0) {
  unmatched <- summ[below.threshold, unmatched.columns]
  unmatched$best_match <- paste("unmatched,", unmatched$best_match)
  # Rows below the thresholds stay in summ; they are only tagged here and
  # split off when the filtered output is written.
  summ[below.threshold, "best_match"] <-
    paste("unmatched,", summ[below.threshold, "best_match"])
}

if (nrow(summ) == 0) {
  stop("No data remaining after filter")
}

# ---- merge the remaining tables --------------------------------------------

result <- merge_new_columns(summ, mutationanalysis)

print(paste("Number of sequences after merging with mutation analysis file:", nrow(result)))

result <- merge_new_columns(result, mutationstats)

print(paste("Number of sequences after merging with mutation stats file:", nrow(result)))

result <- merge_new_columns(result, hotspots)

print(paste("Number of sequences after merging with hotspots file:", nrow(result)))

sequences <- sequences[, c("Sequence.ID", "FR1.IMGT", "CDR1.IMGT", "FR2.IMGT",
                           "CDR2.IMGT", "FR3.IMGT", "CDR3.IMGT")]
names(sequences) <- c("Sequence.ID", "FR1.IMGT.seq", "CDR1.IMGT.seq",
                      "FR2.IMGT.seq", "CDR2.IMGT.seq", "FR3.IMGT.seq",
                      "CDR3.IMGT.seq")
result <- merge(result, sequences, by = "Sequence.ID", all.x = TRUE)

print(paste("Number of sequences in result after merging with sequences:", nrow(result)))

# Gene names: strip the "Homsap " prefix and everything from the "*" on.
result$VGene <- gsub("^Homsap ", "", result$V.GENE.and.allele)
result$VGene <- gsub("[*].*", "", result$VGene)
result$DGene <- gsub("^Homsap ", "", result$D.GENE.and.allele)
result$DGene <- gsub("[*].*", "", result$DGene)
result$JGene <- gsub("^Homsap ", "", result$J.GENE.and.allele)
result$JGene <- gsub("[*].*", "", result$JGene)

# ---- duplicate filter on the user selected columns -------------------------

duplicate.key <- do.call(
  paste,
  c(result[unlist(strsplit(unique.type, ","))], sep = ":")
)
result <- result[!duplicated(duplicate.key), ]

print(paste("Number of sequences in result after", unique.type, "filtering:", nrow(result)))

filtering.steps <- add_step(filtering.steps, "After duplicate filter",
                            nrow(result))

# ---- empty region filter ---------------------------------------------------

print(paste("Number of empty CDR1 sequences:", sum(result$CDR1.IMGT.seq == "")))
print(paste("Number of empty FR2 sequences:", sum(result$FR2.IMGT.seq == "")))
print(paste("Number of empty CDR2 sequences:", sum(result$CDR2.IMGT.seq == "")))
print(paste("Number of empty FR3 sequences:", sum(result$FR3.IMGT.seq == "")))

if (empty.region.filter == "FR1") {
  result <- result[regions_present(result, c("CDR1.IMGT.seq", "FR2.IMGT.seq",
                                             "CDR2.IMGT.seq",
                                             "FR3.IMGT.seq")), ]
  print(paste("Number of sequences after empty CDR1, FR2, CDR2 and FR3 column filter:", nrow(result)))
  filtering.steps <- add_step(filtering.steps,
                              "After empty CDR1, FR2, CDR2, FR3 filter",
                              nrow(result))
} else if (empty.region.filter == "CDR1") {
  result <- result[regions_present(result, c("FR2.IMGT.seq", "CDR2.IMGT.seq",
                                             "FR3.IMGT.seq")), ]
  print(paste("Number of sequences after empty FR2, CDR2 and FR3 column filter:", nrow(result)))
  filtering.steps <- add_step(filtering.steps,
                              "After empty FR2, CDR2, FR3 filter",
                              nrow(result))
} else if (empty.region.filter == "FR2") {
  result <- result[regions_present(result, c("CDR2.IMGT.seq",
                                             "FR3.IMGT.seq")), ]
  print(paste("Number of sequences after empty CDR2 and FR3 column filter:", nrow(result)))
  filtering.steps <- add_step(filtering.steps,
                              "After empty CDR2, FR3 filter",
                              nrow(result))
}

# ---- N filter: drop sequences with an n/N in the regions of interest -------

if (empty.region.filter == "FR1") {
  result <- result[!contains_n(result, c("FR2.IMGT.seq", "FR3.IMGT.seq",
                                         "CDR1.IMGT.seq", "CDR2.IMGT.seq",
                                         "CDR3.IMGT.seq")), ]
} else if (empty.region.filter == "CDR1") {
  result <- result[!contains_n(result, c("FR2.IMGT.seq", "FR3.IMGT.seq",
                                         "CDR2.IMGT.seq",
                                         "CDR3.IMGT.seq")), ]
} else if (empty.region.filter == "FR2") {
  # NOTE(review): CDR2 is checked by the empty region filter for "FR2" but not
  # here - confirm whether that is intentional.
  result <- result[!contains_n(result, c("FR3.IMGT.seq",
                                         "CDR3.IMGT.seq")), ]
}

print(paste("Number of sequences in result after n filtering:", nrow(result)))
filtering.steps <- add_step(filtering.steps, "After N filter", nrow(result))

# ---- mutation count clean-up -----------------------------------------------

cleanup_columns <- c("FR1.IMGT.Nb.of.mutations",
                     "CDR1.IMGT.Nb.of.mutations",
                     "FR2.IMGT.Nb.of.mutations",
                     "CDR2.IMGT.Nb.of.mutations",
                     "FR3.IMGT.Nb.of.mutations")

for (col in cleanup_columns) {
  # Remove a parenthesised suffix, e.g. "3 (1)" -> "3 ", then convert.
  result[, col] <- gsub("\\(.*\\)", "", result[, col])
  result[, col] <- as.numeric(result[, col])
  # Only the count that could not be parsed becomes 0; the rest of the row
  # (IDs, sequences, other counts) must be left untouched.
  result[is.na(result[, col]), col] <- 0
}

write.table(result, before.unique.file, sep = "\t", quote = FALSE,
            row.names = FALSE, col.names = TRUE)

# ---- unique filter ---------------------------------------------------------

if (filter.unique != "no") {
  # unique.def is the concatenation of the region sequences that follow the
  # region chosen with empty.region.filter. The column is kept in the output.
  if (empty.region.filter == "FR1") {
    result$unique.def <- paste(result$CDR1.IMGT.seq, result$FR2.IMGT.seq,
                               result$CDR2.IMGT.seq, result$FR3.IMGT.seq,
                               result$CDR3.IMGT.seq)
  } else if (empty.region.filter == "CDR1") {
    result$unique.def <- paste(result$FR2.IMGT.seq, result$CDR2.IMGT.seq,
                               result$FR3.IMGT.seq, result$CDR3.IMGT.seq)
  } else if (empty.region.filter == "FR2") {
    result$unique.def <- paste(result$CDR2.IMGT.seq, result$FR3.IMGT.seq,
                               result$CDR3.IMGT.seq)
  }

  if (grepl("_c", filter.unique)) {
    result$unique.def <- paste(result$unique.def, result$best_match)
  }

  if (grepl("keep", filter.unique)) {
    # keep the unique sequences that are in multiple classes
    result$unique.def <- paste(result$unique.def, result$best_match)
    result <- result[!duplicated(result$unique.def), ]
  } else {
    # Keep only sequences that occur more than once ...
    result <- result[duplicated(result$unique.def) |
                       duplicated(result$unique.def, fromLast = TRUE), ]
    # ... and then one representative per sequence and class.
    result$unique.def <- paste(result$unique.def, result$best_match)
    result <- result[!duplicated(result$unique.def), ]
  }
}

print(paste("Number of sequences in result after CDR/FR filtering:", nrow(result)))
print(paste("Number of matched sequences in result after CDR/FR filtering:", sum(!grepl("unmatched", result$best_match))))

filtering.steps <- add_step(filtering.steps, "After unique filter",
                            nrow(result))

print(paste("Number of rows in result:", nrow(result)))
print(paste("Number of rows in unmatched:", nrow(unmatched)))

# ---- output ----------------------------------------------------------------

matched.sequences <- result[!grepl("^unmatched", result$best_match), ]

write.table(x = matched.sequences,
            file = gsub("merged.txt$", "filtered.txt", output),
            sep = "\t", quote = FALSE, row.names = FALSE, col.names = TRUE)

matched.sequences.count <- nrow(matched.sequences)
unmatched.sequences.count <- sum(grepl("^unmatched", result$best_match))

filtering.steps <- add_step(filtering.steps, "Number of matched sequences",
                            matched.sequences.count)
filtering.steps <- add_step(filtering.steps, "Number of unmatched sequences",
                            unmatched.sequences.count)
filtering.steps[, 2] <- as.numeric(filtering.steps[, 2])
filtering.steps$perc <- round(filtering.steps[, 2] / input.sequence.count * 100,
                              2)

write.table(x = filtering.steps,
            file = gsub("unmatched", "filtering_steps", unmatchedfile),
            sep = "\t", quote = FALSE, row.names = FALSE, col.names = FALSE)

write.table(x = result, file = output, sep = "\t", quote = FALSE,
            row.names = FALSE, col.names = TRUE)
write.table(x = unmatched, file = unmatchedfile, sep = "\t", quote = FALSE,
            row.names = FALSE, col.names = TRUE)