Mercurial > repos > davidvanzessen > shm_csr
diff merge_and_filter.r @ 77:58d2377b507d draft
Uploaded
author | davidvanzessen |
---|---|
date | Wed, 19 Jun 2019 04:31:44 -0400 |
parents | ba33b94637ca |
children | aff3ba86ef7a |
line wrap: on
line diff
--- a/merge_and_filter.r Tue Jun 18 04:47:44 2019 -0400 +++ b/merge_and_filter.r Wed Jun 19 04:31:44 2019 -0400 @@ -53,6 +53,10 @@ hotspots = fix_column_names(hotspots) AAs = fix_column_names(AAs) +if(!("Sequence.number" %in% names(summ))){ + summ["Sequence.number"] = 1:nrow(summ) +} + if(method == "blastn"){ #"qseqid\tsseqid\tpident\tlength\tmismatch\tgapopen\tqstart\tqend\tsstart\tsend\tevalue\tbitscore" gene_identification = gene_identification[!duplicated(gene_identification$qseqid),] @@ -140,9 +144,6 @@ names(sequences) = c("Sequence.ID", "FR1.IMGT.seq", "CDR1.IMGT.seq", "FR2.IMGT.seq", "CDR2.IMGT.seq", "FR3.IMGT.seq", "CDR3.IMGT.seq") result = merge(result, sequences, by="Sequence.ID", all.x=T) -print("sequences files columns") -print("CDR3.IMGT") - AAs = AAs[,c("Sequence.ID", "CDR3.IMGT")] names(AAs) = c("Sequence.ID", "CDR3.IMGT.AA") result = merge(result, AAs, by="Sequence.ID", all.x=T) @@ -172,10 +173,17 @@ write.table(x=result, file=gsub("merged.txt$", "before_filters.txt", output), sep="\t",quote=F,row.names=F,col.names=T) -print(paste("Number of empty CDR1 sequences:", sum(result$CDR1.IMGT.seq == "", na.rm=T))) -print(paste("Number of empty FR2 sequences:", sum(result$FR2.IMGT.seq == "", na.rm=T))) -print(paste("Number of empty CDR2 sequences:", sum(result$CDR2.IMGT.seq == "", na.rm=T))) -print(paste("Number of empty FR3 sequences:", sum(result$FR3.IMGT.seq == "", na.rm=T))) +missing.FR1 = result$FR1.IMGT.seq == "" | is.na(result$FR1.IMGT.seq) +missing.CDR1 = result$CDR1.IMGT.seq == "" | is.na(result$CDR1.IMGT.seq) +missing.FR2 = result$FR2.IMGT.seq == "" | is.na(result$FR2.IMGT.seq) +missing.CDR2 = result$CDR2.IMGT.seq == "" | is.na(result$CDR2.IMGT.seq) +missing.FR3 = result$FR3.IMGT.seq == "" | is.na(result$FR3.IMGT.seq) + +print(paste("Number of empty CDR1 sequences:", sum(missing.FR1))) +print(paste("Number of empty FR2 sequences:", sum(missing.CDR1))) +print(paste("Number of empty CDR2 sequences:", sum(missing.FR2))) +print(paste("Number of empty FR3 sequences:", sum(missing.CDR2))) +print(paste("Number of empty FR3 sequences:", sum(missing.FR3))) if(empty.region.filter == "leader"){ result = result[result$FR1.IMGT.seq != "" & result$CDR1.IMGT.seq != "" & result$FR2.IMGT.seq != "" & result$CDR2.IMGT.seq != "" & result$FR3.IMGT.seq != "", ] @@ -217,6 +225,7 @@ write.table(result, before.unique.file, sep="\t", quote=F,row.names=F,col.names=T) + if(filter.unique != "no"){ clmns = names(result) if(filter.unique == "remove_vjaa"){ @@ -291,6 +300,9 @@ matched.sequences.count = nrow(matched.sequences) unmatched.sequences.count = sum(grepl("^unmatched", result$best_match)) +if(matched.sequences.count <= unmatched.sequences.count){ + print("WARNING NO MATCHED (SUB)CLASS SEQUENCES!!") +} filtering.steps = rbind(filtering.steps, c("Number of matched sequences", matched.sequences.count)) filtering.steps = rbind(filtering.steps, c("Number of unmatched sequences", unmatched.sequences.count))