diff merge_and_filter.r @ 77:58d2377b507d draft

Uploaded
author davidvanzessen
date Wed, 19 Jun 2019 04:31:44 -0400
parents ba33b94637ca
children aff3ba86ef7a
line wrap: on
line diff
--- a/merge_and_filter.r	Tue Jun 18 04:47:44 2019 -0400
+++ b/merge_and_filter.r	Wed Jun 19 04:31:44 2019 -0400
@@ -53,6 +53,10 @@
 hotspots = fix_column_names(hotspots)
 AAs = fix_column_names(AAs)
 
+if(!("Sequence.number" %in% names(summ))){ 
+	summ["Sequence.number"] = 1:nrow(summ)
+}
+
 if(method == "blastn"){
 	#"qseqid\tsseqid\tpident\tlength\tmismatch\tgapopen\tqstart\tqend\tsstart\tsend\tevalue\tbitscore"
 	gene_identification = gene_identification[!duplicated(gene_identification$qseqid),]
@@ -140,9 +144,6 @@
 names(sequences) = c("Sequence.ID", "FR1.IMGT.seq", "CDR1.IMGT.seq", "FR2.IMGT.seq", "CDR2.IMGT.seq", "FR3.IMGT.seq", "CDR3.IMGT.seq")
 result = merge(result, sequences, by="Sequence.ID", all.x=T)
 
-print("sequences files columns")
-print("CDR3.IMGT")
-
 AAs = AAs[,c("Sequence.ID", "CDR3.IMGT")]
 names(AAs) = c("Sequence.ID", "CDR3.IMGT.AA")
 result = merge(result, AAs, by="Sequence.ID", all.x=T)
@@ -172,10 +173,17 @@
 
 write.table(x=result, file=gsub("merged.txt$", "before_filters.txt", output), sep="\t",quote=F,row.names=F,col.names=T)
 
-print(paste("Number of empty CDR1 sequences:", sum(result$CDR1.IMGT.seq == "", na.rm=T)))
-print(paste("Number of empty FR2 sequences:", sum(result$FR2.IMGT.seq == "", na.rm=T)))
-print(paste("Number of empty CDR2 sequences:", sum(result$CDR2.IMGT.seq == "", na.rm=T)))
-print(paste("Number of empty FR3 sequences:", sum(result$FR3.IMGT.seq == "", na.rm=T)))
+missing.FR1 = result$FR1.IMGT.seq == "" | is.na(result$FR1.IMGT.seq)
+missing.CDR1 = result$CDR1.IMGT.seq == "" | is.na(result$CDR1.IMGT.seq)
+missing.FR2 = result$FR2.IMGT.seq == "" | is.na(result$FR2.IMGT.seq)
+missing.CDR2 = result$CDR2.IMGT.seq == "" | is.na(result$CDR2.IMGT.seq)
+missing.FR3 = result$FR3.IMGT.seq == "" | is.na(result$FR3.IMGT.seq)
+
+print(paste("Number of empty CDR1 sequences:", sum(missing.FR1)))
+print(paste("Number of empty FR2 sequences:", sum(missing.CDR1)))
+print(paste("Number of empty CDR2 sequences:", sum(missing.FR2)))
+print(paste("Number of empty FR3 sequences:", sum(missing.CDR2)))
+print(paste("Number of empty FR3 sequences:", sum(missing.FR3)))
 
 if(empty.region.filter == "leader"){
 	result = result[result$FR1.IMGT.seq != "" & result$CDR1.IMGT.seq != "" & result$FR2.IMGT.seq != "" & result$CDR2.IMGT.seq != "" & result$FR3.IMGT.seq != "", ]
@@ -217,6 +225,7 @@
 
 write.table(result, before.unique.file, sep="\t", quote=F,row.names=F,col.names=T)
 
+
 if(filter.unique != "no"){
 	clmns = names(result)
 	if(filter.unique == "remove_vjaa"){
@@ -291,6 +300,9 @@
 
 matched.sequences.count = nrow(matched.sequences)
 unmatched.sequences.count = sum(grepl("^unmatched", result$best_match))
+if(matched.sequences.count <= unmatched.sequences.count){
+	print("WARNING NO MATCHED (SUB)CLASS SEQUENCES!!")
+}
 
 filtering.steps = rbind(filtering.steps, c("Number of matched sequences", matched.sequences.count))
 filtering.steps = rbind(filtering.steps, c("Number of unmatched sequences", unmatched.sequences.count))