annotate merge_and_filter.r @ 81:b6f9a640e098 draft

Uploaded
author davidvanzessen
date Fri, 19 Feb 2021 15:10:54 +0000
parents
children 729738462297
Ignore whitespace changes - Everywhere: Within whitespace: At end of lines:
rev   line source
81
b6f9a640e098 Uploaded
davidvanzessen
parents:
diff changeset
# Positional command-line arguments (everything after the script name).
args <- commandArgs(trailingOnly = TRUE)

# Arguments 1-7: paths of the tab-separated input tables loaded below.
summaryfile <- args[1]
sequencesfile <- args[2]
mutationanalysisfile <- args[3]
mutationstatsfile <- args[4]
hotspotsfile <- args[5]
aafile <- args[6]
gene_identification_file <- args[7]

# Arguments 8-10: output paths. Further output file names are derived from
# these later in the script with gsub().
output <- args[8]
before.unique.file <- args[9]
unmatchedfile <- args[10]

# Arguments 11-17: filter settings; only the unique count is numeric.
method <- args[11]
functionality <- args[12]
unique.type <- args[13]
filter.unique <- args[14]
filter.unique.count <- as.numeric(args[15])
class.filter <- args[16]
empty.region.filter <- args[17]

print(paste("filter.unique.count:", filter.unique.count))

# Every input table is read the same way: tab separated, with a header,
# short rows padded (fill), quoting disabled and strings kept as character.
read_input_table <- function(path) {
  read.table(path, header = TRUE, sep = "\t", fill = TRUE,
             stringsAsFactors = FALSE, quote = "")
}

summ <- read_input_table(summaryfile)
sequences <- read_input_table(sequencesfile)
mutationanalysis <- read_input_table(mutationanalysisfile)
mutationstats <- read_input_table(mutationstatsfile)
hotspots <- read_input_table(hotspotsfile)
AAs <- read_input_table(aafile)
gene_identification <- read_input_table(gene_identification_file)
b6f9a640e098 Uploaded
davidvanzessen
parents:
diff changeset
31
b6f9a640e098 Uploaded
davidvanzessen
parents:
diff changeset
# Rename the "V.DOMAIN."-prefixed functionality columns of `df` to the short
# names ("Functionality", "Functionality.comment") used by the rest of the
# script. Columns that are not present are left untouched; one message is
# printed per renamed column. Returns the (possibly renamed) data frame.
fix_column_names <- function(df) {
  renames <- c(
    "V.DOMAIN.Functionality" = "Functionality",
    "V.DOMAIN.Functionality.comment" = "Functionality.comment"
  )
  for (old.name in names(renames)) {
    if (old.name %in% names(df)) {
      names(df)[names(df) == old.name] <- renames[[old.name]]
      print(paste0("found ", old.name, ", changed"))
    }
  }
  df
}
b6f9a640e098 Uploaded
davidvanzessen
parents:
diff changeset
43
b6f9a640e098 Uploaded
davidvanzessen
parents:
diff changeset
# Make the Sequence.ID column of `df` unique by appending each row's
# position (separated by a space) to its ID. Returns the modified data frame.
#
# seq_len() is used instead of 1:nrow(df) so that a zero-row data frame is
# returned unchanged; 1:0 would produce two values and make the assignment
# fail.
fix_non_unique_ids <- function(df) {
  df$Sequence.ID <- paste(df$Sequence.ID, seq_len(nrow(df)))
  df
}
b6f9a640e098 Uploaded
davidvanzessen
parents:
diff changeset
48
b6f9a640e098 Uploaded
davidvanzessen
parents:
diff changeset
# Give every input table the same functionality column names.
summ <- fix_column_names(summ)
sequences <- fix_column_names(sequences)
mutationanalysis <- fix_column_names(mutationanalysis)
mutationstats <- fix_column_names(mutationstats)
hotspots <- fix_column_names(hotspots)
AAs <- fix_column_names(AAs)

# For blastn input, reshape the hit table into the five columns the rest of
# the script expects (Sequence.ID, chunk_hit_percentage, nt_hit_percentage,
# start_locations, best_match).
if (method == "blastn") {
  #"qseqid\tsseqid\tpident\tlength\tmismatch\tgapopen\tqstart\tqend\tsstart\tsend\tevalue\tbitscore"

  # Keep only the first row listed for each query sequence.
  gene_identification <- gene_identification[!duplicated(gene_identification$qseqid), ]

  # Per-subject reference lengths; subjects not listed here get NA after the
  # left join (all.x), and therefore an NA chunk_hit_percentage.
  ref_length <- data.frame(
    sseqid = c("ca1", "ca2", "cg1", "cg2", "cg3", "cg4", "cm"),
    ref.length = c(81, 81, 141, 141, 141, 141, 52)
  )
  gene_identification <- merge(gene_identification, ref_length, by = "sseqid", all.x = TRUE)

  # Alignment length as a percentage of the reference length.
  gene_identification$chunk_hit_percentage <-
    (gene_identification$length / gene_identification$ref.length) * 100

  gene_identification <- gene_identification[
    , c("qseqid", "chunk_hit_percentage", "pident", "qstart", "sseqid")
  ]
  colnames(gene_identification) <- c(
    "Sequence.ID", "chunk_hit_percentage", "nt_hit_percentage",
    "start_locations", "best_match"
  )
}
b6f9a640e098 Uploaded
davidvanzessen
parents:
diff changeset
65
b6f9a640e098 Uploaded
davidvanzessen
parents:
diff changeset
66 #print("Summary analysis files columns")
b6f9a640e098 Uploaded
davidvanzessen
parents:
diff changeset
67 #print(names(summ))
b6f9a640e098 Uploaded
davidvanzessen
parents:
diff changeset
68
b6f9a640e098 Uploaded
davidvanzessen
parents:
diff changeset
69
b6f9a640e098 Uploaded
davidvanzessen
parents:
diff changeset
70
b6f9a640e098 Uploaded
davidvanzessen
parents:
diff changeset
# Size of the raw input; also the denominator of the percentages written to
# the filtering-steps file at the end of the script.
input.sequence.count <- nrow(summ)
print(paste("Number of sequences in summary file:", input.sequence.count))

# Running log of (step label, remaining sequence count). Both columns are
# held as character while rows are appended.
filtering.steps <- data.frame(character(0), numeric(0))
filtering.steps <- rbind(filtering.steps, c("Input", input.sequence.count))
filtering.steps[, 1] <- as.character(filtering.steps[, 1])
filtering.steps[, 2] <- as.character(filtering.steps[, 2])

# Inner join: sequences without a gene identification row are dropped.
summ <- merge(summ, gene_identification, by = "Sequence.ID")
print(paste("Number of sequences after merging with gene identification:", nrow(summ)))

summ <- summ[summ$Functionality != "No results", ]
print(paste("Number of sequences after 'No results' filter:", nrow(summ)))
filtering.steps <- rbind(filtering.steps, c("After 'No results' filter", nrow(summ)))

# Optional functionality filter; any other setting keeps all rows.
if (functionality == "productive") {
  summ <- summ[
    summ$Functionality == "productive" |
      summ$Functionality == "productive (see comment)",
  ]
} else if (functionality == "unproductive") {
  summ <- summ[
    summ$Functionality == "unproductive" |
      summ$Functionality == "unproductive (see comment)",
  ]
} else if (functionality == "remove_unknown") {
  summ <- summ[
    !(summ$Functionality == "No results" |
        summ$Functionality == "unknown (see comment)" |
        summ$Functionality == "unknown"),
  ]
}

print(paste("Number of sequences after functionality filter:", nrow(summ)))
filtering.steps <- rbind(filtering.steps, c("After functionality filter", nrow(summ)))
b6f9a640e098 Uploaded
davidvanzessen
parents:
diff changeset
106
b6f9a640e098 Uploaded
davidvanzessen
parents:
diff changeset
# Debug aid, disabled by the constant FALSE guard: when enabled it keeps a
# reproducible 3% random sample of the sequences to speed up test runs.
if (FALSE) {
  set.seed(1)
  summ <- summ[sample(nrow(summ), floor(nrow(summ) * 0.03)), ]
  print(paste("Number of sequences after sampling 3%:", nrow(summ)))

  filtering.steps <- rbind(
    filtering.steps,
    c("Number of sequences after sampling 3%", nrow(summ))
  )
}
b6f9a640e098 Uploaded
davidvanzessen
parents:
diff changeset
114
b6f9a640e098 Uploaded
davidvanzessen
parents:
diff changeset
# Build `result` by joining the other tables onto the filtered summary, all
# keyed on Sequence.ID. Before each of the first three joins, columns whose
# names already exist on the left-hand side are dropped from the right-hand
# table; the [-1] skips the first left-hand name so that one column survives
# (presumably Sequence.ID, the merge key - TODO confirm).

print("mutation analysis files columns")
mutationanalysis.extra <- mutationanalysis[, !(names(mutationanalysis) %in% names(summ)[-1])]
print(names(mutationanalysis.extra))
result <- merge(summ, mutationanalysis.extra, by = "Sequence.ID")
print(paste("Number of sequences after merging with mutation analysis file:", nrow(result)))

result <- merge(
  result,
  mutationstats[, !(names(mutationstats) %in% names(result)[-1])],
  by = "Sequence.ID"
)
print(paste("Number of sequences after merging with mutation stats file:", nrow(result)))

print("hotspots files columns")
hotspots.extra <- hotspots[, !(names(hotspots) %in% names(result)[-1])]
print(names(hotspots.extra))
result <- merge(result, hotspots.extra, by = "Sequence.ID")
print(paste("Number of sequences after merging with hotspots file:", nrow(result)))

# Region sequences: keep the six region columns and add a ".seq" suffix to
# their names. Left join, so rows of `result` without sequence data are kept.
print("sequences files columns")
print(c("FR1.IMGT", "CDR1.IMGT", "FR2.IMGT", "CDR2.IMGT", "FR3.IMGT", "CDR3.IMGT"))

sequences <- sequences[
  , c("Sequence.ID", "FR1.IMGT", "CDR1.IMGT", "FR2.IMGT", "CDR2.IMGT", "FR3.IMGT", "CDR3.IMGT")
]
names(sequences) <- c(
  "Sequence.ID", "FR1.IMGT.seq", "CDR1.IMGT.seq", "FR2.IMGT.seq",
  "CDR2.IMGT.seq", "FR3.IMGT.seq", "CDR3.IMGT.seq"
)
result <- merge(result, sequences, by = "Sequence.ID", all.x = TRUE)

# From the AA table only the CDR3 column is used, stored as CDR3.IMGT.AA
# (left join as well).
AAs <- AAs[, c("Sequence.ID", "CDR3.IMGT")]
names(AAs) <- c("Sequence.ID", "CDR3.IMGT.AA")
result <- merge(result, AAs, by = "Sequence.ID", all.x = TRUE)

print(paste("Number of sequences in result after merging with sequences:", nrow(result)))
b6f9a640e098 Uploaded
davidvanzessen
parents:
diff changeset
148
b6f9a640e098 Uploaded
davidvanzessen
parents:
diff changeset
# Derive short V/D/J gene names from the "...GENE.and.allele" columns: strip
# a leading "Homsap " and then everything from the first "*" onwards.
result$VGene <- gsub("[*].*", "", gsub("^Homsap ", "", result$V.GENE.and.allele))
result$DGene <- gsub("[*].*", "", gsub("^Homsap ", "", result$D.GENE.and.allele))
result$JGene <- gsub("[*].*", "", gsub("^Homsap ", "", result$J.GENE.and.allele))
b6f9a640e098 Uploaded
davidvanzessen
parents:
diff changeset
155
b6f9a640e098 Uploaded
davidvanzessen
parents:
diff changeset
# class.filter has the form "<chunk threshold>_<nt threshold>": two numbers
# joined by an underscore.
splt <- strsplit(class.filter, "_")[[1]]
chunk_hit_threshold <- as.numeric(splt[1])
nt_hit_threshold <- as.numeric(splt[2])

# TRUE where both hit percentages reach their threshold; NA where either
# comparison involves a missing percentage.
higher_than <- (result$chunk_hit_percentage >= chunk_hit_threshold &
                  result$nt_hit_percentage >= nt_hit_threshold)

# Prefix the best match of every sequence that is known to fall below a
# threshold with "unmatched,". which() yields only the definite FALSE
# positions: a logical index containing NA is rejected by data-frame
# subscripted assignment, so rows with a missing percentage are left
# unlabelled, exactly as they are when every other row passes.
below_threshold <- which(!higher_than)
if (length(below_threshold) > 0) {
  result[below_threshold, "best_match"] <-
    paste("unmatched,", result[below_threshold, "best_match"])
}

# The special setting "101_101" puts every sequence into one class "all".
# rep() keeps the assignment valid when `result` has no rows.
if (class.filter == "101_101") {
  result$best_match <- rep("all", nrow(result))
}
b6f9a640e098 Uploaded
davidvanzessen
parents:
diff changeset
169
b6f9a640e098 Uploaded
davidvanzessen
parents:
diff changeset
# Snapshot of the merged table before the region/N/unique filters; the file
# name is the main output path with a trailing "merged.txt" swapped for
# "before_filters.txt".
write.table(
  x = result,
  file = gsub("merged.txt$", "before_filters.txt", output),
  sep = "\t", quote = FALSE, row.names = FALSE, col.names = TRUE
)

# Report how many sequences have an empty string for each region (missing
# values are not counted).
print(paste("Number of empty CDR1 sequences:", sum(result$CDR1.IMGT.seq == "", na.rm = TRUE)))
print(paste("Number of empty FR2 sequences:", sum(result$FR2.IMGT.seq == "", na.rm = TRUE)))
print(paste("Number of empty CDR2 sequences:", sum(result$CDR2.IMGT.seq == "", na.rm = TRUE)))
print(paste("Number of empty FR3 sequences:", sum(result$FR3.IMGT.seq == "", na.rm = TRUE)))
b6f9a640e098 Uploaded
davidvanzessen
parents:
diff changeset
176
b6f9a640e098 Uploaded
davidvanzessen
parents:
diff changeset
# TRUE where a region sequence is not the empty string.
non_empty <- function(region.seq) {
  region.seq != ""
}

# TRUE where a region sequence contains no "n"/"N" character.
lacks_n <- function(region.seq) {
  !grepl("n|N", region.seq)
}

# Empty-region filter. Depending on empty.region.filter, regions FR1..FR3
# must all be non-empty, starting at:
#   "leader" -> FR1, "FR1" -> CDR1, "CDR1" -> FR2, "FR2" -> CDR2.
# Any other setting leaves `result` unfiltered.
if (empty.region.filter == "leader") {
  result <- result[
    non_empty(result$FR1.IMGT.seq) & non_empty(result$CDR1.IMGT.seq) &
      non_empty(result$FR2.IMGT.seq) & non_empty(result$CDR2.IMGT.seq) &
      non_empty(result$FR3.IMGT.seq),
  ]
} else if (empty.region.filter == "FR1") {
  result <- result[
    non_empty(result$CDR1.IMGT.seq) & non_empty(result$FR2.IMGT.seq) &
      non_empty(result$CDR2.IMGT.seq) & non_empty(result$FR3.IMGT.seq),
  ]
} else if (empty.region.filter == "CDR1") {
  result <- result[
    non_empty(result$FR2.IMGT.seq) & non_empty(result$CDR2.IMGT.seq) &
      non_empty(result$FR3.IMGT.seq),
  ]
} else if (empty.region.filter == "FR2") {
  result <- result[
    non_empty(result$CDR2.IMGT.seq) & non_empty(result$FR3.IMGT.seq),
  ]
}

print(paste("After removal sequences that are missing a gene region:", nrow(result)))
filtering.steps <- rbind(
  filtering.steps,
  c("After removal sequences that are missing a gene region", nrow(result))
)

# N filter: drop sequences with an "n" or "N" in any of the regions that the
# chosen setting covers (same starting points as above, plus CDR3).
if (empty.region.filter == "leader") {
  result <- result[
    lacks_n(result$FR1.IMGT.seq) & lacks_n(result$CDR1.IMGT.seq) &
      lacks_n(result$FR2.IMGT.seq) & lacks_n(result$CDR2.IMGT.seq) &
      lacks_n(result$FR3.IMGT.seq) & lacks_n(result$CDR3.IMGT.seq),
  ]
} else if (empty.region.filter == "FR1") {
  result <- result[
    lacks_n(result$CDR1.IMGT.seq) & lacks_n(result$FR2.IMGT.seq) &
      lacks_n(result$CDR2.IMGT.seq) & lacks_n(result$FR3.IMGT.seq) &
      lacks_n(result$CDR3.IMGT.seq),
  ]
} else if (empty.region.filter == "CDR1") {
  result <- result[
    lacks_n(result$FR2.IMGT.seq) & lacks_n(result$CDR2.IMGT.seq) &
      lacks_n(result$FR3.IMGT.seq) & lacks_n(result$CDR3.IMGT.seq),
  ]
} else if (empty.region.filter == "FR2") {
  result <- result[
    lacks_n(result$CDR2.IMGT.seq) & lacks_n(result$FR3.IMGT.seq) &
      lacks_n(result$CDR3.IMGT.seq),
  ]
}

print(paste("Number of sequences in result after n filtering:", nrow(result)))
filtering.steps <- rbind(filtering.steps, c("After N filter", nrow(result)))
b6f9a640e098 Uploaded
davidvanzessen
parents:
diff changeset
202
b6f9a640e098 Uploaded
davidvanzessen
parents:
diff changeset
# Per-region mutation count columns that need to be turned into plain
# numbers.
cleanup_columns <- c(
  "FR1.IMGT.Nb.of.mutations",
  "CDR1.IMGT.Nb.of.mutations",
  "FR2.IMGT.Nb.of.mutations",
  "CDR2.IMGT.Nb.of.mutations",
  "FR3.IMGT.Nb.of.mutations"
)

for (col in cleanup_columns) {
  # Remove any parenthesised part of the value, then convert to numeric.
  result[, col] <- gsub("\\(.*\\)", "", result[, col])
  result[, col] <- as.numeric(result[, col])
  # Values that could not be converted count as zero mutations. Only this
  # column is overwritten; indexing the rows without a column would replace
  # every field of those rows (IDs, sequences, ...) with 0.
  result[is.na(result[, col]), col] <- 0
}

# Table as it stands before the unique-sequence filter.
write.table(result, before.unique.file, sep = "\t", quote = FALSE,
            row.names = FALSE, col.names = TRUE)
b6f9a640e098 Uploaded
davidvanzessen
parents:
diff changeset
216
b6f9a640e098 Uploaded
davidvanzessen
parents:
diff changeset
217
b6f9a640e098 Uploaded
davidvanzessen
parents:
diff changeset
# Unique-sequence filter, skipped entirely when filter.unique is "no".
if (filter.unique != "no") {
  clmns <- names(result)

  # unique.def is the key that decides whether two rows count as the same
  # sequence: V gene + J gene + CDR3 amino acids for "remove_vjaa", otherwise
  # the concatenated region sequences covered by empty.region.filter.
  if (filter.unique == "remove_vjaa") {
    result$unique.def <- paste(result$VGene, result$JGene, result$CDR3.IMGT.AA)
  } else if (empty.region.filter == "leader") {
    result$unique.def <- paste(
      result$FR1.IMGT.seq, result$CDR1.IMGT.seq, result$FR2.IMGT.seq,
      result$CDR2.IMGT.seq, result$FR3.IMGT.seq, result$CDR3.IMGT.seq
    )
  } else if (empty.region.filter == "FR1") {
    result$unique.def <- paste(
      result$CDR1.IMGT.seq, result$FR2.IMGT.seq, result$CDR2.IMGT.seq,
      result$FR3.IMGT.seq, result$CDR3.IMGT.seq
    )
  } else if (empty.region.filter == "CDR1") {
    result$unique.def <- paste(
      result$FR2.IMGT.seq, result$CDR2.IMGT.seq, result$FR3.IMGT.seq,
      result$CDR3.IMGT.seq
    )
  } else if (empty.region.filter == "FR2") {
    result$unique.def <- paste(
      result$CDR2.IMGT.seq, result$FR3.IMGT.seq, result$CDR3.IMGT.seq
    )
  }

  # "remove" modes: keep only keys that occur more than once, and then only
  # those occurring at least filter.unique.count times.
  if (grepl("remove", filter.unique)) {
    result <- result[
      duplicated(result$unique.def, fromLast = TRUE) | duplicated(result$unique.def),
    ]
    unique.defs <- data.frame(table(result$unique.def))
    unique.defs <- unique.defs[unique.defs$Freq >= filter.unique.count, ]
    result <- result[result$unique.def %in% unique.defs$Var1, ]
  }

  # Except for "remove_vjaa", extend the key with the class (best_match up to
  # the first comma) so that a sequence found in several classes is kept once
  # per class; cutting at the comma means unmatched rows carry no class.
  if (filter.unique != "remove_vjaa") {
    result$unique.def <- paste(result$unique.def, gsub(",.*", "", result$best_match))
  }

  # Keep the first row for every key.
  result <- result[!duplicated(result$unique.def), ]
}

write.table(
  result,
  gsub("before_unique_filter.txt", "after_unique_filter.txt", before.unique.file),
  sep = "\t", quote = FALSE, row.names = FALSE, col.names = TRUE
)

filtering.steps <- rbind(filtering.steps, c("After filter unique sequences", nrow(result)))

print(paste("Number of sequences in result after unique filtering:", nrow(result)))
b6f9a640e098 Uploaded
davidvanzessen
parents:
diff changeset
251
b6f9a640e098 Uploaded
davidvanzessen
parents:
diff changeset
# Abort when nothing is left.
# NOTE(review): this tests `summ` (the summary after the functionality
# filter), not `result` - confirm which table is meant.
if (nrow(summ) == 0) {
  stop("No data remaining after filter")
}

# Class name only: best_match up to the first comma, so unmatched rows do not
# carry a class after the "unmatched" marker.
result$best_match_class <- gsub(",.*", "", result$best_match)

# De-duplication key: the columns named in unique.type (comma separated)
# pasted together row-wise with ":".
result$past <- do.call(paste, c(result[unlist(strsplit(unique.type, ","))], sep = ":"))

# Put matched rows ahead of unmatched ones so that, for a duplicated key, the
# matched row is the one that is kept.
result.matched <- result[!grepl("unmatched", result$best_match), ]
result.unmatched <- result[grepl("unmatched", result$best_match), ]
result <- rbind(result.matched, result.unmatched)

result <- result[!duplicated(result$past), ]

# Drop the two helper columns again.
result <- result[, !(names(result) %in% c("past", "best_match_class"))]

print(paste("Number of sequences in result after", unique.type, "filtering:", nrow(result)))

filtering.steps <- rbind(
  filtering.steps,
  c("After remove duplicates based on filter", nrow(result))
)
b6f9a640e098 Uploaded
davidvanzessen
parents:
diff changeset
280
b6f9a640e098 Uploaded
davidvanzessen
parents:
diff changeset
# Gene-identification columns of the rows whose best match starts with
# "unmatched".
unmatched <- result[
  grepl("^unmatched", result$best_match),
  c("Sequence.ID", "chunk_hit_percentage", "nt_hit_percentage", "start_locations", "best_match")
]

print(paste("Number of rows in result:", nrow(result)))
print(paste("Number of rows in unmatched:", nrow(unmatched)))

# All remaining rows go to "<output with merged.txt -> filtered.txt>".
matched.sequences <- result[!grepl("^unmatched", result$best_match), ]
write.table(
  x = matched.sequences,
  file = gsub("merged.txt$", "filtered.txt", output),
  sep = "\t", quote = FALSE, row.names = FALSE, col.names = TRUE
)

matched.sequences.count <- nrow(matched.sequences)
unmatched.sequences.count <- sum(grepl("^unmatched", result$best_match))

# NOTE(review): the warning fires whenever matched <= unmatched, not only when
# there are no matched sequences at all - confirm the intended condition.
if (matched.sequences.count <= unmatched.sequences.count) {
  print("WARNING NO MATCHED (SUB)CLASS SEQUENCES!!")
}

# Finish the filtering log: add the final counts, convert the counts to
# numbers and express each as a percentage of the input size.
filtering.steps <- rbind(filtering.steps, c("Number of matched sequences", matched.sequences.count))
filtering.steps <- rbind(filtering.steps, c("Number of unmatched sequences", unmatched.sequences.count))
filtering.steps[, 2] <- as.numeric(filtering.steps[, 2])
filtering.steps$perc <- round(filtering.steps[, 2] / input.sequence.count * 100, 2)

# The log is written without a header, next to the unmatched file.
write.table(
  x = filtering.steps,
  file = gsub("unmatched", "filtering_steps", unmatchedfile),
  sep = "\t", quote = FALSE, row.names = FALSE, col.names = FALSE
)

write.table(x = result, file = output, sep = "\t", quote = FALSE,
            row.names = FALSE, col.names = TRUE)
write.table(x = unmatched, file = unmatchedfile, sep = "\t", quote = FALSE,
            row.names = FALSE, col.names = TRUE)