Mercurial > repos > davidvanzessen > shm_csr
comparison merge_and_filter.r @ 0:c33d93683a09 draft
Uploaded
author | davidvanzessen |
---|---|
date | Thu, 13 Oct 2016 10:52:24 -0400 |
parents | |
children | faae21ba5c63 |
comparison
equal
deleted
inserted
replaced
-1:000000000000 | 0:c33d93683a09 |
---|---|
1 args <- commandArgs(trailingOnly = TRUE) | |
2 | |
3 | |
4 summaryfile = args[1] | |
5 sequencesfile = args[2] | |
6 mutationanalysisfile = args[3] | |
7 mutationstatsfile = args[4] | |
8 hotspotsfile = args[5] | |
9 gene_identification_file= args[6] | |
10 output = args[7] | |
11 before.unique.file = args[8] | |
12 unmatchedfile = args[9] | |
13 method=args[10] | |
14 functionality=args[11] | |
15 unique.type=args[12] | |
16 filter.unique=args[13] | |
17 class.filter=args[14] | |
18 empty.region.filter=args[15] | |
19 | |
20 summ = read.table(summaryfile, header=T, sep="\t", fill=T, stringsAsFactors=F, quote="") | |
21 sequences = read.table(sequencesfile, header=T, sep="\t", fill=T, stringsAsFactors=F, quote="") | |
22 mutationanalysis = read.table(mutationanalysisfile, header=T, sep="\t", fill=T, stringsAsFactors=F, quote="") | |
23 mutationstats = read.table(mutationstatsfile, header=T, sep="\t", fill=T, stringsAsFactors=F, quote="") | |
24 hotspots = read.table(hotspotsfile, header=T, sep="\t", fill=T, stringsAsFactors=F, quote="") | |
25 gene_identification = read.table(gene_identification_file, header=T, sep="\t", fill=T, stringsAsFactors=F, quote="") | |
26 | |
27 if(method == "blastn"){ | |
28 "qseqid\tsseqid\tpident\tlength\tmismatch\tgapopen\tqstart\tqend\tsstart\tsend\tevalue\tbitscore" | |
29 gene_identification = gene_identification[!duplicated(gene_identification$qseqid),] | |
30 ref_length = data.frame(sseqid=c("ca1", "ca2", "cg1", "cg2", "cg3", "cg4", "cm"), ref.length=c(81,81,141,141,141,141,52)) | |
31 gene_identification = merge(gene_identification, ref_length, by="sseqid", all.x=T) | |
32 gene_identification$chunk_hit_percentage = (gene_identification$length / gene_identification$ref.length) * 100 | |
33 gene_identification = gene_identification[,c("qseqid", "chunk_hit_percentage", "pident", "qstart", "sseqid")] | |
34 colnames(gene_identification) = c("Sequence.ID", "chunk_hit_percentage", "nt_hit_percentage", "start_locations", "best_match") | |
35 | |
36 } | |
37 | |
38 input.sequence.count = nrow(summ) | |
39 print(paste("Number of sequences in summary file:", input.sequence.count)) | |
40 | |
41 filtering.steps = data.frame(character(0), numeric(0)) | |
42 | |
43 filtering.steps = rbind(filtering.steps, c("Input", input.sequence.count)) | |
44 | |
45 filtering.steps[,1] = as.character(filtering.steps[,1]) | |
46 filtering.steps[,2] = as.character(filtering.steps[,2]) | |
47 #filtering.steps[,3] = as.numeric(filtering.steps[,3]) | |
48 | |
49 summ = merge(summ, gene_identification, by="Sequence.ID") | |
50 | |
51 summ = summ[summ$Functionality != "No results",] | |
52 | |
53 print(paste("Number of sequences after 'No results' filter:", nrow(summ))) | |
54 | |
55 filtering.steps = rbind(filtering.steps, c("After 'No results' filter", nrow(summ))) | |
56 | |
57 if(functionality == "productive"){ | |
58 summ = summ[summ$Functionality == "productive (see comment)" | summ$Functionality == "productive",] | |
59 } else if (functionality == "unproductive"){ | |
60 summ = summ[summ$Functionality == "unproductive (see comment)" | summ$Functionality == "unproductive",] | |
61 } else if (functionality == "remove_unknown"){ | |
62 summ = summ[summ$Functionality != "No results" & summ$Functionality != "unknown (see comment)" & summ$Functionality != "unknown",] | |
63 } | |
64 | |
65 print(paste("Number of sequences after productive filter:", nrow(summ))) | |
66 | |
67 filtering.steps = rbind(filtering.steps, c("After productive filter", nrow(summ))) | |
68 | |
69 splt = strsplit(class.filter, "_")[[1]] | |
70 chunk_hit_threshold = as.numeric(splt[1]) | |
71 nt_hit_threshold = as.numeric(splt[2]) | |
72 | |
73 higher_than=(summ$chunk_hit_percentage >= chunk_hit_threshold & summ$nt_hit_percentage >= nt_hit_threshold) | |
74 | |
75 unmatched=summ[NULL,c("Sequence.ID", "chunk_hit_percentage", "nt_hit_percentage", "start_locations", "best_match")] | |
76 | |
77 if(!all(higher_than, na.rm=T)){ #check for 'not all' because that would mean the unmatched set is empty | |
78 unmatched = summ[!higher_than,] | |
79 unmatched = unmatched[,c("Sequence.ID", "chunk_hit_percentage", "nt_hit_percentage", "start_locations", "best_match")] | |
80 unmatched$best_match = paste("unmatched,", unmatched$best_match) | |
81 summ[!higher_than,"best_match"] = paste("unmatched,", summ[!higher_than,"best_match"]) | |
82 } | |
83 | |
84 if(any(higher_than, na.rm=T)){ | |
85 #summ = summ[higher_than,] | |
86 } | |
87 | |
88 if(nrow(summ) == 0){ | |
89 stop("No data remaining after filter") | |
90 } | |
91 | |
92 result = merge(summ, mutationanalysis[,!(names(mutationanalysis) %in% names(summ)[-1])], by="Sequence.ID") | |
93 | |
94 print(paste("Number of sequences after merging with mutation analysis file:", nrow(result))) | |
95 | |
96 result = merge(result, mutationstats[,!(names(mutationstats) %in% names(result)[-1])], by="Sequence.ID") | |
97 | |
98 print(paste("Number of sequences after merging with mutation stats file:", nrow(result))) | |
99 | |
100 result = merge(result, hotspots[,!(names(hotspots) %in% names(result)[-1])], by="Sequence.ID") | |
101 | |
102 print(paste("Number of sequences after merging with hotspots file:", nrow(result))) | |
103 | |
104 sequences = sequences[,c("Sequence.ID", "FR1.IMGT", "CDR1.IMGT", "FR2.IMGT", "CDR2.IMGT", "FR3.IMGT", "CDR3.IMGT")] | |
105 names(sequences) = c("Sequence.ID", "FR1.IMGT.seq", "CDR1.IMGT.seq", "FR2.IMGT.seq", "CDR2.IMGT.seq", "FR3.IMGT.seq", "CDR3.IMGT.seq") | |
106 result = merge(result, sequences, by="Sequence.ID", all.x=T) | |
107 | |
108 print(paste("Number of sequences in result after merging with sequences:", nrow(result))) | |
109 | |
110 result$VGene = gsub("^Homsap ", "", result$V.GENE.and.allele) | |
111 result$VGene = gsub("[*].*", "", result$VGene) | |
112 result$DGene = gsub("^Homsap ", "", result$D.GENE.and.allele) | |
113 result$DGene = gsub("[*].*", "", result$DGene) | |
114 result$JGene = gsub("^Homsap ", "", result$J.GENE.and.allele) | |
115 result$JGene = gsub("[*].*", "", result$JGene) | |
116 | |
117 result$past = do.call(paste, c(result[unlist(strsplit(unique.type, ","))], sep = ":")) | |
118 | |
119 result = result[!(duplicated(result$past)), ] | |
120 | |
121 result = result[,!(names(result) %in% c("past"))] | |
122 | |
123 print(paste("Number of sequences in result after", unique.type, "filtering:", nrow(result))) | |
124 | |
125 filtering.steps = rbind(filtering.steps, c("After duplicate filter", nrow(result))) | |
126 | |
127 print(paste("Number of empty CDR1 sequences:", sum(result$CDR1.IMGT.seq == ""))) | |
128 print(paste("Number of empty FR2 sequences:", sum(result$FR2.IMGT.seq == ""))) | |
129 print(paste("Number of empty CDR2 sequences:", sum(result$CDR2.IMGT.seq == ""))) | |
130 print(paste("Number of empty FR3 sequences:", sum(result$FR3.IMGT.seq == ""))) | |
131 | |
132 if(empty.region.filter == "FR1"){ | |
133 result = result[result$CDR1.IMGT.seq != "" & result$FR2.IMGT.seq != "" & result$CDR2.IMGT.seq != "" & result$FR3.IMGT.seq != "", ] | |
134 print(paste("Number of sequences after empty CDR1, FR2, CDR2 and FR3 column filter:", nrow(result))) | |
135 filtering.steps = rbind(filtering.steps, c("After empty CDR1, FR2, CDR2, FR3 filter", nrow(result))) | |
136 } else if(empty.region.filter == "CDR1"){ | |
137 result = result[result$FR2.IMGT.seq != "" & result$CDR2.IMGT.seq != "" & result$FR3.IMGT.seq != "", ] | |
138 print(paste("Number of sequences after empty FR2, CDR2 and FR3 column filter:", nrow(result))) | |
139 filtering.steps = rbind(filtering.steps, c("After empty FR2, CDR2, FR3 filter", nrow(result))) | |
140 } else if(empty.region.filter == "FR2"){ | |
141 result = result[result$CDR2.IMGT.seq != "" & result$FR3.IMGT.seq != "", ] | |
142 print(paste("Number of sequences after empty CDR2 and FR3 column filter:", nrow(result))) | |
143 filtering.steps = rbind(filtering.steps, c("After empty CDR2, FR3 filter", nrow(result))) | |
144 } | |
145 | |
146 if(empty.region.filter == "FR1"){ | |
147 result = result[!(grepl("n|N", result$FR2.IMGT.seq) | grepl("n|N", result$FR3.IMGT.seq) | grepl("n|N", result$CDR1.IMGT.seq) | grepl("n|N", result$CDR2.IMGT.seq) | grepl("n|N", result$CDR3.IMGT.seq)),] | |
148 } else if(empty.region.filter == "CDR1"){ | |
149 result = result[!(grepl("n|N", result$FR2.IMGT.seq) | grepl("n|N", result$FR3.IMGT.seq) | grepl("n|N", result$CDR2.IMGT.seq) | grepl("n|N", result$CDR3.IMGT.seq)),] | |
150 } else if(empty.region.filter == "FR2"){ | |
151 result = result[!(grepl("n|N", result$FR3.IMGT.seq) | grepl("n|N", result$CDR3.IMGT.seq)),] | |
152 } | |
153 | |
154 print(paste("Number of sequences in result after n filtering:", nrow(result))) | |
155 filtering.steps = rbind(filtering.steps, c("After N filter", nrow(result))) | |
156 | |
157 cleanup_columns = c("FR1.IMGT.Nb.of.mutations", | |
158 "CDR1.IMGT.Nb.of.mutations", | |
159 "FR2.IMGT.Nb.of.mutations", | |
160 "CDR2.IMGT.Nb.of.mutations", | |
161 "FR3.IMGT.Nb.of.mutations") | |
162 | |
163 for(col in cleanup_columns){ | |
164 result[,col] = gsub("\\(.*\\)", "", result[,col]) | |
165 result[,col] = as.numeric(result[,col]) | |
166 result[is.na(result[,col]),] = 0 | |
167 } | |
168 | |
169 write.table(result, before.unique.file, sep="\t", quote=F,row.names=F,col.names=T) | |
170 | |
171 if(filter.unique != "no"){ | |
172 clmns = names(result) | |
173 | |
174 if(empty.region.filter == "FR1"){ | |
175 result$unique.def = paste(result$CDR1.IMGT.seq, result$FR2.IMGT.seq, result$CDR2.IMGT.seq, result$FR3.IMGT.seq, result$CDR3.IMGT.seq) | |
176 } else if(empty.region.filter == "CDR1"){ | |
177 rresult$unique.def = paste(result$FR2.IMGT.seq, result$CDR2.IMGT.seq, result$FR3.IMGT.seq, result$CDR3.IMGT.seq) | |
178 } else if(empty.region.filter == "FR2"){ | |
179 result$unique.def = paste(result$CDR2.IMGT.seq, result$FR3.IMGT.seq, result$CDR3.IMGT.seq) | |
180 } | |
181 | |
182 if(grepl("_c", filter.unique)){ | |
183 result$unique.def = paste(result$unique.def, result$best_match) | |
184 } | |
185 | |
186 #fltr = result$unique.def %in% result.filtered$unique.def | |
187 | |
188 if(grepl("keep", filter.unique)){ | |
189 result$unique.def = paste(result$unique.def, result$best_match) #keep the unique sequences that are in multiple classes | |
190 result = result[!duplicated(result$unique.def),] | |
191 } else { | |
192 result = result[duplicated(result$unique.def) | duplicated(result$unique.def, fromLast=T),] | |
193 result$unique.def = paste(result$unique.def, result$best_match) #keep the unique sequences that are in multiple classes | |
194 result = result[!duplicated(result$unique.def),] | |
195 } | |
196 | |
197 #result = result[,clmns] | |
198 | |
199 #write.table(inputdata.removed, "unique_removed.csv", sep=",",quote=F,row.names=F,col.names=T) | |
200 } | |
201 | |
202 print(paste("Number of sequences in result after CDR/FR filtering:", nrow(result))) | |
203 print(paste("Number of matched sequences in result after CDR/FR filtering:", nrow(result[!grepl("unmatched", result$best_match),]))) | |
204 | |
205 filtering.steps = rbind(filtering.steps, c("After unique filter", nrow(result))) | |
206 | |
207 print(paste("Number of rows in result:", nrow(result))) | |
208 print(paste("Number of rows in unmatched:", nrow(unmatched))) | |
209 | |
210 matched.sequences = result[!grepl("^unmatched", result$best_match),] | |
211 | |
212 write.table(x=matched.sequences, file=gsub("merged.txt$", "filtered.txt", output), sep="\t",quote=F,row.names=F,col.names=T) | |
213 | |
214 matched.sequences.count = nrow(matched.sequences) | |
215 unmatched.sequences.count = sum(grepl("^unmatched", result$best_match)) | |
216 | |
217 filtering.steps = rbind(filtering.steps, c("Number of matched sequences", matched.sequences.count)) | |
218 filtering.steps = rbind(filtering.steps, c("Number of unmatched sequences", unmatched.sequences.count)) | |
219 filtering.steps[,2] = as.numeric(filtering.steps[,2]) | |
220 filtering.steps$perc = round(filtering.steps[,2] / input.sequence.count * 100, 2) | |
221 | |
222 write.table(x=filtering.steps, file=gsub("unmatched", "filtering_steps", unmatchedfile), sep="\t",quote=F,row.names=F,col.names=F) | |
223 | |
224 write.table(x=result, file=output, sep="\t",quote=F,row.names=F,col.names=T) | |
225 write.table(x=unmatched, file=unmatchedfile, sep="\t",quote=F,row.names=F,col.names=T) |