4
|
1 #########################################################################################
|
|
2 # License Agreement
|
|
3 #
|
|
4 # THIS WORK IS PROVIDED UNDER THE TERMS OF THIS CREATIVE COMMONS PUBLIC LICENSE
|
|
5 # ("CCPL" OR "LICENSE"). THE WORK IS PROTECTED BY COPYRIGHT AND/OR OTHER
|
|
6 # APPLICABLE LAW. ANY USE OF THE WORK OTHER THAN AS AUTHORIZED UNDER THIS LICENSE
|
|
7 # OR COPYRIGHT LAW IS PROHIBITED.
|
|
8 #
|
|
9 # BY EXERCISING ANY RIGHTS TO THE WORK PROVIDED HERE, YOU ACCEPT AND AGREE TO BE
|
|
10 # BOUND BY THE TERMS OF THIS LICENSE. TO THE EXTENT THIS LICENSE MAY BE CONSIDERED
|
|
11 # TO BE A CONTRACT, THE LICENSOR GRANTS YOU THE RIGHTS CONTAINED HERE IN
|
|
12 # CONSIDERATION OF YOUR ACCEPTANCE OF SUCH TERMS AND CONDITIONS.
|
|
13 #
|
|
14 # BASELIne: Bayesian Estimation of Antigen-Driven Selection in Immunoglobulin Sequences
|
|
15 # Coded by: Mohamed Uduman & Gur Yaari
|
|
16 # Copyright 2012 Kleinstein Lab
|
|
17 # Version: 1.3 (01/23/2014)
|
|
18 #########################################################################################
|
|
19
|
|
20 op <- options();
|
|
21 options(showWarnCalls=FALSE, showErrorCalls=FALSE, warn=-1)
|
|
22 library('seqinr')
|
|
23 if( F & Sys.info()[1]=="Linux"){
|
|
24 library("multicore")
|
|
25 }
|
|
26
|
|
27 # Load functions and initialize global variables
|
|
28 source("Baseline_Functions.r")
|
|
29
|
|
30 # Initialize parameters with user provided arguments
|
|
31 arg <- commandArgs(TRUE)
|
|
32 #arg = c(2,1,5,5,0,1,"1:26:38:55:65:104:116", "test.fasta","","sample")
|
|
33 #arg = c(1,1,5,5,0,1,"1:38:55:65:104:116:200", "test.fasta","","sample")
|
|
34 #arg = c(1,1,5,5,1,1,"1:26:38:55:65:104:116", "/home/mu37/Wu/Wu_Cloned_gapped_sequences_D-masked.fasta","/home/mu37/Wu/","Wu")
|
|
35 testID <- as.numeric(arg[1]) # 1 = Focused, 2 = Local
|
|
36 species <- as.numeric(arg[2]) # 1 = Human. 2 = Mouse
|
|
37 substitutionModel <- as.numeric(arg[3]) # 0 = Uniform substitution, 1 = Smith DS et al. 1996, 5 = FiveS
|
|
38 mutabilityModel <- as.numeric(arg[4]) # 0 = Uniform mutablity, 1 = Tri-nucleotide (Shapiro GS et al. 2002) , 5 = FiveS
|
|
39 clonal <- as.numeric(arg[5]) # 0 = Independent sequences, 1 = Clonally related, 2 = Clonally related & only non-terminal mutations
|
|
40 fixIndels <- as.numeric(arg[6]) # 0 = Do nothing, 1 = Try and fix Indels
|
|
41 region <- as.numeric(strsplit(arg[7],":")[[1]]) # StartPos:LastNucleotideF1:C1:F2:C2:F3:C3
|
|
42 inputFilePath <- arg[8] # Full path to input file
|
|
43 outputPath <- arg[9] # Full path to location of output files
|
|
44 outputID <- arg[10] # ID for session output
|
|
45
|
|
46
|
|
47 if(testID==5){
|
|
48 traitChangeModel <- 1
|
|
49 if( !is.na(any(arg[11])) ) traitChangeModel <- as.numeric(arg[11]) # 1 <- Chothia 1998
|
|
50 initializeTraitChange(traitChangeModel)
|
|
51 }
|
|
52
|
|
53 # Initialize other parameters/variables
|
|
54
|
|
55 # Initialzie the codon table ( definitions of R/S )
|
|
56 computeCodonTable(testID)
|
|
57
|
|
58 # Initialize
|
|
59 # Test Name
|
|
60 testName<-"Focused"
|
|
61 if(testID==2) testName<-"Local"
|
|
62 if(testID==3) testName<-"Imbalanced"
|
|
63 if(testID==4) testName<-"ImbalancedSilent"
|
|
64
|
|
65 # Indel placeholders initialization
|
|
66 indelPos <- NULL
|
|
67 delPos <- NULL
|
|
68 insPos <- NULL
|
|
69
|
|
70 # Initialize in Tranistion & Mutability matrixes
|
|
71 substitution <- initializeSubstitutionMatrix(substitutionModel,species)
|
|
72 mutability <- initializeMutabilityMatrix(mutabilityModel,species)
|
|
73
|
|
74 # FWR/CDR boundaries
|
|
75 flagTrim <- F
|
|
76 if( is.na(region[7])){
|
|
77 flagTrim <- T
|
|
78 region[7]<-region[6]
|
|
79 }
|
|
80 readStart = min(region,na.rm=T)
|
|
81 readEnd = max(region,na.rm=T)
|
|
82 if(readStart>1){
|
|
83 region = region - (readStart - 1)
|
|
84 }
|
|
85 region_Nuc = c( (region[1]*3-2) , (region[2:7]*3) )
|
|
86 region_Cod = region
|
|
87
|
|
88 readStart = (readStart*3)-2
|
|
89 readEnd = (readEnd*3)
|
|
90
|
|
91 FWR_Nuc <- c( rep(TRUE,(region_Nuc[2])),
|
|
92 rep(FALSE,(region_Nuc[3]-region_Nuc[2])),
|
|
93 rep(TRUE,(region_Nuc[4]-region_Nuc[3])),
|
|
94 rep(FALSE,(region_Nuc[5]-region_Nuc[4])),
|
|
95 rep(TRUE,(region_Nuc[6]-region_Nuc[5])),
|
|
96 rep(FALSE,(region_Nuc[7]-region_Nuc[6]))
|
|
97 )
|
|
98 CDR_Nuc <- (1-FWR_Nuc)
|
|
99 CDR_Nuc <- as.logical(CDR_Nuc)
|
|
100 FWR_Nuc_Mat <- matrix( rep(FWR_Nuc,4), ncol=length(FWR_Nuc), nrow=4, byrow=T)
|
|
101 CDR_Nuc_Mat <- matrix( rep(CDR_Nuc,4), ncol=length(CDR_Nuc), nrow=4, byrow=T)
|
|
102
|
|
103 FWR_Codon <- c( rep(TRUE,(region[2])),
|
|
104 rep(FALSE,(region[3]-region[2])),
|
|
105 rep(TRUE,(region[4]-region[3])),
|
|
106 rep(FALSE,(region[5]-region[4])),
|
|
107 rep(TRUE,(region[6]-region[5])),
|
|
108 rep(FALSE,(region[7]-region[6]))
|
|
109 )
|
|
110 CDR_Codon <- (1-FWR_Codon)
|
|
111 CDR_Codon <- as.logical(CDR_Codon)
|
|
112
|
|
113
|
|
114 # Read input FASTA file
|
|
115 tryCatch(
|
|
116 inputFASTA <- baseline.read.fasta(inputFilePath, seqtype="DNA",as.string=T,set.attributes=F,forceDNAtolower=F)
|
|
117 , error = function(ex){
|
|
118 cat("Error|Error reading input. Please enter or upload a valid FASTA file.\n")
|
|
119 q()
|
|
120 }
|
|
121 )
|
|
122
|
|
123 if (length(inputFASTA)==1) {
|
|
124 cat("Error|Error reading input. Please enter or upload a valid FASTA file.\n")
|
|
125 q()
|
|
126 }
|
|
127
|
|
128 # Process sequence IDs/names
|
|
129 names(inputFASTA) <- sapply(names(inputFASTA),function(x){trim(x)})
|
|
130
|
|
131 # Convert non nucleotide characters to N
|
|
132 inputFASTA[length(inputFASTA)] = gsub("\t","",inputFASTA[length(inputFASTA)])
|
|
133 inputFASTA <- lapply(inputFASTA,replaceNonFASTAChars)
|
|
134
|
|
135 # Process the FASTA file and conver to Matrix[inputSequence, germlineSequence]
|
|
136 processedInput <- processInputAdvanced(inputFASTA)
|
|
137 matInput <- processedInput[[1]]
|
|
138 germlines <- processedInput[[2]]
|
|
139 lenGermlines = length(unique(germlines))
|
|
140 groups <- processedInput[[3]]
|
|
141 lenGroups = length(unique(groups))
|
|
142 rm(processedInput)
|
|
143 rm(inputFASTA)
|
|
144
|
|
145 # # remove clones with less than 2 seqeunces
|
|
146 # tableGL <- table(germlines)
|
|
147 # singletons <- which(tableGL<8)
|
|
148 # rowsToRemove <- match(singletons,germlines)
|
|
149 # if(any(rowsToRemove)){
|
|
150 # matInput <- matInput[-rowsToRemove,]
|
|
151 # germlines <- germlines[-rowsToRemove]
|
|
152 # groups <- groups[-rowsToRemove]
|
|
153 # }
|
|
154 #
|
|
155 # # remove unproductive seqs
|
|
156 # nonFuctionalSeqs <- sapply(rownames(matInput),function(x){any(grep("unproductive",x))})
|
|
157 # if(any(nonFuctionalSeqs)){
|
|
158 # if(sum(nonFuctionalSeqs)==length(germlines)){
|
|
159 # write.table("Unproductive",file=paste(outputPath,outputID,".txt",sep=""),quote=F,sep="\t",row.names=F,col.names=T)
|
|
160 # q()
|
|
161 # }
|
|
162 # matInput <- matInput[-which(nonFuctionalSeqs),]
|
|
163 # germlines <- germlines[-which(nonFuctionalSeqs)]
|
|
164 # germlines[1:length(germlines)] <- 1:length(germlines)
|
|
165 # groups <- groups[-which(nonFuctionalSeqs)]
|
|
166 # }
|
|
167 #
|
|
168 # if(class(matInput)=="character"){
|
|
169 # write.table("All unproductive seqs",file=paste(outputPath,outputID,".txt",sep=""),quote=F,sep="\t",row.names=F,col.names=T)
|
|
170 # q()
|
|
171 # }
|
|
172 #
|
|
173 # if(nrow(matInput)<10 | is.null(nrow(matInput))){
|
|
174 # write.table(paste(nrow(matInput), "seqs only",sep=""),file=paste(outputPath,outputID,".txt",sep=""),quote=F,sep="\t",row.names=F,col.names=T)
|
|
175 # q()
|
|
176 # }
|
|
177
|
|
178 # replace leading & trailing "-" with "N:
|
|
179 matInput <- t(apply(matInput,1,replaceLeadingTrailingDashes,readEnd))
|
|
180
|
|
181 # Trim (nucleotide) input sequences to the last codon
|
|
182 #matInput[,1] <- apply(matrix(matInput[,1]),1,trimToLastCodon)
|
|
183
|
|
184 # # Check for Indels
|
|
185 # if(fixIndels){
|
|
186 # delPos <- fixDeletions(matInput)
|
|
187 # insPos <- fixInsertions(matInput)
|
|
188 # }else{
|
|
189 # # Check for indels
|
|
190 # indelPos <- checkForInDels(matInput)
|
|
191 # indelPos <- apply(cbind(indelPos[[1]],indelPos[[2]]),1,function(x){(x[1]==T & x[2]==T)})
|
|
192 # }
|
|
193
|
|
194 # If indels are present, remove mutations in the seqeunce & throw warning at end
|
|
195 #matInput[indelPos,] <- apply(matrix(matInput[indelPos,],nrow=sum(indelPos),ncol=2),1,function(x){x[1]=x[2]; return(x) })
|
|
196
|
|
197 colnames(matInput)=c("Input","Germline")
|
|
198
|
|
199 # If seqeunces are clonal, create effective sequence for each clone & modify germline/group definitions
|
|
200 germlinesOriginal = NULL
|
|
201 if(clonal){
|
|
202 germlinesOriginal <- germlines
|
|
203 collapseCloneResults <- tapply(1:nrow(matInput),germlines,function(i){
|
|
204 collapseClone(matInput[i,1],matInput[i[1],2],readEnd,nonTerminalOnly=(clonal-1))
|
|
205 })
|
|
206 matInput = t(sapply(collapseCloneResults,function(x){return(x[[1]])}))
|
|
207 names_groups = tapply(groups,germlines,function(x){names(x[1])})
|
|
208 groups = tapply(groups,germlines,function(x){array(x[1],dimnames=names(x[1]))})
|
|
209 names(groups) = names_groups
|
|
210
|
|
211 names_germlines = tapply(germlines,germlines,function(x){names(x[1])})
|
|
212 germlines = tapply( germlines,germlines,function(x){array(x[1],dimnames=names(x[1]))} )
|
|
213 names(germlines) = names_germlines
|
|
214 matInputErrors = sapply(collapseCloneResults,function(x){return(x[[2]])})
|
|
215 }
|
|
216
|
|
217
|
|
218 # Selection Analysis
|
|
219
|
|
220
|
|
221 # if (length(germlines)>sequenceLimit) {
|
|
222 # # Code to parallelize processing goes here
|
|
223 # stop( paste("Error: Cannot process more than ", Upper_limit," sequences",sep="") )
|
|
224 # }
|
|
225
|
|
226 # if (length(germlines)<sequenceLimit) {}
|
|
227
|
|
228 # Compute expected mutation frequencies
|
|
229 matExpected <- getExpectedIndividual(matInput)
|
|
230
|
|
231 # Count observed number of mutations in the different regions
|
|
232 mutations <- lapply( 1:nrow(matInput), function(i){
|
|
233 #cat(i,"\n")
|
|
234 seqI = s2c(matInput[i,1])
|
|
235 seqG = s2c(matInput[i,2])
|
|
236 matIGL = matrix(c(seqI,seqG),ncol=length(seqI),nrow=2,byrow=T)
|
|
237 retVal <- NA
|
|
238 tryCatch(
|
|
239 retVal <- analyzeMutations2NucUri(matIGL)
|
|
240 , error = function(ex){
|
|
241 retVal <- NA
|
|
242 }
|
|
243 )
|
|
244
|
|
245
|
|
246 return( retVal )
|
|
247 })
|
|
248
|
|
249 matObserved <- t(sapply( mutations, processNucMutations2 ))
|
|
250 numberOfSeqsWithMutations <- numberOfSeqsWithMutations(matObserved, testID)
|
|
251
|
|
252 #if(sum(numberOfSeqsWithMutations)==0){
|
|
253 # write.table("No mutated sequences",file=paste(outputPath,outputID,".txt",sep=""),quote=F,sep="\t",row.names=F,col.names=T)
|
|
254 # q()
|
|
255 #}
|
|
256
|
|
257 matMutationInfo <- cbind(matObserved,matExpected)
|
|
258 rm(matObserved,matExpected)
|
|
259
|
|
260
|
|
261 #Bayesian PDFs
|
|
262 bayes_pdf = computeBayesianScore(matMutationInfo, test=testName, max_sigma=20,length_sigma=4001)
|
|
263 bayesPDF_cdr = bayes_pdf[[1]]
|
|
264 bayesPDF_fwr = bayes_pdf[[2]]
|
|
265 rm(bayes_pdf)
|
|
266
|
|
267 bayesPDF_germlines_cdr = tapply(bayesPDF_cdr,germlines,function(x) groupPosteriors(x,length_sigma=4001))
|
|
268 bayesPDF_germlines_fwr = tapply(bayesPDF_fwr,germlines,function(x) groupPosteriors(x,length_sigma=4001))
|
|
269
|
|
270 bayesPDF_groups_cdr = tapply(bayesPDF_cdr,groups,function(x) groupPosteriors(x,length_sigma=4001))
|
|
271 bayesPDF_groups_fwr = tapply(bayesPDF_fwr,groups,function(x) groupPosteriors(x,length_sigma=4001))
|
|
272
|
|
273 if(lenGroups>1){
|
|
274 groups <- c(groups,lenGroups+1)
|
|
275 names(groups)[length(groups)] = "All sequences combined"
|
|
276 bayesPDF_groups_cdr[[lenGroups+1]] = groupPosteriors(bayesPDF_groups_cdr,length_sigma=4001)
|
|
277 bayesPDF_groups_fwr[[lenGroups+1]] = groupPosteriors(bayesPDF_groups_fwr,length_sigma=4001)
|
|
278 }
|
|
279
|
|
280 #Bayesian Outputs
|
|
281 bayes_cdr = t(sapply(bayesPDF_cdr,calcBayesOutputInfo))
|
|
282 bayes_fwr = t(sapply(bayesPDF_fwr,calcBayesOutputInfo))
|
|
283 bayes_germlines_cdr = t(sapply(bayesPDF_germlines_cdr,calcBayesOutputInfo))
|
|
284 bayes_germlines_fwr = t(sapply(bayesPDF_germlines_fwr,calcBayesOutputInfo))
|
|
285 bayes_groups_cdr = t(sapply(bayesPDF_groups_cdr,calcBayesOutputInfo))
|
|
286 bayes_groups_fwr = t(sapply(bayesPDF_groups_fwr,calcBayesOutputInfo))
|
|
287
|
|
288 #P-values
|
|
289 simgaP_cdr = sapply(bayesPDF_cdr,computeSigmaP)
|
|
290 simgaP_fwr = sapply(bayesPDF_fwr,computeSigmaP)
|
|
291
|
|
292 simgaP_germlines_cdr = sapply(bayesPDF_germlines_cdr,computeSigmaP)
|
|
293 simgaP_germlines_fwr = sapply(bayesPDF_germlines_fwr,computeSigmaP)
|
|
294
|
|
295 simgaP_groups_cdr = sapply(bayesPDF_groups_cdr,computeSigmaP)
|
|
296 simgaP_groups_fwr = sapply(bayesPDF_groups_fwr,computeSigmaP)
|
|
297
|
|
298
|
|
299 #Format output
|
|
300
|
|
301 # Round expected mutation frequencies to 3 decimal places
|
|
302 matMutationInfo[germlinesOriginal[indelPos],] = NA
|
|
303 if(nrow(matMutationInfo)==1){
|
|
304 matMutationInfo[5:8] = round(matMutationInfo[,5:8]/sum(matMutationInfo[,5:8],na.rm=T),3)
|
|
305 }else{
|
|
306 matMutationInfo[,5:8] = t(round(apply(matMutationInfo[,5:8],1,function(x){ return(x/sum(x,na.rm=T)) }),3))
|
|
307 }
|
|
308
|
|
309 listPDFs = list()
|
|
310 nRows = length(unique(groups)) + length(unique(germlines)) + length(groups)
|
|
311
|
|
312 matOutput = matrix(NA,ncol=18,nrow=nRows)
|
|
313 rowNumb = 1
|
|
314 for(G in unique(groups)){
|
|
315 #print(G)
|
|
316 matOutput[rowNumb,c(1,2,11:18)] = c("Group",names(groups)[groups==G][1],bayes_groups_cdr[G,],bayes_groups_fwr[G,],simgaP_groups_cdr[G],simgaP_groups_fwr[G])
|
|
317 listPDFs[[rowNumb]] = list("CDR"=bayesPDF_groups_cdr[[G]],"FWR"=bayesPDF_groups_fwr[[G]])
|
|
318 names(listPDFs)[rowNumb] = names(groups[groups==paste(G)])[1]
|
|
319 #if(names(groups)[which(groups==G)[1]]!="All sequences combined"){
|
|
320 gs = unique(germlines[groups==G])
|
|
321 rowNumb = rowNumb+1
|
|
322 if( !is.na(gs) ){
|
|
323 for( g in gs ){
|
|
324 matOutput[rowNumb,c(1,2,11:18)] = c("Germline",names(germlines)[germlines==g][1],bayes_germlines_cdr[g,],bayes_germlines_fwr[g,],simgaP_germlines_cdr[g],simgaP_germlines_fwr[g])
|
|
325 listPDFs[[rowNumb]] = list("CDR"=bayesPDF_germlines_cdr[[g]],"FWR"=bayesPDF_germlines_fwr[[g]])
|
|
326 names(listPDFs)[rowNumb] = names(germlines[germlines==paste(g)])[1]
|
|
327 rowNumb = rowNumb+1
|
|
328 indexesOfInterest = which(germlines==g)
|
|
329 numbSeqsOfInterest = length(indexesOfInterest)
|
|
330 rowNumb = seq(rowNumb,rowNumb+(numbSeqsOfInterest-1))
|
|
331 matOutput[rowNumb,] = matrix( c( rep("Sequence",numbSeqsOfInterest),
|
|
332 rownames(matInput)[indexesOfInterest],
|
|
333 c(matMutationInfo[indexesOfInterest,1:4]),
|
|
334 c(matMutationInfo[indexesOfInterest,5:8]),
|
|
335 c(bayes_cdr[indexesOfInterest,]),
|
|
336 c(bayes_fwr[indexesOfInterest,]),
|
|
337 c(simgaP_cdr[indexesOfInterest]),
|
|
338 c(simgaP_fwr[indexesOfInterest])
|
|
339 ), ncol=18, nrow=numbSeqsOfInterest,byrow=F)
|
|
340 increment=0
|
|
341 for( ioi in indexesOfInterest){
|
|
342 listPDFs[[min(rowNumb)+increment]] = list("CDR"=bayesPDF_cdr[[ioi]] , "FWR"=bayesPDF_fwr[[ioi]])
|
|
343 names(listPDFs)[min(rowNumb)+increment] = rownames(matInput)[ioi]
|
|
344 increment = increment + 1
|
|
345 }
|
|
346 rowNumb=max(rowNumb)+1
|
|
347
|
|
348 }
|
|
349 }
|
|
350 }
|
|
351 colsToFormat = 11:18
|
|
352 matOutput[,colsToFormat] = formatC( matrix(as.numeric(matOutput[,colsToFormat]), nrow=nrow(matOutput), ncol=length(colsToFormat)) , digits=3)
|
|
353 matOutput[matOutput== " NaN"] = NA
|
|
354
|
|
355
|
|
356
|
|
357 colnames(matOutput) = c("Type", "ID", "Observed_CDR_R", "Observed_CDR_S", "Observed_FWR_R", "Observed_FWR_S",
|
|
358 "Expected_CDR_R", "Expected_CDR_S", "Expected_FWR_R", "Expected_FWR_S",
|
|
359 paste( rep(testName,6), rep(c("Sigma","CIlower","CIupper"),2),rep(c("CDR","FWR"),each=3), sep="_"),
|
|
360 paste( rep(testName,2), rep("P",2),c("CDR","FWR"), sep="_")
|
|
361 )
|
|
362 fileName = paste(outputPath,outputID,".txt",sep="")
|
|
363 write.table(matOutput,file=fileName,quote=F,sep="\t",row.names=T,col.names=NA)
|
|
364 fileName = paste(outputPath,outputID,".RData",sep="")
|
|
365 save(listPDFs,file=fileName)
|
|
366
|
|
367 indelWarning = FALSE
|
|
368 if(sum(indelPos)>0){
|
|
369 indelWarning = "<P>Warning: The following sequences have either gaps and/or deletions, and have been ommited from the analysis.";
|
|
370 indelWarning = paste( indelWarning , "<UL>", sep="" )
|
|
371 for(indels in names(indelPos)[indelPos]){
|
|
372 indelWarning = paste( indelWarning , "<LI>", indels, "</LI>", sep="" )
|
|
373 }
|
|
374 indelWarning = paste( indelWarning , "</UL></P>", sep="" )
|
|
375 }
|
|
376
|
|
377 cloneWarning = FALSE
|
|
378 if(clonal==1){
|
|
379 if(sum(matInputErrors)>0){
|
|
380 cloneWarning = "<P>Warning: The following clones have sequences of unequal length.";
|
|
381 cloneWarning = paste( cloneWarning , "<UL>", sep="" )
|
|
382 for(clone in names(matInputErrors)[matInputErrors]){
|
|
383 cloneWarning = paste( cloneWarning , "<LI>", names(germlines)[as.numeric(clone)], "</LI>", sep="" )
|
|
384 }
|
|
385 cloneWarning = paste( cloneWarning , "</UL></P>", sep="" )
|
|
386 }
|
|
387 }
|
|
388 cat(paste("Success",outputID,indelWarning,cloneWarning,sep="|"))
|