Mercurial > repos > davidvanzessen > argalaxy_tools
comparison baseline/Baseline_Main.r @ 4:5ffd52fc35c4 draft
Uploaded
author | davidvanzessen |
---|---|
date | Mon, 12 Dec 2016 05:22:37 -0500 |
parents | |
children |
comparison
equal
deleted
inserted
replaced
3:beaa487ecf43 | 4:5ffd52fc35c4 |
---|---|
1 ######################################################################################### | |
2 # License Agreement | |
3 # | |
4 # THIS WORK IS PROVIDED UNDER THE TERMS OF THIS CREATIVE COMMONS PUBLIC LICENSE | |
5 # ("CCPL" OR "LICENSE"). THE WORK IS PROTECTED BY COPYRIGHT AND/OR OTHER | |
6 # APPLICABLE LAW. ANY USE OF THE WORK OTHER THAN AS AUTHORIZED UNDER THIS LICENSE | |
7 # OR COPYRIGHT LAW IS PROHIBITED. | |
8 # | |
9 # BY EXERCISING ANY RIGHTS TO THE WORK PROVIDED HERE, YOU ACCEPT AND AGREE TO BE | |
10 # BOUND BY THE TERMS OF THIS LICENSE. TO THE EXTENT THIS LICENSE MAY BE CONSIDERED | |
11 # TO BE A CONTRACT, THE LICENSOR GRANTS YOU THE RIGHTS CONTAINED HERE IN | |
12 # CONSIDERATION OF YOUR ACCEPTANCE OF SUCH TERMS AND CONDITIONS. | |
13 # | |
14 # BASELIne: Bayesian Estimation of Antigen-Driven Selection in Immunoglobulin Sequences | |
15 # Coded by: Mohamed Uduman & Gur Yaari | |
16 # Copyright 2012 Kleinstein Lab | |
17 # Version: 1.3 (01/23/2014) | |
18 ######################################################################################### | |
19 | |
20 op <- options(); | |
21 options(showWarnCalls=FALSE, showErrorCalls=FALSE, warn=-1) | |
22 library('seqinr') | |
23 if( F & Sys.info()[1]=="Linux"){ | |
24 library("multicore") | |
25 } | |
26 | |
27 # Load functions and initialize global variables | |
28 source("Baseline_Functions.r") | |
29 | |
30 # Initialize parameters with user provided arguments | |
31 arg <- commandArgs(TRUE) | |
32 #arg = c(2,1,5,5,0,1,"1:26:38:55:65:104:116", "test.fasta","","sample") | |
33 #arg = c(1,1,5,5,0,1,"1:38:55:65:104:116:200", "test.fasta","","sample") | |
34 #arg = c(1,1,5,5,1,1,"1:26:38:55:65:104:116", "/home/mu37/Wu/Wu_Cloned_gapped_sequences_D-masked.fasta","/home/mu37/Wu/","Wu") | |
35 testID <- as.numeric(arg[1]) # 1 = Focused, 2 = Local | |
36 species <- as.numeric(arg[2]) # 1 = Human. 2 = Mouse | |
37 substitutionModel <- as.numeric(arg[3]) # 0 = Uniform substitution, 1 = Smith DS et al. 1996, 5 = FiveS | |
38 mutabilityModel <- as.numeric(arg[4]) # 0 = Uniform mutablity, 1 = Tri-nucleotide (Shapiro GS et al. 2002) , 5 = FiveS | |
39 clonal <- as.numeric(arg[5]) # 0 = Independent sequences, 1 = Clonally related, 2 = Clonally related & only non-terminal mutations | |
40 fixIndels <- as.numeric(arg[6]) # 0 = Do nothing, 1 = Try and fix Indels | |
41 region <- as.numeric(strsplit(arg[7],":")[[1]]) # StartPos:LastNucleotideF1:C1:F2:C2:F3:C3 | |
42 inputFilePath <- arg[8] # Full path to input file | |
43 outputPath <- arg[9] # Full path to location of output files | |
44 outputID <- arg[10] # ID for session output | |
45 | |
46 | |
47 if(testID==5){ | |
48 traitChangeModel <- 1 | |
49 if( !is.na(any(arg[11])) ) traitChangeModel <- as.numeric(arg[11]) # 1 <- Chothia 1998 | |
50 initializeTraitChange(traitChangeModel) | |
51 } | |
52 | |
53 # Initialize other parameters/variables | |
54 | |
55 # Initialzie the codon table ( definitions of R/S ) | |
56 computeCodonTable(testID) | |
57 | |
58 # Initialize | |
59 # Test Name | |
60 testName<-"Focused" | |
61 if(testID==2) testName<-"Local" | |
62 if(testID==3) testName<-"Imbalanced" | |
63 if(testID==4) testName<-"ImbalancedSilent" | |
64 | |
65 # Indel placeholders initialization | |
66 indelPos <- NULL | |
67 delPos <- NULL | |
68 insPos <- NULL | |
69 | |
70 # Initialize in Tranistion & Mutability matrixes | |
71 substitution <- initializeSubstitutionMatrix(substitutionModel,species) | |
72 mutability <- initializeMutabilityMatrix(mutabilityModel,species) | |
73 | |
74 # FWR/CDR boundaries | |
75 flagTrim <- F | |
76 if( is.na(region[7])){ | |
77 flagTrim <- T | |
78 region[7]<-region[6] | |
79 } | |
80 readStart = min(region,na.rm=T) | |
81 readEnd = max(region,na.rm=T) | |
82 if(readStart>1){ | |
83 region = region - (readStart - 1) | |
84 } | |
85 region_Nuc = c( (region[1]*3-2) , (region[2:7]*3) ) | |
86 region_Cod = region | |
87 | |
88 readStart = (readStart*3)-2 | |
89 readEnd = (readEnd*3) | |
90 | |
91 FWR_Nuc <- c( rep(TRUE,(region_Nuc[2])), | |
92 rep(FALSE,(region_Nuc[3]-region_Nuc[2])), | |
93 rep(TRUE,(region_Nuc[4]-region_Nuc[3])), | |
94 rep(FALSE,(region_Nuc[5]-region_Nuc[4])), | |
95 rep(TRUE,(region_Nuc[6]-region_Nuc[5])), | |
96 rep(FALSE,(region_Nuc[7]-region_Nuc[6])) | |
97 ) | |
98 CDR_Nuc <- (1-FWR_Nuc) | |
99 CDR_Nuc <- as.logical(CDR_Nuc) | |
100 FWR_Nuc_Mat <- matrix( rep(FWR_Nuc,4), ncol=length(FWR_Nuc), nrow=4, byrow=T) | |
101 CDR_Nuc_Mat <- matrix( rep(CDR_Nuc,4), ncol=length(CDR_Nuc), nrow=4, byrow=T) | |
102 | |
103 FWR_Codon <- c( rep(TRUE,(region[2])), | |
104 rep(FALSE,(region[3]-region[2])), | |
105 rep(TRUE,(region[4]-region[3])), | |
106 rep(FALSE,(region[5]-region[4])), | |
107 rep(TRUE,(region[6]-region[5])), | |
108 rep(FALSE,(region[7]-region[6])) | |
109 ) | |
110 CDR_Codon <- (1-FWR_Codon) | |
111 CDR_Codon <- as.logical(CDR_Codon) | |
112 | |
113 | |
114 # Read input FASTA file | |
115 tryCatch( | |
116 inputFASTA <- baseline.read.fasta(inputFilePath, seqtype="DNA",as.string=T,set.attributes=F,forceDNAtolower=F) | |
117 , error = function(ex){ | |
118 cat("Error|Error reading input. Please enter or upload a valid FASTA file.\n") | |
119 q() | |
120 } | |
121 ) | |
122 | |
123 if (length(inputFASTA)==1) { | |
124 cat("Error|Error reading input. Please enter or upload a valid FASTA file.\n") | |
125 q() | |
126 } | |
127 | |
128 # Process sequence IDs/names | |
129 names(inputFASTA) <- sapply(names(inputFASTA),function(x){trim(x)}) | |
130 | |
131 # Convert non nucleotide characters to N | |
132 inputFASTA[length(inputFASTA)] = gsub("\t","",inputFASTA[length(inputFASTA)]) | |
133 inputFASTA <- lapply(inputFASTA,replaceNonFASTAChars) | |
134 | |
135 # Process the FASTA file and conver to Matrix[inputSequence, germlineSequence] | |
136 processedInput <- processInputAdvanced(inputFASTA) | |
137 matInput <- processedInput[[1]] | |
138 germlines <- processedInput[[2]] | |
139 lenGermlines = length(unique(germlines)) | |
140 groups <- processedInput[[3]] | |
141 lenGroups = length(unique(groups)) | |
142 rm(processedInput) | |
143 rm(inputFASTA) | |
144 | |
145 # # remove clones with less than 2 seqeunces | |
146 # tableGL <- table(germlines) | |
147 # singletons <- which(tableGL<8) | |
148 # rowsToRemove <- match(singletons,germlines) | |
149 # if(any(rowsToRemove)){ | |
150 # matInput <- matInput[-rowsToRemove,] | |
151 # germlines <- germlines[-rowsToRemove] | |
152 # groups <- groups[-rowsToRemove] | |
153 # } | |
154 # | |
155 # # remove unproductive seqs | |
156 # nonFuctionalSeqs <- sapply(rownames(matInput),function(x){any(grep("unproductive",x))}) | |
157 # if(any(nonFuctionalSeqs)){ | |
158 # if(sum(nonFuctionalSeqs)==length(germlines)){ | |
159 # write.table("Unproductive",file=paste(outputPath,outputID,".txt",sep=""),quote=F,sep="\t",row.names=F,col.names=T) | |
160 # q() | |
161 # } | |
162 # matInput <- matInput[-which(nonFuctionalSeqs),] | |
163 # germlines <- germlines[-which(nonFuctionalSeqs)] | |
164 # germlines[1:length(germlines)] <- 1:length(germlines) | |
165 # groups <- groups[-which(nonFuctionalSeqs)] | |
166 # } | |
167 # | |
168 # if(class(matInput)=="character"){ | |
169 # write.table("All unproductive seqs",file=paste(outputPath,outputID,".txt",sep=""),quote=F,sep="\t",row.names=F,col.names=T) | |
170 # q() | |
171 # } | |
172 # | |
173 # if(nrow(matInput)<10 | is.null(nrow(matInput))){ | |
174 # write.table(paste(nrow(matInput), "seqs only",sep=""),file=paste(outputPath,outputID,".txt",sep=""),quote=F,sep="\t",row.names=F,col.names=T) | |
175 # q() | |
176 # } | |
177 | |
178 # replace leading & trailing "-" with "N: | |
179 matInput <- t(apply(matInput,1,replaceLeadingTrailingDashes,readEnd)) | |
180 | |
181 # Trim (nucleotide) input sequences to the last codon | |
182 #matInput[,1] <- apply(matrix(matInput[,1]),1,trimToLastCodon) | |
183 | |
184 # # Check for Indels | |
185 # if(fixIndels){ | |
186 # delPos <- fixDeletions(matInput) | |
187 # insPos <- fixInsertions(matInput) | |
188 # }else{ | |
189 # # Check for indels | |
190 # indelPos <- checkForInDels(matInput) | |
191 # indelPos <- apply(cbind(indelPos[[1]],indelPos[[2]]),1,function(x){(x[1]==T & x[2]==T)}) | |
192 # } | |
193 | |
194 # If indels are present, remove mutations in the seqeunce & throw warning at end | |
195 #matInput[indelPos,] <- apply(matrix(matInput[indelPos,],nrow=sum(indelPos),ncol=2),1,function(x){x[1]=x[2]; return(x) }) | |
196 | |
197 colnames(matInput)=c("Input","Germline") | |
198 | |
199 # If seqeunces are clonal, create effective sequence for each clone & modify germline/group definitions | |
200 germlinesOriginal = NULL | |
201 if(clonal){ | |
202 germlinesOriginal <- germlines | |
203 collapseCloneResults <- tapply(1:nrow(matInput),germlines,function(i){ | |
204 collapseClone(matInput[i,1],matInput[i[1],2],readEnd,nonTerminalOnly=(clonal-1)) | |
205 }) | |
206 matInput = t(sapply(collapseCloneResults,function(x){return(x[[1]])})) | |
207 names_groups = tapply(groups,germlines,function(x){names(x[1])}) | |
208 groups = tapply(groups,germlines,function(x){array(x[1],dimnames=names(x[1]))}) | |
209 names(groups) = names_groups | |
210 | |
211 names_germlines = tapply(germlines,germlines,function(x){names(x[1])}) | |
212 germlines = tapply( germlines,germlines,function(x){array(x[1],dimnames=names(x[1]))} ) | |
213 names(germlines) = names_germlines | |
214 matInputErrors = sapply(collapseCloneResults,function(x){return(x[[2]])}) | |
215 } | |
216 | |
217 | |
218 # Selection Analysis | |
219 | |
220 | |
221 # if (length(germlines)>sequenceLimit) { | |
222 # # Code to parallelize processing goes here | |
223 # stop( paste("Error: Cannot process more than ", Upper_limit," sequences",sep="") ) | |
224 # } | |
225 | |
226 # if (length(germlines)<sequenceLimit) {} | |
227 | |
228 # Compute expected mutation frequencies | |
229 matExpected <- getExpectedIndividual(matInput) | |
230 | |
231 # Count observed number of mutations in the different regions | |
232 mutations <- lapply( 1:nrow(matInput), function(i){ | |
233 #cat(i,"\n") | |
234 seqI = s2c(matInput[i,1]) | |
235 seqG = s2c(matInput[i,2]) | |
236 matIGL = matrix(c(seqI,seqG),ncol=length(seqI),nrow=2,byrow=T) | |
237 retVal <- NA | |
238 tryCatch( | |
239 retVal <- analyzeMutations2NucUri(matIGL) | |
240 , error = function(ex){ | |
241 retVal <- NA | |
242 } | |
243 ) | |
244 | |
245 | |
246 return( retVal ) | |
247 }) | |
248 | |
249 matObserved <- t(sapply( mutations, processNucMutations2 )) | |
250 numberOfSeqsWithMutations <- numberOfSeqsWithMutations(matObserved, testID) | |
251 | |
252 #if(sum(numberOfSeqsWithMutations)==0){ | |
253 # write.table("No mutated sequences",file=paste(outputPath,outputID,".txt",sep=""),quote=F,sep="\t",row.names=F,col.names=T) | |
254 # q() | |
255 #} | |
256 | |
257 matMutationInfo <- cbind(matObserved,matExpected) | |
258 rm(matObserved,matExpected) | |
259 | |
260 | |
261 #Bayesian PDFs | |
262 bayes_pdf = computeBayesianScore(matMutationInfo, test=testName, max_sigma=20,length_sigma=4001) | |
263 bayesPDF_cdr = bayes_pdf[[1]] | |
264 bayesPDF_fwr = bayes_pdf[[2]] | |
265 rm(bayes_pdf) | |
266 | |
267 bayesPDF_germlines_cdr = tapply(bayesPDF_cdr,germlines,function(x) groupPosteriors(x,length_sigma=4001)) | |
268 bayesPDF_germlines_fwr = tapply(bayesPDF_fwr,germlines,function(x) groupPosteriors(x,length_sigma=4001)) | |
269 | |
270 bayesPDF_groups_cdr = tapply(bayesPDF_cdr,groups,function(x) groupPosteriors(x,length_sigma=4001)) | |
271 bayesPDF_groups_fwr = tapply(bayesPDF_fwr,groups,function(x) groupPosteriors(x,length_sigma=4001)) | |
272 | |
273 if(lenGroups>1){ | |
274 groups <- c(groups,lenGroups+1) | |
275 names(groups)[length(groups)] = "All sequences combined" | |
276 bayesPDF_groups_cdr[[lenGroups+1]] = groupPosteriors(bayesPDF_groups_cdr,length_sigma=4001) | |
277 bayesPDF_groups_fwr[[lenGroups+1]] = groupPosteriors(bayesPDF_groups_fwr,length_sigma=4001) | |
278 } | |
279 | |
280 #Bayesian Outputs | |
281 bayes_cdr = t(sapply(bayesPDF_cdr,calcBayesOutputInfo)) | |
282 bayes_fwr = t(sapply(bayesPDF_fwr,calcBayesOutputInfo)) | |
283 bayes_germlines_cdr = t(sapply(bayesPDF_germlines_cdr,calcBayesOutputInfo)) | |
284 bayes_germlines_fwr = t(sapply(bayesPDF_germlines_fwr,calcBayesOutputInfo)) | |
285 bayes_groups_cdr = t(sapply(bayesPDF_groups_cdr,calcBayesOutputInfo)) | |
286 bayes_groups_fwr = t(sapply(bayesPDF_groups_fwr,calcBayesOutputInfo)) | |
287 | |
288 #P-values | |
289 simgaP_cdr = sapply(bayesPDF_cdr,computeSigmaP) | |
290 simgaP_fwr = sapply(bayesPDF_fwr,computeSigmaP) | |
291 | |
292 simgaP_germlines_cdr = sapply(bayesPDF_germlines_cdr,computeSigmaP) | |
293 simgaP_germlines_fwr = sapply(bayesPDF_germlines_fwr,computeSigmaP) | |
294 | |
295 simgaP_groups_cdr = sapply(bayesPDF_groups_cdr,computeSigmaP) | |
296 simgaP_groups_fwr = sapply(bayesPDF_groups_fwr,computeSigmaP) | |
297 | |
298 | |
299 #Format output | |
300 | |
301 # Round expected mutation frequencies to 3 decimal places | |
302 matMutationInfo[germlinesOriginal[indelPos],] = NA | |
303 if(nrow(matMutationInfo)==1){ | |
304 matMutationInfo[5:8] = round(matMutationInfo[,5:8]/sum(matMutationInfo[,5:8],na.rm=T),3) | |
305 }else{ | |
306 matMutationInfo[,5:8] = t(round(apply(matMutationInfo[,5:8],1,function(x){ return(x/sum(x,na.rm=T)) }),3)) | |
307 } | |
308 | |
309 listPDFs = list() | |
310 nRows = length(unique(groups)) + length(unique(germlines)) + length(groups) | |
311 | |
312 matOutput = matrix(NA,ncol=18,nrow=nRows) | |
313 rowNumb = 1 | |
314 for(G in unique(groups)){ | |
315 #print(G) | |
316 matOutput[rowNumb,c(1,2,11:18)] = c("Group",names(groups)[groups==G][1],bayes_groups_cdr[G,],bayes_groups_fwr[G,],simgaP_groups_cdr[G],simgaP_groups_fwr[G]) | |
317 listPDFs[[rowNumb]] = list("CDR"=bayesPDF_groups_cdr[[G]],"FWR"=bayesPDF_groups_fwr[[G]]) | |
318 names(listPDFs)[rowNumb] = names(groups[groups==paste(G)])[1] | |
319 #if(names(groups)[which(groups==G)[1]]!="All sequences combined"){ | |
320 gs = unique(germlines[groups==G]) | |
321 rowNumb = rowNumb+1 | |
322 if( !is.na(gs) ){ | |
323 for( g in gs ){ | |
324 matOutput[rowNumb,c(1,2,11:18)] = c("Germline",names(germlines)[germlines==g][1],bayes_germlines_cdr[g,],bayes_germlines_fwr[g,],simgaP_germlines_cdr[g],simgaP_germlines_fwr[g]) | |
325 listPDFs[[rowNumb]] = list("CDR"=bayesPDF_germlines_cdr[[g]],"FWR"=bayesPDF_germlines_fwr[[g]]) | |
326 names(listPDFs)[rowNumb] = names(germlines[germlines==paste(g)])[1] | |
327 rowNumb = rowNumb+1 | |
328 indexesOfInterest = which(germlines==g) | |
329 numbSeqsOfInterest = length(indexesOfInterest) | |
330 rowNumb = seq(rowNumb,rowNumb+(numbSeqsOfInterest-1)) | |
331 matOutput[rowNumb,] = matrix( c( rep("Sequence",numbSeqsOfInterest), | |
332 rownames(matInput)[indexesOfInterest], | |
333 c(matMutationInfo[indexesOfInterest,1:4]), | |
334 c(matMutationInfo[indexesOfInterest,5:8]), | |
335 c(bayes_cdr[indexesOfInterest,]), | |
336 c(bayes_fwr[indexesOfInterest,]), | |
337 c(simgaP_cdr[indexesOfInterest]), | |
338 c(simgaP_fwr[indexesOfInterest]) | |
339 ), ncol=18, nrow=numbSeqsOfInterest,byrow=F) | |
340 increment=0 | |
341 for( ioi in indexesOfInterest){ | |
342 listPDFs[[min(rowNumb)+increment]] = list("CDR"=bayesPDF_cdr[[ioi]] , "FWR"=bayesPDF_fwr[[ioi]]) | |
343 names(listPDFs)[min(rowNumb)+increment] = rownames(matInput)[ioi] | |
344 increment = increment + 1 | |
345 } | |
346 rowNumb=max(rowNumb)+1 | |
347 | |
348 } | |
349 } | |
350 } | |
351 colsToFormat = 11:18 | |
352 matOutput[,colsToFormat] = formatC( matrix(as.numeric(matOutput[,colsToFormat]), nrow=nrow(matOutput), ncol=length(colsToFormat)) , digits=3) | |
353 matOutput[matOutput== " NaN"] = NA | |
354 | |
355 | |
356 | |
357 colnames(matOutput) = c("Type", "ID", "Observed_CDR_R", "Observed_CDR_S", "Observed_FWR_R", "Observed_FWR_S", | |
358 "Expected_CDR_R", "Expected_CDR_S", "Expected_FWR_R", "Expected_FWR_S", | |
359 paste( rep(testName,6), rep(c("Sigma","CIlower","CIupper"),2),rep(c("CDR","FWR"),each=3), sep="_"), | |
360 paste( rep(testName,2), rep("P",2),c("CDR","FWR"), sep="_") | |
361 ) | |
362 fileName = paste(outputPath,outputID,".txt",sep="") | |
363 write.table(matOutput,file=fileName,quote=F,sep="\t",row.names=T,col.names=NA) | |
364 fileName = paste(outputPath,outputID,".RData",sep="") | |
365 save(listPDFs,file=fileName) | |
366 | |
367 indelWarning = FALSE | |
368 if(sum(indelPos)>0){ | |
369 indelWarning = "<P>Warning: The following sequences have either gaps and/or deletions, and have been ommited from the analysis."; | |
370 indelWarning = paste( indelWarning , "<UL>", sep="" ) | |
371 for(indels in names(indelPos)[indelPos]){ | |
372 indelWarning = paste( indelWarning , "<LI>", indels, "</LI>", sep="" ) | |
373 } | |
374 indelWarning = paste( indelWarning , "</UL></P>", sep="" ) | |
375 } | |
376 | |
377 cloneWarning = FALSE | |
378 if(clonal==1){ | |
379 if(sum(matInputErrors)>0){ | |
380 cloneWarning = "<P>Warning: The following clones have sequences of unequal length."; | |
381 cloneWarning = paste( cloneWarning , "<UL>", sep="" ) | |
382 for(clone in names(matInputErrors)[matInputErrors]){ | |
383 cloneWarning = paste( cloneWarning , "<LI>", names(germlines)[as.numeric(clone)], "</LI>", sep="" ) | |
384 } | |
385 cloneWarning = paste( cloneWarning , "</UL></P>", sep="" ) | |
386 } | |
387 } | |
388 cat(paste("Success",outputID,indelWarning,cloneWarning,sep="|")) |