Mercurial > repos > jfb > difference_finder
comparison all stuff/Difference finder for GalaxyP working.R @ 9:033dd86d3e0c draft
Uploaded
author | jfb |
---|---|
date | Fri, 21 Feb 2020 13:07:45 -0500 |
parents | |
children |
comparison
equal
deleted
inserted
replaced
8:4dd15c41d9e7 | 9:033dd86d3e0c |
---|---|
# ---- User options ----------------------------------------------------------
# Put "YES" (all caps) here if you want ONLY FULL MOTIFS.
FullMotifsOnly_questionmark <- "NO"
# Put "YES" (all caps) here if you want ONLY TRUNCATED MOTIFS.
TruncatedMotifsOnly_questionmark <- "NO"
# Put "YES" (all caps) to find the overlap between the two sets, or "NO"
# (all caps) to find the non-overlap.
# NOTE(review): nothing in the visible script reads this flag - confirm
# whether the overlap mode is implemented elsewhere.
Are_You_Looking_For_Commonality <- "NO"

# ---- Input files -----------------------------------------------------------
# Substrate tables: every column is read as character.
FirstSubstrateSet <- read.csv(
  "S1.csv",
  stringsAsFactors = FALSE,
  colClasses = "character"
)
SecondSubstrateSet <- read.csv(
  "S2.csv",
  stringsAsFactors = FALSE,
  colClasses = "character"
)

# Substrate background frequency tables: read without a header row.
Firstsubbackfreq <- read.csv(
  "SBF1.csv",
  header = FALSE,
  stringsAsFactors = FALSE
)
Secondsubbackfreq <- read.csv(
  "SBF2.csv",
  header = FALSE,
  stringsAsFactors = FALSE
)

# ---- Output file names -----------------------------------------------------
First_unshared_motifs_table <- "1RS.csv"
First_unshared_subbackfreq <- "1RSBF.csv"

Second_unshared_motifs_table <- "2RS.csv"
Second_unshared_subbackfreq <- "2RSBF.csv"

# ---- Fallback headers ------------------------------------------------------
# Column names of the first substrate table as a one-row matrix; written as
# the only line of a motif table when a set has no unshared motifs.
EmptySubHeader <- matrix(colnames(FirstSubstrateSet), nrow = 1)
# First column of the first background frequency table.
EmptySBFHeader <- Firstsubbackfreq[, 1]
26 | |
27 #final note, this code is going to be unworkable if you want to make a Venn diagram of more than 3 circles. I think I'll poke around | |
28 #other languages to see if any of them can do it. | |
29 #################################################################################################################################### | |
30 | |
31 | |
# ---- Mark the central residue of every substrate ---------------------------
# Column 11 of each substrate table is the central residue of the motif.
# Every cell in that column that contains S, T or Y (case-insensitive) is
# rewritten as "xS", "xT" or "xY". The lower-case "x" is the marker the motif
# builder further down searches for to locate the centre of each motif.

# Return `central_letters` with S/T/Y cells replaced by their marked form.
#
# central_letters: character vector of central-residue cells.
# Returns a character vector of the same length.
mark_central_residue <- function(central_letters) {
  # grepl() is vectorised and always returns a logical vector, including for
  # zero-length input (sapply() would return an empty list there, which
  # cannot be used as a subscript).
  # All three masks are taken from the untouched input, so when a cell
  # matches more than one letter the later replacement wins (Y over T over S).
  has_s <- grepl("S", central_letters, ignore.case = TRUE)
  has_t <- grepl("T", central_letters, ignore.case = TRUE)
  has_y <- grepl("Y", central_letters, ignore.case = TRUE)

  central_letters[has_s] <- "xS"
  central_letters[has_t] <- "xT"
  central_letters[has_y] <- "xY"
  central_letters
}

FirstCentralLetters <- mark_central_residue(FirstSubstrateSet[, 11])
SecondCentralLetters <- mark_central_residue(SecondSubstrateSet[, 11])

FirstSubstrateSet[, 11] <- FirstCentralLetters
SecondSubstrateSet[, 11] <- SecondCentralLetters
53 | |
54 #################################################################################################################################### | |
55 #################################################################################################################################### | |
56 # better version of this code written in C: what happens when two kinases share a motif, but they found that motif in two | |
57 # separate proteins thus two separate accession numbers? | |
58 # It should actually output the shared motif and BOTH accession numbers. Right now it does not, it only maps out the second | |
59 # accession number. So that needs to be fixed BUT you need to keep the commonality between a motif and its accession number | |
60 #################################################################################################################################### | |
61 #################################################################################################################################### | |
62 #################################################################################################################################### | |
63 #################################################################################################################################### | |
64 | |
65 #Create the motif sets, deciding whether or not you're looking for truncated or full here | |
66 #full only | |
67 | |
68 | |
69 ############################################### | |
70 #ALL motifs, full and truncated | |
71 | |
# ---- Build the motif string for every substrate row ------------------------
# ALL motifs, full and truncated.

# Turn each row of a substrate table into a fixed-width motif string.
#
# For every row, columns 4 to 18 are concatenated (cells equal to "XXXXX" are
# skipped). The lower-case "x" marker that precedes the central residue is
# located, each side is padded with blanks up to 7 positions, and the marker
# itself is removed, so a complete row yields 7 + 1 + 7 = 15 characters.
#
# substrate_set: substrate table whose column 11 has already been marked
#                with "xS"/"xT"/"xY"; column 3 holds the accession number.
# Returns a list with two one-column character matrices of nrow(substrate_set)
# rows: `motifs` and `accessions`.
build_padded_motifs <- function(substrate_set) {
  flank <- 7 # positions kept on each side of the central residue

  pad_row <- function(i) {
    cells <- unlist(substrate_set[i, 4:18], use.names = FALSE)
    cells <- cells[cells != "XXXXX"]
    residues <- unlist(strsplit(paste(cells, collapse = ""), split = ""))

    # The marker position gives the number of residues on its left: a marker
    # at position 4 has 3 residues before it.
    marker <- match("x", residues)
    if (is.na(marker)) {
      # Without a marker the flank sizes are NA and nothing sensible can be
      # built; fail with the offending row instead of an opaque `if` error.
      stop(
        "Row ", i, " has no marked central residue ",
        "(expected S, T or Y in column 11)",
        call. = FALSE
      )
    }
    n_left <- marker - 1
    # The central residue sits right after the marker, so the residues to
    # its right number length - marker - 1 (length 5, marker at 3 -> 1).
    n_right <- length(residues) - marker - 1

    # A side that is already complete gets no padding.
    padded <- c(
      rep(" ", times = max(flank - n_left, 0)),
      residues,
      rep(" ", times = max(flank - n_right, 0))
    )
    # Drop the marker; only the residues and the blanks remain.
    padded <- padded[!padded %in% "x"]
    paste(padded, collapse = "")
  }

  # seq_len() keeps an empty table from being iterated as c(1, 0).
  motifs <- vapply(seq_len(nrow(substrate_set)), pad_row, character(1))

  list(
    motifs = matrix(motifs, ncol = 1),
    accessions = matrix(substrate_set[, 3], ncol = 1)
  )
}

if (FullMotifsOnly_questionmark != "YES" &&
    TruncatedMotifsOnly_questionmark != "YES") {
  first_built <- build_padded_motifs(FirstSubstrateSet)
  FTLwtmotifs <- first_built$motifs
  FTLwtAccessionNumbers <- first_built$accessions

  second_built <- build_padded_motifs(SecondSubstrateSet)
  D835Ymotifs <- second_built$motifs
  D835YAccessionNumbers <- second_built$accessions

  # Each motif carries its accession number as its name, so the link between
  # the two survives the filtering done further down.
  names(FTLwtmotifs) <- FTLwtAccessionNumbers
  names(D835Ymotifs) <- D835YAccessionNumbers
} else {
  # No motif set is built for these options in this script; stop here rather
  # than failing later with "object 'FTLwtmotifs' not found".
  stop(
    "Only the combined full + truncated mode is implemented: set both ",
    "FullMotifsOnly_questionmark and TruncatedMotifsOnly_questionmark ",
    "to \"NO\"",
    call. = FALSE
  )
}
176 | |
177 | |
# Keep the motifs of each set that never occur in the other set, then keep
# only the first occurrence of every remaining motif. Subsetting (rather than
# unique()) is used so the accession-number names stay attached.
FTLwtmotifsFINAL <- FTLwtmotifs[is.na(match(FTLwtmotifs, D835Ymotifs))]
FTLwtmotifsFINAL <- FTLwtmotifsFINAL[which(!duplicated(FTLwtmotifsFINAL))]

D835YmotifsFINAL <- D835Ymotifs[is.na(match(D835Ymotifs, FTLwtmotifs))]
D835YmotifsFINAL <- D835YmotifsFINAL[which(!duplicated(D835YmotifsFINAL))]
183 | |
184 | |
# ---- Write the unshared motifs and their background frequencies ------------

# Write the two output files for one substrate set.
#
# unshared_motifs:  named character vector; values are 15-character motifs,
#                   names are the accession strings they came from.
# subbackfreq:      background frequency table; row 1 holds one accession
#                   number per column.
# table_header:     header row written above the motif rows.
# empty_header:     row written instead when there are no unshared motifs.
# sbf_header:       header row of the background frequency output.
# motifs_file:      output path of the motif table.
# subbackfreq_file: output path of the background frequency table.
#
# Both files are written with append = TRUE, so existing content is kept.
# Returns a list with `motif_table` (header plus motif rows, or NULL when
# there are no unshared motifs) and `frequency_rows` (one row per matched
# background frequency column, possibly zero rows).
write_unshared_outputs <- function(unshared_motifs, subbackfreq, table_header,
                                   empty_header, sbf_header, motifs_file,
                                   subbackfreq_file) {
  write_rows <- function(rows, path) {
    write.table(
      x = rows,
      file = path,
      quote = FALSE, sep = ",",
      row.names = FALSE, col.names = FALSE, na = "", append = TRUE
    )
  }

  accessions <- names(unshared_motifs)
  n_motifs <- length(unshared_motifs)

  # -- Motif table --
  motif_table <- NULL
  if (n_motifs > 0) {
    # One row per motif, one column per position. vapply() insists on exactly
    # 15 characters per motif, which the column layout below relies on.
    residues <- t(vapply(
      unname(unshared_motifs),
      function(motif) strsplit(motif, "")[[1]],
      character(15),
      USE.NAMES = FALSE
    ))

    # Layout: 2 blank columns, accession, 15 positions, 1 blank column.
    motif_table <- cbind(
      matrix(NA, nrow = n_motifs, ncol = 2),
      accessions,
      residues,
      matrix(NA, nrow = n_motifs, ncol = 1)
    )
    dimnames(motif_table) <- NULL

    # Column 11 is the central position; restore its "x" marker. The masks
    # are taken before any replacement, as in the input marking step.
    centre <- motif_table[, 11]
    has_s <- grepl("S", centre, ignore.case = TRUE)
    has_t <- grepl("T", centre, ignore.case = TRUE)
    has_y <- grepl("Y", centre, ignore.case = TRUE)
    centre[has_s] <- "xS"
    centre[has_t] <- "xT"
    centre[has_y] <- "xY"
    motif_table[, 11] <- centre

    motif_table <- rbind(table_header, motif_table)
    write_rows(motif_table, motifs_file)
  } else {
    # NOTE(review): the empty case writes the input column names rather than
    # `table_header`, so the two headers can differ - confirm that is wanted.
    write_rows(empty_header, motifs_file)
  }

  # -- Background frequencies --
  sbf_accessions <- vapply(
    seq_len(ncol(subbackfreq)),
    function(m) as.character(subbackfreq[1, m]),
    character(1)
  )

  # For every motif (in order) collect the columns whose accession number
  # occurs as a fixed substring of the motif's accession string. isTRUE()
  # treats an NA accession as "no match" instead of aborting.
  matched_columns <- unique(unlist(lapply(accessions, function(accession) {
    which(vapply(
      sbf_accessions,
      function(an) isTRUE(grepl(pattern = an, x = accession, fixed = TRUE)),
      logical(1),
      USE.NAMES = FALSE
    ))
  })))

  frequency_rows <- matrix(character(0), nrow = 0, ncol = nrow(subbackfreq))
  if (length(matched_columns) > 0) {
    # Each matched column becomes one output row, built in a single rbind.
    frequency_rows <- do.call(
      rbind,
      lapply(matched_columns, function(m) as.character(subbackfreq[, m]))
    )
    # drop = FALSE keeps a single surviving row as a 1-row matrix; without
    # it the row collapses to a vector and is written as one field per line.
    frequency_rows <- frequency_rows[
      !duplicated(frequency_rows), ,
      drop = FALSE
    ]
  }

  write_rows(sbf_header, subbackfreq_file)
  if (nrow(frequency_rows) > 0) {
    write_rows(frequency_rows, subbackfreq_file)
  }

  list(motif_table = motif_table, frequency_rows = frequency_rows)
}

FLTheader <- c(
  "Substrate", "Species", "Reference",
  "-7", "-6", "-5", "-4", "-3", "-2", "-1", "0",
  "1", "2", "3", "4", "5", "6", "7",
  "Phosphite"
)
D835Yheader <- c(
  "Substrate", "Species", "Reference",
  "-7", "-6", "-5", "-4", "-3", "-2", "-1", "0",
  "1", "2", "3", "4", "5", "6", "7",
  "Phosphite"
)

# Header of both background frequency outputs.
# NOTE(review): the second set's header is also built from Firstsubbackfreq,
# exactly as before - confirm both tables share the same first column.
columnalheader <- matrix(
  c("Accession Numbers", as.character(Firstsubbackfreq[1:35, 1])),
  nrow = 1
)

first_outputs <- write_unshared_outputs(
  unshared_motifs = FTLwtmotifsFINAL,
  subbackfreq = Firstsubbackfreq,
  table_header = FLTheader,
  empty_header = EmptySubHeader,
  sbf_header = columnalheader,
  motifs_file = First_unshared_motifs_table,
  subbackfreq_file = First_unshared_subbackfreq
)
FTLoutputmatrix2 <- first_outputs$motif_table
FTLFinalMatrix <- first_outputs$frequency_rows

second_outputs <- write_unshared_outputs(
  unshared_motifs = D835YmotifsFINAL,
  subbackfreq = Secondsubbackfreq,
  table_header = D835Yheader,
  empty_header = EmptySubHeader,
  sbf_header = columnalheader,
  motifs_file = Second_unshared_motifs_table,
  subbackfreq_file = Second_unshared_subbackfreq
)
D835Youtputmatrix2 <- second_outputs$motif_table
D835YFinalMatrix <- second_outputs$frequency_rows