Mercurial > repos > jfb > commonality_finder
comparison C and D finder/commonality-working feb 21 20.R @ 4:830e8dc3acac draft default tip
Uploaded
author | jfb |
---|---|
date | Tue, 14 Jul 2020 19:51:12 -0400 |
parents | 6eacb089b655 |
children |
comparison
equal
deleted
inserted
replaced
3:6eacb089b655 | 4:830e8dc3acac |
---|---|
# ---- Input loading ----------------------------------------------------------
# Reads three substrate tables (input1/3/5.csv) and their three substrate
# background frequency (SBF) tables (input2/4/6.csv). All files are read with
# header = FALSE, so the first row of each substrate table is its header line;
# the header of the first table is kept in SubstrateHeader and the header row
# is then removed from all three substrate tables.

# Remove the first row of a table.
# Negative indexing is used instead of tbl[2:nrow(tbl), ], because 2:nrow(tbl)
# counts backwards (rows 2, 1) when the table holds only the header line.
drop_header_row <- function(tbl) {
  tbl[-1, , drop = FALSE]
}

# Return the SBF table transposed when the cell in row 1, column 22 contains
# the text "Properties"; otherwise return it unchanged.
# The original guard, nrow(sbf[1, ] > 35), had a misplaced parenthesis and was
# always TRUE, so the marker cell alone decided the outcome. That behaviour is
# kept; the column-count check only prevents indexing a column that does not
# exist.
# NOTE(review): the original author may have intended ncol(sbf) > 35 -- confirm.
orient_sbf <- function(sbf) {
  marker_present <- ncol(sbf) >= 22 &&
    isTRUE(grepl(pattern = "Properties", x = sbf[1, 22]))
  if (marker_present) {
    t(sbf)
  } else {
    sbf
  }
}

FirstSubstrateSet <- read.csv("input1.csv", stringsAsFactors = FALSE,
                              header = FALSE)
Firstsubbackfreq <- read.csv("input2.csv", header = FALSE,
                             stringsAsFactors = FALSE)
SubstrateHeader <- FirstSubstrateSet[1, ]
FirstSubstrateSet <- drop_header_row(FirstSubstrateSet)
Firstsubbackfreq <- orient_sbf(Firstsubbackfreq)


SecondSubstrateSet <- read.csv("input3.csv", stringsAsFactors = FALSE,
                               header = FALSE)
Secondsubbackfreq <- read.csv("input4.csv", header = FALSE,
                              stringsAsFactors = FALSE)
SecondSubstrateSet <- drop_header_row(SecondSubstrateSet)
Secondsubbackfreq <- orient_sbf(Secondsubbackfreq)


ThirdSubstrateSet <- read.csv("input5.csv", stringsAsFactors = FALSE,
                              header = FALSE)
Thirdsubbackfreq <- read.csv("input6.csv", header = FALSE,
                             stringsAsFactors = FALSE)
ThirdSubstrateSet <- drop_header_row(ThirdSubstrateSet)
Thirdsubbackfreq <- orient_sbf(Thirdsubbackfreq)
30 | |
31 | |
32 | |
33 | |
# ---- User options -----------------------------------------------------------
# If you want ONLY FULL MOTIFS, put "YES" here (all caps).
FullMotifsOnly_questionmark <- "NO"
# If you want ONLY TRUNCATED MOTIFS, put "YES" here (all caps).
TruncatedMotifsOnly_questionmark <- "NO"
# Put "YES" (all caps) to find the overlap between the sets, or "NO" (all caps)
# to find the non-overlap.
Are_You_Looking_For_Commonality <- "YES"


# ---- Output file names ------------------------------------------------------
# Files written when looking for the overlap.
Shared_motifs_table <- "sharedmotifs.csv"
Shared_subbackfreq_table <- "sharedSBF.csv"

# Shared_motifs_table<-"Shared motifs 7-27-17.csv"
# Shared_subbackfreq_table<-"SubstrateBackgrounFrequency-for-shared-motifs 4 7-27-17.csv"

# Per-set output names for the non-overlapping results.
First_unshared_motifs_table <- "R1 substrates.csv"
First_unshared_subbackfreq <- "R1 SBF.csv"

Second_unshared_motifs_table <- "R2 subs.csv"
Second_unshared_subbackfreq <- "R2 SBf.csv"

Third_unshared_motifs_table <- "R3 subs.csv"
Third_unshared_subbackfreq <- "R3 SBF.csv"

# Final note: this code is going to be unworkable if you want to make a Venn
# diagram of more than 3 circles. I think I'll poke around other languages to
# see if any of them can do it.
####################################################################################################################################


# ---- Central marker column --------------------------------------------------
# Column 11 of every substrate table is overwritten with the text "xY" on every
# row. The lowercase "x" is what the motif-building code later searches for to
# locate the centre of each motif.
FirstxY <- rep_len("xY", nrow(FirstSubstrateSet))
FirstSubstrateSet[, 11] <- FirstxY

SecondxY <- rep_len("xY", nrow(SecondSubstrateSet))
SecondSubstrateSet[, 11] <- SecondxY

ThirdxY <- rep_len("xY", nrow(ThirdSubstrateSet))
ThirdSubstrateSet[, 11] <- ThirdxY
74 | |
75 | |
76 | |
77 | |
78 | |
79 | |
80 | |
81 | |
82 | |
83 | |
84 | |
85 #################################################################################################################################### | |
86 #################################################################################################################################### | |
87 # better version of this code written in C: what happens when two kinases share a motif, but they found that motif in two | |
88 # separate proteins thus two separate accession numbers? | |
89 # It should actually output the shared motif and BOTH accession numbers. Right now it does not, it only maps out the second | |
90 # accession number. So that needs to be fixed BUT you need to keep the commonality between a motif and its accession number | |
91 #################################################################################################################################### | |
92 #################################################################################################################################### | |
93 #################################################################################################################################### | |
94 #################################################################################################################################### | |
95 | |
96 #Create the motif sets, deciding whether or not you're looking for truncated or full here | |
97 #full only | |
98 | |
# ---- Motif construction -----------------------------------------------------

# Build a one-column character matrix holding one fixed-width motif per row of
# `substrate_set`.
#
# For each row:
#   1. columns 4:18 are taken, cells equal to "XXXXX" are discarded, and the
#      remaining cells are pasted together and split into single characters;
#   2. the first lowercase "x" is located (the script writes "xY" into column
#      11 beforehand, so "x" sits immediately before the central residue);
#   3. the characters before "x" and the characters after the central residue
#      are counted, and each side is padded with spaces up to `flank`
#      characters (a side that already has `flank` or more is left untouched);
#   4. every "x" is removed and the characters are pasted back into one string.
#
# substrate_set: substrate table with the header row already removed.
# flank:         number of positions kept on each side of the central residue.
# Returns a matrix with nrow(substrate_set) rows and 1 column.
#
# Compared with the original three copy-pasted loops this version
#   * iterates with seq_len(), so an empty table yields an empty matrix rather
#     than running the loop body for i = 1 and i = 0;
#   * merges the two mutually exclusive `if` branches, which differed only in
#     whether padding was added;
#   * no longer fails with "invalid 'times' argument" when exactly one side is
#     longer than `flank`;
#   * stops with an explicit message if no "x" marker is present in a row.
build_motifs <- function(substrate_set, flank = 7) {
  motifs <- matrix(NA_character_, nrow = nrow(substrate_set), ncol = 1)

  for (i in seq_len(nrow(substrate_set))) {
    cells <- substrate_set[i, 4:18]
    cells <- cells[cells != "XXXXX"]
    residues <- unlist(strsplit(paste(cells, collapse = ""), split = ""))

    marker <- match(x = "x", table = residues)
    if (is.na(marker)) {
      stop("No 'x' marker found in substrate row ", i, call. = FALSE)
    }

    # A marker at position p has p - 1 characters to its left. The central
    # residue sits at p + 1, so length - p - 1 characters lie to its right.
    n_left <- marker - 1
    n_right <- length(residues) - marker - 1

    padded <- c(
      rep(" ", times = max(0, flank - n_left)),
      residues,
      rep(" ", times = max(0, flank - n_right))
    )
    padded <- padded[!padded %in% "x"]
    motifs[i, 1] <- paste(padded, collapse = "")
  }

  motifs
}

# Motifs of the first substrate set, and the first row of its SBF table
# reshaped into a single column.
FTLwtmotifs <- build_motifs(FirstSubstrateSet)
FTLwtAccessionNumbers <- matrix(data = Firstsubbackfreq[1, ], ncol = 1)

# Same for the second substrate set.
D835Ymotifs <- build_motifs(SecondSubstrateSet)
D835YAccessionNumbers <- matrix(data = Secondsubbackfreq[1, ], ncol = 1)

# Same for the third substrate set.
ITDmotifs <- build_motifs(ThirdSubstrateSet)
ITDAccessionNumbers <- matrix(data = Thirdsubbackfreq[1, ], ncol = 1)
244 | |
245 ############################################################################################################################# | |
246 ############################################################################################################################# | |
247 ############################################################################################################################# | |
248 ############################################################################################################################# | |
249 ############################################################################################################################# | |
250 | |
251 #now look for either commonality or difference. Actually could you look for both... | |
252 | |
# ---- Commonality: motifs and accession numbers shared by all three sets ----
# Writes two files:
#   * Shared_subbackfreq_table: the columns of Firstsubbackfreq whose first-row
#     value appears in the first row of all three SBF tables;
#   * Shared_motifs_table: SubstrateHeader followed by one row per motif that
#     appears in all three motif sets.
#
# Fixes relative to the original loops:
#   * no overlap no longer crashes (1:length(x) iterated over 1, 0);
#   * a single matching column no longer collapses FinalMatrix to a vector;
#   * NA cells in the first SBF row are skipped instead of breaking `if`;
#   * the dead first assignment of `columnalheader` is removed;
#   * columns and rows are bound in one call instead of growing in a loop.
if (Are_You_Looking_For_Commonality == "YES") {

  # Motifs common to the second and third sets, then to all three.
  SubstrateOverlap1 <- as.matrix(intersect(D835Ymotifs, ITDmotifs))
  SubstrateOverlapFINAL <- intersect(FTLwtmotifs, SubstrateOverlap1)

  # First-row SBF values common to the second and third sets, then to all three.
  AccessionOverlap1 <- intersect(D835YAccessionNumbers, ITDAccessionNumbers)
  AccessionOverlapFinal <- intersect(AccessionOverlap1, FTLwtAccessionNumbers)
  AccessionOverlapFinal <- unlist(AccessionOverlapFinal)

  # A 36-row placeholder column fixes the row count of the result: cbind()
  # recycles or truncates the plain vectors bound to it to 36 rows.
  columnalheader <- c(rep(NA, 36))
  placeholder <- matrix(data = columnalheader, ncol = 1)

  # First row of the first SBF table as a plain character vector; this works
  # whether Firstsubbackfreq is still a data frame or was transposed to a matrix.
  first_row <- vapply(
    seq_len(ncol(Firstsubbackfreq)),
    function(j) as.character(Firstsubbackfreq[1, j]),
    character(1)
  )

  # Column indices ordered by shared value first, then by column position,
  # which is the order the original nested loops produced.
  shared_cols <- unlist(lapply(
    AccessionOverlapFinal,
    function(acc) which(first_row == acc)
  ))
  shared_data <- lapply(shared_cols, function(j) Firstsubbackfreq[, j])

  FinalMatrix <- do.call(cbind, c(list(placeholder), shared_data))
  # Drop the placeholder; drop = FALSE keeps a matrix for 0 or 1 columns.
  FinalMatrix <- FinalMatrix[, -1, drop = FALSE]

  # If row 22 of the first output column does not contain "Properties", the
  # first column of Firstsubbackfreq is prepended before writing.
  has_label_column <- ncol(FinalMatrix) > 0 &&
    isTRUE(grepl(pattern = "Properties", x = FinalMatrix[22, 1]))

  if (!has_label_column) {
    Outputmatrix <- cbind(Firstsubbackfreq[, 1], FinalMatrix)
    write.table(x = Outputmatrix, file = Shared_subbackfreq_table,
                quote = FALSE, sep = ",", row.names = FALSE,
                col.names = FALSE, na = "")
  } else {
    write.table(x = FinalMatrix, file = Shared_subbackfreq_table,
                quote = FALSE, sep = ",", row.names = FALSE,
                col.names = FALSE, na = "")
  }

  # Shared motifs table: header row limited to its first 18 columns.
  SubstrateMatrix <- SubstrateHeader
  if (ncol(SubstrateMatrix) > 18) {
    SubstrateMatrix <- SubstrateMatrix[, 1:18]
  }

  # One row per shared motif: three empty cells followed by the motif split
  # into single characters.
  motif_rows <- lapply(SubstrateOverlapFINAL, function(shared_motif) {
    c("", "", "", unlist(strsplit(shared_motif, split = "")))
  })
  SubstrateMatrix <- do.call(rbind, c(list(SubstrateMatrix), motif_rows))

  write.table(x = SubstrateMatrix, file = Shared_motifs_table,
              quote = FALSE, sep = ",", row.names = FALSE,
              col.names = FALSE, na = "")
}