Mercurial > repos > jfb > commonality_finder
comparison C and D finder/commonality-working jul 13 2020.R @ 4:830e8dc3acac draft default tip
Uploaded
author | jfb |
---|---|
date | Tue, 14 Jul 2020 19:51:12 -0400 |
parents | |
children |
comparison
equal
deleted
inserted
replaced
3:6eacb089b655 | 4:830e8dc3acac |
---|---|
# Read the first substrate list and its substrate background frequency (SBF)
# table. Both files are read with header = FALSE, so their title row arrives
# as the first data row.
FirstSubstrateSet <- read.csv("input1.csv", stringsAsFactors = FALSE, header = FALSE)
Firstsubbackfreq <- read.csv("input2.csv", header = FALSE, stringsAsFactors = FALSE)

# Keep the title row (it is reused when the shared substrates are written
# out), then remove it from the data. Negative indexing also copes with a file
# that holds nothing but the title row, where 2:nrow() would count backwards.
SubstrateHeader <- FirstSubstrateSet[1, ]
FirstSubstrateSet <- FirstSubstrateSet[-1, , drop = FALSE]

# Transpose the SBF table when "Properties" shows up in row 1, column 22.
# The column-count guard leaves tables narrower than 22 columns untouched
# instead of failing on the column-22 lookup.
if (ncol(Firstsubbackfreq) >= 22 &&
    grepl(pattern = "Properties", x = Firstsubbackfreq[1, 22])) {
  Firstsubbackfreq <- t(Firstsubbackfreq)
}
10 | |
11 | |
# Read the second substrate list and its substrate background frequency (SBF)
# table. Both files are read with header = FALSE, so their title row arrives
# as the first data row.
SecondSubstrateSet <- read.csv("input3.csv", stringsAsFactors = FALSE, header = FALSE)
Secondsubbackfreq <- read.csv("input4.csv", header = FALSE, stringsAsFactors = FALSE)

# Remove the title row. Negative indexing also copes with a file that holds
# nothing but the title row, where 2:nrow() would count backwards.
SecondSubstrateSet <- SecondSubstrateSet[-1, , drop = FALSE]

# Transpose the SBF table when "Properties" shows up in row 1, column 22.
# The column-count guard leaves tables narrower than 22 columns untouched
# instead of failing on the column-22 lookup.
if (ncol(Secondsubbackfreq) >= 22 &&
    grepl(pattern = "Properties", x = Secondsubbackfreq[1, 22])) {
  Secondsubbackfreq <- t(Secondsubbackfreq)
}
20 | |
21 | |
# Read the third substrate list and its substrate background frequency (SBF)
# table. Both files are read with header = FALSE, so their title row arrives
# as the first data row.
ThirdSubstrateSet <- read.csv("input5.csv", stringsAsFactors = FALSE, header = FALSE)
Thirdsubbackfreq <- read.csv("input6.csv", header = FALSE, stringsAsFactors = FALSE)

# Remove the title row. Negative indexing also copes with a file that holds
# nothing but the title row, where 2:nrow() would count backwards.
ThirdSubstrateSet <- ThirdSubstrateSet[-1, , drop = FALSE]

# Transpose the SBF table when "Properties" shows up in row 1, column 22.
# The column-count guard leaves tables narrower than 22 columns untouched
# instead of failing on the column-22 lookup.
if (ncol(Thirdsubbackfreq) >= 22 &&
    grepl(pattern = "Properties", x = Thirdsubbackfreq[1, 22])) {
  Thirdsubbackfreq <- t(Thirdsubbackfreq)
}
30 | |
31 #the above three sections bring in the input files and ensure they are properly aligned. The if statements align them if they are misaligned | |
32 | |
# Output file names for the per-input ("unshared") tables.
# NOTE(review): nothing in the visible script writes to these six names yet.
First_unshared_motifs_table <- "R1 substrates.csv"
First_unshared_subbackfreq <- "R1 SBF.csv"

Second_unshared_motifs_table <- "R2 subs.csv"
Second_unshared_subbackfreq <- "R2 SBf.csv"

Third_unshared_motifs_table <- "R3 subs.csv"
Third_unshared_subbackfreq <- "R3 SBF.csv"

# Output file names for the shared motifs and shared SBF table, which the
# write.table() calls at the end of the script require. They are only set
# when the caller has not already supplied them.
# NOTE(review): these default file names are placeholders; confirm them
# against whatever consumes the outputs.
if (!exists("Shared_motifs_table")) {
  Shared_motifs_table <- "Shared substrates.csv"
}
if (!exists("Shared_subbackfreq_table")) {
  Shared_subbackfreq_table <- "Shared SBF.csv"
}

# the above 4 sections create the names of the output files that this tool can create
43 | |
44 | |
45 | |
46 | |
# For each input, overwrite column 11 (where the phospho-tyrosine is always
# housed) with "xY" to denote that it is a phospho- and not a regular tyrosine.
FirstxY <- rep_len("xY", nrow(FirstSubstrateSet))
FirstSubstrateSet[, 11] <- FirstxY

SecondxY <- rep_len("xY", nrow(SecondSubstrateSet))
SecondSubstrateSet[, 11] <- SecondxY

ThirdxY <- rep_len("xY", nrow(ThirdSubstrateSet))
ThirdSubstrateSet[, 11] <- ThirdxY
58 | |
59 | |
# Each substrate is spread over many columns of its table; below, every
# substrate is collapsed into a single string. A plain paste() would lose the
# information about whether a substrate was truncated on its C or N terminal,
# so missing flank positions are filled with spaces instead.

# Collapse every row of a substrate table into one fixed-width motif string.
#
# substrate_set: table whose columns 4:18 hold the motif cells. Cells equal to
#   "XXXXX" are discarded, and the cell marked "xY" supplies the "x" marker
#   that locates the phospho-tyrosine.
# Returns a one-column character matrix with one motif per row of
#   substrate_set (zero rows in, zero rows out).
build_padded_motifs <- function(substrate_set) {
  flank_width <- 7
  motifs <- matrix(NA_character_, nrow = nrow(substrate_set), ncol = 1)

  for (i in seq_len(nrow(substrate_set))) {
    cells <- substrate_set[i, 4:18]
    cells <- cells[cells != "XXXXX"]
    residues <- unlist(strsplit(paste(cells, sep = "", collapse = ""), split = ""))

    # The position of "x" tells how many letters sit to its left: an "x" at
    # position 4 means there are 3 letters before it.
    marker_pos <- match(x = "x", table = residues)
    if (is.na(marker_pos)) {
      stop("row ", i, " has no 'x' phospho marker in columns 4:18", call. = FALSE)
    }
    letters_left <- marker_pos - 1
    # Neither the "x" nor the "Y" that follows it counts towards the right
    # flank: length 5 with "x" at 3 leaves 5 - 3 - 1 = 1 letter after the "Y".
    letters_right <- length(residues) - marker_pos - 1

    # Pad whichever flank is shorter than flank_width; a full-length flank
    # gets no padding (max() keeps the repeat count from going negative).
    left_pad <- rep(" ", times = max(flank_width - letters_left, 0))
    right_pad <- rep(" ", times = max(flank_width - letters_right, 0))

    padded <- c(left_pad, residues, right_pad)
    # Drop the "x" marker itself so only the residues and padding remain.
    padded <- padded[!padded %in% "x"]
    motifs[i, 1] <- paste(padded, sep = "", collapse = "")
  }

  motifs
}

# First set of substrates and the first row of its SBF table (accession numbers).
FTLwtmotifs <- build_padded_motifs(FirstSubstrateSet)
FTLwtAccessionNumbers <- matrix(data = Firstsubbackfreq[1, ], ncol = 1)

# Second set of substrates and accession numbers.
D835Ymotifs <- build_padded_motifs(SecondSubstrateSet)
D835YAccessionNumbers <- matrix(data = Secondsubbackfreq[1, ], ncol = 1)

# Third set of substrates and accession numbers.
ITDmotifs <- build_padded_motifs(ThirdSubstrateSet)
ITDAccessionNumbers <- matrix(data = Thirdsubbackfreq[1, ], ncol = 1)
206 | |
207 | |
# This tool is essentially looking for the intersection of three sets, and it
# gets there one intersection at a time: the second and third sets are
# intersected first, then that result is intersected with the first set.

# Intersection of the substrate motif sets.
SubstrateOverlap1 <- as.matrix(intersect(D835Ymotifs, ITDmotifs))
SubstrateOverlapFINAL <- intersect(FTLwtmotifs, SubstrateOverlap1)

# Intersection of the accession number sets, flattened with unlist().
AccessionOverlap1 <- intersect(D835YAccessionNumbers, ITDAccessionNumbers)
AccessionOverlapFinal <- unlist(intersect(AccessionOverlap1, FTLwtAccessionNumbers))
219 | |
220 | |
# For every shared accession number, go back to the first SBF table and find
# the columns whose first-row entry equals that number. Columns are collected
# accession by accession, left to right within each accession.
sbf_accessions <- unlist(Firstsubbackfreq[1, ], use.names = FALSE)
shared_columns <- unlist(lapply(AccessionOverlapFinal, function(accession) {
  which(sbf_accessions == accession)
}))

# Column-bind the matching columns in one step, so FinalMatrix needs no
# placeholder first column. It stays a matrix for a single match, and becomes
# a zero-column matrix when nothing is shared.
if (length(shared_columns) > 0) {
  FinalMatrix <- do.call(
    cbind,
    lapply(shared_columns, function(column) Firstsubbackfreq[, column])
  )
} else {
  FinalMatrix <- matrix(character(0), nrow = nrow(Firstsubbackfreq), ncol = 0)
}

# Write the shared SBF table in the same layout as the incoming files. Unless
# row 22 of the first selected column already reads "Properties", the first
# column of the original table is put in front. The dimension checks keep the
# [22, 1] lookup from failing on an empty or short result.
has_label_column <- ncol(FinalMatrix) > 0 &&
  nrow(FinalMatrix) >= 22 &&
  grepl(pattern = "Properties", x = FinalMatrix[22, 1])

if (!has_label_column) {
  Outputmatrix <- cbind(Firstsubbackfreq[, 1], FinalMatrix)
  write.table(x = Outputmatrix, file = Shared_subbackfreq_table, quote = FALSE, sep = ",", row.names = FALSE, col.names = FALSE, na = "")
} else {
  write.table(x = FinalMatrix, file = Shared_subbackfreq_table, quote = FALSE, sep = ",", row.names = FALSE, col.names = FALSE, na = "")
}
243 | |
# Start the shared-substrate table from the title row of the first input,
# cut down to its first 18 columns when it is wider than that.
SubstrateMatrix <- SubstrateHeader
if (ncol(SubstrateMatrix) > 18) {
  SubstrateMatrix <- SubstrateMatrix[, 1:18]
}

# One output row per shared motif: three blank leading cells followed by the
# motif split into single characters. Iterating over the motifs themselves
# means that an empty overlap adds no rows at all.
shared_rows <- lapply(SubstrateOverlapFINAL, function(shared_motif) {
  c("", "", "", unlist(strsplit(shared_motif, split = "")))
})

# Append all rows with a single rbind() instead of growing the table row by
# row inside a loop.
if (length(shared_rows) > 0) {
  SubstrateMatrix <- do.call(rbind, c(list(SubstrateMatrix), unname(shared_rows)))
}

write.table(x = SubstrateMatrix, file = Shared_motifs_table, quote = FALSE, sep = ",", row.names = FALSE, col.names = FALSE, na = "")