annotate NMF/NMF-working-2-5-20.R @ 4:220d4359ec9b draft

Uploaded
author jfb
date Thu, 06 Feb 2020 14:20:36 -0500
parents
children
Ignore whitespace changes - Everywhere: Within whitespace: At end of lines:
rev   line source
4
220d4359ec9b Uploaded
jfb
parents:
diff changeset
# Name of the CSV file the resulting motifs are written to.
NAMEOFOUTPUTFILE <- "output1.csv"

# Once you've used the other script to turn the FASTA into a table, copy-paste
# the file path and name of that table into this line between the quote marks.
# Column 1 is used below as the protein name and column 3 as the amino acid
# sequence.
SuperAwesometrial <- read.delim2("input1.tabular", header = FALSE)

# Read without a header and transposed, so that the first row of the file
# becomes the first column; the accession numbers are taken from it below.
SBF <- read.csv("input3.csv", stringsAsFactors = FALSE, header = FALSE)
SBF <- t(SBF)

# Because of how R reads the file, the motifs must have blank cells (not
# spaces) where there is no letter in the motif.
# colClasses = "character" keeps every cell as text. Without it, a column that
# holds only "T"/"F" (or only blanks) is converted to logical and later pasted
# into the motif as "TRUE"/"FALSE"/"NA" instead of the residue letters.
PositiveMotifs <- read.csv(
  "input2.csv",
  stringsAsFactors = FALSE,
  colClasses = "character"
)

# Overwrite the central column with "xY": the "x" is a marker that the padding
# loop further down uses to locate the central Y.
YsToim <- rep("xY", times = nrow(PositiveMotifs))
PositiveMotifs[, 11] <- YsToim

################################################################################
# The motif letters have to be pasted, then split and unlisted, then the "x"
# found, then pasted again.
Positive9Letters <- PositiveMotifs[, 4:18]
PositiveTrueMotifs <- c()

# Accession numbers: first column of the transposed table, skipping its first
# entry, with missing values dropped.
AccessionNumbers <- as.character(SBF[2:nrow(SBF), 1])
AccessionNumbers <- AccessionNumbers[!is.na(AccessionNumbers)]

# Every protein name in the FASTA table, as plain strings.
ALLPOSSIBLE <- as.character(SuperAwesometrial[, 1])
220d4359ec9b Uploaded
jfb
parents:
diff changeset
29 ################################################################################################################################
220d4359ec9b Uploaded
jfb
parents:
diff changeset
30
220d4359ec9b Uploaded
jfb
parents:
diff changeset
# Turn every row of positive motif letters into one string in which the
# central Y has seven positions on each side, padding with spaces where the
# motif is shorter. The central cell was set to "xY" earlier in the script, so
# the "x" marks where the central Y sits.
n_positive <- nrow(Positive9Letters)

# Preallocate the result instead of growing it with c() on every iteration.
PositiveTrueMotifs <- character(n_positive)

# seq_len() yields an empty loop for zero rows; 1:nrow() would iterate over
# c(1, 0) and fail on the non-existent row.
for (q in seq_len(n_positive)) {
  # Paste the cells together, then split into single characters.
  motifmotif <- paste(Positive9Letters[q, ], collapse = "")
  motifmotif <- unlist(strsplit(motifmotif, split = ""))

  # Count the letters on each side of the marker. The right-hand count
  # subtracts one more to skip the central Y that follows the "x".
  position <- match("x", motifmotif)
  LeftJust <- position - 1
  RightJust <- length(motifmotif) - position - 1

  LeftSpaces <- rep(" ", times = 7 - LeftJust)
  RightSpaces <- rep(" ", times = 7 - RightJust)

  # Drop the marker, pad both sides and glue everything back into one string.
  motifmotif <- motifmotif[!motifmotif %in% "x"]
  PositiveTrueMotifs[q] <- paste(
    c(LeftSpaces, motifmotif, RightSpaces),
    collapse = ""
  )
}
220d4359ec9b Uploaded
jfb
parents:
diff changeset
53
220d4359ec9b Uploaded
jfb
parents:
diff changeset
54
220d4359ec9b Uploaded
jfb
parents:
diff changeset
55
220d4359ec9b Uploaded
jfb
parents:
diff changeset
56 ################################################################################################################################
220d4359ec9b Uploaded
jfb
parents:
diff changeset
################################################################################
# Collect a 15-character motif around every Y (tyrosine) of every protein in
# the FASTA table whose name matches one of the accession numbers.
#
# Results:
#   allmotifs - one-column character matrix; row 1 is the header "Motifs",
#               followed by one row per motif found.
#   thenames  - one-column character matrix; row 1 is the header
#               "AccessionNumbers", followed by the protein name each motif
#               came from.
# Both are sized to the number of motifs actually found, so there is no fixed
# upper limit on how many motifs can be stored.
################################################################################

# Return every window of `flank` residues either side of a "Y" in `sequence`,
# padded with spaces where the window runs past either end of the protein.
extract_y_motifs <- function(sequence, flank = 7L) {
  if (is.na(sequence)) {
    return(character(0))
  }
  y_positions <- as.integer(gregexpr("Y", sequence, fixed = TRUE)[[1]])
  if (y_positions[1] == -1L) {
    # gregexpr() reports "no match" as -1
    return(character(0))
  }
  # Padding the whole sequence once means a Y at position j of the original
  # sits at j + flank, so its window is simply j .. j + 2 * flank.
  padding <- strrep(" ", flank)
  padded <- paste0(padding, sequence, padding)
  substring(padded, y_positions, y_positions + 2L * flank)
}

# An NA or empty accession number would add an empty alternative to the
# regular expression, and an empty alternative matches every protein.
accession_patterns <- AccessionNumbers[
  !is.na(AccessionNumbers) & nzchar(AccessionNumbers)
]

# Rows of the FASTA table whose name contains any of the accession numbers.
if (length(accession_patterns) > 0) {
  locations <- grep(paste(accession_patterns, collapse = "|"), ALLPOSSIBLE)
} else {
  locations <- integer(0)
}

# The name of each protein is in the first column; names may contain commas,
# so remove them.
protein_names <- as.character(SuperAwesometrial[locations, 1])
protein_names <- gsub(",", "", protein_names, fixed = TRUE)

# The amino acids are stored in the third column.
sequences <- as.character(SuperAwesometrial[locations, 3])

# One character vector of motifs per protein (possibly empty).
motifs_by_protein <- lapply(sequences, extract_y_motifs)
found_motifs <- unlist(motifs_by_protein, use.names = FALSE)

# Repeat each protein name once per motif so names and motifs stay aligned.
found_names <- rep(protein_names, times = lengths(motifs_by_protein))

# Keep the header entries in row 1, as the output step expects.
allmotifs <- matrix(c("Motifs", found_motifs), ncol = 1)
thenames <- matrix(c("AccessionNumbers", found_names), ncol = 1)

# Index of the next free row, kept for compatibility with the original script.
MotifNumber <- nrow(allmotifs) + 1
220d4359ec9b Uploaded
jfb
parents:
diff changeset
161
220d4359ec9b Uploaded
jfb
parents:
diff changeset
162
220d4359ec9b Uploaded
jfb
parents:
diff changeset
163
220d4359ec9b Uploaded
jfb
parents:
diff changeset
164
220d4359ec9b Uploaded
jfb
parents:
diff changeset
165 ################################################################################################################################
220d4359ec9b Uploaded
jfb
parents:
diff changeset
166 ################################################################################################################################
220d4359ec9b Uploaded
jfb
parents:
diff changeset
167 ################################################################################################################################
220d4359ec9b Uploaded
jfb
parents:
diff changeset
168
220d4359ec9b Uploaded
jfb
parents:
diff changeset
169
220d4359ec9b Uploaded
jfb
parents:
diff changeset
170 # for (i in 1:nrow(SuperAwesometrial)){
220d4359ec9b Uploaded
jfb
parents:
diff changeset
171 #
220d4359ec9b Uploaded
jfb
parents:
diff changeset
172 # }
220d4359ec9b Uploaded
jfb
parents:
diff changeset
173
220d4359ec9b Uploaded
jfb
parents:
diff changeset
# Label every motif with the protein name stored at the same position, so the
# pairing survives the filtering steps below.
names(allmotifs) <- thenames

# Keep only the first occurrence of each motif string.
is_first_occurrence <- !duplicated(allmotifs)
truemotifs <- allmotifs[is_first_occurrence]

# Discard every motif that also appears among the positive motifs.
is_known_positive <- truemotifs %in% PositiveTrueMotifs
truemotifs <- truemotifs[!is_known_positive]

# Two columns: protein name, then motif. Commas are stripped from both so they
# cannot be mistaken for the field separator in the CSV.
outputfile <- gsub(",", "", cbind(names(truemotifs), truemotifs))

# Append the rows to the output file without quoting, row names or a header.
write.table(
  outputfile,
  file = NAMEOFOUTPUTFILE,
  quote = FALSE,
  sep = ",",
  row.names = FALSE,
  col.names = FALSE,
  na = "",
  append = TRUE
)