negative_motif_finder_7_7: NMF/NMF-working-1-9-18.R annotate

annotate NMF/NMF-working-1-9-18.R @ 3:a69be20d500d draft

Uploaded

author	jfb
date	Wed, 09 Jan 2019 14:28:43 -0500
parents
children

rev	line source
# ---- Configuration and inputs ----------------------------------------------

# File that the final table of motifs is written to at the end of the script.
NAMEOFOUTPUTFILE <- "output1.csv"

# Tab-delimited protein table without a header row. Per the original author's
# note this is the output of a separate script that converts the FASTA file
# into a table. Later code takes the protein name from column 1 and the
# amino-acid sequence from column 3.
SuperAwesometrial <- read.delim2("input1.tabular", header = FALSE)

# CSV without a header row, transposed so that the first row of the file
# becomes the first column of SBF.
SBF <- read.csv("input3.csv", stringsAsFactors = FALSE, header = FALSE)
SBF <- t(SBF)

# Positive motifs. Positions without a letter must be blank cells in the file,
# not spaces.
# colClasses = "character" stops read.csv() from type-converting the motif
# columns: without it a column that only holds "T"/"F" (Thr/Phe) is read as
# logical TRUE/FALSE and an entirely blank column is read as NA, both of which
# would later be pasted into the motif as the text "TRUE"/"FALSE"/"NA".
PositiveMotifs <- read.csv(
  "input2.csv",
  stringsAsFactors = FALSE,
  colClasses = "character"
)

# Column 11 is overwritten with "xY" for every row. The "x" is the marker the
# normalisation step searches for; the character right after it is treated as
# the centre of the motif.
PositiveMotifs[, 11] <- rep("xY", times = nrow(PositiveMotifs))

# ---- Values derived from the inputs ----------------------------------------

# Columns 4 to 18 are the 15 motif positions (7 + centre + 7, centre in
# column 11).
Positive9Letters <- PositiveMotifs[, 4:18]

# Filled in by the normalisation step that follows.
PositiveTrueMotifs <- c()

# Everything in the first column of the transposed table except its first
# entry is used as an accession number. SBF[-1, 1] is used instead of
# SBF[2:nrow(SBF), 1] because 2:nrow(SBF) counts backwards (2, 1) when SBF has
# a single row.
AccessionNumbers <- as.character(SBF[-1, 1])
AccessionNumbers <- AccessionNumbers[!is.na(AccessionNumbers)]

# Protein names (column 1 of the protein table) that the accession numbers are
# searched in.
ALLPOSSIBLE <- as.character(SuperAwesometrial[, 1])

# ---- Normalise the positive motifs to fixed-width strings -------------------

# Turn one row of motif cells into a single string in which the residue that
# follows the "x" marker sits in the middle, with up to `flank` characters on
# each side and spaces filling whatever is missing.
#
# cells: the motif cells of one row (one-row data frame, list or vector).
# flank: number of positions kept on each side of the centre.
# Returns a single character string; stops if the marker is missing or a side
# is longer than `flank`.
justify_positive_motif <- function(cells, flank = 7L) {
  cells <- as.character(unlist(cells, use.names = FALSE))
  # A missing cell means "no letter here"; without this it would be pasted in
  # as the two letters "NA".
  cells[is.na(cells)] <- ""

  residues <- unlist(strsplit(paste(cells, collapse = ""), split = ""))

  marker <- match("x", residues)
  if (is.na(marker)) {
    stop("positive motif has no 'x' centre marker", call. = FALSE)
  }

  # Characters before the marker, and characters after the centre residue
  # (the centre is the character immediately after the marker).
  n_left <- marker - 1L
  n_right <- length(residues) - marker - 1L
  if (n_left > flank || n_right > flank) {
    stop(
      "positive motif has more than ", flank,
      " characters on one side of its centre",
      call. = FALSE
    )
  }

  paste(
    c(
      rep(" ", times = flank - n_left),
      residues[!residues %in% "x"],
      rep(" ", times = flank - n_right)
    ),
    collapse = ""
  )
}

# One normalised string per row of Positive9Letters. seq_len() keeps this a
# no-op for an empty table (1:nrow() would iterate over rows 1 and 0), and
# vapply() avoids growing the result vector inside a loop.
PositiveTrueMotifs <- vapply(
  seq_len(nrow(Positive9Letters)),
  function(q) justify_positive_motif(Positive9Letters[q, ]),
  character(1)
)

# ---- Collect every tyrosine-centred window from the matched proteins --------
#
# For each accession number, every protein whose name contains that accession
# is looked up, and one 15-character window (the Y plus up to 7 residues on
# each side, space-padded at the ends of the protein) is recorded per "Y" in
# its sequence.
#
# Results, in the order accession -> matching protein -> Y position:
#   allmotifs  one-column character matrix; row 1 is the label "Motifs"
#   thenames   one-column character matrix; row 1 is "AccessionNumbers",
#              the other rows hold the protein name each window came from
#   MotifNumber  index of the row after the last one used
#
# The matrices are sized to what was actually found, so there is no fixed
# limit on the number of windows (the earlier fixed allocation of 1,000,000
# rows failed with "subscript out of bounds" beyond that).

# Return one fixed-width window per "Y" in `sequence`.
#
# sequence: a single amino-acid sequence string.
# flank: number of residues kept on each side of the Y.
# Returns a character vector (possibly empty) of strings of width
# 2 * flank + 1.
extract_tyrosine_windows <- function(sequence, flank = 7L) {
  if (is.na(sequence)) {
    return(character(0))
  }
  centres <- as.integer(gregexpr("Y", sequence, fixed = TRUE)[[1]])
  if (centres[1] == -1L) {
    # gregexpr() reports "no match" as -1
    return(character(0))
  }
  # Padding both ends lets every window be cut with the same width: residue j
  # of the sequence sits at position j + flank of the padded string, so the
  # window j - flank .. j + flank is positions j .. j + 2 * flank.
  padding <- paste(rep(" ", times = flank), collapse = "")
  padded <- paste0(padding, sequence, padding)
  substring(padded, centres, centres + 2L * flank)
}

# The amino-acid sequences are stored in the third column.
protein_sequences <- as.character(SuperAwesometrial[, 3])

motif_chunks <- list()
name_chunks <- list()

for (q in seq_along(AccessionNumbers)) {
  patterno <- as.character(AccessionNumbers[q])
  # With fixed = TRUE an empty pattern matches every string, which would pull
  # in every protein; blank and missing accessions are skipped instead.
  if (is.na(patterno) || !nzchar(patterno)) {
    next
  }

  # grepl() is vectorised over the strings being searched.
  whereisit <- which(grepl(patterno, ALLPOSSIBLE, fixed = TRUE))

  for (i in whereisit) {
    windows <- extract_tyrosine_windows(protein_sequences[i])
    if (length(windows) == 0L) {
      next
    }

    # The name of each protein is the first column; remove every comma so the
    # name cannot break the comma-separated output.
    name <- gsub(",", "", as.character(SuperAwesometrial[i, 1]), fixed = TRUE)

    slot <- length(motif_chunks) + 1L
    motif_chunks[[slot]] <- windows
    name_chunks[[slot]] <- rep(name, times = length(windows))
  }
}

allmotifs <- matrix(
  c("Motifs", unlist(motif_chunks, use.names = FALSE)),
  ncol = 1
)
thenames <- matrix(
  c("AccessionNumbers", unlist(name_chunks, use.names = FALSE)),
  ncol = 1
)
MotifNumber <- nrow(allmotifs) + 1

# ---- Deduplicate, drop the positive motifs and write the result -------------

# Attach the source names to the motifs so both travel together through the
# filtering below.
names(allmotifs) <- thenames

# Keep only the first occurrence of every motif string.
is_first_occurrence <- !duplicated(allmotifs)
truemotifs <- allmotifs[is_first_occurrence]

# Discard every motif that also appears among the normalised positive motifs.
is_not_positive <- !(truemotifs %in% PositiveTrueMotifs)
truemotifs <- truemotifs[is_not_positive]

# Two columns: source name, then motif. Commas are stripped from every cell
# because the file is written comma-separated without quoting.
outputfile <- gsub(",", "", cbind(names(truemotifs), truemotifs))

# The rows are appended to NAMEOFOUTPUTFILE (append = TRUE); no header line or
# row names are written.
write.table(
  outputfile,
  file = NAMEOFOUTPUTFILE,
  append = TRUE,
  quote = FALSE,
  sep = ",",
  na = "",
  row.names = FALSE,
  col.names = FALSE
)

Mercurial > repos > jfb > negative_motif_finder_7_7

annotate NMF/NMF-working-1-9-18.R @ 3:a69be20d500d draft