| 31 | 1 ####################################################################################### | 
|  | 2 # R-code: Protein Name and Tukey's Normalization | 
|  | 3 # Author: Adam L Borne | 
|  | 4 # Contributers: Paul A Stewart, Brent Kuenzi | 
|  | 5 ####################################################################################### | 
|  | 6 # Assigns uniprot id from MaxQuant peptides file. Filters and normalizes the | 
|  | 7 # intensities of each proteins. Resulting in a one to one list of intensities to | 
|  | 8 # uniprot id. | 
|  | 9 ####################################################################################### | 
|  | 10 # Copyright (C)  Adam L Borne. | 
|  | 11 # Permission is granted to copy, distribute and/or modify this document | 
|  | 12 # under the terms of the GNU Free Documentation License, Version 1.3 | 
|  | 13 # or any later version published by the Free Software Foundation; | 
|  | 14 # with no Invariant Sections, no Front-Cover Texts, and no Back-Cover Texts. | 
|  | 15 # A copy of the license is included in the section entitled "GNU | 
|  | 16 # Free Documentation License". | 
|  | 17 ####################################################################################### | 
|  | 18 ## REQUIRED INPUT ## | 
|  | 19 | 
|  | 20 # 1) peptides_file: MaxQuant peptides file. | 
|  | 21 ####################################################################################### | 
|  | 22 ins_check_run <- function() { | 
|  | 23   if ("affy" %in% rownames(installed.packages())){} | 
|  | 24   else { | 
|  | 25     source("https://bioconductor.org/biocLite.R") | 
|  | 26     biocLite(c('mygene','affy')) | 
|  | 27   } | 
|  | 28   if ('data.table' %in% rownames(installed.packages())){} | 
|  | 29   else { | 
|  | 30     install.packages('data.table', repos='http://cran.us.r-project.org') | 
|  | 31   } | 
|  | 32   if ('stringr' %in% rownames(installed.packages())){} | 
|  | 33   else { | 
|  | 34     install.packages('stringr', repos='http://cran.us.r-project.org') | 
|  | 35   } | 
|  | 36   if ('VennDiagram' %in% rownames(installed.packages())){} | 
|  | 37   else { | 
|  | 38     install.packages('VennDiagram', repos='http://cran.us.r-project.org') | 
|  | 39   } | 
|  | 40 } | 
|  | 41 | 
|  | 42 ins_check_run() | 
|  | 43 library(data.table) | 
|  | 44 library(affy) | 
|  | 45 library(stringr) | 
|  | 46 library(mygene) | 
|  | 47 library(VennDiagram) | 
|  | 48 ##### | 
|  | 49 #data | 
|  | 50 | 
|  | 51 #We should chat a bit more about using Tukey's and handling 0's/missing values with Brent. | 
|  | 52 #Ask me about some updates for doing a bit more filtering of TMT data. | 
|  | 53 | 
|  | 54 main <- function(peptides_file, db_path) { | 
|  | 55 	peptides_file = read.delim(peptides_file,header=TRUE,stringsAsFactors=FALSE,fill=TRUE) | 
|  | 56   peptides_txt = peptides_file | 
|  | 57 	intensity_columns = names(peptides_txt[,str_detect(names(peptides_txt),"Intensity\\.*")]) #Pulls out all lines with Intensity in them. | 
|  | 58 	intensity_columns = intensity_columns[2:length(intensity_columns)] #Removes the first column that does not have a bait. | 
|  | 59 	peptides_txt_mapped = as.data.frame(map_peptides_proteins(peptides_txt)) #This function as below sets every line to a 1 to 1 intensity to each possible protein. | 
|  | 60 	peptides_txt_mapped$Uniprot = str_extract(peptides_txt_mapped$mapped_protein, "[OPQ][0-9][A-Z0-9]{3}[0-9]|[A-NR-Z][0-9]([A-Z][A-Z0-9]{2}[0-9]){1,2}") #Pulls out just Uniprot id from the script. | 
|  | 61 	peptides_txt_mapped = subset(peptides_txt_mapped,!is.na(Uniprot)) #removes reverse sequences and any that didn't match a uniprot accession | 
|  | 62 	columns_comb = c("Uniprot", intensity_columns) | 
|  | 63 	peptides_mapped_intensity = subset(peptides_txt_mapped, select = columns_comb) #Subsets out only the needed cloumns for Tukeys (Uniprot IDS and baited intensities) | 
|  | 64 	swissprot_fasta = scan(db_path, what="character") | 
|  | 65 	peptides_txt_mapped_log2 = peptides_mapped_intensity | 
|  | 66   # Takes the log2 of the intensities. | 
|  | 67 	for (i in intensity_columns) { | 
|  | 68 		peptides_txt_mapped_log2[,i] = log2(subset(peptides_txt_mapped_log2, select = i)) | 
|  | 69 	} | 
|  | 70   #get the minimum from each column while ignoring the -Inf; get the min of these mins for the global min; breaks when there's only one intensity column | 
|  | 71 	global_min = min(apply(peptides_txt_mapped_log2[,2:ncol(peptides_txt_mapped_log2)],2,function(x) { | 
|  | 72 	  min(x[x != -Inf]) | 
|  | 73 	})) | 
|  | 74 	peptides_txt_mapped_log2[peptides_txt_mapped_log2 == -Inf] <- 0 | 
|  | 75   #uniprot accessions WITHOUT isoforms; it looks like only contaminants contain isoforms anyways | 
|  | 76 	mapped_protein_uniprotonly = str_extract(peptides_txt_mapped_log2$Uniprot,"[OPQ][0-9][A-Z0-9]{3}[0-9]|[A-NR-Z][0-9]([A-Z][A-Z0-9]{2}[0-9]){1,2}") | 
|  | 77 	mapped_protein_uniprot_accession = str_extract(peptides_txt_mapped_log2$Uniprot,"[OPQ][0-9][A-Z0-9]{3}[0-9](-[0-9]+)?|[A-NR-Z][0-9]([A-Z][A-Z0-9]{2}[0-9]){1,2}(-[0-9]+)?|[OPQ][0-9][A-Z0-9]{3}[0-9]|[A-NR-Z][0-9]([A-Z][A-Z0-9]{2}[0-9]){1,2}") | 
|  | 78 	peptides_txt_mapped_log2$mapped_protein = mapped_protein_uniprotonly | 
|  | 79   # Runs the Tukey function returning completed table | 
|  | 80   peptides_txt_mapped_log2 = subset(peptides_txt_mapped_log2,mapped_protein %in% swissprot_fasta) | 
|  | 81 	protein_intensities_tukeys = get_protein_values(peptides_txt_mapped_log2,intensity_columns) | 
|  | 82   protein_intensities_tukeys[protein_intensities_tukeys == 1] <- 0 | 
|  | 83   write.table(protein_intensities_tukeys, "./tukeys_output.txt", row.names = FALSE, col.names = TRUE, quote = FALSE, sep = "\t") | 
|  | 84 | 
|  | 85 } | 
|  | 86 | 
|  | 87 map_peptides_proteins = function(peptides_in) { | 
|  | 88     #reverse sequences are blank but have a razor protein indicating that they are reverse; exclude these for now | 
|  | 89     peptides_in = subset(peptides_in,peptides_in$Proteins != "") | 
|  | 90     results_list = list() | 
|  | 91     k = 1 | 
|  | 92     for (i in 1:nrow(peptides_in)) { | 
|  | 93         protein_names = peptides_in[i,"Proteins"] | 
|  | 94         protein_names_split = unlist(strsplit(protein_names,";")) | 
|  | 95         for (j in 1:length(protein_names_split)) { | 
|  | 96             peptides_mapped_proteins = data.frame(peptides_in[i,],mapped_protein=protein_names_split[j],stringsAsFactors=FALSE) | 
|  | 97             results_list[[k]] = peptides_mapped_proteins | 
|  | 98             k = k+1 | 
|  | 99 | 
|  | 100         } | 
|  | 101     } | 
|  | 102     return(rbindlist(results_list)) | 
|  | 103 } | 
|  | 104 | 
|  | 105 get_protein_values = function(mapped_peptides_in,intensity_columns_list) { | 
|  | 106   unique_mapped_proteins_list = unique(mapped_peptides_in$mapped_protein) # Gets list of all peptides listed. | 
|  | 107   # Generates a blank data frame with clomns of Intensities and rows of Uniprots. | 
|  | 108   Tukeys_df = data.frame(mapped_protein = unique_mapped_proteins_list, stringsAsFactors = FALSE ) | 
|  | 109   for (q in intensity_columns_list) {Tukeys_df[,q] = NA} | 
|  | 110   for (i in 1:length(unique_mapped_proteins_list)) { | 
|  | 111     mapped_peptides_unique_subset = subset(mapped_peptides_in, mapped_protein == unique_mapped_proteins_list[i]) | 
|  | 112     #calculate Tukey's Biweight from library(affy); returns a single numeric | 
|  | 113     #results_list[[i]] = data.frame(Protein=unique_mapped_proteins_list[i],Peptides_per_protein=nrow(mapped_peptides_unique_subset)) | 
|  | 114     for (j in intensity_columns_list) { | 
|  | 115       #Populates with new Tukeys values. | 
|  | 116       Tukeys_df[i,j] = 2^(tukey.biweight(mapped_peptides_unique_subset[,j])) | 
|  | 117     } | 
|  | 118   } | 
|  | 119   return(Tukeys_df) | 
|  | 120 } | 
|  | 121 | 
|  | 122 | 
|  | 123 args <- commandArgs(trailingOnly = TRUE) | 
|  | 124 main(args[1], args[2]) |