| 
31
 | 
     1 #######################################################################################
 | 
| 
 | 
     2 # R-code: Protein Name and Tukey's Normalization
 | 
| 
 | 
     3 # Author: Adam L Borne
 | 
| 
 | 
     4 # Contributers: Paul A Stewart, Brent Kuenzi
 | 
| 
 | 
     5 #######################################################################################
 | 
| 
 | 
     6 # Assigns uniprot id from MaxQuant peptides file. Filters and normalizes the
 | 
| 
 | 
     7 # intensities of each proteins. Resulting in a one to one list of intensities to 
 | 
| 
 | 
     8 # uniprot id.
 | 
| 
 | 
     9 #######################################################################################
 | 
| 
 | 
    10 # Copyright (C)  Adam L Borne.
 | 
| 
 | 
    11 # Permission is granted to copy, distribute and/or modify this document
 | 
| 
 | 
    12 # under the terms of the GNU Free Documentation License, Version 1.3
 | 
| 
 | 
    13 # or any later version published by the Free Software Foundation;
 | 
| 
 | 
    14 # with no Invariant Sections, no Front-Cover Texts, and no Back-Cover Texts.
 | 
| 
 | 
    15 # A copy of the license is included in the section entitled "GNU
 | 
| 
 | 
    16 # Free Documentation License".
 | 
| 
 | 
    17 #######################################################################################
 | 
| 
 | 
    18 ## REQUIRED INPUT ##
 | 
| 
 | 
    19 
 | 
| 
 | 
    20 # 1) peptides_file: MaxQuant peptides file. 
 | 
| 
 | 
    21 #######################################################################################
 | 
| 
 | 
    22 ins_check_run <- function() {
 | 
| 
 | 
    23   if ("affy" %in% rownames(installed.packages())){}
 | 
| 
 | 
    24   else {
 | 
| 
 | 
    25     source("https://bioconductor.org/biocLite.R")
 | 
| 
 | 
    26     biocLite(c('mygene','affy'))
 | 
| 
 | 
    27   }
 | 
| 
 | 
    28   if ('data.table' %in% rownames(installed.packages())){}
 | 
| 
 | 
    29   else {
 | 
| 
 | 
    30     install.packages('data.table', repos='http://cran.us.r-project.org')
 | 
| 
 | 
    31   }
 | 
| 
 | 
    32   if ('stringr' %in% rownames(installed.packages())){}
 | 
| 
 | 
    33   else {
 | 
| 
 | 
    34     install.packages('stringr', repos='http://cran.us.r-project.org')
 | 
| 
 | 
    35   }
 | 
| 
 | 
    36   if ('VennDiagram' %in% rownames(installed.packages())){}
 | 
| 
 | 
    37   else {
 | 
| 
 | 
    38     install.packages('VennDiagram', repos='http://cran.us.r-project.org')
 | 
| 
 | 
    39   }
 | 
| 
 | 
    40 }
 | 
| 
 | 
    41 
 | 
| 
 | 
    42 ins_check_run()
 | 
| 
 | 
    43 library(data.table)
 | 
| 
 | 
    44 library(affy)
 | 
| 
 | 
    45 library(stringr)
 | 
| 
 | 
    46 library(mygene)
 | 
| 
 | 
    47 library(VennDiagram)
 | 
| 
 | 
    48 #####
 | 
| 
 | 
    49 #data
 | 
| 
 | 
    50 
 | 
| 
 | 
    51 #We should chat a bit more about using Tukey's and handling 0's/missing values with Brent.
 | 
| 
 | 
    52 #Ask me about some updates for doing a bit more filtering of TMT data.
 | 
| 
 | 
    53 
 | 
| 
 | 
    54 main <- function(peptides_file, db_path) {
 | 
| 
 | 
    55 	peptides_file = read.delim(peptides_file,header=TRUE,stringsAsFactors=FALSE,fill=TRUE)
 | 
| 
 | 
    56   peptides_txt = peptides_file
 | 
| 
 | 
    57 	intensity_columns = names(peptides_txt[,str_detect(names(peptides_txt),"Intensity\\.*")]) #Pulls out all lines with Intensity in them.
 | 
| 
 | 
    58 	intensity_columns = intensity_columns[2:length(intensity_columns)] #Removes the first column that does not have a bait. 
 | 
| 
 | 
    59 	peptides_txt_mapped = as.data.frame(map_peptides_proteins(peptides_txt)) #This function as below sets every line to a 1 to 1 intensity to each possible protein.
 | 
| 
 | 
    60 	peptides_txt_mapped$Uniprot = str_extract(peptides_txt_mapped$mapped_protein, "[OPQ][0-9][A-Z0-9]{3}[0-9]|[A-NR-Z][0-9]([A-Z][A-Z0-9]{2}[0-9]){1,2}") #Pulls out just Uniprot id from the script.
 | 
| 
 | 
    61 	peptides_txt_mapped = subset(peptides_txt_mapped,!is.na(Uniprot)) #removes reverse sequences and any that didn't match a uniprot accession
 | 
| 
 | 
    62 	columns_comb = c("Uniprot", intensity_columns) 
 | 
| 
 | 
    63 	peptides_mapped_intensity = subset(peptides_txt_mapped, select = columns_comb) #Subsets out only the needed cloumns for Tukeys (Uniprot IDS and baited intensities)
 | 
| 
 | 
    64 	swissprot_fasta = scan(db_path, what="character")
 | 
| 
 | 
    65 	peptides_txt_mapped_log2 = peptides_mapped_intensity
 | 
| 
 | 
    66   # Takes the log2 of the intensities. 
 | 
| 
 | 
    67 	for (i in intensity_columns) { 
 | 
| 
 | 
    68 		peptides_txt_mapped_log2[,i] = log2(subset(peptides_txt_mapped_log2, select = i))
 | 
| 
 | 
    69 	}
 | 
| 
 | 
    70   #get the minimum from each column while ignoring the -Inf; get the min of these mins for the global min; breaks when there's only one intensity column 
 | 
| 
 | 
    71 	global_min = min(apply(peptides_txt_mapped_log2[,2:ncol(peptides_txt_mapped_log2)],2,function(x) {
 | 
| 
 | 
    72 	  min(x[x != -Inf])
 | 
| 
 | 
    73 	}))
 | 
| 
 | 
    74 	peptides_txt_mapped_log2[peptides_txt_mapped_log2 == -Inf] <- 0
 | 
| 
 | 
    75   #uniprot accessions WITHOUT isoforms; it looks like only contaminants contain isoforms anyways
 | 
| 
 | 
    76 	mapped_protein_uniprotonly = str_extract(peptides_txt_mapped_log2$Uniprot,"[OPQ][0-9][A-Z0-9]{3}[0-9]|[A-NR-Z][0-9]([A-Z][A-Z0-9]{2}[0-9]){1,2}") 
 | 
| 
 | 
    77 	mapped_protein_uniprot_accession = str_extract(peptides_txt_mapped_log2$Uniprot,"[OPQ][0-9][A-Z0-9]{3}[0-9](-[0-9]+)?|[A-NR-Z][0-9]([A-Z][A-Z0-9]{2}[0-9]){1,2}(-[0-9]+)?|[OPQ][0-9][A-Z0-9]{3}[0-9]|[A-NR-Z][0-9]([A-Z][A-Z0-9]{2}[0-9]){1,2}")
 | 
| 
 | 
    78 	peptides_txt_mapped_log2$mapped_protein = mapped_protein_uniprotonly
 | 
| 
 | 
    79   # Runs the Tukey function returning completed table
 | 
| 
 | 
    80   peptides_txt_mapped_log2 = subset(peptides_txt_mapped_log2,mapped_protein %in% swissprot_fasta)
 | 
| 
 | 
    81 	protein_intensities_tukeys = get_protein_values(peptides_txt_mapped_log2,intensity_columns)
 | 
| 
 | 
    82   protein_intensities_tukeys[protein_intensities_tukeys == 1] <- 0
 | 
| 
 | 
    83   write.table(protein_intensities_tukeys, "./tukeys_output.txt", row.names = FALSE, col.names = TRUE, quote = FALSE, sep = "\t")	
 | 
| 
 | 
    84 
 | 
| 
 | 
    85 }
 | 
| 
 | 
    86 
 | 
| 
 | 
    87 map_peptides_proteins = function(peptides_in) {
 | 
| 
 | 
    88     #reverse sequences are blank but have a razor protein indicating that they are reverse; exclude these for now
 | 
| 
 | 
    89     peptides_in = subset(peptides_in,peptides_in$Proteins != "")
 | 
| 
 | 
    90     results_list = list()
 | 
| 
 | 
    91     k = 1
 | 
| 
 | 
    92     for (i in 1:nrow(peptides_in)) {
 | 
| 
 | 
    93         protein_names = peptides_in[i,"Proteins"]
 | 
| 
 | 
    94         protein_names_split = unlist(strsplit(protein_names,";"))
 | 
| 
 | 
    95         for (j in 1:length(protein_names_split)) {
 | 
| 
 | 
    96             peptides_mapped_proteins = data.frame(peptides_in[i,],mapped_protein=protein_names_split[j],stringsAsFactors=FALSE)
 | 
| 
 | 
    97             results_list[[k]] = peptides_mapped_proteins
 | 
| 
 | 
    98             k = k+1
 | 
| 
 | 
    99             
 | 
| 
 | 
   100         }
 | 
| 
 | 
   101     }
 | 
| 
 | 
   102     return(rbindlist(results_list))
 | 
| 
 | 
   103 }
 | 
| 
 | 
   104 
 | 
| 
 | 
   105 get_protein_values = function(mapped_peptides_in,intensity_columns_list) {
 | 
| 
 | 
   106   unique_mapped_proteins_list = unique(mapped_peptides_in$mapped_protein) # Gets list of all peptides listed.
 | 
| 
 | 
   107   # Generates a blank data frame with clomns of Intensities and rows of Uniprots.
 | 
| 
 | 
   108   Tukeys_df = data.frame(mapped_protein = unique_mapped_proteins_list, stringsAsFactors = FALSE ) 
 | 
| 
 | 
   109   for (q in intensity_columns_list) {Tukeys_df[,q] = NA}
 | 
| 
 | 
   110   for (i in 1:length(unique_mapped_proteins_list)) {
 | 
| 
 | 
   111     mapped_peptides_unique_subset = subset(mapped_peptides_in, mapped_protein == unique_mapped_proteins_list[i])
 | 
| 
 | 
   112     #calculate Tukey's Biweight from library(affy); returns a single numeric
 | 
| 
 | 
   113     #results_list[[i]] = data.frame(Protein=unique_mapped_proteins_list[i],Peptides_per_protein=nrow(mapped_peptides_unique_subset))
 | 
| 
 | 
   114     for (j in intensity_columns_list) {
 | 
| 
 | 
   115       #Populates with new Tukeys values.
 | 
| 
 | 
   116       Tukeys_df[i,j] = 2^(tukey.biweight(mapped_peptides_unique_subset[,j]))
 | 
| 
 | 
   117     }
 | 
| 
 | 
   118   }
 | 
| 
 | 
   119   return(Tukeys_df)
 | 
| 
 | 
   120 }
 | 
| 
 | 
   121 
 | 
| 
 | 
   122 
 | 
| 
 | 
   123 args <- commandArgs(trailingOnly = TRUE)
 | 
| 
 | 
   124 main(args[1], args[2])
 |