comparison MaxQuantProcessingScript.R @ 0:8dfd5d2b5903 draft

planemo upload for repository https://github.com/galaxyproteomics/tools-galaxyp/tree/master/tools/mqppep commit 3a7b3609d6e514c9e8f980ecb684960c6b2252fe
author galaxyp
date Mon, 11 Jul 2022 19:22:54 +0000
parents
children b76c75521d91
comparison
equal deleted inserted replaced
-1:000000000000 0:8dfd5d2b5903
1 #!/usr/bin/env Rscript
2
3 # This is the implementation for the
4 # "MaxQuant Phosphopeptide Localization Probability Cutoff"
5 # Galaxy tool (mqppep_lclztn_filter)
6 # It is adapted from the MaxQuant Processing Script written by Larry Cheng.
7
8 # libraries
9 library(optparse)
10 library(data.table)
11 library(stringr)
12 library(ggplot2)
13
14 # title: "MaxQuant Processing Script"
15 # author: "Larry Cheng"
16 # date: "February 19, 2018"
17 #
18 # # MaxQuant Processing Script
19 # Takes MaxQuant Phospho (STY)sites.txt file as input
20 # and performs the following (in order):
21 # 1) Runs the Proteomics Quality Control software
22 # 2) Remove contaminant and reverse sequence rows
23 # 3) Filters rows based on localization probability
24 # 4) Extract the quantitative data
25 # 5) Sequences phosphopeptides
26 # 6) Merges multiply phosphorylated peptides
27 # 7) Filters out phosphopeptides based on enrichment
28 # The output file contains the phosphopeptide (first column)
29 # and the quantitative values for each sample.
30 #
31 # ## Revision History
32 # Rev. 2022-02-10 :wrap for inclusion in Galaxy
33 # Rev. 2018-02-19 :break up analysis script into "MaxQuant Processing Script"
34 # and "Phosphopeptide Processing Script"
35 # Rev. 2017-12-12 :added PTXQC
36 # added additional plots and table outputs for quality control
37 # allowed for more than 2 samples to be grouped together
38 # (up to 26 (eg, 1A, 1B, 1C, etc))
39 # converted from .r to .rmd file to knit report
40 # for quality control
41 # Rev. 2016-09-11 :automated the FDR cutoffs; removed the option to data
42 # impute multiple times
43 # Rev. 2016-09-09 :added filter to eliminate contaminant & reverse sequence rows
44 # Rev. 2016-09-01 :moved the collapse step from after ANOVA filter to prior to
45 # preANOVA file output
46 # Rev. 2016-08-22 :use regexSampleNames <- "\\.(\\d + )[AB]$"
47 # so that it looks at the end of string
48 # Rev. 2016-08-05 :Removed vestigial line (ppeptides <- ....)
49 # Rev. 2016-07-03 :Removed row names from the write.table() output for
50 # ANOVA and PreANOVA
51 # Rev. 2016-06-25 :Set default Localization Probability cutoff to 0.75
52 # Rev. 2016-06-23 :fixed a bug in filtering for pY enrichment by resetting
53 # the row numbers afterwards
54 # Rev. 2016-06-21 :test18 + standardized the regexpression in protocol
55
56
57 ### FUNCTION DECLARATIONS begin ----------------------------------------------
58
59 # Read first line of file at filePath
60 # adapted from: https://stackoverflow.com/a/35761217/15509512
61 read_first_line <- function(filepath) {
62 con <- file(filepath, "r")
63 line <- readLines(con, n = 1)
64 close(con)
65 return(line)
66 }
67
68 # Move columns to the end of dataframe
69 # - data: the dataframe
70 # - move: a vector of column names, each of which is an element of names(data)
71 movetolast <- function(data, move) {
72 data[c(setdiff(names(data), move), move)]
73 }
74
75 # Generate phosphopeptide and build list when applied
76 phosphopeptide_func <- function(df) {
77 # generate peptide sequence and list of phosphopositions
78 phosphoprobsequence <-
79 strsplit(as.character(df["Phospho (STY) Score diffs"]), "")[[1]]
80 output <- vector()
81 phosphopeptide <- ""
82 counter <- 0 # keep track of position in peptide
83 phosphopositions <-
84 vector() # keep track of phosphorylation positions in peptide
85 score_diff <- ""
86 for (chara in phosphoprobsequence) {
87 # build peptide sequence
88 if (!(
89 chara == " " ||
90 chara == "(" ||
91 chara == ")" ||
92 chara == "." ||
93 chara == "-" ||
94 chara == "0" ||
95 chara == "1" ||
96 chara == "2" ||
97 chara == "3" ||
98 chara == "4" ||
99 chara == "5" ||
100 chara == "6" ||
101 chara == "7" ||
102 chara == "8" ||
103 chara == "9")
104 ) {
105 phosphopeptide <- paste(phosphopeptide, chara, sep = "")
106 counter <- counter + 1
107 }
108 # generate score_diff
109 if (chara == "-" ||
110 chara == "." ||
111 chara == "0" ||
112 chara == "1" ||
113 chara == "2" ||
114 chara == "3" ||
115 chara == "4" ||
116 chara == "5" ||
117 chara == "6" ||
118 chara == "7" ||
119 chara == "8" ||
120 chara == "9"
121 ) {
122 score_diff <- paste(score_diff, chara, sep = "")
123 }
124 # evaluate score_diff
125 if (chara == ")") {
126 score_diff <- as.numeric(score_diff)
127 # only consider a phosphoresidue if score_diff > 0
128 if (score_diff > 0) {
129 phosphopositions <- append(phosphopositions, counter)
130 }
131 score_diff <- ""
132 }
133 }
134
135 # generate phosphopeptide sequence (ie, peptide sequence with "p"'s)
136 counter <- 1
137 phosphoposition_correction1 <-
138 -1 # used to correct phosphosposition as "p"'s
139 # are inserted into the phosphopeptide string
140 phosphoposition_correction2 <-
141 0 # used to correct phosphosposition as "p"'s
142 # are inserted into the phosphopeptide string
143 while (counter <= length(phosphopositions)) {
144 phosphopeptide <-
145 paste(
146 substr(
147 phosphopeptide,
148 0,
149 phosphopositions[counter] + phosphoposition_correction1
150 ),
151 "p",
152 substr(
153 phosphopeptide,
154 phosphopositions[counter] + phosphoposition_correction2,
155 nchar(phosphopeptide)
156 ),
157 sep = ""
158 )
159 counter <- counter + 1
160 phosphoposition_correction1 <- phosphoposition_correction1 + 1
161 phosphoposition_correction2 <- phosphoposition_correction2 + 1
162 }
163 # building phosphopeptide list
164 output <- append(output, phosphopeptide)
165 return(output)
166 }
167
168 ### FUNCTION DECLARATIONS end ------------------------------------------------
169
170
171 ### EXTRACT ARGUMENTS begin --------------------------------------------------
172
173 # parse options
174 option_list <- list(
175 make_option(
176 c("-i", "--input"),
177 action = "store",
178 type = "character",
179 help = "A MaxQuant Phospho (STY)Sites.txt"
180 )
181 ,
182 make_option(
183 c("-o", "--output"),
184 action = "store",
185 type = "character",
186 help = "path to output file"
187 )
188 ,
189 make_option(
190 c("-E", "--enrichGraph"),
191 action = "store",
192 type = "character",
193 help = "path to enrichment graph PDF"
194 )
195 ,
196 make_option(
197 c("-F", "--enrichGraph_svg"),
198 action = "store",
199 type = "character",
200 help = "path to enrichment graph SVG"
201 )
202 ,
203 make_option(
204 c("-L", "--locProbCutoffGraph"),
205 action = "store",
206 type = "character",
207 help = "path to location-proability cutoff graph PDF"
208 )
209 ,
210 make_option(
211 c("-M", "--locProbCutoffGraph_svg"),
212 action = "store",
213 type = "character",
214 help = "path to location-proability cutoff graph SVG"
215 )
216 ,
217 make_option(
218 c("-e", "--enriched"),
219 action = "store",
220 type = "character",
221 help = "pY or pST enriched samples (ie, 'Y' or 'ST')"
222 )
223 # default = "^Number of Phospho [(]STY[)]$",
224 ,
225 make_option(
226 c("-p", "--phosphoCol"),
227 action = "store",
228 type = "character",
229 help = paste0("PERL-compatible regular expression matching",
230 " header of column having number of 'Phospho (STY)'")
231 )
232 # default = "^Intensity[^_]",
233 ,
234 make_option(
235 c("-s", "--startCol"),
236 action = "store",
237 type = "character",
238 help = paste0("PERL-compatible regular expression matching",
239 " header of column having first sample intensity")
240 )
241 # default = 1,
242 ,
243 make_option(
244 c("-I", "--intervalCol"),
245 action = "store",
246 type = "integer",
247 help = paste0("Column interval between the Intensities of samples",
248 " (eg, 1 if subsequent column; 2 if every other column")
249 )
250 # default = 0.75,
251 ,
252 make_option(
253 c("-l", "--localProbCutoff"),
254 action = "store",
255 type = "double",
256 help = "Localization Probability Cutoff"
257 )
258 # default = "sum",
259 ,
260 make_option(
261 c("-f", "--collapse_func"),
262 action = "store",
263 type = "character",
264 help = paste0("merge identical phosphopeptides",
265 " by ('sum' or 'average') the intensities")
266 )
267 # default = "filtered_data.txt",
268 ,
269 make_option(
270 c("-r", "--filtered_data"),
271 action = "store",
272 type = "character",
273 help = "filtered_data.txt"
274 )
275 # default = "quantData.txt",
276 ,
277 make_option(
278 c("-q", "--quant_data"),
279 action = "store",
280 type = "character",
281 help = "quantData.txt"
282 )
283 )
284 args <- parse_args(OptionParser(option_list = option_list))
285 # Check parameter values
286
287 ### EXTRACT ARGUMENTS end ----------------------------------------------------
288
289
290 ### EXTRACT PARAMETERS from arguments begin ----------------------------------
291
292 if (!file.exists(args$input)) {
293 stop((paste("File", args$input, "does not exist")))
294 }
295
296 phospho_col_pattern <- "^Number of Phospho [(][STY][STY]*[)]$"
297 start_col_pattern <- "^Intensity[^_]"
298 phospho_col_pattern <- read_first_line(args$phosphoCol)
299 start_col_pattern <- read_first_line(args$startCol)
300
301 sink(getConnection(2))
302
303 input_file_name <- args$input
304 filtered_filename <- args$filtered_data
305 quant_file_name <- args$quant_data
306 interval_col <- as.integer(args$intervalCol)
307
308 first_line <- read_first_line(input_file_name)
309 col_headers <-
310 unlist(strsplit(
311 x = first_line,
312 split = c("\t"),
313 fixed = TRUE
314 ))
315 sink(getConnection(2))
316 sink()
317
318
319 intensity_header_cols <-
320 grep(pattern = start_col_pattern, x = col_headers, perl = TRUE)
321 if (length(intensity_header_cols) == 0) {
322 err_msg <-
323 paste("Found no intensity columns matching pattern:",
324 start_col_pattern)
325 # Divert output to stderr
326 sink(getConnection(2))
327 print(err_msg)
328 sink()
329 stop(err_msg)
330 }
331
332
333 phospho_col <-
334 grep(pattern = phospho_col_pattern, x = col_headers, perl = TRUE)[1]
335 if (is.na(phospho_col)) {
336 err_msg <-
337 paste("Found no 'number of phospho sites' columns matching pattern:",
338 phospho_col_pattern)
339 # Divert output to stderr
340 sink(getConnection(2))
341 print(err_msg)
342 sink()
343 stop(err_msg)
344 }
345
346
347 i_count <- 0
348 this_column <- 1
349 last_value <- intensity_header_cols[1]
350 intensity_cols <- c(last_value)
351
352 while (length(intensity_header_cols) >= interval_col * i_count) {
353 i_count <- 1 + i_count
354 this_column <- interval_col + this_column
355 if (last_value + interval_col != intensity_header_cols[this_column])
356 break
357 last_value <- intensity_header_cols[this_column]
358 if (length(intensity_header_cols) < interval_col * i_count)
359 break
360 intensity_cols <-
361 c(intensity_cols, intensity_header_cols[this_column])
362 }
363
364 start_col <- intensity_cols[1]
365 num_samples <- i_count
366
367 output_filename <- args$output
368 enrich_graph_filename <- args$enrichGraph
369 loc_prob_cutoff_graph_filename <- args$locProbCutoffGraph
370 enrich_graph_filename_svg <- args$enrichGraph_svg
371 loc_prob_cutoff_graph_fn_svg <- args$locProbCutoffGraph_svg
372
373 local_prob_cutoff <- args$localProbCutoff
374 enriched <- args$enriched
375 collapse_fn <- args$collapse_func
376
377 ### EXTRACT PARAMETERS from arguments end ------------------------------------
378
379
380 # Proteomics Quality Control for MaxQuant Results
381 # (Bielow C et al. J Proteome Res. 2016 PMID: 26653327)
382 # is run by the Galaxy MaxQuant wrapper and need not be invoked here.
383
384
385 # Read & filter out contaminants, reverse sequences, & localization probability
386 # ---
387 full_data <-
388 read.table(
389 file = input_file_name,
390 sep = "\t",
391 header = TRUE,
392 quote = ""
393 )
394
395 # Filter out contaminant rows and reverse rows
396 filtered_data <- subset(full_data, !grepl("CON__", Proteins))
397 filtered_data <-
398 subset(filtered_data, !grepl("_MYCOPLASMA", Proteins))
399 filtered_data <-
400 subset(filtered_data, !grepl("CONTAMINANT_", Proteins))
401 filtered_data <-
402 subset(filtered_data, !grepl("REV__", Protein)
403 ) # since REV__ rows are blank in the first column (Proteins)
404 write.table(
405 filtered_data,
406 file = filtered_filename,
407 sep = "\t",
408 quote = FALSE,
409 col.names = TRUE,
410 row.names = FALSE
411 )
412 # ...
413
414
415 # Filter out data with localization probability below localProbCutoff
416 # ---
417 # Data filtered by localization probability
418 loc_prob_filtered_data <-
419 filtered_data[
420 filtered_data$Localization.prob >= local_prob_cutoff,
421 ]
422 # ...
423
424
425 # Localization probability -- visualize locprob cutoff
426 # ---
427 loc_prob_graph_data <-
428 data.frame(
429 group = c(paste(">", toString(local_prob_cutoff), sep = ""),
430 paste("<", toString(local_prob_cutoff), sep = "")),
431 value = c(
432 nrow(loc_prob_filtered_data) / nrow(filtered_data) * 100,
433 (nrow(filtered_data) - nrow(loc_prob_filtered_data))
434 / nrow(filtered_data) * 100
435 )
436 )
437 gigi <-
438 ggplot(loc_prob_graph_data, aes(x = "", y = value, fill = group)) +
439 geom_bar(width = 0.5,
440 stat = "identity",
441 color = "black") +
442 labs(x = NULL,
443 y = "percent",
444 title = "Phosphopeptides partitioned by localization-probability cutoff"
445 ) +
446 scale_fill_discrete(name = "phosphopeptide\nlocalization-\nprobability") +
447 theme_minimal() +
448 theme(
449 legend.position = "right",
450 legend.title = element_text(),
451 plot.title = element_text(hjust = 0.5),
452 plot.subtitle = element_text(hjust = 0.5),
453 plot.title.position = "plot"
454 )
455 pdf(loc_prob_cutoff_graph_filename)
456 print(gigi)
457 dev.off()
458 svg(loc_prob_cutoff_graph_fn_svg)
459 print(gigi)
460 dev.off()
461 # ...
462
463
464 # Extract quantitative values from filtered data
465 # ---
466 quant_data <-
467 loc_prob_filtered_data[, seq(from = start_col,
468 by = interval_col,
469 length.out = num_samples)]
470 # ...
471
472
473 # Generate Phosphopeptide Sequence
474 # for latest version of MaxQuant (Version 1.5.3.30)
475 # ---
476 metadata_df <-
477 data.frame(
478 loc_prob_filtered_data[, 1:8],
479 loc_prob_filtered_data[, phospho_col],
480 loc_prob_filtered_data[, phospho_col + 1],
481 loc_prob_filtered_data[, phospho_col + 2],
482 loc_prob_filtered_data[, phospho_col + 3],
483 loc_prob_filtered_data[, phospho_col + 4],
484 loc_prob_filtered_data[, phospho_col + 5],
485 loc_prob_filtered_data[, phospho_col + 6],
486 loc_prob_filtered_data[, phospho_col + 7],
487 quant_data
488 )
489 colnames(metadata_df) <-
490 c(
491 "Proteins",
492 "Positions within proteins",
493 "Leading proteins",
494 "Protein",
495 "Protein names",
496 "Gene names",
497 "Fasta headers",
498 "Localization prob",
499 "Number of Phospho (STY)",
500 "Amino Acid",
501 "Sequence window",
502 "Modification window",
503 "Peptide window coverage",
504 "Phospho (STY) Probabilities",
505 "Phospho (STY) Score diffs",
506 "Position in peptide",
507 colnames(quant_data)
508 )
509 # 'phosphopeptide_func' generates a phosphopeptide sequence
510 # for each row of data.
511 # for the 'apply' function: MARGIN 1 == rows, 2 == columns, c(1, 2) = both
512 metadata_df$phosphopeptide <-
513 apply(X = metadata_df, MARGIN = 1, FUN = phosphopeptide_func)
514 colnames(metadata_df)[1] <- "Phosphopeptide"
515 # Move the quant data columns to the right end of the data.frame
516 metadata_df <- movetolast(metadata_df, c(colnames(quant_data)))
517 # ...
518
519
520 # Write quantitative values for debugging purposes
521 # ---
522 quant_write <- cbind(metadata_df[, "Sequence window"], quant_data)
523 colnames(quant_write)[1] <- "Sequence.Window"
524 write.table(
525 quant_write,
526 file = quant_file_name,
527 sep = "\t",
528 quote = FALSE,
529 col.names = TRUE,
530 row.names = FALSE
531 )
532 # ...
533
534
535 # Make new data frame containing only Phosphopeptides
536 # that are to be mapped to quant data (merge_df)
537 # ---
538 metadata_df <-
539 setDT(metadata_df, keep.rownames = TRUE) # row name will be used to map
540 merge_df <-
541 data.frame(
542 as.integer(metadata_df$rn),
543 metadata_df$phosphopeptide # row index to merge data frames
544 )
545 colnames(merge_df) <- c("rn", "Phosphopeptide")
546 # ...
547
548
549 # Add Phosphopeptide column to quant columns for quality control checking
550 # ---
551 quant_data_qc <- as.data.frame(quant_data)
552 setDT(quant_data_qc, keep.rownames = TRUE) # will use to match rowname to data
553 quant_data_qc$rn <- as.integer(quant_data_qc$rn)
554 quant_data_qc <- merge(merge_df, quant_data_qc, by = "rn")
555 quant_data_qc$rn <- NULL # remove rn column
556 # ...
557
558
559 # Collapse multiphosphorylated peptides
560 # ---
561 quant_data_qc_collapsed <-
562 data.table(quant_data_qc, key = "Phosphopeptide")
563 quant_data_qc_collapsed <-
564 aggregate(. ~ Phosphopeptide, quant_data_qc, FUN = collapse_fn)
565 # ...
566 print("quant_data_qc_collapsed")
567 head(quant_data_qc_collapsed)
568
569 # Compute (as string) % of phosphopeptides that are multiphosphorylated
570 # (for use in next step)
571 # ---
572 pct_multiphos <-
573 (
574 nrow(quant_data_qc) - nrow(quant_data_qc_collapsed)
575 ) / (2 * nrow(quant_data_qc))
576 pct_multiphos <- sprintf("%0.1f%s", 100 * pct_multiphos, "%")
577 # ...
578
579
580 # Compute and visualize breakdown of pY, pS, and pT before enrichment filter
581 # ---
582 py_data <-
583 quant_data_qc_collapsed[
584 str_detect(quant_data_qc_collapsed$Phosphopeptide, "pY"),
585 ]
586 ps_data <-
587 quant_data_qc_collapsed[
588 str_detect(quant_data_qc_collapsed$Phosphopeptide, "pS"),
589 ]
590 pt_data <-
591 quant_data_qc_collapsed[
592 str_detect(quant_data_qc_collapsed$Phosphopeptide, "pT"),
593 ]
594
595 py_num <- nrow(py_data)
596 ps_num <- nrow(ps_data)
597 pt_num <- nrow(pt_data)
598
599 # Visualize enrichment
600 enrich_graph_data <- data.frame(group = c("pY", "pS", "pT"),
601 value = c(py_num, ps_num, pt_num))
602
603 enrich_graph_data <-
604 enrich_graph_data[
605 enrich_graph_data$value > 0,
606 ]
607
608 # Plot pie chart with legend
609 # start: https://stackoverflow.com/a/62522478/15509512
610 # refine: https://www.statology.org/ggplot-pie-chart/
611 # colors: https://colorbrewer2.org/#type=diverging&scheme=BrBG&n=8
612 slices <- enrich_graph_data$value
613 phosphoresidue <- enrich_graph_data$group
614 pct <- round(100 * slices / sum(slices))
615 lbls <-
616 paste(enrich_graph_data$group, "\n", pct, "%\n(", slices, ")", sep = "")
617 slc_ctr <- c()
618 run_tot <- 0
619 for (p in pct) {
620 slc_ctr <- c(slc_ctr, run_tot + p / 2.0)
621 run_tot <- run_tot + p
622 }
623 lbl_y <- 100 - slc_ctr
624 df <-
625 data.frame(slices,
626 pct,
627 lbls,
628 phosphoresidue = factor(phosphoresidue, levels = phosphoresidue))
629 gigi <- ggplot(df
630 , aes(x = 1, y = pct, fill = phosphoresidue)) +
631 geom_col(position = "stack", orientation = "x") +
632 geom_text(aes(x = 1, y = lbl_y, label = lbls), col = "black") +
633 coord_polar(theta = "y", direction = -1) +
634 labs(
635 x = NULL
636 ,
637 y = NULL
638 ,
639 title = "Percentages (and counts) of phosphosites, by type of residue"
640 ,
641 caption = sprintf(
642 "Roughly %s of peptides have multiple phosphosites.",
643 pct_multiphos
644 )
645 ) +
646 labs(x = NULL, y = NULL, fill = NULL) +
647 theme_classic() +
648 theme(
649 legend.position = "right"
650 ,
651 axis.line = element_blank()
652 ,
653 axis.text = element_blank()
654 ,
655 axis.ticks = element_blank()
656 ,
657 plot.title = element_text(hjust = 0.5)
658 ,
659 plot.subtitle = element_text(hjust = 0.5)
660 ,
661 plot.caption = element_text(hjust = 0.5)
662 ,
663 plot.title.position = "plot"
664 ) +
665 scale_fill_manual(breaks = phosphoresidue,
666 values = c("#c7eae5", "#f6e8c3", "#dfc27d"))
667
668 pdf(enrich_graph_filename)
669 print(gigi)
670 dev.off()
671 svg(enrich_graph_filename_svg)
672 print(gigi)
673 dev.off()
674 # ...
675
676
677 # Filter phosphopeptides by enrichment
678 # --
679 if (enriched == "Y") {
680 quant_data_qc_enrichment <- quant_data_qc_collapsed[
681 str_detect(quant_data_qc_collapsed$Phosphopeptide, "pY"),
682 ]
683 } else if (enriched == "ST") {
684 quant_data_qc_enrichment <- quant_data_qc_collapsed[
685 str_detect(quant_data_qc_collapsed$Phosphopeptide, "pS") |
686 str_detect(quant_data_qc_collapsed$Phosphopeptide, "pT"),
687 ]
688 } else {
689 print("Error in enriched variable. Set to either 'Y' or 'ST'")
690 }
691 # ...
692
693 print("quant_data_qc_enrichment")
694 head(quant_data_qc_enrichment)
695
696 # Write phosphopeptides filtered by enrichment
697 # --
698 write.table(
699 quant_data_qc_enrichment,
700 file = output_filename,
701 sep = "\t",
702 quote = FALSE,
703 row.names = FALSE
704 )
705 # ...