Mercurial > repos > proteore > proteore_prot_features
comparison protein_features.R @ 1:bfc679370c64 draft
planemo upload commit 5774fd6a5a746f36f6bf4671a51a39ea2b978300-dirty
| author | proteore |
|---|---|
| date | Fri, 16 Feb 2018 04:06:16 -0500 |
| parents | |
| children | 867d47ff782c |
comparison
equal
deleted
inserted
replaced
| 0:e3b52db3d583 | 1:bfc679370c64 |
|---|---|
# Read a tab-separated file and return its content as a data.frame.
#
# Args:
#   filename: path of the tab-separated file to read.
#   header:   the string "true" if the first line of the file holds the column
#             names; any other value means the file has no header line.
#
# Returns:
#   A data.frame of the data rows, without the rows whose cells are all NA or
#   empty. With header == "true" the columns are named after the cells of the
#   first line; otherwise read.table's default names (V1, V2, ...) are kept.
readfile <- function(filename, header) {
  has_header <- isTRUE(header == "true")

  # One reader for both the header line and the data so that they are always
  # parsed with the same separator / NA conventions.
  read_tsv <- function(...) {
    read.table(filename, header = FALSE, sep = "\t", stringsAsFactors = FALSE,
               fill = TRUE, na.strings = c("", "NA"), blank.lines.skip = TRUE,
               ...)
  }

  if (has_header) {
    # Read only the first line of the file as header. The cells are forced to
    # character: with the default type conversion a column called "T", "F" or
    # "1.0" would be renamed "TRUE", "FALSE" or "1".
    headers <- read_tsv(nrows = 1, colClasses = "character")
    # Read the data of the file (skipping the header line)
    file <- read_tsv(skip = 1)
  } else {
    file <- read_tsv()
  }

  # Remove empty rows (every cell NA or "")
  is_blank <- is.na(file) | file == ""
  file <- file[rowSums(!is_blank) > 0, , drop = FALSE]

  if (has_header) {
    # Assign the header to the data
    names(file) <- as.character(unlist(headers, use.names = FALSE))
  }
  file
}
| 20 | |
# Turn "--key=value" command-line tokens into a named list of values.
# Only the first "=" separates the key from the value, so a value may contain
# "=" itself and may be empty ("--argsP3=" yields "").
parse_cli_args <- function(cli_args) {
  stripped <- sub("^--", "", cli_args)
  eq_pos <- as.integer(regexpr("=", stripped, fixed = TRUE))
  has_value <- eq_pos > 0
  keys <- ifelse(has_value, substr(stripped, 1, eq_pos - 1), stripped)
  values <- ifelse(has_value,
                   substr(stripped, eq_pos + 1, nchar(stripped)),
                   "")
  stats::setNames(as.list(values), keys)
}

# Split a comma-separated feature list; a missing or empty value gives
# character(0).
split_features <- function(value) {
  if (is.null(value) || is.na(value)) {
    return(character(0))
  }
  parts <- strsplit(value, ",")[[1]]
  parts[nzchar(parts)]
}

# Annotate a list of protein ids with features taken from a neXtProt table.
#
# Takes no R arguments; everything is read from the command line as
# "--key=value" tokens (see the help text below). The input ids come either
# from a space-separated list (--inputtype=copypaste) or from one column of a
# tab-separated file (--inputtype=tabfile, read with readfile()). For every
# id the requested columns (--argsP1, --argsP2, --argsP3) of the row of the
# neXtProt file whose "NextprotID" matches are looked up; ids without a match
# get NA. The result is written to --output as a tab-separated table: the ids
# plus the features in copypaste mode, the original file plus the features in
# tabfile mode. If no id at all is found, a one-line message is written to
# --output instead.
#
# Returns NULL invisibly; called for its side effect of writing --output.
protein_features <- function() {
  args <- commandArgs(TRUE)
  if (length(args) < 1) {
    args <- c("--help")
  }

  # Help section
  if ("--help" %in% args) {
    cat("Selection and Annotation HPA
    Arguments:
        --inputtype: type of input (list of id or filename)
        --input: input
        --nextprot: path to nextprot information file
        --column: the column number which you would like to apply...
        --header: true/false if your file contains a header
        --type: the type of input IDs (UniProt/EntrezID)
        --argsP1: IsoPoint,SeqLength,MW
        --argsP2: Chr,SubcellLocations
        --argsP3: Diseases
        --output: text output filename \n")
    q(save = "no")
  }

  # Parse arguments
  args <- parse_cli_args(args)
  inputtype <- args[["inputtype"]]
  output <- args[["output"]]
  typeid <- args[["type"]]

  file <- NULL
  if (identical(inputtype, "copypaste")) {
    input <- strsplit(args[["input"]], " ")[[1]]
  } else if (identical(inputtype, "tabfile")) {
    filename <- args[["input"]]
    # The column may be given as "c3" or "3"
    id_column <- suppressWarnings(as.numeric(gsub("c", "", args[["column"]])))
    if (length(id_column) != 1 || is.na(id_column) ||
        id_column %% 1 != 0 || id_column < 1) {
      stop("Please enter an integer for column", call. = FALSE)
    }
    # Get file content
    file <- readfile(filename, args[["header"]])
    if (id_column > ncol(file)) {
      stop("Column ", id_column, " does not exist in the input file",
           call. = FALSE)
    }
    # Extract the protein ids: only the first id of a ";"-separated cell is
    # used (an NA cell stays NA).
    input <- vapply(strsplit(as.character(file[[id_column]]), ";"),
                    function(parts) parts[1], character(1))
  } else {
    stop("Unknown input type: expected 'copypaste' or 'tabfile'",
         call. = FALSE)
  }

  nextprot <- read.table(args[["nextprot"]], header = TRUE, sep = "\t",
                         stringsAsFactors = FALSE, fill = TRUE,
                         na.strings = "")
  if (!"NextprotID" %in% names(nextprot)) {
    stop("The nextprot file has no 'NextprotID' column", call. = FALSE)
  }
  nextprot_ids <- nextprot[["NextprotID"]]

  # Change the sample ids if they are uniprot ids to be able to match them
  # with Nextprot data (ids that already carry the prefix are left alone).
  if (isTRUE(typeid == "uniprot")) {
    needs_prefix <- !is.na(input) & !grepl("^NX_", input)
    input[needs_prefix] <- paste0("NX_", input[needs_prefix])
  }

  # Select user input protein ids in nextprot
  if (!any(input %in% nextprot_ids)) {
    # Report to the output file (never to the user's input file).
    write.table("None of the input ids can be found in Nextprot",
                file = output, sep = "\t", quote = FALSE,
                col.names = TRUE, row.names = FALSE)
    return(invisible(NULL))
  }

  # Get information from neXtProt: one lookup for all requested features,
  # in the order P1, P2, P3.
  features <- c(split_features(args[["argsP1"]]),
                split_features(args[["argsP2"]]),
                split_features(args[["argsP3"]]))
  unknown <- setdiff(features, names(nextprot))
  if (length(unknown) > 0) {
    stop("Unknown feature(s) requested: ", paste(unknown, collapse = ", "),
         call. = FALSE)
  }
  res <- nextprot[match(input, nextprot_ids), features, drop = FALSE]
  rownames(res) <- NULL

  # Write output
  if (identical(inputtype, "copypaste")) {
    output_content <- cbind(data.frame(input, stringsAsFactors = FALSE), res)
    colnames(output_content) <- c(typeid, features)
  } else {
    output_content <- cbind(file, res)
    colnames(output_content) <- c(names(file), features)
  }
  write.table(output_content, output, row.names = FALSE, sep = "\t",
              quote = FALSE)
  invisible(NULL)
}
# Script entry point: run the tool with the arguments given on the command
# line (protein_features() reads them itself through commandArgs()).
protein_features()
