Mercurial > repos > proteore > proteore_expression_rnaseq_abbased
comparison get_data_HPA_v2.R @ 5:f15cdeeba4b4 draft
planemo upload commit 4af7ac25de19ca10b1654820e909c647a2d337b2-dirty
| author | proteore |
|---|---|
| date | Mon, 19 Mar 2018 10:07:38 -0400 |
| parents | cf2fa609625b |
| children |
comparison
equal
deleted
inserted
replaced
| 4:2f95774977ff | 5:f15cdeeba4b4 |
|---|---|
| 14 # --column : column of the input containing the ENSG identifiers | 14 # --column : column of the input containing the ENSG identifiers |
| 15 # --select : information from HPA to select, may be | 15 # --select : information from HPA to select, may be |
| 16 # : RNA.tissue.category,Reliability..IH.,Reliability..IF. (comma-separated) | 16 # : RNA.tissue.category,Reliability..IH.,Reliability..IF. (comma-separated) |
| 17 # --output : output file name | 17 # --output : output file name |
| 18 # Useful functions | 18 # Useful functions |
| 19 | |
| 20 # Read file and return file content as data.frame | |
| 21 readfile = function(filename, header) { | |
| 22 if (header == "true") { | |
| 23 # Read only first line of the file as header: | |
| 24 headers <- read.table(filename, nrows = 1, header = FALSE, sep = "\t", stringsAsFactors = FALSE, fill = TRUE, na.strings=c("", "NA"), blank.lines.skip = TRUE, quote = "") | |
| 25 # Read the data of the file (skipping the first row) | |
| 26 file <- read.table(filename, skip = 1, header = FALSE, sep = "\t", stringsAsFactors = FALSE, fill = TRUE, na.strings=c("", "NA"), blank.lines.skip = TRUE, quote = "") | |
| 27 # Remove empty rows | |
| 28 file <- file[!apply(is.na(file) | file == "", 1, all), , drop=FALSE] | |
| 29 #And assign the header to the data | |
| 30 names(file) <- headers | |
| 31 } | |
| 32 else { | |
| 33 file <- read.table(filename, header = FALSE, sep = "\t", stringsAsFactors = FALSE, fill = TRUE, na.strings=c("", "NA"), blank.lines.skip = TRUE, quote = "") | |
| 34 # Remove empty rows | |
| 35 file <- file[!apply(is.na(file) | file == "", 1, all), , drop=FALSE] | |
| 36 } | |
| 37 return(file) | |
| 38 } | |
| 19 | 39 |
| 20 '%!in%' <- function(x,y)!('%in%'(x,y)) | 40 '%!in%' <- function(x,y)!('%in%'(x,y)) |
| 21 | 41 |
| 22 args = commandArgs(trailingOnly = TRUE) | 42 args = commandArgs(trailingOnly = TRUE) |
| 23 | 43 |
| 50 sample = sample[,column] | 70 sample = sample[,column] |
| 51 } | 71 } |
| 52 if (typeinput=="tabfile"){ | 72 if (typeinput=="tabfile"){ |
| 53 | 73 |
| 54 if (header=="TRUE"){ | 74 if (header=="TRUE"){ |
| 55 listfile = read.table(listfile,header=TRUE,sep="\t",quote="\"",fill=TRUE, na.strings=c("","NA")) | 75 listfile = readfile(listfile, "true") |
| 56 }else{ | 76 }else{ |
| 57 listfile = read.table(listfile,header=FALSE,sep="\t",quote="\"",fill=TRUE, na.strings=c("","NA")) | 77 listfile = readfile(listfile, "false") |
| 58 } | 78 } |
| 59 sample = listfile[,column] | 79 sample = listfile[,column] |
| 60 | 80 |
| 61 } | 81 } |
| 62 | 82 |
| 84 data = data[,to_keep] | 104 data = data[,to_keep] |
| 85 # if only some of the proteins were not found in proteinatlas they will be added to | 105 # if only some of the proteins were not found in proteinatlas they will be added to |
| 86 # the file with every other field set to "Protein not found in HPA" | 106 # the file with every other field set to "Protein not found in HPA" |
| 87 if (length(which(sample %!in% proteinatlas[,3]))!=0){ | 107 if (length(which(sample %!in% proteinatlas[,3]))!=0){ |
| 88 proteins_not_found = as.data.frame(sample[which(sample %!in% proteinatlas[,3])]) | 108 proteins_not_found = as.data.frame(sample[which(sample %!in% proteinatlas[,3])]) |
| 89 proteins_not_found = cbind(proteins_not_found,matrix(rep("Protein not found in HPA",length(proteins_not_found)),nrow=length(proteins_not_found),ncol=length(colnames(data))-1)) | 109 proteins_not_found = cbind(proteins_not_found,matrix(rep("Protein not found in HPA",length(proteins_not_found)),nrow=length(proteins_not_found),ncol=length(colnames(data))-1)) |
| 90 | 110 |
| 91 colnames(proteins_not_found)=colnames(data) | 111 colnames(proteins_not_found)=colnames(data) |
| 92 | 112 |
| 93 data = rbind(data,proteins_not_found) | 113 data = rbind(data,proteins_not_found) |
| 94 } | 114 } |
