Mercurial > repos > proteore > proteore_prot_features
comparison get_data_nextprot.R @ 0:e3b52db3d583 draft
planemo upload commit abb24d36c776520e73220d11386252d848173697-dirty
| author | proteore |
|---|---|
| date | Sun, 26 Nov 2017 19:45:52 -0500 |
| parents | |
| children |
comparison
equal
deleted
inserted
replaced
| -1:000000000000 | 0:e3b52db3d583 |
|---|---|
# Usage : Rscript --vanilla get_data_nextprot.R --inputtype copypaste (or
# tabfile) --input file.txt --nextprot result_nextprot.txt --column column
# --argsP1 IsoPoint,SeqLength,MW
# --argsP2 Chr,SubcellLocations --argsP3 Diseases --typeid nextprot (or uniprot)
# --output output.txt --header TRUE
#
# NOTE: the options are read by POSITION, not by name, so they must be given
# in exactly the order shown above.

# e.g :
# Rscript --vanilla get_data_nextprot.R --inputtype copypaste --input P01133 P00533 P62158 Q16566 P31323 P17612 P10644
# P22612 P31321 P13861 P22694 P25098 P16220 Q14573 Q14571 Q14643 Q05655 Q02156
# P19174 O43865 Q01064 P54750 Q14123 P51828 Q08828 O60266 Q08462 O60503 O43306
# Q8NFM4 O95622 P40145 P17252 P05129 --nextprot
# result_nextprot.txt --column c1 --argsP1 IsoPoint --argsP2
# Chr --argsP3 Diseases --typeid uniprot --output output.txt --header FALSE

# Useful functions

# Negated %in%: TRUE for every element of x that is absent from y.
'%!in%' <- function(x, y) !('%in%'(x, y))

# Parse arguments

args <- commandArgs(trailingOnly = TRUE)

# Rebuild the command line as one string. The leading blank guarantees that
# every option, including the first one, is introduced by " --".
hh <- paste0(" ", paste(unlist(args), collapse = " "))
# Split on " --" (and not on a bare "--") so that a value which merely
# contains "--" is not cut in two. The first piece is the empty string that
# precedes the first option, hence the [-1].
listoptions <- unlist(strsplit(hh, " --", fixed = TRUE))[-1]
# Tokenise each option on blanks; empty tokens (repeated blanks) are dropped.
option_tokens <- lapply(
  strsplit(listoptions, " ", fixed = TRUE),
  function(tokens) tokens[nzchar(tokens)]
)
# Values of each option = every token but the first (the option name).
# lapply() always yields a list, whatever the number of values per option
# (sapply() could silently return a vector or a matrix instead).
options.args <- lapply(option_tokens, function(tokens) tokens[-1])
# Names of the options = first token of each option.
names(options.args) <- vapply(
  option_tokens,
  function(tokens) tokens[1],
  character(1)
)

# Value of the option found at `position`, as a single string (several tokens
# are re-joined with a blank). NA when the option is absent or has no value.
option_value <- function(position) {
  if (position > length(options.args)) {
    return(NA_character_)
  }
  value <- options.args[[position]]
  if (length(value) == 0) {
    return(NA_character_)
  }
  paste(value, collapse = " ")
}

typeinput <- option_value(1)
nextprot <- read.table(option_value(3), header = TRUE, sep = "\t",
                       quote = "\"", stringsAsFactors = FALSE)
column <- as.numeric(gsub("c", "", option_value(4)))
P1_args <- option_value(5)
P2_args <- option_value(6)
P3_args <- option_value(7)
typeid <- option_value(8)
filename <- option_value(9)
header <- option_value(10)

# Load the user ids
if (identical(typeinput, "copypaste")) {
  # Keep one id per token: as.character() on the list element would deparse
  # the whole vector into the single string 'c("id1", "id2", ...)'.
  listfile <- data.frame(x = options.args[[2]], stringsAsFactors = FALSE)
} else if (identical(typeinput, "tabfile")) {
  listfile <- read.table(option_value(2), header = identical(header, "TRUE"),
                         sep = "\t", quote = "\"", fill = TRUE,
                         stringsAsFactors = FALSE)
} else {
  stop("Unknown input type '", typeinput,
       "': expected 'copypaste' or 'tabfile'", call. = FALSE)
}
sample <- listfile[, column]

# Change the sample ids if they are uniprot ids to be able to match them with
# Nextprot data
if (identical(typeid, "uniprot")) {
  sample <- sub("^", "NX_", sample)
}

# Select user input protein ids in nextprot

found <- sample %in% nextprot[, 1]

if (!any(found)) {

  write.table("None of the input ids are can be found in Nextprot",
              file = filename, sep = "\t", quote = FALSE,
              col.names = TRUE, row.names = FALSE)

} else {

  # Requested feature names, in the order P1, P2, P3; "None" disables a group.
  requested <- unlist(lapply(
    c(P1_args, P2_args, P3_args),
    function(group) {
      if (!is.na(group) && group != "None") {
        strsplit(group, ",", fixed = TRUE)[[1]]
      }
    }
  ))
  # Column numbers of the requested features; unknown names are ignored.
  feature_cols <- match(requested, colnames(nextprot))
  feature_cols <- feature_cols[!is.na(feature_cols)]
  # The id column (first column) is always kept.
  to_keep <- c(1, feature_cols)

  # drop = FALSE keeps a data frame even when only the id column is selected.
  data <- nextprot[nextprot[, 1] %in% sample, to_keep, drop = FALSE]

  # if only some of the proteins were not found in nextprot they will be added
  # to the file with the fields "Protein not found in Nextprot"
  # unique(): a missing id listed several times must be added only once,
  # otherwise the merge below multiplies its rows.
  missing_ids <- unique(sample[sample %!in% nextprot[, 1]])
  if (length(missing_ids) != 0) {
    proteins_not_found <- as.data.frame(
      matrix("Protein not found in Nextprot",
             nrow = length(missing_ids), ncol = ncol(data)),
      stringsAsFactors = FALSE
    )
    proteins_not_found[[1]] <- missing_ids
    colnames(proteins_not_found) <- colnames(data)
    data <- rbind(data, proteins_not_found)
  }

  # Merge original data and data selected from nextprot

  # Before that, if the initial ids were uniprot ids change them back from
  # Nextprot to uniprot ids in data
  if (identical(typeid, "uniprot")) {
    data[, 1] <- gsub("^NX_", "", data[, 1])
  }
  data <- merge(listfile, data, by.x = column, by.y = 1)
  if (identical(typeid, "uniprot")) {
    colnames(data)[1] <- "UniprotID"
  }
  if (identical(typeid, "nextprot")) {
    colnames(data)[1] <- "NextprotID"
  }
  # Write result
  write.table(data, file = filename, sep = "\t", quote = FALSE,
              col.names = TRUE, row.names = FALSE)

}
