Mercurial > repos > vipints > rdiff
view rDiff/src/octave/importdata.m @ 2:233c30f91d66
updated python based GFF parsing module which will handle GTF/GFF/GFF3 file types
author | vipints <vipin@cbio.mskcc.org> |
---|---|
date | Tue, 08 Oct 2013 07:15:44 -0400 |
parents | 0f80a5141704 |
children |
line wrap: on
line source
## Copyright (C) 2012 Erik Kjellson
##
## This file is part of Octave.
##
## Octave is free software; you can redistribute it and/or modify it
## under the terms of the GNU General Public License as published by
## the Free Software Foundation; either version 3 of the License, or (at
## your option) any later version.
##
## Octave is distributed in the hope that it will be useful, but
## WITHOUT ANY WARRANTY; without even the implied warranty of
## MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
## General Public License for more details.
##
## You should have received a copy of the GNU General Public License
## along with Octave; see the file COPYING.  If not, see
## <http://www.gnu.org/licenses/>.

## -*- texinfo -*-
## @deftypefn  {Function File} {@var{A} =} importdata (@var{fname})
## @deftypefnx {Function File} {@var{A} =} importdata (@var{fname}, @var{delimiter})
## @deftypefnx {Function File} {@var{A} =} importdata (@var{fname}, @var{delimiter}, @var{header_rows})
## @deftypefnx {Function File} {[@var{A}, @var{delimiter}] =} importdata (...)
## @deftypefnx {Function File} {[@var{A}, @var{delimiter}, @var{header_rows}] =} importdata (...)
## Importing data from file.
##
## Importing the contents of file @var{fname} into workspace.
##
## Input parameters:
## @itemize
## @item @var{fname}
## The file name for the file to import.
##
## @item @var{delimiter}
## The character separating columns of data.  Use @code{\t} for tab.
## (Only valid for ascii files)
##
## @item @var{header_rows}
## Number of header rows before the data begins.  (Only valid for ascii files)
## @end itemize
##
## Different file types are supported:
## @itemize
## @item Ascii table
##
## Importing ascii table using the specified number of header rows and
## the specified delimiter.
##
## @item Image file
##
## @item @sc{Matlab} file
##
## @item Spreadsheet files (depending on external software)
##
## @item Wav file
##
## @end itemize
##
## @seealso{textscan, dlmread, csvread, load}
## @end deftypefn

## Author: Erik Kjellson <erikiiofph7@users.sourceforge.net>

function [output, delimiter, header_rows] = importdata (varargin)

  ## Default values
  fname       = "";
  delimiter   = "";
  header_rows = -1;

  ##########

  ## Check input arguments

  if (nargin < 1)
    print_usage ();
  endif

  fname = varargin{1};
  ## Check that the file name really is a string
  if (! ischar (fname))
    error ("importdata: file name needs to be a string");
  endif

  if (strcmpi (fname, "-pastespecial"))
    error ("importdata: option -pastespecial not implemented");
  endif

  if (nargin > 1)
    delimiter = varargin{2};
    ## Check that the delimiter really is a string
    if (! ischar (delimiter))
      error ("importdata: delimiter needs to be a character");
    endif
    ## The two-character sequence backslash-t is the only multi-character
    ## delimiter accepted; it stands for a tab.
    if (length (delimiter) > 1 && ! strcmpi (delimiter, "\\t"))
      error ("importdata: delimiter cannot be longer than 1 character");
    endif
  endif

  if (nargin > 2)
    header_rows = varargin{3};
    ## Test scalar-ness first so that the comparisons below always yield a
    ## single logical value.
    if (! (isnumeric (header_rows) && isscalar (header_rows)
           && header_rows >= 0 && header_rows == fix (header_rows)))
      error ("importdata: number of header rows needs to be an integer number >= 0");
    endif
  endif

  if (nargin > 3)
    error ("importdata: too many input arguments");
  endif

  ##########

  ## Check file format

  ## Get the extension from the file name.  Only three outputs are
  ## requested; the first two are not used.
  [unused_dir, unused_name, fileExt] = fileparts (fname);
  ## Make sure file extension is in lower case.
  fileExt = lower (fileExt);

  switch (fileExt)
    case {".au", ".snd", ".avi"}
      error ("importdata: not implemented for file format %s", fileExt);

    case {".bmp", ".cur", ".gif", ".hdf", ".ico", ".jpe", ".jpeg", ".jpg", ...
          ".pbm", ".pcx", ".pgm", ".png", ".pnm", ".ppm", ".ras", ...
          ".tif", ".tiff", ".xwd"}
      delimiter   = NaN;
      header_rows = 0;
      [output.cdata, output.colormap, output.alpha] = imread (fname);

    case ".mat"
      delimiter   = NaN;
      header_rows = 0;
      output = load (fname);

    case {".wk1", ".xls", ".xlsx", ".dbf", ".pxl"}
      ## If there's no Excel file support simply fall back to unimplemented.m
      output = xlsread (fname);

    case {".ods", ".sxc", ".fods", ".uos", ".xml"}
      ## unimplemented.m only knows ML functions; odsread isn't one but is in OF
      try
        output = odsread (fname);
      catch
        ## Fall back to unimplemented.m.
        output = xlsread (fname);
      end_try_catch

    case {".wav", ".wave"}
      delimiter   = NaN;
      header_rows = 0;
      [output.data, output.fs] = wavread (fname);

    otherwise
      ## Assume the file is in ascii format.
      [output, delimiter, header_rows] = ...
        importdata_ascii (fname, delimiter, header_rows);
  endswitch

  ## If there are any empty fields in the output structure, then remove them
  if (isstruct (output) && numel (output) == 1)
    fields = fieldnames (output);
    for i = 1:numel (fields)
      if (isempty (output.(fields{i})))
        output = rmfield (output, fields{i});
      endif
    endfor

    ## If only one field is left, replace the structure with the field,
    ## i.e. output = output.onlyFieldLeft
    fields = fieldnames (output);
    if (numel (fields) == 1)
      output = output.(fields{1});
    endif
  endif

endfunction


########################################

## Import a delimited text file.
##
## fname        name of the file to read
## delimiter    a single character, or the two-character sequence
##              backslash-t meaning tab; must not be empty
## header_rows  number of leading header rows, or a negative value to
##              count them automatically
##
## Returns a struct with the fields data, textdata, rowheaders and
## colheaders, the delimiter actually used (a tab is returned as a real
## tab character) and the number of header rows.
function [output, delimiter, header_rows] = ...
  importdata_ascii (fname, delimiter, header_rows)

  ## Define the fields in the output structure so that the order will be
  ## correct.
  output.data       = [];
  output.textdata   = [];
  output.rowheaders = [];
  output.colheaders = [];

  ## FIXME: guess delimiter, if it isn't defined
  if (isempty (delimiter))
    error ("importdata: Guessing delimiter is not implemented yet, you have to specify it.");
  endif

  ## Work with the literal delimiter character from here on.  The caller
  ## gets the tab back as a real tab, done for Matlab compatibility.
  if (strcmpi (delimiter, "\\t"))
    delimiter = "\t";
  endif

  ## Read the file row by row.
  fid = fopen (fname, "r");
  if (fid < 0)
    error ("importdata: unable to open file '%s'", fname);
  endif

  file_content_rows = {};
  unwind_protect
    while (true)
      currline = fgetl (fid);
      ## fgetl returns the number -1 instead of a string at end of file.
      if (! ischar (currline))
        break;
      endif
      ## fgetl only strips the newline; drop the carriage return that is
      ## left over when the file uses CRLF line breaks.
      if (! isempty (currline) && currline(end) == "\r")
        currline(end) = [];
      endif
      file_content_rows{end+1} = currline;
    endwhile
  unwind_protect_cleanup
    fclose (fid);
  end_unwind_protect

  ## FIXME: A more intelligent way to count number of header rows.  This
  ## is needed e.g. when delimiter=' ' and the header contains spaces...

  ## If number of header rows is undefined, then count the number of
  ## header rows by step through row by row and look for the delimiter.
  ## Assume that the header can't contain any delimiter.
  if (header_rows < 0)
    header_rows = 0;
    for i = 1:numel (file_content_rows)
      if (isempty (strfind (file_content_rows{i}, delimiter)))
        header_rows = header_rows + 1;
      else
        ## Data part has begun and therefore no more header rows can be
        ## found
        break;
      endif
    endfor
  endif

  ## Never claim more header rows than the file actually has.
  header_rows = min (header_rows, numel (file_content_rows));

  ## Put the header rows in output.textdata.
  if (header_rows > 0)
    output.textdata = file_content_rows(1:header_rows)';
  endif

  ## If space is the delimiter, then multiple spaces should count as ONE
  ## delimiter, and spaces in the beginning of each data row are removed.
  collapse = strcmp (delimiter, " ");
  if (collapse)
    for i = (header_rows+1):numel (file_content_rows)
      ## strtrim does not only remove the leading spaces but also the
      ## trailing spaces, but that doesn't really matter.
      file_content_rows{i} = strtrim (file_content_rows{i});
    endfor
  endif

  ## Remove empty data rows.
  data_rows = file_content_rows((header_rows+1):end);
  keep      = ! cellfun ("isempty", data_rows);
  file_content_rows = [file_content_rows(1:header_rows), data_rows(keep)];

  ## Split every data row once and count the number of data columns.  If
  ## there are different number of columns, use the greatest value.
  nrows        = numel (file_content_rows) - header_rows;
  split_rows   = cell (1, nrows);
  data_columns = 0;
  for i = 1:nrows
    split_rows{i} = importdata_split_row (file_content_rows{header_rows+i}, ...
                                          delimiter, collapse);
    data_columns = max (data_columns, numel (split_rows{i}));
  endfor

  ## Go through the data and put it in either output.data or
  ## output.textdata depending on if it is numeric or not.
  output.data = NaN (nrows, data_columns);
  has_numeric = false (1, data_columns);
  for i = 1:nrows
    ## Only use the row if it contains anything other than white-space
    ## characters.
    if (any (file_content_rows{header_rows+i} != " "))
      row_data = split_rows{i};
      for j = 1:numel (row_data)
        ## Try to convert the column to a number, if it works put it in
        ## output.data, otherwise in output.textdata.  Empty fields are
        ## missing values and stay NaN.
        if (! isempty (row_data{j}))
          data_numeric = str2double (row_data{j});
          if (! isnan (data_numeric))
            output.data(i, j) = data_numeric;
            has_numeric(j)    = true;
          else
            output.textdata{header_rows+i, j} = row_data{j};
          endif
        endif
      endfor
    endif
  endfor

  ## Drop the columns in which no numeric value was found at all.
  output.data = output.data(:, has_numeric);

  ## Check whether rowheaders or colheaders should be used
  if ((header_rows == data_columns) && (size (output.textdata, 2) == 1))
    output.rowheaders = output.textdata;
  elseif (! isempty (output.textdata)
          && size (output.textdata, 2) == data_columns)
    output.colheaders = output.textdata(end,:);
  endif

endfunction


########################################

## Split the string ROW at every occurrence of the literal string
## DELIMITER and return the pieces as a 1-by-N cell array of strings.
## Leading, trailing and adjacent delimiters give empty pieces.  When
## COLLAPSE is true a run of adjacent delimiters counts as ONE delimiter.
## A row without any delimiter is returned as a single piece.
function parts = importdata_split_row (row, delimiter, collapse)

  hits = strfind (row, delimiter);
  if (isempty (hits))
    parts = {row};
    return;
  endif

  width = numel (delimiter);
  if (collapse)
    ## A new run starts wherever two hits are not directly adjacent.
    new_run      = diff (hits) > width;
    first_of_run = hits([true, new_run]);
    last_of_run  = hits([new_run, true]);
  else
    first_of_run = hits;
    last_of_run  = hits;
  endif

  starts = [1, last_of_run + width];
  stops  = [first_of_run - 1, numel(row)];

  parts = cell (1, numel (starts));
  for k = 1:numel (starts)
    ## An empty range (stop < start) yields an empty string.
    parts{k} = row(starts(k):stops(k));
  endfor

endfunction


########################################

%!test
%! # Comma separated values
%! A = [3.1 -7.2 0; 0.012 6.5 128];
%! fn  = tempname ();
%! fid = fopen (fn, "w");
%! fputs (fid, "3.1,-7.2,0\n0.012,6.5,128");
%! fclose (fid);
%! [a,d,h] = importdata (fn, ",");
%! unlink (fn);
%! assert (a, A);
%! assert (d, ",");
%! assert (h, 0);

%!test
%! # Tab separated values
%! A = [3.1 -7.2 0; 0.012 6.5 128];
%! fn  = tempname ();
%! fid = fopen (fn, "w");
%! fputs (fid, "3.1\t-7.2\t0\n0.012\t6.5\t128");
%! fclose (fid);
%! [a,d,h] = importdata (fn, "\\t");
%! unlink (fn);
%! assert (a, A);
%! assert (d, "\t");
%! assert (h, 0);

%!test
%! # Space separated values, using multiple spaces to align in columns.
%! A = [3.1 -7.2 0; 0.012 6.5 128];
%! fn  = tempname ();
%! fid = fopen (fn, "w");
%! fprintf (fid, "%10.3f %10.3f %10.3f\n", A(1,:));
%! fprintf (fid, "%10.3f %10.3f %10.3f\n", A(2,:));
%! fclose (fid);
%! [a,d,h] = importdata (fn, " ");
%! unlink (fn);
%! assert (a, A);
%! assert (d, " ");
%! assert (h, 0);

%!test
%! # Header
%! A.data = [3.1 -7.2 0; 0.012 6.5 128];
%! A.textdata = {"This is a header row.";
%!               "this row does not contain any data, but the next one does."};
%! fn  = tempname ();
%! fid = fopen (fn, "w");
%! fputs (fid, [A.textdata{1} "\n"]);
%! fputs (fid, [A.textdata{2} "\n"]);
%! fputs (fid, "3.1\t-7.2\t0\n0.012\t6.5\t128");
%! fclose (fid);
%! [a,d,h] = importdata (fn, "\\t");
%! unlink (fn);
%! assert (a, A);
%! assert (d, "\t");
%! assert (h, 2);

%!test
%! # Ignore empty rows containing only spaces
%! A = [3.1 -7.2 0; 0.012 6.5 128];
%! fn  = tempname ();
%! fid = fopen (fn, "w");
%! fprintf (fid, "%10.3f %10.3f %10.3f\n", A(1,:));
%! fputs (fid, " \n");
%! fprintf (fid, "%10.3f %10.3f %10.3f\n", A(2,:));
%! fclose (fid);
%! [a,d,h] = importdata (fn, " ");
%! unlink (fn);
%! assert (a, A);
%! assert (d, " ");
%! assert (h, 0);

%!test
%! # Exponentials
%! A = [3.1 -7.2 0; 0.012 6.5 128];
%! fn  = tempname ();
%! fid = fopen (fn, "w");
%! fputs (fid, "+3.1e0\t-72E-1\t0\n12e-3\t6.5\t128");
%! fclose (fid);
%! [a,d,h] = importdata (fn, "\\t");
%! unlink (fn);
%! assert (a, A);
%! assert (d, "\t");
%! assert (h, 0);

%!test
%! # Missing values
%! A = [3.1 NaN 0; 0.012 6.5 128];
%! fn  = tempname ();
%! fid = fopen (fn, "w");
%! fputs (fid, "3.1\t\t0\n0.012\t6.5\t128");
%! fclose (fid);
%! [a,d,h] = importdata (fn, "\\t");
%! unlink (fn);
%! assert (a, A);
%! assert (d, "\t");
%! assert (h, 0);

%!test
%! # CRLF for line breaks
%! A = [3.1 -7.2 0; 0.012 6.5 128];
%! fn  = tempname ();
%! fid = fopen (fn, "w");
%! fputs (fid, "3.1\t-7.2\t0\r\n0.012\t6.5\t128");
%! fclose (fid);
%! [a,d,h] = importdata (fn, "\\t");
%! unlink (fn);
%! assert (a, A);
%! assert (d, "\t");
%! assert (h, 0);