Mercurial > repos > xuebing > sharplabtool
diff tools/human_genome_variation/lps.xml @ 0:9071e359b9a3
Uploaded
author | xuebing |
---|---|
date | Fri, 09 Mar 2012 19:37:19 -0500 |
parents | |
children |
line wrap: on
line diff
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/tools/human_genome_variation/lps.xml Fri Mar 09 19:37:19 2012 -0500 @@ -0,0 +1,304 @@ +<tool id="hgv_lps" name="LPS" version="1.0.0"> + <description>LASSO-Patternsearch algorithm</description> + + <command interpreter="bash"> + lps_tool_wrapper.sh $lambda_fac $input_file $label_column $output_file $log_file + Initialization 0 + #if $advanced.options == "true": + Sample $advanced.sample + Verbosity $advanced.verbosity + Standardize $advanced.standardize + initialLambda $advanced.initialLambda + #if $advanced.continuation.continuation == "1": + Continuation $advanced.continuation.continuation + continuationSteps $advanced.continuation.continuationSteps + accurateIntermediates $advanced.continuation.accurateIntermediates + #end if + printFreq $advanced.printFreq + #if $advanced.newton.newton == "1": + Newton $advanced.newton.newton + NewtonThreshold $advanced.newton.newtonThreshold + #end if + HessianSampleFraction $advanced.hessianSampleFraction + BB 0 + Monotone 0 + FullGradient $advanced.fullGradient + GradientFraction $advanced.gradientFraction + InitialAlpha $advanced.initialAlpha + AlphaIncrease $advanced.alphaIncrease + AlphaDecrease $advanced.alphaDecrease + AlphaMax $advanced.alphaMax + c1 $advanced.c1 + MaxIter $advanced.maxIter + StopTol $advanced.stopTol + IntermediateTol $advanced.intermediateTol + FinalOnly $advanced.finalOnly + #end if + </command> + + <inputs> + <param name="input_file" type="data" format="tabular" label="Dataset"/> + <param name="label_column" type="data_column" data_ref="input_file" numerical="true" label="Label column" help="Column containing outcome labels: +1 or -1."/> + <param name="lambda_fac" label="Lambda_fac" type="float" value="0.03" help="Target value of the regularization parameter, expressed as a fraction of the calculated lambda_max."> + <validator type="in_range" message="0.00 &lt; lambda_fac &lt;= 1.00" min="0.00" max="1.00"/> + </param> + <conditional name="advanced"> + 
<param name="options" type="select" label="Advanced Options"> + <option value="false" selected="true">Hide advanced options</option> + <option value="true">Show advanced options</option> + </param> + <when value="false"> + <!-- no options --> + </when> + <when value="true"> + <!-- HARDCODED: 'Sample' we don't support passing an array --> + <param name="sample" type="float" value="1.0" label="Sample fraction" help="Sample this fraction of the data set."> + <validator type="in_range" message="0.0 &lt;= sample &lt;= 1.0" min="0.0" max="1.0"/> + </param> + <!-- HARDCODED: 'Initialization' = 0 :: Initialize at beta=0 --> + <param name="verbosity" type="select" format="integer" label="Verbosity"> + <option value="0" selected="true">Little output</option> + <option value="1">More output</option> + <option value="2">Still more output</option> + </param> + <param name="standardize" type="select" format="integer" label="Standardize" help="Scales and shifts each column so that it has mean zero and variance 1."> + <option value="0" selected="true">Don't standardize</option> + <option value="1">Standardize</option> + </param> + <param name="initialLambda" type="float" value="0.8" label="Initial lambda" help="First value of lambda to be used in the continuation scheme, expressed as a fraction of lambda_max."> + <validator type="in_range" message="0.0 &lt; initialLambda &lt; 1.0" min="0.0" max="1.0"/> + </param> + <conditional name="continuation"> + <param name="continuation" type="select" format="integer" label="Continuation" help="Use continuation strategy to start with a larger value of lambda, decreasing it successively to lambda_fac."> + <option value="0" selected="true">Don't use continuation</option> + <option value="1">Use continuation</option> + </param> + <when value="0"> + <!-- no options --> + </when> + <when value="1"> + <param name="continuationSteps" type="integer" value="5" label="Continuation steps" help="Number of lambda values to use in continuation &lt;em&gt;prior&lt;/em&gt; to 
target value lambda_fac."/> + + <param name="accurateIntermediates" type="select" format="integer" label="Accurate intermediates" help="Indicates whether accurate solutions are required for lambda values other than the target value lambda_fac."> + <option value="0" selected="true">Don't need accurate intermediates</option> + <option value="1">Calculate accurate intermediates</option> + </param> + </when> + </conditional> <!-- name="continuation" --> + <param name="printFreq" type="integer" value="1" label="Print frequency" help="Print a progress report every NI iterations, where NI is the supplied value of this parameter."> + <validator type="in_range" message="printFreq >= 1" min="1"/> + </param> + <conditional name="newton"> + <param name="newton" type="select" format="integer" label="Projected Newton steps"> + <option value="0" selected="true">No Newton steps</option> + <option value="1">Try projected Newton steps</option> + </param> + <when value="0"> + <!-- no options --> + </when> + <when value="1"> + <param name="newtonThreshold" type="integer" value="500" label="Newton threshold" help="Maximum size of free variable subvector for Newton."/> + </when> + </conditional> + <param name="hessianSampleFraction" type="float" value="1.0" label="Hessian sample fraction" help="Fraction of terms to use in approximate Hessian calculation."> + <validator type="in_range" message="0.01 &lt; hessianSampleFraction &lt;= 1.00" min="0.01" max="1.00"/> + </param> + <!-- HARDCODED: 'BB' = 0 :: don't use Barzilai-Borwein steps --> + <!-- HARDCODED: 'Monotone' = 0 :: don't force monotonicity --> + <param name="fullGradient" type="select" format="integer" label="Partial gradient vector selection"> + <option value="0">Use randomly selected partial gradient, including current active components ("biased")</option> + <option value="1">Use full gradient vector at every step</option> + <option value="2">Randomly selected partial gradient, without regard to current active set 
("unbiased")</option> + </param> + <param name="gradientFraction" type="float" value="0.1" label="Gradient fraction" help="Fraction of inactive gradient vector to evaluate."> + <validator type="in_range" message="0.0 &lt; gradientFraction &lt;= 1" min="0.0" max="1.0"/> + </param> + <param name="initialAlpha" type="float" value="1.0" label="Initial value of alpha"/> + <param name="alphaIncrease" type="float" value="2.0" label="Alpha increase" help="Factor by which to increase alpha after descent not obtained."/> + <param name="alphaDecrease" type="float" value="0.8" label="Alpha decrease" help="Factor by which to decrease alpha after successful first-order step."/> + <param name="alphaMax" type="float" value="1e12" label="Alpha max" help="Maximum value of alpha; terminate with error if we exceed this."/> + <param name="c1" type="float" value="1e-3" label="c1" help="Parameter defining the margin by which the first-order step is required to decrease before being taken."> + <validator type="in_range" message="0.0 &lt; c1 &lt; 1.0" min="0.0" max="1.0"/> + </param> + <param name="maxIter" type="integer" value="10000" label="Maximum number of iterations" help="Terminate with error if we exceed this."/> + <param name="stopTol" type="float" value="1e-6" label="Stop tolerance" help="Convergence tolerance for target value of lambda."/> + <param name="intermediateTol" type="float" value="1e-4" label="Intermediate tolerance" help="Convergence tolerance for intermediate values of lambda."/> + <param name="finalOnly" type="select" format="integer" label="Final only"> + <option value="0" selected="true">Return information for all intermediate values</option> + <option value="1">Just return information at the last lambda</option> + </param> + </when> <!-- value="true" --> + </conditional> <!-- name="advanced" --> + </inputs> + + <outputs> + <data name="output_file" format="tabular" label="${tool.name} on ${on_string}: results"/> + <data name="log_file" format="txt" label="${tool.name} on 
${on_string}: log"/> + </outputs> + + <requirements> + <requirement type="package">lps_tool</requirement> + </requirements> + + <tests> + <test> + <param name="input_file" value="lps_arrhythmia.tabular"/> + <param name="label_column" value="280"/> + <param name="lambda_fac" value="0.03"/> + <param name="options" value="true"/> + <param name="sample" value="1.0"/> + <param name="verbosity" value="1"/> + <param name="standardize" value="0"/> + <param name="initialLambda" value="0.9"/> + <param name="continuation" value="1"/> + <param name="continuationSteps" value="10"/> + <param name="accurateIntermediates" value="0"/> + <param name="printFreq" value="1"/> + <param name="newton" value="1"/> + <param name="newtonThreshold" value="500"/> + <param name="hessianSampleFraction" value="1.0"/> + <param name="fullGradient" value="1"/> + <param name="gradientFraction" value="0.5"/> + <param name="initialAlpha" value="1.0"/> + <param name="alphaIncrease" value="2.0"/> + <param name="alphaDecrease" value="0.8"/> + <param name="alphaMax" value="1e12"/> + <param name="c1" value="1e-3"/> + <param name="maxIter" value="2500"/> + <param name="stopTol" value="1e-6"/> + <param name="intermediateTol" value="1e-6"/> + <param name="finalOnly" value="0"/> + <output name="output_file" file="lps_arrhythmia_beta.tabular"/> + <output name="log_file" file="lps_arrhythmia_log.txt"/> + </test> + </tests> + + <help> +**Dataset formats** + +The input and output datasets are tabular_. The columns are described below. +There is a second output dataset (a log) that is in text_ format. +(`Dataset missing?`_) + +.. _tabular: ./static/formatHelp.html#tab +.. _text: ./static/formatHelp.html#text +.. _Dataset missing?: ./static/formatHelp.html + +----- + +**What it does** + +The LASSO-Patternsearch algorithm fits your dataset to an L1-regularized +logistic regression model. A benefit of using L1-regularization is +that it typically yields a weight vector with relatively few non-zero +coefficients. 
+ +For example, say you have a dataset containing M rows (subjects) +and N columns (attributes) where one of these N attributes is binary, +indicating whether or not the subject has some property of interest P. +In simple terms, LPS calculates a weight for each of the other attributes +in your dataset. This weight indicates how "relevant" that attribute +is for predicting whether or not a given subject has property P. +The L1-regularization causes most of these weights to be equal to zero, +which means LPS will find a "small" subset of the remaining N-1 attributes +in your dataset that can be used to predict P. + +In other words, LPS can be used for feature selection. + +The input dataset is tabular, and must contain a label column which +indicates whether or not a given row has property P. In the current +version of this tool, P must be encoded using +1 and -1. The Lambda_fac +parameter ranges from 0 to 1, and controls how sparse the weight +vector will be. At the low end, when Lambda_fac = 0, there will be +no regularization. At the high end, when Lambda_fac = 1, there will be +"too much" regularization, and all of the weights will equal zero. + +The LPS tool creates two output datasets. The first, called the results +file, is a tabular dataset containing one column of weights for each +value of the regularization parameter lambda that was tried. The weight +columns are in order from left to right by decreasing values of lambda. +The first N-1 rows in each column are the weights for the N-1 attributes +in your input dataset. The final row is a constant, the intercept. + +Let **x** be a row from your input dataset and let **b** be a column +from the results file. To compute the probability that row **x** has +a label value of +1: + + Probability(row **x** has label value = +1) = 1 / [1 + exp{**x** \* **b**\[1..N-1\] + **b**\[N\]}] + +where **x** \* **b**\[1..N-1\] represents matrix multiplication. 
+The second output dataset, called the log file, is a text file which +contains additional data about the fitted L1-regularized logistic +regression model. These data include the number of features, the +computed value of lambda_max, the actual values of lambda used, the +optimal values of the log-likelihood and regularized log-likelihood +functions, the number of non-zeros, and the number of iterations. + +Website: http://pages.cs.wisc.edu/~swright/LPS/ + +----- + +**Example** + +- input file:: + + +1 1 0 0 0 0 1 0 1 1 ... + +1 1 1 1 0 0 1 0 1 1 ... + +1 1 0 1 0 1 0 1 0 1 ... + etc. + +- output results file:: + + 0 + 0 + 0 + 0 + 0.025541 + etc. + +- output log file:: + + Data set has 100 vectors with 50 features. + calculateLambdaMax: n=50, m=100, m+=50, m-=50 + computed value of lambda_max: 5.0000e-01 + + lambda=2.96e-02 solution: + optimal log-likelihood function value: 6.46e-01 + optimal *regularized* log-likelihood function value: 6.79e-01 + number of nonzeros at the optimum: 5 + number of iterations required: 43 + etc. + +----- + +**References** + +Koh K, Kim S-J, Boyd S. (2007) +An interior-point method for large-scale l1-regularized logistic regression. +Journal of Machine Learning Research. 8:1519-1555. + +Shi W, Wahba G, Wright S, Lee K, Klein R, Klein B. (2008) +LASSO-Patternsearch algorithm with application to ophthalmology and genomic data. +Stat Interface. 1(1):137-153. + +<!-- +Wright S, Nowak R, Figueiredo M. (2009) +Sparse reconstruction by separable approximation. +IEEE Transactions on Signal Processing. 57:2479-2493. + +Shi J, Yin W, Osher S, Sajda P. (2010) +A fast hybrid algorithm for large scale l1-regularized logistic regression. +Journal of Machine Learning Research. 11:713-741. + +Byrd R, Chin G, Neveitt W, Nocedal J. (2010) +On the use of stochastic Hessian information in unconstrained optimization. +Technical Report. Northwestern University. June 16, 2010. + +Wright S. 
(2010) +Accelerated block-coordinate relaxation for regularized optimization. +Technical Report. University of Wisconsin. August 10, 2010. +--> + + </help> +</tool>