Mercurial > repos > iuc > mageck_mle

<?xml version="1.0"?>
<tool id="mageck_mle" name="MAGeCK mle" version="@VERSION@.1" >
    <description>- perform maximum-likelihood estimation of gene essentiality scores</description>
    <macros>
        <import>mageck_macros.xml</import>
    </macros>
    <expand macro="requirements" />
    <expand macro="version" />
    <!--
        Command template (Cheetah). Builds a single "mageck mle" invocation from
        the form values declared in the inputs element below. All result files
        are written to the job working directory with the fixed prefix "output".
    -->
    <command detect_errors="exit_code"><![CDATA[

mageck mle

-k '$count_table'

## Samples are described either by a design matrix file or by explicit labels.
#if $samples.sample_select == "dmatrix":
    -d '$samples.design_matrix'
#elif $samples.sample_select == "labels":
    --day0-label '$samples.day0_label'
    -i '$samples.include_samples'
    -b '$samples.beta_labels'
#end if

## Fixed output prefix; the outputs section relies on this name.
-n output

## Optional dataset paths are quoted like the required ones above.
#if $adv.control_sgrna:
    --control-sgrna '$adv.control_sgrna'
#end if

#if $adv.cnv_norm:
    --cnv-norm '$adv.cnv_norm'
#end if

#if $adv.norm_method:
    --norm-method $adv.norm_method
#end if

#if $adv.genes_var:
    --genes-varmodeling $adv.genes_var
#end if

#if $adv.permutation:
    --permutation-round $adv.permutation
#end if

#if $adv.adjust_method:
    --adjust-method $adv.adjust_method
#end if

## The column options are only meaningful together with an efficiency file.
## Each column flag is emitted only when a column was actually chosen, so an
## unset optional parameter never reaches the command line as "None".
#if $adv.sgrnaeff_file:
    --sgrna-efficiency '$adv.sgrnaeff_file'
    #if str($adv.sgrnaeff_name_col) not in ('', 'None'):
        --sgrna-eff-name-column $adv.sgrnaeff_name_col
    #end if
    #if str($adv.sgrnaeff_score_col) not in ('', 'None'):
        --sgrna-eff-score-column $adv.sgrnaeff_score_col
    #end if
#end if

## Boolean parameters expand to their flag or to an empty string.
$adv.remove_outliers
$adv.update_eff
--threads \${GALAXY_SLOTS:-1}

    ]]></command>
    <!--
        Form definition. Every parameter name used here is referenced from the
        command template above; keep the two in sync when renaming.
    -->
    <inputs>
        <param name="count_table" argument="--count-table" type="data" format="tabular" label="Counts file"
            help="Provide a tab-separated count table. Each line in the table should include sgRNA name (1st column), target gene (2nd column) and read counts in each sample. See Help below for more information" />
        <conditional name="samples">
            <param name="sample_select" type="select" label="Design matrix or sample labels" help="You can choose to either provide a design matrix or specify the samples">
                <option value="dmatrix">Design matrix</option>
                <option value="labels">Specify samples</option>
            </param>
            <when value="dmatrix">
                <param name="design_matrix" argument="--design-matrix" type="data" format="tabular" label="Design Matrix file" help="Provide a design matrix, either a file name or a quoted string of the design matrix. For example, 1,1;1,0. The row of the design matrix must match the order of the samples in the count table (if --include-samples is not specified), or the order of the samples by the --include-samples option" />
            </when>
            <when value="labels">
                <param name="include_samples" argument="--include-samples" type="text" label="Sample labels" help="Specify the sample labels if the design matrix is not given by file in the --design-matrix option. Sample labels are separated by comma (,) and must match the labels in the count table">
                    <validator type="regex" message="Please only use letters, numbers or underscores in sample labels, and separate labels by commas">^[\w,]+$</validator>
                </param>
                <!-- Text parameter: a dataset "format" attribute does not apply here. -->
                <param name="day0_label" argument="--day0-label" type="text" optional="true" value="" label="Control sample" help="Specify the control sample label (usually day 0 or plasmid). For every other sample label, the MLE module will treat it as a single condition and generate an corresponding design matrix">
                    <validator type="regex" message="Please only use letters, numbers or underscores in sample label">^[\w]+$</validator>
                </param>
                <param name="beta_labels" argument="--beta-labels" type="text" label="Variables" help="Specify the labels of the variables (i.e., beta), if the design matrix is not given by file in the --design-matrix option. Should be separated by commas (,), and the number of labels must equal to the number of columns of design matrix), including baseline labels. Default: beta_0,beta_1,beta_2,....">
                    <validator type="regex" message="Please only use letters, numbers or underscores in labels, and separate labels by commas">^[\w,]+$</validator>
                </param>
            </when>
        </conditional>

        <section name="adv" title="Advanced Options">
            <param name="control_sgrna" argument="--control-sgrna" type="data" format="tabular" optional="true" label="Control sgRNAs file" help="A list of control sgRNAs for normalization and for generating the null distribution of RRA" />
            <param name="cnv_norm" argument="--cnv-norm" type="data" format="tabular" optional="true" label="CNV profile file" help="A matrix of copy number variation data across cell lines to normalize CNV-biased BetaScores" />
            <param name="norm_method" argument="--norm-method" type="select" optional="true" label="Method for normalization" help="If control is specified, the size factor will be estimated using control sgRNAs specified in --control-sgrna option. Default: Median" >
                <option value="median" selected="True">Median</option>
                <option value="none">None</option>
                <option value="total">Total</option>
                <option value="control">Control</option>
            </param>
            <param name="genes_var" argument="--genes-varmodeling" type="integer" optional="true" label="Number of genes for mean-variance modeling" help="Default: 1000" />
            <param name="permutation" argument="--permutation-round" type="integer" optional="true" label="Number of permutations" help="The rounds for permutation. The permutation time is (# genes) * x for x rounds of permutation. Default: 2" />
            <param name="remove_outliers" argument="--remove-outliers" type="boolean" truevalue="--remove-outliers" falsevalue="" checked="false" optional="true" label="Try to remove outliers" help="Turning this option on will slow the algorithm" />
            <param name="adjust_method" argument="--adjust-method" type="select" optional="true" label="P-Value Adjustment Method" help="Method for sgRNA-level p-value adjustment, including False Discovery Rate (FDR), Holm's method (Holm), or Pounds's method (Pounds). Default: FDR">
                <option value="fdr">FDR</option>
                <option value="holm">Holm</option>
                <option value="pounds">Pounds</option>
            </param>
            <param name="sgrnaeff_file" argument="--sgrna-efficiency" type="data" format="tabular" optional="true" label="sgRNA efficiency file" help="An optional file of sgRNA efficiency prediction. The efficiency prediction will be used as an initial guess of the probability an sgRNA is efficient. Must contain at least two columns, one containing sgRNA ID, the other containing sgRNA efficiency prediction" />
            <!--
                The two column parameters below are named after the flag they feed:
                sgrnaeff_name_col is the sgRNA ID column, sgrnaeff_score_col is the
                efficiency score column.
                NOTE(review): Galaxy presents data_column choices as 1-based column
                numbers, while the help text describes 0-based indices; confirm
                which convention reaches MAGeCK.
            -->
            <param name="sgrnaeff_name_col" argument="--sgrna-eff-name-column" type="data_column" data_ref="sgrnaeff_file" value="0" optional="true" label="sgRNA ID column" help="The sgRNA ID column in sgRNA efficiency prediction file (specified by the --sgrna-efficiency option). Default is 0 (the first column)" />
            <param name="sgrnaeff_score_col" argument="--sgrna-eff-score-column" type="data_column" data_ref="sgrnaeff_file" value="1" optional="true" label="sgRNA score column" help="The sgRNA efficiency prediction column in sgRNA efficiency prediction file (specified by the --sgrna-efficiency option). Default is 1 (the second column)." />
            <param name="update_eff" argument="--update-efficiency" type="boolean" truevalue="--update-efficiency" falsevalue="" checked="false" optional="true"
                label="Update efficiency" help="Iteratively update sgRNA efficiency during EM iteration" />
            <!-- Not passed to mageck; only controls whether the log output dataset is created. -->
            <param name="out_log" type="boolean" truevalue="True" falsevalue="" checked="false"
                label="Output Log file" help="This file includes the logging information during the execution. For count command, it will list some basic statistics of the dataset at the end, including the number of reads, the number of reads mapped to the library, the number of zero-count sgRNAs, etc. Default: No" />
        </section>
    </inputs>
    <!--
        Output datasets. The command runs mageck with the fixed prefix "output"
        (the -n option), so every result file has a known name in the working
        directory. from_work_dir takes a relative path, so the explicit file
        names are used instead of glob patterns.
    -->
    <outputs>
        <data name="gene_summary" format="tabular" from_work_dir="output.gene_summary.txt" label="${tool.name} on ${on_string}: Gene Summary (MLE)" />
        <data name="sgrna_summary" format="tabular" from_work_dir="output.sgrna_summary.txt" label="${tool.name} on ${on_string}: sgRNA Summary (MLE)" />
        <!-- The log dataset is only created when the out_log option is switched on. -->
        <data name="log" format="tabular" from_work_dir="output.log" label="${tool.name} on ${on_string}: Log (MLE)" >
            <filter>adv['out_log'] is True</filter>
        </data>
    </outputs>
    <tests>
        <!-- Ensure MAGeCK's demo1 test works: design-matrix mode with the log output enabled. -->
        <test>
            <param name="count_table" ftype="tabular" value="demo/demo1/sample.txt" />
            <param name="design_matrix" ftype="tabular" value="in.mle.design_matrix.txt" />
            <param name="out_log" value="True" />
            <!-- Gene summary and log are compared by size only; the sgRNA summary must match exactly. -->
            <output name="gene_summary" file="out.mle.gene_summary.txt" compare="sim_size" />
            <output name="sgrna_summary" file="out.mle.sgrna_summary.txt" />
            <output name="log" file="out.mle.log.txt" compare="sim_size" />
        </test>
    </tests>

    <help><![CDATA[
.. class:: infomark

**What it does**

**MAGeCK mle** calculates gene essentiality from CRISPR screens. Compared with the original algorithm in **MAGeCK test**,
MAGeCK mle uses a measurement called beta score to call gene essentialities: a positive beta score means a gene is positively selected,
and a negative beta score means a gene is negatively selected. It is similar to the term log-fold change in differential expression,
and compared with the original robust ranking aggregation (RRA) algorithm, this measurement has the following advantages:

  * It has only one score for one gene, instead of two scores in RRA: one for positive selection, one for negative selection;
  * It allows a direct comparison across multiple conditions, or even experiments;
  * It is able to incorporate sgRNA efficiency information.

-----

**Inputs**

**sgRNA count file**

The sgRNA read count file will be used in -k parameter in the mle command. The read count file should list the names of the sgRNA, the gene it is targeting, followed by the read counts in each sample. Each item should be separated by the tab ('\t'). A header line is optional. For example in the studies of T. Wang et al. Science 2014, there are 4 CRISPR screening samples, and they are labeled as: HL60.initial, KBM7.initial, HL60.final, KBM7.final. Here are a few lines of the read count file:

    ==============  ========    ================    ================   ==============   ==============
    **sgRNA**       **gene**    **HL60.initial**    **KBM7.initial**   **HL60.final**   **KBM7.final**
    --------------  --------    ----------------    ----------------   --------------   --------------
    A1CF_m52595977  A1CF        213                 274                883              175
    A1CF_m52596017  A1CF        294                 412                1554             1891
    A1CF_m52596056  A1CF        421                 368                566              759
    A1CF_m52603842  A1CF        274                 243                314              855
    A1CF_m52603847  A1CF        0                   50                 145              266
    ==============  ========    ================    ================   ==============   ==============

**Design matrix file**

Either the sample labels can be specified in the tool form above, or alternatively, a `design matrix` file can be provided. The design matrix indicates which sample is affected by which condition. It is generally a binary matrix indicating which sample (indicated by the first column) is affected by which condition (indicated by the first row). For the meanings of the design matrix, check the input file format page.

    ============ ======== ==== ====
    **Samples**  baseline HL60 KBM7
    ------------ -------- ---- ----
    HL60.initial 1        0    0
    KBM7.initial 1        0    0
    HL60.final   1        1    0
    KBM7.final   1        0    1
    ============ ======== ==== ====

The following are the rules for the design matrix file:

* The design matrix file must include a header line of condition labels
* The first column is the sample labels that must match sample labels in read count file
* The second column must be a "baseline" column that sets all values to "1"
* The element in the design matrix is either "0" or "1"
* You must have at least one sample of "initial state" (e.g., day 0 or plasmid) that has only one "1" in the corresponding row. That only "1" must be in the baseline column.
* In the design matrix above, there are four samples, two corresponding to the initial states of two cell lines, and two corresponding to the final states of two cell lines. We design two conditions (HL60 and KBM7) that model the cell type-specific effects.

**Control sgRNA file**

The optional Control sgRNAs file is used to generate null distribution when calculating the p values. If this option is not specified, MAGeCK generates the null distribution of RRA scores by assuming all of the genes in the library are non-essential, see **More Information** below. This approach is sometimes over-conservative, and you can improve this if you know some genes are not essential. By providing the corresponding sgRNA IDs in this option, MAGeCK will have a better estimation of p values. To use this option, you need to prepare a text file specifying the IDs of control sgRNAs, one line for one sgRNA ID.

**Outputs**

This tool outputs

    * a ranked sgRNA Summary file
    * a ranked Gene Summary file

Optionally, under **Advanced Options** you can choose to output

    * a Log file of the analysis

If successful, MAGeCK mle will generate two files, the Gene Summary file (including gene beta scores), and the sgRNA Summary file (including sgRNA efficiency probability predictions).

**Gene Summary file (including beta scores)**

An example of the gene summary output file is below. This file includes the beta scores in two conditions specified in the design matrix (HL60|beta and KBM7|beta), and the associated statistics. For more information, check the output format specification of the **mageck test** *Gene Summary* file.

======== ========= ============= ========== ================ ============ ===================== ================= ============= =========== ================ ============ ===================== =================
**Gene** **sgRNA** **HL60|beta** **HL60|z** **HL60|p-value** **HL60|fdr** **HL60|wald-p-value** **HL60|wald-fdr** **KBM7|beta** **KBM7|z**  **KBM7|p-value** **KBM7|fdr** **KBM7|wald-p-value** **KBM7|wald-fdr**
-------- --------- ------------- ---------- ---------------- ------------ --------------------- ----------------- ------------- ----------- ---------------- ------------ --------------------- -----------------
RNF14    10        0.24927       0.72077    0.36256          0.75648      0.47105               0.9999            0.57276       1.6565      0.06468          0.32386      0.097625              0.73193
RNF10    10        0.10159       0.29373    0.92087          0.98235      0.76896               0.9999            0.11341       0.32794     0.90145          0.97365      0.74296               0.98421
RNF11    10        3.6354        10.513     0.00028          0.021739     7.5197e-26            1.3376e-22        2.5928        7.4925      0.0014898        0.032024     6.7577e-14            1.33e-11
======== ========= ============= ========== ================ ============ ===================== ================= ============= =========== ================ ============ ===================== =================


**sgRNA Summary file (including sgRNA efficiency probability predictions)**

An example of the sgRNA ranking output is as follows:

================  ========  ================= ===================  ================  ============== =======  ===============  ===========  =========  ========= =========== ============== ===========  =====================
**sgrna**         **Gene**  **control_count** **treatment_count**  **control_mean**  **treat_mean** **LFC**  **control_var**  **adj_var**  **score**  **p.low** **p.high**  **p.twosided** **FDR**      **high_in_treatment**
----------------  --------  ----------------- -------------------  ----------------  -------------- -------  ---------------  -----------  ---------  --------- ----------- -------------- -----------  ---------------------
INO80B_m74682554  INO80B    0.0/0.0           1220.15/1476.14      0.810860          1348.15        10.70    0.0              19.0767      308.478    1.0       1.11022e-16  2.22044e-16   1.57651e-14  True
NHS_p17705966     NHS       1.62172/3.90887   2327.09/1849.95      2.76529           2088.52        9.54     2.61554          68.2450      252.480    1.0       1.11022e-16  2.22044e-16   1.57651e-14  True
================  ========  ================= ===================  ================  ============== =======  ===============  ===========  =========  ========= =========== ============== ===========  =====================

The contents of each column are as follows:

* **sgrna** sgRNA ID
* **Gene**  The targeting gene
* **control_count** Normalized read counts in control samples
* **treatment_count** Normalized read counts in treatment samples
* **control_mean**  Mean read counts in control samples
* **treat_mean**  Mean read counts in treatment samples
* **LFC** The log fold change of sgRNA
* **control_var** The raw variance in control samples
* **adj_var** The adjusted variance in control samples
* **score** The score of this sgRNA
* **p.low** p-value (lower tail)
* **p.high**  p-value (higher tail)
* **p.twosided**  p-value (two sided)
* **FDR** false discovery rate
* **high_in_treatment** Whether the abundance is higher in treatment samples

-----

**More Information**

**Overview of the MAGeCK algorithm**

Briefly, read counts from different samples are first median-normalized to adjust for the effect of library sizes and read count distributions. Then the variance of read counts is estimated by sharing information across features, and a negative binomial (NB) model is used to test whether sgRNA abundance differs significantly between treatments and controls. This approach is similar to those used for differential RNA-Seq analysis. We rank sgRNAs based on P-values calculated from the NB model, and use a modified robust ranking aggregation (RRA) algorithm named α-RRA to identify positively or negatively selected genes. More specifically, α-RRA assumes that if a gene has no effect on selection, then sgRNAs targeting this gene should be uniformly distributed across the ranked list of all the sgRNAs. α-RRA ranks genes by comparing the skew in rankings to the uniform null model, and prioritizes genes whose sgRNA rankings are consistently higher than expected. α-RRA calculates the statistical significance of the skew by permutation, and a detailed description of the algorithm is presented in the Materials and methods section of the `MAGeCK paper`_. Finally, MAGeCK reports positively and negatively selected pathways by applying α-RRA to the rankings of genes in a pathway.

For more information on using MAGeCK, see the `MAGeCK website here`_.

.. _`design matrix`: https://sourceforge.net/p/mageck/wiki/input/#design-matrix-file
.. _`MAGeCK paper`: https://genomebiology.biomedcentral.com/articles/10.1186/s13059-014-0554-4
.. _`MAGeCK website here`: https://sourceforge.net/p/mageck/wiki/QA/#using-mageck

    ]]></help>
      <expand macro="citations" />
</tool>
author	iuc
date	Thu, 19 Apr 2018 05:34:53 -0400
parents	9cd937788131
children	b34c9d6373e0