view pyCRAC/pyClusterReads.xml @ 0:19b20927172d draft

Uploaded
author swebb
date Tue, 18 Jun 2013 09:11:00 -0400
parents
children
line wrap: on
line source

<tool id="pyClusterReads" name="pyClusterReads" force_history_refresh="True">
	<!-- Runtime dependency: the pyCRAC package, which pyClusterReads belongs to. -->
	<requirements>
		<requirement type="package">pyCRAC</requirement>
	</requirements>
	<command interpreter="python">
	## Cheetah template that assembles the pyClusterReads.py command line.
	/usr/local/bin/pyClusterReads.py
	-f $input
	--gtf=$addGTF.gtf
	## --annotation is only passed when the user asked for something other
	## than "all"; otherwise the script's own default is used.
	#if $addGTF.annotate.annotations != "all":
		#if $addGTF.gtfFile == "default" and $addGTF.annotate.annotations == "auto":
			## The chosen annotation lives inside the nested "scan" conditional
			## and is only defined once that conditional is set to "scanning".
			## While it is still "wait" there is nothing to read, so the flag
			## is omitted rather than failing during template evaluation.
			#if $addGTF.annotate.scan.annotations == "scanning":
				--annotation=$addGTF.annotate.scan.annotation
			#end if#
		#else:
			--annotation=$addGTF.annotate.annotation
		#end if#
	#end if#
	-o $output
	## Cluster tuning flags are only emitted when the user chose to edit them.
	#if $addOpt.options == "edit":
		--range=$addOpt.range
		--cic=$addOpt.cic
		--co=$addOpt.co
		--ch=$addOpt.ch
		--cl=$addOpt.cl
		--mutsfreq=$addOpt.mutsfreq
	#end if#
	</command>
	<!-- Command line for querying the script version (invokes the script with its version flag). -->
	<version_command>/usr/local/bin/pyClusterReads.py --version</version_command>
	<inputs>
	    <!-- Read-interval dataset to cluster; the command template passes it with -f. -->
	    <param name="input"
	           type="data"
	           format="gtf"
	           label="Input Read Data File -f"
	           help="GTF format sorted by position i.e. pyReadCounters output file."/>
	    <!-- Selects the GTF annotation file (from the pycrac_gtf data table or from
	         the history) and, optionally, one annotation to restrict the run to. -->
	    <conditional name="addGTF">
	        <param name="gtfFile" type="select" label="Choose GTF File from">
	            <option value="default" selected="true">Defaults</option>
	            <option value="other">History</option>
	        </param>

	        <!-- GTF file picked from the pycrac_gtf data table. -->
	        <when value="default">
	            <param name="gtf" type="select" label="GTF File --gtf" help="GTF file containing gene ID co-ordinates">
	                <options from_data_table="pycrac_gtf"/>
	            </param>
	            <conditional name="annotate">
	                <param name="annotations" type="select" label="Select annotation">
	                    <option value="all" selected="true">All</option>
	                    <option value="manual">Enter in text box</option>
	                    <option value="auto">Scan pyGetGTFSources file</option>
	                </param>
	                <when value="all">
	                    <param name="annotation" type="hidden" format="txt" size="10" value="all"/>
	                </when>
	                <when value="manual">
	                    <param name="annotation" type="text" format="txt" size="100" value="protein_coding" label="Select which annotation to focus search on --annotation" help="To find a list of available annotations please use pyGetGTFSources tool">
	                        <validator type="empty_field" message="Please enter a value"/>
	                    </param>
	                </when>
	                <!-- The annotation list is read from a separate tabular dataset. -->
	                <when value="auto">
	                    <param format="tabular" name="gtf_annotation" type="data" label="GTF annotation File (pyGetGTFSources output)" help="Tabular file containing unique list of annotations/sources in selected GTF file. Refer to pyGetGTFSources"/>
	                    <!-- The annotation select only exists in the "scanning" case;
	                         the "wait" case defines no parameters at all.
	                         NOTE(review): presumably the wait/go switch lets the user
	                         choose the dataset before the option list is built; confirm. -->
	                    <conditional name="scan">
	                        <param name="annotations" type="select" label="Scan this file for annotations" help="Choose the correct GTF file then choose GO">
	                            <option value="wait" selected="true">Waiting</option>
	                            <option value="scanning">Go</option>
	                        </param>
	                        <when value="wait"/>
	                        <when value="scanning">
	                            <param name="annotation" type="select" multiple="false" label="Select which annotation to focus search on --annotation">
	                                <options from_dataset="gtf_annotation">
	                                    <column name="name" index="0"/>
	                                    <column name="value" index="0"/>
	                                </options>
	                            </param>
	                        </when>
	                    </conditional>
	                </when>
	            </conditional>
	        </when>

	        <!-- GTF file supplied as a history dataset. The format name is written in
	             lowercase to match the other gtf declarations in this tool. -->
	        <when value="other">
	            <param format="gtf" name="gtf" type="data" label="GTF File --gtf" help="GTF file containing gene ID co-ordinates"/>
	            <conditional name="annotate">
	                <param name="annotations" type="select" label="Select annotation">
	                    <option value="all" selected="true">All</option>
	                    <option value="manual">Enter in text box</option>
	                    <option value="auto">Scan selected file</option>
	                </param>
	                <when value="all">
	                    <param name="annotation" type="hidden" format="txt" size="10" value="all"/>
	                </when>
	                <when value="manual">
	                    <param name="annotation" type="text" format="txt" size="100" value="protein_coding" label="Select which annotation to focus search on --annotation" help="To find a list of available annotations please use pyGetGTFSources tool">
	                        <validator type="empty_field" message="Please enter a value"/>
	                    </param>
	                </when>
	                <!-- Options are the unique values of column index 1 of the chosen GTF dataset. -->
	                <when value="auto">
	                    <param name="annotation" type="select" multiple="false" label="Select which annotation to focus search on --annotation">
	                        <options from_dataset="gtf">
	                            <column name="name" index="1"/>
	                            <column name="value" index="1"/>
	                            <filter type="unique_value" name="unique" column="1"/>
	                        </options>
	                    </param>
	                </when>
	            </conditional>
	        </when>
	    </conditional>

	  <!-- Optional cluster tuning parameters. They are only added to the command
	       line when "Edit" is selected; "Default" defines no parameters. -->
	  <conditional name="addOpt">
	      <param name="options" type="select" label="Standard Options">
	          <option value="default" selected="true">Default</option>
	          <option value="edit">Edit</option>
	      </param>
	      <when value="edit">
	          <param format="integer" name="range" type="integer" label="Range --range" value="0" size="5" help="Manually set the length of the 5' and 3' UTRs (0-50000)">
	              <validator type="in_range" min="0" max="50000" message="Please enter a value between 0 and 50000"/>
	          </param>
	          <param format="integer" name="ch" type="integer" label="Cluster height --ch" value="2" size="10" help="Minimal height of a cluster">
	              <validator type="in_range" min="1" message="Please enter a value >= 1"/>
	          </param>
	          <param format="integer" name="cl" type="integer" label="Cluster length --cl" value="1" size="10" help="Maximum length of a cluster">
	              <validator type="in_range" min="1" message="Please enter a value >= 1"/>
	          </param>
	          <!-- The validator rejects values below 2, so the message states that bound. -->
	          <param format="integer" name="cic" type="integer" label="cDNAs in clusters --cic" value="2" size="10">
	              <validator type="in_range" min="2" message="Please enter a value >= 2"/>
	          </param>
	          <param format="integer" name="co" type="integer" label="cDNA-cluster nucleotide overlap --co" value="1" size="10">
	              <validator type="in_range" min="1" message="Please enter a value >= 1"/>
	          </param>
	          <param format="integer" name="mutsfreq" type="integer" label="Minimum mutation frequency for a cluster position --mutsfreq" value="0" size="3">
	              <validator type="in_range" min="0" max="100" message="Please enter a value between 0 and 100"/>
	          </param>
	      </when>
	      <when value="default"/>
	  </conditional>
	<!-- Free-text label; the output dataset's history label is derived from it. -->
	<param name="label"
	       type="text"
	       format="txt"
	       size="30"
	       value="pyClusterReads"
	       label="Enter output file label -o"/>
	</inputs>
	<!-- Single GTF result; its history label is built from the "label" parameter. -->
	<outputs>
		<data name="output"
		      format="gtf"
		      label="${label.value}_clusters.gtf"/>
	</outputs>
	<help>

.. class:: infomark

**pyClusterReads**

pyClusterReads is part of the pyCRAC_ package. It takes a reads_count_output GTF file from pyReadCounters and generates clusters from the interval coordinates.
Produces a GTF output file with cluster intervals and overlapping genomic features.
It also includes mutation frequencies (after the # character) for nucleotides in intervals using chromosomal coordinates.
The pyClusterReads GTF output file essentially has the same layout as other pyCRAC GTF output files.

**NOTE!** By default it calls each cluster an "exon" but this has no meaning. It may overlap with an intron.
Use bedtools to extract those intervals that overlap with introns or other features

The maximum height of the cluster is indicated in column 8. 
The text after the hash character (#) at the end of each line lists the chromosomal coordinates of mutated nucleotides within the cluster interval and their mutation frequencies.

For example::
    
    # 114099S100.0 

indicates that 100% of the nucleotides in position 114099 were substituted in the cluster.

An example of a pyClusterReads output file::

    ##gff-version 2
    # generated by pyClusterReads.py version 0.0.1, Fri Jan 18 11:59:42 2013
    # pyClusterReads.py -f count_output_reads.gtf -o count_output_clusters.gtf -v
    # chromosome    feature source  start   end     cDNAs   strand  height  attributes
    chrI    cluster exon    112583  112643  6       -       5   gene_id "INT_0_114,YAL021C"; gene_name "INT_0_114,CCR4"; # 112612S75.0;
    chrI    cluster exon    113176  113232  3       -       3   gene_id "INT_0_114,YAL021C"; gene_name "INT_0_114,CCR4"; # 113184S100.0;
    chrI    cluster exon    113334  113386  2       -       2   gene_id "INT_0_114,YAL021C"; gene_name "INT_0_114,CCR4"; # 113349S50.0,113379S100.0;
    chrI    cluster exon    113534  113564  3       -       3   gene_id "INT_0_119,INT_0_114"; gene_name "INT_0_119,INT_0_114"; # 113554S33.3,113556S33.3,113557S33.3;
    chrI    cluster exon    113644  113691  5       -       4   gene_id "YAL020C,INT_0_114"; gene_name "ATS1,INT_0_114"; # 113649S50.0,113657S33.3,113679S25.0
    chrI    cluster exon    113912  113958  2       -       2   gene_id "YAL020C,INT_0_114"; gene_name "ATS1,INT_0_114"; # 113932S50.0,113946S50.0;
    chrI    cluster exon    113966  114066  5       -       3   gene_id "YAL020C,INT_0_114"; gene_name "ATS1,INT_0_114"; # 113987S50.0,114033S33.3,114039S33.3;
    chrI    cluster exon    114067  114130  3       -       3   gene_id "YAL020C,INT_0_114"; gene_name "ATS1,INT_0_114"; # 114099S100.0;

.. _pyCRAC: http://sandergranneman.bio.ed.ac.uk/Granneman_Lab/pyCRAC_software.html                                                                          

------

**Parameter list**  


File input options::

    -f reads.gtf, --input_file=reads.gtf
                                  provide the path to your GTF read data file. NOTE the
				  file has to be correctly sorted! If you used
				  pyReadCounters to generate the file you should be
				  fine. If you modified it, use the sort command
				  described in the manual to sort your file first by
				  chromosome, then by strand and then by start position.
    -o clusters.gtf, --output_file=clusters.gtf
                                  provide a name for an output file. By default it
				  writes to the standard output
    --gtf=Yourfavoritegtf.gtf
                                  type the path to the gtf annotation file that you want
				  to use

Common pyCRAC options::

    -r 100, --range=100
                                  allows you to set the length of the UTR regions. If
				  you set '-r 50' or '--range=50', then the program will
				  set a fixed length (50 bp) regardless of whether the
				  GTF annotation file has genes with annotated UTRs.
    -a protein_coding, --annotation=protein_coding
                                  select which annotation (i.e. protein_coding, ncRNA,
				  sRNA, rRNA,snoRNA,snRNA, depending on the source of
				  your GTF file) you would like to focus your analysis
				  on. Default = all annotations

Options for cluster analysis::

    --cic=2, --cdnasinclusters=2
                                  sets the minimal number of overlapping cDNAs in each
				  cluster. Default = 2
    --co=5, --clusteroverlap=5
                                  sets the number of nucleotides cDNA sequences have to
				  overlap to form a cluster. Default = 1 nucleotide
    --ch=5, --clusterheight=5
                                  sets the minimal height of the cluster. Default = 2
				  nucleotides
    --cl=100, --clusterlength=100
                                  to set the maximum cluster sequence length
    --mutsfreq=10, --mutationfrequency=10
                                  sets the minimal mutations frequency for a cluster
				  position in the GTF output file. Default = 0%.
				  Example: if the mutsfrequency is set at 10 and a
				  cluster position is mutated in less than 10% of the
				  reads, then the mutation will not be reported.
	</help>
</tool>