view pyCRAC/pyReadCounters.xml @ 0:19b20927172d draft

Uploaded
author swebb
date Tue, 18 Jun 2013 09:11:00 -0400
parents
children
line wrap: on
line source

<tool id ="pyReadCounters" name="pyReadCounters" force_history_refresh="True">
	<!-- External dependency declared for this tool: the pyCRAC package. -->
	<requirements>
		<requirement type="package">pyCRAC</requirement>
	</requirements>
	<!-- Command line handed to the Perl wrapper; the element text is a Cheetah template.
	     File-path arguments are double-quoted so that a path containing spaces or shell
	     metacharacters still reaches the wrapper as a single argument.
	     NOTE(review): keep the wrapper script name as the very first token of the text;
	     Galaxy is understood to resolve that token against the tool directory when an
	     interpreter is declared. Confirm before placing anything ahead of it. -->
	<command interpreter="perl">
	pyReadCounters.pl
	## Input file, its declared type and the annotation GTF.
	-f "$ftype.input"
	--file_type $ftype.file_type
	--gtf "$addGTF.gtf"
	## The discarded-reads file only exists for novo and sam inputs with the option switched on.
	#if ($ftype.file_type == "novo" or $ftype.file_type == "sam") and $ftype.disc.discard == "discard":
		--discarded "$discarded"
	#end if#
	## Alignment filters are only passed when the user chose to edit them.
	#if ($ftype.file_type == "novo" or $ftype.file_type == "sam") and $ftype.addAlignOpt.alignoptions == "edit":
		--alignOpt
		--align_quality $ftype.addAlignOpt.align_quality
		--align_score $ftype.addAlignOpt.align_score
		## A max of 0 means all reads, so the flag is left out in that case.
		#if int($ftype.addAlignOpt.max) > 0:
			--max $ftype.addAlignOpt.max
		#end if#
		--distance $ftype.addAlignOpt.d
		--length $ftype.addAlignOpt.length
		## These select values already hold the literal flag text (or are empty), so they stay unquoted.
		$ftype.addAlignOpt.unique
		$ftype.addAlignOpt.blocks
		$ftype.addAlignOpt.mutations
	#end if#
	## Standard options are only passed when the user chose to edit them.
	#if $addOpt.options == "edit":
		--options
		--range $addOpt.range
		$addOpt.ignore
		--overlap $addOpt.overlap
	#end if#

	## Output datasets that are always produced.
	--stats "$stats"
	--hittable "$hittable"
	--intronUTRoverlap "$intronUTRoverlap"

	## The count output only exists for novo and sam inputs.
	#if $ftype.file_type == "novo" or $ftype.file_type == "sam":
		--countoutput "$countoutput"
	#end if#

	--id $stats.id
	</command>
	<!-- Command whose output reports the version of the underlying script.
	     NOTE(review): the path is absolute, so this presumably assumes pyCRAC is installed
	     under /usr/local/bin; confirm for deployments that install it elsewhere. -->
	<version_command>/usr/local/bin/pyReadCounters.py --version</version_command>
	<inputs>
        <!-- Annotation source: either an entry of the pycrac_gtf data table or a GTF dataset
             from the history. Both branches expose the chosen file under the name "gtf". -->
        <conditional name="addGTF">
            <param name="gtfFile" type="select" label="Choose GTF File from">
                <option value="default" selected="true">Defaults</option>
                <option value="other">History</option>
            </param>
            <when value="default">
                <param name="gtf" type="select" label="GTF File --gtf" help="GTF file containing gene ID co-ordinates">
                    <options from_data_table="pycrac_gtf"/>
                </param>
            </when>
            <when value="other">
                <!-- Datatype extension written in lowercase, matching the other gtf references in this tool. -->
                <param format="gtf" name="gtf" type="data" label="GTF File --gtf" help="GTF file containing gene ID co-ordinates"/>
            </when>
        </conditional>
		<conditional name="ftype">
			<!-- Selects the input format; the surrounding conditional switches its sub-options on this value. -->
			<param name="file_type" type="select" label="Input File Type --file_type" help="Use .novo, .sam/.bam or .gtf input files">
				<option value="novo" selected="true">Novo</option>
				<option value="sam">Sam/Bam</option>
				<option value="gtf">GTF</option>
			</param>
			<!-- Novoalign input: the alignment file, an optional discarded reads output and optional alignment filters. -->
			<when value="novo">
				<param format="tabular" name="input" type="data" label="Input File --input_file" help="Alignment file of type .novo"/>
				<conditional name="disc">
					<param name="discard" type="select" label="Print discarded reads to a separate file">
						<option value="" selected="true">OFF</option>
						<option value="discard">ON</option>
					</param>
					<when value="discard"/>
					<when value=""/>
				</conditional>
				<conditional name="addAlignOpt">
					<param name="alignoptions" type="select" label="Alignment Options">
						<option value="default" selected="true">Default</option>
						<option value="edit">Edit</option>
					</param>
					<when value="edit">
						<!-- For the select params below, each non-empty option value is the literal flag text placed on the command line. -->
						<param name="mutations" type="select" label="Option for selecting type of mutations to report --mutations" help="cross-linking sites are often highlighted by deletions and/or substitutions in the reads. You can use this option to select specific mutations that you want to have reported in the GTF output file.">
							<option value="" selected="true">Off</option>
							<option value="--mutations delsonly">deletions</option>
							<option value="--mutations subsonly">substitutions</option>
							<option value="--mutations TC">T->C substitutions</option>
							<option value="--mutations nomuts">no mutations</option>
						</param>
						<param name="align_quality" type="integer" label="Align Quality --align_quality " value="0" size="5">
							<validator type="in_range" min="0" message="Please enter a value >= 0"/>
						</param>
						<param name="align_score" type="integer" label="Align Score --align_score " value="0" size="5">
							<validator type="in_range" min="0" message="Please enter a value >= 0"/>
						</param>
						<param name="max" type="integer" label="Mapped reads to read from input file --max" help="Set to 0 to align all reads." value="0" size="10">
							<validator type="in_range" min="0" max="100000000" message="Please enter a value between 1 and 100000000 or 0 to align all reads"/>
						</param>
						<param name="d" type="integer" label="Distance --distance " value="1000" size="6" help="Set the maximum number of bp allowed between two non-overlapping paired reads">
							<!-- The message now states the bound that min="1" actually enforces. -->
							<validator type="in_range" min="1" message="Please enter a value >= 1"/>
						</param>
						<param name="length" type="integer" label="Set the maximum length of reads --length" value="1000" size="7" help="Set the read length threshold between 15 and 1000">
							<validator type="in_range" min="15" max="1000" message="Please enter a value between 15 and 1000"/>
						</param>
						<param name="unique" type="select" label="Remove reads with multiple alignment locations --unique">
							<option value="" selected="true">OFF</option>
							<option value="--unique">ON</option>
						</param>
						<param name="blocks" type="select" label="Only count reads with same start and end coords once --blocks">
							<option value="" selected="true">OFF</option>
							<option value="--blocks">ON</option>
						</param>
					</when>
					<when value="default"/>
				</conditional>
			</when>
			<!-- SAM/BAM input: the alignment file, an optional discarded reads output and optional alignment filters. -->
			<when value="sam">
				<param format="sam,bam" name="input" type="data" label="Input File --input_file" help="Alignment file of type .sam or .bam"/>
				<conditional name="disc">
					<param name="discard" type="select" label="Print discarded reads to a separate file">
						<option value="" selected="true">OFF</option>
						<option value="discard">ON</option>
					</param>
					<when value="discard"/>
					<when value=""/>
				</conditional>
				<conditional name="addAlignOpt">
					<param name="alignoptions" type="select" label="Alignment Options">
						<option value="default" selected="true">Default</option>
						<option value="edit">Edit</option>
					</param>
					<when value="edit">
						<!-- For the select params below, each non-empty option value is the literal flag text placed on the command line. -->
						<param name="mutations" type="select" label="Option for selecting type of mutations to report --mutations" help="cross-linking sites are often highlighted by deletions and/or substitutions in the reads. You can use this option to select specific mutations that you want to have reported in the GTF output file.">
							<option value="" selected="true">Off</option>
							<option value="--mutations delsonly">deletions</option>
							<option value="--mutations subsonly">substitutions</option>
							<!-- Wording aligned with the same option in the novo branch. -->
							<option value="--mutations TC">T->C substitutions</option>
							<option value="--mutations nomuts">no mutations</option>
						</param>
						<param name="align_quality" type="integer" label="Align Quality --align_quality " value="0" size="5">
							<validator type="in_range" min="0" message="Please enter a value >= 0"/>
						</param>
						<param name="align_score" type="integer" label="Align Score --align_score " value="0" size="5">
							<validator type="in_range" min="0" message="Please enter a value >= 0"/>
						</param>
						<param name="max" type="integer" label="Mapped reads to read from input file --max" help="Set to 0 to align all reads." value="0" size="10">
							<validator type="in_range" min="0" max="100000000" message="Please enter a value between 1 and 100000000 or 0 to align all reads"/>
						</param>
						<param name="d" type="integer" label="Distance --distance " value="1000" size="6" help="Set the maximum number of bp allowed between two non-overlapping paired reads">
							<!-- The message now states the bound that min="1" actually enforces. -->
							<validator type="in_range" min="1" message="Please enter a value >= 1"/>
						</param>
						<param name="length" type="integer" label="Set the maximum length of reads --length" value="1000" size="7" help="Set the read length threshold between 15 and 1000">
							<validator type="in_range" min="15" max="1000" message="Please enter a value between 15 and 1000"/>
						</param>
						<param name="unique" type="select" label="Remove reads with multiple alignment locations --unique">
							<option value="" selected="true">OFF</option>
							<option value="--unique">ON</option>
						</param>
						<param name="blocks" type="select" label="Only count reads with same start and end coords once --blocks">
							<option value="" selected="true">OFF</option>
							<option value="--blocks">ON</option>
						</param>
					</when>
					<when value="default"/>
				</conditional>
			</when>
			<!-- GTF input: only the file itself is requested in this branch. -->
			<when value="gtf">
				<param name="input" type="data" format="gtf" label="Input File --input_file" help="File of type .gtf"/>
			</when>
		</conditional>
		<!-- Optional standard settings; the command template only passes them when "edit" is chosen. -->
		<conditional name="addOpt">
			<param name="options" type="select" label="Standard Options">
				<option value="default" selected="true">Default</option>
				<option value="edit">Edit</option>
			</param>
			<when value="edit">
				<param name="range" type="integer" label="Range --range" value="0" size="5" help="Manually set the length of the 5' and 3' UTRs (0 to 50000)">
					<validator type="in_range" min="0" max="50000" message="Please enter a value between 0 and 50000"/>
				</param>
				<!-- The non-empty option value is the literal flag text placed on the command line. -->
				<param name="ignore" type="select" label="Ignore strand information? --ignorestrand">
					<option value="" selected="true">No</option>
					<option value="--ignorestrand">Yes</option>
				</param>
				<param name="overlap" type="integer" label="Overlap --overlap" value="1" size="5" help="Sets the number of nucleotides a read has to overlap with a gene before it is considered a hit. ">
					<validator type="in_range" min="1" message="Please enter a positive integer"/>
				</param>
			</when>
			<when value="default"/>
		</conditional>
			<!-- Free-text prefix that the output datasets use in their labels. -->
			<param name="label" type="text" size="30" value="pyReadCounters" label="Enter output file label -o"/>
	</inputs>
	<!-- Output datasets; every label starts with the text entered in the "label" parameter. -->
	<outputs>
		<data name="stats" format="tabular" label="${label.value}_file_statistics.txt"/>
		<data name="hittable" format="tabular" label="${label.value}_hittable.txt"/>
		<data name="intronUTRoverlap" format="gtf" label="${label.value}_intron_and_UTR_overlap.txt"/>
		<!-- Kept only when the input is a novo or sam file. -->
		<data name="countoutput" format="gtf" label="${label.value}_count_output.gtf">
			<filter>ftype['file_type'] == "novo" or ftype['file_type'] == "sam"</filter>
		</data>
		<!-- Kept only when the input is a novo or sam file and discarded reads were requested. -->
		<data name="discarded" format="txt" label="${label.value}_discarded.txt">
			<filter>(ftype['file_type'] == "novo" or ftype['file_type'] == "sam") and ftype['disc']['discard'] == "discard"</filter>
		</data>
	</outputs>
	<help>

.. class:: infomark

**pyReadCounters**

pyReadCounters is part of the pyCRAC_ package. It produces a gene hittable file and two GTF output files showing which genomic features the reads overlap with.
Finally, the tool produces a read statistics file that provides information about the complexity of your dataset.

**Output file examples**

A hittable file::

    # generated by pyReadCounters version 1.1.0, Mon Apr 16 20:34:22 2012
    # /usr/local/bin/pyReadCounters.py -f RNAseq_data.novo -c 1 --unique
    # total number of reads 12534556
    # total number of paired reads  10947376
    # total number of single reads  483095
    # total number of mapped reads: 11430471
    # total number of overlapping genomic features  7019550
    #       sense   5960669
    #       anti-sense      1058881
    # feature       sense_overlap anti-sense_overlap  number of reads
    
    ## protein_coding       3190701
    YEF3        49930       3629        24221
    PMA1        32621       2650        21776
    COX1        24559       1037        15174
    TFP1        21539       1689        13506
    HSC82       21177       1458        12729
    ADH1        20245       1467        11351
    AI5_ALPHA   20022       918         13101
    AI4         19390       886         12638
    AI3         17823       798         11473
    AI2         17590       790         11297
    RPL10       16822       1113        8797
    ENO2        16336       1125        8913
    TEF1        15578       1333        5450

An example of a GTF 'count_output' file::

    ##gff-version 2
    # generated by Counters version 1.2.0, Tue Jan  8 22:47:29 2013
    # pyReadCounters.py -f PAR_CLIP_unique.novo --mutations=TC -v
    # total number of reads:	2455251
    # total number of paired reads:	0
    # total number of single reads:	2455251
    # total number of mapped reads:	2455251
    # total number of overlapping genomic features:	5153943
    #	sense:	2640600
    #	anti-sense:	2513343
    chrXIV	reads	exon	661572	661605	2	+	.   gene_id "INT_0_6716,YNR016C"; gene_name "INT_0_6716,ACC1"; # 661596S;
    chrXIV	reads	exon	661720	661738	1	+	.   gene_id "INT_0_6716,YNR016C"; gene_name "INT_0_6716,ACC1"; # 661726S;
    chrXIV	reads	exon	661839	661878	4	+	.   gene_id "INT_0_6716,YNR016C"; gene_name "INT_0_6716,ACC1"; # 661875S;
    
This output file also reports whether a read contains a mutation. 
    
For example::
    
    # 661596S
    
Indicates that the read had a nucleotide substitution ("S") at genomic coordinate 661596. The chromosome name can be found in the first column. 

.. _pyCRAC: http://sandergranneman.bio.ed.ac.uk/Granneman_Lab/pyCRAC_software.html
		
------

**Parameter list**

File input options::

	-f FILE, --input_file=FILE
						provide the path to your novo, SAM/BAM or gtf data
						file. Default is standard input. Make sure to specify
						the file type of the file you want to have analyzed
						using the --file_type option!
	-o OUTPUT_FILE, --output_file=OUTPUT_FILE
						Use this flag to override the standard file names. Do
						NOT add an extension.
	--file_type=FILE_TYPE
						use this option to specify the file type (i.e.
						'novo','sam' or 'gtf'). This will tell the program
						which parsers to use for processing the files. Default
						= 'novo'
	--gtf=annotation_file.gtf
						type the path to the gtf annotation file that you want
						to use

Common pyCRAC options::

	--ignorestrand
						ignore strand information; all reads overlapping
						with genomic features will be considered sense reads.
						Useful for analysing ChIP or RIP data
	--overlap=1
						sets the number of nucleotides a read has to overlap
						with a gene before it is considered a hit. Default =
						1 nucleotide
	-r 100, --range=100
						allows you to add regions flanking the genomic
						feature. If you set '-r 50' or '--range=50', then the
						program will add 50 nucleotides to each feature on
						each side regardless of whether the GTF file has genes
						with annotated UTRs

Options for SAM/BAM and Novo files::

	--mutations=delsonly
						Use this option to only track mutations that are of
						interest. For CRAC data this is usually deletions
						(--mutations=delsonly). For PAR-CLIP data this is
						usually T-C mutations (--mutations=TC). Other options
						are: do not report any mutations: --mutations=nomuts.
						Only report specific base mutations, for example only
						in T's, C's and G's :--mutations=[TCG]. The brackets
						are essential. Other nucleotide combinations are also
						possible
	--align_quality=100, --mapping_quality=100
						with these options you can set the alignment quality
						(Novoalign) or mapping quality (SAM) threshold. Reads
						with qualities lower than the threshold will be
						ignored. Default = 0
	--align_score=100
						with this option you can set the alignment score
						threshold. Reads with alignment scores lower than the
						threshold will be ignored. Default = 0
	--unique
						with this option reads with multiple alignment
						locations will be removed. Default = Off
	--blocks
						with this option reads with the same start and end
						coordinates on a chromosome will be counted as one
						cDNA. Default = Off
	-m 100000, --max=100000
						maximum number of mapped reads that will be analyzed.
						Default = All
	-d 1000, --distance=1000
						this option allows you to set the maximum number of
						base-pairs allowed between two non-overlapping paired
						reads. Default = 1000
	--discarded=FILE
						prints the lines from the alignments file that were
						discarded by the parsers. This file contains reads
						that were unmapped (NM), of poor quality (i.e. QC) or
						paired reads that were mapped to different chromosomal
						locations or were too far apart on the same
						chromosome. Useful for debugging purposes
	-l 100, --length=100
						to set read length threshold. Default = 1000

	</help>
</tool>