Mercurial > repos > swebb > pycrac
view pyCRAC/pyReadCounters.xml @ 0:19b20927172d draft
Uploaded
author | swebb |
---|---|
date | Tue, 18 Jun 2013 09:11:00 -0400 |
parents | |
children |
line wrap: on
line source
<tool id="pyReadCounters" name="pyReadCounters" force_history_refresh="True">
  <!--
    Galaxy tool wrapper for pyReadCounters (pyCRAC package).
    The command template below assembles a call to the pyReadCounters.pl
    wrapper script from the form inputs; alignment specific options are only
    emitted for the "novo" and "sam" input types.
  -->
  <requirements>
    <requirement type="package">pyCRAC</requirement>
  </requirements>

  <command interpreter="perl"><![CDATA[
    pyReadCounters.pl
      -f $ftype.input
      --file_type $ftype.file_type
      --gtf $addGTF.gtf

    ## The "disc" and "addAlignOpt" conditionals only exist for novo/sam input,
    ## so the file type is tested first and "and" short-circuits for gtf input.
    #if ($ftype.file_type == "novo" or $ftype.file_type == "sam") and $ftype.disc.discard == "discard":
      --discarded $discarded
    #end if#

    #if ($ftype.file_type == "novo" or $ftype.file_type == "sam") and $ftype.addAlignOpt.alignoptions == "edit":
      --alignOpt
      --align_quality $ftype.addAlignOpt.align_quality
      --align_score $ftype.addAlignOpt.align_score
      ## A value of 0 means "all reads", in which case --max is not passed at all.
      #if int($ftype.addAlignOpt.max) > 0:
        --max $ftype.addAlignOpt.max
      #end if#
      --distance $ftype.addAlignOpt.d
      --length $ftype.addAlignOpt.length
      ## The next three selects carry the complete flag text (or an empty string) as their value.
      $ftype.addAlignOpt.unique
      $ftype.addAlignOpt.blocks
      $ftype.addAlignOpt.mutations
    #end if#

    #if $addOpt.options == "edit":
      --options
      --range $addOpt.range
      $addOpt.ignore
      --overlap $addOpt.overlap
    #end if#

    --stats $stats
    --hittable $hittable
    --intronUTRoverlap $intronUTRoverlap

    ## The count output dataset is only declared for novo/sam input (see the output filter).
    #if $ftype.file_type == "novo" or $ftype.file_type == "sam":
      --countoutput $countoutput
    #end if#

    --id $stats.id
  ]]></command>

  <!-- NOTE(review): hard-coded absolute path; confirm it matches where the pyCRAC requirement installs the script. -->
  <version_command>/usr/local/bin/pyReadCounters.py --version</version_command>

  <inputs>
    <!-- Annotation file: either an entry of the pycrac_gtf data table or a GTF dataset from the history. -->
    <conditional name="addGTF">
      <param name="gtfFile" type="select" label="Choose GTF File from">
        <option value="default" selected="true">Defaults</option>
        <option value="other">History</option>
      </param>
      <when value="default">
        <param name="gtf" type="select" label="GTF File --gtf" help="GTF file containing gene ID co-ordinates">
          <options from_data_table="pycrac_gtf"/>
        </param>
      </when>
      <when value="other">
        <param format="gtf" name="gtf" type="data" label="GTF File --gtf" help="GTF file containing gene ID co-ordinates"/>
      </when>
    </conditional>

    <!-- Input file type. The novo and sam branches expose the same discard and alignment options; gtf has none. -->
    <conditional name="ftype">
      <param name="file_type" type="select" label="Input File Type --file_type" help="Use .novo, .sam/.bam or .gtf input files">
        <option value="novo" selected="true">Novo</option>
        <option value="sam">Sam/Bam</option>
        <option value="gtf">GTF</option>
      </param>

      <when value="novo">
        <param format="tabular" name="input" type="data" label="Input File --input_file" help="Alignment file of type .novo"/>
        <conditional name="disc">
          <param name="discard" type="select" label="Print discarded reads to a separate file">
            <option value="" selected="true">OFF</option>
            <option value="discard">ON</option>
          </param>
          <when value="discard"/>
          <when value=""/>
        </conditional>
        <conditional name="addAlignOpt">
          <param name="alignoptions" type="select" label="Alignment Options">
            <option value="default" selected="true">Default</option>
            <option value="edit">Edit</option>
          </param>
          <when value="edit">
            <param name="mutations" type="select" label="Option for selecting type of mutations to report --mutations" help="cross-linking sites are often highlighted by deletions and/or substitutions in the reads. You can use this option to select specific mutations that you want to have reported in the GTF output file.">
              <option value="" selected="true">Off</option>
              <option value="--mutations delsonly">deletions</option>
              <option value="--mutations subsonly">substitutions</option>
              <option value="--mutations TC">T->C substitutions</option>
              <option value="--mutations nomuts">no mutations</option>
            </param>
            <param format="integer" name="align_quality" type="integer" label="Align Quality --align_quality " value="0" size="5">
              <validator type="in_range" min="0" message="Please enter a value >= 0"/>
            </param>
            <param format="integer" name="align_score" type="integer" label="Align Score --align_score " value="0" size="5">
              <validator type="in_range" min="0" message="Please enter a value >= 0"/>
            </param>
            <param format="integer" name="max" type="integer" label="Mapped reads to read from input file --max" help="Set to 0 to align all reads." value="0" size="10">
              <validator type="in_range" min="0" max="100000000" message="Please enter a value between 1 and 100000000 or 0 to align all reads"/>
            </param>
            <param format="integer" name="d" type="integer" label="Distance --distance " value="1000" size="6" help="Set the maximum number of bp allowed between two non-overlapping paired reads">
              <validator type="in_range" min="1" message="Please enter a value >= 1"/>
            </param>
            <param format="integer" name="length" type="integer" label="Set the maximum length of reads --length" value="1000" size="7" help="Set the read length threshold between 15 and 1000">
              <validator type="in_range" min="15" max="1000" message="Please enter a value between 15 and 1000"/>
            </param>
            <param name="unique" type="select" label="Remove reads with multiple alignment locations --unique">
              <option value="" selected="true">OFF</option>
              <option value="--unique">ON</option>
            </param>
            <param name="blocks" type="select" label="Only count reads with same start and end coords once --blocks">
              <option value="" selected="true">OFF</option>
              <option value="--blocks">ON</option>
            </param>
          </when>
          <when value="default"/>
        </conditional>
      </when>

      <when value="sam">
        <param format="sam,bam" name="input" type="data" label="Input File --input_file" help="Alignment file of type .sam or .bam"/>
        <conditional name="disc">
          <param name="discard" type="select" label="Print discarded reads to a separate file">
            <option value="" selected="true">OFF</option>
            <option value="discard">ON</option>
          </param>
          <when value="discard"/>
          <when value=""/>
        </conditional>
        <conditional name="addAlignOpt">
          <param name="alignoptions" type="select" label="Alignment Options">
            <option value="default" selected="true">Default</option>
            <option value="edit">Edit</option>
          </param>
          <when value="edit">
            <param name="mutations" type="select" label="Option for selecting type of mutations to report --mutations" help="cross-linking sites are often highlighted by deletions and/or substitutions in the reads. You can use this option to select specific mutations that you want to have reported in the GTF output file.">
              <option value="" selected="true">Off</option>
              <option value="--mutations delsonly">deletions</option>
              <option value="--mutations subsonly">substitutions</option>
              <option value="--mutations TC">T->C substitutions</option>
              <option value="--mutations nomuts">no mutations</option>
            </param>
            <param format="integer" name="align_quality" type="integer" label="Align Quality --align_quality " value="0" size="5">
              <validator type="in_range" min="0" message="Please enter a value >= 0"/>
            </param>
            <param format="integer" name="align_score" type="integer" label="Align Score --align_score " value="0" size="5">
              <validator type="in_range" min="0" message="Please enter a value >= 0"/>
            </param>
            <param format="integer" name="max" type="integer" label="Mapped reads to read from input file --max" help="Set to 0 to align all reads." value="0" size="10">
              <validator type="in_range" min="0" max="100000000" message="Please enter a value between 1 and 100000000 or 0 to align all reads"/>
            </param>
            <param format="integer" name="d" type="integer" label="Distance --distance " value="1000" size="6" help="Set the maximum number of bp allowed between two non-overlapping paired reads">
              <validator type="in_range" min="1" message="Please enter a value >= 1"/>
            </param>
            <param format="integer" name="length" type="integer" label="Set the maximum length of reads --length" value="1000" size="7" help="Set the read length threshold between 15 and 1000">
              <validator type="in_range" min="15" max="1000" message="Please enter a value between 15 and 1000"/>
            </param>
            <param name="unique" type="select" label="Remove reads with multiple alignment locations --unique">
              <option value="" selected="true">OFF</option>
              <option value="--unique">ON</option>
            </param>
            <param name="blocks" type="select" label="Only count reads with same start and end coords once --blocks">
              <option value="" selected="true">OFF</option>
              <option value="--blocks">ON</option>
            </param>
          </when>
          <when value="default"/>
        </conditional>
      </when>

      <when value="gtf">
        <param format="gtf" name="input" type="data" label="Input File --input_file" help="File of type .gtf"/>
      </when>
    </conditional>

    <!-- Options that apply to every input type. -->
    <conditional name="addOpt">
      <param name="options" type="select" label="Standard Options">
        <option value="default" selected="true">Default</option>
        <option value="edit">Edit</option>
      </param>
      <when value="edit">
        <param format="integer" name="range" type="integer" label="Range --range" value="0" size="5" help="Manually set the length of the 5' and 3' UTRs (0 to 50000)">
          <validator type="in_range" min="0" max="50000" message="Please enter a value between 0 and 50000"/>
        </param>
        <param name="ignore" type="select" label="Ignore strand information? --ignorestrand">
          <option value="" selected="true">No</option>
          <option value="--ignorestrand">Yes</option>
        </param>
        <param format="integer" name="overlap" type="integer" label="Overlap --overlap" value="1" size="5" help="Sets the number of nucleotides a read has to overlap with a gene before it is considered a hit. ">
          <validator type="in_range" min="1" message="Please enter a positive integer"/>
        </param>
      </when>
      <when value="default"/>
    </conditional>

    <!-- Only used to build the output dataset labels below; it is not referenced in the command template. -->
    <param name="label" type="text" format="txt" size="30" value="pyReadCounters" label="Enter output file label -o"/>
  </inputs>

  <outputs>
    <data format="tabular" name="stats" label="${label.value}_file_statistics.txt"/>
    <data format="tabular" name="hittable" label="${label.value}_hittable.txt"/>
    <data format="gtf" name="intronUTRoverlap" label="${label.value}_intron_and_UTR_overlap.txt"/>
    <!-- Created only for alignment (novo/sam) input. -->
    <data format="gtf" name="countoutput" label="${label.value}_count_output.gtf">
      <filter>ftype['file_type'] == "novo" or ftype['file_type'] == "sam"</filter>
    </data>
    <!-- Created only for alignment (novo/sam) input when discarded reads were requested. -->
    <data format="txt" name="discarded" label="${label.value}_discarded.txt">
      <filter>(ftype['file_type'] == "novo" or ftype['file_type'] == "sam") and ftype['disc']['discard'] == "discard"</filter>
    </data>
  </outputs>

  <help>
.. class:: infomark

**pyReadCounters**

pyReadCounters is part of the pyCRAC_ package. Produces a gene hittable file, two GTF output files showing to which genomic features the reads overlap. Finally the tool produces a read statistics file that provides information about the complexity of your dataset.

**Output file examples**

A hittable file::

    # generated by pyReadCounters version 1.1.0, Mon Apr 16 20:34:22 2012
    # /usr/local/bin/pyReadCounters.py -f RNAseq_data.novo -c 1 --unique
    # total number of reads    12534556
    # total number of paired reads    10947376
    # total number of single reads    483095
    # total number of mapped reads:    11430471
    # total number of overlapping genomic features    7019550
    #    sense    5960669
    #    anti-sense    1058881
    # feature    sense_overlap    anti-sense_overlap    number of reads
    ## protein_coding    3190701
    YEF3         49930    3629    24221
    PMA1         32621    2650    21776
    COX1         24559    1037    15174
    TFP1         21539    1689    13506
    HSC82        21177    1458    12729
    ADH1         20245    1467    11351
    AI5_ALPHA    20022    918     13101
    AI4          19390    886     12638
    AI3          17823    798     11473
    AI2          17590    790     11297
    RPL10        16822    1113    8797
    ENO2         16336    1125    8913
    TEF1         15578    1333    5450

An example of a GTF 'count_output' file::

    ##gff-version 2
    # generated by Counters version 1.2.0, Tue Jan 8 22:47:29 2013
    # pyReadCounters.py -f PAR_CLIP_unique.novo --mutations=TC -v
    # total number of reads:    2455251
    # total number of paired reads:    0
    # total number of single reads:    2455251
    # total number of mapped reads:    2455251
    # total number of overlapping genomic features:    5153943
    #    sense:    2640600
    #    anti-sense:    2513343
    chrXIV    reads    exon    661572    661605    2    +    .    gene_id "INT_0_6716,YNR016C"; gene_name "INT_0_6716,ACC1"; # 661596S;
    chrXIV    reads    exon    661720    661738    1    +    .    gene_id "INT_0_6716,YNR016C"; gene_name "INT_0_6716,ACC1"; # 661726S;
    chrXIV    reads    exon    661839    661878    4    +    .    gene_id "INT_0_6716,YNR016C"; gene_name "INT_0_6716,ACC1"; # 661875S;

This output file also reports whether a read contains a mutation. For example::

    # 661596S

Indicates that the read had a nucleotide substitution ("S") at genomic coordinate 661596. The chromosome name can be found in the first column.

.. _pyCRAC: http://sandergranneman.bio.ed.ac.uk/Granneman_Lab/pyCRAC_software.html

------

**Parameter list**

File input options::

    -f FILE, --input_file=FILE
                        provide the path to your novo, SAM/BAM or gtf data
                        file. Default is standard input. Make sure to specify
                        the file type of the file you want to have analyzed
                        using the --file_type option!
    -o OUTPUT_FILE, --output_file=OUTPUT_FILE
                        Use this flag to override the standard file names. Do
                        NOT add an extension.
    --file_type=FILE_TYPE
                        use this option to specify the file type (i.e.
                        'novo','sam' or 'gtf'). This will tell the program
                        which parsers to use for processing the files. Default
                        = 'novo'
    --gtf=annotation_file.gtf
                        type the path to the gtf annotation file that you want
                        to use

Common pyCRAC options::

    --ignorestrand      To ignore strand information and all reads overlapping
                        with genomic features will be considered sense reads.
                        Useful for analysing ChIP or RIP data
    --overlap=1         sets the number of nucleotides a read has to overlap
                        with a gene before it is considered a hit. Default = 1
                        nucleotide
    -r 100, --range=100
                        allows you to add regions flanking the genomic
                        feature. If you set '-r 50' or '--range=50', then the
                        program will add 50 nucleotides to each feature on
                        each side regardless of whether the GTF file has genes
                        with annotated UTRs

Options for SAM/BAM and Novo files::

    --mutations=delsonly
                        Use this option to only track mutations that are of
                        interest. For CRAC data this is usually deletions
                        (--mutations=delsonly). For PAR-CLIP data this is
                        usually T-C mutations (--mutations=TC). Other options
                        are: do not report any mutations: --mutations=nomuts.
                        Only report specific base mutations, for example only
                        in T's, C's and G's: --mutations=[TCG]. The brackets
                        are essential. Other nucleotide combinations are also
                        possible
    --align_quality=100, --mapping_quality=100
                        with these options you can set the alignment quality
                        (Novoalign) or mapping quality (SAM) threshold. Reads
                        with qualities lower than the threshold will be
                        ignored. Default = 0
    --align_score=100   with this option you can set the alignment score
                        threshold. Reads with alignment scores lower than the
                        threshold will be ignored. Default = 0
    --unique            with this option reads with multiple alignment
                        locations will be removed. Default = Off
    --blocks            with this option reads with the same start and end
                        coordinates on a chromosome will be counted as one
                        cDNA. Default = Off
    -m 100000, --max=100000
                        maximum number of mapped reads that will be analyzed.
                        Default = All
    -d 1000, --distance=1000
                        this option allows you to set the maximum number of
                        base-pairs allowed between two non-overlapping paired
                        reads. Default = 1000
    --discarded=FILE    prints the lines from the alignments file that were
                        discarded by the parsers. This file contains reads
                        that were unmapped (NM), of poor quality (i.e. QC) or
                        paired reads that were mapped to different chromosomal
                        locations or were too far apart on the same
                        chromosome. Useful for debugging purposes
    -l 100, --length=100
                        to set read length threshold. Default = 1000
  </help>
</tool>