Mercurial > repos > swebb > pycrac
changeset 0:19b20927172d draft
Uploaded
line wrap: on
line diff
<!-- File: pyCRAC/pyAlignment2Tab.xml -->
<!--
  Galaxy tool wrapper for pyAlignment2Tab.py.
  Runs the script on a fasta dataset ($input), wrapping the alignment at
  $limit columns, and writes a single output file ($output).
  The "label" parameter is only used to build the history label of the
  output dataset; it is not passed to the script.
-->
<tool id="pyAlignment2Tab" name="pyAlignment2Tab">
  <description>converter</description>
  <requirements>
    <requirement type="package">pyCRAC</requirement>
  </requirements>
  <command interpreter="python">/usr/local/bin/pyAlignment2Tab.py -f $input --limit $limit -o $output --singlefile
  </command>
  <version_command>/usr/local/bin/pyAlignment2Tab.py --version</version_command>
  <inputs>
    <param name="input" type="data" format="fasta" label="pyReadAligner output file -f" help="Fasta file"/>
    <!-- The validator accepts 50 itself (min is inclusive), so the help and
         the error message say "at least 50" rather than "greater than 50". -->
    <param name="limit" type="integer" format="integer" value="90" size="4" label="Set the column width of alignment" help="Enter a value of at least 50">
      <validator type="in_range" min="50" message="Please enter a value of at least 50"/>
    </param>
    <param name="label" type="text" format="txt" size="30" value="pyAlignment2Tab" label="Enter output file label -o" />
  </inputs>
  <outputs>
    <data name="output" format="txt" label="${label.value}.tab"/>
  </outputs>
  <help>

.. class:: infomark

**pyAlignment2Tab**

pyAlignment2Tab is part of the pyCRAC_ package. Converts pyReadAligner fasta output to a tabular alignment output.
+ +Example:: + + The tool expects a standard pyReadAligner fasta-formatted output file: + + >GeneX + ATGTCTCGTACTAACATGGATACAAGACACGCACATTCTGCTTTACTGGCAGCACCACAGAGTGCTACTGCAAATAGCAGGAGCAGCAACAGCAGCAGCGAGAGTAGTAGTAACAAAAACAATATCAATGTCGGCGTCGGTGACGATAGCGGTAA + >257930-10 + ---TCTCGTACcAACATGGATACAAGACACGCACATTCTGCTT---------------------------------------------------------------------------------------------------------------- + >3664964-1 + ---TCTCGcACcAACATGGATACAAGACACGCACATTtTGCTT---------------------------------------------------------------------------------------------------------------- + >4033560-1 + ---TCTCGTACcAACATGGATACAAGACACGCACATTCTGtTT---------------------------------------------------------------------------------------------------------------- + >8571880-1 + ---TCTCGTACcAACATGGATACAAGACACGCAgATTCTGCTT---------------------------------------------------------------------------------------------------------------- + >9617396-1 + ---TCTCGTACcAACATGGATACAAGACACGCcCATTCTGCTT---------------------------------------------------------------------------------------------------------------- + >843368-5 + ------------AACAcGGATACAAGACACGCACATTCTG------------------------------------------------------------------------------------------------------------------- + >854553-5 + ------------AACATGGATACAAGACACGCAC--TCTG------------------------------------------------------------------------------------------------------------------- + >1522401-2 + --------------CATGGATACAAGACACGCACATTCTGCTTTACTGGCAGgA----------------------------------------------------------------------------------------------------- + >5981234-1 + --------------CATGGATACAAGACACGCACAcTCTGCTTTACTGGCAGCA----------------------------------------------------------------------------------------------------- + >997684-4 + --------------CATGGATACAAGACACGCACATTCTGCTTTACTGGCAGCA----------------------------------------------------------------------------------------------------- + >1046653-4 + 
---------------ATGGATACAAGACACGCACATTCTGCTTTACTGGCAGgAC---------------------------------------------------------------------------------------------------- + >1103730-4 + ---------------ATGGATACAAGACACGCACAcTCTG------------------------------------------------------------------------------------------------------------------- + >1603913-2 + ---------------ATGGATACAAGAaACGCACAcTCTG------------------------------------------------------------------------------------------------------------------- + >180349-12 + ---------------ATGGATACAAGACACGCACATTCcGCTTTACTG----------------------------------------------------------------------------------------------------------- + >1985106-1 + ---------------ATGGATACAAGACACGCACATTCgGCTTTACTGGCAGCcC---------------------------------------------------------------------------------------------------- + >1987775-1 + ---------------ATGGATACccGACACGCACATTCTGCTTTACTGcCAGCAC---------------------------------------------------------------------------------------------------- + >2258725-1 + ---------------ATGGATACAAGACACGCACATTCTGCTTTgCTGGCAGCAC---------------------------------------------------------------------------------------------------- + >2631987-1 + ---------------ATGGATACAAGACACGCACATTCTGCTTTACcGGCAGgAC---------------------------------------------------------------------------------------------------- + + This will be converted into: + + 1 .........|.........|.........|.........|.........|.........|.........|.........|.........| 90 + >GeneX ATGTCTCGTACTAACATGGATACAAGACACGCACATTCTGCTTTACTGGCAGCACCACAGAGTGCTACTGCAAATAGCAGGAGCAGCAAC + >257930-10 ---TCTCGTACcAACATGGATACAAGACACGCACATTCTGCTT----------------------------------------------- + >3664964-1 ---TCTCGcACcAACATGGATACAAGACACGCACATTtTGCTT----------------------------------------------- + >4033560-1 ---TCTCGTACcAACATGGATACAAGACACGCACATTCTGtTT----------------------------------------------- + >8571880-1 
---TCTCGTACcAACATGGATACAAGACACGCAgATTCTGCTT----------------------------------------------- + >9617396-1 ---TCTCGTACcAACATGGATACAAGACACGCcCATTCTGCTT----------------------------------------------- + >843368-5 ------------AACAcGGATACAAGACACGCACATTCTG-------------------------------------------------- + >854553-5 ------------AACATGGATACAAGACACGCAC--TCTG-------------------------------------------------- + >1522401-2 --------------CATGGATACAAGACACGCACATTCTGCTTTACTGGCAGgA------------------------------------ + >5981234-1 --------------CATGGATACAAGACACGCACAcTCTGCTTTACTGGCAGCA------------------------------------ + >997684-4 --------------CATGGATACAAGACACGCACATTCTGCTTTACTGGCAGCA------------------------------------ + >1046653-4 ---------------ATGGATACAAGACACGCACATTCTGCTTTACTGGCAGgAC----------------------------------- + >1103730-4 ---------------ATGGATACAAGACACGCACAcTCTG-------------------------------------------------- + >1603913-2 ---------------ATGGATACAAGAaACGCACAcTCTG-------------------------------------------------- + >180349-12 ---------------ATGGATACAAGACACGCACATTCcGCTTTACTG------------------------------------------ + >1985106-1 ---------------ATGGATACAAGACACGCACATTCgGCTTTACTGGCAGCcC----------------------------------- + >1987775-1 ---------------ATGGATACccGACACGCACATTCTGCTTTACTGcCAGCAC----------------------------------- + >2258725-1 ---------------ATGGATACAAGACACGCACATTCTGCTTTgCTGGCAGCAC----------------------------------- + >2631987-1 ---------------ATGGATACAAGACACGCACATTCTGCTTTACcGGCAGgAC----------------------------------- + >337206-9 ---------------ATGGATACAAGACACGCACATTCTGCTTTACTGGCAGCAC----------------------------------- + >4616761-1 ---------------ATGGATAgAAGACACGCACATTCTGCTTTACTGGtAGCAC----------------------------------- + >4756312-1 ---------------ATGGATACAAcACACGCACAcTCTG-------------------------------------------------- + >4763682-1 ---------------ATGGATACAAGACACGCACATTCcGCTTTcCTG------------------------------------------ + >5971268-1 
---------------ATGGATACAAGACACGCACATTCcGCTcTACTc------------------------------------------ + >6644790-1 ---------------ATGGATACAAGACACGCACATTCTGCTTTACTcGCAGCAC----------------------------------- + >7112423-1 ---------------ATGGATACAAGACACGCACATTCTGCTTTACTGtCAGCAC----------------------------------- + >7559990-1 ---------------ATGGATACAAGACACGCACATTCTGCTTTACTGGCcGgAC----------------------------------- + >8007281-1 ---------------ATGGATAtAAGACACGCACAcTCTG-------------------------------------------------- + >9150255-1 ---------------ATGGATACAcGACACGCACATTCcGCTTTcCTG------------------------------------------ + >9180814-1 ---------------ATGGATACAAGACACGCACATTCTGCTTTACTGGCAGgcC----------------------------------- + >963117-4 ---------------ATGGATACAAGACACGCACATTCTGCTTTACcGGCAGCAC----------------------------------- + >9672073-1 ---------------ATGGATACAAGACACGCACATTCTGCTTTACTGGCAGCcC----------------------------------- + >971218-4 ---------------ATGGATACAAGACACGCACATcCTGCTTTACTGG-AGCACC---------------------------------- + >10040274-1 -------------------ATACAAGACACGCACATTCTGCTTTACTGGCAGgACCACA------------------------------- + >1063072-4 -------------------ATACAAGACACGCACATTCTGCTTcACTGGCAGCACCACA------------------------------- + >1430188-2 -------------------ATACAAGACACGCACATTCTGCTTTACTGGCAGCACCACA------------------------------- + >5196741-1 -------------------ATACAAGACACGCACATTCTGCTTcACTGGCcGCACCACA------------------------------- + >6017337-1 -------------------ATACAAGACACGCACATTCTGCTTcACTGtCAGaACCcCA------------------------------- + >7159053-1 -------------------ATACAAGACACGCACATTCTGCTTTACTGGCAGCACCcaA------------------------------- + >7528336-1 -------------------ATACAAGACACGCACATTCTGCTTcACTGGCAGCAaCACA------------------------------- + >735584-6 --------------------------------------------------------ACAGAGTGCTACTGCAAAcAGCAGGAGCAGCAAC + >8551047-1 --------------------------------------------------------ACAGAGTGCTAtTGCAAAcAGCAGGAGtAGtAAC + >3000121-1 
------------------------------------------------------------AGTcCTACcGCAAATAGCAGcAGCAGCAAC + >928481-5 ------------------------------------------------------------AGTGCTACcGCAAATAGCAGGAGCAGCAAC + >126987-15 ----------------------------------------------------------------------CAAATAGCAGGAGCAGCAAC + >3122797-1 ----------------------------------------------------------------------CAAATAGCAGGcGCAGCAAC + >6684686-1 ----------------------------------------------------------------------CAAATAGCAGGAGCAGCAAC + + Note that the column width here was set to 90 characters + +.. _pyCRAC: http://sandergranneman.bio.ed.ac.uk/Granneman_Lab/pyCRAC_software.html + +------ + +**Parameter list** + +Options:: + + -f data.fasta + Type the path to the fasta file that you want to use. + --limit=90 + Allows the user to set the column width of the alignment. Default=90 characters + -o output.fasta + Provide the name of your output file + + </help> +</tool>
#!/usr/bin/perl -w
# File: pyCRAC/pyBarcodeFilter.pl
#
# Galaxy wrapper around pyBarcodeFilter.py.
#
# pyBarcodeFilter.py is run in the current working directory. This wrapper
# then moves the files it is expected to leave there:
#   * barcode_statistics.txt                       -> --out
#   * <input stem>_<barcode>_<sample>.<file_type>  -> <output_path>/primary_<id>_<sample>-<N>_visible_<file_type>
#     where N is 1 for the -f file and 2 for the -r file.
#
# Options:
#   -f FILE            forward (or only) read file                (required)
#   -r FILE            reverse / paired read file                 (optional)
#   -b FILE            tab-delimited barcode file: barcode<TAB>sample (required)
#   -m INT             allowed barcode mismatches (default 0)
#   --file_type TYPE   passed through; lower-cased for output file names (required)
#   --both             passed through, only together with -r
#   -i                 passed through
#   --out FILE         destination of barcode_statistics.txt      (required)
#   --output_path DIR  directory receiving the per-barcode files  (required)
#   --id ID            dataset id embedded in the per-barcode names (required)
#   --version          print the version of pyBarcodeFilter.py and exit
use strict;
use Getopt::Long;
use File::Basename qw(basename);
use File::Copy qw(move);

my $SCRIPT = "/usr/local/bin/pyBarcodeFilter.py";

# -m is always interpolated into the command line, so give it a value even
# when the caller leaves it out.
my %opt = (m => 0);

GetOptions(\%opt, "f=s", "b=s", "out=s", "output_path=s", "id=s", "m=i",
           "file_type=s", "both", "r=s", "version", "i")
    or die "Invalid command line options\n";

if (exists $opt{version}) {
    # A version query has no input files: run it and stop here instead of
    # falling through to the barcode parsing and file moves below.
    exit(system("python", $SCRIPT, "--version") == 0 ? 0 : 1);
}

for my $required (qw(f b file_type out output_path id)) {
    die "Missing required option: $required\n" unless defined $opt{$required};
}

# List-form system() bypasses the shell, so paths containing spaces or shell
# metacharacters are passed through verbatim.
my @cmnd = ("python", $SCRIPT,
            "-f", $opt{f},
            "-b", $opt{b},
            "-m", $opt{m},
            "--file_type", $opt{file_type});

if (defined $opt{r}) {
    push @cmnd, "-r", $opt{r};
    # --both is only forwarded when a reverse file is present.
    push @cmnd, "--both" if exists $opt{both};
}
push @cmnd, "-i" if exists $opt{i};

system(@cmnd) == 0
    or die "pyBarcodeFilter.py failed (exit status " . ($? >> 8) . ")\n";

# Read barcode -> sample name from the first two tab-separated columns.
open(my $bc_fh, "<", $opt{b}) or die "Cannot open barcode file $opt{b}: $!\n";
my %bc;
while (my $line = <$bc_fh>) {
    chomp $line;
    my ($barcode, $sample) = (split /\t/, $line)[0, 1];
    # Skip blank or single-column lines rather than storing undefined entries.
    next unless defined $barcode && length $barcode && defined $sample;
    $bc{$barcode} = $sample;
}
close $bc_fh;

move("barcode_statistics.txt", $opt{out})
    or warn "Cannot move barcode_statistics.txt to $opt{out}: $!\n";

my $ft = lc $opt{file_type};

# Each input read file paired with the mate number used in its output name.
my @reads = ([$opt{f}, 1]);
push @reads, [$opt{r}, 2] if defined $opt{r};

foreach my $barcode (keys %bc) {
    my $sample = $bc{$barcode};
    foreach my $read (@reads) {
        my ($path, $mate) = @$read;
        # Stem = file name up to (not including) its first dot.
        my ($stem) = split /\./, basename($path);
        my $produced = "${stem}_${barcode}_${sample}.$ft";
        my $target   = "$opt{output_path}/primary_$opt{id}_${sample}-${mate}_visible_$ft";
        # A missing file is reported but does not abort the remaining moves.
        move($produced, $target)
            or warn "Cannot move $produced to $target: $!\n";
    }
}
<!-- File: pyCRAC/pyBarcodeFilter.xml -->
<!--
  Galaxy tool wrapper for pyBarcodeFilter (run through pyBarcodeFilter.pl).
  The Perl wrapper receives the id of the "out" dataset and
  $__new_file_path__ so it can place the per-barcode files there under
  "primary_<id>_<sample>-<N>_visible_<type>" names; force_history_refresh
  is set on the tool so those extra datasets show up in the history.
-->
<tool id="pyBarcodeFilter" name="pyBarcodeFilter" force_history_refresh="True">
  <requirements>
    <requirement type="package">pyCRAC</requirement>
  </requirements>
  <command interpreter="perl">
    /usr/local/bin/pyBarcodeFilter.pl
    --file_type $ftype.type
    -f $ftype.f
    -b $barcode
    -m $mismatch
    $index
    --out $out
    --id $out.id
    --output_path $__new_file_path__
    #if $ftype.reverse.rev == "yes":
      -r $ftype.reverse.r
      $ftype.reverse.both
    #end if#
  </command>
  <version_command>/usr/local/bin/pyBarcodeFilter.py --version</version_command>
  <inputs>
    <conditional name="ftype">
      <param name="type" type="select" label="File type">
        <option value="fastq" selected="true">FASTQ</option>
        <option value="fasta">FASTA</option>
      </param>
      <when value="fastq">
        <param format="fastq" name="f" type="data" label="FastQ File -f" help="FastQ format" />
        <conditional name="reverse">
          <param name="rev" type="select" label="Add a reverse or paired FastQ file">
            <option value="no" selected="true">NO</option>
            <option value="yes">YES</option>
          </param>
          <when value="yes">
            <param format="fastq" name="r" type="data" label="Reverse FastQ File -r" help="FastQ format" />
            <param name="both" type="select" label="Search for barcode in both reads">
              <option value="" selected="true">NO</option>
              <option value="--both">YES</option>
            </param>
          </when>
          <when value="no">
          </when>
        </conditional>
      </when>
      <when value="fasta">
        <param format="fasta" name="f" type="data" label="FastA File -f" help="FastA format" />
        <conditional name="reverse">
          <param name="rev" type="select" label="Add a reverse or paired FastA file">
            <option value="no" selected="true">NO</option>
            <option value="yes">YES</option>
          </param>
          <when value="yes">
            <param format="fasta" name="r" type="data" label="Reverse FastA File -r" help="FastA format" />
            <param name="both" type="select" label="Search for barcode in both reads">
              <option value="" selected="true">NO</option>
              <option value="--both">YES</option>
            </param>
          </when>
          <when value="no">
          </when>
        </conditional>
      </when>
    </conditional>
    <param format="tabular" name="barcode" type="data" label="Barcode File -b" help="Tab delimited file with barcodes and barcode names" />
    <!-- NOTE(review): the help text below says at most one mismatch is
         allowed, but this validator accepts up to 100. Confirm against
         pyBarcodeFilter.py before tightening max. -->
    <param format="integer" name="mismatch" type="integer" label="Mismatches -m" value="0" size="3" help="Set the number of allowed mismatches in a barcode">
      <validator type="in_range" min="0" max="100" message="Please enter a value between 0 and 100"/>
    </param>
    <param name="index" type="select" label="Split data using Illumina indexing barcode information -i">
      <option value="" selected="true">NO</option>
      <option value="-i">YES</option>
    </param>
  </inputs>
  <outputs>
    <!-- Receives barcode_statistics.txt; "txt" is the Galaxy plain-text datatype. -->
    <data format="txt" name="out" label="pyBarcodeFilter"/>
  </outputs>
  <help>

.. class:: infomark

**pyBarcodeFilter**

pyBarcodeFilter is part of the pyCRAC_ package. Filters sequence files by barcodes.

This tool requires FASTA or FASTQ input files containing the raw data and a text file containing barcode information.
To process paired end data, use the -f and the -r flags to indicate the path to the forward and reverse sequencing reactions, respectively.
The barcodes file should contain two columns separated by a tab (see the table below). The first column should contain the barcode nucleotide sequences.
The second column should contain an identifier, for example, the name of the barcode or the name of the experiment.
The 'N' in the barcode sequence indicates a random nucleotide. Make sure to use a simple text editor like TextEdit (MacOS X), gedit (Linux/Unix) or use a text editor in the terminal.
The program is case sensitive: all the nucleotide sequences should be upper case.
You can freely combine different barcodes.
**NOTE!** If you are mixing samples containing random nucleotide barcodes and normal barcodes, make sure to place the regular barcode sequences below the sequences with random nucleotides and make sure the shortest sequence is ALWAYS at the bottom of the column (see below).

Example of a barcode text file::

    NNNCGCTTAGC     mutant2
    NNNGCGCAGC      mutant1
    NNNATTAG        control
    NNNTAAGC        myfavprotein
    AGC             oldcontrol
    AC              veryfirstbarcodedsample

.. _pyCRAC: http://sandergranneman.bio.ed.ac.uk/Granneman_Lab/pyCRAC_software.html

------

**Parameter list**

Options::

    -f FILE, --input_file=FILE
                        name of the FASTQ or FASTA input file
    -r FILE, --reverse_input_file=FILE
                        name of the paired (or reverse) FASTQ or FASTA input file
    --file_type=FASTQ
                        type of file, uncompressed (fasta or fastq) or compressed (fasta.gz or fastq.gz, gzip/gunzip
                        compressed). Default is fastq
    -b FILE, --barcode_list=FILE
                        name of tab-delimited file containing barcodes and barcode names
    -m 1, --mismatches=1
                        to set the number of allowed mismatches in a barcode. A maximum of one mismatch is allowed. Default = 0
    -i, --index
                        use this option if you want to split the data using the Illumina indexing barcode information

  </help>
</tool>
#!/usr/bin/perl -w
# File: pyCRAC/pyBinCollector.pl
#
# Galaxy wrapper around pyBinCollector.py.
#
# pyBinCollector.py is run with "-o bc_<id>" as its output prefix. This
# wrapper then moves the files it is expected to leave in the working
# directory to the paths Galaxy allocated:
#   with --outputall:
#     sense_data_bc_<id>.txt       -> --sd
#     sense_subs_bc_<id>.txt       -> --ssub
#     sense_dels_bc_<id>.txt       -> --sdel
#     anti_sense_data_bc_<id>.txt  -> --asd
#     anti_sense_subs_bc_<id>.txt  -> --assub
#     anti_sense_dels_bc_<id>.txt  -> --asdel
#   otherwise:
#     bc_<id>_cumulative_densities_<annotation>_<s>_<numberofbins>_bins.pileup -> --out
#
# --range, --numberofbins, --min_length, --max_length, -s, --ignorestrand and
# --bins1/--bins2 are only forwarded when --options is given. A value passed
# with -o is accepted but not used: the prefix is always derived from --id.
use strict;
use Getopt::Long;
use File::Copy qw(move);

my $SCRIPT = "/usr/local/bin/pyBinCollector.py";

# These defaults are used to rebuild the pileup file name when --options is
# absent, so they must match what pyBinCollector.py uses on its own
# (its help text documents genomic / 20).
my %opt = (s => "genomic", numberofbins => 20);

GetOptions(\%opt, "f=s", "version", "gtf=s", "range=i", "annotation=s",
           "numberofbins=i", "min_length=i", "max_length=i", "s=s", "o=s",
           "ignorestrand", "outputall", "sd=s", "ssub=s", "sdel=s", "asd=s",
           "assub=s", "asdel=s", "out=s", "options", "bins1=i", "bins2=i",
           "id=s")
    or die "Invalid command line options\n";

if (exists $opt{version}) {
    # A version query produces no output files: run it and stop here instead
    # of falling through to the file moves below.
    exit(system("python", $SCRIPT, "--version") == 0 ? 0 : 1);
}

for my $required (qw(f gtf annotation id)) {
    die "Missing required option: $required\n" unless defined $opt{$required};
}

my $prefix = "bc_$opt{id}";

# List-form system() bypasses the shell, so a free-text annotation or a path
# with spaces cannot be reinterpreted as shell syntax.
my @cmnd = ("python", $SCRIPT,
            "-f", $opt{f},
            "--gtf", $opt{gtf},
            "--annotation", $opt{annotation},
            "-o", $prefix);

push @cmnd, "--outputall" if exists $opt{outputall};

if (exists $opt{options}) {
    push @cmnd, "--range=$opt{range}"             if defined $opt{range};
    push @cmnd, "--numberofbins", $opt{numberofbins};
    push @cmnd, "--min_length", $opt{min_length}  if defined $opt{min_length};
    push @cmnd, "--max_length", $opt{max_length}  if defined $opt{max_length};
    push @cmnd, "-s", $opt{s};
    push @cmnd, "--ignorestrand"                  if exists $opt{ignorestrand};

    if (defined $opt{bins1}) {
        # --binselect takes two values; refuse to emit a half-specified pair.
        die "--bins1 requires --bins2\n" unless defined $opt{bins2};
        push @cmnd, "--binselect", $opt{bins1}, $opt{bins2};
    }
}

system(@cmnd) == 0
    or die "pyBinCollector.py failed (exit status " . ($? >> 8) . ")\n";

if (exists $opt{outputall}) {
    # Produced file stem -> option holding its destination path.
    my @per_gene = (
        ["sense_data",      "sd"],
        ["sense_subs",      "ssub"],
        ["sense_dels",      "sdel"],
        ["anti_sense_data", "asd"],
        ["anti_sense_subs", "assub"],
        ["anti_sense_dels", "asdel"],
    );
    foreach my $pair (@per_gene) {
        my ($stem, $key) = @$pair;
        relocate("${stem}_$prefix.txt", $opt{$key});
    }
}
else {
    relocate("${prefix}_cumulative_densities_$opt{annotation}_$opt{s}_$opt{numberofbins}_bins.pileup",
             $opt{out});
}

# Move one produced file to its destination; problems are reported on stderr
# but do not stop the remaining moves.
sub relocate {
    my ($from, $to) = @_;
    unless (defined $to) {
        warn "No destination given for $from\n";
        return;
    }
    move($from, $to) or warn "Cannot move $from to $to: $!\n";
}
<!-- File: pyCRAC/pyBinCollector.xml -->
<!--
  Galaxy tool wrapper for pyBinCollector (run through pyBinCollector.pl).
  Without "Output all genes" a single dataset ("out") is produced; with it,
  six per-gene tables (sd, ssub, sdel, asd, assub, asdel) are produced
  instead. The id of the first output is passed as "id" (argument) so the
  Perl wrapper can build a unique working prefix.
-->
<tool id="pyBinCollector" name="pyBinCollector">
  <requirements>
    <requirement type="package">pyCRAC</requirement>
  </requirements>
  <command interpreter="perl">
    pyBinCollector.pl
    -f $input
    --gtf $addGTF.gtf
    #if $addGTF.gtfFile == "default" and $addGTF.annotate.annotations == "auto":
      --annotation $addGTF.annotate.scan.annotation
    #else:
      --annotation $addGTF.annotate.annotation
    #end if#
    #if $addOpt.options == "edit":
      --options
      --range $addOpt.range
      --min_length $addOpt.min_length
      --max_length $addOpt.max_length
      --numberofbins $addOpt.numberofbins
      -s $addOpt.sequence
      #if $addOpt.limitBins.binselect == "yes":
        --bins1 $addOpt.limitBins.bs_first
        --bins2 $addOpt.limitBins.bs_last
      #end if#
      $addOpt.ignore
      $addOpt.oall.outputall
    #end if#
    -o "$input.name"
    #if $addOpt.options == "edit" and $addOpt.oall.outputall == "--outputall":
      --id $sd.id
      --sd $sd
      --ssub $ssub
      --sdel $sdel
      --asd $asd
      --assub $assub
      --asdel $asdel
    #else:
      --out $out
      --id $out.id
    #end if#
  </command>
  <version_command>/usr/local/bin/pyBinCollector.py --version</version_command>
  <inputs>
    <param format="gtf" name="input" type="data" label="Input File -f" help="pyReadCounters or pyMotif gtf output files" />

    <conditional name="addGTF">
      <param name="gtfFile" type="select" label="Choose GTF File from">
        <option value="default" selected="true">Defaults</option>
        <option value="other">History</option>
      </param>
      <when value="default">
        <param name="gtf" type="select" label="GTF File --gtf" help="GTF file containing gene ID co-ordinates">
          <options from_data_table="pycrac_gtf"/>
        </param>

        <conditional name="annotate">
          <param name="annotations" type="select" label="Select annotation">
            <option value="all" selected="true">All</option>
            <option value="manual">Enter in text box</option>
            <option value="auto">Scan pyGetGTFSources file</option>
          </param>
          <when value="all">
            <param name="annotation" type="hidden" format="txt" size="10" value="all"/>
          </when>
          <when value="manual">
            <param name="annotation" type="text" format="txt" size="100" value="protein_coding" label="Select which annotation to focus search on --annotation" help="To find a list of available annotations please use pyGetGTFSources tool">
              <validator type="empty_field" message="Please enter a value"/>
            </param>
          </when>
          <when value="auto">
            <param format="tabular" name="gtf_annotation" type="data" label="GTF annotation File (pyGetGTFSources output)" help="Tabular file containing unique list of annotations/sources in selected GTF file. Refer to pyGetGTFSources"/>
            <conditional name="scan">
              <param name="annotations" type="select" label="Scan this file for annotations" help="Choose the correct GTF file then choose GO">
                <option value="wait" selected="true">Waiting</option>
                <option value="scanning">Go</option>
              </param>
              <when value="wait">
              </when>
              <when value="scanning">
                <param name="annotation" type="select" multiple="false" label="Select which annotation to focus search on --annotation">
                  <options from_dataset="gtf_annotation">
                    <column name="name" index="0"/>
                    <column name="value" index="0"/>
                  </options>
                </param>
              </when>
            </conditional>
          </when>
        </conditional>

      </when>
      <when value="other">
        <param format="gtf" name="gtf" type="data" label="GTF File --gtf" help="GTF file containing gene ID co-ordinates"/>
        <conditional name="annotate">
          <param name="annotations" type="select" label="Select annotation">
            <option value="all" selected="true">All</option>
            <option value="manual">Enter in text box</option>
            <option value="auto">Scan selected file</option>
          </param>
          <when value="all">
            <param name="annotation" type="hidden" format="txt" size="10" value="all"/>
          </when>
          <when value="manual">
            <param name="annotation" type="text" format="txt" size="100" value="protein_coding" label="Select which annotation to focus search on --annotation" help="To find a list of available annotations please use pyGetGTFSources tool">
              <validator type="empty_field" message="Please enter a value"/>
            </param>
          </when>
          <when value="auto">
            <param name="annotation" type="select" multiple="false" label="Select which annotation to focus search on --annotation">
              <options from_dataset="gtf">
                <column name="name" index="1"/>
                <column name="value" index="1"/>
                <filter type="unique_value" name="unique" column="1"/>
              </options>
            </param>
          </when>
        </conditional>
      </when>
    </conditional>


    <conditional name="addOpt">
      <param name="options" type="select" label="Options">
        <option value="default" selected="true">Default</option>
        <option value="edit">Edit</option>
      </param>
      <when value="edit">
        <param format="integer" name="range" type="integer" label="Range --range" value="0" size="5" help="Manually set the length of the 5' and 3' UTRs (0-50000)">
          <validator type="in_range" min="0" max="50000" message="Please enter a value between 0 and 50000"/>
        </param>
        <param format="integer" name="numberofbins" type="integer" label="Set the number of bins --numberofbins" value="20" size="7" help="Set the number of bins you want to divide the genes into">
          <validator type="in_range" min="20" max="1000" message="Please enter a value between 20 and 1000"/>
        </param>
        <!-- in_range bounds are inclusive, so the messages below say
             "at least" rather than "greater than". -->
        <param format="integer" name="min_length" type="integer" label="Set the minimum gene length (nt) --min_length" value="50" size="7" help="To filter the data for gene length (nucleotides)" >
          <validator type="in_range" min="20" message="Please enter a value of at least 20"/>
        </param>
        <param format="integer" name="max_length" type="integer" label="Set the maximum gene length (nt) --max_length" help="Default = 100000000" value="100000000" size="10" >
          <validator type="in_range" min="50" max="100000000" message="Please enter a value between 50 and 100000000"/>
        </param>
        <param name="sequence" type="select" label="What sequences do you want to run pyBinCollector on? --sequence">
          <option value="genomic" selected="true">Genomic Sequence</option>
          <option value="coding">Coding Sequence</option>
          <option value="intron">Introns</option>
          <option value="exon">Exons</option>
          <option value="CDS">CDS</option>
          <option value="5UTR">5UTR</option>
          <option value="3UTR">3UTR</option>
        </param>
        <conditional name="limitBins">
          <param name="binselect" type="select" label="Select sequences that map to specific bins --binselect">
            <option value="no" selected="true">No</option>
            <option value="yes">Yes</option>
          </param>
          <when value="yes">
            <param format="integer" name="bs_first" type="integer" label="Select First Bin" value="1" size="7">
              <validator type="in_range" min="1" message="Please enter a value of at least 1"/>
            </param>
            <param format="integer" name="bs_last" type="integer" label="Select Last Bin" value="2" size="7">
              <validator type="in_range" min="2" message="Please enter a value of at least 2"/>
            </param>
          </when>
          <when value="no">
          </when>
        </conditional>
        <param name="ignore" type="select" label="Ignore strand information? --ignorestrand">
          <option value="" selected="true">No</option>
          <option value="--ignorestrand">Yes</option>
        </param>
        <conditional name="oall">
          <param name="outputall" type="select" label="Output all genes --outputall" help="output the normalized distribution for each individual gene, rather than making a cumulative coverage plot">
            <option value="" selected="true">No</option>
            <option value="--outputall">Yes</option>
          </param>
          <when value="--outputall"/>
          <when value=""/>
        </conditional>
      </when>
      <when value="default">
      </when>
    </conditional>
    <param name="label" type="text" format="txt" size="30" value="pyBinCollector" label="Enter output file label -o" />
  </inputs>

  <outputs>
    <!-- addOpt['oall'] only exists when options == "edit", so test
         addOpt['options'] first instead of relying on the lookup failing. -->
    <data format="gtf" name="out" label="${label.value}.gtf">
      <filter>addOpt['options'] == "default" or addOpt['oall']['outputall'] == ""</filter>
    </data>
    <data format="txt" name="sd" label="sense_data_${label.value}.txt">
      <filter>addOpt['options'] == "edit" and addOpt['oall']['outputall'] == "--outputall"</filter>
    </data>
    <data format="txt" name="ssub" label="sense_subs_${label.value}.txt">
      <filter>addOpt['options'] == "edit" and addOpt['oall']['outputall'] == "--outputall"</filter>
    </data>
    <data format="txt" name="sdel" label="sense_dels_${label.value}.txt">
      <filter>addOpt['options'] == "edit" and addOpt['oall']['outputall'] == "--outputall"</filter>
    </data>
    <data format="txt" name="asd" label="anti_sense_data_${label.value}.txt">
      <filter>addOpt['options'] == "edit" and addOpt['oall']['outputall'] == "--outputall"</filter>
    </data>
    <data format="txt" name="assub" label="anti_sense_subs_${label.value}.txt">
      <filter>addOpt['options'] == "edit" and addOpt['oall']['outputall'] == "--outputall"</filter>
    </data>
    <data format="txt" name="asdel" label="anti_sense_dels_${label.value}.txt">
      <filter>addOpt['options'] == "edit" and addOpt['oall']['outputall'] == "--outputall"</filter>
    </data>
  </outputs>
  <help>


.. class:: infomark

**pyBinCollector**

pyBinCollector is part of the pyCRAC_ package. Allows the user to generate genome-wide coverage plots. Normalises gene lengths by dividing genes into a
fixed number of bins and then calculates the hit density in each bin. The program also allows the user to input specific bin numbers to extract
blocks/clusters present in these bins.


.. _pyCRAC: http://sandergranneman.bio.ed.ac.uk/Granneman_Lab/pyCRAC_software.html

------

**Parameter list**

File input options::

    -f FILE, --input_file=FILE
                        Provide the path and name of the pyReadCounters.py or
                        pyMotif.py GTF file. By default the program expects
                        data from the standard input.
    -o OUTPUT_FILE, --output_file=OUTPUT_FILE
                        To set an output file name. Do not add a file
                        extension. By default, if the --outputall flag is not
                        used, the program writes to the standard output.
    --gtf=yeast.gtf
                        type the path to the gtf annotation file that you want
                        to use. Default is /usr/local/pyCRAC/db/Saccharomyces_
                        cerevisiae.EF2.59.1.2.gtf

pyBinCollector.py specific options::

    -a protein_coding, --annotation=protein_coding
                        select which annotation (i.e. protein_coding, ncRNA,
                        sRNA, rRNA, tRNA, snoRNA, all) you would like to focus
                        your search on. Default = all
    --min_length=20
                        to set a minimum length threshold for genes. Genes
                        shorter than the minimal length will be discarded.
                        Default = 1
    --max_length=10000
                        to set a maximum length threshold for genes. Genes
                        larger than the maximum length will be discarded.
                        Default = 100000000
    -n 20, --numberofbins=20
                        select the number of bins you want to generate.
                        Default=20
    --binselect=2 4
                        allows selection of sequences that were mapped to
                        specific bins. This option expects two numbers, one
                        for each bin, separated by a space. For example:
                        --binselect 20 30.
    --outputall
                        use this flag to output the normalized distribution
                        for each individual gene, rather than making a
                        cumulative coverage plot. Useful for making box plots
                        or for making heat maps.

Common options::

    -r 100, --range=100
                        allows you to set the length of the UTR regions. If
                        you set '-r 50' or '--range=50', then the program will
                        set a fixed length (50 bp) regardless of whether the
                        GTF file has genes with annotated UTRs.
    -s intron, --sequence=intron
                        with this option you can select whether you want to
                        generate bins from the coding or genomic sequence or
                        introns,exon,CDS, or UTR coordinates. Default =
                        genomic
    --ignorestrand
                        To ignore strand information and all reads overlapping
                        with genomic features will be considered sense reads.
                        Useful for analysing ChIP or RIP data

  </help>
</tool>
<!-- File: pyCRAC/pyCalculateChromosomeLengths.xml -->
<!--
  Galaxy tool wrapper for pyCalculateChromosomeLengths.py.
  Takes a fasta or tab genome sequence dataset and writes one output file.
  The "label" parameter is only used to build the history label of the
  output dataset; it must live inside "inputs" for ${label.value} to resolve.
-->
<tool id="pyCalculateChromosomeLengths" name="pyCalculateChromosomeLengths">
  <requirements>
    <requirement type="package">pyCRAC</requirement>
  </requirements>
  <command interpreter="python">
    /usr/local/bin/pyCalculateChromosomeLengths.py
    -f $ftype.input
    --file_type $ftype.filetype
    -o $output </command>
  <version_command>/usr/local/bin/pyCalculateChromosomeLengths.py --version</version_command>
  <inputs>
    <conditional name="ftype">
      <param name="filetype" type="select" label="File type">
        <option value="fasta" selected="true">Fasta</option>
        <option value="tab">Tab</option>
      </param>
      <when value="fasta">
        <param name="input" type="data" format="fasta" label="Input file" help="Fasta or Tab file"/>
      </when>
      <when value="tab">
        <param name="input" type="data" format="tabular" label="Input file" help="Fasta or Tab file"/>
      </when>
    </conditional>
    <param name="label" type="text" format="txt" size="30" value="pyCalculateChromosomeLengths" label="Enter output file label -o" />
  </inputs>
  <outputs>
    <data name="output" format="txt" label="${label.value}.len"/>
  </outputs>
  <help>

.. class:: infomark

**pyCalculateChromosomeLengths**

pyCalculateChromosomeLengths is part of the pyCRAC_ package. Takes a genome sequence in fasta or tab format and generates a tab-delimited file showing chromosome name and chromosome length.

.. _pyCRAC: http://sandergranneman.bio.ed.ac.uk/Granneman_Lab/pyCRAC_software.html

-------

**Parameter list**

Options::

    -f chromosomes.fasta, --input_file=chromosomes.fasta
                        provide the name and path of your fasta or tab genomic
                        sequence file. Default is standard input.
    --file_type=fasta
                        provide the file type (fasta or tab). Default is fasta

  </help>
</tool>
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/pyCRAC/pyCalculateFDRs.xml Tue Jun 18 09:11:00 2013 -0400 @@ -0,0 +1,247 @@ + <tool id ="pyCalculateFDRs" name="pyCalculateFDRs"> + <requirements> + <requirement type="package">pyCRAC</requirement> + </requirements> + <command interpreter="python"> + /usr/local/bin/pyCalculateFDRs.py + -f $ftype.input + --file_type $ftype.file_type + --gtf=$addGTF.gtf + + #if $addGTF.annotate.annotations != "all": + #if $addGTF.gtfFile == "default" and $addGTF.annotate.annotations == "auto": + --annotation $addGTF.annotate.scan.annotation + #else: + --annotation $addGTF.annotate.annotation + #end if# + #end if# + --chromfile=$addChr.chr + #if $addOpt.options == "edit" + -s $addOpt.sequence + --min $addOpt.min + --minfdr $addOpt.minfdr + --iterations=$addOpt.iterations + --range $addOpt.range + #end if# + -o $output + + </command> + <version_command>/usr/local/bin/pyCalculateFDRs.py --version</version_command> + <inputs> + <conditional name="ftype"> + <param name="file_type" type="select" label="Input File Type --file_type" help="Use bed6, gff or gtf input files containing read/cDNA co-ordinates"> + <option value="gff" selected="true">GFF</option> + <option value="bed">Bed6</option> + <option value="gtf">GTF</option> + </param> + <when value="gff"> + <param format="gff" name="input" type="data" label="Input File --readdatafile" help="GFF format containing read/cDNA co-ordinates" /> + </when> + <when value="gtf"> + <param format="gtf" name="input" type="data" label="Input File --readdatafile" help="GTF format containing read/cDNA co-ordinates" /> + </when> + <when value="bed"> + <param format="bed6" name="input" type="data" label="Input File --readdatafile" help="Bed 6 column format containing read/cDNA co-ordinates" /> + </when> + </conditional> + + <conditional name="addChr"> + <param name="chrfile" type="select" label="Choose Chromosome length file from"> + <option value="default" selected="true">Defaults</option> + <option 
value="other">History</option> + </param> + <when value="default"> + <param name="chr" type="select" label="Chromosome length file -c" help="This file should have two columns: first column is the names of the chromosomes, second column is length of the chromosomes"> + <options from_data_table="pycrac_chr"/> + </param> + </when> + <when value="other"> + <param format="tabular" name="chr" type="data" label="Chromosome length file -c" help="This file should have two columns: first column is the names of the chromosomes, second column is length of the chromosomes. Use pyCrac utility pyCalculateChromosomeLengths to create."/> + </when> + </conditional> + + <conditional name="addGTF"> + <param name="gtfFile" type="select" label="Choose GTF File from"> + <option value="default" selected="true">Defaults</option> + <option value="other">History</option> + </param> + <when value="default"> + <param name="gtf" type="select" label="GTF File --gtf" help="GTF file containing gene ID co-ordinates"> + <options from_data_table="pycrac_gtf"/> + </param> + <conditional name="annotate"> + <param name="annotations" type="select" label="Select annotation"> + <option value="all" selected="true">All</option> + <option value="manual">Enter in text box</option> + <option value="auto">Scan pyGetGTFSources file</option> + </param> + <when value="all"> + <param name="annotation" type="hidden" format="txt" size="10" value="all"/> + </when> + <when value="manual"> + <param name="annotation" type="text" format="txt" size="100" value="protein_coding" label="Select which annotation to focus search on --annotation" help="To find a list of available annotations please use pyGetGTFSources tool"> + <validator type="empty_field" message="Please enter a value"/> + </param> + </when> + <when value="auto"> + <param format="tabular" name="gtf_annotation" type="data" label="GTF annotation File (pyGetGTFSources output)" help="Tabular file containing unique list of annotations/sources in selected GTF file. 
Refer to pyGetGTFSources"/> + <conditional name="scan"> + <param name="annotations" type="select" label="Scan this file for annotations" help="Choose the correct GTF file then choose GO"> + <option value="wait" selected="true">Waiting</option> + <option value="scanning">Go</option> + </param> + <when value="wait"> + </when> + <when value="scanning"> + <param name="annotation" type="select" multiple="false" label="Select which annotation to focus search on --annotation"> + <options from_dataset="gtf_annotation"> + <column name="name" index="0"/> + <column name="value" index="0"/> + </options> + </param> + </when> + </conditional> + </when> + </conditional> + </when> + <when value="other"> + <param format="gtf" name="gtf" type="data" label="GTF File --gtf" help="GTF file containing gene ID co-ordinates"/> + <conditional name="annotate"> + <param name="annotations" type="select" label="Select annotation"> + <option value="all" selected="true">All</option> + <option value="manual">Enter in text box</option> + <option value="auto">Scan selected file</option> + </param> + <when value="all"> + <param name="annotation" type="hidden" format="txt" size="10" value="all"/> + </when> + <when value="manual"> + <param name="annotation" type="text" format="txt" size="100" value="protein_coding" label="Select which annotation to focus search on --annotation" help="To find a list of available annotations please use pyGetGTFSources tool"> + <validator type="empty_field" message="Please enter a value"/> + </param> + </when> + <when value="auto"> + <param name="annotation" type="select" multiple="false" label="Select which annotation to focus search on --annotation"> + <options from_dataset="gtf"> + <column name="name" index="1"/> + <column name="value" index="1"/> + <filter type="unique_value" name="unique" column="1"/> + </options> + </param> + </when> + </conditional> + </when> + </conditional> + <conditional name="addOpt"> + <param name="options" type="select" label="Standard 
options"> + <option value="default" selected="true">Default</option> + <option value="edit">Edit</option> + </param> + <when value="edit"> + <param name="sequence" type="select" label="Align reads to --sequence"> + <option value="genomic" selected="true">Genomic Sequence</option> + <option value="coding">Coding Sequence</option> + </param> + <param format="integer" name="min" type="integer" label="Minimum read coverage --min " value="1" size="10" help="Set the minimal read coverage for a region"> + <validator type="in_range" min="1" message="Please enter a value >= 1"/> + </param> + <param name="minfdr" type="float" label="Minimum FDR threshold --minfdr" value="0.05" size="6" help="Set a minimal FDR threshold for filtering interval data"> + <validator type="in_range" min="0" max="1" message="Please enter a value between 0 and 1"/> + </param> + <param format="integer" name="iterations" type="integer" label="Number of iterations --iterations" value="100" size="6" help="The number of iterations for randomization of read coordinates"> + <validator type="in_range" min="0" message="Please enter a value >= 0"/> + </param> + <param format="integer" name="range" type="integer" label="Range --range" value="0" size="5" help="Manually set the length of the 5' and 3' UTRs 0>50000"> + <validator type="in_range" min="0" max="50000" message="Please enter a value between 0 and 50000"/> + </param> + </when> + <when value="default"> + </when> + </conditional> + <param name="label" type="text" format="txt" size="30" value="pyCalculateFDRs" label="Enter output file label -o" /> + </inputs> + <outputs> + <data format="gtf" name="output" label="${label.value}.gtf"/> + </outputs> + <help> + +.. class:: infomark + +**pyCalculateFDRs** + +By default the FDR value is set to 0.05, meaning that there is a 5% chance that the interval is not significantly enriched. +The tool reports significant intervals in the GTF format and reports overlapping genomic features. 
+Mutation frequencies are not included but these can be added using the pyCalculateMutationFrequencies tool + +**NOTE!** By default it calls each significant interval an "exon" but this has no meaning! It may overlap with an intron. +Use bedtools to extract those intervals that overlap with introns or other features + +Example of an output file:: + + ##gff-version 2 + # generated by pyCalculateFDRs version 0.0.3, Sat Jun 1 21:16:23 2013 + # pyCalculateFDRs.py -f test_count_output_reads.gtf -r 200 -o test_count_output_FDRs_005.gtf -v -m 0.05 + # chromosome feature source start end minimal_coverage strand . attributes + chrI protein_coding exon 140846 140860 5 - . gene_id "YAL005C"; gene_name "SSA1"; + chrI intergenic_region exon 223118 223164 4 - . gene_id "INT_0_179"; gene_name "INT_0_179"; + chrI intergenic_region exon 71889 71922 3 + . gene_id "INT_0_94"; gene_name "INT_0_94"; + chrII intergenic_region exon 296127 296158 3 - . gene_id "INT_0_365"; gene_name "INT_0_365"; + chrII intergenic_region exon 680697 680722 4 - . gene_id "INT_0_626"; gene_name "INT_0_626"; + chrII intergenic_region exon 680827 680846 4 - . gene_id "INT_0_626"; gene_name "INT_0_626"; + chrII snRNA exon 680827 680838 5 - . gene_id "LSR1"; gene_name "LSR1"; + chrII snRNA exon 680951 681001 5 - . gene_id "LSR1"; gene_name "LSR1"; + chrII intergenic_region exon 577985 577996 3 - . gene_id "INT_0_556"; gene_name "INT_0_556"; + chrII protein_coding exon 203838 203887 3 + . gene_id "YBL011W"; gene_name "SCT1"; + chrII protein_coding exon 296127 296158 3 - . gene_id "YBR028C"; gene_name "YBR028C"; + + +pyCalculateFDRs is part of the pyCRAC_ package. Takes interval information in GTF or bed format and calculates False Discovery Rates (FDRs). + + +.. 
_pyCRAC: http://sandergranneman.bio.ed.ac.uk/Granneman_Lab/pyCRAC_software.html + +------ + +**Parameter list** + +Options:: + + -f read_file, --readdatafile=read_file + Name of the bed/gff/gtf file containing the read/cDNA + coordinates + --file_type=FILE_TYPE + this tool supports bed6, gtf and gff input files. + Please select from 'bed','gtf' or 'gff'. Default=gtf + -o outfile.gtf, --outfile=outfile.gtf + Optional. Provide the name of the output file. Default + is 'selected_intervals.gtf' + -r 100, --range=100 + allows you to set the length of the UTR regions. If + you set '-r 50' or '--range=50', then the program will + set a fixed length (50 bp) regardless of whether the + GTF file has genes with annotated UTRs. + -a protein_coding, --annotation=protein_coding + select which annotation (i.e. protein_coding, ncRNA, + sRNA, rRNA,snoRNA,snRNA, depending on the source of + your GTF file) you would like to focus your analysis + on. Default = all annotations + -c yeast.txt, --chromfile=yeast.txt + Location of the chromosome info file. This file should + have two columns: first column is the names of the + chromosomes, second column is length of the + chromosomes. Default is yeast + --gtf=yeast.gtf + Name of the annotation file. Default is /usr/local/pyC + RAC/db/Saccharomyces_cerevisiae.EF2.59.1.2.gtf + -m MINFDR, --minfdr=MINFDR + To set a minimal FDR threshold for filtering interval + data. Default is 0.05 + --min=MIN + to set a minimal read coverages for a region. Regions + with coverage less than minimum will be ignoredve an + FDR of zero + --iterations=ITERATIONS + to set the number of iterations for randomization of + read coordinates. Default=100 + </help> +</tool>
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/pyCRAC/pyCalculateMutationFrequencies.xml Tue Jun 18 09:11:00 2013 -0400 @@ -0,0 +1,126 @@ +<tool id ="pyCalculateMutationFrequencies" name="pyCalculateMutationFrequencies"> + <requirements> + <requirement type="package">pyCRAC</requirement> + </requirements> + <command interpreter="python"> + /usr/local/bin/pyCalculateMutationFrequencies.py + -r $readdatafile + -i $intervaldatafile + -c $addChr.chr + -o $output + --mutsfreq $mutsfreq + </command> + <version_command>/usr/local/bin/pyCalculateMutationFrequencies.py --version</version_command> + <inputs> + <param format="gff" name="readdatafile" type="data" label="GFF Reads File --readdatafile" help="GFF file containing read data" /> + <param format="gtf" name="intervaldatafile" type="data" label="GFF Interval File --intervaldatafile" help="GFF file containing interval co-ordinates"/> + <conditional name="addChr"> + <param name="chrfile" type="select" label="Choose Chromosome length file from"> + <option value="default" selected="true">Defaults</option> + <option value="other">History</option> + </param> + <when value="default"> + <param name="chr" type="select" label="Chromosome length file -c" help="This file should have two columns: first column is the names of the chromosomes, second column is length of the chromosomes.Use pyCrac utility pyCalculateChromosomeLengths to create."> + <options from_data_table="pycrac_chr"/> + </param> + </when> + <when value="other"> + <param format="tabular" name="chr" type="data" label="Chromosome length file -c" help="This file should have two columns: first column is the names of the chromosomes, second column is length of the chromosomes"/> + </when> + </conditional> + + <param format="integer" name="mutsfreq" type="integer" label="Minimum mutation frequency --mutsfreq " value="0" size="10" help="sets the minimal mutations frequency for an interval that you want to have written to our output file"> + <validator type="in_range" 
min="0" message="Please enter a value >= 0"/> + </param> + <param name="label" type="text" format="txt" size="30" value="pyCalculateMutationFrequencies" label="Enter output file label -o" /> + </inputs> + <outputs> + <data format="gtf" name="output" label="${label.value}.gtf"/> + </outputs> + <help> + +.. class:: infomark + +**pyCalculateMutationFrequencies** + +pyCalculateMutationFrequencies is part of the pyCRAC_ package. Takes an interval file and a pyReadCounters GTF file and calculates (cross-linking induced) mutation frequencies for each interval. +This tool can be used to calculate mutation frequencies for significant intervals (pyCalculateFDRs output file) or over-represented motifs (pyMotif GTF output file). +It expects a pyCRAC GTF count_output_reads.gtf file and a GTF file with the intervals. + +For example:: + + This pyCalculateFDRs GTF output file:: + + ##gff-version 2 + # generated by pyCalculateFDRs version 0.0.3, Sat Jun 1 21:16:23 2013 + # pyCalculateFDRs.py -f test_count_output_reads.gtf -r 200 -o test_count_output_FDRs_005.gtf -v -m 0.05 + # chromosome feature source start end minimal_coverage strand . attributes + chrII protein_coding exon 203838 203887 3 + . gene_id "YBL011W"; gene_name "SCT1"; + chrII intergenic_region exon 407669 407708 3 + . gene_id "INT_0_445"; gene_name "INT_0_445"; + chrII intergenic_region exon 585158 585195 2 + . gene_id "INT_0_562"; gene_name "INT_0_562"; + chrII protein_coding exon 372390 372433 4 - . gene_id "YBR067C"; gene_name "TIP1"; + chrII intergenic_region exon 380754 380815 6 - . gene_id "INT_0_431"; gene_name "INT_0_431"; + chrIII protein_coding exon 138001 138044 5 + . gene_id "YCR012W"; gene_name "PGK1"; + chrIII intergenic_region exon 227997 228036 5 + . gene_id "INT_0_885"; gene_name "INT_0_885"; + chrIII intergenic_region exon 227997 228037 4 + . gene_id "INT_0_887"; gene_name "INT_0_887"; + chrIII tRNA exon 227997 228037 4 + . 
gene_id "tS(CGA)C"; gene_name "SUP61"; + + Will be converted into:: + + ##gff-version 2 + # generated by pyCalculateFDRs version 0.0.3, Sat Jun 1 21:16:23 2013 + # /Library/Frameworks/EPD64.framework/Versions/Current/bin/pyCalculateFDRs.py -f test_count_output_reads.gtf -r 200 -o test_count_output_FDRs_005.gtf -v -m 0.05 + # chromosome feature source start end minimal_coverage strand . attributes + chrII protein_coding exon 203838 203887 3 + . gene_id "YBL011W"; gene_name "SCT1"; # 203882D33.3,203883D33.3,203884D33.3; + chrII intergenic_region exon 407669 407708 3 + . gene_id "INT_0_445"; gene_name "INT_0_445"; # 407680D33.3,407681D33.3; + chrII intergenic_region exon 585158 585195 2 + . gene_id "INT_0_562"; gene_name "INT_0_562"; # 585171D100.0,585172D100.0,585173D100.0; + chrII protein_coding exon 372390 372433 4 - . gene_id "YBR067C"; gene_name "TIP1"; # 372412D50.0,372413D50.0; + chrII intergenic_region exon 380754 380815 6 - . gene_id "INT_0_431"; gene_name "INT_0_431"; # 380786D90.2,380787D90.2; + chrIII protein_coding exon 138001 138044 5 + . gene_id "YCR012W"; gene_name "PGK1"; # 138025D40.0,138026D30.0,138027D40.0; + chrIII intergenic_region exon 227997 228036 5 + . gene_id "INT_0_885"; gene_name "INT_0_885"; # 228006D85.7,228007D100.0; + chrIII intergenic_region exon 227997 228037 4 + . gene_id "INT_0_887"; gene_name "INT_0_887"; # 228006D85.7,228007D100.0; + chrIII tRNA exon 227997 228037 4 + . gene_id "tS(CGA)C"; gene_name "SUP61"; # 228006D85.7,228007D100.0; + + +The hash character at the end of each line (#) shows chromosomal coordinates of mutated nucleotides within the cluster interval and their mutation frequencies. + +For example:: + + # 228007D100.0 + +indicates that 100% of the nucleotides in position 228007 were deleted in the interval. + +By setting the --mutsfreq flag you can set a limit for the lowest mutation frequency that you want to have reported. 
+This makes it relatively easy to select those significant regions that have nucleotides with high mutation frequencies. + +.. _pyCRAC: http://sandergranneman.bio.ed.ac.uk/Granneman_Lab/pyCRAC_software.html + +------ + +**Parameter list** + +Options:: + + -i intervals.gtf, --intervaldatafile=intervals.gtf + provide the path to your GTF interval data file. + -r reads.gtf, --readdatafile=reads.gtf + provide the path to your GTF read data file. + -c yeast.txt, --chromfile=yeast.txt + Location of the chromosome info file. This file should + have two columns: first column is the names of the + chromosomes, second column is length of the + chromosomes. Default is yeast + -o intervals_with_muts.gtf, --output_file=intervals_with_muts.gtf + provide a name for an output file. By default it + writes to the standard output + --mutsfreq=10, --mutationfrequency=10 + sets the minimal mutations frequency for an interval + that you want to have written to our output file. + Default = 0%. Example: if the mutsfrequency is set at + 10 and an interval position has a mutated in less than + 10% of the reads,then the mutation will not be + reported. + + + </help> + </tool> \ No newline at end of file
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/pyCRAC/pyCheckGTFfile.xml Tue Jun 18 09:11:00 2013 -0400 @@ -0,0 +1,53 @@ + <tool id ="pyCheckGTFfile" name="pyCheckGTFfile"> + <requirements> + <requirement type="package">pyCRAC</requirement> + </requirements> + <command interpreter="python"> + /usr/local/bin/pyCheckGTFfile.py --gtf $addGTF.gtf -o $out + </command> + <version_command>/usr/local/bin/pyCheckGTFfile.py --version</version_command> + <inputs> + <conditional name="addGTF"> + <param name="gtfFile" type="select" label="Choose GTF File from"> + <option value="default" selected="true">Defaults</option> + <option value="other">History</option> + </param> + <when value="default"> + <param name="gtf" type="select" label="GTF File --gtf" help="GTF file containing gene ID co-ordinates"> + <options from_data_table="pycrac_gtf"/> + </param> + </when> + <when value="other"> + <!-- Datatype extensions are lowercase; use the same spelling as the gtf output below. --> <param format="gtf" name="gtf" type="data" label="GTF File --gtf" help="GTF file containing gene ID co-ordinates"/> + </when> + </conditional> + <param name="label" type="text" format="txt" size="30" value="pyCheckGTFfile" label="Enter output file label -o" /> + </inputs> + + <outputs> + <data format="gtf" name="out" label="${label.value}.gtf"/> + </outputs> + <help> +.. class:: infomark + +**pyCheckGTFfile** + +pyCheckGTFfile is part of the pyCRAC_ package. Renames duplicated gene names in your GTF annotation file. + +.. _pyCRAC: http://sandergranneman.bio.ed.ac.uk/Granneman_Lab/pyCRAC_software.html + +------ + +**Parameter list** + +Options:: + + --gtf=gtf input file + type the path to the gtf file that you want to use. + -o FILE, --output=FILE + Optional. Specify the name of the output file. Default + is standard output. Make sure it has the .gtf + extension! + + </help> +</tool>
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/pyCRAC/pyClusterReads.pl Tue Jun 18 09:11:00 2013 -0400 @@ -0,0 +1,10 @@ +#!/usr/bin/perl -w +use strict; +use Getopt::Long; +# Echo the command-line arguments on one line; the newline is printed separately so no stray space precedes it. +print join(" ", @ARGV), "\n"; + + + + +
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/pyCRAC/pyClusterReads.xml Tue Jun 18 09:11:00 2013 -0400 @@ -0,0 +1,230 @@ +<tool id="pyClusterReads" name="pyClusterReads" force_history_refresh="True"> + <requirements> + <requirement type="package">pyCRAC</requirement> + </requirements> + <command interpreter="python"> + /usr/local/bin/pyClusterReads.py + -f $input + --gtf=$addGTF.gtf + #if $addGTF.annotate.annotations != "all": + #if $addGTF.gtfFile == "default" and $addGTF.annotate.annotations == "auto": + --annotation=$addGTF.annotate.scan.annotation + #else: + --annotation=$addGTF.annotate.annotation + #end if# + #end if# + -o $output + #if $addOpt.options == "edit": + --range=$addOpt.range + --cic=$addOpt.cic + --co=$addOpt.co + --ch=$addOpt.ch + --cl=$addOpt.cl + --mutsfreq=$addOpt.mutsfreq + #end if# + </command> + <version_command>/usr/local/bin/pyClusterReads.py --version</version_command> + <inputs> + <param format="gtf" name="input" type="data" label="Input Read Data File -f" help="GTF format sorted by position i.e. 
pyReadCounters output file."/> + <conditional name="addGTF"> + <param name="gtfFile" type="select" label="Choose GTF File from"> + <option value="default" selected="true">Defaults</option> + <option value="other">History</option> + </param> + <when value="default"> + <param name="gtf" type="select" label="GTF File --gtf" help="GTF file containing gene ID co-ordinates"> + <options from_data_table="pycrac_gtf"/> + </param> + <conditional name="annotate"> + <param name="annotations" type="select" label="Select annotation"> + <option value="all" selected="true">All</option> + <option value="manual">Enter in text box</option> + <option value="auto">Scan pyGetGTFSources file</option> + </param> + <when value="all"> + <param name="annotation" type="hidden" format="txt" size="10" value="all"/> + </when> + <when value="manual"> + <param name="annotation" type="text" format="txt" size="100" value="protein_coding" label="Select which annotation to focus search on --annotation" help="To find a list of available annotations please use pyGetGTFSources tool"> + <validator type="empty_field" message="Please enter a value"/> + </param> + </when> + <when value="auto"> + <param format="tabular" name="gtf_annotation" type="data" label="GTF annotation File (pyGetGTFSources output)" help="Tabular file containing unique list of annotations/sources in selected GTF file. 
Refer to pyGetGTFSources"/> + <conditional name="scan"> + <param name="annotations" type="select" label="Scan this file for annotations" help="Choose the correct GTF file then choose GO"> + <option value="wait" selected="true">Waiting</option> + <option value="scanning">Go</option> + </param> + <when value="wait"> + </when> + <when value="scanning"> + <param name="annotation" type="select" multiple="false" label="Select which annotation to focus search on --annotation"> + <options from_dataset="gtf_annotation"> + <column name="name" index="0"/> + <column name="value" index="0"/> + </options> + </param> + </when> + </conditional> + </when> + </conditional> + + </when> + <when value="other"> + <param format="GTF" name="gtf" type="data" label="GTF File --gtf" help="GTF file containing gene ID co-ordinates"/> + <conditional name="annotate"> + <param name="annotations" type="select" label="Select annotation"> + <option value="all" selected="true">All</option> + <option value="manual">Enter in text box</option> + <option value="auto">Scan selected file</option> + </param> + <when value="all"> + <param name="annotation" type="hidden" format="txt" size="10" value="all"/> + </when> + <when value="manual"> + <param name="annotation" type="text" format="txt" size="100" value="protein_coding" label="Select which annotation to focus search on --annotation" help="To find a list of available annotations please use pyGetGTFSources tool"> + <validator type="empty_field" message="Please enter a value"/> + </param> + </when> + <when value="auto"> + <param name="annotation" type="select" multiple="false" label="Select which annotation to focus search on --annotation"> + <options from_dataset="gtf"> + <column name="name" index="1"/> + <column name="value" index="1"/> + <filter type="unique_value" name="unique" column="1"/> + </options> + </param> + </when> + </conditional> + </when> + </conditional> + + <conditional name="addOpt"> + <param name="options" type="select" label="Standard 
Options"> + <option value="default" selected="true">Default</option> + <option value="edit">Edit</option> + </param> + <when value="edit"> + <param format="integer" name="range" type="integer" label="Range --range" value="0" size="5" help="Manually set the length of the 5' and 3' UTRs (0-50000)"> + <validator type="in_range" min="0" max="50000" message="Please enter a value between 0 and 50000"/> + </param> + <param format="integer" name="ch" type="integer" label="Cluster height --ch" value="2" size="10" help="Minimal height of a cluster"> + <validator type="in_range" min="1" message="Please enter a value >= 1"/> + </param> + <param format="integer" name="cl" type="integer" label="Cluster length --cl" value="1" size="10" help="Maximum length of a cluster"> + <validator type="in_range" min="1" message="Please enter a value >= 1"/> + </param> + <param format="integer" name="cic" type="integer" label="cDNAs in clusters --cic" value="2" size="10" > + <validator type="in_range" min="2" message="Please enter a value >= 2"/> + </param> + <param format="integer" name="co" type="integer" label="cDNA-cluster nucleotide overlap --co" value="1" size="10" > + <validator type="in_range" min="1" message="Please enter a value >= 1"/> + </param> + <param format="integer" name="mutsfreq" type="integer" label="Minimum mutation frequency for a cluster position --mutsfreq" value="0" size="3" > + <validator type="in_range" min="0" max="100" message="Please enter a value between 0 and 100"/> + </param> + </when> + <when value="default"> + </when> + </conditional> + <param name="label" type="text" format="txt" size="30" value="pyClusterReads" label="Enter output file label -o" /> + </inputs> + <outputs> + <data format="gtf" name="output" label="${label.value}_clusters.gtf"/> + </outputs> + <help> + +.. class:: infomark + +**pyClusterReads** + +pyClusterReads is part of the pyCRAC_ package. 
Takes a reads_count_output GTF file from pyReadCounters and generates clusters from the interval coordinates. +Produces a GTF output file with cluster intervals and overlapping genomic features. +It also includes mutation frequencies (after the # character) for nucleotides in intervals using chromosomal coordinates. +The pyClusterReads GTF output file essentially has the same layout as other pyCRAC GTF output files. + +**NOTE!** By default it calls each cluster an "exon" but this has no meaning. It may overlap with an intron. +Use bedtools to extract those intervals that overlap with introns or other features + +The maximum height of the cluster is indicated in column 8. +The hash character at the end of each line (#) shows chromosomal coordinates of mutated nucleotides within the cluster interval and their mutation frequencies. + +For example:: + + # 114099S100.0 + +indicates that 100% of the nucleotides in position 114099 were substituted in the cluster. + +An example of a pyClusterReads output file:: + + ##gff-version 2 + # generated by pyClusterReads.py version 0.0.1, Fri Jan 18 11:59:42 2013 + # pyClusterReads.py -f count_output_reads.gtf -o count_output_clusters.gtf -v + # chromosome feature source start end cDNAs strand height attributes + chrI cluster exon 112583 112643 6 - 5 gene_id "INT_0_114,YAL021C"; gene_name "INT_0_114,CCR4"; # 112612S75.0; + chrI cluster exon 113176 113232 3 - 3 gene_id "INT_0_114,YAL021C"; gene_name "INT_0_114,CCR4"; # 113184S100.0; + chrI cluster exon 113334 113386 2 - 2 gene_id "INT_0_114,YAL021C"; gene_name "INT_0_114,CCR4"; # 113349S50.0,113379S100.0; + chrI cluster exon 113534 113564 3 - 3 gene_id "INT_0_119,INT_0_114"; gene_name "INT_0_119,INT_0_114"; # 113554S33.3,113556S33.3,113557S33.3; + chrI cluster exon 113644 113691 5 - 4 gene_id "YAL020C,INT_0_114"; gene_name "ATS1,INT_0_114"; # 113649S50.0,113657S33.3,113679S25.0 + chrI cluster exon 113912 113958 2 - 2 gene_id "YAL020C,INT_0_114"; gene_name "ATS1,INT_0_114"; # 
113932S50.0,113946S50.0; + chrI cluster exon 113966 114066 5 - 3 gene_id "YAL020C,INT_0_114"; gene_name "ATS1,INT_0_114"; # 113987S50.0,114033S33.3,114039S33.3; + chrI cluster exon 114067 114130 3 - 3 gene_id "YAL020C,INT_0_114"; gene_name "ATS1,INT_0_114"; # 114099S100.0; + +.. _pyCRAC: http://sandergranneman.bio.ed.ac.uk/Granneman_Lab/pyCRAC_software.html + +------ + +**Parameter list** + + +File input options:: + + -f reads.gtf, --input_file=reads.gtf + provide the path to your GTF read data file. NOTE the + file has to be correctly sorted! If you used + pyReadCounters to generate the file you should be + fine. If you modified it, use the sort command + described in the manual to sort your file first by + chromosome, then by strand and then by start position. + -o clusters.gtf, --output_file=clusters.gtf + provide a name for an output file. By default it + writes to the standard output + --gtf=Yourfavoritegtf.gtf + type the path to the gtf annotation file that you want + to use + +Common pyCRAC options:: + + -r 100, --range=100 + allows you to set the length of the UTR regions. If + you set '-r 50' or '--range=50', then the program will + set a fixed length (50 bp) regardless of whether the + GTF annotation file has genes with annotated UTRs. + -a protein_coding, --annotation=protein_coding + select which annotation (i.e. protein_coding, ncRNA, + sRNA, rRNA,snoRNA,snRNA, depending on the source of + your GTF file) you would like to focus your analysis + on. Default = all annotations + +Options for cluster analysis:: + + --cic=2, --cdnasinclusters=2 + sets the minimal number of overlapping cDNAs in each + cluster. Default = 2 + --co=5, --clusteroverlap=5 + sets the number of nucleotides cDNA sequences have to + overlap to form a cluster. Default = 1 nucleotide + --ch=5, --clusterheight=5 + sets the minimal height of the cluster. 
Default = 2 + nucleotides + --cl=100, --clusterlength=100 + to set the maximum cluster sequence length + --mutsfreq=10, --mutationfrequency=10 + sets the minimal mutations frequency for a cluster + position in the GTF output file. Default = 0%. + Example: if the mutsfrequency is set at 10 and a + cluster position has a mutated in less than 10% of the + reads, then the mutation will not be reported. + </help> +</tool> \ No newline at end of file
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/pyCRAC/pyExtractLinesFromGTF.xml Tue Jun 18 09:11:00 2013 -0400 @@ -0,0 +1,77 @@ + <tool id ="pyExtractLinesFromGTF" name="pyExtractLinesFromGTF"> + <requirements> + <requirement type="package">pyCRAC</requirement> + </requirements> + <command interpreter="python"> + /usr/local/bin/pyExtractLinesFromGTF.py --gtf $addGTF.gtf --genes_file $g --attribute $attribute $v -o $out + </command> + <version_command>/usr/local/bin/pyExtractLinesFromGTF.py --version</version_command> + <inputs> + <conditional name="addGTF"> + <param name="gtfFile" type="select" label="Choose GTF File from"> + <option value="default" selected="true">Defaults</option> + <option value="other">History</option> + </param> + <when value="default"> + <param name="gtf" type="select" label="GTF File --gtf" help="GTF file containing gene ID co-ordinates"> + <options from_data_table="pycrac_gtf"/> + </param> + </when> + <when value="other"> + <param format="GTF" name="gtf" type="data" label="GTF File --gtf" help="GTF file containing gene ID co-ordinates"/> + </when> + </conditional> + <param format="txt" name="g" type="data" label="File containing gene list --genes_file" help="Tabular file with 1 column of gene or annotation names"/> + <param name="attribute" type="select" label="Select the attribute to extract names from --attribute"> + <option value="gene_name" selected="true">gene_name</option> + <option value="gene_id">gene_id</option> + <option value="transcript_name">transcript_name</option> + <option value="transcript_id">transcript_id</option> + </param> + <param name="v" type="select" label="Extract lines from GTF that -v"> + <option value="" selected="true">Match the gene file</option> + <option value="-v">Do not match the gene file</option> + </param> + <param name="label" type="text" format="txt" size="30" value="pyExtractLinesFromGTF" label="Enter output file label -o" /> + </inputs> + + <outputs> + <data format="gtf" name="out" 
label="${label.value}.gtf"/> + </outputs> + <help> +.. class:: infomark + +**pyExtractLinesFromGTF** + +pyExtractLinesFromGTF is part of the pyCRAC_ package. Extracts lines from a GTF file that contain gene names of interest. + + +.. _pyCRAC: http://sandergranneman.bio.ed.ac.uk/Granneman_Lab/pyCRAC_software.html + +------ + +**Parameter list** + +Options:: + + --gtf=Yourfavoritegtf.gtf + type the path to the gtf file that you want to use. By + default it expects data from the standard input. + -g FILE, --genes_file=FILE + name of your gene list or annotations list file (1 + column) + -o OUTFILE, --outfile=OUTFILE + type the name and path of the file you want to write + the output to. Default is standard output + -a ATTRIBUTE, --attribute=ATTRIBUTE + from which attribute do you want to extract names? + Choices: gene_name, gene_id, transcript_name, + transcript_id + -v + similar to grep -v option. Remove the genes from the + GTF that are in the gene list + + + + </help> +</tool>
<!-- Galaxy tool wrapper (pyCRAC/pyFasta2tab.xml) for pyFasta2tab.py: converts a fasta file
     into the tabular sequence format used by the pyCRAC tools. -->
<tool id="pyFasta2Tab" name="pyFasta2Tab">
  <description>converter</description>
  <requirements>
    <requirement type="package">pyCRAC</requirement>
  </requirements>
  <command interpreter="python">/usr/local/bin/pyFasta2tab.py -f $input -o $output
  </command>
  <version_command>/usr/local/bin/pyFasta2tab.py --version</version_command>
  <inputs>
    <param name="input" type="data" format="fasta" label="Fasta file -f"/>
    <!-- Must live inside <inputs>: the output label below reads ${label.value}. -->
    <param name="label" type="text" format="txt" size="30" value="pyFasta2Tab" label="Enter output file label -o" />
  </inputs>
  <outputs>
    <data name="output" format="tabular" label="${label.value}.tab"/>
  </outputs>
  <!-- NOTE(review): the help sentence starting "The pyCRAC package lo" is truncated in the
       original upload; complete or remove it once the intended text is known. -->
  <help>

.. class:: infomark

**pyFasta2Tab**

pyFasta2Tab is part of the pyCRAC_ package. Converts fasta to tabular format. Is used to convert your reference sequences in fasta format to the tabular format that pyCRAC uses for almost all tools.

Example::

    >sequence1
    ATAGGATACATAACCATATTATGAGACC

Is converted into::

    sequence1   ATAGGATACATAACCATATTATGAGACC

The pyCRAC package lo

.. _pyCRAC: http://sandergranneman.bio.ed.ac.uk/Granneman_Lab/pyCRAC_software.html

-------

**Parameter list**

Options::

  -f fasta_file, --input_file=fasta_file
                        provide the name and path of your fasta input file.
                        Default is standard input.



  </help>
</tool>
#!/usr/bin/perl -w
# pyFastqDuplicateRemover.pl - Galaxy wrapper around pyCRAC's pyFastqDuplicateRemover.py.
#
# Runs the tool with "-o <id>" and then moves the file(s) it is expected to
# leave in the working directory onto the paths given by the caller:
#   single-end:  <id>          -> -o
#   paired-end:  <id>_1.fasta  -> -o      and   <id>_2.fasta -> --out2
#
# Options: -f FILE  -r FILE (optional mate file)  -o FILE  --out2 FILE
#          --id PREFIX  --version
use strict;
use Getopt::Long;
use File::Copy qw(move);

my $TOOL = "/usr/local/bin/pyFastqDuplicateRemover.py";

my %opt;
GetOptions(\%opt, "f=s", "r=s", "o=s", "out2=s", "version", "id=s");

# --version only prints the tool version; there is no output file to move.
if (exists $opt{version}) {
    run("python", $TOOL, "--version");
    exit 0;
}

# Fail early with a clear message instead of running a malformed command.
my @required = qw(f o id);
push @required, "out2" if defined $opt{r};
foreach my $name (@required) {
    die "pyFastqDuplicateRemover.pl: missing required option --$name\n"
        unless defined $opt{$name};
}

my @cmnd = ("python", $TOOL, "-f", $opt{f}, "-o", $opt{id});
push @cmnd, "-r", $opt{r} if defined $opt{r};
run(@cmnd);

if (defined $opt{r}) {
    relocate("$opt{id}_1.fasta", $opt{o});
    relocate("$opt{id}_2.fasta", $opt{out2});
}
else {
    relocate($opt{id}, $opt{o});
}

# Run a command without a shell (list form, so file names are never
# re-parsed) and abort when it does not exit successfully.
sub run {
    my @argv = @_;
    system(@argv) == 0
        or die "pyFastqDuplicateRemover.pl: command failed (status $?): @argv\n";
}

# Move a result file to its final location, aborting on failure.
sub relocate {
    my ($from, $to) = @_;
    move($from, $to)
        or die "pyFastqDuplicateRemover.pl: cannot move '$from' to '$to': $!\n";
}
<!-- Galaxy tool wrapper (pyCRAC/pyFastqDuplicateRemover.xml). The command runs the Perl helper
     pyFastqDuplicateRemover.pl, which calls pyFastqDuplicateRemover.py and moves its result
     file(s) onto the Galaxy output datasets. -->
<tool id="pyFastqDuplicateRemover" name="pyFastqDuplicateRemover">
  <requirements>
    <requirement type="package">pyCRAC</requirement>
  </requirements>
  <command interpreter="perl">
    pyFastqDuplicateRemover.pl
    -f $ftype.f
    #if $ftype.reverse.rev == "yes":
    -r=$ftype.reverse.r
    --out2 $out2
    #end if#
    -o $out
    --id $out.id
  </command>
  <version_command>/usr/local/bin/pyFastqDuplicateRemover.py --version</version_command>
  <inputs>
    <!-- Both branches declare the same parameter names (f, reverse.rev, reverse.r) so the
         command template above works for either file type. -->
    <conditional name="ftype">
      <param name="type" type="select" label="File type">
        <option value="fastq" selected="true">FASTQ</option>
        <option value="fasta">FASTA</option>
      </param>
      <when value="fastq">
        <param format="fastq" name="f" type="data" label="FastQ File -f" help="FastQ format" />
        <conditional name="reverse">
          <param name="rev" type="select" label="Add a reverse or paired FastQ file">
            <option value="no" selected="true">NO</option>
            <option value="yes">YES</option>
          </param>
          <when value="yes">
            <param format="fastq" name="r" type="data" label="Reverse FastQ File -r" help="FastQ format" />
          </when>
          <when value="no">
          </when>
        </conditional>
      </when>
      <when value="fasta">
        <param format="fasta" name="f" type="data" label="FastA File -f" help="FastA format" />
        <conditional name="reverse">
          <param name="rev" type="select" label="Add a reverse or paired FastA file">
            <option value="no" selected="true">NO</option>
            <option value="yes">YES</option>
          </param>
          <when value="yes">
            <param format="fasta" name="r" type="data" label="Reverse FastA File -r" help="FastA format" />
          </when>
          <when value="no">
          </when>
        </conditional>
      </when>
    </conditional>
    <!-- Only used to build the history labels of the output datasets. -->
    <param name="label" type="text" format="txt" size="30" value="pyFastqDuplicateRemover" label="Enter output file label -o" />
  </inputs>
  <outputs>
    <data format="fasta" name="out" label="${label.value}.fasta"/>
    <!-- The second output only exists when a reverse file was supplied. -->
    <data format="fasta" name="out2" label="${label.value}_reverse.fasta">
      <filter>ftype['reverse']['rev'] == "yes"</filter>
    </data>
  </outputs>
  <help>

.. class:: infomark

**pyFastqDuplicateRemover**

pyFastqDuplicateRemover is part of the pyCRAC_ package. Removes identical sequences from fastq and fasta files and returns a fasta file with collapsed data.

Can also process paired-end data.

**Examples**

Unprocessed fastq data with six random nucleotides at 5' end of the read::

    @FCC102EACXX:3:1101:3231:2110#TGACCAAT/1
    GCGCCTGCCAATTCCATCGTAATGATTAATAGGGACGGTCGGGGGCATC
    +
    bb_ceeeegggggiiiiiifghiihiihiiiiiiiiiifggfhiecccc

After pyBarcodeFilter::

    @FCC102EACXX:3:1101:3231:2110#TGACCAAT/1##GCGCCT
    TCCATCGTAATGATTAATAGGGACGGTCGGGGGCATC
    +
    giiiiiifghiihiihiiiiiiiiiifggfhiecccc

    This entry is printed to the NNNNNNGCCAAT barcode file.

After pyFastqDuplicateRemover::

    >1_GCGCCT_5/1
    TCCATCGTAATGATTAATAGGGACGGTCGGGGGCATC

    The '1' indicates that this is the first unique cDNA in the data
    GCGCCT is the random barcode sequence
    the '5' indicates that 5 reads were found with identical read and random barcode sequences
    the '/1' indicates that the sequence originates from the forward sequencing reaction

.. _pyCRAC: http://sandergranneman.bio.ed.ac.uk/Granneman_Lab/pyCRAC_software.html

------

**Parameter list**

Options::

  -f FILE, --input_file=FILE
                        name of the FASTQ or FASTA input file

  -r FILE, --reverse_input_file=FILE
                        name of the paired (or reverse) FASTQ or FASTA input file

  -o FILE, --output_file=FILE
                        Provide the path and name of the fastq or fasta output file. Default is standard output.
                        For paired-end data just provide a file name without file extension (!)
  </help>
</tool>
<!-- Galaxy tool wrapper (pyCRAC/pyFastqJoiner.xml) for pyFastqJoiner.py: merges the paired
     records of two fastq or fasta files into one file, optionally inserting a separator
     character between the two sequences. -->
<tool id="pyFastqJoiner" name="pyFastqJoiner">
  <requirements>
    <requirement type="package">pyCRAC</requirement>
  </requirements>
  <command interpreter="python">
    /usr/local/bin/pyFastqJoiner.py
    -f $ftype.f1 $ftype.f2
    -o $out
    --file_type=$ftype.type
    #if $joinc.ch == "-c":
    -c '$joinc.c'
    #end if#
  </command>
  <version_command>/usr/local/bin/pyFastqJoiner.py --version</version_command>
  <inputs>
    <conditional name="ftype">
      <param name="type" type="select" label="File type">
        <option value="fastq" selected="true">FASTQ</option>
        <option value="fasta">FASTA</option>
      </param>
      <when value="fastq">
        <param format="fastq" name="f1" type="data" label="First FastQ File -f" help="FastQ format" />
        <param format="fastq" name="f2" type="data" label="Second FastQ File -f" help="FastQ format" />
      </when>
      <when value="fasta">
        <param format="fasta" name="f1" type="data" label="First FastA File -f" help="FastA format" />
        <param format="fasta" name="f2" type="data" label="Second FastA File -f" help="FastA format" />
      </when>
    </conditional>
    <conditional name="joinc">
      <param name="ch" type="select" label="Insert a character at join">
        <option value="" selected="true">NO</option>
        <option value="-c">YES</option>
      </param>
      <when value="-c">
        <!-- Free text: it is single-quoted in the command so the shell never interprets it. -->
        <param type="text" name="c" label="Add this character -c" value=":" >
          <validator type="empty_field" message="enter a character or turn this option off" />
        </param>
      </when>
      <when value="">
      </when>
    </conditional>
    <!-- Only used to build the history label of the output dataset. -->
    <param name="label" type="text" format="txt" size="30" value="pyFastqJoiner" label="Enter output file label -o" />
  </inputs>
  <outputs>
    <!-- change_format has to be a child of the data element it applies to. -->
    <data format="input" name="out" label="${label.value}.${ftype.type}">
      <change_format>
        <when input="ftype.type" value="fasta" format="fasta" />
      </change_format>
    </data>
  </outputs>
  <help>

.. class:: infomark

**pyFastqJoiner**

pyFastqJoiner is part of the pyCRAC_ package. Merges paired sequences from two fastq or fasta formatted files.

Example::

    Forward reaction:

    @FCC102EACXX:3:1101:1343:2181#ATCACGAT/1##CAATAG
    CAAATTAGAGTGTTCAAAGCAGGCGTATTGCTCGAAT
    +
    `efhYb][bdQQ`eeaeaYbeY^ceU__IXa[^ZYae
    @FCC102EACXX:3:1101:1424:2248#ATCACGAT/1##CCAGGA
    CTAACCATAAACTATGCCTACTAGGGATCCAGAGGTG
    +
    ^_adddhJbaehbedd`dIb_^cXaRI^BBBBBBBBB
    @FCC102EACXX:3:1101:1623:2036#ATCACGAN/1##CTCAGC
    CAAAGTTAGGGGATCGAAGATGATCAGATACCGTCGT
    +
    bghfc^YbgbeadggfdffeaS^ac_X^cegaGZ_ef
    @FCC102EACXX:3:1101:1574:2214#ATCACGAT/1##CGTTTT
    CTAATGACCCACTCGGCACCTTACGAAATCAAAGTCT
    +
    cdfgYY`cefhhZef\eaggXaceeghfQaeghWNW\

    Reverse reaction:

    @FCC102EACXX:3:1101:1343:2181#ATCACGAT/2
    AGCCTTTAAGTTTCAGCCTTGCGACCATACTCCCCCCAGAACCCAAAGA
    +
    YJaSJ`Z`K`YbSb[[daeJRR[YeWd_I^I^ecgc]OV\bdeaegbXb
    @FCC102EACXX:3:1101:1424:2248#ATCACGAT/2
    AAGTCCTTTAAGTTACAGCCTTGCGACCATACTACACCCAGAACCCAAA
    +
    YJJ\`JQY\`KJ`gY[[QRYY[[`H[_ceI^e[PYO^IWOHW^eaefhh
    @FCC102EACXX:3:1101:1623:2036#ATCACGAN/2
    GGCCAATCCTTATTGTGTCTGGACCTGGTGAGTTTCCCCGTGTTGAGTC
    +
    PP\`ccQ`eY[bQQ[d`ghehaghfgdg[`gb^bd[ePbH^c_c\a_eg

    Here the ":" character is used to split the two sequences. This character tells pyFastqSplitter where to split the sequences.
    This character is ignored by pyFastqDuplicateRemover

    Result:

    @FCC102EACXX:3:1101:1343:2181#ATCACGAT/1##CAATAG@FCC102EACXX:3:1101:1343:2181#ATCACGAT/2
    CAAATTAGAGTGTTCAAAGCAGGCGTATTGCTCGAAT:AGCCTTTAAGTTTCAGCCTTGCGACCATACTCCCCCCAGAACCCAAAGA
    +
    `efhYb][bdQQ`eeaeaYbeY^ceU__IXa[^ZYaeYJaSJ`Z`K`YbSb[[daeJRR[YeWd_I^I^ecgc]OV\bdeaegbXb
    @FCC102EACXX:3:1101:1424:2248#ATCACGAT/1##CCAGGA@FCC102EACXX:3:1101:1424:2248#ATCACGAT/2
    CTAACCATAAACTATGCCTACTAGGGATCCAGAGGTG:AAGTCCTTTAAGTTACAGCCTTGCGACCATACTACACCCAGAACCCAAA
    +
    ^_adddhJbaehbedd`dIb_^cXaRI^BBBBBBBBBYJJ\`JQY\`KJ`gY[[QRYY[[`H[_ceI^e[PYO^IWOHW^eaefhh
    @FCC102EACXX:3:1101:1623:2036#ATCACGAN/1##CTCAGC@FCC102EACXX:3:1101:1623:2036#ATCACGAN/2
    CAAAGTTAGGGGATCGAAGATGATCAGATACCGTCGT:GGCCAATCCTTATTGTGTCTGGACCTGGTGAGTTTCCCCGTGTTGAGTC
    +
    bghfc^YbgbeadggfdffeaS^ac_X^cegaGZ_efPP\`ccQ`eY[bQQ[d`ghehaghfgdg[`gb^bd[ePbH^c_c\a_eg

.. _pyCRAC: http://sandergranneman.bio.ed.ac.uk/Granneman_Lab/pyCRAC_software.html

------

**Parameter list**

Options::

  -f fastq_file1 fastq_file2
                        Provide the names of two raw data files separated by a single space.
                        Make sure the first file is the data file of the forward (/1) sequencing reaction.

  --file_type=FASTQ
                        Can join fasta and fastq files. Fastq is default

  -o mergedfastq.fastq, --outfile=mergedfastq.fastq
                        provide the name of the output file. By default it
                        will be printed to the standard output

  -c :
                        This option adds the given character (':' in this
                        example) between the DNA sequences so that it is much
                        easier to split the data again later on


  </help>
</tool>
#!/usr/bin/perl -w
# pyFastqSplitter.pl - Galaxy wrapper around pyCRAC's pyFastqSplitter.py.
#
# Runs the tool with "-o <id>" and then moves the two files it is expected to
# leave in the working directory, <id>_1.<file_type> and <id>_2.<file_type>,
# onto the paths given with --o1 and --o2.
#
# Options: -f FILE  -c CHAR (optional separator)  --o1 FILE  --o2 FILE
#          --file_type fastq|fasta  --id PREFIX  --version
use strict;
use Getopt::Long;
use File::Copy qw(move);

my $TOOL = "/usr/local/bin/pyFastqSplitter.py";

my %opt;
GetOptions(\%opt, "f=s", "c=s", "o1=s", "o2=s", "file_type=s", "version", "id=s");

# --version only prints the tool version; there are no output files to move.
if (exists $opt{version}) {
    run("python", $TOOL, "--version");
    exit 0;
}

# Fail early with a clear message instead of running a malformed command.
foreach my $name (qw(f id file_type o1 o2)) {
    die "pyFastqSplitter.pl: missing required option --$name\n"
        unless defined $opt{$name};
}

my @cmnd = ("python", $TOOL, "-f", $opt{f}, "-o", $opt{id},
            "--file_type=$opt{file_type}");
push @cmnd, "-c", $opt{c} if defined $opt{c};
run(@cmnd);

relocate("$opt{id}_1.$opt{file_type}", $opt{o1});
relocate("$opt{id}_2.$opt{file_type}", $opt{o2});

# Run a command without a shell (list form, so the separator character and
# file names are never re-parsed) and abort when it does not exit successfully.
sub run {
    my @argv = @_;
    system(@argv) == 0
        or die "pyFastqSplitter.pl: command failed (status $?): @argv\n";
}

# Move a result file to its final location, aborting on failure.
sub relocate {
    my ($from, $to) = @_;
    move($from, $to)
        or die "pyFastqSplitter.pl: cannot move '$from' to '$to': $!\n";
}
<!-- Galaxy tool wrapper (pyCRAC/pyFastqSplitter.xml). The command runs the Perl helper
     pyFastqSplitter.pl, which calls pyFastqSplitter.py and moves the two files it produces
     onto the Galaxy output datasets out1 and out2. -->
<tool id="pyFastqSplitter" name="pyFastqSplitter" force_history_refresh="True">
  <requirements>
    <requirement type="package">pyCRAC</requirement>
  </requirements>
  <command interpreter="perl">
    pyFastqSplitter.pl
    -f $ftype.f
    --o1 $out1
    --id $out1.id
    --o2 $out2
    --file_type $ftype.type
    #if $joinc.ch == "-c":
    -c '$joinc.c'
    #end if#
  </command>
  <version_command>/usr/local/bin/pyFastqSplitter.py --version</version_command>
  <inputs>
    <!-- The input file f is declared inside this conditional, so the command refers to it
         as ftype.f. -->
    <conditional name="ftype">
      <param name="type" type="select" label="File type">
        <option value="fastq" selected="true">FASTQ</option>
        <option value="fasta">FASTA</option>
      </param>
      <when value="fastq">
        <param format="fastq" name="f" type="data" label="FastQ File -f" help="FastQ format" />
      </when>
      <when value="fasta">
        <param format="fasta" name="f" type="data" label="FastA File -f" help="FastA format" />
      </when>
    </conditional>
    <conditional name="joinc">
      <param name="ch" type="select" label="Split the reads on a separator character">
        <option value="" selected="true">NO</option>
        <option value="-c">YES</option>
      </param>
      <when value="-c">
        <!-- Free text: it is single-quoted in the command so the shell never interprets it. -->
        <param type="text" name="c" label="Split the reads on the -c character" value=":" >
          <validator type="empty_field" message="enter a character or turn this option off" />
        </param>
      </when>
      <when value="">
      </when>
    </conditional>
    <!-- Only used to build the history labels; the temporary file prefix handed to the
         helper is the id of the first output dataset, not this free text. -->
    <param name="label" type="text" format="txt" size="30" value="pyFastqSplitter" label="Enter output file label -o" />
  </inputs>
  <outputs>
    <!-- change_format has to be a child of each data element it applies to. -->
    <data format="input" name="out1" label="${label.value}_1.${ftype.type}">
      <change_format>
        <when input="ftype.type" value="fasta" format="fasta" />
      </change_format>
    </data>
    <data format="input" name="out2" label="${label.value}_2.${ftype.type}">
      <change_format>
        <when input="ftype.type" value="fasta" format="fasta" />
      </change_format>
    </data>
  </outputs>
  <help>

.. class:: infomark

**pyFastqSplitter**

pyFastqSplitter is part of the pyCRAC_ package. Splits a merged fastq file (pyFastqJoiner output) into two files.

Example::

    Here the ":" character was used to separate the two sequences. By using the -c flag you can tell pyFastqSplitter where to split the sequences.
    This character is ignored by pyFastqDuplicateRemover


    @FCC102EACXX:3:1101:1343:2181#ATCACGAT/1##CAATAG@FCC102EACXX:3:1101:1343:2181#ATCACGAT/2
    CAAATTAGAGTGTTCAAAGCAGGCGTATTGCTCGAAT:AGCCTTTAAGTTTCAGCCTTGCGACCATACTCCCCCCAGAACCCAAAGA
    +
    `efhYb][bdQQ`eeaeaYbeY^ceU__IXa[^ZYaeYJaSJ`Z`K`YbSb[[daeJRR[YeWd_I^I^ecgc]OV\bdeaegbXb
    @FCC102EACXX:3:1101:1424:2248#ATCACGAT/1##CCAGGA@FCC102EACXX:3:1101:1424:2248#ATCACGAT/2
    CTAACCATAAACTATGCCTACTAGGGATCCAGAGGTG:AAGTCCTTTAAGTTACAGCCTTGCGACCATACTACACCCAGAACCCAAA
    +
    ^_adddhJbaehbedd`dIb_^cXaRI^BBBBBBBBBYJJ\`JQY\`KJ`gY[[QRYY[[`H[_ceI^e[PYO^IWOHW^eaefhh
    @FCC102EACXX:3:1101:1623:2036#ATCACGAN/1##CTCAGC@FCC102EACXX:3:1101:1623:2036#ATCACGAN/2
    CAAAGTTAGGGGATCGAAGATGATCAGATACCGTCGT:GGCCAATCCTTATTGTGTCTGGACCTGGTGAGTTTCCCCGTGTTGAGTC
    +
    bghfc^YbgbeadggfdffeaS^ac_X^cegaGZ_efPP\`ccQ`eY[bQQ[d`ghehaghfgdg[`gb^bd[ePbH^c_c\a_eg

    Result:

    Forward reaction:

    @FCC102EACXX:3:1101:1343:2181#ATCACGAT/1##CAATAG
    CAAATTAGAGTGTTCAAAGCAGGCGTATTGCTCGAAT
    +
    `efhYb][bdQQ`eeaeaYbeY^ceU__IXa[^ZYae
    @FCC102EACXX:3:1101:1424:2248#ATCACGAT/1##CCAGGA
    CTAACCATAAACTATGCCTACTAGGGATCCAGAGGTG
    +
    ^_adddhJbaehbedd`dIb_^cXaRI^BBBBBBBBB
    @FCC102EACXX:3:1101:1623:2036#ATCACGAN/1##CTCAGC
    CAAAGTTAGGGGATCGAAGATGATCAGATACCGTCGT
    +
    bghfc^YbgbeadggfdffeaS^ac_X^cegaGZ_ef
    @FCC102EACXX:3:1101:1574:2214#ATCACGAT/1##CGTTTT
    CTAATGACCCACTCGGCACCTTACGAAATCAAAGTCT
    +
    cdfgYY`cefhhZef\eaggXaceeghfQaeghWNW\

    Reverse reaction:

    @FCC102EACXX:3:1101:1343:2181#ATCACGAT/2
    AGCCTTTAAGTTTCAGCCTTGCGACCATACTCCCCCCAGAACCCAAAGA
    +
    YJaSJ`Z`K`YbSb[[daeJRR[YeWd_I^I^ecgc]OV\bdeaegbXb
    @FCC102EACXX:3:1101:1424:2248#ATCACGAT/2
    AAGTCCTTTAAGTTACAGCCTTGCGACCATACTACACCCAGAACCCAAA
    +
    YJJ\`JQY\`KJ`gY[[QRYY[[`H[_ceI^e[PYO^IWOHW^eaefhh
    @FCC102EACXX:3:1101:1623:2036#ATCACGAN/2
    GGCCAATCCTTATTGTGTCTGGACCTGGTGAGTTTCCCCGTGTTGAGTC
    +
    PP\`ccQ`eY[bQQ[d`ghehaghfgdg[`gb^bd[ePbH^c_c\a_eg

.. _pyCRAC: http://sandergranneman.bio.ed.ac.uk/Granneman_Lab/pyCRAC_software.html

------

**Parameter list**

Options::

  -f fastq_file, --filename=fastq_file
                        Provide the name of the joined fastq or fasta file
                        that has to be split. Default = standard input
  --file_type=FASTQ
                        Can split joined fasta and fastq files. Fastq is default
                        If there isn't a specific character splitting the two reads
                        the tool assumes that the two reads were of equal length
  -o splitfastq, --outfile=splitfastq
                        Provide the name of the output files (WITHOUT file
                        extension). By default the data will be printed to the
                        standard output
  -c :, --character=:
                        If the joined sequences were separated by a specific
                        character then the program can divide the sequences by
                        looking for that character

  </help>
</tool>
<!-- Galaxy tool wrapper (pyCRAC/pyGTF2bed.xml) for pyGTF2bed.py: converts a GTF file to bed6,
     optionally adding a UCSC track line. -->
<tool id="pyGTF2bed" name="pyGTF2bed">
  <description>converter</description>
  <requirements>
    <requirement type="package">pyCRAC</requirement>
  </requirements>
  <command interpreter="python">/usr/local/bin/pyGTF2bed.py --gtf $input -o $output
    #if $addtrack.track == "--track":
    --track
    --name '$addtrack.name'
    --description '$addtrack.description'
    #if $addtrack.colorscheme.colorsel == "default":
    -c $addtrack.colorscheme.color
    #else:
    -s '$addtrack.colorscheme.plus,$addtrack.colorscheme.minus'
    #end if#
    #end if#
  </command>
  <version_command>/usr/local/bin/pyGTF2bed.py --version</version_command>
  <inputs>
    <param name="input" type="data" format="gtf" label="GTF file --gtf"/>
    <conditional name="addtrack">
      <param name="track" type="select" label="Add UCSC track line to output --track">
        <option value="" selected="true">NO</option>
        <option value="--track">YES</option>
      </param>
      <when value=""/>
      <when value="--track">
        <!-- Free text that may contain spaces: single-quoted in the command above. -->
        <param name="name" format="txt" type="text" value="User_supplied_track" size="80" label="Track name -n"/>
        <param name="description" format="txt" type="text" value="User_supplied_track" size="80" label="Track description -d"/>
        <conditional name="colorscheme">
          <param name="colorsel" type="select" label="Colouring scheme">
            <option value="default" selected="true">One Colour</option>
            <option value="strand">By Strand</option>
          </param>
          <when value="default">
            <param name="color" type="select" label="Choose track colour -c">
              <option value="black" selected="true">Black</option>
              <option value="red">Red</option>
              <option value="blue">Blue</option>
              <option value="green">Green</option>
              <option value="purple">Purple</option>
            </param>
          </when>
          <when value="strand">
            <param name="plus" type="select" label="Choose forward strand track colour -s">
              <option value="black" selected="true">Black</option>
              <option value="red">Red</option>
              <option value="blue">Blue</option>
              <option value="green">Green</option>
              <option value="purple">Purple</option>
            </param>
            <param name="minus" type="select" label="Choose minus strand track colour -s">
              <option value="black" selected="true">Black</option>
              <option value="red">Red</option>
              <option value="blue">Blue</option>
              <option value="green">Green</option>
              <option value="purple">Purple</option>
            </param>
          </when>
        </conditional>
      </when>
    </conditional>
    <!-- Only used to build the history label of the output dataset. -->
    <param name="label" type="text" format="txt" size="30" value="pyGTF2bed" label="Enter output file label -o" />
  </inputs>
  <outputs>
    <data name="output" format="bed6" label="${label.value}.bed"/>
  </outputs>
  <help>

.. class:: infomark

**pyGTF2bed**

pyGTF2bed is part of the pyCRAC_ package. Converts GTF files to the bed 6 format. Gene names present in the GTF file will be included in the bed file.


.. _pyCRAC: http://sandergranneman.bio.ed.ac.uk/Granneman_Lab/pyCRAC_software.html

------

**Parameter list**

These options can be used to add and modify a track line for the UCSC genome browser::

  --track
                        Use this flag to add a UCSC genome browser track line
                        to the beginning of your file
  -n NAME, --name=NAME
                        For the UCSC track line: provide a track name. Default
                        = 'User_supplied_track'
  -d DESCRIPTION, --description=DESCRIPTION
                        For the UCSC track line: provide a track description.
                        Default = 'User_supplied_track'
  -c COLOR, --color=COLOR
                        select the track color. Default = black
  -s STRANDS, --colorstrands=STRANDS
                        select the colors for each strand. Default =
                        'red,blue'

File input options::

  --gtf=Yourfavoritegtf.gtf
                        type the path to the gtf file that you want to
                        convert. Default is standard input
  </help>
</tool>
#!/usr/bin/perl -w
# pyGTF2bedGraph.pl - Galaxy wrapper around pyCRAC's pyGTF2bedGraph.py.
#
# Runs the tool with "-o gb_<id>" (whitespace in the prefix replaced by "_")
# and then moves the two files it is expected to leave in the working
# directory onto the paths given by the caller:
#   gb_<id>_plus_strand.bedgraph   -> --po
#   gb_<id>_minus_strand.bedgraph  -> --mo
use strict;
use Getopt::Long;
use File::Copy qw(move);

my $TOOL = "/usr/local/bin/pyGTF2bedGraph.py";

my %opt;
GetOptions(\%opt, "gtf=s", "po=s", "version", "mo=s", "count=i", "chromfile=s",
           "t=s", "iCLIP", "track", "name=s", "description=s", "color=s",
           "s=s", "id=s");

# --version only prints the tool version; there are no output files to move.
if (exists $opt{version}) {
    run("python", $TOOL, "--version");
    exit 0;
}

# Fail early with a clear message instead of running a malformed command.
foreach my $name (qw(gtf chromfile t count id po mo)) {
    die "pyGTF2bedGraph.pl: missing required option --$name\n"
        unless defined $opt{$name};
}

my $prefix = "gb_$opt{id}";
$prefix =~ s/\s/_/g;

my @cmnd = ("python", $TOOL,
            "--gtf",       $opt{gtf},
            "--chromfile", $opt{chromfile},
            "-t",          $opt{t},
            "--count",     $opt{count},
            "-o",          $prefix);

push @cmnd, "--iCLIP" if exists $opt{iCLIP};

if (exists $opt{track}) {
    # An omitted name or description is passed as an empty string.
    my $name        = defined $opt{name}        ? $opt{name}        : "";
    my $description = defined $opt{description} ? $opt{description} : "";
    push @cmnd, "--track", "--name", $name, "--description", $description;
    push @cmnd, "--color", $opt{color} if exists $opt{color};
    push @cmnd, "-s",      $opt{s}     if exists $opt{s};
}

run(@cmnd);

relocate($prefix . "_plus_strand.bedgraph",  $opt{po});
relocate($prefix . "_minus_strand.bedgraph", $opt{mo});

# Run a command without a shell (list form, so track names containing spaces
# or shell metacharacters are passed through untouched) and abort when it
# does not exit successfully.
sub run {
    my @argv = @_;
    system(@argv) == 0
        or die "pyGTF2bedGraph.pl: command failed (status $?): @argv\n";
}

# Move a result file to its final location, aborting on failure.
sub relocate {
    my ($from, $to) = @_;
    move($from, $to)
        or die "pyGTF2bedGraph.pl: cannot move '$from' to '$to': $!\n";
}
<!-- Galaxy tool wrapper (pyCRAC/pyGTF2bedGraph.xml). The command runs the Perl helper
     pyGTF2bedGraph.pl, which calls pyGTF2bedGraph.py and moves the plus and minus strand
     bedgraph files onto the Galaxy output datasets po and mo. -->
<tool id="pyGTF2bedGraph" name="pyGTF2bedGraph">
  <description>converter</description>
  <requirements>
    <requirement type="package">pyCRAC</requirement>
  </requirements>
  <command interpreter="perl">pyGTF2bedGraph.pl --gtf $input --po $po --mo $mo
    --chromfile $addchr.chr
    -t $type
    --count $count
    $iclip
    #if $addtrack.track == "--track":
    --track
    --name '$addtrack.name'
    --description '$addtrack.description'
    #if $addtrack.colorscheme.colorsel == "default":
    --color $addtrack.colorscheme.color
    #else:
    -s '$addtrack.colorscheme.plus,$addtrack.colorscheme.minus'
    #end if#
    #end if#
    --id $po.id
  </command>
  <version_command>/usr/local/bin/pyGTF2bedGraph.py --version</version_command>
  <inputs>
    <param name="input" type="data" format="gtf" label="GTF file --gtf"/>
    <!-- The chromosome length file comes from the pycrac_chr data table or from the history. -->
    <conditional name="addchr">
      <param name="chrfile" type="select" label="Choose Chromosome length file from">
        <option value="default" selected="true">Defaults</option>
        <option value="other">History</option>
      </param>
      <when value="default">
        <param name="chr" type="select" label="Chromosome length file -c" help="This file should have two columns: first column is the names of the chromosomes, second column is length of the chromosomes. Use pyCrac utility pyCalculateChromosomeLengths to create.">
          <options from_data_table="pycrac_chr"/>
        </param>
      </when>
      <when value="other">
        <param format="tabular" name="chr" type="data" label="Chromosome length file -c" help="This file should have two columns: first column is the names of the chromosomes, second column is length of the chromosomes"/>
      </when>
    </conditional>
    <param name="type" type="select" label="Choose type of data -t">
      <option value="reads" selected="true">Reads</option>
      <option value="substitutions">Substitutions</option>
      <option value="deletions">Deletions</option>
    </param>
    <param format="integer" name="count" type="integer" label="Count per feature --count " value="1" size="5" help="Takes the numbers in the 'score' column of the GTF file as the total number of reads for each position" >
      <validator type="in_range" min="1" message="Please enter a value >= 1"/>
    </param>
    <!-- The selected value is placed verbatim on the command line: empty, or the iCLIP flag. -->
    <param name="iclip" type="select" label="iCLIP mode --iCLIP">
      <option value="" selected="true">OFF</option>
      <option value="--iCLIP">ON</option>
    </param>
    <conditional name="addtrack">
      <param name="track" type="select" label="Add UCSC track line to output">
        <option value="" selected="true">NO</option>
        <option value="--track">YES</option>
      </param>
      <when value=""/>
      <when value="--track">
        <!-- Free text that may contain spaces: single-quoted in the command above. -->
        <param name="name" format="txt" type="text" value="User_supplied_track" size="80" label="Track name"/>
        <param name="description" format="txt" type="text" value="User_supplied_track" size="80" label="Track description"/>
        <conditional name="colorscheme">
          <param name="colorsel" type="select" label="Colouring scheme">
            <option value="default" selected="true">One Colour</option>
            <option value="strand">By Strand</option>
          </param>
          <when value="default">
            <param name="color" type="select" label="Choose track colour">
              <option value="black" selected="true">Black</option>
              <option value="red">Red</option>
              <option value="blue">Blue</option>
              <option value="green">Green</option>
              <option value="purple">Purple</option>
            </param>
          </when>
          <when value="strand">
            <param name="plus" type="select" label="Choose forward strand track colour">
              <option value="black" selected="true">Black</option>
              <option value="red">Red</option>
              <option value="blue">Blue</option>
              <option value="green">Green</option>
              <option value="purple">Purple</option>
            </param>
            <param name="minus" type="select" label="Choose minus strand track colour">
              <option value="black" selected="true">Black</option>
              <option value="red">Red</option>
              <option value="blue">Blue</option>
              <option value="green">Green</option>
              <option value="purple">Purple</option>
            </param>
          </when>
        </conditional>
      </when>
    </conditional>
    <!-- Only used to build the history labels of the output datasets. -->
    <param name="label" type="text" format="txt" size="30" value="pyGTF2bedGraph" label="Enter output file label -o" />
  </inputs>
  <outputs>
    <data name="po" format="bedgraph" label="${label.value}_plus_strand.bg"/>
    <data name="mo" format="bedgraph" label="${label.value}_minus_strand.bg"/>
  </outputs>
  <help>

.. class:: infomark

**pyGTF2bedGraph**

pyGTF2bedGraph is part of the pyCRAC_ package. Generates bedgraph files for each chromosome. An homage to bedtools genomecoverage. Takes a pyReadCounters GTF file as input file. Can also output bedGraph files for substitutions and deletions.

.. _pyCRAC: http://sandergranneman.bio.ed.ac.uk/Granneman_Lab/pyCRAC_software.html

------

**Parameter list**

File input options::

  --gtf=readdata.gtf
                        type the path to the gtf data file. By default it
                        expects data from the standard input
  -o converted
                        provide a name for an output file. A file extension or
                        strand information is not necessary.
  -c yeast.txt, --chromfile=yeast.txt
                        Location of the chromosome info file. This file should
                        have two columns: first column is the names of the
                        chromosomes, second column is length of the
                        chromosomes. Default is yeast
  -t TYPE, --type=TYPE
                        this tool can generate bedGraph files for reads,
                        substitutions or deletions. Please use
                        'reads','substitutions' or 'deletions' to indicate the
                        type of data. Default='reads'
  --count
                        Takes the numbers in the 'score' column of the GTF
                        file as the total number of reads for each position.
                        Default is 1 for each interval.
  --iCLIP
                        This turns on the iCLIP mode and the sgr reads or cDNA
                        files will report cross-linking site frequencies in
                        iCLIP data
  -v, --verbose
                        to print status messages to a log file

These options can be used to add a track line for the UCSC genome browser::

  --track
                        Use this flag to add a UCSC genome browser track line
                        to the beginning of your file
  -n NAME, --name=NAME
                        For the UCSC track line: provide a track name. Default
                        = 'User_supplied_track'
  -d DESCRIPTION, --description=DESCRIPTION
                        For the UCSC track line: provide a track description.
                        Default = 'User_supplied_track'
  --color=COLOR
                        select the track color. Default = black
  -s STRANDS, --colorstrands=STRANDS
                        select the colors for each strand. Default =
                        'red,blue'

  </help>
</tool>
<!-- Galaxy tool wrapper (pyCRAC/pyGetGTFSources.xml) for pyGetGTFSources.py: lists the source
     names found in a GTF file, optionally with the number of times each one occurs. -->
<tool id="pyGetGTFSources" name="pyGetGTFSources">
  <requirements>
    <requirement type="package">pyCRAC</requirement>
  </requirements>
  <command interpreter="python">
    /usr/local/bin/pyGetGTFSources.py --gtf $addGTF.gtf $count -o $out
  </command>
  <version_command>/usr/local/bin/pyGetGTFSources.py --version</version_command>
  <inputs>
    <!-- The GTF is taken either from the pycrac_gtf data table or from the history. -->
    <conditional name="addGTF">
      <param name="gtfFile" type="select" label="Choose GTF File from">
        <option value="default" selected="true">Defaults</option>
        <option value="other">History</option>
      </param>
      <when value="default">
        <param name="gtf" type="select" label="GTF File --gtf" help="GTF file containing gene ID co-ordinates">
          <options from_data_table="pycrac_gtf"/>
        </param>
      </when>
      <when value="other">
        <param format="gtf" name="gtf" type="data" label="GTF File --gtf" help="GTF file containing gene ID co-ordinates"/>
      </when>
    </conditional>
    <!-- The selected value is placed verbatim on the command line: empty, or the count flag.
         The command previously hard-coded the flag, so this choice had no effect. -->
    <param name="count" type="select" label="Count occurrences of each annotation --count">
      <option value="" selected="true">No</option>
      <option value="--count">Yes</option>
    </param>
    <!-- Only used to build the history label of the output dataset. -->
    <param name="label" type="text" format="txt" size="30" value="GTF sources list" label="Enter output file label -o" />
  </inputs>

  <outputs>
    <data format="tabular" name="out" label="${label.value}.txt"/>
  </outputs>
  <help>
.. class:: infomark

**pyGetGTFSources**

pyGetGTFSources is part of the pyCRAC_ package. Extracts source names from the second column in a GTF file.


.. _pyCRAC: http://sandergranneman.bio.ed.ac.uk/Granneman_Lab/pyCRAC_software.html

------

**Parameter list**

Options::

  --gtf=Yourfavoritegtf.gtf
                        type the path to the gtf file that you want to use. By
                        default it expects data from the standard input
  -o OUTFILE, --outfile=OUTFILE
                        type the name and path of the file you want to write
                        the output to. Default is standard output
  --count               with this flag the program will count the
                        occurrences of each source/annotation in the gtf file




  </help>
</tool>
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/pyCRAC/pyGetGeneNamesFromGTF.xml Tue Jun 18 09:11:00 2013 -0400 @@ -0,0 +1,71 @@ + <tool id ="pyGetGeneNamesFromGTF" name="pyGetGeneNamesFromGTF"> + <requirements> + <requirement type="package">pyCRAC</requirement> + </requirements> + <command interpreter="python"> + /usr/local/bin/pyGetGeneNamesFromGTF.py --gtf $addGTF.gtf --attribute $attribute $count -o $out + </command> + <version_command>/usr/local/bin/pyGetGeneNamesFromGTF.py --version</version_command> + <inputs> + <conditional name="addGTF"> + <param name="gtfFile" type="select" label="Choose GTF File from"> + <option value="default" selected="true">Defaults</option> + <option value="other">History</option> + </param> + <when value="default"> + <param name="gtf" type="select" label="GTF File --gtf" help="GTF file containing gene ID co-ordinates"> + <options from_data_table="pycrac_gtf"/> + </param> + </when> + <when value="other"> + <param format="GTF" name="gtf" type="data" label="GTF File --gtf" help="GTF file containing gene ID co-ordinates"/> + </when> + </conditional> + <param name="attribute" type="select" label="Select the attribute to extract names from --attribute"> + <option value="gene_name" selected="true">gene_name</option> + <option value="gene_id">gene_id</option> + <option value="transcript_name">transcript_name</option> + <option value="transcript_id">transcript_id</option> + </param> + <param name="count" type="select" label="Count occurences of each annotation --count"> + <option value="" selected="true">No</option> + <option value="--count">Yes</option> + </param> + <param name="label" type="text" format="txt" size="30" value="GTF gene list" label="Enter output file label -o" /> + </inputs> + + <outputs> + <data format="tabular" name="out" label="${label.value}.txt"/> + </outputs> + <help> +.. class:: infomark + +**pyGetGeneNamesFromGTF** + +pyGetGeneNamesFromGTF is part of the pyCRAC_ package. 
Extracts and counts all gene names from a GTF file. + +.. _pyCRAC: http://sandergranneman.bio.ed.ac.uk/Granneman_Lab/pyCRAC_software.html + +------ + +**Parameter list** + +Options:: + + --gtf=Yourfavoritegtf.gtf + type the path to the gtf file that you want to use. By + default it expects data from the standard input. + -o OUTFILE, --outfile=OUTFILE + type the name and path of the file you want to write + the output to. Default is standard output + -a ATTRIBUTE, --attribute=ATTRIBUTE + from which attribute do you want to extract names? + Choices: gene_name, gene_id, transcript_name, + transcript_id + --count + with this flag the program will count the + occurrences of each source/annotation in the gtf file + + + </help> +</tool>
#!/usr/bin/perl -w
# pyCRAC/pyMotif.pl -- Galaxy wrapper around pyMotif.py.
#
# Runs pyMotif.py with an output prefix derived from --id, then moves the four
# result files that pyMotif.py writes under that prefix to the dataset paths
# given by --count, --features, --zscores and --random.
#
# With --version it only prints the pyMotif.py version and exits.
use strict;
use Getopt::Long;
use File::Copy qw(move);

my $PYMOTIF = "/usr/local/bin/pyMotif.py";

# Run a command WITHOUT going through a shell (list-form system), so option
# values containing spaces or shell metacharacters are passed through as-is.
# Returns a shell-style exit code: the child's exit status, 128+signal if it
# was killed by a signal, or 127 if it could not be started at all.
sub run {
    my @argv = @_;
    my $rc = system(@argv);
    if ($rc == -1) {
        warn "pyMotif.pl: cannot execute $argv[0]: $!\n";
        return 127;
    }
    return 128 + ($rc & 127) if $rc & 127;
    return $rc >> 8;
}

my %opt;

# "o" is accepted because the tool XML passes it, but it is not used: the
# output prefix handed to pyMotif.py is always built from --id (see below).
GetOptions(
    \%opt,
    "f=s", "version", "gtf=s", "range=i", "overlap=i", "annotation=s",
    "tab=s", "k_min=i", "k_max=i", "numberofkmers=i", "count=s",
    "features=s", "zscores=s", "random=s", "options", "o=s", "id=s",
) or die "pyMotif.pl: invalid command line options\n";

# --version produces no result files, so there is nothing to collect:
# report the version and stop with pyMotif.py's own exit status.
if (exists $opt{version}) {
    exit run("python", $PYMOTIF, "--version");
}

# Everything below needs these values; fail early with a clear message rather
# than handing pyMotif.py a command line with empty arguments.
for my $required (qw(f gtf tab annotation id count features zscores random)) {
    die "pyMotif.pl: missing required option '$required'\n"
        unless defined $opt{$required};
}

my $prefix = "m_$opt{id}";

my @cmnd = (
    "python", $PYMOTIF,
    "-f",           $opt{f},
    "--gtf",        $opt{gtf},
    "--tab",        $opt{tab},
    "--annotation", $opt{annotation},
    "-o",           $prefix,
);

# The tuning options are only forwarded when the caller passed --options.
if (exists $opt{options}) {
    for my $name (qw(range overlap k_min k_max numberofkmers)) {
        push @cmnd, "--$name=$opt{$name}" if defined $opt{$name};
    }
}

my $status = run(@cmnd);
die "pyMotif.pl: pyMotif.py exited with status $status\n" if $status != 0;

# File name suffix written by pyMotif.py => destination path for that result.
my %destination_for = (
    "data_k-mers_count.txt"      => $opt{count},
    "top_k-mers_in_features.gtf" => $opt{features},
    "k-mer_Z_scores.txt"         => $opt{zscores},
    "random_k-mers_count.txt"    => $opt{random},
);

# Try to move every result even if one is missing, so that as many outputs as
# possible are delivered; any failure still makes the wrapper exit non-zero.
my $failed = 0;
for my $suffix (sort keys %destination_for) {
    my $produced = "${prefix}_$opt{annotation}_$suffix";
    unless (move($produced, $destination_for{$suffix})) {
        warn "pyMotif.pl: cannot move $produced to $destination_for{$suffix}: $!\n";
        $failed = 1;
    }
}

exit($failed ? 1 : 0);
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/pyCRAC/pyMotif.xml Tue Jun 18 09:11:00 2013 -0400 @@ -0,0 +1,222 @@ + <tool id ="pyMotif" name="pyMotif"> + <requirements> + <requirement type="package">pyCRAC</requirement> + </requirements> + <command interpreter="perl"> + pyMotif.pl + -f $input + --gtf=$addGTF.gtf + + #if $addGTF.gtfFile == "default" and $addGTF.annotate.annotations == "auto": + --annotation $addGTF.annotate.scan.annotation + #else: + --annotation $addGTF.annotate.annotation + #end if# + + --tab=$addTab.tab + + #if $addOpt.options == "edit": + --options + --k_min $addOpt.kmin + --k_max $addOpt.kmax + --numberofkmers=$addOpt.numberofkmers + --overlap $addOpt.overlap + --range $addOpt.range + #end if# + -o "$input.name" + --id $count.id + --count $count + --random $random + --features $features + --zscores $zscores + </command> + <version_command>/usr/local/bin/pyMotif.py --version</version_command> + <inputs> + <param format="gtf" name="input" type="data" label="Input File --input_file" help="File of type .gtf" /> + <conditional name="addTab"> + <param name="tabFile" type="select" label="Choose Genomic Reference Sequence from"> + <option value="default" selected="true">Defaults</option> + <option value="other">History</option> + </param> + <when value="default"> + <param name="tab" type="select" label="Genomic Reference Sequence --tab" help="Tab file containing genomic reference sequence"> + <options from_data_table="pycrac_tab"/> + </param> + </when> + <when value="other"> + <param format="tabular" name="tab" type="data" label="Genomic Reference Sequence --tab" help="Tab file containing genomic reference sequence"/> + </when> + </conditional> + <conditional name="addGTF"> + <param name="gtfFile" type="select" label="Choose GTF File from"> + <option value="default" selected="true">Defaults</option> + <option value="other">History</option> + </param> + <when value="default"> + <param name="gtf" type="select" label="GTF File --gtf" help="GTF file 
containing gene ID co-ordinates"> + <options from_data_table="pycrac_gtf"/> + </param> + + <conditional name="annotate"> + <param name="annotations" type="select" label="Select annotation"> + <option value="all" selected="true">All</option> + <option value="manual">Enter in text box</option> + <option value="auto">Scan pyGetGTFSources file</option> + </param> + <when value="all"> + <param name="annotation" type="hidden" format="txt" size="10" value="all"/> + </when> + <when value="manual"> + <param name="annotation" type="text" format="txt" size="100" value="protein_coding" label="Select which annotation to focus search on --annotation" help="To find a list of available annotations please use pyGetGTFSources tool"> + <validator type="empty_field" message="Please enter a value"/> + </param> + </when> + <when value="auto"> + <param format="tabular" name="gtf_annotation" type="data" label="GTF annotation File (pyGetGTFSources output)" help="Tabular file containing unique list of annotations/sources in selected GTF file. 
Refer to pyGetGTFSources"/> + <conditional name="scan"> + <param name="annotations" type="select" label="Scan this file for annotations" help="Choose the correct GTF file then choose GO"> + <option value="wait" selected="true">Waiting</option> + <option value="scanning">Go</option> + </param> + <when value="wait"> + </when> + <when value="scanning"> + <param name="annotation" type="select" multiple="false" label="Select which annotation to focus search on --annotation"> + <options from_dataset="gtf_annotation"> + <column name="name" index="0"/> + <column name="value" index="0"/> + </options> + </param> + </when> + </conditional> + </when> + </conditional> + </when> + <when value="other"> + <param format="gtf" name="gtf" type="data" label="GTF File --gtf" help="GTF file containing gene ID co-ordinates"/> + <conditional name="annotate"> + <param name="annotations" type="select" label="Select annotation"> + <option value="all" selected="true">All</option> + <option value="manual">Enter in text box</option> + <option value="auto">Scan selected file</option> + </param> + <when value="all"> + <param name="annotation" type="hidden" format="txt" size="10" value="all"/> + </when> + <when value="manual"> + <param name="annotation" type="text" format="txt" size="100" value="protein_coding" label="Select which annotation to focus search on --annotation" help="To find a list of available annotations please use pyGetGTFSources tool"> + <validator type="empty_field" message="Please enter a value"/> + </param> + </when> + <when value="auto"> + <param name="annotation" type="select" multiple="false" label="Select which annotation to focus search on --annotation"> + <options from_dataset="gtf"> + <column name="name" index="1"/> + <column name="value" index="1"/> + <filter type="unique_value" name="unique" column="1"/> + </options> + </param> + </when> + </conditional> + </when> + </conditional> + <conditional name="addOpt"> + <param name="options" type="select" label="Standard 
options"> + <option value="default" selected="true">Default</option> + <option value="edit">Edit</option> + </param> + <when value="edit"> + <param format="integer" name="kmin" type="integer" label="Minimum k-mer Length --k_min " value="4" size="6" help="Set the minimal k-mer length"> + <validator type="in_range" min="1" message="Please enter a value >= 1"/> + </param> + <param format="integer" name="kmax" type="integer" label="Maximum k-mer Length --k_max " value="8" size="6" help="Set the maximal k-mer length"> + <validator type="in_range" min="0" message="Please enter a value >= 0"/> + </param> + <param format="integer" name="numberofkmers" type="integer" label="Maximum number of k-mers in output file --numberofkmers" value="1000" size="6" help="Set the maximum number of k-mers in output"> + <validator type="in_range" min="0" message="Please enter a value >= 0"/> + </param> + <param format="integer" name="range" type="integer" label="Range --range" value="0" size="5" help="Manually set the length of the 5' and 3' UTRs 0>50000"> + <validator type="in_range" min="0" max="50000" message="Please enter a value between 0 and 50000"/> + </param> + <param format="integer" name="overlap" type="integer" label="Overlap --overlap" value="1" size="5" help="Sets the number of nucleotides a read has to overlap with a gene before it is considered a hit. 
"> + <validator type="in_range" min="1" message="Please enter a positive integer"/> + </param> + </when> + <when value="default"> + </when> + </conditional> + <param name="label" type="text" format="txt" size="30" value="pyMotif" label="Enter output file label -o" /> + </inputs> + + <outputs> + <data format="tabular" name="zscores" label="${label.value}_k-mer_Z_scores.txt"/> + <data format="tabular" name="count" label="${label.value}_data_k-mers_count.txt"/> + <data format="gtf" name="features" label="${label.value}_top_k-mers_in_features.gtf"/> + <data format="tabular" name="random" label="${label.value}_random_k-mers_count.txt"/> + </outputs> + <help> + +.. class:: infomark + +**pyMotif** + +pyMotif is part of the pyCRAC_ package. Looks for enriched sequence motifs in high-throughput sequencing data. Produces a GTF type output file +with coordinates and Z-scores for enriched motifs. The GTF file can be visualised in genome browsers. + +.. _pyCRAC: http://sandergranneman.bio.ed.ac.uk/Granneman_Lab/pyCRAC_software.html + +------ + +**Parameter list** + +File input options:: + + -f intervals.gtf, --input_file=intervals.gtf + Provide the path to an interval gtf file. By default + it expects data from the standard input. + -o OUTPUT_FILE, --output_file=OUTPUT_FILE + Use this flag to override the standard file names. Do + NOT add an extension. + --gtf=annotation_file.gtf + type the path to the gtf annotation file that you want + to use + --tab=tab_file.tab + type the path to the tab file that contains the + genomic reference sequence + +pyMotif specific options:: + + --k_min=4 + this option allows you to set the shortest k-mer + length. Default = 4. + --k_max=6 + this option allows you to set the longest k-mer + length. Default = 8. + -n 100, --numberofkmers=100 + choose the maximum number of enriched k-mer sequences + you want to have reported in output files. 
Default = + 1000 + +pyCRAC common options:: + + -a protein_coding, --annotation=protein_coding + select which annotation (i.e. protein_coding, ncRNA, + sRNA, rRNA,snoRNA,snRNA, depending on the source of + your GTF file) you would like to focus your search on. + Default = all annotations + -r 100, --range=100 + allows you to add regions flanking the genomic + feature. If you set '-r 50' or '--range=50', then the + program will add 50 nucleotides to each feature on + each side regardless of whether the GTF file has genes + with annotated UTRs. + --overlap=1 + sets the number of nucleotides a motif has to overlap + with a genomic feature before it is considered a hit. + Default = 1 nucleotide + + + + + </help> +</tool>
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/pyCRAC/pyPileup.xml Tue Jun 18 09:11:00 2013 -0400 @@ -0,0 +1,384 @@ +<?xml version="1.0" encoding="utf-8"?> + <tool id ="pyPileup" name="pyPileup"> + <requirements> + <requirement type="package">pyCRAC</requirement> + </requirements> + <command interpreter="python"> + /usr/local/bin/pyPileup.py + -f $ftype.input + --file_type $ftype.file_type + #if $geneOpt.alignGene == "gene": + -g $geneOpt.genes + #end if# + #if $geneOpt.alignGene == "chr": + --chr $geneOpt.chr + #end if# + #if ($ftype.file_type == "novo" or $ftype.file_type == "sam") and $ftype.disc.discard == "discard": + --discarded $discarded + #end if# + --gtf=$addGTF.gtf + --tab=$addTab.tab + #if ($ftype.file_type == "novo" or $ftype.file_type == "sam") and $ftype.addAlignOpt.alignoptions == "edit": + --align_quality=$ftype.addAlignOpt.align_quality + --align_score=$ftype.addAlignOpt.align_score + --distance=$ftype.addAlignOpt.d + --length=$ftype.addAlignOpt.length + #if int($ftype.addAlignOpt.max) > 0: + --max=$ftype.addAlignOpt.max + #end if# + $ftype.addAlignOpt.unique + $ftype.addAlignOpt.blocks + $ftype.addAlignOpt.mutations + #if $ftype.disc.discard == "--discarded": + --discarded $discarded + #end if# + #end if# + #if $addOpt.options == "edit": + --range=$addOpt.range + --overlap=$addOpt.overlap + $addOpt.iclip + $addOpt.ignore + -s $addOpt.sequence + #if int($addOpt.limit) > 0: + --limit=$addOpt.limit + #end if# + #end if# + -o $output + </command> + <version_command>/usr/local/bin/pyPileup.py --version</version_command> + <inputs> + + + <conditional name="geneOpt"> + <param name="alignGene" type="select" label="Do you want to align reads to genes or chromosome co-ordinates?"> + <option value="gene" selected="true">Genes</option> + <option value="chr">Chromosome Co-ordinates</option> + </param> + <when value="chr"> + <param format="interval" name="chr" type="data" label="Choose a Chromosome Coordinate File" help="Tab delimited text file containing 
an identifier, chromosome name, start position, end position and strand ('-' or '+')"/> + </when> + <when value="gene"> + <param format="txt" name="genes" type="data" label="Choose a Gene List -g" help="Single column gene ID file"/> + </when> + </conditional> + <conditional name="addGTF"> + <param name="gtfFile" type="select" label="Choose GTF File from"> + <option value="default" selected="true">Defaults</option> + <option value="other">History</option> + </param> + <when value="default"> + <param name="gtf" type="select" label="GTF File --gtf" help="GTF file containing gene ID co-ordinates"> + <options from_data_table="pycrac_gtf"/> + </param> + </when> + <when value="other"> + <param format="GTF" name="gtf" type="data" label="GTF File --gtf" help="GTF file containing gene ID co-ordinates"/> + </when> + </conditional> + <conditional name="addTab"> + <param name="tabFile" type="select" label="Choose Genomic Reference Sequence from"> + <option value="default" selected="true">Defaults</option> + <option value="other">History</option> + </param> + <when value="default"> + <param name="tab" type="select" label="Genomic Reference Sequence --tab" help="Tab file containing genomic reference sequence"> + <options from_data_table="pycrac_tab"/> + </param> + </when> + <when value="other"> + <param format="tabular" name="tab" type="data" label="Genomic Reference Sequence --tab" help="Tab file containing genomic reference sequence"/> + </when> + </conditional> + + + <conditional name="ftype"> + <param name="file_type" type="select" label="Input File Type --file_type"> + <option value="novo" selected="true">Novo</option> + <option value="sam">Sam/BAM</option> + <option value="gtf">GTF</option> + </param> + <when value="sam"> + <param format="sam,bam" name="input" type="data" label="Input File -f" help="Alignment file of type .sam or .bam" /> + <conditional name="disc"> + <param name="discard" type="select" label="Print discarded reads to a separate file"> + <option value="" 
selected="true">OFF</option> + <option value="discard">ON</option> + </param> + <when value="discard"> + </when> + <when value=""> + </when> + </conditional> + <conditional name="addAlignOpt"> + <param name="alignoptions" type="select" label="Alignment Options"> + <option value="default" selected="true">Default</option> + <option value="edit">Edit</option> + </param> + <when value="edit"> + <param name="mutations" type="select" label="Filter reads by mutations --mutations" help="cross-linking sites are often highlighted by deletions and/or substitutions in the reads. You can use this option to filter reads based on whether they have mutations or not."> + <option value="" selected="true">Off</option> + <option value="--mutations=delsonly">deletions</option> + <option value="--mutations=subsonly">substitutions</option> + <option value="--mutations=TC">T->C mutations</option> + <option value="--mutations=allmuts">all mutations</option> + <option value="--mutations=nomuts">no mutations</option> + </param> + <param format="integer" name="align_quality" type="integer" label="Align Quality --align_quality " value="0" size="5" > + <validator type="in_range" min="0" message="Please enter a value >= 0"/> + </param> + <param format="integer" name="align_score" type="integer" label="Align Score --align_score " value="0" size="5" > + <validator type="in_range" min="0" message="Please enter a value >= 0"/> + </param> + <param format="integer" name="max" type="integer" label="Mapped reads to read from input file --max" help="Set to 0 to align all reads." 
value="0" size="10" > + <validator type="in_range" min="0" max="100000000" message="Please enter a value between 1 and 100000000 or 0 to align all reads"/> + </param> + <param format="integer" name="d" type="integer" label="Distance --distance " value="1000" size="6" help="Set the maximum number of bp allowed between two non-overlapping paired reads"> + <validator type="in_range" min="1" message="Please enter a value >= 1"/> + </param> + <param format="integer" name="length" type="integer" label="Set the maximum length of reads --length" value="1000" size="7" help="Set the read length threshold between 15 and 1000"> + <validator type="in_range" min="15" max="1000" message="Please enter a value between 15 and 1000"/> + </param> + <param name="unique" type="select" label="Remove reads with multiple alignment locations --unique"> + <option value="" selected="true">OFF</option> + <option value="--unique">ON</option> + </param> + <param name="blocks" type="select" label="Only count reads with same start and end coords once --blocks"> + <option value="" selected="true">OFF</option> + <option value="--blocks">ON</option> + </param> + </when> + <when value="default"> + </when> + </conditional> + </when> + <when value="novo"> + <param format="tabular" name="input" type="data" label="Input File -f" help="Alignment file of type .novo" /> + <conditional name="disc"> + <param name="discard" type="select" label="Print discarded reads to a separate file"> + <option value="" selected="true">OFF</option> + <option value="discard">ON</option> + </param> + <when value="discard"> + </when> + <when value=""> + </when> + </conditional> + <conditional name="addAlignOpt"> + <param name="alignoptions" type="select" label="Alignment Options"> + <option value="default" selected="true">Default</option> + <option value="edit">Edit</option> + </param> + <when value="edit"> + <param name="mutations" type="select" label="Filter reads by mutations --mutations" help="cross-linking sites are often + 
highlighted by deletions and/or substitutions in the reads. You can use this option to filter reads based on whether they have mutations or not."> + <option value="" selected="true">Off</option> + <option value="--mutations=delsonly">deletions</option> + <option value="--mutations=subsonly">substitutions</option> + <option value="--mutations=TC">T->C mutations</option> + <option value="--mutations=allmuts">all mutations</option> + <option value="--mutations=nomuts">no mutations</option> + </param> + <param format="integer" name="align_quality" type="integer" label="Align Quality --align_quality " value="0" size="5" > + <validator type="in_range" min="0" message="Please enter a value >= 0"/> + </param> + <param format="integer" name="align_score" type="integer" label="Align Score --align_score " value="0" size="5" > + <validator type="in_range" min="0" message="Please enter a value >= 0"/> + </param> + <param format="integer" name="max" type="integer" label="Mapped reads to read from input file --max" help="Set to 0 to align all reads." 
value="0" size="10" > + <validator type="in_range" min="0" max="100000000" message="Please enter a value between 1 and 100000000 or 0 to align all reads"/> + </param> + <param format="integer" name="d" type="integer" label="Distance --distance " value="1000" size="6" help="Set the maximum number of bp allowed between two non-overlapping paired reads"> + <validator type="in_range" min="1" message="Please enter a value >= 1"/> + </param> + <param format="integer" name="length" type="integer" label="Set the maximum length of reads --length" value="1000" size="7" help="Set the read length threshold between 15 and 1000"> + <validator type="in_range" min="15" max="1000" message="Please enter a value between 15 and 1000"/> + </param> + <param name="unique" type="select" label="Remove reads with multiple alignment locations --unique"> + <option value="" selected="true">OFF</option> + <option value="--unique">ON</option> + </param> + <param name="blocks" type="select" label="Only count reads with same start and end coords once --blocks"> + <option value="" selected="true">OFF</option> + <option value="--blocks">ON</option> + </param> + </when> + <when value="default"> + </when> + </conditional> + </when> + <when value="gtf"> + <param format="gtf" name="input" type="data" label="Input File -f" help="File of type .gtf" /> + </when> + </conditional> + + <conditional name="addOpt"> + <param name="options" type="select" label="Standard Options"> + <option value="default" selected="true">Default</option> + <option value="edit">Edit</option> + </param> + <when value="edit"> + <param format="integer" name="range" type="integer" label="Range --range" value="0" size="5" help="Manually set the length of the 5' and 3' UTRs 0>50000"> + <validator type="in_range" min="0" max="50000" message="Please enter a value between 0 and 50000"/> + </param> + <param name="ignore" type="select" label="Ignore strand information? 
--ignorestrand"> + <option value="" selected="true">No</option> + <option value="--ignorestrand">Yes</option> + </param> + <param format="integer" name="overlap" type="integer" label="Overlap --overlap" value="1" size="5" help="Sets the number of nucleotides a read has to overlap with a gene before it is considered a hit. "> + <validator type="in_range" min="1" message="Please enter a positive integer"/> + </param> + <param name="sequence" type="select" label="Align reads to --sequence"> + <option value="genomic" selected="true">Genomic Sequence</option> + <option value="coding">Coding Sequence</option> + </param> + <param name="iclip" type="select" label="iCLIP mode --iCLIP"> + <option value="" selected="true">OFF</option> + <option value="--iCLIP">ON</option> + </param> + <param format="integer" name="limit" type="integer" label="Limit number of reads to count that map to a particular region --limit" value="0" size="15" help="Set to 0 for unlimited reads" > + <validator type="in_range" min="0" message="Please enter a value greater than 1 or set to 0 for unlimited reads"/> + </param> + </when> + <when value="default"> + </when> + </conditional> + <param name="label" type="text" format="txt" size="30" value="pyPileup" label="Enter output file label -o" /> + </inputs> + <outputs> + <data format="tabular" name="output" label="${label.value}.pileup"/> + <data format="txt" name="discarded" label="${label.value}_discarded.txt"> + <filter>(ftype['file_type'] == "novo" or ftype['file_type'] == "sam") and ftype['disc']['discard'] == "discard"</filter> + </data> + </outputs> + <help> + + +.. class:: infomark + +**pyPileup** + +pyPileup is part of the pyCRAC_ package. Produces pileups containing the number of hits, substitutions and deletions for each nucleotide covered by +reads in specific genes or genomic regions + +.. 
_pyCRAC: http://sandergranneman.bio.ed.ac.uk/Granneman_Lab/pyCRAC_software.html + +------ + +**Parameter list** + +File input options:: + + -f FILE, --input_file=FILE + As input files you can use Novoalign native output, + SAM, pyMotif or pyReadCounters GTF files as input + file. By default it expects data from the standard + input. Make sure to specify the file type of the file + you want to have analyzed using the --file_type + option! + -o OUTPUT_FILE, --output_file=OUTPUT_FILE + Use this flag to override the standard output file + names. All pileups will be written to one output file. + -g FILE, --genes_file=FILE + here you need to type in the name of your gene list + file (1 column) or the hittable file + --chr=FILE + if you simply would like to align reads against a + genomic sequence you should generate a tab delimited + file containing an identifier, chromosome name, start + position, end position and strand + --gtf=annotation_file.gtf + type the path to the gtf annotation file that you want + to use + --tab=tab_file.tab + type the path to the tab file that contains the + genomic reference sequence + --file_type=FILE_TYPE + use this option to specify the file type (i.e. 'novo', + 'sam', 'gtf'). This will tell the program which + parsers to use for processing the files. Default = + 'novo' + +pyPileup specific options:: + + --limit=500 + with this option you can select how many reads mapped + to a particular gene/ORF/region you want to count. + Default = All + --iCLIP + This turns on the iCLIP mode and the pileups will + report cross-linking site frequencies in iCLIP data in + reference sequences + +Common options:: + + -v, --verbose + prints all the status messages to a file rather than + the standard output + --ignorestrand + this flag tells the program to ignore strand + information and all overlapping reads will be considered + sense reads. 
Useful for analysing ChIP or RIP data + --zip=FILE + use this option to compress all the output files in a + single zip file + --overlap=1 + sets the number of nucleotides a read has to overlap + with a gene before it is considered a hit. Default = + 1 nucleotide + -s genomic, --sequence=genomic + with this option you can select whether you want the + reads aligned to the genomic or the coding sequence. + Default = genomic + -r 100, --range=100 + allows you to set the length of the UTR regions. If + you set '-r 50' or '--range=50', then the program will + set a fixed length (50 bp) regardless of whether the + GTF file has genes with annotated UTRs. + +Options for novo, SAM and BAM files:: + + --align_quality=100, --mapping_quality=100 + with these options you can set the alignment quality + (Novoalign) or mapping quality (SAM) threshold. Reads + with qualities lower than the threshold will be + ignored. Default = 0 + --align_score=100 + with this option you can set the alignment score + threshold. Reads with alignment scores lower than the + threshold will be ignored. Default = 0 + -l 100, --length=100 + to set read length threshold. Default = 1000 + -m 100000, --max=100000 + maximum number of mapped reads that will be analyzed. + Default = All + --unique + with this option reads with multiple alignment + locations will be removed. Default = Off + --blocks + with this option reads with the same start and end + coordinates on a chromosome will only be counted once. + Default = Off + --discarded=FILE + prints the lines from the alignments file that were + discarded by the parsers. This file contains reads + that were unmapped (NM), of poor quality (i.e. QC) or + paired reads that were mapped to different chromosomal + locations or were too far apart on the same + chromosome. Useful for debugging purposes + -d 1000, --distance=1000 + this option allows you to set the maximum number of + base-pairs allowed between two non-overlapping paired + reads. 
Default = 1000 + --mutations=delsonly + Use this option to only track mutations that are of + interest. For CRAC data this is usually deletions + (--mutations=delsonly). For PAR-CLIP data this is + usually T-C mutations (--mutations=TC). Other options + are: do not report any mutations: --mutations=nomuts. + Only report specific base mutations, for example only + in T's, C's and G's :--mutations=[TCG]. The brackets + are essential. Other nucleotide combinations are also + possible + + </help> +</tool>
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/pyCRAC/pyReadAligner.xml Tue Jun 18 09:11:00 2013 -0400 @@ -0,0 +1,368 @@ + <tool id ="pyReadAligner" name="pyReadAligner"> + <requirements> + <requirement type="package">pyCRAC</requirement> + </requirements> + <command interpreter="python"> + /usr/local/bin/pyReadAligner.py + -f $ftype.input + --file_type $ftype.file_type + #if $geneOpt.alignGene == "gene": + -g $geneOpt.genes + #end if# + #if $geneOpt.alignGene == "chr": + --chr $geneOpt.chr + #end if# + #if ($ftype.file_type == "novo" or $ftype.file_type == "sam") and $ftype.disc.discard == "discard": + --discarded $discarded + #end if# + --gtf=$addGTF.gtf + --tab=$addTab.tab + #if ($ftype.file_type == "novo" or $ftype.file_type == "sam") and $ftype.addAlignOpt.alignoptions == "edit": + --align_quality=$ftype.addAlignOpt.align_quality + --align_score=$ftype.addAlignOpt.align_score + --distance=$ftype.addAlignOpt.d + --length=$ftype.addAlignOpt.length + #if int($ftype.addAlignOpt.max) > 0: + --max=$ftype.addAlignOpt.max + #end if# + $ftype.addAlignOpt.unique + $ftype.addAlignOpt.blocks + $ftype.addAlignOpt.mutations + #end if# + #if $addOpt.options == "edit": + --range=$addOpt.range + --overlap=$addOpt.overlap + $addOpt.ignore + -s $addOpt.sequence + #if int($addOpt.limit) > 0: + --limit=$addOpt.limit + #end if# + #end if# + -o $output + </command> + <version_command>/usr/local/bin/pyReadAligner.py --version</version_command> + <inputs> + + + <conditional name="geneOpt"> + <param name="alignGene" type="select" label="Do you want to align reads to genes or chromosome co-ordinates?"> + <option value="gene" selected="true">Genes</option> + <option value="chr">Chromosome Co-ordinates</option> + </param> + <when value="chr"> + <param format="interval" name="chr" type="data" label="Choose a Chromosome Coordinate File" help="Tab delimited text file contai\ +ning an identifier, chromosome name, start position, end position and strand ('-' or '+')"/> + </when> + <when 
value="gene"> + <param format="txt" name="genes" type="data" label="Choose a Gene List -g" help="Single column gene ID file"/> + </when> + </conditional> + <conditional name="addGTF"> + <param name="gtfFile" type="select" label="Choose GTF File from"> + <option value="default" selected="true">Defaults</option> + <option value="other">History</option> + </param> + <when value="default"> + <param name="gtf" type="select" label="GTF File --gtf" help="GTF file containing gene ID co-ordinates"> + <options from_data_table="pycrac_gtf"/> + </param> + </when> + <when value="other"> + <param format="GTF" name="gtf" type="data" label="GTF File --gtf" help="GTF file containing gene ID co-ordinates"/> + </when> + </conditional> + <conditional name="addTab"> + <param name="tabFile" type="select" label="Choose Genomic Reference Sequence from"> + <option value="default" selected="true">Defaults</option> + <option value="other">History</option> + </param> + <when value="default"> + <param name="tab" type="select" label="Genomic Reference Sequence --tab" help="Tab file containing genomic reference sequence"> + <options from_data_table="pycrac_tab"/> + </param> + </when> + <when value="other"> + <param format="tabular" name="tab" type="data" label="Genomic Reference Sequence --tab" help="Tab file containing genomic reference sequence"/> + </when> + </conditional> + + + <conditional name="ftype"> + <param name="file_type" type="select" label="Input File Type --file_type"> + <option value="sam">Sam/BAM</option> + <option value="novo">Novo</option> + <option value="gtf">GTF</option> + </param> + <when value="sam"> + <param format="sam,bam" name="input" type="data" label="Input File -f" help="Alignment file of type .sam or .bam"/> + <conditional name="disc"> + <param name="discard" type="select" label="Print discarded reads to a separate file"> + <option value="" selected="true">OFF</option> + <option value="discard">ON</option> + </param> + <when value="discard"> + </when> + <when 
value=""> + </when> + </conditional> + <conditional name="addAlignOpt"> + <param name="alignoptions" type="select" label="Alignment Options"> + <option value="default" selected="true">Default</option> + <option value="edit">Edit</option> + </param> + <when value="edit"> + <param name="mutations" type="select" label="Filter reads by mutations --mutations" help="cross-linking sites are often highlighted by deletions and/or substitutions in the reads. You can use this option to filter reads based on whether they have mutations or not."> + <option value="" selected="true">Off</option> + <option value="--mutations=delsonly">deletions</option> + <option value="--mutations=subsonly">substitutions</option> + <option value="--mutations=TC">T->C mutations</option> + <option value="--mutations=allmuts">all mutations</option> + <option value="--mutations=nomuts">no mutations</option> + </param> + <param format="integer" name="align_quality" type="integer" label="Align Quality --align_quality " value="0" size="5" > + <validator type="in_range" min="0" message="Please enter a value >= 0"/> + </param> + <param format="integer" name="align_score" type="integer" label="Align Score --align_score " value="0" size="5" > + <validator type="in_range" min="0" message="Please enter a value >= 0"/> + </param> + <param format="integer" name="max" type="integer" label="Mapped reads to read from input file --max" help="Set to 0 to align all reads." 
value="0" size="10" > + <validator type="in_range" min="0" max="100000000" message="Please enter a value between 1 and 100000000 or 0 to align all reads"/> + </param> + <param format="integer" name="d" type="integer" label="Distance --distance " value="1000" size="6" help="Set the maximum number of bp allowed between two non-overlapping paired reads"> + <validator type="in_range" min="1" message="Please enter a value >= 0"/> + </param> + <param format="integer" name="length" type="integer" label="Set the maximum length of reads --length" value="1000" size="7" help="Set the read length threshold between 15 and 1000"> + <validator type="in_range" min="15" max="1000" message="Please enter a value between 15 and 1000"/> + </param> + <param name="unique" type="select" label="Remove reads with multiple alignment locations --unique"> + <option value="" selected="true">OFF</option> + <option value="--unique">ON</option> + </param> + <param name="blocks" type="select" label="Only count reads with same start and end coords once --blocks"> + <option value="" selected="true">OFF</option> + <option value="--blocks">ON</option> + </param> + </when> + <when value="default"> + </when> + </conditional> + </when> + <when value="novo"> + <param format="tabular" name="input" type="data" label="Input File -f" help="Alignment file of type .novo" /> + <conditional name="disc"> + <param name="discard" type="select" label="Print discarded reads to a separate file"> + <option value="" selected="true">OFF</option> + <option value="discard">ON</option> + </param> + <when value="discard"> + </when> + <when value=""> + </when> + </conditional> + <conditional name="addAlignOpt"> + <param name="alignoptions" type="select" label="Alignment Options"> + <option value="default" selected="true">Default</option> + <option value="edit">Edit</option> + </param> + <when value="edit"> + <param name="mutations" type="select" label="Filter reads by mutations --mutations" help="cross-linking sites are often + 
highlighted by deletions and/or substitutions in the reads. You can use this option to filter reads based on whether they have mutations or not."> + <option value="" selected="true">Off</option> + <option value="--mutations=delsonly">deletions</option> + <option value="--mutations=subsonly">substitutions</option> + <option value="--mutations=TC">T->C mutations</option> + <option value="--mutations=allmuts">all mutations</option> + <option value="--mutations=nomuts">no mutations</option> + </param> + <param format="integer" name="align_quality" type="integer" label="Align Quality --align_quality " value="0" size="5" > + <validator type="in_range" min="0" message="Please enter a value >= 0"/> + </param> + <param format="integer" name="align_score" type="integer" label="Align Score --align_score " value="0" size="5" > + <validator type="in_range" min="0" message="Please enter a value >= 0"/> + </param> + <param format="integer" name="max" type="integer" label="Mapped reads to read from input file --max" help="Set to 0 to align all reads." 
value="0" size="10" > + <validator type="in_range" min="0" max="100000000" message="Please enter a value between 1 and 100000000 or 0 to align all reads"/> + </param> + <param format="integer" name="d" type="integer" label="Distance --distance " value="1000" size="6" help="Set the maximum number of bp allowed between two non-overlapping paired reads"> + <validator type="in_range" min="1" message="Please enter a value >= 0"/> + </param> + <param format="integer" name="length" type="integer" label="Set the maximum length of reads --length" value="1000" size="7" help="Set the read length threshold between 15 and 1000"> + <validator type="in_range" min="15" max="1000" message="Please enter a value between 15 and 1000"/> + </param> + <param name="unique" type="select" label="Remove reads with multiple alignment locations --unique"> + <option value="" selected="true">OFF</option> + <option value="--unique">ON</option> + </param> + <param name="blocks" type="select" label="Only count reads with same start and end coords once --blocks"> + <option value="" selected="true">OFF</option> + <option value="--blocks">ON</option> + </param> + </when> + <when value="default"> + </when> + </conditional> + </when> + <when value="gtf"> + <param format="gtf" name="input" type="data" label="Input File -f" help="File of type .gtf" /> + </when> + </conditional> + + <conditional name="addOpt"> + <param name="options" type="select" label="Standard Options"> + <option value="default" selected="true">Default</option> + <option value="edit">Edit</option> + </param> + <when value="edit"> + <param format="integer" name="range" type="integer" label="Range --range" value="0" size="5" help="Manually set the length of the 5' and 3' UTRs 0>50000"> + <validator type="in_range" min="0" max="50000" message="Please enter a value between 0 and 50000"/> + </param> + <param name="ignore" type="select" label="Ignore strand information? 
--ignorestrand"> + <option value="" selected="true">No</option> + <option value="--ignorestrand">Yes</option> + </param> + <param format="integer" name="overlap" type="integer" label="Overlap --overlap" value="1" size="5" help="Sets the number of nucleotides a read has to overlap with a gene before it is considered a hit. "> + <validator type="in_range" min="1" message="Please enter a positive integer"/> + </param> + <param name="sequence" type="select" label="Align reads to --sequence"> + <option value="genomic" selected="true">Genomic Sequence</option> + <option value="coding">Coding Sequence</option> + </param> + <param format="integer" name="limit" type="integer" label="Limit number of reads to count that map to a particular region --limit" value="0" size="15" help="Set to 0 for unlimited reads" > + <validator type="in_range" min="0" message="Please enter a value greater than 1 or set to 0 for unlimited reads"/> + </param> + </when> + <when value="default"> + </when> + </conditional> + <param name="label" type="text" format="txt" size="30" value="pyReadAligner" label="Enter output file label -o" /> + </inputs> + <outputs> + <data format="fasta" name="output" label="${label.value}.aligned.fasta"/> + <data format="txt" name="discarded" label="${label.value}_discarded.txt"> + <filter>(ftype['file_type'] == "novo" or ftype['file_type'] == "sam") and ftype['disc']['discard'] == "discard"</filter> + </data> + </outputs> + <help> + + +.. class:: infomark + +**pyReadAligner** + +pyReadAligner is part of the pyCRAC_ package. Generates multiple sequence alignments for reads mapped to individual genes or genomic regions. +Produces a fasta output file. + + +.. _pyCRAC: http://sandergranneman.bio.ed.ac.uk/Granneman_Lab/pyCRAC_software.html + +------ + +**Parameter list** + +File input options:: + + -f FILE, --input_file=FILE + As input files you can use Novoalign native output or + SAM files as input file. By default it expects data + from the standard input. 
Make sure to specify the file + type of the file you want to have analyzed using the + --file_type option! + -o OUTPUT_FILE, --output_file=OUTPUT_FILE + Use this flag to override the standard output file + names. All alignments will be written to one output + file. + -g FILE, --genes_file=FILE + here you need to type in the name of your gene list + file (1 column) or the hittable file + --chr=FILE + if you simply would like to align reads against a + genomic sequence you should generate a tab delimited + file containing an identifier, chromosome name, start + position, end position and strand + --gtf=annotation_file.gtf + type the path to the gtf annotation file that you want + to use + --tab=tab_file.tab + type the path to the tab file that contains the + genomic reference sequence + --file_type=FILE_TYPE + use this option to specify the file type (i.e. 'novo', + 'sam', 'gtf'). This will tell the program which + parsers to use for processing the files. Default = + 'novo' + +pyReadAligner specific options:: + + --limit=500 + with this option you can select how many reads mapped + to a particular gene/ORF/region you want to count. + Default = All + +Common options:: + + --ignorestrand + this flag tells the program to ignore strand + information and all overlapping reads will be considered + sense reads. Useful for analysing ChIP or RIP data + --overlap=1 + sets the number of nucleotides a read has to overlap + with a gene before it is considered a hit. Default = + 1 nucleotide + -s genomic, --sequence=genomic + with this option you can select whether you want the + reads aligned to the genomic or the coding sequence. + Default = genomic + -r 100, --range=100 + allows you to set the length of the UTR regions. If + you set '-r 50' or '--range=50', then the program will + set a fixed length (50 bp) regardless of whether the + GTF file has genes with annotated UTRs. 
+ +Options for novo, SAM and BAM files:: + + --align_quality=100, --mapping_quality=100 + with these options you can set the alignment quality + (Novoalign) or mapping quality (SAM) threshold. Reads + with qualities lower than the threshold will be + ignored. Default = 0 + --align_score=100 + with this option you can set the alignment score + threshold. Reads with alignment scores lower than the + threshold will be ignored. Default = 0 + -l 100, --length=100 + to set read length threshold. Default = 1000 + -m 100000, --max=100000 + maximum number of mapped reads that will be analyzed. + Default = All + --unique + with this option reads with multiple alignment + locations will be removed. Default = Off + --blocks + with this option reads with the same start and end + coordinates on a chromosome will only be counted once. + Default = Off + --discarded=FILE + prints the lines from the alignments file that were + discarded by the parsers. This file contains reads + that were unmapped (NM), of poor quality (i.e. QC) or + paired reads that were mapped to different chromosomal + locations or were too far apart on the same + chromosome. Useful for debugging purposes + -d 1000, --distance=1000 + this option allows you to set the maximum number of + base-pairs allowed between two non-overlapping paired + reads. Default = 1000 + --mutations=delsonly + Use this option to only track mutations that are of + interest. For CRAC data this is usually deletions + (--mutations=delsonly). For PAR-CLIP data this is + usually T-C mutations (--mutations=TC). Other options + are: do not report any mutations: --mutations=nomuts. + Only report specific base mutations, for example only + in T's, C's and G's :--mutations=[TCG]. The brackets + are essential. Other nucleotide combinations are also + possible + + + </help> +</tool>
#!/usr/bin/perl -w
#
# pyCRAC/pyReadCounters.pl
#
# Galaxy wrapper around /usr/local/bin/pyReadCounters.py.
#
# The wrapper
#   1. translates the options passed by pyReadCounters.xml into a
#      pyReadCounters.py command line,
#   2. runs the tool with the output prefix "rc_<id>", and
#   3. moves the files the tool writes under that prefix to the dataset
#      paths supplied via --hittable, --stats, --intronUTRoverlap and
#      --countoutput.
#
# Commands are executed in list form (no shell), so dataset paths that
# contain spaces or shell metacharacters are passed through verbatim.
use strict;
use Getopt::Long;
use File::Copy qw(move);

my $PYREADCOUNTERS = "/usr/local/bin/pyReadCounters.py";

my %opt;
GetOptions(
    \%opt,
    "f=s", "file_type=s", "version", "gtf=s",
    "align_quality=i", "align_score=i", "range=i", "length=i", "max=i",
    "distance=i", "ignorestrand", "overlap=i", "unique", "blocks",
    "mutations=s", "countoutput=s", "stats=s", "hittable=s",
    "intronUTRoverlap=s", "discarded=s", "options", "alignOpt", "id=s",
) or die "pyReadCounters.pl: could not parse the command line options\n";

# Run a command without involving the shell, echo it to STDOUT (kept from
# the original wrapper so the call shows up in the job output) and return
# the exit code of the child. Dies when the child cannot be started or is
# killed by a signal.
sub run_command {
    my @cmd = @_;
    my $rc = system { $cmd[0] } @cmd;
    # Save the status before print() has a chance to overwrite $! / $?.
    my ($status, $error) = ($?, "$!");
    print STDOUT join(" ", @cmd), "\n";
    die "pyReadCounters.pl: cannot execute $cmd[0]: $error\n" if $rc == -1;
    die sprintf("pyReadCounters.pl: %s died with signal %d\n", $cmd[0], $status & 127)
        if $status & 127;
    return $status >> 8;
}

# --version: report the tool version and stop. No output files are
# produced in this mode, so nothing must be moved afterwards.
if (exists $opt{version}) {
    exit run_command("python", $PYREADCOUNTERS, "--version");
}

# Everything below needs these options; fail early with a clear message
# instead of building a broken command line from undefined values.
for my $name (qw(f file_type gtf id stats hittable intronUTRoverlap)) {
    die "pyReadCounters.pl: missing required option --$name\n"
        unless defined $opt{$name};
}

my $prefix = "rc_$opt{id}";

my @cmd = (
    "python", $PYREADCOUNTERS,
    "-f",          $opt{f},
    "--file_type", $opt{file_type},
    "--gtf",       $opt{gtf},
    "-o",          $prefix,
);

# "Standard Options" were edited in the Galaxy form.
if (exists $opt{options}) {
    push @cmd, "--range=$opt{range}"     if defined $opt{range};
    push @cmd, "--overlap=$opt{overlap}" if defined $opt{overlap};
    push @cmd, "--ignorestrand"          if exists $opt{ignorestrand};
}

# "Alignment Options" were edited in the Galaxy form.
if (exists $opt{alignOpt}) {
    for my $name (qw(align_quality align_score length distance max)) {
        push @cmd, "--$name=$opt{$name}" if defined $opt{$name};
    }
    push @cmd, "--unique" if exists $opt{unique};
    push @cmd, "--blocks" if exists $opt{blocks};
    push @cmd, "--mutations=$opt{mutations}" if exists $opt{mutations};
}

# The tool XML passes --discarded independently of --alignOpt, so it has
# to be forwarded here and not only when alignment options are edited.
push @cmd, "--discarded=$opt{discarded}" if exists $opt{discarded};

my $exit_code = run_command(@cmd);

# pyReadCounters.py names its output files "<prefix>_<kind>_cDNAs.*" when
# --blocks is used and "<prefix>_<kind>_reads.*" otherwise.
my $suffix = exists $opt{blocks} ? "cDNAs" : "reads";

my @outputs = (
    [ "${prefix}_hittable_$suffix.txt",               $opt{hittable} ],
    [ "${prefix}_file_statistics_$suffix.txt",        $opt{stats} ],
    [ "${prefix}_intron_and_UTR_overlap_$suffix.gtf", $opt{intronUTRoverlap} ],
);

# The count_output GTF is only collected for novo/sam input.
if ($opt{file_type} ne "gtf" and defined $opt{countoutput}) {
    push @outputs, [ "${prefix}_count_output_$suffix.gtf", $opt{countoutput} ];
}

for my $pair (@outputs) {
    my ($source, $target) = @$pair;
    next if move($source, $target);
    warn "pyReadCounters.pl: cannot move $source to $target: $!\n";
    $exit_code ||= 1;
}

exit $exit_code;
<!-- File: pyCRAC/pyReadCounters.xml

     Galaxy tool definition for pyReadCounters. The command is executed
     through the pyReadCounters.pl wrapper, which runs pyReadCounters.py with
     the output prefix "rc_<id>" and moves the resulting files to the output
     datasets declared below. -->
<tool id="pyReadCounters" name="pyReadCounters" force_history_refresh="True">
    <requirements>
        <requirement type="package">pyCRAC</requirement>
    </requirements>
    <command interpreter="perl">
    pyReadCounters.pl
    -f $ftype.input
    --file_type $ftype.file_type
    --gtf $addGTF.gtf
    #if ($ftype.file_type == "novo" or $ftype.file_type == "sam") and $ftype.disc.discard == "discard":
    --discarded $discarded
    #end if#
    #if ($ftype.file_type == "novo" or $ftype.file_type == "sam") and $ftype.addAlignOpt.alignoptions == "edit":
    --alignOpt
    --align_quality $ftype.addAlignOpt.align_quality
    --align_score $ftype.addAlignOpt.align_score
    #if int($ftype.addAlignOpt.max) > 0:
    --max $ftype.addAlignOpt.max
    #end if#
    --distance $ftype.addAlignOpt.d
    --length $ftype.addAlignOpt.length
    $ftype.addAlignOpt.unique
    $ftype.addAlignOpt.blocks
    $ftype.addAlignOpt.mutations
    #end if#
    #if $addOpt.options == "edit":
    --options
    --range $addOpt.range
    $addOpt.ignore
    --overlap $addOpt.overlap
    #end if#

    --stats $stats
    --hittable $hittable
    --intronUTRoverlap $intronUTRoverlap

    #if $ftype.file_type == "novo" or $ftype.file_type == "sam":
    --countoutput $countoutput
    #end if#

    --id $stats.id
    </command>
    <version_command>/usr/local/bin/pyReadCounters.py --version</version_command>
    <inputs>
        <!-- GTF annotation: either a preconfigured file from the pycrac_gtf data table or a history dataset -->
        <conditional name="addGTF">
            <param name="gtfFile" type="select" label="Choose GTF File from">
                <option value="default" selected="true">Defaults</option>
                <option value="other">History</option>
            </param>
            <when value="default">
                <param name="gtf" type="select" label="GTF File --gtf" help="GTF file containing gene ID co-ordinates">
                    <options from_data_table="pycrac_gtf"/>
                </param>
            </when>
            <when value="other">
                <!-- datatype extensions are lowercase -->
                <param format="gtf" name="gtf" type="data" label="GTF File --gtf" help="GTF file containing gene ID co-ordinates"/>
            </when>
        </conditional>
        <!-- Input file type; the novo and sam branches expose the same alignment options -->
        <conditional name="ftype">
            <param name="file_type" type="select" label="Input File Type --file_type" help="Use .novo, .sam/.bam or .gtf input files">
                <option value="novo" selected="true">Novo</option>
                <option value="sam">Sam/Bam</option>
                <option value="gtf">GTF</option>
            </param>
            <when value="novo">
                <param format="tabular" name="input" type="data" label="Input File --input_file" help="Alignment file of type .novo" />
                <conditional name="disc">
                    <param name="discard" type="select" label="Print discarded reads to a separate file">
                        <option value="" selected="true">OFF</option>
                        <option value="discard">ON</option>
                    </param>
                    <when value="discard">
                    </when>
                    <when value="">
                    </when>
                </conditional>
                <conditional name="addAlignOpt">
                    <param name="alignoptions" type="select" label="Alignment Options">
                        <option value="default" selected="true">Default</option>
                        <option value="edit">Edit</option>
                    </param>
                    <when value="edit">
                        <param name="mutations" type="select" label="Option for selecting type of mutations to report --mutations" help="cross-linking sites are often highlighted by deletions and/or substitutions in the reads. You can use this option to select specific mutations that you want to have reported in the GTF output file.">
                            <option value="" selected="true">Off</option>
                            <option value="--mutations delsonly">deletions</option>
                            <option value="--mutations subsonly">substitutions</option>
                            <option value="--mutations TC">T-&gt;C mutations</option>
                            <option value="--mutations nomuts">no mutations</option>
                        </param>
                        <param format="integer" name="align_quality" type="integer" label="Align Quality --align_quality " value="0" size="5" >
                            <validator type="in_range" min="0" message="Please enter a value >= 0"/>
                        </param>
                        <param format="integer" name="align_score" type="integer" label="Align Score --align_score " value="0" size="5" >
                            <validator type="in_range" min="0" message="Please enter a value >= 0"/>
                        </param>
                        <param format="integer" name="max" type="integer" label="Mapped reads to read from input file --max" help="Set to 0 to align all reads." value="0" size="10" >
                            <validator type="in_range" min="0" max="100000000" message="Please enter a value between 1 and 100000000 or 0 to align all reads"/>
                        </param>
                        <param format="integer" name="d" type="integer" label="Distance --distance " value="1000" size="6" help="Set the maximum number of bp allowed between two non-overlapping paired reads">
                            <!-- message now matches min="1" -->
                            <validator type="in_range" min="1" message="Please enter a value >= 1"/>
                        </param>
                        <param format="integer" name="length" type="integer" label="Set the maximum length of reads --length" value="1000" size="7" help="Set the read length threshold between 15 and 1000">
                            <validator type="in_range" min="15" max="1000" message="Please enter a value between 15 and 1000"/>
                        </param>
                        <param name="unique" type="select" label="Remove reads with multiple alignment locations --unique">
                            <option value="" selected="true">OFF</option>
                            <option value="--unique">ON</option>
                        </param>
                        <param name="blocks" type="select" label="Only count reads with same start and end coords once --blocks">
                            <option value="" selected="true">OFF</option>
                            <option value="--blocks">ON</option>
                        </param>
                    </when>
                    <when value="default">
                    </when>
                </conditional>
            </when>
            <when value="sam">
                <param format="sam,bam" name="input" type="data" label="Input File --input_file" help="Alignment file of type .sam or .bam" />
                <conditional name="disc">
                    <param name="discard" type="select" label="Print discarded reads to a separate file">
                        <option value="" selected="true">OFF</option>
                        <option value="discard">ON</option>
                    </param>
                    <when value="discard">
                    </when>
                    <when value="">
                    </when>
                </conditional>
                <conditional name="addAlignOpt">
                    <param name="alignoptions" type="select" label="Alignment Options">
                        <option value="default" selected="true">Default</option>
                        <option value="edit">Edit</option>
                    </param>
                    <when value="edit">
                        <param name="mutations" type="select" label="Option for selecting type of mutations to report --mutations" help="cross-linking sites are often highlighted by deletions and/or substitutions in the reads. You can use this option to select specific mutations that you want to have reported in the GTF output file.">
                            <option value="" selected="true">Off</option>
                            <option value="--mutations delsonly">deletions</option>
                            <option value="--mutations subsonly">substitutions</option>
                            <option value="--mutations TC">T-&gt;C mutations</option>
                            <option value="--mutations nomuts">no mutations</option>
                        </param>
                        <param format="integer" name="align_quality" type="integer" label="Align Quality --align_quality " value="0" size="5" >
                            <validator type="in_range" min="0" message="Please enter a value >= 0"/>
                        </param>
                        <param format="integer" name="align_score" type="integer" label="Align Score --align_score " value="0" size="5" >
                            <validator type="in_range" min="0" message="Please enter a value >= 0"/>
                        </param>
                        <param format="integer" name="max" type="integer" label="Mapped reads to read from input file --max" help="Set to 0 to align all reads." value="0" size="10" >
                            <validator type="in_range" min="0" max="100000000" message="Please enter a value between 1 and 100000000 or 0 to align all reads"/>
                        </param>
                        <param format="integer" name="d" type="integer" label="Distance --distance " value="1000" size="6" help="Set the maximum number of bp allowed between two non-overlapping paired reads">
                            <!-- message now matches min="1" -->
                            <validator type="in_range" min="1" message="Please enter a value >= 1"/>
                        </param>
                        <param format="integer" name="length" type="integer" label="Set the maximum length of reads --length" value="1000" size="7" help="Set the read length threshold between 15 and 1000">
                            <validator type="in_range" min="15" max="1000" message="Please enter a value between 15 and 1000"/>
                        </param>
                        <param name="unique" type="select" label="Remove reads with multiple alignment locations --unique">
                            <option value="" selected="true">OFF</option>
                            <option value="--unique">ON</option>
                        </param>
                        <param name="blocks" type="select" label="Only count reads with same start and end coords once --blocks">
                            <option value="" selected="true">OFF</option>
                            <option value="--blocks">ON</option>
                        </param>
                    </when>
                    <when value="default">
                    </when>
                </conditional>
            </when>
            <when value="gtf">
                <param format="gtf" name="input" type="data" label="Input File --input_file" help="File of type .gtf" />
            </when>
        </conditional>
        <conditional name="addOpt">
            <param name="options" type="select" label="Standard Options">
                <option value="default" selected="true">Default</option>
                <option value="edit">Edit</option>
            </param>
            <when value="edit">
                <param format="integer" name="range" type="integer" label="Range --range" value="0" size="5" help="Manually set the length of the 5' and 3' UTRs 0>50000">
                    <validator type="in_range" min="0" max="50000" message="Please enter a value between 0 and 50000"/>
                </param>
                <param name="ignore" type="select" label="Ignore strand information? --ignorestrand">
                    <option value="" selected="true">No</option>
                    <option value="--ignorestrand">Yes</option>
                </param>
                <param format="integer" name="overlap" type="integer" label="Overlap --overlap" value="1" size="5" help="Sets the number of nucleotides a read has to overlap with a gene before it is considered a hit. ">
                    <validator type="in_range" min="1" message="Please enter a positive integer"/>
                </param>
            </when>
            <when value="default">
            </when>
        </conditional>
        <param name="label" type="text" format="txt" size="30" value="pyReadCounters" label="Enter output file label -o" />
    </inputs>
    <outputs>
        <data format="tabular" name="stats" label="${label.value}_file_statistics.txt"/>
        <data format="tabular" name="hittable" label="${label.value}_hittable.txt"/>
        <!-- this dataset is a GTF file, so the label carries the .gtf extension -->
        <data format="gtf" name="intronUTRoverlap" label="${label.value}_intron_and_UTR_overlap.gtf"/>
        <data format="gtf" name="countoutput" label="${label.value}_count_output.gtf">
            <filter>ftype['file_type'] == "novo" or ftype['file_type'] == "sam"</filter>
        </data>
        <data format="txt" name="discarded" label="${label.value}_discarded.txt">
            <filter>(ftype['file_type'] == "novo" or ftype['file_type'] == "sam") and ftype['disc']['discard'] == "discard"</filter>
        </data>
    </outputs>
    <help>

.. class:: infomark

**pyReadCounters**

pyReadCounters is part of the pyCRAC_ package. Produces a gene hittable file, two GTF output files showing to which genomic features the reads overlap.
Finally the tool produces a read statistics file that provides information about the complexity of your dataset.

**Output file examples**

A hittable file::

    # generated by pyReadCounters version 1.1.0, Mon Apr 16 20:34:22 2012
    # /usr/local/bin/pyReadCounters.py -f RNAseq_data.novo -c 1 --unique
    # total number of reads 12534556
    # total number of paired reads 10947376
    # total number of single reads 483095
    # total number of mapped reads: 11430471
    # total number of overlapping genomic features 7019550
    # sense 5960669
    # anti-sense 1058881
    # feature sense_overlap anti-sense_overlap number of reads

    ## protein_coding 3190701
    YEF3 49930 3629 24221
    PMA1 32621 2650 21776
    COX1 24559 1037 15174
    TFP1 21539 1689 13506
    HSC82 21177 1458 12729
    ADH1 20245 1467 11351
    AI5_ALPHA 20022 918 13101
    AI4 19390 886 12638
    AI3 17823 798 11473
    AI2 17590 790 11297
    RPL10 16822 1113 8797
    ENO2 16336 1125 8913
    TEF1 15578 1333 5450

An example of a GTF 'count_output' file::

    ##gff-version 2
    # generated by Counters version 1.2.0, Tue Jan 8 22:47:29 2013
    # pyReadCounters.py -f PAR_CLIP_unique.novo --mutations=TC -v
    # total number of reads: 2455251
    # total number of paired reads: 0
    # total number of single reads: 2455251
    # total number of mapped reads: 2455251
    # total number of overlapping genomic features: 5153943
    # sense: 2640600
    # anti-sense: 2513343
    chrXIV reads exon 661572 661605 2 + . gene_id "INT_0_6716,YNR016C"; gene_name "INT_0_6716,ACC1"; # 661596S;
    chrXIV reads exon 661720 661738 1 + . gene_id "INT_0_6716,YNR016C"; gene_name "INT_0_6716,ACC1"; # 661726S;
    chrXIV reads exon 661839 661878 4 + . gene_id "INT_0_6716,YNR016C"; gene_name "INT_0_6716,ACC1"; # 661875S;

This output file also reports whether a read contains a mutation.

For example::

    # 661596S

Indicates that the read had a nucleotide substitution ("S") at genomic coordinate 661596. The chromosome name can be found in the first column.

.. _pyCRAC: http://sandergranneman.bio.ed.ac.uk/Granneman_Lab/pyCRAC_software.html

------

**Parameter list**

File input options::

    -f FILE, --input_file=FILE
                        provide the path to your novo, SAM/BAM or gtf data
                        file. Default is standard input. Make sure to specify
                        the file type of the file you want to have analyzed
                        using the --file_type option!
    -o OUTPUT_FILE, --output_file=OUTPUT_FILE
                        Use this flag to override the standard file names. Do
                        NOT add an extension.
    --file_type=FILE_TYPE
                        use this option to specify the file type (i.e.
                        'novo','sam' or 'gtf'). This will tell the program
                        which parsers to use for processing the files. Default
                        = 'novo'
    --gtf=annotation_file.gtf
                        type the path to the gtf annotation file that you want
                        to use

Common pyCRAC options::

    --ignorestrand
                        To ignore strand information and all reads overlapping
                        with genomic features will be considered sense reads.
                        Useful for analysing ChIP or RIP data
    --overlap=1
                        sets the number of nucleotides a read has to overlap
                        with a gene before it is considered a hit. Default =
                        1 nucleotide
    -r 100, --range=100
                        allows you to add regions flanking the genomic
                        feature. If you set '-r 50' or '--range=50', then the
                        program will add 50 nucleotides to each feature on
                        each side regardless of whether the GTF file has genes
                        with annotated UTRs

Options for SAM/BAM and Novo files::

    --mutations=delsonly
                        Use this option to only track mutations that are of
                        interest. For CRAC data this is usually deletions
                        (--mutations=delsonly). For PAR-CLIP data this is
                        usually T-C mutations (--mutations=TC). Other options
                        are: do not report any mutations: --mutations=nomuts.
                        Only report specific base mutations, for example only
                        in T's, C's and G's :--mutations=[TCG]. The brackets
                        are essential. Other nucleotide combinations are also
                        possible
    --align_quality=100, --mapping_quality=100
                        with these options you can set the alignment quality
                        (Novoalign) or mapping quality (SAM) threshold. Reads
                        with qualities lower than the threshold will be
                        ignored. Default = 0
    --align_score=100
                        with this option you can set the alignment score
                        threshold. Reads with alignment scores lower than the
                        threshold will be ignored. Default = 0
    --unique
                        with this option reads with multiple alignment
                        locations will be removed. Default = Off
    --blocks
                        with this option reads with the same start and end
                        coordinates on a chromosome will be counted as one
                        cDNA. Default = Off
    -m 100000, --max=100000
                        maximum number of mapped reads that will be analyzed.
                        Default = All
    -d 1000, --distance=1000
                        this option allows you to set the maximum number of
                        base-pairs allowed between two non-overlapping paired
                        reads. Default = 1000
    --discarded=FILE
                        prints the lines from the alignments file that were
                        discarded by the parsers. This file contains reads
                        that were unmapped (NM), of poor quality (i.e. QC) or
                        paired reads that were mapped to different chromosomal
                        locations or were too far apart on the same
                        chromosome. Useful for debugging purposes
    -l 100, --length=100
                        to set read length threshold. Default = 1000

    </help>
</tool>
<!-- File: pyCRAC/pySelectMotifsFromGTF.xml

     Galaxy tool definition for pySelectMotifsFromGTF: filters a pyMotif GTF
     output file for k-mers that match a (possibly degenerate) motif. -->
<tool id="pySelectMotifsFromGTF" name="pySelectMotifsFromGTF">
    <requirements>
        <requirement type="package">pyCRAC</requirement>
    </requirements>
    <command interpreter="python">
    /usr/local/bin/pySelectMotifsFromGTF.py
    --gtf $input
    -m "$motif"
    -o $out
    -l $length
    -z $zscore
    </command>
    <version_command>/usr/local/bin/pySelectMotifsFromGTF.py --version</version_command>
    <inputs>
        <!-- the label names the flag that is actually used on the command line -->
        <param format="gtf" name="input" type="data" label="Input File --gtf" help="pyMotif gtf output files" />
        <!-- free text from the user: quoted in the command so it always arrives as a single argument -->
        <param format="txt" name="motif" type="text" size="200" value="KBCTTG" label="Motif string -m" help="Enter motif (all uppercase) you want to extract from the pyMotif gtf output file">
            <validator type="empty_field" />
        </param>
        <param format="integer" type="integer" value="6" size="5" name="length" label="Length --length" help="Set a Kmer Length. Note that the length has to be at least as long as your k-mer sequence, otherwise the program will not run correctly" />
        <param format="float" type="float" value="0" size="5" name="zscore" label="Z Score --Z_score" help="Set a minimum Kmer Z_score" />
        <param name="label" type="text" format="txt" size="30" value="pySelectMotifsFromGTF" label="Enter output file label -o" />
    </inputs>

    <outputs>
        <data format="gtf" name="out" label="${label.value}_${motif.value}.gtf"/>
    </outputs>
    <help>


.. class:: infomark

**pySelectMotifsFromGTF**

pySelectMotifsFromGTF is part of the pyCRAC_ package. Extracts your favourite k-mer sequence from pyMotif GTF output files.
Note that you can include degenerate nucleotides in your motif string::

    N = A, G, C or T
    R = A or G = puRine
    Y = C or T = pYrimidine
    M = A or C = aMino
    S = G or C
    W = A or T
    K = G or T = Keto
    V = A, C or G = Not T (letter after)
    D = A, G or T = Not C
    H = A, C or T = Not G
    B = C, G or T = Not A

So if you enter KBCTTG as search string and length=6, then the program will extract a large number of six-mers from your data.
If you set length = 8, it will look for this pattern in a stretch of 8 nucleotides.

.. _pyCRAC: http://sandergranneman.bio.ed.ac.uk/Granneman_Lab/pyCRAC_software.html

------

**Parameter list**

Options::

    --gtf=Yourfavoritegtf.gtf
                        type the path to the gtf file that you want to use. By
                        default it expects data from the standard input
    -o FILE, --output=FILE
                        Optional. Specify the name of the output file. Default
                        is standard output. Make sure it has the .gtf
                        extension!
    -m KBCTTG, --motif=KBCTTG
                        Specify the motif you want to extract from the GTF file.
    -z 15.0, --Z_score=15.0
                        Set a minimum k-mer Z-score. Default=0
    -l 4, --length=4
                        Set a k-mer length. Default is no length filtering

    </help>
</tool>
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/pyCRAC/pycrac.chr.loc.sample Tue Jun 18 09:11:00 2013 -0400 @@ -0,0 +1,1 @@ +s.cerevisiae EF2 /usr/local/pyCRAC/db/Saccharomyces_cerevisiae.EF2.59.1.0_chr_lengths.txt
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/pyCRAC/pycrac.fasta.loc.sample Tue Jun 18 09:11:00 2013 -0400 @@ -0,0 +1,1 @@ +s.cerevisiae EF2 /usr/local/pyCRAC/db/Saccharomyces_cerevisiae.EF2.59.1.0.fa
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/pyCRAC/pycrac.gtf.loc.sample Tue Jun 18 09:11:00 2013 -0400 @@ -0,0 +1,1 @@ +s.cerevisiae EF2 /usr/local/pyCRAC/db/Saccharomyces_cerevisiae.EF2.59.1.2.gtf
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/pyCRAC/pycrac.tab.loc.sample Tue Jun 18 09:11:00 2013 -0400 @@ -0,0 +1,1 @@ +s.cerevisiae EF2 /usr/local/pyCRAC/db/Saccharomyces_cerevisiae.EF2.59.1.0.fa.tab
<!-- File: pyCRAC/tool_data_table_conf.xml.sample

     Declares the data tables used by the pyCRAC tools. Each table reads its
     rows from the .loc file named in its <file> element and exposes them
     under the column names listed in <columns>. -->
<!-- Use the file tool_data_table_conf.xml.oldlocstyle if you don't want to update your loc files as changed in revision 4550:535d276c92bc-->
<tables>
    <!-- Locations of default fasta files for pycrac -->
    <table name="pycrac_fasta">
        <columns>name, value</columns>
        <file path="tool-data/pyCRAC/pycrac.fasta.loc" />
    </table>
    <!-- Locations of default gtf files for pycrac -->
    <!-- Referenced by the tool forms via from_data_table="pycrac_gtf" -->
    <table name="pycrac_gtf">
        <columns>name, value</columns>
        <file path="tool-data/pyCRAC/pycrac.gtf.loc" />
    </table>
    <!-- Locations of default tab files for pycrac -->
    <!-- Referenced by the tool forms via from_data_table="pycrac_tab" -->
    <table name="pycrac_tab">
        <columns>name, value</columns>
        <file path="tool-data/pyCRAC/pycrac.tab.loc" />
    </table>
    <!-- Locations of default chrom length files for pycrac -->
    <table name="pycrac_chr">
        <columns>name, value</columns>
        <file path="tool-data/pyCRAC/pycrac.chr.loc" />
    </table>
</tables>