# HG changeset patch
# User swebb
# Date 1371561060 14400
# Node ID 19b20927172db70a47f4ddc10f84c524155ebbf9
Uploaded
diff -r 000000000000 -r 19b20927172d pyCRAC/pyAlignment2Tab.xml
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/pyCRAC/pyAlignment2Tab.xml Tue Jun 18 09:11:00 2013 -0400
@@ -0,0 +1,139 @@
+
+ converter
+
+ pyCRAC
+
+ /usr/local/bin/pyAlignment2Tab.py -f $input --limit $limit -o $output --singlefile
+
+ /usr/local/bin/pyAlignment2Tab.py --version
+
+
+
+
+
+
+
+
+
+
+
+
+.. class:: infomark
+
+**pyAlignment2Tab**
+
+pyAlignment2Tab is part of the pyCRAC_ package. Converts pyReadAligner fasta output to a tabular alignment output.
+
+Example::
+
+ The tool expects a standard pyReadAligner fasta-formatted output file:
+
+ >GeneX
+ ATGTCTCGTACTAACATGGATACAAGACACGCACATTCTGCTTTACTGGCAGCACCACAGAGTGCTACTGCAAATAGCAGGAGCAGCAACAGCAGCAGCGAGAGTAGTAGTAACAAAAACAATATCAATGTCGGCGTCGGTGACGATAGCGGTAA
+ >257930-10
+ ---TCTCGTACcAACATGGATACAAGACACGCACATTCTGCTT----------------------------------------------------------------------------------------------------------------
+ >3664964-1
+ ---TCTCGcACcAACATGGATACAAGACACGCACATTtTGCTT----------------------------------------------------------------------------------------------------------------
+ >4033560-1
+ ---TCTCGTACcAACATGGATACAAGACACGCACATTCTGtTT----------------------------------------------------------------------------------------------------------------
+ >8571880-1
+ ---TCTCGTACcAACATGGATACAAGACACGCAgATTCTGCTT----------------------------------------------------------------------------------------------------------------
+ >9617396-1
+ ---TCTCGTACcAACATGGATACAAGACACGCcCATTCTGCTT----------------------------------------------------------------------------------------------------------------
+ >843368-5
+ ------------AACAcGGATACAAGACACGCACATTCTG-------------------------------------------------------------------------------------------------------------------
+ >854553-5
+ ------------AACATGGATACAAGACACGCAC--TCTG-------------------------------------------------------------------------------------------------------------------
+ >1522401-2
+ --------------CATGGATACAAGACACGCACATTCTGCTTTACTGGCAGgA-----------------------------------------------------------------------------------------------------
+ >5981234-1
+ --------------CATGGATACAAGACACGCACAcTCTGCTTTACTGGCAGCA-----------------------------------------------------------------------------------------------------
+ >997684-4
+ --------------CATGGATACAAGACACGCACATTCTGCTTTACTGGCAGCA-----------------------------------------------------------------------------------------------------
+ >1046653-4
+ ---------------ATGGATACAAGACACGCACATTCTGCTTTACTGGCAGgAC----------------------------------------------------------------------------------------------------
+ >1103730-4
+ ---------------ATGGATACAAGACACGCACAcTCTG-------------------------------------------------------------------------------------------------------------------
+ >1603913-2
+ ---------------ATGGATACAAGAaACGCACAcTCTG-------------------------------------------------------------------------------------------------------------------
+ >180349-12
+ ---------------ATGGATACAAGACACGCACATTCcGCTTTACTG-----------------------------------------------------------------------------------------------------------
+ >1985106-1
+ ---------------ATGGATACAAGACACGCACATTCgGCTTTACTGGCAGCcC----------------------------------------------------------------------------------------------------
+ >1987775-1
+ ---------------ATGGATACccGACACGCACATTCTGCTTTACTGcCAGCAC----------------------------------------------------------------------------------------------------
+ >2258725-1
+ ---------------ATGGATACAAGACACGCACATTCTGCTTTgCTGGCAGCAC----------------------------------------------------------------------------------------------------
+ >2631987-1
+ ---------------ATGGATACAAGACACGCACATTCTGCTTTACcGGCAGgAC----------------------------------------------------------------------------------------------------
+
+ This will be converted into:
+
+ 1 .........|.........|.........|.........|.........|.........|.........|.........|.........| 90
+ >GeneX ATGTCTCGTACTAACATGGATACAAGACACGCACATTCTGCTTTACTGGCAGCACCACAGAGTGCTACTGCAAATAGCAGGAGCAGCAAC
+ >257930-10 ---TCTCGTACcAACATGGATACAAGACACGCACATTCTGCTT-----------------------------------------------
+ >3664964-1 ---TCTCGcACcAACATGGATACAAGACACGCACATTtTGCTT-----------------------------------------------
+ >4033560-1 ---TCTCGTACcAACATGGATACAAGACACGCACATTCTGtTT-----------------------------------------------
+ >8571880-1 ---TCTCGTACcAACATGGATACAAGACACGCAgATTCTGCTT-----------------------------------------------
+ >9617396-1 ---TCTCGTACcAACATGGATACAAGACACGCcCATTCTGCTT-----------------------------------------------
+ >843368-5 ------------AACAcGGATACAAGACACGCACATTCTG--------------------------------------------------
+ >854553-5 ------------AACATGGATACAAGACACGCAC--TCTG--------------------------------------------------
+ >1522401-2 --------------CATGGATACAAGACACGCACATTCTGCTTTACTGGCAGgA------------------------------------
+ >5981234-1 --------------CATGGATACAAGACACGCACAcTCTGCTTTACTGGCAGCA------------------------------------
+ >997684-4 --------------CATGGATACAAGACACGCACATTCTGCTTTACTGGCAGCA------------------------------------
+ >1046653-4 ---------------ATGGATACAAGACACGCACATTCTGCTTTACTGGCAGgAC-----------------------------------
+ >1103730-4 ---------------ATGGATACAAGACACGCACAcTCTG--------------------------------------------------
+ >1603913-2 ---------------ATGGATACAAGAaACGCACAcTCTG--------------------------------------------------
+ >180349-12 ---------------ATGGATACAAGACACGCACATTCcGCTTTACTG------------------------------------------
+ >1985106-1 ---------------ATGGATACAAGACACGCACATTCgGCTTTACTGGCAGCcC-----------------------------------
+ >1987775-1 ---------------ATGGATACccGACACGCACATTCTGCTTTACTGcCAGCAC-----------------------------------
+ >2258725-1 ---------------ATGGATACAAGACACGCACATTCTGCTTTgCTGGCAGCAC-----------------------------------
+ >2631987-1 ---------------ATGGATACAAGACACGCACATTCTGCTTTACcGGCAGgAC-----------------------------------
+ >337206-9 ---------------ATGGATACAAGACACGCACATTCTGCTTTACTGGCAGCAC-----------------------------------
+ >4616761-1 ---------------ATGGATAgAAGACACGCACATTCTGCTTTACTGGtAGCAC-----------------------------------
+ >4756312-1 ---------------ATGGATACAAcACACGCACAcTCTG--------------------------------------------------
+ >4763682-1 ---------------ATGGATACAAGACACGCACATTCcGCTTTcCTG------------------------------------------
+ >5971268-1 ---------------ATGGATACAAGACACGCACATTCcGCTcTACTc------------------------------------------
+ >6644790-1 ---------------ATGGATACAAGACACGCACATTCTGCTTTACTcGCAGCAC-----------------------------------
+ >7112423-1 ---------------ATGGATACAAGACACGCACATTCTGCTTTACTGtCAGCAC-----------------------------------
+ >7559990-1 ---------------ATGGATACAAGACACGCACATTCTGCTTTACTGGCcGgAC-----------------------------------
+ >8007281-1 ---------------ATGGATAtAAGACACGCACAcTCTG--------------------------------------------------
+ >9150255-1 ---------------ATGGATACAcGACACGCACATTCcGCTTTcCTG------------------------------------------
+ >9180814-1 ---------------ATGGATACAAGACACGCACATTCTGCTTTACTGGCAGgcC-----------------------------------
+ >963117-4 ---------------ATGGATACAAGACACGCACATTCTGCTTTACcGGCAGCAC-----------------------------------
+ >9672073-1 ---------------ATGGATACAAGACACGCACATTCTGCTTTACTGGCAGCcC-----------------------------------
+ >971218-4 ---------------ATGGATACAAGACACGCACATcCTGCTTTACTGG-AGCACC----------------------------------
+ >10040274-1 -------------------ATACAAGACACGCACATTCTGCTTTACTGGCAGgACCACA-------------------------------
+ >1063072-4 -------------------ATACAAGACACGCACATTCTGCTTcACTGGCAGCACCACA-------------------------------
+ >1430188-2 -------------------ATACAAGACACGCACATTCTGCTTTACTGGCAGCACCACA-------------------------------
+ >5196741-1 -------------------ATACAAGACACGCACATTCTGCTTcACTGGCcGCACCACA-------------------------------
+ >6017337-1 -------------------ATACAAGACACGCACATTCTGCTTcACTGtCAGaACCcCA-------------------------------
+ >7159053-1 -------------------ATACAAGACACGCACATTCTGCTTTACTGGCAGCACCcaA-------------------------------
+ >7528336-1 -------------------ATACAAGACACGCACATTCTGCTTcACTGGCAGCAaCACA-------------------------------
+ >735584-6 --------------------------------------------------------ACAGAGTGCTACTGCAAAcAGCAGGAGCAGCAAC
+ >8551047-1 --------------------------------------------------------ACAGAGTGCTAtTGCAAAcAGCAGGAGtAGtAAC
+ >3000121-1 ------------------------------------------------------------AGTcCTACcGCAAATAGCAGcAGCAGCAAC
+ >928481-5 ------------------------------------------------------------AGTGCTACcGCAAATAGCAGGAGCAGCAAC
+ >126987-15 ----------------------------------------------------------------------CAAATAGCAGGAGCAGCAAC
+ >3122797-1 ----------------------------------------------------------------------CAAATAGCAGGcGCAGCAAC
+ >6684686-1 ----------------------------------------------------------------------CAAATAGCAGGAGCAGCAAC
+
+ Note that the column width here was set to 90 characters
+
+.. _pyCRAC: http://sandergranneman.bio.ed.ac.uk/Granneman_Lab/pyCRAC_software.html
+
+------
+
+**Parameter list**
+
+Options::
+
+ -f data.fasta
+ Type the path to the fasta file that you want to use.
+ --limit=90
+ Allows the user to set the column width of the alignment. Default=90 characters
+ -o output.fasta
+ Provide the name of your output file
+
+
+
diff -r 000000000000 -r 19b20927172d pyCRAC/pyBarcodeFilter.pl
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/pyCRAC/pyBarcodeFilter.pl Tue Jun 18 09:11:00 2013 -0400
@@ -0,0 +1,71 @@
+#!/usr/bin/perl -w
+# Wrapper around pyBarcodeFilter.py (invoked from pyBarcodeFilter.xml): runs
+# the filter, moves barcode_statistics.txt to --out and moves every per-barcode
+# output file to <output_path>/primary_<id>_<sample>-<1|2>_visible_<file type>.
+use strict;
+use Getopt::Long;
+
+my %opt;
+GetOptions(\%opt, "f=s", "b=s", "out=s", "output_path=s","id=s","m=i", "file_type=s", "both", "r=s", "version", "i");
+
+# --version only reports the tool version; stop here instead of falling
+# through to the barcode-file handling below, which needs -b and would die.
+if (exists $opt{version}){
+    my $status = system "python /usr/local/bin/pyBarcodeFilter.py --version";
+    exit($status == 0 ? 0 : 1);
+}
+
+my $cmnd = "python /usr/local/bin/pyBarcodeFilter.py -f $opt{f} -b $opt{b} -m $opt{m} --file_type $opt{file_type}";
+if(defined $opt{r}){
+    $cmnd .= " -r $opt{r}";
+    # --both is only forwarded together with a reverse read file.
+    if(exists $opt{both}){
+        $cmnd .= " --both";
+    }
+}
+
+if(exists $opt{i}){
+    $cmnd .= " -i";
+}
+
+# Directory that receives the renamed per-barcode output files.
+my $output_path = $opt{output_path};
+
+system $cmnd;
+
+# Map every barcode (column 1) to its sample name (column 2).
+open(BC,$opt{b}) || die "Cannot open barcode file";
+my %bc;
+while(my $line = <BC>){
+    chomp($line);
+    my ($barcode,$sample) = (split(/\t/,$line))[0,1];
+    # Skip blank or single-column lines; they do not name a sample.
+    next unless defined $sample;
+    $bc{$barcode}=$sample;
+}
+close BC;
+
+# List-form system() bypasses the shell, so file names reach mv verbatim.
+system("mv", "barcode_statistics.txt", $opt{out});
+
+my $ft = lc($opt{file_type});
+
+# Return the last path component of $path, cut at its first dot.
+sub _stem {
+    my ($path) = @_;
+    my $name = (split(/\//,$path))[-1];
+    return (split(/\./,$name))[0];
+}
+
+foreach my $key(keys %bc){
+    my $output = _stem($opt{f})."_".$key."_".$bc{$key}.".".$ft;
+    my $rename = "$output_path/primary_$opt{id}_$bc{$key}-1"."_visible_"."$ft";
+    system("mv", $output, $rename);
+
+    if(defined $opt{r}){
+        $output = _stem($opt{r})."_".$key."_".$bc{$key}.".".$ft;
+        $rename = "$output_path/primary_$opt{id}_$bc{$key}-2"."_visible_"."$ft";
+        system("mv", $output, $rename);
+    }
+}
+
diff -r 000000000000 -r 19b20927172d pyCRAC/pyBarcodeFilter.xml
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/pyCRAC/pyBarcodeFilter.xml Tue Jun 18 09:11:00 2013 -0400
@@ -0,0 +1,125 @@
+
+
+ pyCRAC
+
+
+ /usr/local/bin/pyBarcodeFilter.pl
+ --file_type $ftype.type
+ -f $ftype.f
+ -b $barcode
+ -m $mismatch
+ $index
+ --out $out
+ --id $out.id
+ --output_path $__new_file_path__
+ #if $ftype.reverse.rev == "yes":
+ -r=$ftype.reverse.r
+ $ftype.reverse.both
+ #end if#
+
+ pyBarcodeFilter.py --version
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+.. class:: infomark
+
+**pyBarcodeFilter**
+
+pyBarcodeFilter is part of the pyCRAC_ package. Filters sequence files by barcodes.
+
+This tool requires FASTA or FASTQ input files containing the raw data and a text file containing barcode information.
+To process paired end data, use -f and the -r flags to indicate the path to the forward and reverse sequencing reactions, respectively.
+The barcodes file should contain two columns separated by a tab (see the table below). The first column should contain the barcode nucleotide sequences.
+The second column should contain an identifier, for example, the name of the barcode or the name of the experiment.
+The 'N' in the barcode sequence indicates a random nucleotide. Make sure to use a simple text editor like TextEdit (MacOS X), gedit (Linux/Unix) or use a text editor in the terminal.
+The program is case sensitive: all the nucleotide sequences should be upper case.
+You can freely combine different barcodes, including mixing samples with random nucleotide barcodes and samples with normal barcodes.
+**NOTE!** When you do, make sure to place the regular barcode sequences below the sequences with random nucleotides, and make sure the shortest sequence is ALWAYS at the bottom of the column (see below).
+
+Example of a barcode text file::
+
+ NNNCGCTTAGC mutant2
+ NNNGCGCAGC mutant1
+ NNNATTAG control
+ NNNTAAGC myfavprotein
+ AGC oldcontrol
+ AC veryfirstbarcodedsample
+
+.. _pyCRAC: http://sandergranneman.bio.ed.ac.uk/Granneman_Lab/pyCRAC_software.html
+
+------
+
+**Parameter list**
+
+Options::
+
+ -f FILE, --input_file=FILE
+ name of the FASTQ or FASTA input file
+ -r FILE, --reverse_input_file=FILE
+ name of the paired (or reverse) FASTQ or FASTA input file
+ --file_type=FASTQ
+ type of file, uncompressed (fasta or fastq) or compressed (fasta.gz or fastq.gz, gzip/gunzip
+ compressed). Default is fastq
+ -b FILE, --barcode_list=FILE
+ name of tab-delimited file containing barcodes and barcode names
+ -m 1, --mismatches=1
+ to set the number of allowed mismatches in a barcode. A maximum of one mismatch is allowed. Default = 0
+ -i, --index
+ use this option if you want to split the data using the Illumina indexing barcode information
+
+
+
diff -r 000000000000 -r 19b20927172d pyCRAC/pyBinCollector.pl
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/pyCRAC/pyBinCollector.pl Tue Jun 18 09:11:00 2013 -0400
@@ -0,0 +1,47 @@
+#!/usr/bin/perl -w
+# Wrapper around pyBinCollector.py: builds the command line from the wrapper
+# options, runs it, then moves the result file(s) to the paths given by the
+# caller.
+use strict;
+use Getopt::Long;
+
+# Defaults for -s and --numberofbins; both also appear in the cumulative output file name.
+my %opt=(s=>"genomic",numberofbins=>20);
+
+GetOptions(\%opt, "f=s","version","gtf=s","range=i","annotation=s", "numberofbins=i","min_length=i","max_length=i","s=s","o=s","ignorestrand","outputall","sd=s","ssub=s","sdel=s","asd=s","assub=s","asdel=s","out=s","options","bins1=i","bins2=i","id=s");
+
+# Output prefix handed to pyBinCollector.py through -o.
+my $prefix = "bc_$opt{id}";
+my $per_gene = exists $opt{outputall};
+
+my $cmnd = "python /usr/local/bin/pyBinCollector.py --version";
+unless (exists $opt{version}){
+    $cmnd = "python /usr/local/bin/pyBinCollector.py -f $opt{f} --gtf $opt{gtf} --annotation $opt{annotation} -o $prefix";
+    $cmnd .= " --outputall" if $per_gene;
+
+    # Tuning flags are forwarded only when the wrapper was given --options.
+    if(exists $opt{options}){
+        $cmnd .= " --range=$opt{range} --numberofbins $opt{numberofbins} --min_length $opt{min_length} --max_length $opt{max_length} -s $opt{s}";
+        $cmnd .= " --ignorestrand" if exists $opt{ignorestrand};
+        $cmnd .= " --binselect $opt{bins1} $opt{bins2}" if exists $opt{bins1};
+    }
+}
+
+system $cmnd;
+
+if($per_gene){
+    # Each pair is (output file stem, option key holding the destination path).
+    my @moves = (
+        ["sense_data", "sd"], ["sense_subs", "ssub"],
+        ["sense_dels", "sdel"], ["anti_sense_data", "asd"],
+        ["anti_sense_subs", "assub"], ["anti_sense_dels", "asdel"],
+    );
+    foreach my $move (@moves){
+        my ($stem, $key) = @$move;
+        system "mv $stem"."_$prefix.txt $opt{$key}";
+    }
+}
+else{
+    # Single pileup file named after the prefix, annotation, sequence type and bin count.
+    system "mv $prefix"."_cumulative_densities_$opt{annotation}"."_$opt{s}_"."$opt{numberofbins}_bins.pileup $opt{out}";
+}
diff -r 000000000000 -r 19b20927172d pyCRAC/pyBinCollector.xml
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/pyCRAC/pyBinCollector.xml Tue Jun 18 09:11:00 2013 -0400
@@ -0,0 +1,290 @@
+
+
+ pyCRAC
+
+
+ pyBinCollector.pl
+ -f $input
+ --gtf $addGTF.gtf
+ #if $addGTF.gtfFile == "default" and $addGTF.annotate.annotations == "auto":
+ --annotation $addGTF.annotate.scan.annotation
+ #else:
+ --annotation $addGTF.annotate.annotation
+ #end if#
+ #if $addOpt.options == "edit":
+ --options
+ --range $addOpt.range
+ --min_length $addOpt.min_length
+ --max_length $addOpt.max_length
+ --numberofbins $addOpt.numberofbins
+ -s $addOpt.sequence
+ #if $addOpt.limitBins.binselect == "yes":
+ --bins1 $addOpt.limitBins.bs_first
+ --bins2 $addOpt.limitBins.bs_last
+ #end if#
+ $addOpt.ignore
+ $addOpt.oall.outputall
+ #end if#
+ -o "$input.name"
+ #if $addOpt.options == "edit" and $addOpt.oall.outputall == "--outputall":
+ --id $sd.id
+ --sd $sd
+ --ssub $ssub
+ --sdel $sdel
+ --asd $asd
+ --assub $assub
+ --asdel $asdel
+ #else:
+ --out $out
+ --id $out.id
+ #end if#
+
+ /usr/local/bin/pyBinCollector.py --version
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ addOpt['oall']['outputall'] == ""
+
+
+ addOpt['options'] == "edit" and addOpt['oall']['outputall'] == "--outputall"
+
+
+ addOpt['options'] == "edit" and addOpt['oall']['outputall'] == "--outputall"
+
+
+ addOpt['options'] == "edit" and addOpt['oall']['outputall'] == "--outputall"
+
+
+ addOpt['options'] == "edit" and addOpt['oall']['outputall'] == "--outputall"
+
+
+ addOpt['options'] == "edit" and addOpt['oall']['outputall'] == "--outputall"
+
+
+ addOpt['options'] == "edit" and addOpt['oall']['outputall'] == "--outputall"
+
+
+
+
+
+.. class:: infomark
+
+**pyBinCollector**
+
+pyBinCollector is part of the pyCRAC_ package. Allows the user to generate genome-wide coverage plots. Normalises gene lengths by dividing genes into a
+fixed number of bins and then calculates the hit density in each bin. The program also allows the user to input specific bin numbers to extract
+blocks/clusters present in these bins.
+
+
+.. _pyCRAC: http://sandergranneman.bio.ed.ac.uk/Granneman_Lab/pyCRAC_software.html
+
+------
+
+**Parameter list**
+
+File input options::
+
+ -f FILE, --input_file=FILE
+ Provide the path and name of the pyReadCounters.py or
+ pyMotif.py GTF file. By default the program expects
+ data from the standard input.
+ -o OUTPUT_FILE, --output_file=OUTPUT_FILE
+ To set an output file name. Do not add a file
+ extension. By default, if the --outputall flag is not
+ used, the program writes to the standard output.
+ --gtf=yeast.gtf
+ type the path to the gtf annotation file that you want
+ to use. Default is /usr/local/pyCRAC/db/Saccharomyces_
+ cerevisiae.EF2.59.1.2.gtf
+
+pyBinCollector.py specific options::
+
+ -a protein_coding, --annotation=protein_coding
+ select which annotation (i.e. protein_coding, ncRNA,
+ sRNA, rRNA, tRNA, snoRNA, all) you would like to focus
+ your search on. Default = all
+ --min_length=20
+ to set a minimum length threshold for genes. Genes
+ shorter than the minimal length will be discarded.
+ Default = 1
+ --max_length=10000
+ to set a maximum length threshold for genes. Genes
+ larger than the maximum length will be discarded.
+ Default = 100000000
+ -n 20, --numberofbins=20
+ select the number of bins you want to generate.
+ Default=20
+ --binselect=2 4
+ allows selection of sequences that were mapped to
+ specific bins. This option expects two numbers, one
+ for each bin, separated by a space. For example:
+ --binselect 20 30.
+ --outputall
+ use this flag to output the normalized distribution
+ for each individual gene, rather than making a
+ cumulative coverage plot. Useful for making box plots
+ or for making heat maps.
+
+Common options::
+
+ -r 100, --range=100
+ allows you to set the length of the UTR regions. If
+ you set '-r 50' or '--range=50', then the program will
+ set a fixed length (50 bp) regardless of whether the
+ GTF file has genes with annotated UTRs.
+ -s intron, --sequence=intron
+ with this option you can select whether you want to
+ generate bins from the coding or genomic sequence or
+ introns,exon,CDS, or UTR coordinates. Default =
+ genomic
+ --ignorestrand
+ To ignore strand information and all reads overlapping
+ with genomic features will be considered sense reads.
+ Useful for analysing ChIP or RIP data
+
+
+
+
+
+
+
diff -r 000000000000 -r 19b20927172d pyCRAC/pyCalculateChromosomeLengths.xml
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/pyCRAC/pyCalculateChromosomeLengths.xml Tue Jun 18 09:11:00 2013 -0400
@@ -0,0 +1,53 @@
+
+
+ pyCRAC
+
+
+ /usr/local/bin/pyCalculateChromosomeLengths.py
+ -f $ftype.input
+ --file_type $ftype.filetype
+ -o $output
+ /usr/local/bin/pyCalculateChromosomeLengths.py --version
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+.. class:: infomark
+
+**pyCalculateChromosomeLengths**
+
+pyCalculateChromosomeLengths is part of the pyCRAC_ package. Takes a genome sequence in fasta or tab format and generates a tab-delimited file showing chromosome name and chromosome length.
+
+.. _pyCRAC: http://sandergranneman.bio.ed.ac.uk/Granneman_Lab/pyCRAC_software.html
+
+-------
+
+**Parameter list**
+
+Options::
+
+ -f chromosomes.fasta, --input_file=chromosomes.fasta
+ provide the name and path of your fasta or tab genomic
+ sequence file. Default is standard input.
+ --file_type=fasta
+ provide the file type (fasta or tab). Default is fasta
+
+
+
+
diff -r 000000000000 -r 19b20927172d pyCRAC/pyCalculateFDRs.xml
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/pyCRAC/pyCalculateFDRs.xml Tue Jun 18 09:11:00 2013 -0400
@@ -0,0 +1,247 @@
+
+
+ pyCRAC
+
+
+ /usr/local/bin/pyCalculateFDRs.py
+ -f $ftype.input
+ --file_type $ftype.file_type
+ --gtf=$addGTF.gtf
+
+ #if $addGTF.annotate.annotations != "all":
+ #if $addGTF.gtfFile == "default" and $addGTF.annotate.annotations == "auto":
+ --annotation $addGTF.annotate.scan.annotation
+ #else:
+ --annotation $addGTF.annotate.annotation
+ #end if#
+ #end if#
+ --chromfile=$addChr.chr
+ #if $addOpt.options == "edit"
+ -s $addOpt.sequence
+ --min $addOpt.min
+ --minfdr $addOpt.minfdr
+ --iterations=$addOpt.iterations
+ --range $addOpt.range
+ #end if#
+ -o $output
+
+
+ /usr/local/bin/pyCalculateFDRs.py --version
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+.. class:: infomark
+
+**pyCalculateFDRs**
+
+By default the FDR value is set to 0.05, meaning that there is a 5% chance that the interval is not significantly enriched.
+The tool reports significant intervals in the GTF format and reports overlapping genomic features.
+Mutation frequencies are not included but these can be added using the pyCalculateMutationFrequencies tool
+
+**NOTE!** By default it calls each significant interval an "exon" but this has no meaning! It may overlap with an intron.
+Use bedtools to extract those intervals that overlap with introns or other features
+
+Example of an output file::
+
+ ##gff-version 2
+ # generated by pyCalculateFDRs version 0.0.3, Sat Jun 1 21:16:23 2013
+ # pyCalculateFDRs.py -f test_count_output_reads.gtf -r 200 -o test_count_output_FDRs_005.gtf -v -m 0.05
+ # chromosome feature source start end minimal_coverage strand . attributes
+ chrI protein_coding exon 140846 140860 5 - . gene_id "YAL005C"; gene_name "SSA1";
+ chrI intergenic_region exon 223118 223164 4 - . gene_id "INT_0_179"; gene_name "INT_0_179";
+ chrI intergenic_region exon 71889 71922 3 + . gene_id "INT_0_94"; gene_name "INT_0_94";
+ chrII intergenic_region exon 296127 296158 3 - . gene_id "INT_0_365"; gene_name "INT_0_365";
+ chrII intergenic_region exon 680697 680722 4 - . gene_id "INT_0_626"; gene_name "INT_0_626";
+ chrII intergenic_region exon 680827 680846 4 - . gene_id "INT_0_626"; gene_name "INT_0_626";
+ chrII snRNA exon 680827 680838 5 - . gene_id "LSR1"; gene_name "LSR1";
+ chrII snRNA exon 680951 681001 5 - . gene_id "LSR1"; gene_name "LSR1";
+ chrII intergenic_region exon 577985 577996 3 - . gene_id "INT_0_556"; gene_name "INT_0_556";
+ chrII protein_coding exon 203838 203887 3 + . gene_id "YBL011W"; gene_name "SCT1";
+ chrII protein_coding exon 296127 296158 3 - . gene_id "YBR028C"; gene_name "YBR028C";
+
+
+pyCalculateFDRs is part of the pyCRAC_ package. Takes interval information in GTF or bed format and calculates False Discovery Rates (FDRs).
+
+
+.. _pyCRAC: http://sandergranneman.bio.ed.ac.uk/Granneman_Lab/pyCRAC_software.html
+
+------
+
+**Parameter list**
+
+Options::
+
+ -f read_file, --readdatafile=read_file
+ Name of the bed/gff/gtf file containing the read/cDNA
+ coordinates
+ --file_type=FILE_TYPE
+ this tool supports bed6, gtf and gff input files.
+ Please select from 'bed','gtf' or 'gff'. Default=gtf
+ -o outfile.gtf, --outfile=outfile.gtf
+ Optional. Provide the name of the output file. Default
+ is 'selected_intervals.gtf'
+ -r 100, --range=100
+ allows you to set the length of the UTR regions. If
+ you set '-r 50' or '--range=50', then the program will
+ set a fixed length (50 bp) regardless of whether the
+ GTF file has genes with annotated UTRs.
+ -a protein_coding, --annotation=protein_coding
+ select which annotation (i.e. protein_coding, ncRNA,
+ sRNA, rRNA,snoRNA,snRNA, depending on the source of
+ your GTF file) you would like to focus your analysis
+ on. Default = all annotations
+ -c yeast.txt, --chromfile=yeast.txt
+ Location of the chromosome info file. This file should
+ have two columns: first column is the names of the
+ chromosomes, second column is length of the
+ chromosomes. Default is yeast
+ --gtf=yeast.gtf
+ Name of the annotation file. Default is /usr/local/pyC
+ RAC/db/Saccharomyces_cerevisiae.EF2.59.1.2.gtf
+ -m MINFDR, --minfdr=MINFDR
+ To set a minimal FDR threshold for filtering interval
+ data. Default is 0.05
+ --min=MIN
+ to set a minimal read coverage for a region.
+ Regions with coverage less than the minimum will be
+ ignored
+ --iterations=ITERATIONS
+ to set the number of iterations for randomization of
+ read coordinates. Default=100
+
+
diff -r 000000000000 -r 19b20927172d pyCRAC/pyCalculateMutationFrequencies.xml
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/pyCRAC/pyCalculateMutationFrequencies.xml Tue Jun 18 09:11:00 2013 -0400
@@ -0,0 +1,126 @@
+
+
+ pyCRAC
+
+
+ /usr/local/bin/pyCalculateMutationFrequencies.py
+ -r $readdatafile
+ -i $intervaldatafile
+ -c $addChr.chr
+ -o $output
+ --mutsfreq $mutsfreq
+
+ /usr/local/bin/pyCalculateMutationFrequencies.py --version
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+.. class:: infomark
+
+**pyCalculateMutationFrequencies**
+
+pyCalculateMutationFrequencies is part of the pyCRAC_ package. Takes an interval file and a pyReadCounters GTF file and calculates (cross-linking induced) mutation frequencies for each interval.
+This tool can be used to calculate mutation frequencies for significant intervals (pyCalculateFDRs output file) or over-represented motifs (pyMotif GTF output file).
+It expects a pyCRAC GTF count_output_reads.gtf file and a GTF file with the intervals.
+
+For example::
+
+ This pyCalculateFDRs GTF output file::
+
+ ##gff-version 2
+ # generated by pyCalculateFDRs version 0.0.3, Sat Jun 1 21:16:23 2013
+ # pyCalculateFDRs.py -f test_count_output_reads.gtf -r 200 -o test_count_output_FDRs_005.gtf -v -m 0.05
+ # chromosome feature source start end minimal_coverage strand . attributes
+ chrII protein_coding exon 203838 203887 3 + . gene_id "YBL011W"; gene_name "SCT1";
+ chrII intergenic_region exon 407669 407708 3 + . gene_id "INT_0_445"; gene_name "INT_0_445";
+ chrII intergenic_region exon 585158 585195 2 + . gene_id "INT_0_562"; gene_name "INT_0_562";
+ chrII protein_coding exon 372390 372433 4 - . gene_id "YBR067C"; gene_name "TIP1";
+ chrII intergenic_region exon 380754 380815 6 - . gene_id "INT_0_431"; gene_name "INT_0_431";
+ chrIII protein_coding exon 138001 138044 5 + . gene_id "YCR012W"; gene_name "PGK1";
+ chrIII intergenic_region exon 227997 228036 5 + . gene_id "INT_0_885"; gene_name "INT_0_885";
+ chrIII intergenic_region exon 227997 228037 4 + . gene_id "INT_0_887"; gene_name "INT_0_887";
+ chrIII tRNA exon 227997 228037 4 + . gene_id "tS(CGA)C"; gene_name "SUP61";
+
+ Will be converted into::
+
+ ##gff-version 2
+ # generated by pyCalculateFDRs version 0.0.3, Sat Jun 1 21:16:23 2013
+ # /Library/Frameworks/EPD64.framework/Versions/Current/bin/pyCalculateFDRs.py -f test_count_output_reads.gtf -r 200 -o test_count_output_FDRs_005.gtf -v -m 0.05
+ # chromosome feature source start end minimal_coverage strand . attributes
+ chrII protein_coding exon 203838 203887 3 + . gene_id "YBL011W"; gene_name "SCT1"; # 203882D33.3,203883D33.3,203884D33.3;
+ chrII intergenic_region exon 407669 407708 3 + . gene_id "INT_0_445"; gene_name "INT_0_445"; # 407680D33.3,407681D33.3;
+ chrII intergenic_region exon 585158 585195 2 + . gene_id "INT_0_562"; gene_name "INT_0_562"; # 585171D100.0,585172D100.0,585173D100.0;
+ chrII protein_coding exon 372390 372433 4 - . gene_id "YBR067C"; gene_name "TIP1"; # 372412D50.0,372413D50.0;
+ chrII intergenic_region exon 380754 380815 6 - . gene_id "INT_0_431"; gene_name "INT_0_431"; # 380786D90.2,380787D90.2;
+ chrIII protein_coding exon 138001 138044 5 + . gene_id "YCR012W"; gene_name "PGK1"; # 138025D40.0,138026D30.0,138027D40.0;
+ chrIII intergenic_region exon 227997 228036 5 + . gene_id "INT_0_885"; gene_name "INT_0_885"; # 228006D85.7,228007D100.0;
+ chrIII intergenic_region exon 227997 228037 4 + . gene_id "INT_0_887"; gene_name "INT_0_887"; # 228006D85.7,228007D100.0;
+ chrIII tRNA exon 227997 228037 4 + . gene_id "tS(CGA)C"; gene_name "SUP61"; # 228006D85.7,228007D100.0;
+
+
+The hash character at the end of each line (#) shows chromosomal coordinates of mutated nucleotides within the cluster interval and their mutation frequencies.
+
+For example::
+
+ # 228007D100.0
+
+indicates that 100% of the nucleotides in position 228007 were deleted in the interval.
+
+By setting the --mutsfreq flag you can set a limit for the lowest mutation frequency that you want to have reported.
+This makes it relatively easy to select those significant regions that have nucleotides with high mutation frequencies.
+
+.. _pyCRAC: http://sandergranneman.bio.ed.ac.uk/Granneman_Lab/pyCRAC_software.html
+
+------
+
+**Parameter list**
+
+Options::
+
+ -i intervals.gtf, --intervaldatafile=intervals.gtf
+ provide the path to your GTF interval data file.
+ -r reads.gtf, --readdatafile=reads.gtf
+ provide the path to your GTF read data file.
+ -c yeast.txt, --chromfile=yeast.txt
+ Location of the chromosome info file. This file should
+ have two columns: first column is the names of the
+ chromosomes, second column is length of the
+ chromosomes. Default is yeast
+ -o intervals_with_muts.gtf, --output_file=intervals_with_muts.gtf
+ provide a name for an output file. By default it
+ writes to the standard output
+ --mutsfreq=10, --mutationfrequency=10
+ sets the minimal mutation frequency for an interval
+ that you want to have written to your output file.
+ Default = 0%. Example: if the mutation frequency is set at
+ 10 and an interval position is mutated in less than
+ 10% of the reads, then the mutation will not be
+ reported.
+
+
+
+
\ No newline at end of file
diff -r 000000000000 -r 19b20927172d pyCRAC/pyCheckGTFfile.xml
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/pyCRAC/pyCheckGTFfile.xml Tue Jun 18 09:11:00 2013 -0400
@@ -0,0 +1,53 @@
+
+
+ pyCRAC
+
+
+ /usr/local/bin/pyCheckGTFfile.py --gtf $addGTF.gtf -o $out
+
+ /usr/local/bin/pyCheckGTFfile.py --version
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+.. class:: infomark
+
+**pyCheckGTFfile**
+
+pyCheckGTFfile is part of the pyCRAC_ package. Renames duplicated gene names in your GTF annotation file.
+
+.. _pyCRAC: http://sandergranneman.bio.ed.ac.uk/Granneman_Lab/pyCRAC_software.html
+
+------
+
+**Parameter list**
+
+Options::
+
+ --gtf=gtf input file
+ type the path to the gtf file that you want to use.
+ -o FILE, --output=FILE
+ Optional. Specify the name of the output file. Default
+ is standard output. Make sure it has the .gtf
+ extension!
+
+
+
diff -r 000000000000 -r 19b20927172d pyCRAC/pyClusterReads.pl
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/pyCRAC/pyClusterReads.pl Tue Jun 18 09:11:00 2013 -0400
@@ -0,0 +1,10 @@
+#!/usr/bin/perl -w
+# Echoes its command-line arguments on one line; Getopt::Long is loaded but no options are parsed.
+use strict;
+use Getopt::Long;
+
+# The newline is joined as the final list element, so a non-empty argument
+# list is followed by a single space before the line break.
+my @echoed = (@ARGV, "\n");
+print join(" ", @echoed);
+
diff -r 000000000000 -r 19b20927172d pyCRAC/pyClusterReads.xml
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/pyCRAC/pyClusterReads.xml Tue Jun 18 09:11:00 2013 -0400
@@ -0,0 +1,230 @@
+
+
+ pyCRAC
+
+
+ /usr/local/bin/pyClusterReads.py
+ -f $input
+ --gtf=$addGTF.gtf
+ #if $addGTF.annotate.annotations != "all":
+ #if $addGTF.gtfFile == "default" and $addGTF.annotate.annotations == "auto":
+ --annotation=$addGTF.annotate.scan.annotation
+ #else:
+ --annotation=$addGTF.annotate.annotation
+ #end if#
+ #end if#
+ -o $output
+ #if $addOpt.options == "edit":
+ --range=$addOpt.range
+ --cic=$addOpt.cic
+ --co=$addOpt.co
+ --ch=$addOpt.ch
+ --cl=$addOpt.cl
+ --mutsfreq=$addOpt.mutsfreq
+ #end if#
+
+ /usr/local/bin/pyClusterReads.py --version
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+.. class:: infomark
+
+**pyClusterReads**
+
+pyClusterReads is part of the pyCRAC_ package. Takes a reads_count_output GTF file from pyReadCounters and generates clusters from the interval coordinates.
+Produces a GTF output file with cluster intervals and overlapping genomic features.
+It also includes mutation frequencies (after the # character) for nucleotides in intervals using chromosomal coordinates.
+The pyClusterReads GTF output file essentially has the same layout as other pyCRAC GTF output files.
+
+**NOTE!** By default it calls each cluster an "exon" but this has no meaning. It may overlap with an intron.
+Use bedtools to extract those intervals that overlap with introns or other features.
+
+The maximum height of the cluster is indicated in column 8.
+The hash character at the end of each line (#) shows chromosomal coordinates of mutated nucleotides within the cluster interval and their mutation frequencies.
+
+For example::
+
+ # 114099S100.0
+
+indicates that 100% of the nucleotides in position 114099 were substituted in the cluster.
+
+An example of a pyClusterReads output file::
+
+ ##gff-version 2
+ # generated by pyClusterReads.py version 0.0.1, Fri Jan 18 11:59:42 2013
+ # pyClusterReads.py -f count_output_reads.gtf -o count_output_clusters.gtf -v
+ # chromosome feature source start end cDNAs strand height attributes
+ chrI cluster exon 112583 112643 6 - 5 gene_id "INT_0_114,YAL021C"; gene_name "INT_0_114,CCR4"; # 112612S75.0;
+ chrI cluster exon 113176 113232 3 - 3 gene_id "INT_0_114,YAL021C"; gene_name "INT_0_114,CCR4"; # 113184S100.0;
+ chrI cluster exon 113334 113386 2 - 2 gene_id "INT_0_114,YAL021C"; gene_name "INT_0_114,CCR4"; # 113349S50.0,113379S100.0;
+ chrI cluster exon 113534 113564 3 - 3 gene_id "INT_0_119,INT_0_114"; gene_name "INT_0_119,INT_0_114"; # 113554S33.3,113556S33.3,113557S33.3;
+ chrI cluster exon 113644 113691 5 - 4 gene_id "YAL020C,INT_0_114"; gene_name "ATS1,INT_0_114"; # 113649S50.0,113657S33.3,113679S25.0
+ chrI cluster exon 113912 113958 2 - 2 gene_id "YAL020C,INT_0_114"; gene_name "ATS1,INT_0_114"; # 113932S50.0,113946S50.0;
+ chrI cluster exon 113966 114066 5 - 3 gene_id "YAL020C,INT_0_114"; gene_name "ATS1,INT_0_114"; # 113987S50.0,114033S33.3,114039S33.3;
+ chrI cluster exon 114067 114130 3 - 3 gene_id "YAL020C,INT_0_114"; gene_name "ATS1,INT_0_114"; # 114099S100.0;
+
+.. _pyCRAC: http://sandergranneman.bio.ed.ac.uk/Granneman_Lab/pyCRAC_software.html
+
+------
+
+**Parameter list**
+
+
+File input options::
+
+ -f reads.gtf, --input_file=reads.gtf
+ provide the path to your GTF read data file. NOTE the
+ file has to be correctly sorted! If you used
+ pyReadCounters to generate the file you should be
+ fine. If you modified it, use the sort command
+ described in the manual to sort your file first by
+ chromosome, then by strand and then by start position.
+ -o clusters.gtf, --output_file=clusters.gtf
+ provide a name for an output file. By default it
+ writes to the standard output
+ --gtf=Yourfavoritegtf.gtf
+ type the path to the gtf annotation file that you want
+ to use
+
+Common pyCRAC options::
+
+ -r 100, --range=100
+ allows you to set the length of the UTR regions. If
+ you set '-r 50' or '--range=50', then the program will
+ set a fixed length (50 bp) regardless of whether the
+ GTF annotation file has genes with annotated UTRs.
+ -a protein_coding, --annotation=protein_coding
+ select which annotation (i.e. protein_coding, ncRNA,
+ sRNA, rRNA,snoRNA,snRNA, depending on the source of
+ your GTF file) you would like to focus your analysis
+ on. Default = all annotations
+
+Options for cluster analysis::
+
+ --cic=2, --cdnasinclusters=2
+ sets the minimal number of overlapping cDNAs in each
+ cluster. Default = 2
+ --co=5, --clusteroverlap=5
+ sets the number of nucleotides cDNA sequences have to
+ overlap to form a cluster. Default = 1 nucleotide
+ --ch=5, --clusterheight=5
+ sets the minimal height of the cluster. Default = 2
+ nucleotides
+ --cl=100, --clusterlength=100
+ to set the maximum cluster sequence length
+ --mutsfreq=10, --mutationfrequency=10
+ sets the minimal mutations frequency for a cluster
+ position in the GTF output file. Default = 0%.
+ Example: if the mutsfrequency is set at 10 and a
+ cluster position is mutated in less than 10% of the
+ reads, then the mutation will not be reported.
+
+
\ No newline at end of file
diff -r 000000000000 -r 19b20927172d pyCRAC/pyExtractLinesFromGTF.xml
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/pyCRAC/pyExtractLinesFromGTF.xml Tue Jun 18 09:11:00 2013 -0400
@@ -0,0 +1,77 @@
+
+
+ pyCRAC
+
+
+ /usr/local/bin/pyExtractLinesFromGTF.py --gtf $addGTF.gtf --genes_file $g --attribute $attribute $v -o $out
+
+ /usr/local/bin/pyExtractLinesFromGTF.py --version
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+.. class:: infomark
+
+**pyExtractLinesFromGTF**
+
+pyExtractLinesFromGTF is part of the pyCRAC_ package. Extracts lines from a GTF file that contain gene names of interest.
+
+
+.. _pyCRAC: http://sandergranneman.bio.ed.ac.uk/Granneman_Lab/pyCRAC_software.html
+
+------
+
+**Parameter list**
+
+Options::
+
+ --gtf=Yourfavoritegtf.gtf
+ type the path to the gtf file that you want to use. By
+ default it expects data from the standard input.
+ -g FILE, --genes_file=FILE
+ name of your gene list or annotations list file (1
+ column)
+ -o OUTFILE, --outfile=OUTFILE
+ type the name and path of the file you want to write
+ the output to. Default is standard output
+ -a ATTRIBUTE, --attribute=ATTRIBUTE
+ from which attribute do you want to extract names?
+ Choices: gene_name, gene_id, transcript_name,
+ transcript_id
+ -v
+ similar to grep -v option. Remove the genes from the
+ GTF that are in the gene list
+
+
+
+
+
diff -r 000000000000 -r 19b20927172d pyCRAC/pyFasta2tab.xml
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/pyCRAC/pyFasta2tab.xml Tue Jun 18 09:11:00 2013 -0400
@@ -0,0 +1,51 @@
+
+ converter
+
+ pyCRAC
+
+ /usr/local/bin/pyFasta2tab.py -f $input -o $output
+
+ /usr/local/bin/pyFasta2tab.py --version
+
+
+
+
+
+
+
+
+
+.. class:: infomark
+
+**pyFasta2Tab**
+
+pyFasta2Tab is part of the pyCRAC_ package. Converts fasta to tabular format. Is used to convert your reference sequences in fasta format to the tabular format that pyCRAC uses for almost all tools.
+
+Example::
+
+ >sequence1
+ ATAGGATACATAACCATATTATGAGACC
+
+Is converted into::
+
+ sequence1 ATAGGATACATAACCATATTATGAGACC
+
+The pyCRAC package lo
+
+.. _pyCRAC: http://sandergranneman.bio.ed.ac.uk/Granneman_Lab/pyCRAC_software.html
+
+-------
+
+**Parameter list**
+
+Options::
+
+ -f fasta_file, --input_file=fasta_file
+ provide the name and path of your fasta input file.
+ Default is standard input.
+
+
+
+
+
+
diff -r 000000000000 -r 19b20927172d pyCRAC/pyFastqDuplicateRemover.pl
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/pyCRAC/pyFastqDuplicateRemover.pl Tue Jun 18 09:11:00 2013 -0400
@@ -0,0 +1,34 @@
+#!/usr/bin/perl -w
+# Galaxy wrapper for pyFastqDuplicateRemover.py: runs the tool with a temporary
+# output name (--id) and then moves the result(s) onto the -o/--out2 paths.
+use strict;
+use Getopt::Long;
+
+my %opt;
+GetOptions(\%opt, "f=s", "r=s", "o=s", "out2=s", "version","id=s");
+
+my @tool = ("python", "/usr/local/bin/pyFastqDuplicateRemover.py");
+
+# Run a command without a shell (arguments are passed verbatim); stop on failure.
+sub run { system(@_) == 0 or die "command failed (status $?): @_\n"; }
+
+# --version only reports the tool version; there are no output files to move.
+if (exists $opt{version}){
+ run(@tool, "--version");
+ exit 0;
+}
+for my $required (qw(f o id)){
+ die "missing required option: $required\n" unless defined $opt{$required};
+}
+
+if(defined $opt{r}){
+ # Paired-end run: results are expected in <id>_1.fasta and <id>_2.fasta.
+ die "--out2 is required together with -r\n" unless defined $opt{out2};
+ run(@tool, "-f", $opt{f}, "-o", $opt{id}, "-r", $opt{r});
+ run("mv", "$opt{id}_1.fasta", $opt{o});
+ run("mv", "$opt{id}_2.fasta", $opt{out2});
+}
+else{
+ run(@tool, "-f", $opt{f}, "-o", $opt{id});
+ run("mv", $opt{id}, $opt{o});
+}
diff -r 000000000000 -r 19b20927172d pyCRAC/pyFastqDuplicateRemover.xml
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/pyCRAC/pyFastqDuplicateRemover.xml Tue Jun 18 09:11:00 2013 -0400
@@ -0,0 +1,117 @@
+
+
+ pyCRAC
+
+
+ pyFastqDuplicateRemover.pl
+ -f $ftype.f
+ #if $ftype.reverse.rev == "yes":
+ -r=$ftype.reverse.r
+ --out2 $out2
+ #end if#
+ -o $out
+ --id $out.id
+
+ pyFastqDuplicateRemover.py --version
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ ftype['reverse']['rev'] == "yes"
+
+
+
+
+.. class:: infomark
+
+**pyFastqDuplicateRemover**
+
+pyFastqDuplicateRemover is part of the pyCRAC_ package. Removes identical sequences from fastq and fasta files and returns a fasta file with collapsed data.
+
+Can also process paired-end data.
+
+**Examples**
+
+Unprocessed fastq data with six random nucleotides at 5' end of the read::
+
+ @FCC102EACXX:3:1101:3231:2110#TGACCAAT/1
+ GCGCCTGCCAATTCCATCGTAATGATTAATAGGGACGGTCGGGGGCATC
+ +
+ bb_ceeeegggggiiiiiifghiihiihiiiiiiiiiifggfhiecccc
+
+After pyBarcodeFilter::
+
+ @FCC102EACXX:3:1101:3231:2110#TGACCAAT/1##GCGCCT
+ TCCATCGTAATGATTAATAGGGACGGTCGGGGGCATC
+ +
+ giiiiiifghiihiihiiiiiiiiiifggfhiecccc
+
+ This entry is printed to the NNNNNNGCCAAT barcode file.
+
+After pyFastqDuplicateRemover::
+
+ >1_GCGCCT_5/1
+ TCCATCGTAATGATTAATAGGGACGGTCGGGGGCATC
+
+ The '1' indicates that this is the first unique cDNA in the data
+ GCGCCT is the random barcode sequence
+ the '5' indicates that 5 reads were found with identical read and random barcode sequences
+ the '/1' indicates that the sequence originates from the forward sequencing reaction
+
+.. _pyCRAC: http://sandergranneman.bio.ed.ac.uk/Granneman_Lab/pyCRAC_software.html
+
+------
+
+**Parameter list**
+
+Options::
+
+ -f FILE, --input_file=FILE
+ name of the FASTQ or FASTA input file
+
+ -r FILE, --reverse_input_file=FILE
+ name of the paired (or reverse) FASTQ or FASTA input file
+
+ -o FILE, --output_file=FILE
+ Provide the path and name of the fastq or fasta output file. Default is standard output.
+ For paired-end data just provide a file name without file extension (!)
+
+
+
+
diff -r 000000000000 -r 19b20927172d pyCRAC/pyFastqJoiner.xml
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/pyCRAC/pyFastqJoiner.xml Tue Jun 18 09:11:00 2013 -0400
@@ -0,0 +1,139 @@
+
+
+ pyCRAC
+
+
+ /usr/local/bin/pyFastqJoiner.py
+ -f $ftype.f1 $ftype.f2
+ -o $out
+ --file_type=$ftype.type
+ #if $joinc.ch == "-c":
+ -c $joinc.c
+ #end if#
+
+ /usr/local/bin/pyFastqJoiner.py --version
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+.. class:: infomark
+
+**pyFastqJoiner**
+
+pyFastqJoiner is part of the pyCRAC_ package. Merges paired sequences from two fastq or fasta formatted files.
+
+Example::
+
+ Forward reaction:
+
+ @FCC102EACXX:3:1101:1343:2181#ATCACGAT/1##CAATAG
+ CAAATTAGAGTGTTCAAAGCAGGCGTATTGCTCGAAT
+ +
+ `efhYb][bdQQ`eeaeaYbeY^ceU__IXa[^ZYae
+ @FCC102EACXX:3:1101:1424:2248#ATCACGAT/1##CCAGGA
+ CTAACCATAAACTATGCCTACTAGGGATCCAGAGGTG
+ +
+ ^_adddhJbaehbedd`dIb_^cXaRI^BBBBBBBBB
+ @FCC102EACXX:3:1101:1623:2036#ATCACGAN/1##CTCAGC
+ CAAAGTTAGGGGATCGAAGATGATCAGATACCGTCGT
+ +
+ bghfc^YbgbeadggfdffeaS^ac_X^cegaGZ_ef
+ @FCC102EACXX:3:1101:1574:2214#ATCACGAT/1##CGTTTT
+ CTAATGACCCACTCGGCACCTTACGAAATCAAAGTCT
+ +
+ cdfgYY`cefhhZef\eaggXaceeghfQaeghWNW\
+
+ Reverse reaction:
+
+ @FCC102EACXX:3:1101:1343:2181#ATCACGAT/2
+ AGCCTTTAAGTTTCAGCCTTGCGACCATACTCCCCCCAGAACCCAAAGA
+ +
+ YJaSJ`Z`K`YbSb[[daeJRR[YeWd_I^I^ecgc]OV\bdeaegbXb
+ @FCC102EACXX:3:1101:1424:2248#ATCACGAT/2
+ AAGTCCTTTAAGTTACAGCCTTGCGACCATACTACACCCAGAACCCAAA
+ +
+ YJJ\`JQY\`KJ`gY[[QRYY[[`H[_ceI^e[PYO^IWOHW^eaefhh
+ @FCC102EACXX:3:1101:1623:2036#ATCACGAN/2
+ GGCCAATCCTTATTGTGTCTGGACCTGGTGAGTTTCCCCGTGTTGAGTC
+ +
+ PP\`ccQ`eY[bQQ[d`ghehaghfgdg[`gb^bd[ePbH^c_c\a_eg
+
+ Here the ":" character is used to split the two sequences. This character tells pyFastqSplitter where to split the sequences.
+ This character is ignored by pyFastqDuplicateRemover
+
+ Result:
+
+ @FCC102EACXX:3:1101:1343:2181#ATCACGAT/1##CAATAG@FCC102EACXX:3:1101:1343:2181#ATCACGAT/2
+ CAAATTAGAGTGTTCAAAGCAGGCGTATTGCTCGAAT:AGCCTTTAAGTTTCAGCCTTGCGACCATACTCCCCCCAGAACCCAAAGA
+ +
+ `efhYb][bdQQ`eeaeaYbeY^ceU__IXa[^ZYaeYJaSJ`Z`K`YbSb[[daeJRR[YeWd_I^I^ecgc]OV\bdeaegbXb
+ @FCC102EACXX:3:1101:1424:2248#ATCACGAT/1##CCAGGA@FCC102EACXX:3:1101:1424:2248#ATCACGAT/2
+ CTAACCATAAACTATGCCTACTAGGGATCCAGAGGTG:AAGTCCTTTAAGTTACAGCCTTGCGACCATACTACACCCAGAACCCAAA
+ +
+ ^_adddhJbaehbedd`dIb_^cXaRI^BBBBBBBBBYJJ\`JQY\`KJ`gY[[QRYY[[`H[_ceI^e[PYO^IWOHW^eaefhh
+ @FCC102EACXX:3:1101:1623:2036#ATCACGAN/1##CTCAGC@FCC102EACXX:3:1101:1623:2036#ATCACGAN/2
+ CAAAGTTAGGGGATCGAAGATGATCAGATACCGTCGT:GGCCAATCCTTATTGTGTCTGGACCTGGTGAGTTTCCCCGTGTTGAGTC
+ +
+ bghfc^YbgbeadggfdffeaS^ac_X^cegaGZ_efPP\`ccQ`eY[bQQ[d`ghehaghfgdg[`gb^bd[ePbH^c_c\a_eg
+
+.. _pyCRAC: http://sandergranneman.bio.ed.ac.uk/Granneman_Lab/pyCRAC_software.html
+
+------
+
+**Parameter list**
+
+Options::
+
+ -f fastq_file1 fastq_file2
+ Provide the names of two raw data files separated by a single space.
+ Make sure the first file is the data file of the forward (/1) sequencing reaction.
+
+ --file_type=FASTQ
+ Can join fasta and fastq files. Fastq is default
+
+ -o mergedfastq.fastq, --outfile=mergedfastq.fastq
+ provide the name of the output file. By default it
+ will be printed to the standard output
+
+ -c :
+ This option adds the given character (here ':') between the DNA
+ sequences so that it is much easier to split the data
+ again later on
+
+
+
+
diff -r 000000000000 -r 19b20927172d pyCRAC/pyFastqSplitter.pl
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/pyCRAC/pyFastqSplitter.pl Tue Jun 18 09:11:00 2013 -0400
@@ -0,0 +1,27 @@
+#!/usr/bin/perl -w
+# Galaxy wrapper for pyFastqSplitter.py: runs the tool with a temporary output
+# prefix (--id) and then moves the two result files onto the --o1/--o2 paths.
+use strict;
+use Getopt::Long;
+
+my %opt;
+GetOptions(\%opt, "f=s", "c=s", "o1=s", "o2=s","file_type=s", "version","id=s");
+
+my @tool = ("python", "/usr/local/bin/pyFastqSplitter.py");
+# Run a command without a shell, so a separator such as '|' is passed verbatim.
+sub run { system(@_) == 0 or die "command failed (status $?): @_\n"; }
+
+# --version only reports the tool version; there are no output files to move.
+if (exists $opt{version}){
+ run(@tool, "--version");
+ exit 0;
+}
+for my $required (qw(f o1 o2 file_type id)){
+ die "missing required option: $required\n" unless defined $opt{$required};
+}
+
+my @args = ("-f", $opt{f}, "-o", $opt{id}, "--file_type=$opt{file_type}");
+push @args, "-c", $opt{c} if defined $opt{c};
+run(@tool, @args);
+run("mv", "$opt{id}_1.$opt{file_type}", $opt{o1});
+run("mv", "$opt{id}_2.$opt{file_type}", $opt{o2});
diff -r 000000000000 -r 19b20927172d pyCRAC/pyFastqSplitter.xml
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/pyCRAC/pyFastqSplitter.xml Tue Jun 18 09:11:00 2013 -0400
@@ -0,0 +1,140 @@
+
+
+ pyCRAC
+
+
+ pyFastqSplitter.pl
+ -f $f
+ --o1 $out1
+ --id $label.value
+ --o2 $out2
+ --file_type $ftype.type
+ #if $joinc.ch == "-c":
+ -c $joinc.c
+ #end if#
+
+ /usr/local/bin/pyFastqSplitter.py --version
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+.. class:: infomark
+
+**pyFastqSplitter**
+
+pyFastqSplitter is part of the pyCRAC_ package. Splits a merged fastq file (pyFastqJoiner output) into two files.
+
+Example::
+
+ Here the ":" character was used to separate the two sequences. By using the -c flag you can tell pyFastqSplitter where to split the sequences.
+ This character is ignored by pyFastqDuplicateRemover
+
+
+ @FCC102EACXX:3:1101:1343:2181#ATCACGAT/1##CAATAG@FCC102EACXX:3:1101:1343:2181#ATCACGAT/2
+ CAAATTAGAGTGTTCAAAGCAGGCGTATTGCTCGAAT:AGCCTTTAAGTTTCAGCCTTGCGACCATACTCCCCCCAGAACCCAAAGA
+ +
+ `efhYb][bdQQ`eeaeaYbeY^ceU__IXa[^ZYaeYJaSJ`Z`K`YbSb[[daeJRR[YeWd_I^I^ecgc]OV\bdeaegbXb
+ @FCC102EACXX:3:1101:1424:2248#ATCACGAT/1##CCAGGA@FCC102EACXX:3:1101:1424:2248#ATCACGAT/2
+ CTAACCATAAACTATGCCTACTAGGGATCCAGAGGTG:AAGTCCTTTAAGTTACAGCCTTGCGACCATACTACACCCAGAACCCAAA
+ +
+ ^_adddhJbaehbedd`dIb_^cXaRI^BBBBBBBBBYJJ\`JQY\`KJ`gY[[QRYY[[`H[_ceI^e[PYO^IWOHW^eaefhh
+ @FCC102EACXX:3:1101:1623:2036#ATCACGAN/1##CTCAGC@FCC102EACXX:3:1101:1623:2036#ATCACGAN/2
+ CAAAGTTAGGGGATCGAAGATGATCAGATACCGTCGT:GGCCAATCCTTATTGTGTCTGGACCTGGTGAGTTTCCCCGTGTTGAGTC
+ +
+ bghfc^YbgbeadggfdffeaS^ac_X^cegaGZ_efPP\`ccQ`eY[bQQ[d`ghehaghfgdg[`gb^bd[ePbH^c_c\a_eg
+
+ Result:
+
+ Forward reaction:
+
+ @FCC102EACXX:3:1101:1343:2181#ATCACGAT/1##CAATAG
+ CAAATTAGAGTGTTCAAAGCAGGCGTATTGCTCGAAT
+ +
+ `efhYb][bdQQ`eeaeaYbeY^ceU__IXa[^ZYae
+ @FCC102EACXX:3:1101:1424:2248#ATCACGAT/1##CCAGGA
+ CTAACCATAAACTATGCCTACTAGGGATCCAGAGGTG
+ +
+ ^_adddhJbaehbedd`dIb_^cXaRI^BBBBBBBBB
+ @FCC102EACXX:3:1101:1623:2036#ATCACGAN/1##CTCAGC
+ CAAAGTTAGGGGATCGAAGATGATCAGATACCGTCGT
+ +
+ bghfc^YbgbeadggfdffeaS^ac_X^cegaGZ_ef
+ @FCC102EACXX:3:1101:1574:2214#ATCACGAT/1##CGTTTT
+ CTAATGACCCACTCGGCACCTTACGAAATCAAAGTCT
+ +
+ cdfgYY`cefhhZef\eaggXaceeghfQaeghWNW\
+
+ Reverse reaction:
+
+ @FCC102EACXX:3:1101:1343:2181#ATCACGAT/2
+ AGCCTTTAAGTTTCAGCCTTGCGACCATACTCCCCCCAGAACCCAAAGA
+ +
+ YJaSJ`Z`K`YbSb[[daeJRR[YeWd_I^I^ecgc]OV\bdeaegbXb
+ @FCC102EACXX:3:1101:1424:2248#ATCACGAT/2
+ AAGTCCTTTAAGTTACAGCCTTGCGACCATACTACACCCAGAACCCAAA
+ +
+ YJJ\`JQY\`KJ`gY[[QRYY[[`H[_ceI^e[PYO^IWOHW^eaefhh
+ @FCC102EACXX:3:1101:1623:2036#ATCACGAN/2
+ GGCCAATCCTTATTGTGTCTGGACCTGGTGAGTTTCCCCGTGTTGAGTC
+ +
+ PP\`ccQ`eY[bQQ[d`ghehaghfgdg[`gb^bd[ePbH^c_c\a_eg
+
+.. _pyCRAC: http://sandergranneman.bio.ed.ac.uk/Granneman_Lab/pyCRAC_software.html
+
+------
+
+**Parameter list**
+
+Options::
+
+ -f fastq_file, --filename=fastq_file
+ To provide the name of the joined data file that has
+ to be split. Default = standard input
+ --file_type=FASTQ
+ Can split joined fasta and fastq files. Fastq is default
+ If there isn't a specific character splitting the two reads
+ the tool assumes that the two reads were of equal length
+ -o splitfastq, --outfile=splitfastq
+ Provide the name of the output files (WITHOUT file
+ extension). By default the data will be printed to the
+ standard output
+ -c :, --character=:
+ If the joined sequences were separated by a specific
+ character then the program can divide the sequences by
+ looking for that character
+
+
+
diff -r 000000000000 -r 19b20927172d pyCRAC/pyGTF2bed.xml
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/pyCRAC/pyGTF2bed.xml Tue Jun 18 09:11:00 2013 -0400
@@ -0,0 +1,107 @@
+
+ converter
+
+ pyCRAC
+
+ /usr/local/bin/pyGTF2bed.py --gtf $input -o $output
+ #if $addtrack.track == "--track":
+ --track
+ --name $addtrack.name
+ --description $addtrack.description
+ #if $addtrack.colorscheme.colorsel == "default":
+ -c $addtrack.colorscheme.color
+ #else:
+ -s '$addtrack.colorscheme.plus,$addtrack.colorscheme.minus'
+ #end if#
+ #end if#
+
+ /usr/local/bin/pyGTF2bed.py --version
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+.. class:: infomark
+
+**pyGTF2bed**
+
+pyGTF2bed is part of the pyCRAC_ package. Converts GTF files to the bed 6 format. Gene names present in the GTF file will be included in the bed file.
+
+
+.. _pyCRAC: http://sandergranneman.bio.ed.ac.uk/Granneman_Lab/pyCRAC_software.html
+
+------
+
+**Parameter list**
+
+These options can be used to add and modify a track line for the UCSC genome browser::
+
+ --track
+ Use this flag to add a UCSC genome browser track line
+ to the beginning of your file
+ -n NAME, --name=NAME
+ For the UCSC track line: provide a track name. Default
+ = 'User_supplied_track'
+ -d DESCRIPTION, --description=DESCRIPTION
+ For the UCSC track line: provide a track description.
+ Default = 'User_supplied_track'
+ -c COLOR, --color=COLOR
+ select the track color. Default = black
+ -s STRANDS, --colorstrands=STRANDS
+ select the colors for each strand. Default =
+ 'red,blue'
+
+File input options::
+
+ --gtf=Yourfavoritegtf.gtf
+ type the path to the gtf file that you want to
+ convert. Default is standard input
+
+
+
diff -r 000000000000 -r 19b20927172d pyCRAC/pyGTF2bedGraph.pl
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/pyCRAC/pyGTF2bedGraph.pl Tue Jun 18 09:11:00 2013 -0400
@@ -0,0 +1,38 @@
+#!/usr/bin/perl -w
+# Galaxy wrapper for pyGTF2bedGraph.py: runs the tool with a temporary output
+# prefix and then moves the per-strand bedgraph files onto --po and --mo.
+use strict;
+use Getopt::Long;
+
+my %opt;
+GetOptions(\%opt, "gtf=s","po=s","version","mo=s","count=i","chromfile=s","t=s","iCLIP","track","name=s","description=s","color=s","s=s","id=s");
+
+my @tool = ("python", "/usr/local/bin/pyGTF2bedGraph.py");
+
+# Run a command without a shell (arguments are passed verbatim); stop on failure.
+sub run { system(@_) == 0 or die "command failed (status $?): @_\n"; }
+
+# --version only reports the tool version; there are no output files to move.
+if (exists $opt{version}){
+ run(@tool, "--version");
+ exit 0;
+}
+for my $required (qw(gtf chromfile t count po mo id)){
+ die "missing required option: $required\n" unless defined $opt{$required};
+}
+# Whitespace in the id is replaced so the prefix is a single file-name token.
+(my $prefix = "gb_$opt{id}") =~ s/\s/_/g;
+
+my @args = ("--gtf", $opt{gtf}, "--chromfile", $opt{chromfile}, "-t", $opt{t}, "--count", $opt{count}, "-o", $prefix);
+push @args, "--iCLIP" if exists $opt{iCLIP};
+if(exists $opt{track}){
+ # Track-line options are only forwarded together with --track.
+ push @args, "--track";
+ push @args, "--name", $opt{name} if defined $opt{name};
+ push @args, "--description", $opt{description} if defined $opt{description};
+ push @args, "--color", $opt{color} if exists $opt{color};
+ push @args, "-s", $opt{s} if exists $opt{s};
+}
+run(@tool, @args);
+run("mv", "${prefix}_plus_strand.bedgraph", $opt{po});
+run("mv", "${prefix}_minus_strand.bedgraph", $opt{mo});
diff -r 000000000000 -r 19b20927172d pyCRAC/pyGTF2bedGraph.xml
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/pyCRAC/pyGTF2bedGraph.xml Tue Jun 18 09:11:00 2013 -0400
@@ -0,0 +1,162 @@
+
+ converter
+
+ pyCRAC
+
+ pyGTF2bedGraph.pl --gtf $input --po $po --mo $mo
+ --chromfile $addchr.chr
+ -t $type
+ --count $count
+ $iclip
+ #if $addtrack.track == "--track":
+ --track
+ --name $addtrack.name
+ --description $addtrack.description
+ #if $addtrack.colorscheme.colorsel == "default":
+ --color $addtrack.colorscheme.color
+ #else:
+ -s '$addtrack.colorscheme.plus,$addtrack.colorscheme.minus'
+ #end if#
+ #end if#
+ --id $po.id
+
+ /usr/local/bin/pyGTF2bedGraph.py --version
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+.. class:: infomark
+
+**pyGTF2bedGraph**
+
+pyGTF2bedGraph is part of the pyCRAC_ package. Generates bedgraph files for each chromosome. An homage to bedtools genomecoverage. Takes a pyReadCounters GTF file as input file. Can also output bedGraph files for substitutions and deletions.
+
+.. _pyCRAC: http://sandergranneman.bio.ed.ac.uk/Granneman_Lab/pyCRAC_software.html
+
+------
+
+**Parameter list**
+
+File input options::
+
+ --gtf=readdata.gtf
+ type the path to the gtf data file. By default it
+ expects data from the standard input
+ -o converted
+ provide a name for an output file. A file extension or
+ strand information is not necessary.
+ -c yeast.txt, --chromfile=yeast.txt
+ Location of the chromosome info file. This file should
+ have two columns: first column is the names of the
+ chromosomes, second column is length of the
+ chromosomes. Default is yeast
+ -t TYPE, --type=TYPE
+ this tool can generate bedGraph files for reads,
+ substitutions or deletions. Please use
+ 'reads','substitutions' or 'deletions' to indicate the
+ type of data. Default='reads'
+ --count
+ Takes the numbers in the 'score' column of the GTF
+ file as the total number of reads for each position.
+ Default is 1 for each interval.
+ --iCLIP
+ This turns on the iCLIP mode and the sgr reads or cDNA
+ files will report cross-linking site frequencies in
+ iCLIP data
+ -v, --verbose
+ to print status messages to a log file
+
+These options can be used to add a track line for the UCSC genome browser::
+
+ --track
+ Use this flag to add a UCSC genome browser track line
+ to the beginning of your file
+ -n NAME, --name=NAME
+ For the UCSC track line: provide a track name. Default
+ = 'User_supplied_track'
+ -d DESCRIPTION, --description=DESCRIPTION
+ For the UCSC track line: provide a track description.
+ Default = 'User_supplied_track'
+ --color=COLOR
+ select the track color. Default = black
+ -s STRANDS, --colorstrands=STRANDS
+ select the colors for each strand. Default =
+ 'red,blue'
+
+
+
+
diff -r 000000000000 -r 19b20927172d pyCRAC/pyGetGTFSources.xml
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/pyCRAC/pyGetGTFSources.xml Tue Jun 18 09:11:00 2013 -0400
@@ -0,0 +1,63 @@
+
+
+ pyCRAC
+
+
+ /usr/local/bin/pyGetGTFSources.py --gtf $addGTF.gtf --count -o $out
+
+ /usr/local/bin/pyGetGTFSources.py --version
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+.. class:: infomark
+
+**pyGetGTFSources**
+
+pyGetGTFSources is part of the pyCRAC_ package. Extracts source names from the second column in a GTF file.
+
+
+.. _pyCRAC: http://sandergranneman.bio.ed.ac.uk/Granneman_Lab/pyCRAC_software.html
+
+------
+
+**Parameter list**
+
+Options::
+
+ --gtf=Yourfavoritegtf.gtf
+ type the path to the gtf file that you want to use. By
+ default it expects data from the standard input
+ -o OUTFILE, --outfile=OUTFILE
+ type the name and path of the file you want to write
+ the output to. Default is standard output
+ --count with this flag the program will count the
+ occurrences of each source/annotation in the gtf file
+
+
+
+
+
+
diff -r 000000000000 -r 19b20927172d pyCRAC/pyGetGeneNamesFromGTF.xml
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/pyCRAC/pyGetGeneNamesFromGTF.xml Tue Jun 18 09:11:00 2013 -0400
@@ -0,0 +1,71 @@
+
+
+ pyCRAC
+
+
+ /usr/local/bin/pyGetGeneNamesFromGTF.py --gtf $addGTF.gtf --attribute $attribute $count -o $out
+
+ /usr/local/bin/pyGetGeneNamesFromGTF.py --version
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+.. class:: infomark
+
+**pyGetGeneNamesFromGTF**
+
+pyGetGeneNamesFromGTF is part of the pyCRAC_ package. Extracts and counts all gene names from a GTF file.
+
+.. _pyCRAC: http://sandergranneman.bio.ed.ac.uk/Granneman_Lab/pyCRAC_software.html
+
+------
+
+**Parameter list**
+
+Options::
+
+ --gtf=Yourfavoritegtf.gtf
+ type the path to the gtf file that you want to use. By
+ default it expects data from the standard input.
+ -o OUTFILE, --outfile=OUTFILE
+ type the name and path of the file you want to write
+ the output to. Default is standard output
+ -a ATTRIBUTE, --attribute=ATTRIBUTE
+ from which attribute do you want to extract names?
+ Choices: gene_name, gene_id, transcript_name,
+ transcript_id
+ --count
+ with this flag the program will count the
+ occurrences of each source/annotation in the gtf file
+
+
+
+
diff -r 000000000000 -r 19b20927172d pyCRAC/pyMotif.pl
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/pyCRAC/pyMotif.pl Tue Jun 18 09:11:00 2013 -0400
@@ -0,0 +1,41 @@
+#!/usr/bin/perl -w
+# Galaxy wrapper for pyMotif.py: runs the tool with a temporary output prefix
+# and then moves its four result files onto the requested dataset paths.
+use strict;
+use Getopt::Long;
+
+my %opt;
+GetOptions(\%opt, "f=s","version","gtf=s","range=i","overlap=i","--annotation=s", "--tab=s","--k_min=i","--k_max=i","--numberofkmers=i","--count=s","--features=s","--zscores=s","--random=s","options","o=s","id=s");
+
+my @tool = ("python", "/usr/local/bin/pyMotif.py");
+
+# Run a command without a shell (arguments are passed verbatim); stop on failure.
+sub run { system(@_) == 0 or die "command failed (status $?): @_\n"; }
+
+# --version only reports the tool version; there are no output files to move.
+if (exists $opt{version}){
+ run(@tool, "--version");
+ exit 0;
+}
+for my $required (qw(f gtf tab annotation id count features zscores random)){
+ die "missing required option: $required\n" unless defined $opt{$required};
+}
+
+# -o is accepted for compatibility, but the output prefix is built from --id.
+my $prefix = "m_$opt{id}";
+
+my @args = ("-f", $opt{f}, "--gtf", $opt{gtf}, "--tab", $opt{tab}, "--annotation", $opt{annotation}, "-o", $prefix);
+if(exists $opt{options}){
+ # Tuning values are only forwarded when --options was given.
+ for my $name (qw(range overlap k_min k_max numberofkmers)){
+  push @args, "--$name=$opt{$name}" if defined $opt{$name};
+ }
+}
+run(@tool, @args);
+
+# Result files are named <prefix>_<annotation>_<suffix>.
+my $stem = "${prefix}_$opt{annotation}";
+run("mv", "${stem}_data_k-mers_count.txt", $opt{count});
+run("mv", "${stem}_top_k-mers_in_features.gtf", $opt{features});
+run("mv", "${stem}_k-mer_Z_scores.txt", $opt{zscores});
+run("mv", "${stem}_random_k-mers_count.txt", $opt{random});
diff -r 000000000000 -r 19b20927172d pyCRAC/pyMotif.xml
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/pyCRAC/pyMotif.xml Tue Jun 18 09:11:00 2013 -0400
@@ -0,0 +1,222 @@
+
+
+ pyCRAC
+
+
+ pyMotif.pl
+ -f $input
+ --gtf=$addGTF.gtf
+
+ #if $addGTF.gtfFile == "default" and $addGTF.annotate.annotations == "auto":
+ --annotation $addGTF.annotate.scan.annotation
+ #else:
+ --annotation $addGTF.annotate.annotation
+ #end if#
+
+ --tab=$addTab.tab
+
+ #if $addOpt.options == "edit":
+ --options
+ --k_min $addOpt.kmin
+ --k_max $addOpt.kmax
+ --numberofkmers=$addOpt.numberofkmers
+ --overlap $addOpt.overlap
+ --range $addOpt.range
+ #end if#
+ -o "$input.name"
+ --id $count.id
+ --count $count
+ --random $random
+ --features $features
+ --zscores $zscores
+
+ /usr/local/bin/pyMotif.py --version
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+.. class:: infomark
+
+**pyMotif**
+
+pyMotif is part of the pyCRAC_ package. Looks for enriched sequence motifs in high-throughput sequencing data. Produces a GTF type output file
+with coordinates and Z-scores for enriched motifs. The GTF file can be visualised in genome browsers.
+
+.. _pyCRAC: http://sandergranneman.bio.ed.ac.uk/Granneman_Lab/pyCRAC_software.html
+
+------
+
+**Parameter list**
+
+File input options::
+
+ -f intervals.gtf, --input_file=intervals.gtf
+ Provide the path to an interval gtf file. By default
+ it expects data from the standard input.
+ -o OUTPUT_FILE, --output_file=OUTPUT_FILE
+ Use this flag to override the standard file names. Do
+ NOT add an extension.
+ --gtf=annotation_file.gtf
+ type the path to the gtf annotation file that you want
+ to use
+ --tab=tab_file.tab
+ type the path to the tab file that contains the
+ genomic reference sequence
+
+pyMotif specific options::
+
+ --k_min=4
+ this option allows you to set the shortest k-mer
+ length. Default = 4.
+ --k_max=6
+ this option allows you to set the longest k-mer
+ length. Default = 8.
+ -n 100, --numberofkmers=100
+ choose the maximum number of enriched k-mer sequences
+ you want to have reported in output files. Default =
+ 1000
+
+pyCRAC common options::
+
+ -a protein_coding, --annotation=protein_coding
+ select which annotation (i.e. protein_coding, ncRNA,
+ sRNA, rRNA,snoRNA,snRNA, depending on the source of
+ your GTF file) you would like to focus your search on.
+ Default = all annotations
+ -r 100, --range=100
+ allows you to add regions flanking the genomic
+ feature. If you set '-r 50' or '--range=50', then the
+ program will add 50 nucleotides to each feature on
+ each side regardless of whether the GTF file has genes
+ with annotated UTRs.
+ --overlap=1
+ sets the number of nucleotides a motif has to overlap
+ with a genomic feature before it is considered a hit.
+ Default = 1 nucleotide
+
+
+
+
+
+
diff -r 000000000000 -r 19b20927172d pyCRAC/pyPileup.xml
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/pyCRAC/pyPileup.xml Tue Jun 18 09:11:00 2013 -0400
@@ -0,0 +1,384 @@
+
+
+
+ pyCRAC
+
+
+ /usr/local/bin/pyPileup.py
+ -f $ftype.input
+ --file_type $ftype.file_type
+ ## Restrict the pileup to a gene list (-g) or to regions listed in a tab-delimited file (--chr).
+ #if $geneOpt.alignGene == "gene":
+ -g $geneOpt.genes
+ #end if#
+ #if $geneOpt.alignGene == "chr":
+ --chr $geneOpt.chr
+ #end if#
+ ## --discarded is emitted exactly once, here, and only for novo and sam input.
+ #if ($ftype.file_type == "novo" or $ftype.file_type == "sam") and $ftype.disc.discard == "discard":
+ --discarded $discarded
+ #end if#
+ --gtf=$addGTF.gtf
+ --tab=$addTab.tab
+ ## Alignment filters are passed only for novo and sam input, and only when the user chose to edit them.
+ #if ($ftype.file_type == "novo" or $ftype.file_type == "sam") and $ftype.addAlignOpt.alignoptions == "edit":
+ --align_quality=$ftype.addAlignOpt.align_quality
+ --align_score=$ftype.addAlignOpt.align_score
+ --distance=$ftype.addAlignOpt.d
+ --length=$ftype.addAlignOpt.length
+ #if int($ftype.addAlignOpt.max) > 0:
+ --max=$ftype.addAlignOpt.max
+ #end if#
+ $ftype.addAlignOpt.unique
+ $ftype.addAlignOpt.blocks
+ $ftype.addAlignOpt.mutations
+ #end if#
+ #if $addOpt.options == "edit":
+ --range=$addOpt.range
+ --overlap=$addOpt.overlap
+ $addOpt.iclip
+ $addOpt.ignore
+ -s $addOpt.sequence
+ #if int($addOpt.limit) > 0:
+ --limit=$addOpt.limit
+ #end if#
+ #end if#
+ -o $output
+
+ /usr/local/bin/pyPileup.py --version
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ (ftype['file_type'] == "novo" or ftype['file_type'] == "sam") and ftype['disc']['discard'] == "discard"
+
+
+
+
+
+.. class:: infomark
+
+**pyPileup**
+
+pyPileup is part of the pyCRAC_ package. Produces pileups containing the number of hits, substitutions and deletions for each nucleotide covered by
+reads in specific genes or genomic regions.
+
+.. _pyCRAC: http://sandergranneman.bio.ed.ac.uk/Granneman_Lab/pyCRAC_software.html
+
+------
+
+**Parameter list**
+
+File input options::
+
+ -f FILE, --input_file=FILE
+ As input files you can use Novoalign native output,
+ SAM, pyMotif or pyReadCounters GTF files as input
+ file. By default it expects data from the standard
+ input. Make sure to specify the file type of the file
+ you want to have analyzed using the --file_type
+ option!
+ -o OUTPUT_FILE, --output_file=OUTPUT_FILE
+ Use this flag to override the standard output file
+ names. All pileups will be written to one output file.
+ -g FILE, --genes_file=FILE
+ here you need to type in the name of your gene list
+ file (1 column) or the hittable file
+ --chr=FILE
+ if you simply would like to align reads against a
+ genomic sequence you should generate a tab delimited
+ file containing an identifier, chromosome name, start
+ position, end position and strand
+ --gtf=annotation_file.gtf
+ type the path to the gtf annotation file that you want
+ to use
+ --tab=tab_file.tab
+ type the path to the tab file that contains the
+ genomic reference sequence
+ --file_type=FILE_TYPE
+ use this option to specify the file type (i.e. 'novo',
+ 'sam', 'gtf'). This will tell the program which
+ parsers to use for processing the files. Default =
+ 'novo'
+
+pyPileup specific options::
+
+ --limit=500
+ with this option you can select how many reads mapped
+ to a particular gene/ORF/region you want to count.
+ Default = All
+ --iCLIP
+ This turns on the iCLIP mode and the pileups will
+ report cross-linking site frequencies in iCLIP data in
+ reference sequences
+
+Common options::
+
+ -v, --verbose
+ prints all the status messages to a file rather than
+ the standard output
+ --ignorestrand
+ this flag tells the program to ignore strand
+ information and all overlapping reads will be considered
+ sense reads. Useful for analysing ChIP or RIP data
+ --zip=FILE
+ use this option to compress all the output files in a
+ single zip file
+ --overlap=1
+ sets the number of nucleotides a read has to overlap
+ with a gene before it is considered a hit. Default =
+ 1 nucleotide
+ -s genomic, --sequence=genomic
+ with this option you can select whether you want the
+ reads aligned to the genomic or the coding sequence.
+ Default = genomic
+ -r 100, --range=100
+ allows you to set the length of the UTR regions. If
+ you set '-r 50' or '--range=50', then the program will
+ set a fixed length (50 bp) regardless of whether the
+ GTF file has genes with annotated UTRs.
+
+Options for novo, SAM and BAM files::
+
+ --align_quality=100, --mapping_quality=100
+ with these options you can set the alignment quality
+ (Novoalign) or mapping quality (SAM) threshold. Reads
+ with qualities lower than the threshold will be
+ ignored. Default = 0
+ --align_score=100
+ with this option you can set the alignment score
+ threshold. Reads with alignment scores lower than the
+ threshold will be ignored. Default = 0
+ -l 100, --length=100
+ to set read length threshold. Default = 1000
+ -m 100000, --max=100000
+ maximum number of mapped reads that will be analyzed.
+ Default = All
+ --unique
+ with this option reads with multiple alignment
+ locations will be removed. Default = Off
+ --blocks
+ with this option reads with the same start and end
+ coordinates on a chromosome will only be counted once.
+ Default = Off
+ --discarded=FILE
+ prints the lines from the alignments file that were
+ discarded by the parsers. This file contains reads
+ that were unmapped (NM), of poor quality (i.e. QC) or
+ paired reads that were mapped to different chromosomal
+ locations or were too far apart on the same
+ chromosome. Useful for debugging purposes
+ -d 1000, --distance=1000
+ this option allows you to set the maximum number of
+ base-pairs allowed between two non-overlapping paired
+ reads. Default = 1000
+ --mutations=delsonly
+ Use this option to only track mutations that are of
+ interest. For CRAC data this is usually deletions
+ (--mutations=delsonly). For PAR-CLIP data this is
+ usually T-C mutations (--mutations=TC). Other options
+ are: do not report any mutations: --mutations=nomuts.
+ Only report specific base mutations, for example only
+ in T's, C's and G's :--mutations=[TCG]. The brackets
+ are essential. Other nucleotide combinations are also
+ possible
+
+
+
diff -r 000000000000 -r 19b20927172d pyCRAC/pyReadAligner.xml
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/pyCRAC/pyReadAligner.xml Tue Jun 18 09:11:00 2013 -0400
@@ -0,0 +1,368 @@
+
+
+ pyCRAC
+
+
+ /usr/local/bin/pyReadAligner.py
+ -f $ftype.input ## alignment input file
+ --file_type $ftype.file_type ## selects the parser used for the input file
+ #if $geneOpt.alignGene == "gene":
+ -g $geneOpt.genes ## gene list (or hittable) file
+ #end if#
+ #if $geneOpt.alignGene == "chr":
+ --chr $geneOpt.chr ## tab-delimited file describing genomic regions
+ #end if#
+ #if ($ftype.file_type == "novo" or $ftype.file_type == "sam") and $ftype.disc.discard == "discard":
+ --discarded $discarded ## file receiving the lines discarded by the parsers
+ #end if#
+ --gtf=$addGTF.gtf
+ --tab=$addTab.tab
+ #if ($ftype.file_type == "novo" or $ftype.file_type == "sam") and $ftype.addAlignOpt.alignoptions == "edit":
+ --align_quality=$ftype.addAlignOpt.align_quality
+ --align_score=$ftype.addAlignOpt.align_score
+ --distance=$ftype.addAlignOpt.d
+ --length=$ftype.addAlignOpt.length
+ #if int($ftype.addAlignOpt.max) > 0:
+ --max=$ftype.addAlignOpt.max ## only passed when positive; otherwise the tool default (All) applies
+ #end if#
+ $ftype.addAlignOpt.unique
+ $ftype.addAlignOpt.blocks
+ $ftype.addAlignOpt.mutations
+ #end if#
+ #if $addOpt.options == "edit":
+ --range=$addOpt.range
+ --overlap=$addOpt.overlap
+ $addOpt.ignore
+ -s $addOpt.sequence
+ #if int($addOpt.limit) > 0:
+ --limit=$addOpt.limit ## only passed when positive; otherwise the tool default (All) applies
+ #end if#
+ #end if#
+ -o $output
+
+ /usr/local/bin/pyReadAligner.py --version
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ (ftype['file_type'] == "novo" or ftype['file_type'] == "sam") and ftype['disc']['discard'] == "discard"
+
+
+
+
+
+.. class:: infomark
+
+**pyReadAligner**
+
+pyReadAligner is part of the pyCRAC_ package. Generates multiple sequence alignments for reads mapped to individual genes or genomic regions.
+Produces a fasta output file.
+
+
+.. _pyCRAC: http://sandergranneman.bio.ed.ac.uk/Granneman_Lab/pyCRAC_software.html
+
+------
+
+**Parameter list**
+
+File input options::
+
+ -f FILE, --input_file=FILE
+ As input files you can use Novoalign native output or
+ SAM files as input file. By default it expects data
+ from the standard input. Make sure to specify the file
+ type of the file you want to have analyzed using the
+ --file_type option!
+ -o OUTPUT_FILE, --output_file=OUTPUT_FILE
+ Use this flag to override the standard output file
+ names. All alignments will be written to one output
+ file.
+ -g FILE, --genes_file=FILE
+ here you need to type in the name of your gene list
+ file (1 column) or the hittable file
+ --chr=FILE
+ if you simply would like to align reads against a
+ genomic sequence you should generate a tab delimited
+ file containing an identifier, chromosome name, start
+ position, end position and strand
+ --gtf=annotation_file.gtf
+ type the path to the gtf annotation file that you want
+ to use
+ --tab=tab_file.tab
+ type the path to the tab file that contains the
+ genomic reference sequence
+ --file_type=FILE_TYPE
+ use this option to specify the file type (i.e. 'novo',
+ 'sam', 'gtf'). This will tell the program which
+ parsers to use for processing the files. Default =
+ 'novo'
+
+pyReadAligner specific options::
+
+ --limit=500
+ with this option you can select how many reads mapped
+ to a particular gene/ORF/region you want to count.
+ Default = All
+
+Common options::
+
+ --ignorestrand
+ this flag tells the program to ignore strand
+ information and all overlapping reads will be considered
+ sense reads. Useful for analysing ChIP or RIP data
+ --overlap=1
+ sets the number of nucleotides a read has to overlap
+ with a gene before it is considered a hit. Default =
+ 1 nucleotide
+ -s genomic, --sequence=genomic
+ with this option you can select whether you want the
+ reads aligned to the genomic or the coding sequence.
+ Default = genomic
+ -r 100, --range=100
+ allows you to set the length of the UTR regions. If
+ you set '-r 50' or '--range=50', then the program will
+ set a fixed length (50 bp) regardless of whether the
+ GTF file has genes with annotated UTRs.
+
+Options for novo, SAM and BAM files::
+
+ --align_quality=100, --mapping_quality=100
+ with these options you can set the alignment quality
+ (Novoalign) or mapping quality (SAM) threshold. Reads
+ with qualities lower than the threshold will be
+ ignored. Default = 0
+ --align_score=100
+ with this option you can set the alignment score
+ threshold. Reads with alignment scores lower than the
+ threshold will be ignored. Default = 0
+ -l 100, --length=100
+ to set read length threshold. Default = 1000
+ -m 100000, --max=100000
+ maximum number of mapped reads that will be analyzed.
+ Default = All
+ --unique
+ with this option reads with multiple alignment
+ locations will be removed. Default = Off
+ --blocks
+ with this option reads with the same start and end
+ coordinates on a chromosome will only be counted once.
+ Default = Off
+ --discarded=FILE
+ prints the lines from the alignments file that were
+ discarded by the parsers. This file contains reads
+ that were unmapped (NM), of poor quality (i.e. QC) or
+ paired reads that were mapped to different chromosomal
+ locations or were too far apart on the same
+ chromosome. Useful for debugging purposes
+ -d 1000, --distance=1000
+ this option allows you to set the maximum number of
+ base-pairs allowed between two non-overlapping paired
+ reads. Default = 1000
+ --mutations=delsonly
+ Use this option to only track mutations that are of
+ interest. For CRAC data this is usually deletions
+ (--mutations=delsonly). For PAR-CLIP data this is
+ usually T-C mutations (--mutations=TC). Other options
+ are: do not report any mutations: --mutations=nomuts.
+ Only report specific base mutations, for example only
+ in T's, C's and G's :--mutations=[TCG]. The brackets
+ are essential. Other nucleotide combinations are also
+ possible
+
+
+
+
diff -r 000000000000 -r 19b20927172d pyCRAC/pyReadCounters.pl
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/pyCRAC/pyReadCounters.pl Tue Jun 18 09:11:00 2013 -0400
@@ -0,0 +1,60 @@
+#!/usr/bin/perl -w
+# Galaxy wrapper around pyReadCounters.py: builds the command line, runs it and
+# moves the "rc_<id>_*" result files to the dataset paths supplied by Galaxy.
+use strict;
+use Getopt::Long;
+use File::Copy qw(move);
+
+my %opt;
+GetOptions(\%opt, "f=s","file_type=s","version","gtf=s","align_quality=i","align_score=i","range=i","length=i","max=i","distance=i","ignorestrand","overlap=i","unique","blocks","mutations=s","countoutput=s","stats=s","hittable=s","intronUTRoverlap=s","discarded=s","options","alignOpt","id=s");
+
+my @python = ("python", "/usr/local/bin/pyReadCounters.py");
+
+# --version produces no result files, so report the version and stop here.
+if (exists $opt{version}) {
+    exit(system(@python, "--version") == 0 ? 0 : 1);
+}
+
+for my $name (qw(f file_type gtf id)) {
+    die "pyReadCounters.pl: missing required option --$name\n" unless defined $opt{$name};
+}
+
+my $prefix = "rc_$opt{id}";
+my $kind = "reads";    # becomes "cDNAs" when --blocks is forwarded; part of every result file name
+
+# Arguments are kept as a list so that paths and values such as [TCG] never reach a shell.
+my @cmnd = (@python, "-f", $opt{f}, "--file_type", $opt{file_type}, "--gtf", $opt{gtf}, "-o", $prefix);
+
+if (exists $opt{options}) {
+    push @cmnd, "--range=$opt{range}", "--overlap=$opt{overlap}";
+    push @cmnd, "--ignorestrand" if exists $opt{ignorestrand};
+}
+
+if (exists $opt{alignOpt}) {
+    push @cmnd, "--align_quality=$opt{align_quality}", "--align_score=$opt{align_score}";
+    push @cmnd, "--length=$opt{length}", "--distance=$opt{distance}";
+    push @cmnd, "--max=$opt{max}" if exists $opt{max};
+    push @cmnd, "--unique" if exists $opt{unique};
+    push @cmnd, "--mutations=$opt{mutations}" if exists $opt{mutations};
+    if (exists $opt{blocks}) {
+        push @cmnd, "--blocks";
+        $kind = "cDNAs";
+    }
+}
+
+# The tool XML requests --discarded independently of --alignOpt, so forward it unconditionally.
+push @cmnd, "--discarded=$opt{discarded}" if exists $opt{discarded};
+
+my $failed = system(@cmnd) != 0;
+warn "pyReadCounters.pl: pyReadCounters.py failed (wait status $?)\n" if $failed;
+print STDOUT "@cmnd\n";
+
+my %result = ("hittable_$kind.txt" => $opt{hittable}, "file_statistics_$kind.txt" => $opt{stats},
+              "intron_and_UTR_overlap_$kind.gtf" => $opt{intronUTRoverlap});
+$result{"count_output_$kind.gtf"} = $opt{countoutput} if $opt{file_type} ne "gtf";
+for my $name (sort keys %result) {
+    next if defined $result{$name} && move("${prefix}_$name", $result{$name});
+    warn "pyReadCounters.pl: could not move ${prefix}_$name to its dataset\n";
+    $failed = 1;
+}
+exit($failed ? 1 : 0);
diff -r 000000000000 -r 19b20927172d pyCRAC/pyReadCounters.xml
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/pyCRAC/pyReadCounters.xml Tue Jun 18 09:11:00 2013 -0400
@@ -0,0 +1,359 @@
+
+
+ pyCRAC
+
+
+ pyReadCounters.pl
+ -f $ftype.input
+ --file_type $ftype.file_type
+ --gtf $addGTF.gtf
+ #if ($ftype.file_type == "novo" or $ftype.file_type == "sam") and $ftype.disc.discard == "discard":
+ --discarded $discarded
+ #end if#
+ #if ($ftype.file_type == "novo" or $ftype.file_type == "sam") and $ftype.addAlignOpt.alignoptions == "edit":
+ --alignOpt ## wrapper-only switch: pyReadCounters.pl forwards the alignment filters only when it is present
+ --align_quality $ftype.addAlignOpt.align_quality
+ --align_score $ftype.addAlignOpt.align_score
+ #if int($ftype.addAlignOpt.max) > 0:
+ --max $ftype.addAlignOpt.max
+ #end if#
+ --distance $ftype.addAlignOpt.d
+ --length $ftype.addAlignOpt.length
+ $ftype.addAlignOpt.unique
+ $ftype.addAlignOpt.blocks
+ $ftype.addAlignOpt.mutations
+ #end if#
+ #if $addOpt.options == "edit":
+ --options ## wrapper-only switch: pyReadCounters.pl forwards --range/--overlap/--ignorestrand only when it is present
+ --range $addOpt.range
+ $addOpt.ignore
+ --overlap $addOpt.overlap
+ #end if#
+ ## Dataset paths to which the wrapper moves the result files written by pyReadCounters.py.
+ --stats $stats
+ --hittable $hittable
+ --intronUTRoverlap $intronUTRoverlap
+ ## A count_output GTF is only collected for novo and sam input.
+ #if $ftype.file_type == "novo" or $ftype.file_type == "sam":
+ --countoutput $countoutput
+ #end if#
+ ## The wrapper names its intermediate result files with the prefix rc_<id>.
+ --id $stats.id
+
+ /usr/local/bin/pyReadCounters.py --version
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ ftype['file_type'] == "novo" or ftype['file_type'] == "sam"
+
+
+ (ftype['file_type'] == "novo" or ftype['file_type'] == "sam") and ftype['disc']['discard'] == "discard"
+
+
+
+
+.. class:: infomark
+
+**pyReadCounters**
+
+pyReadCounters is part of the pyCRAC_ package. Produces a gene hittable file, two GTF output files showing to which genomic features the reads overlap.
+Finally the tool produces a read statistics file that provides information about the complexity of your dataset.
+
+**Output file examples**
+
+A hittable file::
+
+ # generated by pyReadCounters version 1.1.0, Mon Apr 16 20:34:22 2012
+ # /usr/local/bin/pyReadCounters.py -f RNAseq_data.novo -c 1 --unique
+ # total number of reads 12534556
+ # total number of paired reads 10947376
+ # total number of single reads 483095
+ # total number of mapped reads: 11430471
+ # total number of overlapping genomic features 7019550
+ # sense 5960669
+ # anti-sense 1058881
+ # feature sense_overlap anti-sense_overlap number of reads
+
+ ## protein_coding 3190701
+ YEF3 49930 3629 24221
+ PMA1 32621 2650 21776
+ COX1 24559 1037 15174
+ TFP1 21539 1689 13506
+ HSC82 21177 1458 12729
+ ADH1 20245 1467 11351
+ AI5_ALPHA 20022 918 13101
+ AI4 19390 886 12638
+ AI3 17823 798 11473
+ AI2 17590 790 11297
+ RPL10 16822 1113 8797
+ ENO2 16336 1125 8913
+ TEF1 15578 1333 5450
+
+An example of a GTF 'count_output' file::
+
+ ##gff-version 2
+ # generated by Counters version 1.2.0, Tue Jan 8 22:47:29 2013
+ # pyReadCounters.py -f PAR_CLIP_unique.novo --mutations=TC -v
+ # total number of reads: 2455251
+ # total number of paired reads: 0
+ # total number of single reads: 2455251
+ # total number of mapped reads: 2455251
+ # total number of overlapping genomic features: 5153943
+ # sense: 2640600
+ # anti-sense: 2513343
+ chrXIV reads exon 661572 661605 2 + . gene_id "INT_0_6716,YNR016C"; gene_name "INT_0_6716,ACC1"; # 661596S;
+ chrXIV reads exon 661720 661738 1 + . gene_id "INT_0_6716,YNR016C"; gene_name "INT_0_6716,ACC1"; # 661726S;
+ chrXIV reads exon 661839 661878 4 + . gene_id "INT_0_6716,YNR016C"; gene_name "INT_0_6716,ACC1"; # 661875S;
+
+This output file also reports whether a read contains a mutation.
+
+For example::
+
+ # 661596S
+
+Indicates that the read had a nucleotide substitution ("S") at genomic coordinate 661596. The chromosome name can be found in the first column.
+
+.. _pyCRAC: http://sandergranneman.bio.ed.ac.uk/Granneman_Lab/pyCRAC_software.html
+
+------
+
+**Parameter list**
+
+File input options::
+
+ -f FILE, --input_file=FILE
+ provide the path to your novo, SAM/BAM or gtf data
+ file. Default is standard input. Make sure to specify
+ the file type of the file you want to have analyzed
+ using the --file_type option!
+ -o OUTPUT_FILE, --output_file=OUTPUT_FILE
+ Use this flag to override the standard file names. Do
+ NOT add an extension.
+ --file_type=FILE_TYPE
+ use this option to specify the file type (i.e.
+ 'novo','sam' or 'gtf'). This will tell the program
+ which parsers to use for processing the files. Default
+ = 'novo'
+ --gtf=annotation_file.gtf
+ type the path to the gtf annotation file that you want
+ to use
+
+Common pyCRAC options::
+
+ --ignorestrand
+ To ignore strand information and all reads overlapping
+ with genomic features will be considered sense reads.
+ Useful for analysing ChIP or RIP data
+ --overlap=1
+ sets the number of nucleotides a read has to overlap
+ with a gene before it is considered a hit. Default =
+ 1 nucleotide
+ -r 100, --range=100
+ allows you to add regions flanking the genomic
+ feature. If you set '-r 50' or '--range=50', then the
+ program will add 50 nucleotides to each feature on
+ each side regardless of whether the GTF file has genes
+ with annotated UTRs
+
+Options for SAM/BAM and Novo files::
+
+ --mutations=delsonly
+ Use this option to only track mutations that are of
+ interest. For CRAC data this is usually deletions
+ (--mutations=delsonly). For PAR-CLIP data this is
+ usually T-C mutations (--mutations=TC). Other options
+ are: do not report any mutations: --mutations=nomuts.
+ Only report specific base mutations, for example only
+ in T's, C's and G's :--mutations=[TCG]. The brackets
+ are essential. Other nucleotide combinations are also
+ possible
+ --align_quality=100, --mapping_quality=100
+ with these options you can set the alignment quality
+ (Novoalign) or mapping quality (SAM) threshold. Reads
+ with qualities lower than the threshold will be
+ ignored. Default = 0
+ --align_score=100
+ with this option you can set the alignment score
+ threshold. Reads with alignment scores lower than the
+ threshold will be ignored. Default = 0
+ --unique
+ with this option reads with multiple alignment
+ locations will be removed. Default = Off
+ --blocks
+ with this option reads with the same start and end
+ coordinates on a chromosome will be counted as one
+ cDNA. Default = Off
+ -m 100000, --max=100000
+ maximum number of mapped reads that will be analyzed.
+ Default = All
+ -d 1000, --distance=1000
+ this option allows you to set the maximum number of
+ base-pairs allowed between two non-overlapping paired
+ reads. Default = 1000
+ --discarded=FILE
+ prints the lines from the alignments file that were
+ discarded by the parsers. This file contains reads
+ that were unmapped (NM), of poor quality (i.e. QC) or
+ paired reads that were mapped to different chromosomal
+ locations or were too far apart on the same
+ chromosome. Useful for debugging purposes
+ -l 100, --length=100
+ to set read length threshold. Default = 1000
+
+
+
diff -r 000000000000 -r 19b20927172d pyCRAC/pySelectMotifsFromGTF.xml
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/pyCRAC/pySelectMotifsFromGTF.xml Tue Jun 18 09:11:00 2013 -0400
@@ -0,0 +1,75 @@
+
+
+ pyCRAC
+
+
+ /usr/local/bin/pySelectMotifsFromGTF.py
+ --gtf $input ## GTF file to search
+ -m $motif ## motif to extract from the GTF file
+ -o $out ## output file
+ -l $length ## k-mer length
+ -z $zscore ## minimum k-mer Z-score
+
+ /usr/local/bin/pySelectMotifsFromGTF.py --version
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+.. class:: infomark
+
+**pySelectMotifsFromGTF**
+
+pySelectMotifsFromGTF is part of the pyCRAC_ package. Extracts your favourite k-mer sequence from pyMotif GTF output files.
+Note that you can include degenerate nucleotides in your motif string::
+
+ N = A, G, C or T
+ R = A or G = puRine
+ Y = C or T = pYrimidine
+ M = A or C = aMino
+ S = G or C
+ W = A or T
+ K = G or T = Keto
+ V = A, C or G = Not T (letter after)
+ D = A, G or T = Not C
+ H = A, C or T = Not G
+ B = C, G or T = Not A
+
+So if you enter KBCTTG as search string and length=6, then the program will extract a large number of six-mers from your data.
+If you set length = 8, it will look for this pattern in a stretch of 8 nucleotides.
+
+.. _pyCRAC: http://sandergranneman.bio.ed.ac.uk/Granneman_Lab/pyCRAC_software.html
+
+------
+
+**Parameter list**
+
+Options::
+
+ --gtf=Yourfavoritegtf.gtf
+ type the path to the gtf file that you want to use. By
+ default it expects data from the standard input
+ -o FILE, --output=FILE
+ Optional. Specify the name of the output file. Default
+ is standard output. Make sure it has the .gtf
+ extension!
+ -m KBCTTG, --motif=KBCTTG
+ Specify the motif you want to extract from the GTF file.
+ -z 15.0, --Z_score=15.0
+ Set a minimum k-mer Z-score. Default=0
+ -l 4, --length=4
+ Set a k-mer length. Default is no length filtering
+
+
+
diff -r 000000000000 -r 19b20927172d pyCRAC/pycrac.chr.loc.sample
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/pyCRAC/pycrac.chr.loc.sample Tue Jun 18 09:11:00 2013 -0400
@@ -0,0 +1,1 @@
+s.cerevisiae EF2 /usr/local/pyCRAC/db/Saccharomyces_cerevisiae.EF2.59.1.0_chr_lengths.txt
diff -r 000000000000 -r 19b20927172d pyCRAC/pycrac.fasta.loc.sample
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/pyCRAC/pycrac.fasta.loc.sample Tue Jun 18 09:11:00 2013 -0400
@@ -0,0 +1,1 @@
+s.cerevisiae EF2 /usr/local/pyCRAC/db/Saccharomyces_cerevisiae.EF2.59.1.0.fa
diff -r 000000000000 -r 19b20927172d pyCRAC/pycrac.gtf.loc.sample
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/pyCRAC/pycrac.gtf.loc.sample Tue Jun 18 09:11:00 2013 -0400
@@ -0,0 +1,1 @@
+s.cerevisiae EF2 /usr/local/pyCRAC/db/Saccharomyces_cerevisiae.EF2.59.1.2.gtf
diff -r 000000000000 -r 19b20927172d pyCRAC/pycrac.tab.loc.sample
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/pyCRAC/pycrac.tab.loc.sample Tue Jun 18 09:11:00 2013 -0400
@@ -0,0 +1,1 @@
+s.cerevisiae EF2 /usr/local/pyCRAC/db/Saccharomyces_cerevisiae.EF2.59.1.0.fa.tab
diff -r 000000000000 -r 19b20927172d pyCRAC/tool_data_table_conf.xml.sample
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/pyCRAC/tool_data_table_conf.xml.sample Tue Jun 18 09:11:00 2013 -0400
@@ -0,0 +1,23 @@
+
+
+
+