# HG changeset patch # User fcaramia # Date 1371700988 14400 # Node ID a1034918ab9be3951af2e3297c91864e91f2b88e Uploaded diff -r 000000000000 -r a1034918ab9b all_fasta.loc.sample --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/all_fasta.loc.sample Thu Jun 20 00:03:08 2013 -0400 @@ -0,0 +1,18 @@ +#This file lists the locations and dbkeys of all the fasta files +#under the "genome" directory (a directory that contains a directory +#for each build). The script extract_fasta.py will generate the file +#all_fasta.loc. This file has the format (white space characters are +#TAB characters): +# +# +# +#So, all_fasta.loc could look something like this: +# +#apiMel3 apiMel3 Honeybee (Apis mellifera): apiMel3 /path/to/genome/apiMel3/apiMel3.fa +#hg19canon hg19 Human (Homo sapiens): hg19 Canonical /path/to/genome/hg19/hg19canon.fa +#hg19full hg19 Human (Homo sapiens): hg19 Full /path/to/genome/hg19/hg19full.fa +# +#Your all_fasta.loc file should contain an entry for each individual +#fasta file. So there will be multiple fasta files for each build, +#such as with hg19 above. 
+# diff -r 000000000000 -r a1034918ab9b joint_snv_mix.pl --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/joint_snv_mix.pl Thu Jun 20 00:03:08 2013 -0400 @@ -0,0 +1,75 @@

use strict;
use warnings;
use File::Basename;
use Cwd;
use File::Path qw(make_path remove_tree);

# Galaxy wrapper for JointSNVMix.
# Parses KEY::VALUE command-line arguments, symlinks and indexes the
# normal/tumour BAM files in the job working directory, then runs jsm.py.
die qq(
Bad number of inputs

) if (!@ARGV);

my $bam_normal;
my $bam_tumor;
my $player_options = "";
my $output;
my $action;
my $ref_genome;

# Every argument has the form KEY::VALUE; OPTION may repeat and accumulates
# into a single space-separated option string passed through to jsm.py.
foreach my $input (@ARGV) {
    my ($key, $value) = split "::", $input;
    if ($key eq "BAMNORMAL") {
        $bam_normal = $value;
    }
    elsif ($key eq "BAMTUMOR") {
        $bam_tumor = $value;
    }
    elsif ($key eq "OPTION") {
        $player_options = "$player_options $value";
    }
    elsif ($key eq "REFGENOME") {
        $ref_genome = $value;
    }
    elsif ($key eq "ACTION") {
        $action = $value;
    }
    elsif ($key eq "OUTPUT") {
        $output = $value;
    }
    else {
        die("Unknown Input: $input\n");
    }
}

# Fail early with a clear message instead of "uninitialized value" warnings
# followed by broken shell commands when a mandatory argument is missing.
die("Missing required input: need BAMNORMAL::, BAMTUMOR::, REFGENOME:: and ACTION::\n")
    unless defined $bam_normal
        && defined $bam_tumor
        && defined $ref_genome
        && defined $action;

# Create symbolic links and BAM indexes in the working directory so that the
# .bai index files end up next to the BAMs where samtools/jsm.py expect them.
# List-form system() bypasses the shell, so paths with spaces or shell
# metacharacters cannot break (or inject into) the command line.
my $working_dir = cwd();

system("ln", "-s", $bam_normal, "$working_dir/normal.bam") == 0
    or die "Failed to link normal BAM $bam_normal\n";
system("samtools", "index", "$working_dir/normal.bam") == 0
    or die "Failed to index normal BAM\n";

system("ln", "-s", $bam_tumor, "$working_dir/tumor.bam") == 0
    or die "Failed to link tumor BAM $bam_tumor\n";
system("samtools", "index", "$working_dir/tumor.bam") == 0
    or die "Failed to index tumor BAM\n";

# Run JointSNVMix.  For "classify" the result file is supplied through the
# "OPTION::--out_file ..." argument (see the tool XML), so $output is only
# appended for the other action (train).  $player_options is a pre-built
# option string and must be word-split by the shell, hence the string form.
if ($action eq "classify") {
    system("jsm.py $action $player_options $ref_genome $working_dir/normal.bam $working_dir/tumor.bam");
}
else {
    system("jsm.py $action $player_options $ref_genome $working_dir/normal.bam $working_dir/tumor.bam $output");
}

diff -r 000000000000 -r a1034918ab9b joint_snv_mix.xml --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/joint_snv_mix.xml Thu Jun 20 00:03:08 2013 -0400 @@ -0,0 +1,277 @@ + + classify germline and somatic mutations + + python + cython + pysam + samtools + jointsnvmix + + + + joint_snv_mix.pl + + "ACTION::${option.option}" + + "REFGENOME::$refFile.fields.path" + "BAMNORMAL::$normal_file" + "BAMTUMOR::$tumor_file" + + + #if str($option.option) == "classify": + #if ($option.parameters): + 
"OPTION::--parameters_file $option.parameters" + #end if + "OPTION::--out_file $output" + "OPTION::--somatic_threshold $option.somatic_threshold" + + #end if + + #if str($option.option) == "train": + #if ($option.priors): + "OPTION::--priors_file $option.priors" + #end if + "OUTPUT::$output" + "OPTION::--convergence_threshold $option.convergence_threshold" + "OPTION::--max_iters $option.max_iters" + + #end if + #if ($positions_file): + "OPTION::--positions_file $positions_file" + #end if + + "OPTION::--min_base_qual $min_base_quality" + "OPTION::--min_map_qual $min_map_quality" + "OPTION::--model $model" + #if ($chromosome): + "OPTION::--chromosome $chromosome" + #end if + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +.. class:: infomark + +**What it does** + +:: + + JointSNVMix implements a probabilistic graphical model to analyse sequence data + from tumour/normal pairs. The model draws statistical strength by analysing both + genome jointly to more accurately classify germline and somatic mutations. + + + Train + + The SnvMix family of models are complete generative models of the data. + As such the model parameters can be learned using the Expectation Maximisation + (EM) algorithm. The train command allows this to be done. + + All methods require that a file with the parameters for the prior densities, + and an initial set of parameters be passed in. Templates for these files can + be found in the config/ directory which ships with the package. If you are + unsure about setting the priors or parameter values these files should suffice. + + The train command will produce a parameters file suitable for use with the + classification command. Training is highly recommended to achieve optimal + performance when using SnvMix based model. + + To reduce memory consumption all subcommands of train take an optional --skip-size flag. 
+ This is the number of positions to skip over before sampling a position for the training set. + Smaller values will lead to larger training sets which will require more memory, + but should yield better parameter estimates. + + All subcommands of train also take optional parameters for minimum depth a + position has in the tumour and normal to be used for training. Higher depth + sites should give more robust estimates of the parameters. The default values + of these are likely fine. + + + Classify + + The classify command is used for analysing tumour/normal paired data and + computing the posterior probability for each of the nine joint genotypes for + a pair of diploid genomes. + + + +**Models** + +:: + + There are currently three models supported by both the train and classify commands. + All models use the JointSNVMix mixture model which jointly analyses the normal and tumour genomes. + By default snvmix2 is used but other models can be specified. + + binomial + + Uses binomial densities in the mixture model this was previously referred to as the JointSnvMix1 mode. + + snvmix2 + + Uses snvmix2 densities in the mixture as described in the original SNVMix paper previously referred to as JointSnvMix2. + + beta_binomial + + Uses beta-binomial densities in the mixture model new in version 0.8. The beta-binomial is a robust (in the statistical sense) + alternative to binomial model. It can be beneficial when dealing with over-dispersed data. This is useful in cancer genomes + since allelic frequencies at somatic mutations sites may deviate significantly from those expected under diploid model. + + +**Input** + + Bam files containing normal and tumor reads. + + +**Parameters** + + + Classify + + chromosome CHROMOSOME + Chromosome to analyse. If not set all chromosomes will + be analysed. + + min_base_qual MIN_BASE_QUAL + Remove bases with base quality lower than this. + Default is 0. + + min_map_qual MIN_MAP_QUAL + Remove bases with mapping quality lower than this. 
+ Default is 0. + + positions_file POSITIONS_FILE + Path to a file containing a list of positions to + create use for analysis. Should be space separated + chrom pos. Additionally for each chromosome the + positions should be sorted. The same format as + samtools. + + parameters_file PARAMETERS_FILE + Path to a file with custom parameters values for the + model. + + somatic_threshold SOMATIC_THRESHOLD + Only sites with P(Somatic) = p_AA_AB + p_AA_BB greater + than equal this value will be printed. Default is 0. + + + Train + + chromosome CHROMOSOME + Chromosome to analyse. If not set all chromosomes will + be analysed. + + min_base_qual MIN_BASE_QUAL + Remove bases with base quality lower than this. + Default is 0. + + min_map_qual MIN_MAP_QUAL + Remove bases with mapping quality lower than this. + Default is 0. + + positions_file POSITIONS_FILE + Path to a file containing a list of positions to + create use for analysis. Should be space separated + chrom pos. Additionally for each chromosome the + positions should be sorted. The same format as + samtools. + + priors_file PRIORS_FILE + Path to a file with priors for the model parameters. + + initial_parameters_file INITIAL_PARAMETERS_FILE + Path to a file with initial parameter values for the + model. + + min_normal_depth MIN_NORMAL_DEPTH + Minimum depth of coverage in normal sample for a site + to be eligible for use in training set. Default 10 + + min_tumour_depth MIN_TUMOUR_DEPTH + Minimum depth of coverage in tumour sample for a site + to be eligible for use in training set. Default 10 + + max_normal_depth MAX_NORMAL_DEPTH + Maximum depth of coverage in normal sample for a site + to be eligible for use in training set. Default 100 + + max_tumour_depth MAX_TUMOUR_DEPTH + Maximum depth of coverage in tumour sample for a site + to be eligible for use in training set. Default 100 + + max_iters MAX_ITERS + Maximum number of iterations to used for training + model. 
+ Default 1000 + + skip_size SKIP_SIZE + When subsampling will skip over this number of + position before adding a site to the subsample. Larger + values lead to smaller subsample data sets with faster + training and less memory. Smaller values should lead + to better parameter estimates. Default 1. + + convergence_threshold CONVERGENCE_THRESHOLD + Convergence threshold for EM training. Once the change + in objective function is below this value training + will end. Default 1e-6 + + + + + + + + + + diff -r 000000000000 -r a1034918ab9b jsm_to_vcf.pl --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/jsm_to_vcf.pl Thu Jun 20 00:03:08 2013 -0400 @@ -0,0 +1,49 @@

use strict;
use warnings;

# Convert JointSNVMix tabular output into VCF v4.1.
# Usage: jsm_to_vcf.pl <jsm_output_file> <vcf_output_file>
die qq(
Bad number of inputs

) if (!@ARGV);

my $input = $ARGV[0];
my $vcf   = $ARGV[1];

# Three-arg open with lexical handles (the uploaded copy used bareword
# handles and two-arg open, which is mode-injection prone).
open(my $jsm_fh, '<', $input) or die "Couldn't open jsm file $input!\n";
open(my $out_fh, '>', $vcf)   or die "Couldn't create vcf file $vcf!\n";

# Declare the VCF version we emit.
print $out_fh "##fileformat=VCFv4.1\n";

# Grab the column-header line, i.e. the first line after the leading
# comment lines (which start with ##).  NOTE: the readline operators were
# stripped from the uploaded patch ("my $header = ;") — restored here.
my $header = <$jsm_fh>;
while (grep(/^##/, $header)) {
    $header = <$jsm_fh>;
}
chomp $header;    # drop the newline so the last column name is clean
my @head = split("\t", $header);

print "Converting jsm output to vcf\n";
# VCF header: #CHROM POS ID REF ALT QUAL FILTER INFO
print $out_fh "#CHROM\tPOS\tID\tREF\tALT\tQUAL\tFILTER\tINFO\n";

# Transform each jsm data line to a VCF record; columns with no direct VCF
# equivalent are concatenated into the INFO column as name=value pairs,
# e.g. tumor_name=MH208_TUMOR;normal_name=MH208_LIVER;...;n_alt_sum=702
while (my $line = <$jsm_fh>) {
    chomp $line;
    my @fields = split("\t", $line);
    my @info;
    # Columns 0-3 map to CHROM/POS/REF/ALT; the rest (except the last,
    # which is overwritten below) become INFO entries.
    for (my $index = 4; $index < $#fields; $index++) {
        push @info, "$head[$index]=$fields[$index]";
    }
    my $infofield = join(";", @info);
    # No filtering is applied during conversion, so every record PASSes.
    $fields[-1] = "PASS";

    print $out_fh "$fields[0]\t$fields[1]\t.\t$fields[2]\t$fields[3]\t.\t$fields[-1]\t$infofield\n";
}
close($jsm_fh);
# Buffered write errors only surface at close, so check it on the output.
close($out_fh) or die "Couldn't close vcf file $vcf!\n";

diff -r 000000000000 -r a1034918ab9b 
jsm_to_vcf.xml --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/jsm_to_vcf.xml Thu Jun 20 00:03:08 2013 -0400 @@ -0,0 +1,40 @@ + + + + + jsm_to_vcf.pl $input $output + + + + + + + + + + + + + + + +.. class:: infomark + +**What it does** + +Convert Joint SNV Mix output file into standard VCF format, version 4.1 + + +**Input** + +Coverage: + + JSM output file + + + + + + + + diff -r 000000000000 -r a1034918ab9b tool_data_table_conf.xml.sample --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/tool_data_table_conf.xml.sample Thu Jun 20 00:03:08 2013 -0400 @@ -0,0 +1,8 @@ + + + + + value, dbkey, name, path + +
+
diff -r 000000000000 -r a1034918ab9b tool_dependencies.xml --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/tool_dependencies.xml Thu Jun 20 00:03:08 2013 -0400 @@ -0,0 +1,82 @@ + + + + + + + http://sourceforge.net/projects/samtools/files/samtools/0.1.18/samtools-0.1.18.tar.bz2 + sed -i.bak -e 's/-lcurses/-lncurses/g' Makefile + make + + samtools + $INSTALL_DIR/bin + + + misc/maq2sam-long + $INSTALL_DIR/bin + + + $INSTALL_DIR/bin + + + + + Compiling SAMtools requires the ncurses and zlib development libraries. + + + + + + JointSNVMix requires Python >= 2.7 + + + + + + + http://cython.org/release/Cython-0.19.1.tar.gz + $INSTALL_DIR/lib/python + export PYTHONPATH=$PYTHONPATH:$INSTALL_DIR/lib/python && python setup.py install --home $INSTALL_DIR --install-scripts $INSTALL_DIR/bin + + $INSTALL_DIR/lib/python + $INSTALL_DIR/bin + + + + + + + + + + + http://pysam.googlecode.com/files/pysam-0.5.tar.gz + $INSTALL_DIR/lib/python + export PYTHONPATH=$PYTHONPATH:$INSTALL_DIR/lib/python && python setup.py install --home $INSTALL_DIR --install-scripts $INSTALL_DIR/bin + + $INSTALL_DIR/lib/python + + + + + + + + + + http://joint-snv-mix.googlecode.com/files/JointSNVMix-0.7.5.tar.gz + $INSTALL_DIR/lib/python + export PYTHONPATH=$PYTHONPATH:$INSTALL_DIR/lib/python && python setup.py install --home $INSTALL_DIR --install-scripts $INSTALL_DIR/bin + + $INSTALL_DIR/lib/python + $INSTALL_DIR/bin + + + + + + + + + +