# HG changeset patch
# User saskia-hiltemann
# Date 1438592475 14400
# Node ID 1209f18a5a83a69a205bd98ca8e5a3622c191332
Uploaded

diff -r 000000000000 -r 1209f18a5a83 JunctionDiff-vs-background.sh
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/JunctionDiff-vs-background.sh Mon Aug 03 05:01:15 2015 -0400
@@ -0,0 +1,121 @@
+#!/bin/bash
+#
+# JunctionDiff-vs-background.sh
+#
+# Filters a junctions file against a list of background (virtual normal)
+# junction files. "cgatools junctiondiff" is run once per background file;
+# the diff-junctions.tsv written by each run becomes junctionsA of the next
+# run, and each run's report.tsv is appended to a single report file.
+#
+# Usage:
+#   JunctionDiff-vs-background.sh --variants FILE --reference CRR \
+#       --VN_junctions LISTFILE --outputfile_filtered FILE \
+#       [--cgatools_binary BIN] [--outputfile_report FILE] \
+#       [--scoreThresholdA N] [--scoreThresholdB N] \
+#       [--distance N] [--minlength N]
+#
+# LISTFILE holds one background junction file path per line; blank lines
+# are skipped.
+
+# Print a usage summary to stderr and exit with status 2.
+usage() {
+    cat >&2 <<EOF
+Usage: $0 --variants FILE --reference CRR --VN_junctions LISTFILE
+       --outputfile_filtered FILE [--cgatools_binary BIN]
+       [--outputfile_report FILE] [--scoreThresholdA N]
+       [--scoreThresholdB N] [--distance N] [--minlength N]
+EOF
+    exit 2
+}
+
+# Print an error message to stderr and exit with status 1.
+die() {
+    printf '%s: %s\n' "$0" "$*" >&2
+    exit 1
+}
+
+# set some defaults
+output_report="output_reports.tsv"
+cgatools_binary="cgatools"
+
+[[ $# -eq 0 ]] && usage
+
+# getopt runs in quoted mode (no -u) and its output is re-read with eval so
+# values containing whitespace survive. Its exit status is tested on the
+# assignment: "set -- $(getopt ...) || usage" only ever sees the status of set.
+parsed=$(getopt -n "$0" -a -o "h" \
+    --longoptions "variants:,reference:,VN_junctions:,cgatools_binary:,outputfile_filtered:,outputfile_report:,scoreThresholdA:,scoreThresholdB:,distance:,minlength:" \
+    -- "$@") || usage
+eval set -- "$parsed"
+
+while [[ $# -gt 0 ]]; do
+    case "$1" in
+        --variants) variants=$2; shift ;;
+        --reference) crr=$2; shift ;;
+        --VN_junctions) VN_junctionfiles_list=$2; shift ;;
+        --cgatools_binary) cgatools_binary=$2; shift ;;   # cgatools binary to use
+        --outputfile_filtered) output_filtered=$2; shift ;;
+        --outputfile_report) output_report=$2; shift ;;
+        --scoreThresholdA) scoreThresholdA=$2; shift ;;
+        --scoreThresholdB) scoreThresholdB=$2; shift ;;
+        --distance) distance=$2; shift ;;
+        --minlength) minlength=$2; shift ;;
+        -h) usage ;;
+        --) shift; break ;;
+        -*) usage ;;
+        *) break ;;
+    esac
+    shift
+done
+
+[[ -n "$variants" ]] || die "--variants is required"
+[[ -n "$crr" ]] || die "--reference is required"
+[[ -n "$VN_junctionfiles_list" ]] || die "--VN_junctions is required"
+[[ -n "$output_filtered" ]] || die "--outputfile_filtered is required"
+[[ -r "$VN_junctionfiles_list" ]] || die "cannot read $VN_junctionfiles_list"
+
+# Optional thresholds are forwarded only when given, so an omitted option
+# no longer turns into an empty argument on the junctiondiff command line.
+extra_args=()
+[[ -n "$scoreThresholdA" ]] && extra_args+=(--scoreThresholdA "$scoreThresholdA")
+[[ -n "$scoreThresholdB" ]] && extra_args+=(--scoreThresholdB "$scoreThresholdB")
+[[ -n "$distance" ]] && extra_args+=(--distance "$distance")
+[[ -n "$minlength" ]] && extra_args+=(--minlength "$minlength")
+
+# make copy of input junctions file, as this file will be altered
+junctions="junctions.tsv"
+cp -- "$variants" "$junctions" || die "cannot copy $variants to $junctions"
+
+### run JunctionDiff against all of the VN junctionfiles
+
+echo "running JunctionDiff against each of the VN genomes"
+
+# for each line in VN genomes list of junctionfiles, run junctiondiff;
+# the "|| [[ -n ... ]]" also handles a last line without trailing newline
+count=0
+while read -r line || [[ -n "$line" ]]; do
+    [[ -n "$line" ]] || continue    # catch empty lines
+    count=$((count + 1))
+
+    "$cgatools_binary" junctiondiff \
+        --beta \
+        --statout \
+        --reference "$crr" \
+        --junctionsA "$junctions" \
+        --junctionsB "$line" \
+        "${extra_args[@]}" \
+        || die "junctiondiff failed on $line (run $count)"
+
+    # concatenate all reports
+    {
+        printf 'report of run %s:\n----------------------\n' "$count"
+        cat report.tsv
+        echo ""
+    } >> "$output_report"
+
+    # rename output file to junctions file for next iteration
+    mv -f -- "diff-$junctions" "$junctions" \
+        || die "junctiondiff did not produce diff-$junctions"
+done < "$VN_junctionfiles_list"
+
+cp -- "$junctions" "$output_filtered" || die "cannot write $output_filtered"
diff -r 000000000000 -r 1209f18a5a83 JunctionDiff-vs-background.xml --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/JunctionDiff-vs-background.xml Mon Aug 03 05:01:15 2015 -0400 @@ -0,0 +1,72 @@ + + Filter SVs based on presence in VN set + + + cgatools + + + + JunctionDiff-vs-background.sh + --variants $variants + --reference ${reference.fields.reference_crr_cgatools} + #if $virtnorm.VNset == "diversity" + --VN_junctions ${reference.fields.VN_genomes_junctionfile_list} + #else + --VN_junctions ${reference.fields.VN_genomes_junctionfile_list_1000G} + #end if + --cgatools_binary cgatools + --outputfile_filtered $output_filtered + --scoreThresholdA $scoreThresholdA + --scoreThresholdB $scoreThresholdB + --distance $distance + --minlength $minlength + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + report == "Y" + + + + +**What it does** + + + +**Input Files** +Complete Genomics Junctions file + +**Output Files** +Junctions remaining after filtering + + + + + + + diff -r 000000000000 -r 1209f18a5a83 README.txt --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/README.txt Mon Aug 03 05:01:15 2015 -0400 @@ -0,0 +1,32 @@ +Installs VirtualNormal Correction Tool + +After installing this tool via admin panel, manually configure the following: + +1) edit virtual_normal_correction.loc file + + - change "/path/to/hg18.crr" to the location of the Complete Genomics reference crr file on your system + (can be downloaded from ftp://ftp.completegenomics.com/ReferenceFiles/ ) + + - change 
"/path/to/VN_genomes_varfiles_hg18.txt" to the location of the file containing the locations of all the Complete Genomics + varfiles to be used as a virtual normal. This file should contain 1 file location per line, e.g. + + /path/to/normal-varfile-1 + /path/to/normal-varfile-2 + /path/to/normal-varfile-3 + /path/to/normal-varfile-4 + /path/to/normal-varfile-5 + /path/to/normal-varfile-6 + /path/to/normal-varfile-7 + /path/to/normal-varfile-8 + ... + + Varfiles can be in compressed or uncompressed form. For example, Complete Genomics' Diversity panel can be used. + (can be downloaded from ftp://ftp2.completegenomics.com/) + + - change "/path/to/VN_genomes_junctionfiles_hg18.txt" to the location of the file containing the locations of all the Complete Genomics + junctionfiles to be used as a virtual normal. This file should contain 1 file location per line. For example, Complete Genomics' + Diversity panel can be used. (can be downloaded from ftp://ftp2.completegenomics.com/) + + 2) restart Galaxy for changes to take effect + + After this initial setup, additional normals can be added to the lists without having to restart Galaxy. 
diff -r 000000000000 -r 1209f18a5a83 TV-vs-background.sh
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/TV-vs-background.sh Mon Aug 03 05:01:15 2015 -0400
@@ -0,0 +1,154 @@
+#!/bin/bash
+#
+# TV-vs-background.sh
+#
+# Runs "cgatools testvariants" for a variant list against a set of background
+# (virtual normal) varfiles, condenses the per-genome results into count
+# columns and filters the variants on those counts.
+#
+# Usage:
+#   TV-vs-background.sh --variants FILE --reference CRR \
+#       --VN_varfiles LISTFILE --outputfile_all FILE \
+#       --outputfile_filtered FILE --threshold N [--thresholdhc N]
+#
+# Files written:
+#   outputfile_all       first 8 input columns plus the VN_* count columns
+#   outputfile_filtered  rows of outputfile_all with VN_occurrences < threshold
+#   output_filtered_highconf.tsv
+#                        rows of outputfile_filtered with
+#                        VN_fullycalled_count >= thresholdhc
+#   output_expanded      copy of the raw testvariants output
+#   VN_varfiles.txt      the varfile paths, separated by spaces
+
+# Print a usage summary to stderr and exit with status 2.
+usage() {
+    cat >&2 <<EOF
+Usage: $0 --variants FILE --reference CRR --VN_varfiles LISTFILE
+       --outputfile_all FILE --outputfile_filtered FILE
+       --threshold N [--thresholdhc N]
+EOF
+    exit 2
+}
+
+# Print an error message to stderr and exit with status 1.
+die() {
+    printf '%s: %s\n' "$0" "$*" >&2
+    exit 1
+}
+
+echo "$@"
+
+[[ $# -eq 0 ]] && usage
+
+# Quoted getopt output re-read with eval keeps values with whitespace intact;
+# the exit status is tested on the assignment so that "|| usage" can fire.
+parsed=$(getopt -n "$0" -a -o "h" \
+    --longoptions "variants:,reference:,VN_varfiles:,outputfile_filtered:,outputfile_all:,threshold:,thresholdhc:" \
+    -- "$@") || usage
+eval set -- "$parsed"
+
+while [[ $# -gt 0 ]]; do
+    case "$1" in
+        --variants) variants=$2; shift ;;
+        --reference) crr=$2; shift ;;
+        --VN_varfiles) VN_varfiles_list=$2; shift ;;
+        --outputfile_filtered) output_filtered=$2; shift ;;
+        --outputfile_all) output_all=$2; shift ;;
+        --threshold) threshold=$2; shift ;;
+        --thresholdhc) thresholdhc=$2; shift ;;
+        -h) usage ;;
+        --) shift; break ;;
+        -*) usage ;;
+        *) break ;;
+    esac
+    shift
+done
+
+[[ -n "$variants" ]] || die "--variants is required"
+[[ -n "$crr" ]] || die "--reference is required"
+[[ -n "$VN_varfiles_list" ]] || die "--VN_varfiles is required"
+[[ -n "$output_all" ]] || die "--outputfile_all is required"
+[[ -n "$output_filtered" ]] || die "--outputfile_filtered is required"
+[[ -r "$VN_varfiles_list" ]] || die "cannot read $VN_varfiles_list"
+
+# Both thresholds are handed to awk as numbers; reject anything else.
+number_re='^[0-9]+([.][0-9]+)?$'
+thresholdhc=${thresholdhc:-0}
+[[ "$threshold" =~ $number_re ]] || die "--threshold must be a number"
+[[ "$thresholdhc" =~ $number_re ]] || die "--thresholdhc must be a number"
+
+# Read the list of normals (one path per line, blank lines skipped). The
+# number of entries is the denominator of the VN_*frequency columns, so it
+# has to be known before anything is reported or counted.
+varfiles=()
+while read -r line || [[ -n "$line" ]]; do
+    [[ -n "$line" ]] && varfiles+=("$line")
+done < "$VN_varfiles_list"
+VNsetsize=${#varfiles[@]}
+(( VNsetsize > 0 )) || die "no varfiles listed in $VN_varfiles_list"
+
+# space-separated copy of the list, as written by earlier versions
+printf '%s ' "${varfiles[@]}" > VN_varfiles.txt
+
+### run TestVariants against the Virtual Normal set
+
+echo "number of normals: $VNsetsize"
+echo "list of normals: ($VN_varfiles_list)"
+cat VN_varfiles.txt
+
+testvariants_cmd=(cgatools testvariants
+    --beta
+    --reference "$crr"
+    --input "$variants"
+    --output "$output_all"
+    --variants "${varfiles[@]}")
+
+echo "running TV against Virtual Normal set"
+echo "command: ${testvariants_cmd[*]}"
+"${testvariants_cmd[@]}" || die "cgatools testvariants failed"
+
+# keep a copy of the unprocessed testvariants output
+cp -- "$output_all" output_expanded || die "cannot write output_expanded"
+
+### condense file to columns with counts for all background genomes
+echo "Counting..."
+awk -v totalnormals="$VNsetsize" '
+BEGIN {
+    FS = OFS = "\t"
+    ncodes = split("00 01 11 0N 1N NN 0 1 N", codes, " ")
+}
+FNR == 1 {    # header
+    print $1,$2,$3,$4,$5,$6,$7,$8,"VN_occurrences","VN_frequency","VN_fullycalled_count","VN_fullycalled_frequency","VN_00","VN_01","VN_11","VN_0N","VN_1N","VN_NN","VN_0","VN_1","VN_N"
+    next
+}
+{
+    # count entries in reference genomes (columns 9..NF)
+    for (k = 1; k <= ncodes; k++)
+        count[codes[k]] = 0
+    for (i = 9; i <= NF; i++)
+        count[$i]++
+
+    occurrences = count["11"] + count["01"] + count["1N"] + count["1"]
+    fullycalled = count["11"] + count["01"] + count["00"] + count["1"] + count["0"]
+    print $1,$2,$3,$4,$5,$6,$7,$8,occurrences,occurrences/totalnormals,fullycalled,fullycalled/totalnormals,count["00"],count["01"],count["11"],count["0N"],count["1N"],count["NN"],count["0"],count["1"],count["N"]
+}' "$output_all" > "${output_all}-counted" \
+    || die "counting failed"
+
+# this counted file is the final output file
+mv -f -- "${output_all}-counted" "$output_all" || die "cannot replace $output_all"
+
+### filter out variants occurring in threshold or more of the background genomes
+# VN_occurrences (column 9) totals the calls containing a 1 (01,11,1N,1).
+# The "+0" forces a numeric comparison; comparing against a quoted string
+# constant is lexical in awk ("9" < "10" is false).
+awk -v threshold="$threshold" '
+BEGIN { FS = OFS = "\t" }
+FNR == 1 || ($9 + 0) < (threshold + 0)
+' "$output_all" > "$output_filtered" || die "filtering failed"
+
+### high confidence subset: fully called (column 11) in at least thresholdhc normals
+awk -v threshold="$thresholdhc" '
+BEGIN { FS = OFS = "\t" }
+FNR == 1 || ($11 + 0) >= (threshold + 0)
+' "$output_filtered" > "output_filtered_highconf.tsv" \
+    || die "high confidence filtering failed"
diff -r 000000000000 -r 1209f18a5a83 TV-vs-background.xml --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/TV-vs-background.xml Mon Aug 03 05:01:15 2015 -0400 @@ -0,0 +1,101 @@ + + Filter small variants based on presence in Virtual Normal set + + + cgatools + + + + TV-vs-background.sh + --variants $variants + --reference ${reference.fields.reference_crr_cgatools} + #if 
$virtnorm.VNset == "diversity": + --VN_varfiles ${reference.fields.VN_genomes_varfiles_list} + #else + --VN_varfiles ${reference.fields.VN_genomes_varfiles_list_1000G} + #end if + --threshold $threshold + --thresholdhc $thresholdhc + --outputfile_all $output_all + --outputfile_filtered $output_filtered + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +**What it does** + +This tool compares a list of variants to a set of normal genomes. Each variant will be annotated with the number of normal samples it appears in. +The tool will also output how often the variant was found in one or both alleles (01 or 11), and distinguish between a variant not being present in the normal (00) +or the location being no-called in the normal (NN) or half-called (0N,1N) etc. + +This may take quite some time depending on the number of input variants and the number of normal genomes. + +**Input Files** + +This program takes as input a list of variants as produced by the ListVariants tool, or the vcf-to-LV preprocessing tool. Input must be a tab-separated file of the following format:: + + variantID - chromosome - begin - end - varType - reference - alleleSeq - xRef + 1034 chr1 972803 972804 snp T C dbsnp:rs31238120 + +valid entries in varType column are: snp,sub,ins,del. + +Chromosome coordinates must be zero-based half-open. + +Column names must match the ones given above. + + +**Output Files** + +1) Original input file annotated with presence (or lack thereof) in background genomes + +2) Filtered version of output 1, variants are removed when present in at least *threshold* of the background normal genomes (default: 1) (filters on column 9 of output file) + +3) High Confidence filtered version of output 2. Of all the variants labelled somatic, filter out any variants not fully called in at least *high confidence threshold* normals. 
(filter on column 11 of output file) + +Example output format:: + + variantId chromosome begin end varType reference alleleSeq xRef VN_occurrences VN_frequency VN_fullycalled_count VN_fullycalled_frequency VN_00 VN_01 VN_11 VN_0N VN_1N VN_NN VN_0 VN_1 VN_N + 34 chr1 46661 46662 snp T C dbsnp.100:rs2691309 26 0.472727 33 0.6 7 19 7 1 0 20 0 0 0 + 35 chr1 46850 46850 ins A 0 0 10 0.181818 10 0 0 5 0 39 0 0 0 + 36 chr1 46895 46896 snp T C dbsnp.100:rs2691311 8 0.145455 40 0.727273 33 7 0 2 1 11 0 0 0 + 37 chr1 46926 46927 snp G A dbsnp.100:rs2548884 7 0.127273 43 0.781818 36 7 0 2 0 9 0 0 0 + + + + + + diff -r 000000000000 -r 1209f18a5a83 VN_genomes_locations.txt --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/VN_genomes_locations.txt Mon Aug 03 05:01:15 2015 -0400 @@ -0,0 +1,8 @@ +/path/to/normal-varfile-1 +/path/to/normal-varfile-2 +/path/to/normal-varfile-3 +/path/to/normal-varfile-4 +/path/to/normal-varfile-5 +/path/to/normal-varfile-6 +/path/to/normal-varfile-7 +/path/to/normal-varfile-8 diff -r 000000000000 -r 1209f18a5a83 tool-data/virtual_normal_correction.loc.sample --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/tool-data/virtual_normal_correction.loc.sample Mon Aug 03 05:01:15 2015 -0400 @@ -0,0 +1,6 @@ +#loc file for the Virtual Normal correction tool + +# value, dbkey, name, VN_genomes_varfiles_list, VN_genomes_junctionfile_list, reference_crr_cgatools + +hg18 hg18 Virtual Normal hg18 /mnt/galaxyIndices/VirtualNormal/VN_genomes_varfiles_hg18.txt /mnt/galaxyIndices/VirtualNormal/VN_genomes_junctionfiles_hg18.txt /mnt/galaxyIndices/cgatools/build36.crr +hg19 hg19 Virtual Normal hg19 /mnt/galaxyIndices/VirtualNormal/VN_genomes_varfiles_hg19.txt /mnt/galaxyIndices/VirtualNormal/VN_genomes_junctionfiles_hg19.txt /mnt/galaxyIndices/cgatools/build37.crr diff -r 000000000000 -r 1209f18a5a83 tool_data_table_conf.xml.sample --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/tool_data_table_conf.xml.sample Mon Aug 03 05:01:15 2015 -0400 @@ -0,0 +1,7 @@ + + + +value, dbkey, name, 
VN_genomes_varfiles_list, VN_genomes_junctionfile_list, reference_crr_cgatools + +
+
diff -r 000000000000 -r 1209f18a5a83 tool_dependencies.xml --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/tool_dependencies.xml Mon Aug 03 05:01:15 2015 -0400 @@ -0,0 +1,23 @@ + + + + + + http://sourceforge.net/projects/cgatools/files/1.7.1/cgatools-1.7.1.5-linux_binary-x86_64.tar.gz + chmod a+x bin/cgatools + + bin/cgatools + $INSTALL_DIR/bin + + + $INSTALL_DIR/bin + $REPOSITORY_INSTALL_DIR + + + + + Downloads and installs the cgatools binary. + + + +
diff -r 000000000000 -r 1209f18a5a83 vcf2lv.sh
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/vcf2lv.sh Mon Aug 03 05:01:15 2015 -0400
@@ -0,0 +1,94 @@
+#!/bin/bash
+#
+# vcf2lv.sh VCFFILE OUTPUTFILE
+#
+# Converts the variants of a VCF file to a tab-separated variant list.
+#
+# vcf columns: CHROM-POS-ID-REF-ALT
+# LV columns:  variantId-chromosome-begin-end-varType-reference-alleleSeq-xRef
+#
+# For every ALT allele of every record:
+#   - add chr prefix if not present
+#   - determine varType (snp, ins, del, sub)
+#   - remove leading reference base from the non-SNP variants, update position
+#   - convert coordinates to 0-based halfopen
+#   - calculate end coordinate from position and length
+# Records on chrM are numbered but not written; xRef is left empty.
+
+if [[ $# -lt 2 ]]; then
+    printf 'Usage: %s VCFFILE OUTPUTFILE\n' "$0" >&2
+    exit 2
+fi
+
+vcffile=$1
+outputfile=$2
+
+awk '
+BEGIN {
+    FS = OFS = "\t"
+    count = 0
+
+    # output new header
+    print "variantId", "chromosome", "begin", "end", "varType", "reference", "alleleSeq", "xRef"
+}
+
+# skip header lines and nonvariant entries (period in ALT column)
+substr($0, 1, 1) == "#" || $5 == "." { next }
+
+{
+    pos = $2
+    ref = $4
+    reflen = length(ref)
+
+    # add chr prefix if needed
+    chromosome = (substr($1, 1, 3) != "chr") ? "chr" $1 : $1
+
+    # split ALT column in case of multiple variant alleles; walk the pieces
+    # by index, because "for (i in alleles)" has no defined order
+    nalleles = split($5, alleles, ",")
+    for (i = 1; i <= nalleles; i++) {
+        alt = alleles[i]
+
+        # determine varType
+        if (reflen == 1 && length(alt) == 1)
+            varType = "snp"
+        else if (reflen == 1)
+            varType = "ins"
+        else if (length(alt) == 1)
+            varType = "del"
+        else
+            varType = "sub"
+
+        # remove leading reference base (not always present for subs)
+        stripped = (varType != "snp" && substr(ref, 1, 1) == substr(alt, 1, 1))
+        if (stripped) {
+            reference = substr(ref, 2)
+            alleleSeq = substr(alt, 2)
+        } else {
+            reference = ref
+            alleleSeq = alt
+            # nothing was removed, so the whole REF is replaced by the whole ALT
+            if (varType != "snp")
+                varType = "sub"
+        }
+
+        # 0-based half-open coordinates: REF spans [pos-1, pos-1+reflen);
+        # removing the leading base moves the start one position right
+        start = stripped ? pos : pos - 1
+        stop = pos - 1 + reflen
+
+        # print output variant(s)
+        if (chromosome != "chrM")
+            print count, chromosome, start, stop, varType, reference, alleleSeq, ""
+
+        count += 1
+    }
+}' "$vcffile" > "$outputfile"
+
+# from the 1000 Genomes site:
+
+#CHROM chromosome: an identifier from the reference genome. All entries for a specific CHROM should form a contiguous block within the VCF file.(Alphanumeric String, Required)
+#POS position: The reference position, with the 1st base having position 1. Positions are sorted numerically, in increasing order, within each reference sequence CHROM. (Integer, Required)
+#ID semi-colon separated list of unique identifiers where available. If this is a dbSNP variant it is encouraged to use the rs number(s). No identifier should be present in more than one data record. If there is no identifier available, then the missing value should be used. (Alphanumeric String)
+#REF reference base(s): Each base must be one of A,C,G,T,N. Bases should be in uppercase. Multiple bases are permitted. The value in the POS field refers to the position of the first base in the String. For InDels, the reference String must include the base before the event (which must be reflected in the POS field). (String, Required).
+#ALT comma separated list of alternate non-reference alleles called on at least one of the samples. Options are base Strings made up of the bases A,C,G,T,N, or an angle-bracketed ID String ("<ID>"). If there are no alternative alleles, then the missing value should be used. Bases should be in uppercase. 
(Alphanumeric String; no whitespace, commas, or angle-brackets are permitted in the ID String itself) diff -r 000000000000 -r 1209f18a5a83 vcf2lv.xml --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/vcf2lv.xml Mon Aug 03 05:01:15 2015 -0400 @@ -0,0 +1,27 @@ + + convert VCF file to CG-compatible variant list + + + vcf2lv.sh $vcffile $outputfile + + + + + + + + + + + + +**what it does** + +Converts a VCF file containing small variants (SNVs, indels and substitutions) to a Complete Genomics type variantlist. + +After conversion, the file can be used as input to the Virtual Normal filtering pipeline. + + + + +