Mercurial > repos > iuc > fermikit_variants

--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/all_fasta.loc.sample	Thu Jan 05 08:35:48 2017 -0500
@@ -0,0 +1,18 @@
+#This file lists the locations and dbkeys of all the fasta files
+#under the "genome" directory (a directory that contains a directory
+#for each build). The script extract_fasta.py will generate the file
+#all_fasta.loc. This file has the format (white space characters are
+#TAB characters):
+#
+#<unique_build_id>	<dbkey>		<display_name>	<file_path>
+#
+#So, all_fasta.loc could look something like this:
+#
+#apiMel3	apiMel3	Honeybee (Apis mellifera): apiMel3		/path/to/genome/apiMel3/apiMel3.fa
+#hg19canon	hg19		Human (Homo sapiens): hg19 Canonical		/path/to/genome/hg19/hg19canon.fa
+#hg19full	hg19		Human (Homo sapiens): hg19 Full			/path/to/genome/hg19/hg19full.fa
+#
+#Your all_fasta.loc file should contain an entry for each individual
+#fasta file, so a single build can have several entries, as hg19 does
+#in the example above.
+#
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/fermikit_run_calling.xml	Thu Jan 05 08:35:48 2017 -0500
@@ -0,0 +1,78 @@
+<tool id="fermikit_variants" name="fermikit-variants" version="0.14.dev1">
+    <description>call variants from genome-aligned contigs</description>
+    <requirements>
+        <requirement type="package" version="0.14.dev1">fermikit</requirement>
+        <requirement type="package" version="0.6.5">sambamba</requirement>
+    </requirements>
+    <command detect_errors="aggressive"><![CDATA[
+        ## Turn the dataset name into a safe file name: every character that is not
+        ## a word character, a hyphen or whitespace becomes an underscore.
+        #import re
+        #set escaped_element_identifier = re.sub('[^\w\-\s]', '_', str($bam.element_identifier))
+        ## Reference FASTA: the path field of the selected data table entry, or the
+        ## FASTA dataset chosen from the history.
+        #set ref = $reference_genome.fasta_item.fields.path if $reference_genome.reference_genome_source == "reference" else $reference_genome.history_item
+        ## Without pipefail the exit status of a pipeline is that of its last command
+        ## (gzip), so a crash in htsbox or k8 would go unnoticed.
+        set -o pipefail &&
+        ## hapdip.js is looked up in the directory that holds the k8 executable.
+        export ROOT=\$(dirname \$(type -P k8)) &&
+        ## Work on a symlink named after the dataset; the expected test output
+        ## flt.vcf carries the sanitized name as its sample column.
+        ln -f -s '$bam' '$escaped_element_identifier' &&
+        htsbox pileup -cuf '$ref' '$escaped_element_identifier'  | gzip -1 > raw.vcf.gz &&
+        ## NOTE(review): the 2> below only captures the stderr of gzip; messages
+        ## from the two k8 processes still go to the job stderr.
+        k8 "\$ROOT"/hapdip.js deovlp raw.vcf.gz | k8 "\$ROOT"/hapdip.js anno | gzip -1 > tmp.vcf.gz 2> flt.vcf.log &&
+        k8 "\$ROOT"/hapdip.js filter -q3 tmp.vcf.gz > flt.vcf 2>> flt.vcf.log &&
+        ## abreak is fed a name-sorted copy of the BAM through process substitution.
+        htsbox abreak -bcuf '$ref' <(sambamba sort -n -o /dev/stdout '$escaped_element_identifier') > sv.vcf
+    ]]></command>
+    <inputs>
+        <param name="bam" type="data" label="aligned contigs" help="To generate aligned contigs align fermi2 contigs with BWA mem options -B9 -O16 -L5" format="bam"/>
+        <conditional name="reference_genome" label="Reference genome to call variants against">
+            <param name="reference_genome_source" type="select" label="Source of the reference genome">
+                <option value="reference">Use a built-in genome to call variants</option>
+                <option value="history">Use a genome from history to call variants</option>
+            </param>
+            <when value="history">
+                <param format="fasta" label="Select a reference genome" name="history_item" type="data" />
+            </when>
+            <when value="reference">
+                <param label="Select a reference genome" name="fasta_item" type="select">
+                    <options from_data_table="all_fasta">
+                        <!-- Column 2 of the all_fasta table (value, dbkey, name, path) is the display name. -->
+                        <filter column="2" type="sort_by"/>
+                    </options>
+                </param>
+            </when>
+        </conditional>
+    </inputs>
+    <outputs>
+        <data name="structural_variants" format="vcf" label="fermikit SV on ${on_string}" from_work_dir="sv.vcf"/>
+        <data name="snps_indels" format="vcf" label="fermikit SNPs and short INDELs on ${on_string}" from_work_dir="flt.vcf"/>
+    </outputs>
+    <tests>
+        <test>
+            <!-- Exercise the history branch with a gzipped FASTA from test-data. -->
+            <param name="reference_genome_source" value="history"/>
+            <param name="history_item" value="small.fa.gz" ftype="fasta"/>
+            <param name="bam" value="aligned_contigs.bam"/>
+            <!-- lines_diff="2" presumably absorbs the ##reference header line, which
+                 records a run-specific path in the expected files. -->
+            <output name="structural_variants" file="sv.vcf" lines_diff="2"/>
+            <output name="snps_indels" file="flt.vcf" lines_diff="2"/>
+        </test>
+    </tests>
+    <help><![CDATA[
+
+FermiKit is a de novo assembly based variant calling pipeline for deep Illumina
+resequencing data. This galaxy wrapper can be used to call variants from contigs
+generated by fermi2 that have subsequently been aligned to a reference genome
+using bwa (options -B9 -O16 -L5 or -x intractg).
+
+]]></help>
+    <citations>
+        <citation type="doi">10.1093/bioinformatics/btv440</citation>
+    </citations>
+</tool>
Binary file test-data/aligned_contigs.bam has changed
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/test-data/flt.vcf	Thu Jan 05 08:35:48 2017 -0500
@@ -0,0 +1,49 @@
+##fileformat=VCFv4.1
+##source=htsbox-pileup-r327
+##reference=/tmp/tmpIcvwsb/files/000/dataset_2.dat
+##contig=<ID=11_1910000_1940000,length=30001>
+##FORMAT=<ID=GT,Number=1,Type=String,Description="Genotype">
+##FORMAT=<ID=AD,Number=R,Type=Integer,Description="Allelic depths for the ref and alt alleles in the order listed">
+##INFO=<ID=_DP,Number=1,Type=Integer,Description="Raw read depth">
+##INFO=<ID=_DS,Number=1,Type=Integer,Description="min{alt_DP_on_forward, alt_DP_on_reverse}">
+##INFO=<ID=_AB,Number=1,Type=Integer,Description="Percentage of non-reference reads">
+##INFO=<ID=_FS,Number=1,Type=Integer,Description="Phred-scaled p-value using Fisher's exact test to detect strand bias">
+##FILTER=<ID=DPhigh,Description="High read depth: _DP>62.82">
+##FILTER=<ID=DPlow,Description="Low read depth: _DP<3">
+##FILTER=<ID=FShigh,Description="Large Fisher-Strand bias: _FS>30">
+##FILTER=<ID=ABlow,Description="Low fraction of non-reference reads: _AB<30 at SNPs or _AB<30 at INDELs">
+##FILTER=<ID=DSlow,Description="Low double-strand support at SNPs: _DS<1">
+#CHROM	POS	ID	REF	ALT	QUAL	FILTER	INFO	FORMAT	aligned_contigs_bam
+11_1910000_1940000	636	.	c	T	25	.	_DP=39;_AB=64	GT:AD	1/0:14,25
+11_1910000_1940000	708	.	t	C	40	.	_DP=40;_AB=100	GT:AD	1/1:0,40
+11_1910000_1940000	1116	.	t	C	43	.	_DP=43;_AB=100	GT:AD	1/1:0,43
+11_1910000_1940000	1891	.	t	C	44	.	_DP=44;_AB=100	GT:AD	1/1:0,44
+11_1910000_1940000	2297	.	a	G	43	.	_DP=43;_AB=100	GT:AD	1/1:0,43
+11_1910000_1940000	2727	.	a	G	42	.	_DP=42;_AB=100	GT:AD	1/1:0,42
+11_1910000_1940000	3378	.	a	G	50	.	_DP=50;_AB=100	GT:AD	1/1:0,50
+11_1910000_1940000	4140	.	c	T	21	.	_DP=36;_AB=58	GT:AD	1/0:15,21
+11_1910000_1940000	4820	.	g	A	35	.	_DP=35;_AB=100	GT:AD	1/1:0,35
+11_1910000_1940000	4860	.	t	G	36	.	_DP=36;_AB=100	GT:AD	1/1:0,36
+11_1910000_1940000	4976	.	g	A	33	.	_DP=33;_AB=100	GT:AD	1/1:0,33
+11_1910000_1940000	5455	.	a	AGT	12	.	_DP=25;_AB=48	GT:AD	0/1:13,12
+11_1910000_1940000	5559	.	g	T	39	.	_DP=39;_AB=100	GT:AD	1/1:0,39
+11_1910000_1940000	6369	.	ct	C	8	ABlow	_DP=42;_AB=19	GT:AD	0/1:34,8
+11_1910000_1940000	6654	.	g	A	43	.	_DP=43;_AB=100	GT:AD	1/1:0,43
+11_1910000_1940000	7873	.	g	A	45	.	_DP=45;_AB=100	GT:AD	1/1:0,45
+11_1910000_1940000	8084	.	t	C	54	.	_DP=54;_AB=100	GT:AD	1/1:0,54
+11_1910000_1940000	10894	.	t	G	40	.	_DP=40;_AB=100	GT:AD	1/1:0,40
+11_1910000_1940000	12259	.	a	G	35	.	_DP=35;_AB=100	GT:AD	1/1:0,35
+11_1910000_1940000	15695	.	g	A	36	.	_DP=36;_AB=100	GT:AD	1/1:0,36
+11_1910000_1940000	16353	.	ctt	C	14	.	_DP=14;_AB=100	GT:AD	1/1:0,14
+11_1910000_1940000	20714	.	c	T	24	.	_DP=42;_AB=57	GT:AD	1/0:18,24
+11_1910000_1940000	24531	.	t	G	32	.	_DP=32;_AB=100	GT:AD	1/1:0,32
+11_1910000_1940000	24546	.	gt	G	25	.	_DP=25;_AB=100	GT:AD	1/1:0,25
+11_1910000_1940000	25710	.	t	TTG	17	.	_DP=46;_AB=37	GT:AD	0/1:29,17
+11_1910000_1940000	25813	.	c	CTG,CTGTG	13	.	_DP=33;_AB=70	GT:AD	1/0:10,13,10
+11_1910000_1940000	26084	.	ctg	C,CTGTGTG	17	.	_DP=36;_AB=75	GT:AD	1/2:9,17,10
+11_1910000_1940000	26212	.	g	A	29	.	_DP=55;_AB=53	GT:AD	1/0:26,29
+11_1910000_1940000	26370	.	c	CTG,CTGTGTG	12	.	_DP=34;_AB=68	GT:AD	1/0:11,12,11
+11_1910000_1940000	26521	.	a	G	30	.	_DP=30;_AB=100	GT:AD	1/1:0,30
+11_1910000_1940000	26713	.	a	G	32	.	_DP=32;_AB=100	GT:AD	1/1:0,32
+11_1910000_1940000	27735	.	tc	T	41	.	_DP=41;_AB=100	GT:AD	1/1:0,41
+11_1910000_1940000	28524	.	c	A	18	.	_DP=38;_AB=47	GT:AD	0/1:20,18
Binary file test-data/small.fa.gz has changed
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/test-data/sv.vcf	Thu Jan 05 08:35:48 2017 -0500
@@ -0,0 +1,16 @@
+##fileformat=VCFv4.1
+##source=htsbox-abreak-r327
+##reference=/tmp/tmpIcvwsb/files/000/dataset_2.dat
+##contig=<ID=11_1910000_1940000,length=30001>
+##ALT=<ID=DEL,Description="Deletion">
+##ALT=<ID=INS,Description="Insertion">
+##INFO=<ID=SVLEN,Number=1,Type=Integer,Description="SV length">
+##INFO=<ID=SVTYPE,Number=1,Type=String,Description="Type of structural variant">
+##INFO=<ID=END,Number=1,Type=Integer,Description="End coordinate of this variant">
+##INFO=<ID=QGAP,Number=1,Type=Integer,Description="Length of gap on the query sequence">
+##INFO=<ID=MINMAPQ,Number=1,Type=Integer,Description="Min flanking mapping quality">
+##INFO=<ID=MINSC,Number=1,Type=Integer,Description="Min flanking alignment score">
+##INFO=<ID=MINTIPQ,Number=1,Type=Integer,Description="Min quality/depth flanking the break point">
+##FILTER=<ID=LowSupp,Description="MINTIPQ < 10">
+#CHROM	POS	ID	REF	ALT	QUAL	FILTER	INFO
+11_1910000_1940000	5276	.	C	<DEL>	30	.	SVTYPE=DEL;END=26956;SVLEN=21678;QGAP=3;MINMAPQ=60;MINSC=793;MINTIPQ=23
Binary file test-data/test.fastq.gz has changed
Binary file test-data/unitigs.gz has changed
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/tool_data_table_conf.xml.sample	Thu Jan 05 08:35:48 2017 -0500
@@ -0,0 +1,9 @@
+<!-- Use the file tool_data_table_conf.xml.oldlocstyle if you don't want to update your loc files as changed in revision 4550:535d276c92bc-->
+<tables>
+    <!-- Locations of all fasta files under genome directory; the columns correspond, in order, to the tab-separated fields described in all_fasta.loc.sample -->
+    <table name="all_fasta" comment_char="#" allow_duplicate_entries="False">
+        <columns>value, dbkey, name, path</columns>
+        <file path="tool-data/all_fasta.loc" />
+    </table>
+</tables>
+