changeset 3:2bc9b3ee5eea draft

Uploaded
author dereeper
date Thu, 02 Nov 2017 05:41:34 -0400
parents 013ff9bb23aa
children 6d1122b57344
files Plink.pl find_indiv.py plink.sh plink.xml tool-data/tool_dependencies.xml
diffstat 5 files changed, 0 insertions(+), 400 deletions(-) [+]
line wrap: on
line diff
--- a/Plink.pl	Fri Aug 05 10:26:44 2016 -0400
+++ /dev/null	Thu Jan 01 00:00:00 1970 +0000
@@ -1,242 +0,0 @@
-
-#!/usr/bin/perl
-
-use strict;
-use Getopt::Long;
-use Bio::SeqIO;
-
-my $usage = qq~Usage:$0 <args> [<opts>]
-
-where <args> are:
-
-    -i, --input          <VCF input>
-    -o, --out            <Output basename>
-      
-      <opts> are:
-
-    -s, --samples        <Samples to be analyzed. Comma separated list>
-    -c, --chromosomes    <List of chromosomes to be analyzed.>
-    -e, --export         <Output format (VCF/freq/plink. Default: VCF>
-    -f, --frequency      <Minimum MAF. Default: 0.001>
-    -m, --max_freq       <Maximum MAF. Default: 0.5>
-    -a, --allow_missing  <Allowed missing data proportion per site. Must be comprised between 0 and 1. Default: 1>
-    -t, --type           <Type of polymorphisms to keep (ALL/SNP). Default: ALL>
-    -b, --bounds         <Lower bound and upper bound for a range of sites to be processed (start,end). Default: 1, 100000000>
-    -r, --remove_filt    <Remove all sites with a FILTER flag other than PASS (true/false). Default: false>
-    -d, --distance       <Thin sites so that no two sites are within the specified distance from one another. Default: 0>
-~;
-$usage .= "\n";
-
-my ($input,$out);
-
-my $PLINK_EXE = "plink";
-
-
-#my $indel_size_max = 500;
-#my $indel_size_min = 1;
-my $frequency_max = 0.5;
-my $frequency_min = 0.001;
-my $pos_max = 100000000000;
-my $pos_min = 0;
-my $filter_snp_type = "all";
-my $remove_filt = "False";
-
-my $missing_data = 1;
-my $export = "VCF";
-my $type = "ALL";
-my $bounds;
-my $samples;
-my $chromosomes;
-my $thin;
-
-GetOptions(
-	"input=s"        => \$input,
-	"out=s"          => \$out,
-	"samples=s"      => \$samples,
-	"chromosomes=s"  => \$chromosomes,
-	"frequency=s"    => \$frequency_min,
-	"max_freq=s"     => \$frequency_max,
-	"allow_missing=s"=> \$missing_data,
-	"export=s"       => \$export,
-	"type=s"         => \$type,
-	"bounds=s"       => \$bounds,
-	"remove_filt=s"  => \$remove_filt,
-	"distance=s"         => \$thin
-);
-
-
-die $usage
-  if ( !$input || !$out);
-
-if ($samples && $samples =~/^([\w\,\-\.]+)\s*$/){
-	$samples = $1;
-}
-elsif ($samples){
-	die "Error: Samples must be a comma separated list of string\n";
-}
-if ($bounds && $bounds =~/^([\d\,]+)\s*$/){
-	$bounds = $1;
-}
-elsif($bounds){
-	die "Error: Bounds must be a comma separated list of integers\n";
-}
-
-my $minfreq_cmd = "";
-if ($frequency_min && $frequency_min > 0 && $frequency_min =~/^([\d\.]+)\s*$/){
-	$frequency_min = $1;
-	$minfreq_cmd = "--maf $frequency_min";
-}
-elsif ($frequency_min == 0){
-	$minfreq_cmd = "";
-}
-elsif ($frequency_min){
-	die "Error: frequency must be an integer\n";
-}
-if ($thin && $thin =~/^([\d\.]+)\s*$/){
-        $thin = $1;
-}
-elsif ($thin){
-        die "Error: frequency must be an integer\n";
-}
-my $maxfreq_cmd = "";
-if ($frequency_max && $frequency_max =~/^([\d\.]+)\s*$/){
-	$frequency_max = $1;
-	if ($frequency_max < 0.5){
-		$maxfreq_cmd = "--max-maf $frequency_max";		
-	}
-}
-elsif($frequency_max){
-	die "Error: frequency must be an integer\n";
-}
-if ($missing_data =~/^([\d\.]+)\s*$/){
-	$missing_data = $1;
-	#$missing_data = 1 - $missing_data;	
-}
-elsif ($missing_data){
-	die "Error: Missing data must be an integer\n";
-}
-if ($export && $export =~/^([\w]+)\s*$/){
-	$export = $1;
-}
-elsif($export){
-	die "Error: Export must be a string\n";
-}
-if ($type && $type =~/^([\w]+)\s*$/){
-	$type = $1;
-}
-elsif($type){
-	die "Error: Type must be a string\n";
-}
-
-
-my @dnasamples;
-if ($samples)
-{
-	@dnasamples = split(",",$samples);
-}
-my @boundaries;
-if ($bounds)
-{
-	@boundaries = split(",",$bounds);
-}
-
-
-my $experiment = "chromosomes";
-my $table = "";
-my %genes;
-my @snp_ids;
-my @snp_ids_and_positions;
-my @snp_ids_and_positions_all;
-my $gene;
-my $snp_num = 0;
-my %ref_sequences;
-my %snps_of_gene;
-
-my $indiv_cmd = "";
-if (@dnasamples)
-{
-	if (scalar @dnasamples > 1)
-	{
-		open(my $S,">$out.samples");
-		foreach my $samp(@dnasamples){
-			print $S "$samp	$samp\n";
-		}
-		close($S);
-		$indiv_cmd = "--keep $out.samples ";
-	}
-	else
-	{
-		$indiv_cmd = "--indv " . join(" --indv ",@dnasamples);
-	}
-}
-
-my $chrom_cmd = "";
-if ($chromosomes)
-{
-	$chrom_cmd = "--chr ".$chromosomes
-}
-
-my $export_cmd = "--recode vcf-iid";
-if ($export eq "bcf"){
-	$export_cmd = "--recode bcf";
-}
-if ($export eq "freq"){
-	$export_cmd = "--freq";
-}
-if ($export eq "plink"){
-	$export_cmd = "--make-bed";
-}
-if ($export eq "bed"){
-        $export_cmd = "--make-bed";
-} 
-
-
-my $bounds_cmd = "";
-if (@boundaries && $chrom_cmd=~/\w/ && $chrom_cmd !~/,/)
-{
-        $bounds_cmd = "--from-bp $boundaries[0] --to-bp $boundaries[1]";
-}
-
-
- 
-my $type_cmd = "";
-if ($type eq "SNP")
-{
-	$type_cmd = "--snps-only";
-}
-
-my $filt_cmd = "";
-if ($remove_filt eq "true")
-{
-	$filt_cmd = "--remove-filtered-all";
-}
-
-my $thin_cmd = "";
-if ($thin){
-	$thin_cmd = "--bp-space $thin";
-}
-
-#my $bcf_input = $input;
-#$bcf_input =~s/vcf/bcf/g;
-my $bcf_input;
-my $bed_input = $input;
-$bed_input =~s/\.bed//g;
-
-if (-e "$bed_input.bed"){
-        system("$PLINK_EXE --bfile $bed_input --out $out $type_cmd $export_cmd $chrom_cmd $indiv_cmd $minfreq_cmd $maxfreq_cmd --geno $missing_data $thin_cmd $bounds_cmd --allow-extra-chr 1>$out.plink.stdout 2>$out.plink.stderr");
-	# for first 1000 SNPs
-	system("$PLINK_EXE --bfile $bed_input --out $out.recode $type_cmd --recode vcf-fid $chrom_cmd $indiv_cmd $minfreq_cmd $maxfreq_cmd --geno $missing_data $thin_cmd $bounds_cmd --allow-extra-chr --thin-count 800 1>$out.2.plink.stdout 2>$out.2.plink.stderr");
-}
-elsif (-e $bcf_input){
-	system("$PLINK_EXE --bcf $bcf_input --out $out $type_cmd $export_cmd $chrom_cmd $indiv_cmd $minfreq_cmd $maxfreq_cmd --geno $missing_data $thin_cmd $bounds_cmd --allow-extra-chr 1>$out.plink.stdout 2>$out.plink.stderr");
-}
-else
-{
-	system("$PLINK_EXE --vcf $input --out $out $type_cmd $export_cmd $chrom_cmd $indiv_cmd $minfreq_cmd $maxfreq_cmd --geno $missing_data $thin_cmd $bounds_cmd --allow-extra-chr 1>$out.3.plink.stdout 2>$out.3.plink.stderr");
-
-}
-
-
-
-
-	
--- a/find_indiv.py	Fri Aug 05 10:26:44 2016 -0400
+++ /dev/null	Thu Jan 01 00:00:00 1970 +0000
@@ -1,20 +0,0 @@
-import sys
-import os
-import re
-
-def get_field_samples_options(dataset):
-	options = []
-	line=os.popen("grep '#CHROM' %s"%dataset.file_name).read()[:-1].split('\t')
-	index=line.index('FORMAT')
-	for opt in line[index+1:] :
-		options.append((opt,opt, True))
-	return options
-
-def get_field_chrs_options(dataset):
-        options = []
-        chrs=os.popen("grep '##contig' %s"%dataset.file_name).read()[:-1].split('\n')
-        for line in chrs:
-		opt=re.search('^##contig=<ID=(\w+),length=',line).group(1)
-                options.append((opt,opt, True))
-        return options
-
--- a/plink.sh	Fri Aug 05 10:26:44 2016 -0400
+++ /dev/null	Thu Jan 01 00:00:00 1970 +0000
@@ -1,49 +0,0 @@
-#!/bin/bash
-
-
-tool_path=$(dirname $0)
-
-filein=$1
-fileout_label=$(date "+%Y%m%d%H%M%S")
-fileout=$2
-filelog=$3
-frequency=$4
-max_freq=$5
-allow_missing=$6
-type=${7}
-bound_start=${8}
-bound_end=${9}
-
-cp -rf $filein input$$.vcf
-
-if [ "${10}" != "None" ]
-then samples="--samples ${10}"
-fi
-
-if [ "${11}" != "None" ]
-then chromosomes="--chromosomes ${11}"
-fi
-
-if [ "$bound_start" -gt "$bound_end" ]
-then tmp=$bound_start ; bound_start=$bound_end ; bound_end=$tmp ; echo "Warning : Lower bound must be lower than greater bound!" >&2
-fi
-
-
-export="VCF"
-
-perl $tool_path/Plink.pl --input input$$.vcf --out $fileout_label --export $export --frequency $frequency --max_freq $max_freq --allow_missing $allow_missing --type $type --bounds $bound_start','$bound_end $samples $chromosomes
-
-
-#echo ${16} >>$fileout_label.log
-#echo ${15} >>$fileout_label.log
-#echo ${17} >>$fileout_label.log
-#echo ${18} >>$fileout_label.log
-
-if [ "$export" = "VCF" ]
-then cp  $fileout_label.vcf $fileout ; rm $fileout_label.vcf
-else cp  $fileout_label.bed $fileout; cp $fileout_label.bed ${15} ; cp $fileout_label.bim ${18} ;rm $fileout_label.bed $fileout_label.fam $fileout_label.bim
-fi
-
-cp $fileout_label.log $filelog
-rm $fileout_label.log
-
--- a/plink.xml	Fri Aug 05 10:26:44 2016 -0400
+++ /dev/null	Thu Jan 01 00:00:00 1970 +0000
@@ -1,83 +0,0 @@
-<tool id="sniplay_plink" name="plink" version="1.0">
-    
-    <!-- [REQUIRED] Tool description displayed after the tool name -->
-    <description> - Filter large VCF file</description>
-    
-    <!-- [OPTIONAL] 3rd party tools, binaries, modules... required for the tool to work -->
-    <requirements>
-        <requirement type="binary">perl</requirement>
-	<requirement type="package" version="1.90">plink</requirement>
-    </requirements>
-    
-
-    <!-- [STRONGLY RECOMMANDED] Exit code rules -->
-    <stdio>
-        <!-- [HELP] If no exit code rule is defined, the tool will stop if anything is written to STDERR -->
-        <exit_code range="1:" level="fatal" />
-    </stdio>    
-
-    <!-- [REQUIRED] The command to execute -->
-    <command interpreter="bash">
-	./plink.sh $vcf $fileout $filelog $frequency $max_freq $allow_missing $type_p $bound_start $bound_end
-	#if str( $samples ) == "":
-	'None'
-	#else
-	$samples
-	#end if
-	#if str( $chromosomes ) == "":
-	'None'
-	#else
-	$chromosomes
-	#end if
-    </command>
-    <code file="find_indiv.py"/>
-    <!-- [REQUIRED] Input files and tool parameters -->
-    <inputs>
-	<param name="vcf" type="data" format="vcf" optional="false" label="VCF input" />
-
-	<param name="samples" type="select" label="Samples" multiple="true" dynamic_options="get_field_samples_options(vcf)" help="Samples to be analyzed." />
-	<!--<param name="samples" type="text" label="Samples" multiple="true" help="Samples to be analyzed." />-->
-        <!--<param name="chromosomes" type="select" label="Chromosomes" multiple="true" dynamic_options="get_field_chrs_options(input)" help="Chromosomes to be analyzed." />-->
-	<param name="frequency" type="float" value="0" label="Minimum MAF" help="Minimum Minor Allele Frequency (MAF)" />
-	<param name="max_freq" type="float" value="0.5" label="Maximum MAF" help="Maximum Minor Allele Frequency (MAF)" />
-	<param name="allow_missing" type="float" value="1" min="0" max="1" label="Missing data proportion" help="Allowed missing data proportion per site. Must be comprised between 0 and 1." />
-        <param name="type_p" type="select" label="Polymorphisms" help="Type of polymorphisms to keep." >
-            <option value="ALL" selected="true">All</option>
-            <option value="SNP">SNPs only</option>
-        </param>
-        <param name="chromosomes" type="text" label="Chromosomes" multiple="true" help="Chromosomes to be analyzed. (Comma-separated list of reference sequences, ex: Chr1,Chr2)" />
-	<param name="bound_start" type="integer" value="1" label="Lower bound" help="Lower bound for a range of sites to be processed." />
-	<param name="bound_end" type="integer" value="100000000" label="Upper bound" help="Upper bound for a range of sites to be processed." />
-    </inputs>
-    
-    <!-- [REQUIRED] Output files -->
-    <outputs>
-	<data name="fileout" format="vcf" label="Plink filtered VCF"/>
-	<data name="filelog" format="txt" label="Plink logfile" />
-    </outputs>
-    
-    
-    <!-- [OPTIONAL] Help displayed in Galaxy -->
-    <help>
-
-.. class:: infomark
-
-**Authors**     Shaun Purcell : plink_
-
-.. _plink: https://www.cog-genomics.org/plink2
-
- | **Please cite** "PLINK: a toolset for whole-genome association and population-based linkage analysis.", Purcell S, Neale B, Todd-Brown K, Thomas L, Ferreira MAR, Bender D, Maller J, Sklar P, de Bakker PIW, Daly MJ, Sham PC, **American Journal of Human Genetics**, 2007 
-
-.. class:: infomark
-
-**Galaxy integration** Dereeper Alexis (IRD), Andres Gwendoline (Institut Français de Bioinformatique).
-
-.. class:: infomark
-
-**Support** For any questions about Galaxy integration, please send an e-mail to support.abims@sb-roscoff.fr
-
-
-
-    </help>
-
-</tool>
--- a/tool-data/tool_dependencies.xml	Fri Aug 05 10:26:44 2016 -0400
+++ /dev/null	Thu Jan 01 00:00:00 1970 +0000
@@ -1,6 +0,0 @@
-<?xml version="1.0"?>
-<tool_dependency>
-    <package name="plink" version="1.90">
-        <repository changeset_revision="98e3a3cf0a35" name="package_plink_1_90" owner="dereeper" toolshed="https://toolshed.g2.bx.psu.edu" />
-    </package>
-</tool_dependency>