view Plink.pl @ 8:f733bf4f49ca draft default tip

planemo upload commit 475f4d7d8442a0d75e103af326ae5881c4d2a4ac
author dereeper
date Mon, 16 Apr 2018 08:58:55 -0400
parents d6a7be1b5adb
children
line wrap: on
line source


#!/usr/bin/perl

use strict;
use Getopt::Long;
use Bio::SeqIO;

# Command-line help text, shown when a required argument is missing.
# $0 is interpolated so the message names the script as it was invoked.
# The text ends with a blank line so the shell prompt is visually separated.
my $usage = <<"END_USAGE";
Usage:$0 <args> [<opts>]

where <args> are:

    -i, --input          <VCF input, or PLINK .bed file/prefix>
    -o, --out            <Output basename>

<opts> are:

    -s, --samples        <Samples to be analyzed. Comma separated list>
    -c, --chromosomes    <List of chromosomes to be analyzed.>
    -e, --export         <Output format (VCF/bcf/freq/plink/bed). Default: VCF>
    -f, --frequency      <Minimum MAF. Default: 0.001>
    -m, --max_freq       <Maximum MAF. Default: 0.5>
    -a, --allow_missing  <Allowed missing data proportion per site. Must be between 0 and 1. Default: 1>
    -t, --type           <Type of polymorphisms to keep (ALL/SNP). Default: ALL>
    -b, --bounds         <Lower bound and upper bound for a range of sites to be processed (start,end). Only applied when a single chromosome is selected. Default: no restriction>
    -r, --remove_filt    <Remove all sites with a FILTER flag other than PASS (true/false). Default: false>
    -d, --distance       <Thin sites so that no two sites are within the specified distance from one another. Default: 0>

END_USAGE

# Required arguments: input file (VCF, or PLINK bed file/prefix) and output basename.
my ($input, $out);

# Name of the plink executable; resolved through PATH when the command is run.
my $PLINK_EXE = "plink";

# Option defaults. Values are kept as strings here; they are validated
# further down before being turned into plink arguments.
my $frequency_max = 0.5;
my $frequency_min = 0.001;
my $remove_filt   = "False";
my $missing_data  = 1;
my $export        = "VCF";
my $type          = "ALL";
my $bounds;
my $samples;
my $chromosomes;
my $thin;

# Parse the command line. GetOptions returns false when it meets an unknown
# option or a missing value; stop with the help text instead of silently
# running with a partially parsed command line.
GetOptions(
	"input=s"         => \$input,
	"out=s"           => \$out,
	"samples=s"       => \$samples,
	"chromosomes=s"   => \$chromosomes,
	"frequency=s"     => \$frequency_min,
	"max_freq=s"      => \$frequency_max,
	"allow_missing=s" => \$missing_data,
	"export=s"        => \$export,
	"type=s"          => \$type,
	"bounds=s"        => \$bounds,
	"remove_filt=s"   => \$remove_filt,
	"distance=s"      => \$thin,
) or die $usage;

# Both the input and the output basename are mandatory.
die $usage
  if ( !$input || !$out );

# ---------------------------------------------------------------------------
# Option validation.
#
# Every user-supplied value is matched against a whitelist pattern and then
# reassigned from the capture, because the values are later placed on the
# plink command line. Invalid values stop the script with an error message.
# ---------------------------------------------------------------------------

# A plain non-negative decimal number ("5", "0.05", ".5"); trailing
# whitespace is tolerated, as for the other options.
my $number_re = qr/^(\d+(?:\.\d*)?|\.\d+)\s*$/;

if ($samples) {
	$samples =~ /^([\w\,\-\.]+)\s*$/
	  or die "Error: Samples must be a comma separated list of string\n";
	$samples = $1;
}

if ($bounds) {
	$bounds =~ /^([\d\,]+)\s*$/
	  or die "Error: Bounds must be a comma separated list of integers\n";
	$bounds = $1;
}

# Chromosome names end up on the plink command line, so reject anything that
# is not a name character, a list/range separator or a plain space.
if ($chromosomes) {
	$chromosomes =~ /^([\w\,\-\.\: ]+?)\s*$/
	  or die "Error: Chromosomes must be a list of chromosome names\n";
	$chromosomes = $1;
}

# Minimum MAF: a value of 0 (or an empty value) disables the filter.
# Anything that is not a number is an error; the numeric comparison is only
# done after the pattern check so that strings such as "abc" are not
# silently treated as 0.
my $minfreq_cmd = "";
if (defined $frequency_min && length $frequency_min) {
	$frequency_min =~ $number_re
	  or die "Error: frequency must be a number\n";
	$frequency_min = $1;
	$minfreq_cmd = "--maf $frequency_min" if $frequency_min > 0;
}

# Thinning distance between retained sites.
if ($thin) {
	$thin =~ $number_re
	  or die "Error: distance must be a number\n";
	$thin = $1;
}

# Maximum MAF: only passed to plink when it is below 0.5.
my $maxfreq_cmd = "";
if ($frequency_max) {
	$frequency_max =~ $number_re
	  or die "Error: maximum frequency must be a number\n";
	$frequency_max = $1;
	if ($frequency_max < 0.5) {
		$maxfreq_cmd = "--max-maf $frequency_max";
	}
}

# Allowed proportion of missing data per site (passed to --geno later on).
if (defined $missing_data && length $missing_data) {
	$missing_data =~ $number_re
	  or die "Error: Missing data must be a number between 0 and 1\n";
	$missing_data = $1;
	die "Error: Missing data must be a number between 0 and 1\n"
	  if $missing_data > 1;
}

if ($export) {
	$export =~ /^([\w]+)\s*$/
	  or die "Error: Export must be a string\n";
	$export = $1;
}

if ($type) {
	$type =~ /^([\w]+)\s*$/
	  or die "Error: Type must be a string\n";
	$type = $1;
}


# Split the comma-separated option values into lists.
# Empty sample names (e.g. from "a,,b") are dropped so that they do not end
# up as blank entries in the sample file written for plink.
my @dnasamples = $samples ? grep { length } split(/,/, $samples) : ();

# Expected to hold (start, end) when --bounds was given.
my @boundaries = $bounds ? split(/,/, $bounds) : ();

# ---------------------------------------------------------------------------
# Translate the validated options into plink command-line fragments.
# Each fragment is either an empty string (option not requested) or one or
# more whitespace-separated plink arguments.
# ---------------------------------------------------------------------------

# Sample selection: write "<sample><TAB><sample>" lines (family ID and
# individual ID columns) to "<out>.samples" and hand the file to --keep.
# The same mechanism is used for one sample and for many; "--indv" is a
# vcftools option and is not among the sample filters documented for
# plink 1.9.
my $indiv_cmd = "";
if (@dnasamples) {
	my $keep_file = "$out.samples";
	open(my $keep_fh, '>', $keep_file)
	  or die "Error: cannot write $keep_file: $!\n";
	foreach my $sample (@dnasamples) {
		print {$keep_fh} "$sample\t$sample\n";
	}
	close($keep_fh)
	  or die "Error: cannot close $keep_file: $!\n";
	$indiv_cmd = "--keep $keep_file ";
}

# Chromosome selection.
my $chrom_cmd = $chromosomes ? "--chr $chromosomes" : "";

# Output format. Anything not listed here (including the default "VCF")
# produces a VCF using individual IDs as sample names.
my %export_flag_for = (
	bcf   => "--recode bcf",
	freq  => "--freq",
	plink => "--make-bed",
	bed   => "--make-bed",
);
my $export_cmd = "--recode vcf-iid";
if (defined $export && exists $export_flag_for{$export}) {
	$export_cmd = $export_flag_for{$export};
}

# Positional bounds are only applied when exactly one chromosome was
# requested (no comma in the chromosome list). Both a start and an end are
# needed, otherwise plink would receive an incomplete --to-bp argument.
my $bounds_cmd = "";
if (@boundaries && $chrom_cmd =~ /\w/ && $chrom_cmd !~ /,/) {
	my $well_formed = (@boundaries == 2) && !grep { !/^\d+$/ } @boundaries;
	die "Error: Bounds must be given as start,end\n" unless $well_formed;
	$bounds_cmd = "--from-bp $boundaries[0] --to-bp $boundaries[1]";
}

# Keep only SNPs when requested.
my $type_cmd = ($type eq "SNP") ? "--snps-only" : "";

# NOTE(review): this fragment is computed but is not part of any plink
# command built in this script, so --remove_filt currently has no effect.
# "--remove-filtered-all" also looks like a vcftools option rather than a
# plink one -- confirm the intended plink flag before wiring it in.
my $filt_cmd = ($remove_filt eq "true") ? "--remove-filtered-all" : "";

# Thinning: minimum distance in base pairs between retained sites.
my $thin_cmd = $thin ? "--bp-space $thin" : "";

# ---------------------------------------------------------------------------
# Run plink.
# ---------------------------------------------------------------------------

# Execute plink with the given argument list, writing its standard output to
# "<log_base>.stdout" and its standard error to "<log_base>.stderr".
#
# The command is started without a shell (list form of system), so file
# names containing spaces or shell metacharacters are passed through
# verbatim. STDOUT/STDERR of this script are redirected for the duration of
# the call and restored afterwards.
#
# Returns the wait status of plink (0 on success).
# Dies when a log file cannot be opened or plink cannot be started.
sub run_plink {
	my ($log_base, @args) = @_;

	open(my $saved_out, '>&', \*STDOUT) or die "Error: cannot save STDOUT: $!\n";
	open(my $saved_err, '>&', \*STDERR) or die "Error: cannot save STDERR: $!\n";

	open(STDOUT, '>', "$log_base.stdout")
	  or die "Error: cannot write $log_base.stdout: $!\n";
	if (!open(STDERR, '>', "$log_base.stderr")) {
		my $reason = $!;
		# Restore both streams first so the error message is visible.
		open(STDOUT, '>&', $saved_out);
		open(STDERR, '>&', $saved_err);
		die "Error: cannot write $log_base.stderr: $reason\n";
	}

	my $status     = system { $PLINK_EXE } ($PLINK_EXE, @args);
	my $exec_error = $!;

	open(STDOUT, '>&', $saved_out) or die "Error: cannot restore STDOUT: $!\n";
	open(STDERR, '>&', $saved_err) or die "Error: cannot restore STDERR: $!\n";
	close($saved_out);
	close($saved_err);

	die "Error: cannot execute $PLINK_EXE: $exec_error\n" if $status == -1;
	return $status;
}

# NOTE(review): nothing in this script assigns $bcf_input, so the BCF branch
# below is never taken. The guard on defined() keeps the file test from
# being applied to an undefined value.
my $bcf_input;

# A PLINK binary fileset is addressed by its prefix: strip a trailing ".bed"
# (and only a trailing one) from the input name.
my $bed_input = $input;
$bed_input =~ s/\.bed$//;

# The option fragments are whitespace-separated argument strings; split them
# into individual arguments. File names are passed separately and are never
# split.
my @type_args   = split ' ', $type_cmd;
my @export_args = split ' ', $export_cmd;
my @filter_args = map { split ' ', $_ } (
	$chrom_cmd, $indiv_cmd, $minfreq_cmd, $maxfreq_cmd,
	"--geno $missing_data", $thin_cmd, $bounds_cmd,
);
my @main_args = ('--out', $out, @type_args, @export_args, @filter_args, '--allow-extra-chr');

my ($status, $log_base);
if (-e "$bed_input.bed") {
	$log_base = "$out.plink";
	$status = run_plink($log_base, '--bfile', $bed_input, @main_args);

	# Additional VCF export ("<out>.recode") limited to a random subset of
	# 800 variants. Its exit status is deliberately not checked: this
	# export is treated as best-effort.
	run_plink(
		"$out.2.plink",
		'--bfile', $bed_input, '--out', "$out.recode",
		@type_args, '--recode', 'vcf-fid', @filter_args,
		'--allow-extra-chr', '--thin-count', 800,
	);
}
elsif (defined $bcf_input && -e $bcf_input) {
	$log_base = "$out.plink";
	$status = run_plink($log_base, '--bcf', $bcf_input, @main_args);
}
else {
	$log_base = "$out.3.plink";
	$status = run_plink($log_base, '--vcf', $input, @main_args);
}

# Report a failed plink run instead of exiting as if everything went well.
die "Error: $PLINK_EXE failed (wait status $status); see $log_base.stderr\n"
  if $status != 0;