view extract_data_single_galaxy.pl @ 0:b7d6db3ba6bc draft

Uploaded
author grau
date Wed, 13 Nov 2013 04:25:23 -0500
parents
children
line wrap: on
line source

use strict;

if(@ARGV == 0){
die <<USAGE
usage:
perl extract_data.pl <chromFa> <bedfile> <chromcol> <startcol> <seccolm> <secondcol> <width> <statcol> <outfile>
    
    <chromFa>: the chromosome FastA containing all chromosome sequences
    <bedfile>:  the file containing the peaks in tabular format, 
                e.g., bed, gff, narrowPeak
    <chromcol>: the column of <bedfile> containing the chromosome
    <startcol>: the column of <bedfile> containing the start position relative to
                the chromosome start
    <seccolm>:   center: "Center of peak (relative to start)", end: "End of peak (global coordinates)"
    <secondcol>:  the column of <bedfile> containing the peak center position (center) relative to
                <startcol> or the column of <bedfile> containing the end position (end)
    <width>:    fixed width of all regions
    <statcol>:  the column of <bedfile> containing the peak statistic
                or a similar measure of confidence
    <outfile>:  the path to the output file, written as FastA
USAGE
}


my $chromFa = $ARGV[0];
my $bed = $ARGV[1];
my $chromcol = $ARGV[2]-1;
my $startcol = $ARGV[3]-1;
my $seccolm = $ARGV[4];
my $seccol = $ARGV[5]-1;
my $width = $ARGV[6];
my $statcol = $ARGV[7]-1;
my $outfile = $ARGV[8];

my $sort = 1;


sub loadSeq{
	my $prefix = shift;
	print $prefix," ";
	open(FA,$chromFa);
	my $head = "";
	my @lines = ();
	while(<FA>){
		chomp();
		if(/^>/){
			if($head){
				last;
			}
			if(/^>\s*(${prefix}|chr${prefix})(\s.*$|$)/i){
				$head = $_;
			}
		}elsif($head){
			push(@lines,lc($_));
		}
	}
	my $str = join("",@lines);
	print "loaded\n";
	return $str;
}



open(IN,$ARGV[1]);

my @lines = ();

while(<IN>){
	chomp();
	my @parts = split("\t",$_);
	$parts[$chromcol] =~ s/chr0/chr/g;
	my @vals = ();
	if($seccolm eq "center"){
		@vals = ($parts[$chromcol],$parts[$startcol]+$parts[$seccol],$parts[$statcol]);
	}else{
		@vals = ($parts[$chromcol],int(($parts[$startcol]+$parts[$seccol])/2),$parts[$statcol]);
	}
	push(@vals,$width);
	push(@lines,\@vals);
}

close(IN);
#print "Read input file ".$bed."\n";


if($sort){

	@lines = sort { ${$a}[0] cmp ${$b}[0]  } @lines;

}

open(OUT,">".$outfile);

print "Extracting sequences...\n\n";

my $oldchr = "";
my $sequence = "";
for my $line (@lines){
	my @ar = @{$line};
	my $chr = $ar[0];
	unless($chr eq $oldchr){
		$sequence = loadSeq($chr);
	}
	$oldchr = $chr;
	my $w = $ar[3];
	if($w <= 0){
		print $w," -> next\n";
		next;
	}
	if($w % 2 == 0){
		$w = $w/2;
	}else{
		$w = ($w-1)/2;
	}

	my $start = $ar[1]-$w-1;

	my $head = "> chr: ".$chr."; start: ".$start."; peak: ".($ar[1]-$start)."; signal: ".$ar[2]."\n";
	my $curr = substr($sequence,$start,$ar[3]);
	if($curr =~ /[^ACGTacgt]/){
		print "Sequence for\n\t",substr($head,1),"omitted due to ambiguous nucleotides.\n\n";
	}else{
		print OUT $head,$curr,"\n";
	}
}

close(OUT);
print "\nDone.\n";