Mercurial > repos > grau > dimont_motif_discovery
view extract_data_single_galaxy.pl @ 1:eb36f7f72fb1 draft
Uploaded
author | grau |
---|---|
date | Wed, 20 Nov 2013 04:33:20 -0500 |
parents | b7d6db3ba6bc |
children |
line wrap: on
line source
use strict; if(@ARGV == 0){ die <<USAGE usage: perl extract_data.pl <chromFa> <bedfile> <chromcol> <startcol> <seccolm> <secondcol> <width> <statcol> <outfile> <chromFa>: the chromosome FastA containing all chromosome sequences <bedfile>: the file containing the peaks in tabular format, e.g., bed, gff, narrowPeak <chromcol>: the column of <bedfile> containing the chromosome <startcol>: the column of <bedfile> containing the start position relative to the chromosome start <seccolm>: center: "Center of peak (relative to start)", end: "End of peak (global coordinates)" <secondcol>: the column of <bedfile> containing the peak center position (center) relative to <startcol> or the column of <bedfile> containing the end position (end) <width>: fixed width of all regions <statcol>: the column of <bedfile> containing the peak statistic or a similar measure of confidence <outfile>: the path to the output file, written as FastA USAGE } my $chromFa = $ARGV[0]; my $bed = $ARGV[1]; my $chromcol = $ARGV[2]-1; my $startcol = $ARGV[3]-1; my $seccolm = $ARGV[4]; my $seccol = $ARGV[5]-1; my $width = $ARGV[6]; my $statcol = $ARGV[7]-1; my $outfile = $ARGV[8]; my $sort = 1; sub loadSeq{ my $prefix = shift; print $prefix," "; open(FA,$chromFa); my $head = ""; my @lines = (); while(<FA>){ chomp(); if(/^>/){ if($head){ last; } if(/^>\s*(${prefix}|chr${prefix})(\s.*$|$)/i){ $head = $_; } }elsif($head){ push(@lines,lc($_)); } } my $str = join("",@lines); print "loaded\n"; return $str; } open(IN,$ARGV[1]); my @lines = (); while(<IN>){ chomp(); my @parts = split("\t",$_); $parts[$chromcol] =~ s/chr0/chr/g; my @vals = (); if($seccolm eq "center"){ @vals = ($parts[$chromcol],$parts[$startcol]+$parts[$seccol],$parts[$statcol]); }else{ @vals = ($parts[$chromcol],int(($parts[$startcol]+$parts[$seccol])/2),$parts[$statcol]); } push(@vals,$width); push(@lines,\@vals); } close(IN); #print "Read input file ".$bed."\n"; if($sort){ @lines = sort { ${$a}[0] cmp ${$b}[0] } @lines; } open(OUT,">".$outfile); print "Extracting sequences...\n\n"; my $oldchr = ""; my $sequence = ""; for my $line (@lines){ my @ar = @{$line}; my $chr = $ar[0]; unless($chr eq $oldchr){ $sequence = loadSeq($chr); } $oldchr = $chr; my $w = $ar[3]; if($w <= 0){ print $w," -> next\n"; next; } if($w % 2 == 0){ $w = $w/2; }else{ $w = ($w-1)/2; } my $start = $ar[1]-$w-1; my $head = "> chr: ".$chr."; start: ".$start."; peak: ".($ar[1]-$start)."; signal: ".$ar[2]."\n"; my $curr = substr($sequence,$start,$ar[3]); if($curr =~ /[^ACGTacgt]/){ print "Sequence for\n\t",substr($head,1),"omitted due to ambiguous nucleotides.\n\n"; }else{ print OUT $head,$curr,"\n"; } } close(OUT); print "\nDone.\n";