diff extract_data_single_galaxy.pl @ 0:b7d6db3ba6bc draft

Uploaded
author grau
date Wed, 13 Nov 2013 04:25:23 -0500
parents
children
line wrap: on
line diff
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/extract_data_single_galaxy.pl	Wed Nov 13 04:25:23 2013 -0500
@@ -0,0 +1,128 @@
+use strict;
+
+if(@ARGV == 0){
+die <<USAGE
+usage:
+perl extract_data.pl <chromFa> <bedfile> <chromcol> <startcol> <seccolm> <secondcol> <width> <statcol> <outfile>
+    
+    <chromFa>: the chromosome FastA containing all chromosome sequences
+    <bedfile>:  the file containing the peaks in tabular format, 
+                e.g., bed, gff, narrowPeak
+    <chromcol>: the column of <bedfile> containing the chromosome
+    <startcol>: the column of <bedfile> containing the start position relative to
+                the chromosome start
+    <seccolm>:   center: "Center of peak (relative to start)", end: "End of peak (global coordinates)"
+    <secondcol>:  the column of <bedfile> containing the peak center position (center) relative to
+                <startcol> or the column of <bedfile> containing the end position (end)
+    <width>:    fixed width of all regions
+    <statcol>:  the column of <bedfile> containing the peak statistic
+                or a similar measure of confidence
+    <outfile>:  the path to the output file, written as FastA
+USAGE
+}
+
+
+my $chromFa = $ARGV[0];
+my $bed = $ARGV[1];
+my $chromcol = $ARGV[2]-1;
+my $startcol = $ARGV[3]-1;
+my $seccolm = $ARGV[4];
+my $seccol = $ARGV[5]-1;
+my $width = $ARGV[6];
+my $statcol = $ARGV[7]-1;
+my $outfile = $ARGV[8];
+
+my $sort = 1;
+
+
+sub loadSeq{
+	my $prefix = shift;
+	print $prefix," ";
+	open(FA,$chromFa);
+	my $head = "";
+	my @lines = ();
+	while(<FA>){
+		chomp();
+		if(/^>/){
+			if($head){
+				last;
+			}
+			if(/^>\s*(${prefix}|chr${prefix})(\s.*$|$)/i){
+				$head = $_;
+			}
+		}elsif($head){
+			push(@lines,lc($_));
+		}
+	}
+	my $str = join("",@lines);
+	print "loaded\n";
+	return $str;
+}
+
+
+
+open(IN,$ARGV[1]);
+
+my @lines = ();
+
+while(<IN>){
+	chomp();
+	my @parts = split("\t",$_);
+	$parts[$chromcol] =~ s/chr0/chr/g;
+	my @vals = ();
+	if($seccolm eq "center"){
+		@vals = ($parts[$chromcol],$parts[$startcol]+$parts[$seccol],$parts[$statcol]);
+	}else{
+		@vals = ($parts[$chromcol],int(($parts[$startcol]+$parts[$seccol])/2),$parts[$statcol]);
+	}
+	push(@vals,$width);
+	push(@lines,\@vals);
+}
+
+close(IN);
+#print "Read input file ".$bed."\n";
+
+
+if($sort){
+
+	@lines = sort { ${$a}[0] cmp ${$b}[0]  } @lines;
+
+}
+
+open(OUT,">".$outfile);
+
+print "Extracting sequences...\n\n";
+
+my $oldchr = "";
+my $sequence = "";
+for my $line (@lines){
+	my @ar = @{$line};
+	my $chr = $ar[0];
+	unless($chr eq $oldchr){
+		$sequence = loadSeq($chr);
+	}
+	$oldchr = $chr;
+	my $w = $ar[3];
+	if($w <= 0){
+		print $w," -> next\n";
+		next;
+	}
+	if($w % 2 == 0){
+		$w = $w/2;
+	}else{
+		$w = ($w-1)/2;
+	}
+
+	my $start = $ar[1]-$w-1;
+
+	my $head = "> chr: ".$chr."; start: ".$start."; peak: ".($ar[1]-$start)."; signal: ".$ar[2]."\n";
+	my $curr = substr($sequence,$start,$ar[3]);
+	if($curr =~ /[^ACGTacgt]/){
+		print "Sequence for\n\t",substr($head,1),"omitted due to ambiguous nucleotides.\n\n";
+	}else{
+		print OUT $head,$curr,"\n";
+	}
+}
+
+close(OUT);
+print "\nDone.\n";
\ No newline at end of file