annotate extract_data_single_galaxy.pl @ 0:b7d6db3ba6bc draft

Uploaded
author grau
date Wed, 13 Nov 2013 04:25:23 -0500
parents
children
Ignore whitespace changes - Everywhere: Within whitespace: At end of lines:
rev   line source
0
b7d6db3ba6bc Uploaded
grau
parents:
diff changeset
1 use strict;
b7d6db3ba6bc Uploaded
grau
parents:
diff changeset
2
b7d6db3ba6bc Uploaded
grau
parents:
diff changeset
3 if(@ARGV == 0){
b7d6db3ba6bc Uploaded
grau
parents:
diff changeset
4 die <<USAGE
b7d6db3ba6bc Uploaded
grau
parents:
diff changeset
5 usage:
b7d6db3ba6bc Uploaded
grau
parents:
diff changeset
6 perl extract_data.pl <chromFa> <bedfile> <chromcol> <startcol> <seccolm> <secondcol> <width> <statcol> <outfile>
b7d6db3ba6bc Uploaded
grau
parents:
diff changeset
7
b7d6db3ba6bc Uploaded
grau
parents:
diff changeset
8 <chromFa>: the chromosome FastA containing all chromosome sequences
b7d6db3ba6bc Uploaded
grau
parents:
diff changeset
9 <bedfile>: the file containing the peaks in tabular format,
b7d6db3ba6bc Uploaded
grau
parents:
diff changeset
10 e.g., bed, gff, narrowPeak
b7d6db3ba6bc Uploaded
grau
parents:
diff changeset
11 <chromcol>: the column of <bedfile> containing the chromosome
b7d6db3ba6bc Uploaded
grau
parents:
diff changeset
12 <startcol>: the column of <bedfile> containing the start position relative to
b7d6db3ba6bc Uploaded
grau
parents:
diff changeset
13 the chromosome start
b7d6db3ba6bc Uploaded
grau
parents:
diff changeset
14 <seccolm>: center: "Center of peak (relative to start)", end: "End of peak (global coordinates)"
b7d6db3ba6bc Uploaded
grau
parents:
diff changeset
15 <secondcol>: the column of <bedfile> containing the peak center position (center) relative to
b7d6db3ba6bc Uploaded
grau
parents:
diff changeset
16 <startcol> or the column of <bedfile> containing the end position (end)
b7d6db3ba6bc Uploaded
grau
parents:
diff changeset
17 <width>: fixed width of all regions
b7d6db3ba6bc Uploaded
grau
parents:
diff changeset
18 <statcol>: the column of <bedfile> containing the peak statistic
b7d6db3ba6bc Uploaded
grau
parents:
diff changeset
19 or a similar measure of confidence
b7d6db3ba6bc Uploaded
grau
parents:
diff changeset
20 <outfile>: the path to the output file, written as FastA
b7d6db3ba6bc Uploaded
grau
parents:
diff changeset
21 USAGE
b7d6db3ba6bc Uploaded
grau
parents:
diff changeset
22 }
b7d6db3ba6bc Uploaded
grau
parents:
diff changeset
23
b7d6db3ba6bc Uploaded
grau
parents:
diff changeset
24
b7d6db3ba6bc Uploaded
grau
parents:
diff changeset
25 my $chromFa = $ARGV[0];
b7d6db3ba6bc Uploaded
grau
parents:
diff changeset
26 my $bed = $ARGV[1];
b7d6db3ba6bc Uploaded
grau
parents:
diff changeset
27 my $chromcol = $ARGV[2]-1;
b7d6db3ba6bc Uploaded
grau
parents:
diff changeset
28 my $startcol = $ARGV[3]-1;
b7d6db3ba6bc Uploaded
grau
parents:
diff changeset
29 my $seccolm = $ARGV[4];
b7d6db3ba6bc Uploaded
grau
parents:
diff changeset
30 my $seccol = $ARGV[5]-1;
b7d6db3ba6bc Uploaded
grau
parents:
diff changeset
31 my $width = $ARGV[6];
b7d6db3ba6bc Uploaded
grau
parents:
diff changeset
32 my $statcol = $ARGV[7]-1;
b7d6db3ba6bc Uploaded
grau
parents:
diff changeset
33 my $outfile = $ARGV[8];
b7d6db3ba6bc Uploaded
grau
parents:
diff changeset
34
b7d6db3ba6bc Uploaded
grau
parents:
diff changeset
35 my $sort = 1;
b7d6db3ba6bc Uploaded
grau
parents:
diff changeset
36
b7d6db3ba6bc Uploaded
grau
parents:
diff changeset
37
b7d6db3ba6bc Uploaded
grau
parents:
diff changeset
38 sub loadSeq{
b7d6db3ba6bc Uploaded
grau
parents:
diff changeset
39 my $prefix = shift;
b7d6db3ba6bc Uploaded
grau
parents:
diff changeset
40 print $prefix," ";
b7d6db3ba6bc Uploaded
grau
parents:
diff changeset
41 open(FA,$chromFa);
b7d6db3ba6bc Uploaded
grau
parents:
diff changeset
42 my $head = "";
b7d6db3ba6bc Uploaded
grau
parents:
diff changeset
43 my @lines = ();
b7d6db3ba6bc Uploaded
grau
parents:
diff changeset
44 while(<FA>){
b7d6db3ba6bc Uploaded
grau
parents:
diff changeset
45 chomp();
b7d6db3ba6bc Uploaded
grau
parents:
diff changeset
46 if(/^>/){
b7d6db3ba6bc Uploaded
grau
parents:
diff changeset
47 if($head){
b7d6db3ba6bc Uploaded
grau
parents:
diff changeset
48 last;
b7d6db3ba6bc Uploaded
grau
parents:
diff changeset
49 }
b7d6db3ba6bc Uploaded
grau
parents:
diff changeset
50 if(/^>\s*(${prefix}|chr${prefix})(\s.*$|$)/i){
b7d6db3ba6bc Uploaded
grau
parents:
diff changeset
51 $head = $_;
b7d6db3ba6bc Uploaded
grau
parents:
diff changeset
52 }
b7d6db3ba6bc Uploaded
grau
parents:
diff changeset
53 }elsif($head){
b7d6db3ba6bc Uploaded
grau
parents:
diff changeset
54 push(@lines,lc($_));
b7d6db3ba6bc Uploaded
grau
parents:
diff changeset
55 }
b7d6db3ba6bc Uploaded
grau
parents:
diff changeset
56 }
b7d6db3ba6bc Uploaded
grau
parents:
diff changeset
57 my $str = join("",@lines);
b7d6db3ba6bc Uploaded
grau
parents:
diff changeset
58 print "loaded\n";
b7d6db3ba6bc Uploaded
grau
parents:
diff changeset
59 return $str;
b7d6db3ba6bc Uploaded
grau
parents:
diff changeset
60 }
b7d6db3ba6bc Uploaded
grau
parents:
diff changeset
61
b7d6db3ba6bc Uploaded
grau
parents:
diff changeset
62
b7d6db3ba6bc Uploaded
grau
parents:
diff changeset
63
b7d6db3ba6bc Uploaded
grau
parents:
diff changeset
64 open(IN,$ARGV[1]);
b7d6db3ba6bc Uploaded
grau
parents:
diff changeset
65
b7d6db3ba6bc Uploaded
grau
parents:
diff changeset
66 my @lines = ();
b7d6db3ba6bc Uploaded
grau
parents:
diff changeset
67
b7d6db3ba6bc Uploaded
grau
parents:
diff changeset
68 while(<IN>){
b7d6db3ba6bc Uploaded
grau
parents:
diff changeset
69 chomp();
b7d6db3ba6bc Uploaded
grau
parents:
diff changeset
70 my @parts = split("\t",$_);
b7d6db3ba6bc Uploaded
grau
parents:
diff changeset
71 $parts[$chromcol] =~ s/chr0/chr/g;
b7d6db3ba6bc Uploaded
grau
parents:
diff changeset
72 my @vals = ();
b7d6db3ba6bc Uploaded
grau
parents:
diff changeset
73 if($seccolm eq "center"){
b7d6db3ba6bc Uploaded
grau
parents:
diff changeset
74 @vals = ($parts[$chromcol],$parts[$startcol]+$parts[$seccol],$parts[$statcol]);
b7d6db3ba6bc Uploaded
grau
parents:
diff changeset
75 }else{
b7d6db3ba6bc Uploaded
grau
parents:
diff changeset
76 @vals = ($parts[$chromcol],int(($parts[$startcol]+$parts[$seccol])/2),$parts[$statcol]);
b7d6db3ba6bc Uploaded
grau
parents:
diff changeset
77 }
b7d6db3ba6bc Uploaded
grau
parents:
diff changeset
78 push(@vals,$width);
b7d6db3ba6bc Uploaded
grau
parents:
diff changeset
79 push(@lines,\@vals);
b7d6db3ba6bc Uploaded
grau
parents:
diff changeset
80 }
b7d6db3ba6bc Uploaded
grau
parents:
diff changeset
81
b7d6db3ba6bc Uploaded
grau
parents:
diff changeset
82 close(IN);
b7d6db3ba6bc Uploaded
grau
parents:
diff changeset
83 #print "Read input file ".$bed."\n";
b7d6db3ba6bc Uploaded
grau
parents:
diff changeset
84
b7d6db3ba6bc Uploaded
grau
parents:
diff changeset
85
b7d6db3ba6bc Uploaded
grau
parents:
diff changeset
86 if($sort){
b7d6db3ba6bc Uploaded
grau
parents:
diff changeset
87
b7d6db3ba6bc Uploaded
grau
parents:
diff changeset
88 @lines = sort { ${$a}[0] cmp ${$b}[0] } @lines;
b7d6db3ba6bc Uploaded
grau
parents:
diff changeset
89
b7d6db3ba6bc Uploaded
grau
parents:
diff changeset
90 }
b7d6db3ba6bc Uploaded
grau
parents:
diff changeset
91
b7d6db3ba6bc Uploaded
grau
parents:
diff changeset
92 open(OUT,">".$outfile);
b7d6db3ba6bc Uploaded
grau
parents:
diff changeset
93
b7d6db3ba6bc Uploaded
grau
parents:
diff changeset
94 print "Extracting sequences...\n\n";
b7d6db3ba6bc Uploaded
grau
parents:
diff changeset
95
b7d6db3ba6bc Uploaded
grau
parents:
diff changeset
96 my $oldchr = "";
b7d6db3ba6bc Uploaded
grau
parents:
diff changeset
97 my $sequence = "";
b7d6db3ba6bc Uploaded
grau
parents:
diff changeset
98 for my $line (@lines){
b7d6db3ba6bc Uploaded
grau
parents:
diff changeset
99 my @ar = @{$line};
b7d6db3ba6bc Uploaded
grau
parents:
diff changeset
100 my $chr = $ar[0];
b7d6db3ba6bc Uploaded
grau
parents:
diff changeset
101 unless($chr eq $oldchr){
b7d6db3ba6bc Uploaded
grau
parents:
diff changeset
102 $sequence = loadSeq($chr);
b7d6db3ba6bc Uploaded
grau
parents:
diff changeset
103 }
b7d6db3ba6bc Uploaded
grau
parents:
diff changeset
104 $oldchr = $chr;
b7d6db3ba6bc Uploaded
grau
parents:
diff changeset
105 my $w = $ar[3];
b7d6db3ba6bc Uploaded
grau
parents:
diff changeset
106 if($w <= 0){
b7d6db3ba6bc Uploaded
grau
parents:
diff changeset
107 print $w," -> next\n";
b7d6db3ba6bc Uploaded
grau
parents:
diff changeset
108 next;
b7d6db3ba6bc Uploaded
grau
parents:
diff changeset
109 }
b7d6db3ba6bc Uploaded
grau
parents:
diff changeset
110 if($w % 2 == 0){
b7d6db3ba6bc Uploaded
grau
parents:
diff changeset
111 $w = $w/2;
b7d6db3ba6bc Uploaded
grau
parents:
diff changeset
112 }else{
b7d6db3ba6bc Uploaded
grau
parents:
diff changeset
113 $w = ($w-1)/2;
b7d6db3ba6bc Uploaded
grau
parents:
diff changeset
114 }
b7d6db3ba6bc Uploaded
grau
parents:
diff changeset
115
b7d6db3ba6bc Uploaded
grau
parents:
diff changeset
116 my $start = $ar[1]-$w-1;
b7d6db3ba6bc Uploaded
grau
parents:
diff changeset
117
b7d6db3ba6bc Uploaded
grau
parents:
diff changeset
118 my $head = "> chr: ".$chr."; start: ".$start."; peak: ".($ar[1]-$start)."; signal: ".$ar[2]."\n";
b7d6db3ba6bc Uploaded
grau
parents:
diff changeset
119 my $curr = substr($sequence,$start,$ar[3]);
b7d6db3ba6bc Uploaded
grau
parents:
diff changeset
120 if($curr =~ /[^ACGTacgt]/){
b7d6db3ba6bc Uploaded
grau
parents:
diff changeset
121 print "Sequence for\n\t",substr($head,1),"omitted due to ambiguous nucleotides.\n\n";
b7d6db3ba6bc Uploaded
grau
parents:
diff changeset
122 }else{
b7d6db3ba6bc Uploaded
grau
parents:
diff changeset
123 print OUT $head,$curr,"\n";
b7d6db3ba6bc Uploaded
grau
parents:
diff changeset
124 }
b7d6db3ba6bc Uploaded
grau
parents:
diff changeset
125 }
b7d6db3ba6bc Uploaded
grau
parents:
diff changeset
126
b7d6db3ba6bc Uploaded
grau
parents:
diff changeset
127 close(OUT);
b7d6db3ba6bc Uploaded
grau
parents:
diff changeset
128 print "\nDone.\n";