0
|
1 use strict;
|
|
2
|
|
3 if(@ARGV == 0){
|
|
4 die <<USAGE
|
|
5 usage:
|
|
6 perl extract_data.pl <chromFa> <bedfile> <chromcol> <startcol> <seccolm> <secondcol> <width> <statcol> <outfile>
|
|
7
|
|
8 <chromFa>: the chromosome FastA containing all chromosome sequences
|
|
9 <bedfile>: the file containing the peaks in tabular format,
|
|
10 e.g., bed, gff, narrowPeak
|
|
11 <chromcol>: the column of <bedfile> containing the chromosome
|
|
12 <startcol>: the column of <bedfile> containing the start position relative to
|
|
13 the chromosome start
|
|
14 <seccolm>: center: "Center of peak (relative to start)", end: "End of peak (global coordinates)"
|
|
15 <secondcol>: the column of <bedfile> containing the peak center position (center) relative to
|
|
16 <startcol> or the column of <bedfile> containing the end position (end)
|
|
17 <width>: fixed width of all regions
|
|
18 <statcol>: the column of <bedfile> containing the peak statistic
|
|
19 or a similar measure of confidence
|
|
20 <outfile>: the path to the output file, written as FastA
|
|
21 USAGE
|
|
22 }
|
|
23
|
|
24
|
|
25 my $chromFa = $ARGV[0];
|
|
26 my $bed = $ARGV[1];
|
|
27 my $chromcol = $ARGV[2]-1;
|
|
28 my $startcol = $ARGV[3]-1;
|
|
29 my $seccolm = $ARGV[4];
|
|
30 my $seccol = $ARGV[5]-1;
|
|
31 my $width = $ARGV[6];
|
|
32 my $statcol = $ARGV[7]-1;
|
|
33 my $outfile = $ARGV[8];
|
|
34
|
|
35 my $sort = 1;
|
|
36
|
|
37
|
|
38 sub loadSeq{
|
|
39 my $prefix = shift;
|
|
40 print $prefix," ";
|
|
41 open(FA,$chromFa);
|
|
42 my $head = "";
|
|
43 my @lines = ();
|
|
44 while(<FA>){
|
|
45 chomp();
|
|
46 if(/^>/){
|
|
47 if($head){
|
|
48 last;
|
|
49 }
|
|
50 if(/^>\s*(${prefix}|chr${prefix})(\s.*$|$)/i){
|
|
51 $head = $_;
|
|
52 }
|
|
53 }elsif($head){
|
|
54 push(@lines,lc($_));
|
|
55 }
|
|
56 }
|
|
57 my $str = join("",@lines);
|
|
58 print "loaded\n";
|
|
59 return $str;
|
|
60 }
|
|
61
|
|
62
|
|
63
|
|
64 open(IN,$ARGV[1]);
|
|
65
|
|
66 my @lines = ();
|
|
67
|
|
68 while(<IN>){
|
|
69 chomp();
|
|
70 my @parts = split("\t",$_);
|
|
71 $parts[$chromcol] =~ s/chr0/chr/g;
|
|
72 my @vals = ();
|
|
73 if($seccolm eq "center"){
|
|
74 @vals = ($parts[$chromcol],$parts[$startcol]+$parts[$seccol],$parts[$statcol]);
|
|
75 }else{
|
|
76 @vals = ($parts[$chromcol],int(($parts[$startcol]+$parts[$seccol])/2),$parts[$statcol]);
|
|
77 }
|
|
78 push(@vals,$width);
|
|
79 push(@lines,\@vals);
|
|
80 }
|
|
81
|
|
82 close(IN);
|
|
83 #print "Read input file ".$bed."\n";
|
|
84
|
|
85
|
|
86 if($sort){
|
|
87
|
|
88 @lines = sort { ${$a}[0] cmp ${$b}[0] } @lines;
|
|
89
|
|
90 }
|
|
91
|
|
92 open(OUT,">".$outfile);
|
|
93
|
|
94 print "Extracting sequences...\n\n";
|
|
95
|
|
96 my $oldchr = "";
|
|
97 my $sequence = "";
|
|
98 for my $line (@lines){
|
|
99 my @ar = @{$line};
|
|
100 my $chr = $ar[0];
|
|
101 unless($chr eq $oldchr){
|
|
102 $sequence = loadSeq($chr);
|
|
103 }
|
|
104 $oldchr = $chr;
|
|
105 my $w = $ar[3];
|
|
106 if($w <= 0){
|
|
107 print $w," -> next\n";
|
|
108 next;
|
|
109 }
|
|
110 if($w % 2 == 0){
|
|
111 $w = $w/2;
|
|
112 }else{
|
|
113 $w = ($w-1)/2;
|
|
114 }
|
|
115
|
|
116 my $start = $ar[1]-$w-1;
|
|
117
|
|
118 my $head = "> chr: ".$chr."; start: ".$start."; peak: ".($ar[1]-$start)."; signal: ".$ar[2]."\n";
|
|
119 my $curr = substr($sequence,$start,$ar[3]);
|
|
120 if($curr =~ /[^ACGTacgt]/){
|
|
121 print "Sequence for\n\t",substr($head,1),"omitted due to ambiguous nucleotides.\n\n";
|
|
122 }else{
|
|
123 print OUT $head,$curr,"\n";
|
|
124 }
|
|
125 }
|
|
126
|
|
127 close(OUT);
|
|
128 print "\nDone.\n"; |