Mercurial > repos > grau > dimont_motif_discovery
comparison extract_data_single_galaxy.pl @ 0:b7d6db3ba6bc draft
Uploaded
author | grau |
---|---|
date | Wed, 13 Nov 2013 04:25:23 -0500 |
parents | |
children |
comparison
equal
deleted
inserted
replaced
-1:000000000000 | 0:b7d6db3ba6bc |
---|---|
1 use strict; | |
2 | |
3 if(@ARGV == 0){ | |
4 die <<USAGE | |
5 usage: | |
6 perl extract_data.pl <chromFa> <bedfile> <chromcol> <startcol> <seccolm> <secondcol> <width> <statcol> <outfile> | |
7 | |
8 <chromFa>: the chromosome FastA containing all chromosome sequences | |
9 <bedfile>: the file containing the peaks in tabular format, | |
10 e.g., bed, gff, narrowPeak | |
11 <chromcol>: the column of <bedfile> containing the chromosome | |
12 <startcol>: the column of <bedfile> containing the start position relative to | |
13 the chromosome start | |
14 <seccolm>: center: "Center of peak (relative to start)", end: "End of peak (global coordinates)" | |
15 <secondcol>: the column of <bedfile> containing the peak center position (center) relative to | |
16 <startcol> or the column of <bedfile> containing the end position (end) | |
17 <width>: fixed width of all regions | |
18 <statcol>: the column of <bedfile> containing the peak statistic | |
19 or a similar measure of confidence | |
20 <outfile>: the path to the output file, written as FastA | |
21 USAGE | |
22 } | |
23 | |
24 | |
25 my $chromFa = $ARGV[0]; | |
26 my $bed = $ARGV[1]; | |
27 my $chromcol = $ARGV[2]-1; | |
28 my $startcol = $ARGV[3]-1; | |
29 my $seccolm = $ARGV[4]; | |
30 my $seccol = $ARGV[5]-1; | |
31 my $width = $ARGV[6]; | |
32 my $statcol = $ARGV[7]-1; | |
33 my $outfile = $ARGV[8]; | |
34 | |
35 my $sort = 1; | |
36 | |
37 | |
38 sub loadSeq{ | |
39 my $prefix = shift; | |
40 print $prefix," "; | |
41 open(FA,$chromFa); | |
42 my $head = ""; | |
43 my @lines = (); | |
44 while(<FA>){ | |
45 chomp(); | |
46 if(/^>/){ | |
47 if($head){ | |
48 last; | |
49 } | |
50 if(/^>\s*(${prefix}|chr${prefix})(\s.*$|$)/i){ | |
51 $head = $_; | |
52 } | |
53 }elsif($head){ | |
54 push(@lines,lc($_)); | |
55 } | |
56 } | |
57 my $str = join("",@lines); | |
58 print "loaded\n"; | |
59 return $str; | |
60 } | |
61 | |
62 | |
63 | |
64 open(IN,$ARGV[1]); | |
65 | |
66 my @lines = (); | |
67 | |
68 while(<IN>){ | |
69 chomp(); | |
70 my @parts = split("\t",$_); | |
71 $parts[$chromcol] =~ s/chr0/chr/g; | |
72 my @vals = (); | |
73 if($seccolm eq "center"){ | |
74 @vals = ($parts[$chromcol],$parts[$startcol]+$parts[$seccol],$parts[$statcol]); | |
75 }else{ | |
76 @vals = ($parts[$chromcol],int(($parts[$startcol]+$parts[$seccol])/2),$parts[$statcol]); | |
77 } | |
78 push(@vals,$width); | |
79 push(@lines,\@vals); | |
80 } | |
81 | |
82 close(IN); | |
83 #print "Read input file ".$bed."\n"; | |
84 | |
85 | |
86 if($sort){ | |
87 | |
88 @lines = sort { ${$a}[0] cmp ${$b}[0] } @lines; | |
89 | |
90 } | |
91 | |
92 open(OUT,">".$outfile); | |
93 | |
94 print "Extracting sequences...\n\n"; | |
95 | |
96 my $oldchr = ""; | |
97 my $sequence = ""; | |
98 for my $line (@lines){ | |
99 my @ar = @{$line}; | |
100 my $chr = $ar[0]; | |
101 unless($chr eq $oldchr){ | |
102 $sequence = loadSeq($chr); | |
103 } | |
104 $oldchr = $chr; | |
105 my $w = $ar[3]; | |
106 if($w <= 0){ | |
107 print $w," -> next\n"; | |
108 next; | |
109 } | |
110 if($w % 2 == 0){ | |
111 $w = $w/2; | |
112 }else{ | |
113 $w = ($w-1)/2; | |
114 } | |
115 | |
116 my $start = $ar[1]-$w-1; | |
117 | |
118 my $head = "> chr: ".$chr."; start: ".$start."; peak: ".($ar[1]-$start)."; signal: ".$ar[2]."\n"; | |
119 my $curr = substr($sequence,$start,$ar[3]); | |
120 if($curr =~ /[^ACGTacgt]/){ | |
121 print "Sequence for\n\t",substr($head,1),"omitted due to ambiguous nucleotides.\n\n"; | |
122 }else{ | |
123 print OUT $head,$curr,"\n"; | |
124 } | |
125 } | |
126 | |
127 close(OUT); | |
128 print "\nDone.\n"; |