comparison bin/get_bed_fa_j.pl @ 1:adc0f7765d85 draft

planemo upload
author bioitcore
date Thu, 07 Sep 2017 15:06:58 -0400
parents
children
comparison
equal deleted inserted replaced
0:d4ca551ca300 1:adc0f7765d85
1 # Adapted from Chenghai Xue's script
2
3 $starttime=time();
4
5 $input_file_1 = $ARGV[0]; # exon junction file
6 $input_file_2 = $ARGV[1]; # genome file list
7 $output_file_1 = $ARGV[2]; # exon junction bed (might be less than input_file_1
8 $output_file_2 = $ARGV[3]; # exon junction fa
9 #$leftLen = $ARGV[4];
10 #$rightLen = $ARGV[5];
11
12 open(IN_1, "$input_file_1") or die "can't open the input file : $!";
13 open(IN_2, "$input_file_2") or die "can't open the input file : $!";
14 open OUT_1, ">$output_file_1" or die "Can not open output_file : $!";
15 open OUT_2, ">$output_file_2" or die "Can not open output_file : $!";
16
17 @chromList = (<IN_2>);
18 chomp(@chromList);
19 $len_chromList = @chromList;
20 print "BED2FA: in $input_file_2, found $len_chromList chromosomes\n";
21 foreach $one (@chromList){
22 if($one =~ /\/(chr.[^\/]*?)\.*fa$/i){
23 $chr_hash{$1} = $one;
24 #print $1,"\n";
25 }
26 }
27 @key_chr_hash = keys(%chr_hash);
28 $len_key_chr_hash = @key_chr_hash;
29 @sort_key_chr_hash = sort_chromNo(@key_chr_hash);
30 $len_sort_key_chr_hash = @sort_key_chr_hash;
31 #for($i=0; $i<$len_sort_key_chr_hash; $i++){
32 # print "$sort_key_chr_hash[$i] $chr_hash{$sort_key_chr_hash[$i]}\n";
33 #}
34
35 $num_1=0;
36 $num_2=0;
37 $num_count_chrom=0;
38 my ($chrom, $chromStart, $chromEnd, $name, $score, $strand, $thickStart, $thickEnd, $itemRgb, $blockCount, $blockSizes, $blockStarts);
39 $current_chrom = "";
40 while(<IN_1>){
41 $num_1++;
42 $line = $_;
43 chomp $line;
44 #print $line,"\n";
45 @cols = split ("\t", $line);
46 if(scalar(@cols)==12)
47 {
48 ($chrom, $chromStart, $chromEnd, $name, $score, $strand, $thickStart, $thickEnd, $itemRgb, $blockCount, $blockSizes, $blockStarts) = @cols;
49 }
50 if(scalar(@cols)!=12)
51 {
52 ($chrom, $chromStart, $chromEnd, $name, $score, $strand)=@cols;
53 $thickStart=$chromStart;
54 # print $thickStart,"\n";
55 $thickEnd = $chromEnd;
56 $blockCount=1;
57 $blockSizes=$chromEnd-$chromStart;
58 $blockStarts = 0;
59 }
60 $strand="+" if !$strand;
61 @a_blockSizes = split (/\,/, $blockSizes);
62 @a_blockStarts = split (/\,/, $blockStarts);
63 if($chrom ne $current_chrom){
64 if($num_1 != 1){
65 print "$num_chr_1 $num_chr_2 $len_contigSeqStr\n";
66 }
67 print "BED2FA: $chrom: ";
68
69 $num_chr_1=0;
70 $num_chr_2=0;
71
72 if(exists $chr_hash{$chrom}){
73 $num_count_chrom++;
74 $current_chrom = $chrom;
75 #print $current_chrom,"\n";
76 #=pod
77 $chromFastaFile = $chr_hash{$chrom};
78 #print $chromFastaFile,"\n";
79 open($fin, "<$chromFastaFile") or die "can't open the chrom file : $!";
80 local ($/) = undef;
81 $contigSeqStr = <$fin>;
82 close ($fin);
83 #print $contigSeqStr,"mark\t";
84 $contigSeqStr =~s/^\>.*?\n//g;
85 #print $contigSeqStr,"mark2\t";
86
87 $contigSeqStr =~s/\s|\n//g;
88 #print $contigSeqStr,"mark3\n";
89
90 $len_contigSeqStr = length $contigSeqStr;
91 #=cut
92 }
93 else{
94 $num_chr_1++;
95 next;
96 }
97 }
98 $num_chr_1++;
99
100 # modify from here................................
101 my @Starts;
102 my @Ends;
103 my @JuncSeq;
104 my $ssStrTag=1;
105 for($i_wuj=0;$i_wuj<$blockCount;$i_wuj++)
106 {
107 $Starts[$i_wuj] = $chromStart + $a_blockStarts[$i_wuj];
108 $Ends[$i_wuj] = $Starts[$i_wuj] + $a_blockSizes[$i_wuj];
109 $JuncSeq[$i_wuj] = uc substr ($contigSeqStr,$Starts[$i_wuj], $a_blockSizes[$i_wuj]);
110 if($strand eq "-"){
111 $JuncSeq[$i_wuj] = uc string_reverse_complement(lc $JuncSeq[$i_wuj]);
112 }
113 }
114 # for($i_wuj=0;$i_wuj<$blockCount-1;$i_wuj++)
115 # {
116 # $ssStr = uc substr ($contigSeqStr, $Ends[$i_wuj], 2) . substr ($contigSeqStr, $Starts[$i_wuj+1] - 2, 2);
117 # if($strand eq "-"){
118 # $ssStr = uc string_reverse_complement(lc $ssStr);
119 #$ssStr = $rc_ssStr;
120 # }
121 # $ssStrTag = 0 if ($ssStr ne "GTAG");
122
123 # }
124 # if($ssStrTag ==1){
125 if(1){
126 $num_2++;
127 $num_chr_2++;
128 print OUT_1 "$line\n";
129 #print OUT_2 ">$name\|$chrom\|$chromStart\|$chromEnd\|$strand\|$ssStr\|$num_2\n$junctionSeqStrLeft$junctionSeqStrRight\n";
130 # print OUT_2 ">$name\|$chrom\|$chromStart\|$chromEnd\|$strand\|GTAG\|$num_2\|$blockCount\n";
131 print OUT_2 ">$name\|$chrom\|$chromStart\|$chromEnd\|$strand\|$num_2\|$blockCount\n";
132 if($strand eq "+")
133 {
134 for($i_wuj=0;$i_wuj<$blockCount;$i_wuj++)
135 {
136 print OUT_2 $JuncSeq[$i_wuj];
137 }
138 }
139 else
140 {
141 for($i_wuj=$blockCount-1;$i_wuj>-1;$i_wuj--)
142 {
143 print OUT_2 $JuncSeq[$i_wuj];
144 }
145
146 }
147 print OUT_2 "\n";
148 }
149
150 }
151 print "$num_chr_1 $num_chr_2 $len_contigSeqStr\n";
152 print "BED2FA: in file1, $num_count_chrom chroms, $num_1 beds, $num_2 saved.\n";
153
154 close IN_1 or die "can't close the input file : $!";
155 close IN_2 or die "can't close the input file : $!";
156 close OUT_1 or die "can't close the output file : $!";
157 close OUT_2 or die "can't close the output file : $!";
158
159 #######################################
160 $complete_time = time()-$starttime;
161 print "BED2FA: Run $complete_time seconds...Done!\n";
162
163 #######################################
164 # sub fuctions
165 sub string_reverse_complement{
166 local($string) = @_;
167 local($len_str, $ret, $i, $char);
168
169 $len_str = length $string;
170 $ret = "";
171 for($i=0; $i<$len_str; $i++){
172 $char = substr($string, $i, 1);
173 if($char eq 'a'){
174 $char = 't';
175 }
176 elsif($char eq 't'){
177 $char = 'a';
178 }
179 elsif($char eq 'c'){
180 $char = 'g';
181 }
182 elsif($char eq 'g'){
183 $char = 'c';
184 }
185 else{
186 $char = 'n';
187 }
188 $ret = $char.$ret;
189 }
190
191 return $ret;
192 }
193
194 sub sort_chromNo{
195 local(@chrom) = @_;
196 local($len_key_chr_hash, $i, @sort_chr_hash);
197 local(@digit_random, @words_random, @digit_other_1, @digit_other_2, @words_other_1, @words_other_2, @digit, @words);
198 local(@sort_digit, @sort_words, @sort_digit_random, @sort_words_random, @sort_digit_other, @sort_words_other);
199 local($len_digit, $len_words, $len_digit_random, $len_words_random, $len_digit_other, $len_words_other, $term);
200
201 $len_key_chr_hash = @chrom;
202 # sort via chr number for printing result
203 for($i=0; $i<$len_key_chr_hash; $i++){
204 if($key_chr_hash[$i] =~ /chr(\d+)\_random/){
205 push(@digit_random, $1);
206 }
207 elsif($key_chr_hash[$i] =~ /chr(\w+)\_random/){
208 push(@words_random, $1);
209 }
210 elsif($key_chr_hash[$i] =~ /chr(\d+)\_([\w\d\_]+)/){
211 push(@digit_other_1, $1);
212 push(@digit_other_2, $2);
213 }
214 elsif($key_chr_hash[$i] =~ /chr(\w+)\_([\w\d\_]+)/){
215 push(@words_other_1, $1);
216 push(@words_other_2, $2);
217 }
218 elsif($key_chr_hash[$i] =~ /chr(\d+)/){
219 push(@digit, $1);
220 }
221 elsif($key_chr_hash[$i] =~ /chr(\w+)/){
222 push(@words, $1);
223 }
224 else{
225 print "BED2FA: There is unknown type of chromosomes: $key_chr_hash[$i]\n";
226 }
227 }
228 @sort_digit = sort by_mostly_numeric @digit;
229 @sort_words = sort by_mostly_string @words;
230 @sort_digit_random = sort by_mostly_numeric @digit_random;
231 @sort_words_random = sort by_mostly_string @words_random;
232 @sort_digit_other = sort_2_array_number_string(\@digit_other_1, \@digit_other_2);
233 @sort_words_other = sort_2_array_string_string(\@words_other_1, \@words_other_2);
234
235 $len_digit = @sort_digit;
236 for($i=0; $i<$len_digit; $i++){
237 $term = "chr".$sort_digit[$i];
238 push(@sort_chr_hash, $term);
239 }
240 $len_words = @sort_words;
241 for($i=0; $i<$len_words; $i++){
242 $term = "chr".$sort_words[$i];
243 push(@sort_chr_hash, $term);
244 }
245 $len_digit_random = @sort_digit_random;
246 for($i=0; $i<$len_digit_random; $i++){
247 $term = "chr".$sort_digit_random[$i]."_random";
248 push(@sort_chr_hash, $term);
249 }
250 $len_words_random = @sort_words_random;
251 for($i=0; $i<$len_words_random; $i++){
252 $term = "chr".$sort_words_random[$i]."_random";
253 push(@sort_chr_hash, $term);
254 }
255 $len_digit_other = @sort_digit_other;
256 for($i=0; $i<$len_digit_other; $i=$i+2){
257 $term = "chr".$sort_digit_other[$i]."_".$sort_digit_other[$i+1];
258 push(@sort_chr_hash, $term);
259 }
260 $len_words_other = @sort_words_other;
261 for($i=0; $i<$len_words_other; $i=$i+2){
262 $term = "chr".$sort_words_other[$i]."_".$sort_words_other[$i+1];
263 push(@sort_chr_hash, $term);
264 }
265
266 return @sort_chr_hash;
267 }
268
269 sub sort_2_array_number_string{
270 local($a, $b) = @_;
271 local($len_a, $len_b, $i, %family, $one, $two);
272 local(@ret);
273
274 $len_a = @$a;
275 $len_b = @$b;
276 if($len_a == $len_b){
277 for($i=0; $i<$len_a; $i++){
278 $family{$$a[$i]}{$$b[$i]} = 0;
279 }
280 for $one (sort by_mostly_numeric keys %family) {
281 for $two (sort by_mostly_string keys %{ $family{$one} }) {
282 push(@ret, $one);
283 push(@ret, $two);
284 }
285 }
286 }
287 else{
288 print "ERROR: Sort array is not same size\n";
289 print "a $len_a, b $len_b\n";
290 }
291
292 return @ret;
293 }
294
295 sub sort_2_array_string_string{
296 local($a, $b) = @_;
297 local($len_a, $len_b, $i, %family, $one, $two);
298 local(@ret);
299
300 $len_a = @$a;
301 $len_b = @$b;
302 if($len_a == $len_b){
303 for($i=0; $i<$len_a; $i++){
304 $family{$$a[$i]}{$$b[$i]} = 0;
305 }
306 for $one (sort by_mostly_string keys %family) {
307 for $two (sort by_mostly_string keys %{ $family{$one} }) {
308 push(@ret, $one);
309 push(@ret, $two);
310 }
311 }
312 }
313 else{
314 print "ERROR: Sort array is not same size\n";
315 print "a $len_a, b $len_b\n";
316 }
317
318 return @ret;
319 }
320
321 sub by_mostly_numeric{
322 # ( $a <=> $b ) || ( $a cmp $b );
323 ( $a <=> $b );
324 }
325
326 sub by_mostly_string{
327 # ( $a <=> $b ) || ( $a cmp $b );
328 ( $a cmp $b );
329 }
330