Mercurial > repos > bioitcore > splicetrap
comparison bin/get_bed_fa_j.pl @ 1:adc0f7765d85 draft
planemo upload
| author | bioitcore |
|---|---|
| date | Thu, 07 Sep 2017 15:06:58 -0400 |
| parents | |
| children |
comparison
equal
deleted
inserted
replaced
| 0:d4ca551ca300 | 1:adc0f7765d85 |
|---|---|
| 1 # Adapted from Chenghai Xue's script | |
| 2 | |
| 3 $starttime=time(); | |
| 4 | |
| 5 $input_file_1 = $ARGV[0]; # exon junction file | |
| 6 $input_file_2 = $ARGV[1]; # genome file list | |
| 7 $output_file_1 = $ARGV[2]; # exon junction bed (might be less than input_file_1 | |
| 8 $output_file_2 = $ARGV[3]; # exon junction fa | |
| 9 #$leftLen = $ARGV[4]; | |
| 10 #$rightLen = $ARGV[5]; | |
| 11 | |
| 12 open(IN_1, "$input_file_1") or die "can't open the input file : $!"; | |
| 13 open(IN_2, "$input_file_2") or die "can't open the input file : $!"; | |
| 14 open OUT_1, ">$output_file_1" or die "Can not open output_file : $!"; | |
| 15 open OUT_2, ">$output_file_2" or die "Can not open output_file : $!"; | |
| 16 | |
| 17 @chromList = (<IN_2>); | |
| 18 chomp(@chromList); | |
| 19 $len_chromList = @chromList; | |
| 20 print "BED2FA: in $input_file_2, found $len_chromList chromosomes\n"; | |
| 21 foreach $one (@chromList){ | |
| 22 if($one =~ /\/(chr.[^\/]*?)\.*fa$/i){ | |
| 23 $chr_hash{$1} = $one; | |
| 24 #print $1,"\n"; | |
| 25 } | |
| 26 } | |
| 27 @key_chr_hash = keys(%chr_hash); | |
| 28 $len_key_chr_hash = @key_chr_hash; | |
| 29 @sort_key_chr_hash = sort_chromNo(@key_chr_hash); | |
| 30 $len_sort_key_chr_hash = @sort_key_chr_hash; | |
| 31 #for($i=0; $i<$len_sort_key_chr_hash; $i++){ | |
| 32 # print "$sort_key_chr_hash[$i] $chr_hash{$sort_key_chr_hash[$i]}\n"; | |
| 33 #} | |
| 34 | |
| 35 $num_1=0; | |
| 36 $num_2=0; | |
| 37 $num_count_chrom=0; | |
| 38 my ($chrom, $chromStart, $chromEnd, $name, $score, $strand, $thickStart, $thickEnd, $itemRgb, $blockCount, $blockSizes, $blockStarts); | |
| 39 $current_chrom = ""; | |
| 40 while(<IN_1>){ | |
| 41 $num_1++; | |
| 42 $line = $_; | |
| 43 chomp $line; | |
| 44 #print $line,"\n"; | |
| 45 @cols = split ("\t", $line); | |
| 46 if(scalar(@cols)==12) | |
| 47 { | |
| 48 ($chrom, $chromStart, $chromEnd, $name, $score, $strand, $thickStart, $thickEnd, $itemRgb, $blockCount, $blockSizes, $blockStarts) = @cols; | |
| 49 } | |
| 50 if(scalar(@cols)!=12) | |
| 51 { | |
| 52 ($chrom, $chromStart, $chromEnd, $name, $score, $strand)=@cols; | |
| 53 $thickStart=$chromStart; | |
| 54 # print $thickStart,"\n"; | |
| 55 $thickEnd = $chromEnd; | |
| 56 $blockCount=1; | |
| 57 $blockSizes=$chromEnd-$chromStart; | |
| 58 $blockStarts = 0; | |
| 59 } | |
| 60 $strand="+" if !$strand; | |
| 61 @a_blockSizes = split (/\,/, $blockSizes); | |
| 62 @a_blockStarts = split (/\,/, $blockStarts); | |
| 63 if($chrom ne $current_chrom){ | |
| 64 if($num_1 != 1){ | |
| 65 print "$num_chr_1 $num_chr_2 $len_contigSeqStr\n"; | |
| 66 } | |
| 67 print "BED2FA: $chrom: "; | |
| 68 | |
| 69 $num_chr_1=0; | |
| 70 $num_chr_2=0; | |
| 71 | |
| 72 if(exists $chr_hash{$chrom}){ | |
| 73 $num_count_chrom++; | |
| 74 $current_chrom = $chrom; | |
| 75 #print $current_chrom,"\n"; | |
| 76 #=pod | |
| 77 $chromFastaFile = $chr_hash{$chrom}; | |
| 78 #print $chromFastaFile,"\n"; | |
| 79 open($fin, "<$chromFastaFile") or die "can't open the chrom file : $!"; | |
| 80 local ($/) = undef; | |
| 81 $contigSeqStr = <$fin>; | |
| 82 close ($fin); | |
| 83 #print $contigSeqStr,"mark\t"; | |
| 84 $contigSeqStr =~s/^\>.*?\n//g; | |
| 85 #print $contigSeqStr,"mark2\t"; | |
| 86 | |
| 87 $contigSeqStr =~s/\s|\n//g; | |
| 88 #print $contigSeqStr,"mark3\n"; | |
| 89 | |
| 90 $len_contigSeqStr = length $contigSeqStr; | |
| 91 #=cut | |
| 92 } | |
| 93 else{ | |
| 94 $num_chr_1++; | |
| 95 next; | |
| 96 } | |
| 97 } | |
| 98 $num_chr_1++; | |
| 99 | |
| 100 # modify from here................................ | |
| 101 my @Starts; | |
| 102 my @Ends; | |
| 103 my @JuncSeq; | |
| 104 my $ssStrTag=1; | |
| 105 for($i_wuj=0;$i_wuj<$blockCount;$i_wuj++) | |
| 106 { | |
| 107 $Starts[$i_wuj] = $chromStart + $a_blockStarts[$i_wuj]; | |
| 108 $Ends[$i_wuj] = $Starts[$i_wuj] + $a_blockSizes[$i_wuj]; | |
| 109 $JuncSeq[$i_wuj] = uc substr ($contigSeqStr,$Starts[$i_wuj], $a_blockSizes[$i_wuj]); | |
| 110 if($strand eq "-"){ | |
| 111 $JuncSeq[$i_wuj] = uc string_reverse_complement(lc $JuncSeq[$i_wuj]); | |
| 112 } | |
| 113 } | |
| 114 # for($i_wuj=0;$i_wuj<$blockCount-1;$i_wuj++) | |
| 115 # { | |
| 116 # $ssStr = uc substr ($contigSeqStr, $Ends[$i_wuj], 2) . substr ($contigSeqStr, $Starts[$i_wuj+1] - 2, 2); | |
| 117 # if($strand eq "-"){ | |
| 118 # $ssStr = uc string_reverse_complement(lc $ssStr); | |
| 119 #$ssStr = $rc_ssStr; | |
| 120 # } | |
| 121 # $ssStrTag = 0 if ($ssStr ne "GTAG"); | |
| 122 | |
| 123 # } | |
| 124 # if($ssStrTag ==1){ | |
| 125 if(1){ | |
| 126 $num_2++; | |
| 127 $num_chr_2++; | |
| 128 print OUT_1 "$line\n"; | |
| 129 #print OUT_2 ">$name\|$chrom\|$chromStart\|$chromEnd\|$strand\|$ssStr\|$num_2\n$junctionSeqStrLeft$junctionSeqStrRight\n"; | |
| 130 # print OUT_2 ">$name\|$chrom\|$chromStart\|$chromEnd\|$strand\|GTAG\|$num_2\|$blockCount\n"; | |
| 131 print OUT_2 ">$name\|$chrom\|$chromStart\|$chromEnd\|$strand\|$num_2\|$blockCount\n"; | |
| 132 if($strand eq "+") | |
| 133 { | |
| 134 for($i_wuj=0;$i_wuj<$blockCount;$i_wuj++) | |
| 135 { | |
| 136 print OUT_2 $JuncSeq[$i_wuj]; | |
| 137 } | |
| 138 } | |
| 139 else | |
| 140 { | |
| 141 for($i_wuj=$blockCount-1;$i_wuj>-1;$i_wuj--) | |
| 142 { | |
| 143 print OUT_2 $JuncSeq[$i_wuj]; | |
| 144 } | |
| 145 | |
| 146 } | |
| 147 print OUT_2 "\n"; | |
| 148 } | |
| 149 | |
| 150 } | |
| 151 print "$num_chr_1 $num_chr_2 $len_contigSeqStr\n"; | |
| 152 print "BED2FA: in file1, $num_count_chrom chroms, $num_1 beds, $num_2 saved.\n"; | |
| 153 | |
| 154 close IN_1 or die "can't close the input file : $!"; | |
| 155 close IN_2 or die "can't close the input file : $!"; | |
| 156 close OUT_1 or die "can't close the output file : $!"; | |
| 157 close OUT_2 or die "can't close the output file : $!"; | |
| 158 | |
| 159 ####################################### | |
| 160 $complete_time = time()-$starttime; | |
| 161 print "BED2FA: Run $complete_time seconds...Done!\n"; | |
| 162 | |
| 163 ####################################### | |
| 164 # sub fuctions | |
| 165 sub string_reverse_complement{ | |
| 166 local($string) = @_; | |
| 167 local($len_str, $ret, $i, $char); | |
| 168 | |
| 169 $len_str = length $string; | |
| 170 $ret = ""; | |
| 171 for($i=0; $i<$len_str; $i++){ | |
| 172 $char = substr($string, $i, 1); | |
| 173 if($char eq 'a'){ | |
| 174 $char = 't'; | |
| 175 } | |
| 176 elsif($char eq 't'){ | |
| 177 $char = 'a'; | |
| 178 } | |
| 179 elsif($char eq 'c'){ | |
| 180 $char = 'g'; | |
| 181 } | |
| 182 elsif($char eq 'g'){ | |
| 183 $char = 'c'; | |
| 184 } | |
| 185 else{ | |
| 186 $char = 'n'; | |
| 187 } | |
| 188 $ret = $char.$ret; | |
| 189 } | |
| 190 | |
| 191 return $ret; | |
| 192 } | |
| 193 | |
| 194 sub sort_chromNo{ | |
| 195 local(@chrom) = @_; | |
| 196 local($len_key_chr_hash, $i, @sort_chr_hash); | |
| 197 local(@digit_random, @words_random, @digit_other_1, @digit_other_2, @words_other_1, @words_other_2, @digit, @words); | |
| 198 local(@sort_digit, @sort_words, @sort_digit_random, @sort_words_random, @sort_digit_other, @sort_words_other); | |
| 199 local($len_digit, $len_words, $len_digit_random, $len_words_random, $len_digit_other, $len_words_other, $term); | |
| 200 | |
| 201 $len_key_chr_hash = @chrom; | |
| 202 # sort via chr number for printing result | |
| 203 for($i=0; $i<$len_key_chr_hash; $i++){ | |
| 204 if($key_chr_hash[$i] =~ /chr(\d+)\_random/){ | |
| 205 push(@digit_random, $1); | |
| 206 } | |
| 207 elsif($key_chr_hash[$i] =~ /chr(\w+)\_random/){ | |
| 208 push(@words_random, $1); | |
| 209 } | |
| 210 elsif($key_chr_hash[$i] =~ /chr(\d+)\_([\w\d\_]+)/){ | |
| 211 push(@digit_other_1, $1); | |
| 212 push(@digit_other_2, $2); | |
| 213 } | |
| 214 elsif($key_chr_hash[$i] =~ /chr(\w+)\_([\w\d\_]+)/){ | |
| 215 push(@words_other_1, $1); | |
| 216 push(@words_other_2, $2); | |
| 217 } | |
| 218 elsif($key_chr_hash[$i] =~ /chr(\d+)/){ | |
| 219 push(@digit, $1); | |
| 220 } | |
| 221 elsif($key_chr_hash[$i] =~ /chr(\w+)/){ | |
| 222 push(@words, $1); | |
| 223 } | |
| 224 else{ | |
| 225 print "BED2FA: There is unknown type of chromosomes: $key_chr_hash[$i]\n"; | |
| 226 } | |
| 227 } | |
| 228 @sort_digit = sort by_mostly_numeric @digit; | |
| 229 @sort_words = sort by_mostly_string @words; | |
| 230 @sort_digit_random = sort by_mostly_numeric @digit_random; | |
| 231 @sort_words_random = sort by_mostly_string @words_random; | |
| 232 @sort_digit_other = sort_2_array_number_string(\@digit_other_1, \@digit_other_2); | |
| 233 @sort_words_other = sort_2_array_string_string(\@words_other_1, \@words_other_2); | |
| 234 | |
| 235 $len_digit = @sort_digit; | |
| 236 for($i=0; $i<$len_digit; $i++){ | |
| 237 $term = "chr".$sort_digit[$i]; | |
| 238 push(@sort_chr_hash, $term); | |
| 239 } | |
| 240 $len_words = @sort_words; | |
| 241 for($i=0; $i<$len_words; $i++){ | |
| 242 $term = "chr".$sort_words[$i]; | |
| 243 push(@sort_chr_hash, $term); | |
| 244 } | |
| 245 $len_digit_random = @sort_digit_random; | |
| 246 for($i=0; $i<$len_digit_random; $i++){ | |
| 247 $term = "chr".$sort_digit_random[$i]."_random"; | |
| 248 push(@sort_chr_hash, $term); | |
| 249 } | |
| 250 $len_words_random = @sort_words_random; | |
| 251 for($i=0; $i<$len_words_random; $i++){ | |
| 252 $term = "chr".$sort_words_random[$i]."_random"; | |
| 253 push(@sort_chr_hash, $term); | |
| 254 } | |
| 255 $len_digit_other = @sort_digit_other; | |
| 256 for($i=0; $i<$len_digit_other; $i=$i+2){ | |
| 257 $term = "chr".$sort_digit_other[$i]."_".$sort_digit_other[$i+1]; | |
| 258 push(@sort_chr_hash, $term); | |
| 259 } | |
| 260 $len_words_other = @sort_words_other; | |
| 261 for($i=0; $i<$len_words_other; $i=$i+2){ | |
| 262 $term = "chr".$sort_words_other[$i]."_".$sort_words_other[$i+1]; | |
| 263 push(@sort_chr_hash, $term); | |
| 264 } | |
| 265 | |
| 266 return @sort_chr_hash; | |
| 267 } | |
| 268 | |
| 269 sub sort_2_array_number_string{ | |
| 270 local($a, $b) = @_; | |
| 271 local($len_a, $len_b, $i, %family, $one, $two); | |
| 272 local(@ret); | |
| 273 | |
| 274 $len_a = @$a; | |
| 275 $len_b = @$b; | |
| 276 if($len_a == $len_b){ | |
| 277 for($i=0; $i<$len_a; $i++){ | |
| 278 $family{$$a[$i]}{$$b[$i]} = 0; | |
| 279 } | |
| 280 for $one (sort by_mostly_numeric keys %family) { | |
| 281 for $two (sort by_mostly_string keys %{ $family{$one} }) { | |
| 282 push(@ret, $one); | |
| 283 push(@ret, $two); | |
| 284 } | |
| 285 } | |
| 286 } | |
| 287 else{ | |
| 288 print "ERROR: Sort array is not same size\n"; | |
| 289 print "a $len_a, b $len_b\n"; | |
| 290 } | |
| 291 | |
| 292 return @ret; | |
| 293 } | |
| 294 | |
| 295 sub sort_2_array_string_string{ | |
| 296 local($a, $b) = @_; | |
| 297 local($len_a, $len_b, $i, %family, $one, $two); | |
| 298 local(@ret); | |
| 299 | |
| 300 $len_a = @$a; | |
| 301 $len_b = @$b; | |
| 302 if($len_a == $len_b){ | |
| 303 for($i=0; $i<$len_a; $i++){ | |
| 304 $family{$$a[$i]}{$$b[$i]} = 0; | |
| 305 } | |
| 306 for $one (sort by_mostly_string keys %family) { | |
| 307 for $two (sort by_mostly_string keys %{ $family{$one} }) { | |
| 308 push(@ret, $one); | |
| 309 push(@ret, $two); | |
| 310 } | |
| 311 } | |
| 312 } | |
| 313 else{ | |
| 314 print "ERROR: Sort array is not same size\n"; | |
| 315 print "a $len_a, b $len_b\n"; | |
| 316 } | |
| 317 | |
| 318 return @ret; | |
| 319 } | |
| 320 | |
| 321 sub by_mostly_numeric{ | |
| 322 # ( $a <=> $b ) || ( $a cmp $b ); | |
| 323 ( $a <=> $b ); | |
| 324 } | |
| 325 | |
| 326 sub by_mostly_string{ | |
| 327 # ( $a <=> $b ) || ( $a cmp $b ); | |
| 328 ( $a cmp $b ); | |
| 329 } | |
| 330 |
