comparison convert_bowtie_to_blast.pl @ 23:cad6a1dfb174 draft

Uploaded
author big-tiandm
date Wed, 05 Nov 2014 21:11:49 -0500
parents 07745c0958dd
children
comparison
equal deleted inserted replaced
22:b6686462d0cb 23:cad6a1dfb174
1 #!/usr/bin/perl
2
3
4 use warnings;
5 use strict;
6 use Getopt::Std;
7
8 ######################################### USAGE ################################
9
10 my $usage=
11 "$0 file_bowtie_result file_solexa_seq file_chromosome
12
13 This is a converter which changes Bowtie output into Blast format.
14 The input includes three files: a Bowtie result file (default Bowtie
15 output file), a fasta file consisting of small Reads and a chromosome
16 fasta file. It outputs the alignments in blast_parsed format.
17
18 file_bowtie_result likes:
19
20 AtFlower100010_x2 + MIR319c 508 AAGGAGATTCTTTCAGTCCAG IIIIIIIIIIIIIIIIIIIII 0
21 AtFlower1000188_x1 + MIR2933a 421 TCGGAGAGGAAATTCGTCGGCG IIIIIIIIIIIIIIIIIIIIII 0
22
23 file_solexa_seq likes:
24
25 >AtFlower100010_x2
26 AAGGAGATTCTTTCAGTCCAG
27
28 file_chromosome contains chromosome seq in fasta format
29
30 ";
31
32
33 ####################################### INPUT FILES ############################
34
35 my $file_bowtie_result=shift or die $usage;
36 my $file_short_seq=shift or die $usage;
37 my $file_chromosome_seq=shift or die $usage;
38
39
40 ##################################### GLOBAL VARIBALES #########################
41
42 my %short_seq_length=();
43 my %chromosome_length=();
44
45
46 ######################################### MAIN #################################
47
48 #get the short sequence id and its length
49 sequence_length($file_short_seq,\%short_seq_length);
50
51 #get the chromosome sequence id and its length
52 sequence_length($file_chromosome_seq,\%chromosome_length);
53
54 #convert bowtie result format to blast format;
55 change_format($file_bowtie_result);
56
57 exit;
58
59
60 ##################################### SUBROUTINES ##############################
61
62 sub sequence_length{
63 my ($file,$hash) = @_;
64 my ($id, $desc, $sequence, $seq_length) = ();
65
66 open (FASTA, "<$file") or die "can not open $$file\n";
67 while (<FASTA>)
68 {
69 chomp;
70 if (/^>(\S+)(.*)/)
71 {
72 $id = $1;
73 $desc = $2;
74 $sequence = "";
75 while (<FASTA>){
76 chomp;
77 if (/^>(\S+)(.*)/){
78 $$hash{$id} = length $sequence;
79 $id = $1;
80 $desc = $2;
81 $sequence = "";
82 next;
83 }
84 $sequence .= $_;
85 }
86 }
87 }
88 $seq_length=length($sequence);
89 $$hash{$id} = $seq_length;
90 close FASTA;
91 }
92
93
94
95
96
97 sub change_format{
98 #Change Bowtie format into blast format
99 my $file=shift @_;
100 open(FILE,"<$file")||die"can not open the bowtie result file:$!\n";
101 #open(BLASTOUT,">blastout")||die"can not create the blastout file:$!\n";
102
103 while(<FILE>){
104 chomp;
105 my @tmp=split("\t",$_);
106 #Clean the reads ID
107 my @tmp1=split(" ",$tmp[0]);
108 print "$tmp1[0]"."\t"."$short_seq_length{$tmp1[0]}"."\t"."1".'..'."$short_seq_length{$tmp1[0]}"."\t"."$tmp[2]"."\t"."$chromosome_length{$tmp[2]}"."\t";
109 if($tmp[1] eq "+"){
110 my $seq_end=$tmp[3] + $short_seq_length{$tmp1[0]};
111 my $seq_bg=$tmp[3] + 1;
112 print "$seq_bg".'..'."$seq_end"."\t"."1e-04"."\t"."1.00"."\t"."42.1"."\t"."Plus / Plus"."\n";
113 }
114 if($tmp[1] eq "-"){
115 my $seq_end=$chromosome_length{$tmp[2]} - $tmp[3];
116 my $seq_bg=$seq_end - $short_seq_length{$tmp1[0]} + 1;
117 print "$seq_bg".'..'."$seq_end"."\t"."1e-04"."\t"."1.00"."\t"."42.1"."\t"."Plus / Minus"."\n";
118 }
119 }
120
121 # close BLASTOUT;
122
123 }
124
125
126