annotate convert_bowtie_to_blast.pl @ 4:c97dcf05b5d1 draft

Uploaded
author big-tiandm
date Fri, 25 Jul 2014 05:17:20 -0400
parents faf38239b1a9
children
Ignore whitespace changes - Everywhere: Within whitespace: At end of lines:
rev   line source
3
faf38239b1a9 Uploaded
big-tiandm
parents:
diff changeset
1 #!/usr/bin/perl
faf38239b1a9 Uploaded
big-tiandm
parents:
diff changeset
2
faf38239b1a9 Uploaded
big-tiandm
parents:
diff changeset
3
faf38239b1a9 Uploaded
big-tiandm
parents:
diff changeset
4 use warnings;
faf38239b1a9 Uploaded
big-tiandm
parents:
diff changeset
5 use strict;
faf38239b1a9 Uploaded
big-tiandm
parents:
diff changeset
6 use Getopt::Std;
faf38239b1a9 Uploaded
big-tiandm
parents:
diff changeset
7
faf38239b1a9 Uploaded
big-tiandm
parents:
diff changeset
######################################### USAGE ################################

# Help text; it is the message passed to die when a required argument is
# missing.
my $usage=
"$0 file_bowtie_result file_solexa_seq file_chromosome

This is a converter which changes Bowtie output into Blast format.
The input includes three files: a Bowtie result file (default Bowtie
output file), a fasta file consisting of small Reads and a chromosome
fasta file. It outputs the alignments in blast_parsed format.

file_bowtie_result likes:

AtFlower100010_x2 + MIR319c 508 AAGGAGATTCTTTCAGTCCAG IIIIIIIIIIIIIIIIIIIII 0
AtFlower1000188_x1 + MIR2933a 421 TCGGAGAGGAAATTCGTCGGCG IIIIIIIIIIIIIIIIIIIIII 0

file_solexa_seq likes:

>AtFlower100010_x2
AAGGAGATTCTTTCAGTCCAG

file_chromosome contains chromosome seq in fasta format

";


####################################### INPUT FILES ############################

# Three positional arguments, all required.  Any further arguments are
# ignored.
my $file_bowtie_result=shift @ARGV;
my $file_short_seq=shift @ARGV;
my $file_chromosome_seq=shift @ARGV;

# Test for presence with defined/length rather than plain truthiness, so that
# a file literally named "0" is accepted instead of being treated as missing.
for my $required ($file_bowtie_result, $file_short_seq, $file_chromosome_seq) {
    die $usage unless defined $required and length $required;
}


##################################### GLOBAL VARIABLES #########################

# Sequence id => sequence length, filled by sequence_length() and read by
# change_format().
my %short_seq_length=();
my %chromosome_length=();


######################################### MAIN #################################

#get the short sequence id and its length
sequence_length($file_short_seq,\%short_seq_length);

#get the chromosome sequence id and its length
sequence_length($file_chromosome_seq,\%chromosome_length);

#convert bowtie result format to blast format;
change_format($file_bowtie_result);

exit;
faf38239b1a9 Uploaded
big-tiandm
parents:
diff changeset
58
faf38239b1a9 Uploaded
big-tiandm
parents:
diff changeset
59
faf38239b1a9 Uploaded
big-tiandm
parents:
diff changeset
60 ##################################### SUBROUTINES ##############################
faf38239b1a9 Uploaded
big-tiandm
parents:
diff changeset
61
faf38239b1a9 Uploaded
big-tiandm
parents:
diff changeset
# sequence_length($file, $hash)
#
# Reads the FASTA file $file and stores the length of every record in the
# hash referenced by $hash, keyed by the record id (the first run of
# non-whitespace characters after '>').
#
#   - The sequence lines of a record are summed, so multi-line records work.
#   - Lines that appear before the first header are ignored.
#   - If an id occurs more than once, the last record wins.
#   - A file containing no header adds nothing to the hash.
#
# Dies with the file name and the system error if the file cannot be opened.
# Returns a true value on success.
sub sequence_length{
    my ($file,$hash) = @_;

    # Three-argument open on a lexical handle: the file name is never parsed
    # for a mode, and the handle cannot clash with another bareword handle.
    open(my $fasta, '<', $file) or die "can not open $file: $!\n";

    my $id;                 # id of the record being read; undef before the first header
    my $seq_length = 0;     # residues counted so far for the current record

    while (my $line = <$fasta>) {
        # Drop the line ending, including a DOS-style carriage return, so it
        # is not counted as part of the sequence.
        $line =~ s/[\r\n]+\z//;

        if ($line =~ /^>(\S+)/) {
            # A new header closes the previous record, if there was one.
            $$hash{$id} = $seq_length if defined $id;
            $id = $1;
            $seq_length = 0;
            next;
        }

        next unless defined $id;        # text before the first header

        # Only the length is needed, so count instead of concatenating the
        # whole sequence in memory.
        $seq_length += length $line;
    }

    # The last record has no following header to close it.
    $$hash{$id} = $seq_length if defined $id;

    close $fasta;
    return 1;
}
faf38239b1a9 Uploaded
big-tiandm
parents:
diff changeset
92
faf38239b1a9 Uploaded
big-tiandm
parents:
diff changeset
93
faf38239b1a9 Uploaded
big-tiandm
parents:
diff changeset
94
faf38239b1a9 Uploaded
big-tiandm
parents:
diff changeset
95
faf38239b1a9 Uploaded
big-tiandm
parents:
diff changeset
96
faf38239b1a9 Uploaded
big-tiandm
parents:
diff changeset
# change_format($file)
#
# Change Bowtie format into blast format.
#
# Reads the tab-separated Bowtie result file $file and prints one
# tab-separated line per alignment to the currently selected output handle:
#
#   read id, read length, "1..<read length>", reference id, reference length,
#   "<start>..<end>", 1e-04, 1.00, 42.1, "Plus / Plus" or "Plus / Minus"
#
# The e-value, identity and score columns are fixed placeholder strings.
# Lengths come from the file-level hashes %short_seq_length and
# %chromosome_length, which must be filled before this is called.
#
# Blank lines are skipped silently.  A line with fewer than four fields, a
# strand other than '+' or '-', a non-numeric offset, or an id without a
# known length is skipped with a warning on STDERR.  Nothing is printed for
# such a line, so a bad record can never leave a half-written line that runs
# into the next one.
#
# Dies if the file cannot be opened.
sub change_format{
    my $file=shift @_;

    # Three-argument open on a lexical handle; the message names the file.
    open(my $bowtie, '<', $file)
        or die "can not open the bowtie result file $file: $!\n";

    while (my $line = <$bowtie>) {
        chomp $line;
        next unless $line =~ /\S/;

        # Only the first four columns are used: read name, strand,
        # reference id and offset.
        my ($read_name, $strand, $ref_id, $offset) = split /\t/, $line;

        unless (defined $offset
                and ($strand eq '+' or $strand eq '-')
                and $offset =~ /^\d+$/) {
            warn "skipping malformed bowtie line $.: $line\n";
            next;
        }

        #Clean the reads ID: keep only the first whitespace-delimited word.
        my ($read_id) = split ' ', $read_name;

        my $read_length = defined $read_id ? $short_seq_length{$read_id} : undef;
        my $ref_length  = $chromosome_length{$ref_id};
        unless (defined $read_length and defined $ref_length) {
            warn "skipping bowtie line $.: no sequence length known for "
               . "the read or the reference ($line)\n";
            next;
        }

        my ($seq_bg, $seq_end, $orientation);
        if ($strand eq '+') {
            # The offset is used as a 0-based start, hence the +1.
            $seq_bg      = $offset + 1;
            $seq_end     = $offset + $read_length;
            $orientation = 'Plus / Plus';
        }
        else {
            # Minus strand: both coordinates are derived from
            # (reference length - offset), i.e. measured from the far end of
            # the reference.
            # NOTE(review): confirm this is the convention the downstream
            # consumer of the blast_parsed output expects.
            $seq_end     = $ref_length - $offset;
            $seq_bg      = $seq_end - $read_length + 1;
            $orientation = 'Plus / Minus';
        }

        # Emit the record in one print so it is always complete.
        print join("\t",
            $read_id, $read_length, "1..$read_length",
            $ref_id,  $ref_length,  "$seq_bg..$seq_end",
            '1e-04', '1.00', '42.1', $orientation), "\n";
    }

    close $bowtie;
    return;
}
faf38239b1a9 Uploaded
big-tiandm
parents:
diff changeset
124
faf38239b1a9 Uploaded
big-tiandm
parents:
diff changeset
125
faf38239b1a9 Uploaded
big-tiandm
parents:
diff changeset
126