annotate AGEseq_web.pl @ 0:a9c5e846dd76 draft

Perl code
author lxue
date Fri, 15 May 2015 16:37:02 -0400
parents
children 449c8cf8fa3f
Ignore whitespace changes - Everywhere: Within whitespace: At end of lines:
rev   line source
0
a9c5e846dd76 Perl code
lxue
parents:
diff changeset
1 #!/usr/bin/perl
a9c5e846dd76 Perl code
lxue
parents:
diff changeset
2 use strict;
a9c5e846dd76 Perl code
lxue
parents:
diff changeset
3 my $file_design = $ARGV[0];
a9c5e846dd76 Perl code
lxue
parents:
diff changeset
4 my $read_file = $ARGV[1];
a9c5e846dd76 Perl code
lxue
parents:
diff changeset
5 my $mismatch_cutoff = $ARGV[2]; # mismatch rate to filter low quality alignment, default 0.1 (10 %)
a9c5e846dd76 Perl code
lxue
parents:
diff changeset
6 my $min_cutoff = $ARGV[3]; # cutoff to filter reads with low abundance, default 0
a9c5e846dd76 Perl code
lxue
parents:
diff changeset
7 my $wt_like_report = $ARGV[4]; # report top xx WT like records, default 20
a9c5e846dd76 Perl code
lxue
parents:
diff changeset
8 my $indel_report = $ARGV[5]; # report top xx records with indel, default 50
a9c5e846dd76 Perl code
lxue
parents:
diff changeset
9 my $final_out = $ARGV[6];
a9c5e846dd76 Perl code
lxue
parents:
diff changeset
10
a9c5e846dd76 Perl code
lxue
parents:
diff changeset
11 #my $blat = ''; # working directory or PATH
a9c5e846dd76 Perl code
lxue
parents:
diff changeset
12 my $blat = '/usr/local/bin/blat'; # Your prefered full location
a9c5e846dd76 Perl code
lxue
parents:
diff changeset
13
a9c5e846dd76 Perl code
lxue
parents:
diff changeset
14 my $rand = rand 1000000;
a9c5e846dd76 Perl code
lxue
parents:
diff changeset
15 ## setting for reports
a9c5e846dd76 Perl code
lxue
parents:
diff changeset
16 if(not defined ($mismatch_cutoff )){
a9c5e846dd76 Perl code
lxue
parents:
diff changeset
17 $mismatch_cutoff = 0.1 ; # mismatch rate to filter low quality alignment, default 0.1 (10 %)
a9c5e846dd76 Perl code
lxue
parents:
diff changeset
18 }
a9c5e846dd76 Perl code
lxue
parents:
diff changeset
19
a9c5e846dd76 Perl code
lxue
parents:
diff changeset
20 if(not defined ($min_cutoff )){
a9c5e846dd76 Perl code
lxue
parents:
diff changeset
21 $min_cutoff = 0 ; # cutoff to filter reads with low abundance, default 0
a9c5e846dd76 Perl code
lxue
parents:
diff changeset
22 }
a9c5e846dd76 Perl code
lxue
parents:
diff changeset
23
a9c5e846dd76 Perl code
lxue
parents:
diff changeset
24 if(not defined ($wt_like_report )){
a9c5e846dd76 Perl code
lxue
parents:
diff changeset
25 $wt_like_report = 20 ; # report top xx WT like records, default 20
a9c5e846dd76 Perl code
lxue
parents:
diff changeset
26 }
a9c5e846dd76 Perl code
lxue
parents:
diff changeset
27
a9c5e846dd76 Perl code
lxue
parents:
diff changeset
28 if(not defined ($indel_report )){
a9c5e846dd76 Perl code
lxue
parents:
diff changeset
29 $indel_report = 50 ; # report top xx records with indel, default 50
a9c5e846dd76 Perl code
lxue
parents:
diff changeset
30 }
a9c5e846dd76 Perl code
lxue
parents:
diff changeset
31
a9c5e846dd76 Perl code
lxue
parents:
diff changeset
32
a9c5e846dd76 Perl code
lxue
parents:
diff changeset
33 my $remove_files = 1 ; # keep (0) or delete (1) intermediate files, default = 1
a9c5e846dd76 Perl code
lxue
parents:
diff changeset
34
a9c5e846dd76 Perl code
lxue
parents:
diff changeset
35 ###############################################################
a9c5e846dd76 Perl code
lxue
parents:
diff changeset
36 # default setting
a9c5e846dd76 Perl code
lxue
parents:
diff changeset
37
a9c5e846dd76 Perl code
lxue
parents:
diff changeset
38 my $remove = 'rm';
a9c5e846dd76 Perl code
lxue
parents:
diff changeset
39 my $osname = $^O;
a9c5e846dd76 Perl code
lxue
parents:
diff changeset
40
a9c5e846dd76 Perl code
lxue
parents:
diff changeset
41 my $design_fas = '/tmp/fasta'.$rand.'_DESIGN.fa';
a9c5e846dd76 Perl code
lxue
parents:
diff changeset
42 my @psl_file_data = ();
a9c5e846dd76 Perl code
lxue
parents:
diff changeset
43
a9c5e846dd76 Perl code
lxue
parents:
diff changeset
44 if(not defined ($file_design )){
a9c5e846dd76 Perl code
lxue
parents:
diff changeset
45 die "Design file is needed\n";
a9c5e846dd76 Perl code
lxue
parents:
diff changeset
46 }
a9c5e846dd76 Perl code
lxue
parents:
diff changeset
47
a9c5e846dd76 Perl code
lxue
parents:
diff changeset
48 if(not defined ($final_out )){
a9c5e846dd76 Perl code
lxue
parents:
diff changeset
49 $final_out = 'AGE_output.txt';
a9c5e846dd76 Perl code
lxue
parents:
diff changeset
50 }
a9c5e846dd76 Perl code
lxue
parents:
diff changeset
51 ##########################################################
a9c5e846dd76 Perl code
lxue
parents:
diff changeset
52 ### step 1 load design file
a9c5e846dd76 Perl code
lxue
parents:
diff changeset
53
a9c5e846dd76 Perl code
lxue
parents:
diff changeset
54 open DESIGN,"<$file_design" or die "File $file_design not found error here\n";
a9c5e846dd76 Perl code
lxue
parents:
diff changeset
55 open DESIGNFAS,">$design_fas" or die "can not wirte $design_fas not found error here\n";
a9c5e846dd76 Perl code
lxue
parents:
diff changeset
56
a9c5e846dd76 Perl code
lxue
parents:
diff changeset
57 my $num = 0;
a9c5e846dd76 Perl code
lxue
parents:
diff changeset
58 my %design_hash = ();
a9c5e846dd76 Perl code
lxue
parents:
diff changeset
59
a9c5e846dd76 Perl code
lxue
parents:
diff changeset
60
a9c5e846dd76 Perl code
lxue
parents:
diff changeset
61 while (my $line = <DESIGN>) {
a9c5e846dd76 Perl code
lxue
parents:
diff changeset
62 $num ++;
a9c5e846dd76 Perl code
lxue
parents:
diff changeset
63 if($num == 1){
a9c5e846dd76 Perl code
lxue
parents:
diff changeset
64 next;
a9c5e846dd76 Perl code
lxue
parents:
diff changeset
65 }
a9c5e846dd76 Perl code
lxue
parents:
diff changeset
66 if($line !~ m/\w/){
a9c5e846dd76 Perl code
lxue
parents:
diff changeset
67 next;
a9c5e846dd76 Perl code
lxue
parents:
diff changeset
68 }
a9c5e846dd76 Perl code
lxue
parents:
diff changeset
69
a9c5e846dd76 Perl code
lxue
parents:
diff changeset
70 chomp $line;
a9c5e846dd76 Perl code
lxue
parents:
diff changeset
71 my @temp = split(/\t/,$line);
a9c5e846dd76 Perl code
lxue
parents:
diff changeset
72 my $id = $temp[0];
a9c5e846dd76 Perl code
lxue
parents:
diff changeset
73 my $seq = $temp[1];
a9c5e846dd76 Perl code
lxue
parents:
diff changeset
74 $seq =~ s/\s+//g;
a9c5e846dd76 Perl code
lxue
parents:
diff changeset
75 $seq =~ m/([^\r\n]+)/;
a9c5e846dd76 Perl code
lxue
parents:
diff changeset
76 $seq = $1;
a9c5e846dd76 Perl code
lxue
parents:
diff changeset
77 $id =~ s/\s+/\_/g;
a9c5e846dd76 Perl code
lxue
parents:
diff changeset
78
a9c5e846dd76 Perl code
lxue
parents:
diff changeset
79 $design_hash{$id} = $seq;
a9c5e846dd76 Perl code
lxue
parents:
diff changeset
80 print DESIGNFAS ">$id\n$seq\n";
a9c5e846dd76 Perl code
lxue
parents:
diff changeset
81 }
a9c5e846dd76 Perl code
lxue
parents:
diff changeset
82 close(DESIGNFAS);
a9c5e846dd76 Perl code
lxue
parents:
diff changeset
83
a9c5e846dd76 Perl code
lxue
parents:
diff changeset
84
a9c5e846dd76 Perl code
lxue
parents:
diff changeset
85 #######################################################
a9c5e846dd76 Perl code
lxue
parents:
diff changeset
86 ############## step 2 - load read files #############
a9c5e846dd76 Perl code
lxue
parents:
diff changeset
87
a9c5e846dd76 Perl code
lxue
parents:
diff changeset
88 my %total_read_num = ();
a9c5e846dd76 Perl code
lxue
parents:
diff changeset
89 my $num_c = 0;
a9c5e846dd76 Perl code
lxue
parents:
diff changeset
90 my @fasta_files = ();
a9c5e846dd76 Perl code
lxue
parents:
diff changeset
91
a9c5e846dd76 Perl code
lxue
parents:
diff changeset
92 my $file_in = $read_file;
a9c5e846dd76 Perl code
lxue
parents:
diff changeset
93 my $fasta_out = '/tmp/fasta'.$rand.'_Read.fa';
a9c5e846dd76 Perl code
lxue
parents:
diff changeset
94
a9c5e846dd76 Perl code
lxue
parents:
diff changeset
95
a9c5e846dd76 Perl code
lxue
parents:
diff changeset
96 my $type = check_file_type($file_in ) ;
a9c5e846dd76 Perl code
lxue
parents:
diff changeset
97
a9c5e846dd76 Perl code
lxue
parents:
diff changeset
98 if($type eq 'fq'){
a9c5e846dd76 Perl code
lxue
parents:
diff changeset
99 # fastq
a9c5e846dd76 Perl code
lxue
parents:
diff changeset
100 # print "this is fastq file\n";
a9c5e846dd76 Perl code
lxue
parents:
diff changeset
101 my $total_num = &fastq2fasta($file_in,$fasta_out);
a9c5e846dd76 Perl code
lxue
parents:
diff changeset
102 $total_read_num{$fasta_out} = $total_num;
a9c5e846dd76 Perl code
lxue
parents:
diff changeset
103 }
a9c5e846dd76 Perl code
lxue
parents:
diff changeset
104 elsif($type eq 'fa'){
a9c5e846dd76 Perl code
lxue
parents:
diff changeset
105 # fasta
a9c5e846dd76 Perl code
lxue
parents:
diff changeset
106 # print "this is fasta file\n";
a9c5e846dd76 Perl code
lxue
parents:
diff changeset
107 my $total_num = &fasta2fasta($file_in,$fasta_out);
a9c5e846dd76 Perl code
lxue
parents:
diff changeset
108 # load number
a9c5e846dd76 Perl code
lxue
parents:
diff changeset
109 $total_read_num{$fasta_out} = $total_num;
a9c5e846dd76 Perl code
lxue
parents:
diff changeset
110 }
a9c5e846dd76 Perl code
lxue
parents:
diff changeset
111 else{
a9c5e846dd76 Perl code
lxue
parents:
diff changeset
112 die "Please check the format of read file\n";
a9c5e846dd76 Perl code
lxue
parents:
diff changeset
113 }
a9c5e846dd76 Perl code
lxue
parents:
diff changeset
114
a9c5e846dd76 Perl code
lxue
parents:
diff changeset
115
a9c5e846dd76 Perl code
lxue
parents:
diff changeset
116 ########################################################
a9c5e846dd76 Perl code
lxue
parents:
diff changeset
117 ## Here will put step3 to step 5 into one loop
a9c5e846dd76 Perl code
lxue
parents:
diff changeset
118
a9c5e846dd76 Perl code
lxue
parents:
diff changeset
119
a9c5e846dd76 Perl code
lxue
parents:
diff changeset
120 open (REPORT,">$final_out") || die "cannot write\n";
a9c5e846dd76 Perl code
lxue
parents:
diff changeset
121
a9c5e846dd76 Perl code
lxue
parents:
diff changeset
122
a9c5e846dd76 Perl code
lxue
parents:
diff changeset
123 my $file_blat_in = $fasta_out;
a9c5e846dd76 Perl code
lxue
parents:
diff changeset
124
a9c5e846dd76 Perl code
lxue
parents:
diff changeset
125 ############ step 3 - blat ###########################
a9c5e846dd76 Perl code
lxue
parents:
diff changeset
126 my $blat_out = '/tmp/fasta'.$rand.'_blat_crispr.psl';
a9c5e846dd76 Perl code
lxue
parents:
diff changeset
127
a9c5e846dd76 Perl code
lxue
parents:
diff changeset
128 my $command_line = $blat.' '.' -tileSize=7 -oneOff=1 -maxGap=20 -minIdentity=70 -minScore=20 '.$file_blat_in.' '. $design_fas.' '.$blat_out .' >/tmp/blat.txt';
a9c5e846dd76 Perl code
lxue
parents:
diff changeset
129
a9c5e846dd76 Perl code
lxue
parents:
diff changeset
130 system("$command_line");
a9c5e846dd76 Perl code
lxue
parents:
diff changeset
131
a9c5e846dd76 Perl code
lxue
parents:
diff changeset
132 #print "blat job $file_blat_in is done\n";
a9c5e846dd76 Perl code
lxue
parents:
diff changeset
133 ########## step 4 - convert psl -> bed ##############
a9c5e846dd76 Perl code
lxue
parents:
diff changeset
134
a9c5e846dd76 Perl code
lxue
parents:
diff changeset
135 my $bed_hash_address = &psl2bed ($blat_out ); ## address of one hash
a9c5e846dd76 Perl code
lxue
parents:
diff changeset
136
a9c5e846dd76 Perl code
lxue
parents:
diff changeset
137 ########## step 5 - get sequences number from bed ##############
a9c5e846dd76 Perl code
lxue
parents:
diff changeset
138
a9c5e846dd76 Perl code
lxue
parents:
diff changeset
139 my $read_num_address = &MAIN ($file_blat_in,$bed_hash_address);## address of one hash
a9c5e846dd76 Perl code
lxue
parents:
diff changeset
140
a9c5e846dd76 Perl code
lxue
parents:
diff changeset
141
a9c5e846dd76 Perl code
lxue
parents:
diff changeset
142 ## remove files
a9c5e846dd76 Perl code
lxue
parents:
diff changeset
143
a9c5e846dd76 Perl code
lxue
parents:
diff changeset
144 if($remove_files == 1){
a9c5e846dd76 Perl code
lxue
parents:
diff changeset
145 $command_line = $remove.' '.$file_blat_in;
a9c5e846dd76 Perl code
lxue
parents:
diff changeset
146 system("$command_line");
a9c5e846dd76 Perl code
lxue
parents:
diff changeset
147
a9c5e846dd76 Perl code
lxue
parents:
diff changeset
148 $command_line = $remove.' '.$blat_out ;
a9c5e846dd76 Perl code
lxue
parents:
diff changeset
149 system("$command_line");
a9c5e846dd76 Perl code
lxue
parents:
diff changeset
150
a9c5e846dd76 Perl code
lxue
parents:
diff changeset
151 my $command_line2 = $remove.' '.$design_fas ;
a9c5e846dd76 Perl code
lxue
parents:
diff changeset
152 system("$command_line2");
a9c5e846dd76 Perl code
lxue
parents:
diff changeset
153 }
a9c5e846dd76 Perl code
lxue
parents:
diff changeset
154
a9c5e846dd76 Perl code
lxue
parents:
diff changeset
155
a9c5e846dd76 Perl code
lxue
parents:
diff changeset
156 #########################################################################################
a9c5e846dd76 Perl code
lxue
parents:
diff changeset
157 ## functions
a9c5e846dd76 Perl code
lxue
parents:
diff changeset
158
a9c5e846dd76 Perl code
lxue
parents:
diff changeset
159
a9c5e846dd76 Perl code
lxue
parents:
diff changeset
160 sub MAIN{
a9c5e846dd76 Perl code
lxue
parents:
diff changeset
161 my $fas_file_in = shift @_;
a9c5e846dd76 Perl code
lxue
parents:
diff changeset
162 my $bed_file_address_in = shift @_;
a9c5e846dd76 Perl code
lxue
parents:
diff changeset
163
a9c5e846dd76 Perl code
lxue
parents:
diff changeset
164 my %bed_hash_in = %{$bed_file_address_in};
a9c5e846dd76 Perl code
lxue
parents:
diff changeset
165
a9c5e846dd76 Perl code
lxue
parents:
diff changeset
166
a9c5e846dd76 Perl code
lxue
parents:
diff changeset
167 #########################################################
a9c5e846dd76 Perl code
lxue
parents:
diff changeset
168 # read convereted reads fasta file
a9c5e846dd76 Perl code
lxue
parents:
diff changeset
169
a9c5e846dd76 Perl code
lxue
parents:
diff changeset
170 open (FAS,"$fas_file_in ")||die "can not open fasta file: $fas_file_in ";
a9c5e846dd76 Perl code
lxue
parents:
diff changeset
171
a9c5e846dd76 Perl code
lxue
parents:
diff changeset
172 my %query_with_seq2num = ();
a9c5e846dd76 Perl code
lxue
parents:
diff changeset
173 my %seq2alignment = ();
a9c5e846dd76 Perl code
lxue
parents:
diff changeset
174 my %seq_part2assignment = ();
a9c5e846dd76 Perl code
lxue
parents:
diff changeset
175
a9c5e846dd76 Perl code
lxue
parents:
diff changeset
176 while (my $lineIn =<FAS>){
a9c5e846dd76 Perl code
lxue
parents:
diff changeset
177 chomp $lineIn;
a9c5e846dd76 Perl code
lxue
parents:
diff changeset
178 if($lineIn =~ m/^\>/){
a9c5e846dd76 Perl code
lxue
parents:
diff changeset
179 my $id = substr $lineIn,1;
a9c5e846dd76 Perl code
lxue
parents:
diff changeset
180
a9c5e846dd76 Perl code
lxue
parents:
diff changeset
181 my @numbers = split(/\_/, $id);
a9c5e846dd76 Perl code
lxue
parents:
diff changeset
182 my $total_num = $numbers[-1];
a9c5e846dd76 Perl code
lxue
parents:
diff changeset
183
a9c5e846dd76 Perl code
lxue
parents:
diff changeset
184 $lineIn = <FAS>;
a9c5e846dd76 Perl code
lxue
parents:
diff changeset
185 chomp $lineIn;
a9c5e846dd76 Perl code
lxue
parents:
diff changeset
186 my $seq = $lineIn;
a9c5e846dd76 Perl code
lxue
parents:
diff changeset
187
a9c5e846dd76 Perl code
lxue
parents:
diff changeset
188 if(exists $bed_hash_in{$id}){
a9c5e846dd76 Perl code
lxue
parents:
diff changeset
189 # $psl_hash{$read_target}{$query} # format of hash
a9c5e846dd76 Perl code
lxue
parents:
diff changeset
190 ## asign read to target reference
a9c5e846dd76 Perl code
lxue
parents:
diff changeset
191
a9c5e846dd76 Perl code
lxue
parents:
diff changeset
192 ##########################################################
a9c5e846dd76 Perl code
lxue
parents:
diff changeset
193 my @read_asignment = ();
a9c5e846dd76 Perl code
lxue
parents:
diff changeset
194 ## loop for best
a9c5e846dd76 Perl code
lxue
parents:
diff changeset
195 for my $read_key (keys %{$bed_hash_in{$id}}){
a9c5e846dd76 Perl code
lxue
parents:
diff changeset
196 ### get annotation from array
a9c5e846dd76 Perl code
lxue
parents:
diff changeset
197 ## Q: design T: read
a9c5e846dd76 Perl code
lxue
parents:
diff changeset
198 my @get_array = @{$bed_hash_in{$id}{$read_key}};
a9c5e846dd76 Perl code
lxue
parents:
diff changeset
199 my ($strand, $q_name,$q_size,$q_start,$q_end,
a9c5e846dd76 Perl code
lxue
parents:
diff changeset
200 $t_name,$t_size,$t_start,$t_end,
a9c5e846dd76 Perl code
lxue
parents:
diff changeset
201 $block_num,$block_size,$q_start_s,$t_start_s) = @get_array ;
a9c5e846dd76 Perl code
lxue
parents:
diff changeset
202
a9c5e846dd76 Perl code
lxue
parents:
diff changeset
203 my $seq_out = substr $seq,$t_start,($t_end-$t_start);
a9c5e846dd76 Perl code
lxue
parents:
diff changeset
204
a9c5e846dd76 Perl code
lxue
parents:
diff changeset
205 if($strand eq '-'){
a9c5e846dd76 Perl code
lxue
parents:
diff changeset
206 $seq_out = &rev_com ($seq_out);
a9c5e846dd76 Perl code
lxue
parents:
diff changeset
207 }
a9c5e846dd76 Perl code
lxue
parents:
diff changeset
208 ## get orignal
a9c5e846dd76 Perl code
lxue
parents:
diff changeset
209 my $ref_ori_seq = '';
a9c5e846dd76 Perl code
lxue
parents:
diff changeset
210 if(exists $design_hash{$q_name}){
a9c5e846dd76 Perl code
lxue
parents:
diff changeset
211 $ref_ori_seq = $design_hash{$q_name};
a9c5e846dd76 Perl code
lxue
parents:
diff changeset
212 }
a9c5e846dd76 Perl code
lxue
parents:
diff changeset
213
a9c5e846dd76 Perl code
lxue
parents:
diff changeset
214
a9c5e846dd76 Perl code
lxue
parents:
diff changeset
215 my $edit_note = &get_editing_note(\@get_array,$ref_ori_seq, $seq_out,$seq);
a9c5e846dd76 Perl code
lxue
parents:
diff changeset
216 # target # part of read
a9c5e846dd76 Perl code
lxue
parents:
diff changeset
217 my ($align_ref,$align_read,$note) = split(/\t/, $edit_note);
a9c5e846dd76 Perl code
lxue
parents:
diff changeset
218
a9c5e846dd76 Perl code
lxue
parents:
diff changeset
219 my $len_ori = length($ref_ori_seq);
a9c5e846dd76 Perl code
lxue
parents:
diff changeset
220 my @align_1 = split(//, $align_ref);
a9c5e846dd76 Perl code
lxue
parents:
diff changeset
221 my @align_2 = split(//, $align_read);
a9c5e846dd76 Perl code
lxue
parents:
diff changeset
222 my $site_snp = 0;
a9c5e846dd76 Perl code
lxue
parents:
diff changeset
223 my @site_snp = ();
a9c5e846dd76 Perl code
lxue
parents:
diff changeset
224 my $iden_num = 0;
a9c5e846dd76 Perl code
lxue
parents:
diff changeset
225
a9c5e846dd76 Perl code
lxue
parents:
diff changeset
226 for my $a1(@align_1){
a9c5e846dd76 Perl code
lxue
parents:
diff changeset
227 my $a2 = shift @align_2;
a9c5e846dd76 Perl code
lxue
parents:
diff changeset
228 if($a1 ne '-' and $a2 ne '-'){
a9c5e846dd76 Perl code
lxue
parents:
diff changeset
229 if($a1 ne $a2){
a9c5e846dd76 Perl code
lxue
parents:
diff changeset
230 push @site_snp, $site_snp;
a9c5e846dd76 Perl code
lxue
parents:
diff changeset
231 }
a9c5e846dd76 Perl code
lxue
parents:
diff changeset
232 }
a9c5e846dd76 Perl code
lxue
parents:
diff changeset
233
a9c5e846dd76 Perl code
lxue
parents:
diff changeset
234 if($a1 eq $a2){
a9c5e846dd76 Perl code
lxue
parents:
diff changeset
235 $iden_num++;
a9c5e846dd76 Perl code
lxue
parents:
diff changeset
236 }
a9c5e846dd76 Perl code
lxue
parents:
diff changeset
237
a9c5e846dd76 Perl code
lxue
parents:
diff changeset
238 $site_snp++;
a9c5e846dd76 Perl code
lxue
parents:
diff changeset
239 }
a9c5e846dd76 Perl code
lxue
parents:
diff changeset
240 $iden_num = $iden_num-2;
a9c5e846dd76 Perl code
lxue
parents:
diff changeset
241
a9c5e846dd76 Perl code
lxue
parents:
diff changeset
242 ## filter on SNP number
a9c5e846dd76 Perl code
lxue
parents:
diff changeset
243 my $snp_num_for_filter = @site_snp+0;
a9c5e846dd76 Perl code
lxue
parents:
diff changeset
244
a9c5e846dd76 Perl code
lxue
parents:
diff changeset
245 my $dis_max = 0;
a9c5e846dd76 Perl code
lxue
parents:
diff changeset
246 my $dis_current = 0;
a9c5e846dd76 Perl code
lxue
parents:
diff changeset
247 my @continuous_site_temp = ();
a9c5e846dd76 Perl code
lxue
parents:
diff changeset
248 my @continuous_site = ();
a9c5e846dd76 Perl code
lxue
parents:
diff changeset
249 my @site_snp_checking = @site_snp;
a9c5e846dd76 Perl code
lxue
parents:
diff changeset
250 my $start = shift @site_snp_checking ;
a9c5e846dd76 Perl code
lxue
parents:
diff changeset
251 for my $site_checking (@site_snp_checking){
a9c5e846dd76 Perl code
lxue
parents:
diff changeset
252 my $dis_in = $site_checking -$start;
a9c5e846dd76 Perl code
lxue
parents:
diff changeset
253 if($dis_in <4){
a9c5e846dd76 Perl code
lxue
parents:
diff changeset
254 $dis_current++;
a9c5e846dd76 Perl code
lxue
parents:
diff changeset
255 push @continuous_site_temp,$site_checking ;
a9c5e846dd76 Perl code
lxue
parents:
diff changeset
256 if($dis_current > $dis_max){
a9c5e846dd76 Perl code
lxue
parents:
diff changeset
257 $dis_max = $dis_current;
a9c5e846dd76 Perl code
lxue
parents:
diff changeset
258 @continuous_site = @continuous_site_temp;
a9c5e846dd76 Perl code
lxue
parents:
diff changeset
259 }
a9c5e846dd76 Perl code
lxue
parents:
diff changeset
260 }
a9c5e846dd76 Perl code
lxue
parents:
diff changeset
261 else{
a9c5e846dd76 Perl code
lxue
parents:
diff changeset
262 $dis_current = 0;
a9c5e846dd76 Perl code
lxue
parents:
diff changeset
263 @continuous_site_temp = ();
a9c5e846dd76 Perl code
lxue
parents:
diff changeset
264
a9c5e846dd76 Perl code
lxue
parents:
diff changeset
265 }
a9c5e846dd76 Perl code
lxue
parents:
diff changeset
266
a9c5e846dd76 Perl code
lxue
parents:
diff changeset
267 $start = $site_checking;
a9c5e846dd76 Perl code
lxue
parents:
diff changeset
268 }
a9c5e846dd76 Perl code
lxue
parents:
diff changeset
269
a9c5e846dd76 Perl code
lxue
parents:
diff changeset
270
a9c5e846dd76 Perl code
lxue
parents:
diff changeset
271 if( ($snp_num_for_filter/$len_ori) > 0.5){ # first filtering
a9c5e846dd76 Perl code
lxue
parents:
diff changeset
272 if($dis_max <5){
a9c5e846dd76 Perl code
lxue
parents:
diff changeset
273 next;
a9c5e846dd76 Perl code
lxue
parents:
diff changeset
274 }
a9c5e846dd76 Perl code
lxue
parents:
diff changeset
275 }
a9c5e846dd76 Perl code
lxue
parents:
diff changeset
276
a9c5e846dd76 Perl code
lxue
parents:
diff changeset
277
a9c5e846dd76 Perl code
lxue
parents:
diff changeset
278 if($dis_max>=5){
a9c5e846dd76 Perl code
lxue
parents:
diff changeset
279 @site_snp = ();
a9c5e846dd76 Perl code
lxue
parents:
diff changeset
280 my @align_x = split(//, $align_read);
a9c5e846dd76 Perl code
lxue
parents:
diff changeset
281 for my $iii(@continuous_site ){
a9c5e846dd76 Perl code
lxue
parents:
diff changeset
282 $align_x[$iii] = 'N';
a9c5e846dd76 Perl code
lxue
parents:
diff changeset
283 }
a9c5e846dd76 Perl code
lxue
parents:
diff changeset
284 $align_read = join('',@align_x);
a9c5e846dd76 Perl code
lxue
parents:
diff changeset
285 $note = 'strange editing';
a9c5e846dd76 Perl code
lxue
parents:
diff changeset
286 my @ttt = ($align_ref,$align_read,$note) ;
a9c5e846dd76 Perl code
lxue
parents:
diff changeset
287
a9c5e846dd76 Perl code
lxue
parents:
diff changeset
288 $edit_note = join("\t", @ttt );
a9c5e846dd76 Perl code
lxue
parents:
diff changeset
289 }
a9c5e846dd76 Perl code
lxue
parents:
diff changeset
290
a9c5e846dd76 Perl code
lxue
parents:
diff changeset
291
a9c5e846dd76 Perl code
lxue
parents:
diff changeset
292
a9c5e846dd76 Perl code
lxue
parents:
diff changeset
293
a9c5e846dd76 Perl code
lxue
parents:
diff changeset
294 my @ooo = ($iden_num,$ref_ori_seq,$seq_out,$edit_note,\@site_snp,\@get_array);
a9c5e846dd76 Perl code
lxue
parents:
diff changeset
295 push @read_asignment,[@ooo];
a9c5e846dd76 Perl code
lxue
parents:
diff changeset
296 }## for loop for best hit of each fasta reads
a9c5e846dd76 Perl code
lxue
parents:
diff changeset
297
a9c5e846dd76 Perl code
lxue
parents:
diff changeset
298 ## get the best hits
a9c5e846dd76 Perl code
lxue
parents:
diff changeset
299
a9c5e846dd76 Perl code
lxue
parents:
diff changeset
300 if(@read_asignment<1){
a9c5e846dd76 Perl code
lxue
parents:
diff changeset
301 next;
a9c5e846dd76 Perl code
lxue
parents:
diff changeset
302 }
a9c5e846dd76 Perl code
lxue
parents:
diff changeset
303
a9c5e846dd76 Perl code
lxue
parents:
diff changeset
304 ## assign the first hit
a9c5e846dd76 Perl code
lxue
parents:
diff changeset
305 @read_asignment = sort {$b->[0] <=> $a->[0]}@read_asignment ;
a9c5e846dd76 Perl code
lxue
parents:
diff changeset
306 my ($iden_num,$ref_ori_seq,$seq_out,$edit_note,$array_snp_addr,$bed_array_addr) = @{$read_asignment[0]};
a9c5e846dd76 Perl code
lxue
parents:
diff changeset
307
a9c5e846dd76 Perl code
lxue
parents:
diff changeset
308 my @snp_filtering = @{$array_snp_addr};
a9c5e846dd76 Perl code
lxue
parents:
diff changeset
309
a9c5e846dd76 Perl code
lxue
parents:
diff changeset
310 if((@snp_filtering+0)/length($ref_ori_seq) >$mismatch_cutoff){
a9c5e846dd76 Perl code
lxue
parents:
diff changeset
311 next;
a9c5e846dd76 Perl code
lxue
parents:
diff changeset
312 }
a9c5e846dd76 Perl code
lxue
parents:
diff changeset
313
a9c5e846dd76 Perl code
lxue
parents:
diff changeset
314
a9c5e846dd76 Perl code
lxue
parents:
diff changeset
315 ## after asignment
a9c5e846dd76 Perl code
lxue
parents:
diff changeset
316
a9c5e846dd76 Perl code
lxue
parents:
diff changeset
317 ## check whether seq_out is assigned earlier
a9c5e846dd76 Perl code
lxue
parents:
diff changeset
318 ## $seq_out has been asigned to one record
a9c5e846dd76 Perl code
lxue
parents:
diff changeset
319 ## just assign reads to that one althogh it may be get one new assignment
a9c5e846dd76 Perl code
lxue
parents:
diff changeset
320 if( exists $seq_part2assignment{$seq_out}){
a9c5e846dd76 Perl code
lxue
parents:
diff changeset
321 my $first_assignment = $seq_part2assignment{$seq_out};
a9c5e846dd76 Perl code
lxue
parents:
diff changeset
322 $query_with_seq2num{$first_assignment}{$seq_out} = $query_with_seq2num{$first_assignment}{$seq_out}+$total_num;
a9c5e846dd76 Perl code
lxue
parents:
diff changeset
323 next;
a9c5e846dd76 Perl code
lxue
parents:
diff changeset
324 }
a9c5e846dd76 Perl code
lxue
parents:
diff changeset
325
a9c5e846dd76 Perl code
lxue
parents:
diff changeset
326
a9c5e846dd76 Perl code
lxue
parents:
diff changeset
327 # number for part of reads
a9c5e846dd76 Perl code
lxue
parents:
diff changeset
328 my ($strand, $q_name,$q_size,$q_start,$q_end,
a9c5e846dd76 Perl code
lxue
parents:
diff changeset
329 $t_name,$t_size,$t_start,$t_end,
a9c5e846dd76 Perl code
lxue
parents:
diff changeset
330 $block_num,$block_size,$q_start_s,$t_start_s) = @{$bed_array_addr};
a9c5e846dd76 Perl code
lxue
parents:
diff changeset
331
a9c5e846dd76 Perl code
lxue
parents:
diff changeset
332 # this the first asignment
a9c5e846dd76 Perl code
lxue
parents:
diff changeset
333 if(not exists $seq2alignment{$seq_out}){
a9c5e846dd76 Perl code
lxue
parents:
diff changeset
334 my @out = ($ref_ori_seq,$edit_note,$array_snp_addr);
a9c5e846dd76 Perl code
lxue
parents:
diff changeset
335 $seq2alignment{$seq_out} = \@out;
a9c5e846dd76 Perl code
lxue
parents:
diff changeset
336
a9c5e846dd76 Perl code
lxue
parents:
diff changeset
337 }
a9c5e846dd76 Perl code
lxue
parents:
diff changeset
338 # assigned
a9c5e846dd76 Perl code
lxue
parents:
diff changeset
339 if(not exists $seq_part2assignment{$seq_out}) {
a9c5e846dd76 Perl code
lxue
parents:
diff changeset
340 $seq_part2assignment{$seq_out} = $q_name;
a9c5e846dd76 Perl code
lxue
parents:
diff changeset
341 $query_with_seq2num{$q_name}{$seq_out} = $total_num;
a9c5e846dd76 Perl code
lxue
parents:
diff changeset
342 }
a9c5e846dd76 Perl code
lxue
parents:
diff changeset
343
a9c5e846dd76 Perl code
lxue
parents:
diff changeset
344
a9c5e846dd76 Perl code
lxue
parents:
diff changeset
345
a9c5e846dd76 Perl code
lxue
parents:
diff changeset
346 }# exists bed file
a9c5e846dd76 Perl code
lxue
parents:
diff changeset
347 }# lineIN
a9c5e846dd76 Perl code
lxue
parents:
diff changeset
348 }# while file
a9c5e846dd76 Perl code
lxue
parents:
diff changeset
349 close(FAS);
a9c5e846dd76 Perl code
lxue
parents:
diff changeset
350
a9c5e846dd76 Perl code
lxue
parents:
diff changeset
351 ### summary data for output
a9c5e846dd76 Perl code
lxue
parents:
diff changeset
352 my ($hash_addr1, $hash_addr2) = &get_out_buffer($fas_file_in,\%query_with_seq2num, \%seq2alignment);
a9c5e846dd76 Perl code
lxue
parents:
diff changeset
353
a9c5e846dd76 Perl code
lxue
parents:
diff changeset
354 ### write output
a9c5e846dd76 Perl code
lxue
parents:
diff changeset
355
a9c5e846dd76 Perl code
lxue
parents:
diff changeset
356 &write_buffer_out($hash_addr1, $hash_addr2);
a9c5e846dd76 Perl code
lxue
parents:
diff changeset
357
a9c5e846dd76 Perl code
lxue
parents:
diff changeset
358 } # Main function
a9c5e846dd76 Perl code
lxue
parents:
diff changeset
359
a9c5e846dd76 Perl code
lxue
parents:
diff changeset
360
a9c5e846dd76 Perl code
lxue
parents:
diff changeset
361
a9c5e846dd76 Perl code
lxue
parents:
diff changeset
362
a9c5e846dd76 Perl code
lxue
parents:
diff changeset
363 sub get_out_buffer{
a9c5e846dd76 Perl code
lxue
parents:
diff changeset
364 my $fas_file_in = $_[0];
a9c5e846dd76 Perl code
lxue
parents:
diff changeset
365 my %query_with_seq2num_in = %{$_[1]};
a9c5e846dd76 Perl code
lxue
parents:
diff changeset
366 my %seq2alignment_in = %{$_[2]};
a9c5e846dd76 Perl code
lxue
parents:
diff changeset
367
a9c5e846dd76 Perl code
lxue
parents:
diff changeset
368 my %buffer_out = ();
a9c5e846dd76 Perl code
lxue
parents:
diff changeset
369 my %buffer_num = ();
a9c5e846dd76 Perl code
lxue
parents:
diff changeset
370
a9c5e846dd76 Perl code
lxue
parents:
diff changeset
371 my $total_non_redun = 0;
a9c5e846dd76 Perl code
lxue
parents:
diff changeset
372 ## push output in buffer before printing
a9c5e846dd76 Perl code
lxue
parents:
diff changeset
373 my $test_case_num = $total_read_num{$fas_file_in};
a9c5e846dd76 Perl code
lxue
parents:
diff changeset
374
a9c5e846dd76 Perl code
lxue
parents:
diff changeset
375 for my $ref_name (sort keys %query_with_seq2num_in){
a9c5e846dd76 Perl code
lxue
parents:
diff changeset
376 my %in_hash = %{$query_with_seq2num_in{$ref_name}};
a9c5e846dd76 Perl code
lxue
parents:
diff changeset
377
a9c5e846dd76 Perl code
lxue
parents:
diff changeset
378 ## each group
a9c5e846dd76 Perl code
lxue
parents:
diff changeset
379 my $report_wt_count = 0;
a9c5e846dd76 Perl code
lxue
parents:
diff changeset
380 my $report_indel_count = 0;
a9c5e846dd76 Perl code
lxue
parents:
diff changeset
381 my $other_num = 0;
a9c5e846dd76 Perl code
lxue
parents:
diff changeset
382 my $sub_hit_num = 0;
a9c5e846dd76 Perl code
lxue
parents:
diff changeset
383 my $indel_hit_num = 0;
a9c5e846dd76 Perl code
lxue
parents:
diff changeset
384
a9c5e846dd76 Perl code
lxue
parents:
diff changeset
385
a9c5e846dd76 Perl code
lxue
parents:
diff changeset
386 my @other_record = ();
a9c5e846dd76 Perl code
lxue
parents:
diff changeset
387 ### loop for records
a9c5e846dd76 Perl code
lxue
parents:
diff changeset
388 for my $part_of_read (sort {$in_hash{$b} <=> $in_hash{$a}} keys %in_hash){
a9c5e846dd76 Perl code
lxue
parents:
diff changeset
389
a9c5e846dd76 Perl code
lxue
parents:
diff changeset
390 my $hitnum = $in_hash{$part_of_read};
a9c5e846dd76 Perl code
lxue
parents:
diff changeset
391
a9c5e846dd76 Perl code
lxue
parents:
diff changeset
392 if($hitnum < $min_cutoff) {
a9c5e846dd76 Perl code
lxue
parents:
diff changeset
393 next;
a9c5e846dd76 Perl code
lxue
parents:
diff changeset
394 }
a9c5e846dd76 Perl code
lxue
parents:
diff changeset
395
a9c5e846dd76 Perl code
lxue
parents:
diff changeset
396 if($hitnum/$test_case_num < 0.00005) {
a9c5e846dd76 Perl code
lxue
parents:
diff changeset
397 next;
a9c5e846dd76 Perl code
lxue
parents:
diff changeset
398 }
a9c5e846dd76 Perl code
lxue
parents:
diff changeset
399
a9c5e846dd76 Perl code
lxue
parents:
diff changeset
400 ## get Editing Note for output only
a9c5e846dd76 Perl code
lxue
parents:
diff changeset
401
a9c5e846dd76 Perl code
lxue
parents:
diff changeset
402 if(exists $seq2alignment_in{$part_of_read}){
a9c5e846dd76 Perl code
lxue
parents:
diff changeset
403 my ($ref_ori_seq,$edit_note,$array_snp_addr)= @{$seq2alignment_in{$part_of_read}};
a9c5e846dd76 Perl code
lxue
parents:
diff changeset
404 my ($align_ref,$align_read,$note) = split(/\t/,$edit_note);
a9c5e846dd76 Perl code
lxue
parents:
diff changeset
405 my @snp_out = @{$array_snp_addr};
a9c5e846dd76 Perl code
lxue
parents:
diff changeset
406 my $snp_num = @snp_out +0;
a9c5e846dd76 Perl code
lxue
parents:
diff changeset
407 # deep sequencing SNP
a9c5e846dd76 Perl code
lxue
parents:
diff changeset
408 if($test_case_num >500 and $hitnum <=2 and $snp_num/length($ref_ori_seq)>0.05){
a9c5e846dd76 Perl code
lxue
parents:
diff changeset
409 next;
a9c5e846dd76 Perl code
lxue
parents:
diff changeset
410 }
a9c5e846dd76 Perl code
lxue
parents:
diff changeset
411
a9c5e846dd76 Perl code
lxue
parents:
diff changeset
412 my $snp_note = '';
a9c5e846dd76 Perl code
lxue
parents:
diff changeset
413 if($snp_num>0 ){
a9c5e846dd76 Perl code
lxue
parents:
diff changeset
414 $snp_note = $snp_num.' SNP('.join(',',@snp_out).')';
a9c5e846dd76 Perl code
lxue
parents:
diff changeset
415 }
a9c5e846dd76 Perl code
lxue
parents:
diff changeset
416
a9c5e846dd76 Perl code
lxue
parents:
diff changeset
417 $sub_hit_num = $sub_hit_num+$hitnum;
a9c5e846dd76 Perl code
lxue
parents:
diff changeset
418 $total_non_redun = $total_non_redun + $hitnum;
a9c5e846dd76 Perl code
lxue
parents:
diff changeset
419
a9c5e846dd76 Perl code
lxue
parents:
diff changeset
420
a9c5e846dd76 Perl code
lxue
parents:
diff changeset
421 # output here
a9c5e846dd76 Perl code
lxue
parents:
diff changeset
422 if($ref_ori_seq ne $design_hash{$ref_name} ){
a9c5e846dd76 Perl code
lxue
parents:
diff changeset
423 print "error in read assingment \n";
a9c5e846dd76 Perl code
lxue
parents:
diff changeset
424 }
a9c5e846dd76 Perl code
lxue
parents:
diff changeset
425
a9c5e846dd76 Perl code
lxue
parents:
diff changeset
426
a9c5e846dd76 Perl code
lxue
parents:
diff changeset
427 my @out_line = ($fas_file_in,$ref_name,$ref_ori_seq, $part_of_read,$hitnum, $align_ref,$align_read,$note,$snp_note);
a9c5e846dd76 Perl code
lxue
parents:
diff changeset
428 ## indel
a9c5e846dd76 Perl code
lxue
parents:
diff changeset
429 if($edit_note !~ m/no.+indel/ ){
a9c5e846dd76 Perl code
lxue
parents:
diff changeset
430
a9c5e846dd76 Perl code
lxue
parents:
diff changeset
431 if((($test_case_num >500 and $hitnum <=3) or ($hitnum /$test_case_num <0.001 ) ) and $edit_note =~ m/strange/){
a9c5e846dd76 Perl code
lxue
parents:
diff changeset
432 next; # filter strange case in deep sequencing with few reads
a9c5e846dd76 Perl code
lxue
parents:
diff changeset
433 }
a9c5e846dd76 Perl code
lxue
parents:
diff changeset
434
a9c5e846dd76 Perl code
lxue
parents:
diff changeset
435
a9c5e846dd76 Perl code
lxue
parents:
diff changeset
436 $report_indel_count ++;
a9c5e846dd76 Perl code
lxue
parents:
diff changeset
437 # total indel
a9c5e846dd76 Perl code
lxue
parents:
diff changeset
438 $indel_hit_num = $indel_hit_num + $hitnum;
a9c5e846dd76 Perl code
lxue
parents:
diff changeset
439
a9c5e846dd76 Perl code
lxue
parents:
diff changeset
440 if($report_indel_count <= $indel_report ){
a9c5e846dd76 Perl code
lxue
parents:
diff changeset
441 #print REPORT join("\t",@out_line),"\n";
a9c5e846dd76 Perl code
lxue
parents:
diff changeset
442 ## in buffer
a9c5e846dd76 Perl code
lxue
parents:
diff changeset
443 if(not exists $buffer_out{$fas_file_in}{$ref_name}{"data"}){
a9c5e846dd76 Perl code
lxue
parents:
diff changeset
444 my @bbb = ();
a9c5e846dd76 Perl code
lxue
parents:
diff changeset
445 push @bbb, [@out_line];
a9c5e846dd76 Perl code
lxue
parents:
diff changeset
446 $buffer_out{$fas_file_in}{$ref_name}{"data"} = \@bbb;
a9c5e846dd76 Perl code
lxue
parents:
diff changeset
447 }
a9c5e846dd76 Perl code
lxue
parents:
diff changeset
448 else{
a9c5e846dd76 Perl code
lxue
parents:
diff changeset
449 push @{$buffer_out{$fas_file_in}{$ref_name}{"data"}},[@out_line];
a9c5e846dd76 Perl code
lxue
parents:
diff changeset
450 }
a9c5e846dd76 Perl code
lxue
parents:
diff changeset
451 }
a9c5e846dd76 Perl code
lxue
parents:
diff changeset
452 else{
a9c5e846dd76 Perl code
lxue
parents:
diff changeset
453 $other_num = $other_num+$hitnum;
a9c5e846dd76 Perl code
lxue
parents:
diff changeset
454 @other_record = ($fas_file_in,$ref_name,$ref_ori_seq, 'others',$hitnum, '-', '-', '-', '-');
a9c5e846dd76 Perl code
lxue
parents:
diff changeset
455 }
a9c5e846dd76 Perl code
lxue
parents:
diff changeset
456
a9c5e846dd76 Perl code
lxue
parents:
diff changeset
457 }
a9c5e846dd76 Perl code
lxue
parents:
diff changeset
458 else{
a9c5e846dd76 Perl code
lxue
parents:
diff changeset
459 $report_wt_count ++;
a9c5e846dd76 Perl code
lxue
parents:
diff changeset
460 if($report_wt_count <= $wt_like_report ){
a9c5e846dd76 Perl code
lxue
parents:
diff changeset
461 #print REPORT join("\t",@out_line),"\n";
a9c5e846dd76 Perl code
lxue
parents:
diff changeset
462 # in buffer
a9c5e846dd76 Perl code
lxue
parents:
diff changeset
463 if(not exists $buffer_out{$fas_file_in}{$ref_name}{"data"}){
a9c5e846dd76 Perl code
lxue
parents:
diff changeset
464 my @bbb = ();
a9c5e846dd76 Perl code
lxue
parents:
diff changeset
465 push @bbb, [@out_line];
a9c5e846dd76 Perl code
lxue
parents:
diff changeset
466 $buffer_out{$fas_file_in}{$ref_name}{"data"} = \@bbb;
a9c5e846dd76 Perl code
lxue
parents:
diff changeset
467 }
a9c5e846dd76 Perl code
lxue
parents:
diff changeset
468 else{
a9c5e846dd76 Perl code
lxue
parents:
diff changeset
469 push @{$buffer_out{$fas_file_in}{$ref_name}{"data"}},[@out_line];
a9c5e846dd76 Perl code
lxue
parents:
diff changeset
470 }
a9c5e846dd76 Perl code
lxue
parents:
diff changeset
471 }
a9c5e846dd76 Perl code
lxue
parents:
diff changeset
472 else{
a9c5e846dd76 Perl code
lxue
parents:
diff changeset
473 $other_num = $other_num+$hitnum;
a9c5e846dd76 Perl code
lxue
parents:
diff changeset
474 @other_record = ($fas_file_in,$ref_name,$ref_ori_seq, 'others',$hitnum, '-', '-', '-', '-');
a9c5e846dd76 Perl code
lxue
parents:
diff changeset
475 }
a9c5e846dd76 Perl code
lxue
parents:
diff changeset
476
a9c5e846dd76 Perl code
lxue
parents:
diff changeset
477 }# wt like report
a9c5e846dd76 Perl code
lxue
parents:
diff changeset
478
a9c5e846dd76 Perl code
lxue
parents:
diff changeset
479 } # if exists
a9c5e846dd76 Perl code
lxue
parents:
diff changeset
480
a9c5e846dd76 Perl code
lxue
parents:
diff changeset
481 }# for part of reads
a9c5e846dd76 Perl code
lxue
parents:
diff changeset
482
a9c5e846dd76 Perl code
lxue
parents:
diff changeset
483 ## other report
a9c5e846dd76 Perl code
lxue
parents:
diff changeset
484 if($other_num>0){
a9c5e846dd76 Perl code
lxue
parents:
diff changeset
485 $other_record[-5] = $other_num;
a9c5e846dd76 Perl code
lxue
parents:
diff changeset
486 #print REPORT join("\t",@other_record),"\n";
a9c5e846dd76 Perl code
lxue
parents:
diff changeset
487
a9c5e846dd76 Perl code
lxue
parents:
diff changeset
488 if(not exists $buffer_out{$fas_file_in}{$ref_name}{"data"}){
a9c5e846dd76 Perl code
lxue
parents:
diff changeset
489 my @bbb = ();
a9c5e846dd76 Perl code
lxue
parents:
diff changeset
490 push @bbb, [@other_record];
a9c5e846dd76 Perl code
lxue
parents:
diff changeset
491 $buffer_out{$fas_file_in}{$ref_name}{"data"} = \@bbb;
a9c5e846dd76 Perl code
lxue
parents:
diff changeset
492 }
a9c5e846dd76 Perl code
lxue
parents:
diff changeset
493 else{
a9c5e846dd76 Perl code
lxue
parents:
diff changeset
494 push @{$buffer_out{$fas_file_in}{$ref_name}{"data"}},[@other_record];
a9c5e846dd76 Perl code
lxue
parents:
diff changeset
495 }
a9c5e846dd76 Perl code
lxue
parents:
diff changeset
496 }
a9c5e846dd76 Perl code
lxue
parents:
diff changeset
497
a9c5e846dd76 Perl code
lxue
parents:
diff changeset
498 #my @summary_record = ($fas_file_in,$ref_name,$ref_ori_seq, 'total_hit:'.$total_non_redun,'sub_hit:'.$sub_hit_num, 'indel_hit:'.$indel_hit_num);
a9c5e846dd76 Perl code
lxue
parents:
diff changeset
499 ## total read number
a9c5e846dd76 Perl code
lxue
parents:
diff changeset
500 my $total_num = $total_read_num{$fas_file_in};
a9c5e846dd76 Perl code
lxue
parents:
diff changeset
501
a9c5e846dd76 Perl code
lxue
parents:
diff changeset
502 # 0 1 2
a9c5e846dd76 Perl code
lxue
parents:
diff changeset
503 # my @summary_record = ($fas_file_in,$ref_name,'Total Reads: '., 'Total Hits: '.$total_non_redun,'Sub Hits: '.$sub_hit_num, 'Indel Hits: '.$indel_hit_num);
a9c5e846dd76 Perl code
lxue
parents:
diff changeset
504 my @summary_record = ('Sub Hits: '.$sub_hit_num, 'Indel Hits: '.$indel_hit_num);
a9c5e846dd76 Perl code
lxue
parents:
diff changeset
505
a9c5e846dd76 Perl code
lxue
parents:
diff changeset
506 $buffer_out{$fas_file_in}{$ref_name}{'sum'} = \@summary_record ;
a9c5e846dd76 Perl code
lxue
parents:
diff changeset
507 $buffer_num{$fas_file_in}{'total'} = $total_num ;
a9c5e846dd76 Perl code
lxue
parents:
diff changeset
508 $buffer_num{$fas_file_in}{'sub'} = $total_non_redun ;
a9c5e846dd76 Perl code
lxue
parents:
diff changeset
509 #print REPORT join("\t",@summary_record),"\n";
a9c5e846dd76 Perl code
lxue
parents:
diff changeset
510
a9c5e846dd76 Perl code
lxue
parents:
diff changeset
511 }# for ref_seq
a9c5e846dd76 Perl code
lxue
parents:
diff changeset
512
a9c5e846dd76 Perl code
lxue
parents:
diff changeset
513
a9c5e846dd76 Perl code
lxue
parents:
diff changeset
514 return (\%buffer_out,\%buffer_num);
a9c5e846dd76 Perl code
lxue
parents:
diff changeset
515 } # sub
a9c5e846dd76 Perl code
lxue
parents:
diff changeset
516
a9c5e846dd76 Perl code
lxue
parents:
diff changeset
517
a9c5e846dd76 Perl code
lxue
parents:
diff changeset
518 ### write the output
a9c5e846dd76 Perl code
lxue
parents:
diff changeset
519
a9c5e846dd76 Perl code
lxue
parents:
diff changeset
520 sub write_buffer_out{
a9c5e846dd76 Perl code
lxue
parents:
diff changeset
521 my %hash_out = %{$_[0]};
a9c5e846dd76 Perl code
lxue
parents:
diff changeset
522 my %hash_out_num = %{$_[1]};
a9c5e846dd76 Perl code
lxue
parents:
diff changeset
523 my $print_tracking = 0;
a9c5e846dd76 Perl code
lxue
parents:
diff changeset
524 for my $fas_file_in (sort keys %hash_out){
a9c5e846dd76 Perl code
lxue
parents:
diff changeset
525
a9c5e846dd76 Perl code
lxue
parents:
diff changeset
526 my $total_num = $hash_out_num{$fas_file_in}{'total'} ;
a9c5e846dd76 Perl code
lxue
parents:
diff changeset
527 my $total_non_redun = $hash_out_num{$fas_file_in}{'sub'} ;
a9c5e846dd76 Perl code
lxue
parents:
diff changeset
528
a9c5e846dd76 Perl code
lxue
parents:
diff changeset
529 for my $ref_name (sort keys %{$hash_out{$fas_file_in }}){
a9c5e846dd76 Perl code
lxue
parents:
diff changeset
530 my @data = @{$hash_out{$fas_file_in}{$ref_name}{"data"}};
a9c5e846dd76 Perl code
lxue
parents:
diff changeset
531
a9c5e846dd76 Perl code
lxue
parents:
diff changeset
532 for (@data){
a9c5e846dd76 Perl code
lxue
parents:
diff changeset
533 $print_tracking++;
a9c5e846dd76 Perl code
lxue
parents:
diff changeset
534 if($print_tracking==1){
a9c5e846dd76 Perl code
lxue
parents:
diff changeset
535 print REPORT "INPUT","\t","Target\t","TargetSequence\t","ReadSequence\t","Read#","\t",
a9c5e846dd76 Perl code
lxue
parents:
diff changeset
536 "AlignedTarget\t","AlignedRead\t", "Indels\t","SNPs\n";
a9c5e846dd76 Perl code
lxue
parents:
diff changeset
537 }
a9c5e846dd76 Perl code
lxue
parents:
diff changeset
538 my @ooo = @{$_};
a9c5e846dd76 Perl code
lxue
parents:
diff changeset
539 $ooo[0] = 'Input1';
a9c5e846dd76 Perl code
lxue
parents:
diff changeset
540 print REPORT join("\t", @ooo),"\n";
a9c5e846dd76 Perl code
lxue
parents:
diff changeset
541 }
a9c5e846dd76 Perl code
lxue
parents:
diff changeset
542
a9c5e846dd76 Perl code
lxue
parents:
diff changeset
543 # sum
a9c5e846dd76 Perl code
lxue
parents:
diff changeset
544 my @sum_p2 = @{$hash_out{$fas_file_in}{$ref_name}{'sum'}} ;
a9c5e846dd76 Perl code
lxue
parents:
diff changeset
545
a9c5e846dd76 Perl code
lxue
parents:
diff changeset
546
a9c5e846dd76 Perl code
lxue
parents:
diff changeset
547 my @sum_p1 = ('Input1',$ref_name,'Total Reads: '.$total_num, 'Total Hits: '.$total_non_redun);
a9c5e846dd76 Perl code
lxue
parents:
diff changeset
548
a9c5e846dd76 Perl code
lxue
parents:
diff changeset
549 print REPORT join("\t", @sum_p1),"\t", join("\t", @sum_p2),"\n";
a9c5e846dd76 Perl code
lxue
parents:
diff changeset
550
a9c5e846dd76 Perl code
lxue
parents:
diff changeset
551 }
a9c5e846dd76 Perl code
lxue
parents:
diff changeset
552
a9c5e846dd76 Perl code
lxue
parents:
diff changeset
553
a9c5e846dd76 Perl code
lxue
parents:
diff changeset
554 ##### print summary region
a9c5e846dd76 Perl code
lxue
parents:
diff changeset
555 print REPORT "Summary \t -----------------------------------------------------\n";
a9c5e846dd76 Perl code
lxue
parents:
diff changeset
556 print REPORT "Sum:INPUT\t","Target\t","AlignedTarget\t","AlignedRead\t","Total Hits\t","Sub Hits\t","Indel or WT Hits\t","Indel or WT rate %\t","Pattern\n";
a9c5e846dd76 Perl code
lxue
parents:
diff changeset
557
a9c5e846dd76 Perl code
lxue
parents:
diff changeset
558
a9c5e846dd76 Perl code
lxue
parents:
diff changeset
559
a9c5e846dd76 Perl code
lxue
parents:
diff changeset
560 for my $ref_name (sort keys %{$hash_out{$fas_file_in }}){
a9c5e846dd76 Perl code
lxue
parents:
diff changeset
561 my @data = @{$hash_out{$fas_file_in}{$ref_name}{"data"}};
a9c5e846dd76 Perl code
lxue
parents:
diff changeset
562
a9c5e846dd76 Perl code
lxue
parents:
diff changeset
563 my $wt_pair = ''."\t".'';
a9c5e846dd76 Perl code
lxue
parents:
diff changeset
564 my $indel_pair = ''."\t".'';
a9c5e846dd76 Perl code
lxue
parents:
diff changeset
565 my $indel_out = '';
a9c5e846dd76 Perl code
lxue
parents:
diff changeset
566 my $i = 0;
a9c5e846dd76 Perl code
lxue
parents:
diff changeset
567
a9c5e846dd76 Perl code
lxue
parents:
diff changeset
568 for (@data){
a9c5e846dd76 Perl code
lxue
parents:
diff changeset
569 $i++;
a9c5e846dd76 Perl code
lxue
parents:
diff changeset
570 my @in= @{$_};
a9c5e846dd76 Perl code
lxue
parents:
diff changeset
571
a9c5e846dd76 Perl code
lxue
parents:
diff changeset
572
a9c5e846dd76 Perl code
lxue
parents:
diff changeset
573 if($in[7] !~ m/no\_indel/ and $in[7] ne '-'){
a9c5e846dd76 Perl code
lxue
parents:
diff changeset
574 if($indel_pair eq ''."\t".''){
a9c5e846dd76 Perl code
lxue
parents:
diff changeset
575 $indel_pair = $in[5]."\t".$in[6];
a9c5e846dd76 Perl code
lxue
parents:
diff changeset
576 my @indel_temp = split(/\s+/,$in[7]);
a9c5e846dd76 Perl code
lxue
parents:
diff changeset
577
a9c5e846dd76 Perl code
lxue
parents:
diff changeset
578 my @ooo_indel = ();
a9c5e846dd76 Perl code
lxue
parents:
diff changeset
579 for my $ooo_in (@indel_temp ){
a9c5e846dd76 Perl code
lxue
parents:
diff changeset
580 if($ooo_in =~ m/I(\d+)/){
a9c5e846dd76 Perl code
lxue
parents:
diff changeset
581 push @ooo_indel, '+'.$1;
a9c5e846dd76 Perl code
lxue
parents:
diff changeset
582 }
a9c5e846dd76 Perl code
lxue
parents:
diff changeset
583 if($ooo_in =~ m/D(\d+)/){
a9c5e846dd76 Perl code
lxue
parents:
diff changeset
584 push @ooo_indel, '-'.$1;
a9c5e846dd76 Perl code
lxue
parents:
diff changeset
585 }
a9c5e846dd76 Perl code
lxue
parents:
diff changeset
586 if($ooo_in =~ m/strange/){
a9c5e846dd76 Perl code
lxue
parents:
diff changeset
587 push @ooo_indel, 'strange editing';
a9c5e846dd76 Perl code
lxue
parents:
diff changeset
588 }
a9c5e846dd76 Perl code
lxue
parents:
diff changeset
589 }
a9c5e846dd76 Perl code
lxue
parents:
diff changeset
590 $indel_out = 'Indel:'.join(',', @ooo_indel);
a9c5e846dd76 Perl code
lxue
parents:
diff changeset
591 }
a9c5e846dd76 Perl code
lxue
parents:
diff changeset
592 }
a9c5e846dd76 Perl code
lxue
parents:
diff changeset
593 else{
a9c5e846dd76 Perl code
lxue
parents:
diff changeset
594 # no indel
a9c5e846dd76 Perl code
lxue
parents:
diff changeset
595 if($wt_pair eq ''."\t".''){
a9c5e846dd76 Perl code
lxue
parents:
diff changeset
596 $wt_pair = $in[5]."\t".$in[6];
a9c5e846dd76 Perl code
lxue
parents:
diff changeset
597 }
a9c5e846dd76 Perl code
lxue
parents:
diff changeset
598 }
a9c5e846dd76 Perl code
lxue
parents:
diff changeset
599 }
a9c5e846dd76 Perl code
lxue
parents:
diff changeset
600
a9c5e846dd76 Perl code
lxue
parents:
diff changeset
601 # first record
a9c5e846dd76 Perl code
lxue
parents:
diff changeset
602
a9c5e846dd76 Perl code
lxue
parents:
diff changeset
603
a9c5e846dd76 Perl code
lxue
parents:
diff changeset
604
a9c5e846dd76 Perl code
lxue
parents:
diff changeset
605 # sum
a9c5e846dd76 Perl code
lxue
parents:
diff changeset
606 my ($sub_string, $indel_string) = @{$hash_out{$fas_file_in}{$ref_name}{'sum'}} ;
a9c5e846dd76 Perl code
lxue
parents:
diff changeset
607 my $sub_num ;
a9c5e846dd76 Perl code
lxue
parents:
diff changeset
608 my $indel_num;
a9c5e846dd76 Perl code
lxue
parents:
diff changeset
609 #print $sub_string. "sub summary\n";
a9c5e846dd76 Perl code
lxue
parents:
diff changeset
610 #print $indel_string ."sub summary\n";
a9c5e846dd76 Perl code
lxue
parents:
diff changeset
611
a9c5e846dd76 Perl code
lxue
parents:
diff changeset
612
a9c5e846dd76 Perl code
lxue
parents:
diff changeset
613 if($sub_string =~ m/\:[^\d]+(\d+)/){
a9c5e846dd76 Perl code
lxue
parents:
diff changeset
614 $sub_num = $1;
a9c5e846dd76 Perl code
lxue
parents:
diff changeset
615 }
a9c5e846dd76 Perl code
lxue
parents:
diff changeset
616 if($indel_string =~ m/\:[^\d]+(\d+)/){
a9c5e846dd76 Perl code
lxue
parents:
diff changeset
617 $indel_num = $1;
a9c5e846dd76 Perl code
lxue
parents:
diff changeset
618 }
a9c5e846dd76 Perl code
lxue
parents:
diff changeset
619
a9c5e846dd76 Perl code
lxue
parents:
diff changeset
620 my $rate = int(($indel_num/$sub_num)*10000)/100;
a9c5e846dd76 Perl code
lxue
parents:
diff changeset
621
a9c5e846dd76 Perl code
lxue
parents:
diff changeset
622 ## indel
a9c5e846dd76 Perl code
lxue
parents:
diff changeset
623 # two columns
a9c5e846dd76 Perl code
lxue
parents:
diff changeset
624 my @out_sum = ("Input1",$ref_name, $indel_pair,$total_non_redun,$sub_num,$indel_num,$rate,$indel_out);
a9c5e846dd76 Perl code
lxue
parents:
diff changeset
625 print REPORT 'Sum:',join("\t", @out_sum),"\n";
a9c5e846dd76 Perl code
lxue
parents:
diff changeset
626 ## wt like
a9c5e846dd76 Perl code
lxue
parents:
diff changeset
627 my $wt_num = $sub_num - $indel_num;
a9c5e846dd76 Perl code
lxue
parents:
diff changeset
628 my $wt_rate = 100-$rate;
a9c5e846dd76 Perl code
lxue
parents:
diff changeset
629 my @out_sum = ("Input1",$ref_name, $wt_pair ,$total_non_redun,$sub_num,$wt_num,$wt_rate,"WT_like");
a9c5e846dd76 Perl code
lxue
parents:
diff changeset
630 print REPORT 'Sum:',join("\t", @out_sum),"\n";
a9c5e846dd76 Perl code
lxue
parents:
diff changeset
631
a9c5e846dd76 Perl code
lxue
parents:
diff changeset
632 } # records
a9c5e846dd76 Perl code
lxue
parents:
diff changeset
633
a9c5e846dd76 Perl code
lxue
parents:
diff changeset
634 print REPORT "\n";
a9c5e846dd76 Perl code
lxue
parents:
diff changeset
635
a9c5e846dd76 Perl code
lxue
parents:
diff changeset
636 } # file
a9c5e846dd76 Perl code
lxue
parents:
diff changeset
637 } # sub
a9c5e846dd76 Perl code
lxue
parents:
diff changeset
638 ###################################################################
a9c5e846dd76 Perl code
lxue
parents:
diff changeset
639
a9c5e846dd76 Perl code
lxue
parents:
diff changeset
640
a9c5e846dd76 Perl code
lxue
parents:
diff changeset
641
a9c5e846dd76 Perl code
lxue
parents:
diff changeset
642 sub get_editing_note{
a9c5e846dd76 Perl code
lxue
parents:
diff changeset
643 my ($address_in,$query_seq, $target_seq ,$full_read) = @_;
a9c5e846dd76 Perl code
lxue
parents:
diff changeset
644 # target: part of read
a9c5e846dd76 Perl code
lxue
parents:
diff changeset
645 # query : original sequence in design file
a9c5e846dd76 Perl code
lxue
parents:
diff changeset
646
a9c5e846dd76 Perl code
lxue
parents:
diff changeset
647 my ($strand, $q_name,$q_size,$q_start,$q_end,
a9c5e846dd76 Perl code
lxue
parents:
diff changeset
648 $t_name,$t_size,$t_start,$t_end,
a9c5e846dd76 Perl code
lxue
parents:
diff changeset
649 $block_num,$block_size,$q_start_s,$t_start_s) = @{$address_in};
a9c5e846dd76 Perl code
lxue
parents:
diff changeset
650
a9c5e846dd76 Perl code
lxue
parents:
diff changeset
651 my @out = ();
a9c5e846dd76 Perl code
lxue
parents:
diff changeset
652 #print $strand ,"strand\n";
a9c5e846dd76 Perl code
lxue
parents:
diff changeset
653 if($strand eq '+'){
a9c5e846dd76 Perl code
lxue
parents:
diff changeset
654 @out = get_alignment($block_num,$block_size,$q_start_s,$t_start_s,$query_seq, $full_read);
a9c5e846dd76 Perl code
lxue
parents:
diff changeset
655 }
a9c5e846dd76 Perl code
lxue
parents:
diff changeset
656 else{
a9c5e846dd76 Perl code
lxue
parents:
diff changeset
657 @out = get_alignment($block_num,$block_size,$q_start_s,$t_start_s,rev_com($query_seq), $full_read);
a9c5e846dd76 Perl code
lxue
parents:
diff changeset
658 @out = get_reverse(@out);
a9c5e846dd76 Perl code
lxue
parents:
diff changeset
659 }
a9c5e846dd76 Perl code
lxue
parents:
diff changeset
660
a9c5e846dd76 Perl code
lxue
parents:
diff changeset
661 $out[0]='B'.$out[0].'E';
a9c5e846dd76 Perl code
lxue
parents:
diff changeset
662 $out[1]='B'.$out[1].'E';
a9c5e846dd76 Perl code
lxue
parents:
diff changeset
663 if($out[2] !~ m/\w/){
a9c5e846dd76 Perl code
lxue
parents:
diff changeset
664 $out[2] = 'no_indel';
a9c5e846dd76 Perl code
lxue
parents:
diff changeset
665 }
a9c5e846dd76 Perl code
lxue
parents:
diff changeset
666 return join("\t",@out);
a9c5e846dd76 Perl code
lxue
parents:
diff changeset
667 }
a9c5e846dd76 Perl code
lxue
parents:
diff changeset
668
a9c5e846dd76 Perl code
lxue
parents:
diff changeset
669
a9c5e846dd76 Perl code
lxue
parents:
diff changeset
670
a9c5e846dd76 Perl code
lxue
parents:
diff changeset
671 sub get_alignment{
a9c5e846dd76 Perl code
lxue
parents:
diff changeset
672 my ($blocks , $blockLengths, $qStarts , $tStarts,$query_seq, $target_seq) = @_;
a9c5e846dd76 Perl code
lxue
parents:
diff changeset
673 my @blockSizes = split(/\,/,$blockLengths);
a9c5e846dd76 Perl code
lxue
parents:
diff changeset
674 my @qArray = split(/\,/,$qStarts);
a9c5e846dd76 Perl code
lxue
parents:
diff changeset
675 my @tArray = split(/\,/,$tStarts);
a9c5e846dd76 Perl code
lxue
parents:
diff changeset
676
a9c5e846dd76 Perl code
lxue
parents:
diff changeset
677 # target: part of read
a9c5e846dd76 Perl code
lxue
parents:
diff changeset
678 # query : original sequence in design file
a9c5e846dd76 Perl code
lxue
parents:
diff changeset
679
a9c5e846dd76 Perl code
lxue
parents:
diff changeset
680 # print join("\t",@_),"\n";
a9c5e846dd76 Perl code
lxue
parents:
diff changeset
681
a9c5e846dd76 Perl code
lxue
parents:
diff changeset
682 ### before blocks
a9c5e846dd76 Perl code
lxue
parents:
diff changeset
683 my $before_q = substr $query_seq, 0,$qArray[0];
a9c5e846dd76 Perl code
lxue
parents:
diff changeset
684 my $before_t = substr $target_seq,0,$tArray[0];
a9c5e846dd76 Perl code
lxue
parents:
diff changeset
685
a9c5e846dd76 Perl code
lxue
parents:
diff changeset
686
a9c5e846dd76 Perl code
lxue
parents:
diff changeset
687 ($before_q,$before_t) = treat_sequence($before_q,$before_t,"before");
a9c5e846dd76 Perl code
lxue
parents:
diff changeset
688
a9c5e846dd76 Perl code
lxue
parents:
diff changeset
689 ### after blocks
a9c5e846dd76 Perl code
lxue
parents:
diff changeset
690 my $after_q = substr $query_seq, $qArray[-1]+$blockSizes[-1];
a9c5e846dd76 Perl code
lxue
parents:
diff changeset
691 my $after_t = substr $target_seq,$tArray[-1]+$blockSizes[-1];
a9c5e846dd76 Perl code
lxue
parents:
diff changeset
692
a9c5e846dd76 Perl code
lxue
parents:
diff changeset
693
a9c5e846dd76 Perl code
lxue
parents:
diff changeset
694 ($after_q,$after_t) = treat_sequence($after_q,$after_t,"after") ;
a9c5e846dd76 Perl code
lxue
parents:
diff changeset
695
a9c5e846dd76 Perl code
lxue
parents:
diff changeset
696 ### blocks
a9c5e846dd76 Perl code
lxue
parents:
diff changeset
697 my @med_q = ();
a9c5e846dd76 Perl code
lxue
parents:
diff changeset
698 my @med_t = ();
a9c5e846dd76 Perl code
lxue
parents:
diff changeset
699
a9c5e846dd76 Perl code
lxue
parents:
diff changeset
700 my @indel_out = ();
a9c5e846dd76 Perl code
lxue
parents:
diff changeset
701 my $out = '';
a9c5e846dd76 Perl code
lxue
parents:
diff changeset
702 ## first block
a9c5e846dd76 Perl code
lxue
parents:
diff changeset
703 my $med_q_seq = substr $query_seq,$qArray[0],$blockSizes[0];
a9c5e846dd76 Perl code
lxue
parents:
diff changeset
704 my $med_t_seq = substr $target_seq,$tArray[0],$blockSizes[0];
a9c5e846dd76 Perl code
lxue
parents:
diff changeset
705 push @med_q,$med_q_seq;
a9c5e846dd76 Perl code
lxue
parents:
diff changeset
706 push @med_t,$med_t_seq;
a9c5e846dd76 Perl code
lxue
parents:
diff changeset
707
a9c5e846dd76 Perl code
lxue
parents:
diff changeset
708 if($blocks>1){
a9c5e846dd76 Perl code
lxue
parents:
diff changeset
709 for (my $i= 0;$i<($blocks-1);$i++){
a9c5e846dd76 Perl code
lxue
parents:
diff changeset
710 ####### interval
a9c5e846dd76 Perl code
lxue
parents:
diff changeset
711 my $inter_q_seq = substr $query_seq,($qArray[$i]+$blockSizes[$i]), ($qArray[$i+1]-($qArray[$i]+$blockSizes[$i]));
a9c5e846dd76 Perl code
lxue
parents:
diff changeset
712 my $inter_t_seq = substr $target_seq,($tArray[$i]+$blockSizes[$i]),($tArray[$i+1]-($tArray[$i]+$blockSizes[$i]));
a9c5e846dd76 Perl code
lxue
parents:
diff changeset
713
a9c5e846dd76 Perl code
lxue
parents:
diff changeset
714 ### count deletion insertion
a9c5e846dd76 Perl code
lxue
parents:
diff changeset
715 if(length($inter_t_seq) - length($inter_q_seq)>0){
a9c5e846dd76 Perl code
lxue
parents:
diff changeset
716 # to fix the mismatch near deletion length($inter_q_seq)
a9c5e846dd76 Perl code
lxue
parents:
diff changeset
717 $out = ($qArray[$i]+$blockSizes[$i]+length($inter_q_seq)).'I'.(length($inter_t_seq) - length($inter_q_seq));
a9c5e846dd76 Perl code
lxue
parents:
diff changeset
718 }
a9c5e846dd76 Perl code
lxue
parents:
diff changeset
719 if(length($inter_q_seq)-length($inter_t_seq)>0){
a9c5e846dd76 Perl code
lxue
parents:
diff changeset
720 $out = ($qArray[$i]+$blockSizes[$i]+length($inter_t_seq)).'D'.(length($inter_q_seq)-length($inter_t_seq));
a9c5e846dd76 Perl code
lxue
parents:
diff changeset
721 }
a9c5e846dd76 Perl code
lxue
parents:
diff changeset
722 push @indel_out,$out;
a9c5e846dd76 Perl code
lxue
parents:
diff changeset
723
a9c5e846dd76 Perl code
lxue
parents:
diff changeset
724 ($inter_q_seq,$inter_t_seq) = treat_inter ($inter_q_seq,$inter_t_seq) ;
a9c5e846dd76 Perl code
lxue
parents:
diff changeset
725 push @med_q, $inter_q_seq;
a9c5e846dd76 Perl code
lxue
parents:
diff changeset
726 push @med_t, $inter_t_seq;
a9c5e846dd76 Perl code
lxue
parents:
diff changeset
727
a9c5e846dd76 Perl code
lxue
parents:
diff changeset
728 ###### block after interval
a9c5e846dd76 Perl code
lxue
parents:
diff changeset
729 $med_q_seq = substr $query_seq,$qArray[$i+1],$blockSizes[$i+1];
a9c5e846dd76 Perl code
lxue
parents:
diff changeset
730 $med_t_seq = substr $target_seq,$tArray[$i+1],$blockSizes[$i+1];
a9c5e846dd76 Perl code
lxue
parents:
diff changeset
731 push @med_q,$med_q_seq;
a9c5e846dd76 Perl code
lxue
parents:
diff changeset
732 push @med_t,$med_t_seq;
a9c5e846dd76 Perl code
lxue
parents:
diff changeset
733 }
a9c5e846dd76 Perl code
lxue
parents:
diff changeset
734 }
a9c5e846dd76 Perl code
lxue
parents:
diff changeset
735
a9c5e846dd76 Perl code
lxue
parents:
diff changeset
736 push @med_q,$after_q;
a9c5e846dd76 Perl code
lxue
parents:
diff changeset
737 push @med_t,$after_t;
a9c5e846dd76 Perl code
lxue
parents:
diff changeset
738
a9c5e846dd76 Perl code
lxue
parents:
diff changeset
739 unshift @med_q,$before_q;
a9c5e846dd76 Perl code
lxue
parents:
diff changeset
740 unshift @med_t,$before_t;
a9c5e846dd76 Perl code
lxue
parents:
diff changeset
741 my $q_string = join('',@med_q);
a9c5e846dd76 Perl code
lxue
parents:
diff changeset
742 my $t_string = join('',@med_t);
a9c5e846dd76 Perl code
lxue
parents:
diff changeset
743
a9c5e846dd76 Perl code
lxue
parents:
diff changeset
744 if($q_string=~m/^(\-+)/){
a9c5e846dd76 Perl code
lxue
parents:
diff changeset
745 my $len = length($1);
a9c5e846dd76 Perl code
lxue
parents:
diff changeset
746 $q_string = substr $q_string,$len;
a9c5e846dd76 Perl code
lxue
parents:
diff changeset
747 $t_string = substr $t_string, $len;
a9c5e846dd76 Perl code
lxue
parents:
diff changeset
748 }
a9c5e846dd76 Perl code
lxue
parents:
diff changeset
749 if($q_string=~m/(\-+)$/){
a9c5e846dd76 Perl code
lxue
parents:
diff changeset
750 my $len = length($1);
a9c5e846dd76 Perl code
lxue
parents:
diff changeset
751 $q_string = substr $q_string,0,(-1)*$len;
a9c5e846dd76 Perl code
lxue
parents:
diff changeset
752 $t_string = substr $t_string,0, (-1)*$len;
a9c5e846dd76 Perl code
lxue
parents:
diff changeset
753 }
a9c5e846dd76 Perl code
lxue
parents:
diff changeset
754
a9c5e846dd76 Perl code
lxue
parents:
diff changeset
755
a9c5e846dd76 Perl code
lxue
parents:
diff changeset
756 return ($q_string,$t_string,join(' ',@indel_out));
a9c5e846dd76 Perl code
lxue
parents:
diff changeset
757
a9c5e846dd76 Perl code
lxue
parents:
diff changeset
758 }
a9c5e846dd76 Perl code
lxue
parents:
diff changeset
759
a9c5e846dd76 Perl code
lxue
parents:
diff changeset
760
a9c5e846dd76 Perl code
lxue
parents:
diff changeset
761 sub get_reverse{
a9c5e846dd76 Perl code
lxue
parents:
diff changeset
762 my ($q_string,$t_string,$indel_string )= @_;
a9c5e846dd76 Perl code
lxue
parents:
diff changeset
763 $q_string = rev_com($q_string);
a9c5e846dd76 Perl code
lxue
parents:
diff changeset
764 $t_string = rev_com($t_string);
a9c5e846dd76 Perl code
lxue
parents:
diff changeset
765
a9c5e846dd76 Perl code
lxue
parents:
diff changeset
766 my $string_test = $q_string;
a9c5e846dd76 Perl code
lxue
parents:
diff changeset
767 $string_test =~ s/\-//g;
a9c5e846dd76 Perl code
lxue
parents:
diff changeset
768 my $len = length($string_test);
a9c5e846dd76 Perl code
lxue
parents:
diff changeset
769
a9c5e846dd76 Perl code
lxue
parents:
diff changeset
770 my @indel_in = split(/\s/, $indel_string);
a9c5e846dd76 Perl code
lxue
parents:
diff changeset
771 my @indel_out = ();
a9c5e846dd76 Perl code
lxue
parents:
diff changeset
772 for (@indel_in){
a9c5e846dd76 Perl code
lxue
parents:
diff changeset
773
a9c5e846dd76 Perl code
lxue
parents:
diff changeset
774 if(m/^(\d+)(.)(\d+)/){
a9c5e846dd76 Perl code
lxue
parents:
diff changeset
775 my $ooo = ($len-$1-$3).$2.$3;
a9c5e846dd76 Perl code
lxue
parents:
diff changeset
776 if($2 eq 'I'){
a9c5e846dd76 Perl code
lxue
parents:
diff changeset
777 $ooo = ($len-$1).$2.$3;
a9c5e846dd76 Perl code
lxue
parents:
diff changeset
778 }
a9c5e846dd76 Perl code
lxue
parents:
diff changeset
779
a9c5e846dd76 Perl code
lxue
parents:
diff changeset
780 unshift @indel_out,$ooo;
a9c5e846dd76 Perl code
lxue
parents:
diff changeset
781 }
a9c5e846dd76 Perl code
lxue
parents:
diff changeset
782 }
a9c5e846dd76 Perl code
lxue
parents:
diff changeset
783 $indel_string = join(' ',@indel_out);
a9c5e846dd76 Perl code
lxue
parents:
diff changeset
784 return($q_string,$t_string, $indel_string);
a9c5e846dd76 Perl code
lxue
parents:
diff changeset
785 }
a9c5e846dd76 Perl code
lxue
parents:
diff changeset
786
a9c5e846dd76 Perl code
lxue
parents:
diff changeset
787
a9c5e846dd76 Perl code
lxue
parents:
diff changeset
788
a9c5e846dd76 Perl code
lxue
parents:
diff changeset
789
a9c5e846dd76 Perl code
lxue
parents:
diff changeset
790
a9c5e846dd76 Perl code
lxue
parents:
diff changeset
791 sub treat_inter{
a9c5e846dd76 Perl code
lxue
parents:
diff changeset
792 my ($q,$t) = @_;
a9c5e846dd76 Perl code
lxue
parents:
diff changeset
793 my $q_len = length($q);
a9c5e846dd76 Perl code
lxue
parents:
diff changeset
794 my $t_len = length($t);
a9c5e846dd76 Perl code
lxue
parents:
diff changeset
795 my $dis = abs($q_len-$t_len);
a9c5e846dd76 Perl code
lxue
parents:
diff changeset
796 my @oooo = ();
a9c5e846dd76 Perl code
lxue
parents:
diff changeset
797
a9c5e846dd76 Perl code
lxue
parents:
diff changeset
798 for(1..$dis){
a9c5e846dd76 Perl code
lxue
parents:
diff changeset
799 push @oooo,'-';
a9c5e846dd76 Perl code
lxue
parents:
diff changeset
800 }
a9c5e846dd76 Perl code
lxue
parents:
diff changeset
801
a9c5e846dd76 Perl code
lxue
parents:
diff changeset
802 if($q_len-$t_len >0){
a9c5e846dd76 Perl code
lxue
parents:
diff changeset
803 $t = $t.join('',@oooo);
a9c5e846dd76 Perl code
lxue
parents:
diff changeset
804 }
a9c5e846dd76 Perl code
lxue
parents:
diff changeset
805 else{
a9c5e846dd76 Perl code
lxue
parents:
diff changeset
806 $q = $q.join('',@oooo);
a9c5e846dd76 Perl code
lxue
parents:
diff changeset
807 }
a9c5e846dd76 Perl code
lxue
parents:
diff changeset
808 return ($q,$t);
a9c5e846dd76 Perl code
lxue
parents:
diff changeset
809 }
a9c5e846dd76 Perl code
lxue
parents:
diff changeset
810
a9c5e846dd76 Perl code
lxue
parents:
diff changeset
811
a9c5e846dd76 Perl code
lxue
parents:
diff changeset
812
a9c5e846dd76 Perl code
lxue
parents:
diff changeset
813 sub treat_sequence{
a9c5e846dd76 Perl code
lxue
parents:
diff changeset
814 my ($q,$t,$position) = @_;
a9c5e846dd76 Perl code
lxue
parents:
diff changeset
815 my $q_len= length($q);
a9c5e846dd76 Perl code
lxue
parents:
diff changeset
816 my $t_len = length($t);
a9c5e846dd76 Perl code
lxue
parents:
diff changeset
817 my $dis = abs($q_len-$t_len);
a9c5e846dd76 Perl code
lxue
parents:
diff changeset
818
a9c5e846dd76 Perl code
lxue
parents:
diff changeset
819 if($dis > 0){
a9c5e846dd76 Perl code
lxue
parents:
diff changeset
820 my @oooo = ();
a9c5e846dd76 Perl code
lxue
parents:
diff changeset
821 for(1..$dis){
a9c5e846dd76 Perl code
lxue
parents:
diff changeset
822 push @oooo,'-';
a9c5e846dd76 Perl code
lxue
parents:
diff changeset
823 }
a9c5e846dd76 Perl code
lxue
parents:
diff changeset
824 if($q_len>$t_len){
a9c5e846dd76 Perl code
lxue
parents:
diff changeset
825 if($position eq 'before'){
a9c5e846dd76 Perl code
lxue
parents:
diff changeset
826 $t = join('',@oooo).$t ;
a9c5e846dd76 Perl code
lxue
parents:
diff changeset
827 }
a9c5e846dd76 Perl code
lxue
parents:
diff changeset
828 else{
a9c5e846dd76 Perl code
lxue
parents:
diff changeset
829 $t = $t.join('',@oooo) ;
a9c5e846dd76 Perl code
lxue
parents:
diff changeset
830 }
a9c5e846dd76 Perl code
lxue
parents:
diff changeset
831 }
a9c5e846dd76 Perl code
lxue
parents:
diff changeset
832 else{ # q small
a9c5e846dd76 Perl code
lxue
parents:
diff changeset
833 if($position eq 'before'){
a9c5e846dd76 Perl code
lxue
parents:
diff changeset
834 $q = join('',@oooo).$q ;
a9c5e846dd76 Perl code
lxue
parents:
diff changeset
835 }
a9c5e846dd76 Perl code
lxue
parents:
diff changeset
836 else{
a9c5e846dd76 Perl code
lxue
parents:
diff changeset
837 $q = $q.join('',@oooo) ;
a9c5e846dd76 Perl code
lxue
parents:
diff changeset
838 }
a9c5e846dd76 Perl code
lxue
parents:
diff changeset
839 }
a9c5e846dd76 Perl code
lxue
parents:
diff changeset
840 }
a9c5e846dd76 Perl code
lxue
parents:
diff changeset
841 return ($q,$t);
a9c5e846dd76 Perl code
lxue
parents:
diff changeset
842 }
a9c5e846dd76 Perl code
lxue
parents:
diff changeset
843
a9c5e846dd76 Perl code
lxue
parents:
diff changeset
844
a9c5e846dd76 Perl code
lxue
parents:
diff changeset
845 sub psl2bed {
a9c5e846dd76 Perl code
lxue
parents:
diff changeset
846 my $psl_file = shift @_; # file in @psl_file_data
a9c5e846dd76 Perl code
lxue
parents:
diff changeset
847 my @psl_array = ();
a9c5e846dd76 Perl code
lxue
parents:
diff changeset
848
a9c5e846dd76 Perl code
lxue
parents:
diff changeset
849
a9c5e846dd76 Perl code
lxue
parents:
diff changeset
850 open (FILEINBLAT,"$psl_file") || die "Can not open PSL file, Please check BLAT software is working\n";
a9c5e846dd76 Perl code
lxue
parents:
diff changeset
851
a9c5e846dd76 Perl code
lxue
parents:
diff changeset
852 ## should be correct
a9c5e846dd76 Perl code
lxue
parents:
diff changeset
853
a9c5e846dd76 Perl code
lxue
parents:
diff changeset
854 for(1..5){
a9c5e846dd76 Perl code
lxue
parents:
diff changeset
855 my $line=<FILEINBLAT>;
a9c5e846dd76 Perl code
lxue
parents:
diff changeset
856 ## remove head
a9c5e846dd76 Perl code
lxue
parents:
diff changeset
857 }
a9c5e846dd76 Perl code
lxue
parents:
diff changeset
858 ## read psl
a9c5e846dd76 Perl code
lxue
parents:
diff changeset
859 while (<FILEINBLAT>)
a9c5e846dd76 Perl code
lxue
parents:
diff changeset
860 {
a9c5e846dd76 Perl code
lxue
parents:
diff changeset
861 my $lineIn = $_;
a9c5e846dd76 Perl code
lxue
parents:
diff changeset
862 chomp $lineIn;
a9c5e846dd76 Perl code
lxue
parents:
diff changeset
863 my @lineArr = split ("\t", $lineIn);
a9c5e846dd76 Perl code
lxue
parents:
diff changeset
864
a9c5e846dd76 Perl code
lxue
parents:
diff changeset
865 my $q_size = $lineArr[10];
a9c5e846dd76 Perl code
lxue
parents:
diff changeset
866 my $m_size = $lineArr[0];
a9c5e846dd76 Perl code
lxue
parents:
diff changeset
867
a9c5e846dd76 Perl code
lxue
parents:
diff changeset
868 if( $m_size/$q_size > 0.20){
a9c5e846dd76 Perl code
lxue
parents:
diff changeset
869 push @psl_array,[@lineArr];
a9c5e846dd76 Perl code
lxue
parents:
diff changeset
870 }
a9c5e846dd76 Perl code
lxue
parents:
diff changeset
871
a9c5e846dd76 Perl code
lxue
parents:
diff changeset
872 }# while file
a9c5e846dd76 Perl code
lxue
parents:
diff changeset
873 close(FILEINBLAT);
a9c5e846dd76 Perl code
lxue
parents:
diff changeset
874 ## score large to small
a9c5e846dd76 Perl code
lxue
parents:
diff changeset
875
a9c5e846dd76 Perl code
lxue
parents:
diff changeset
876 #@psl_array = sort {$b->[0] <=>$a->[0]}@psl_array ;
a9c5e846dd76 Perl code
lxue
parents:
diff changeset
877
a9c5e846dd76 Perl code
lxue
parents:
diff changeset
878 my @out_array = ();
a9c5e846dd76 Perl code
lxue
parents:
diff changeset
879 my $psl_line_num = 0;
a9c5e846dd76 Perl code
lxue
parents:
diff changeset
880 my %psl_hash = ();
a9c5e846dd76 Perl code
lxue
parents:
diff changeset
881 for (@psl_array)
a9c5e846dd76 Perl code
lxue
parents:
diff changeset
882 {
a9c5e846dd76 Perl code
lxue
parents:
diff changeset
883 my @lineArr = @{$_};
a9c5e846dd76 Perl code
lxue
parents:
diff changeset
884 $psl_line_num++;
a9c5e846dd76 Perl code
lxue
parents:
diff changeset
885 my $read_target = $lineArr[13];
a9c5e846dd76 Perl code
lxue
parents:
diff changeset
886 my $query = $lineArr[9];
a9c5e846dd76 Perl code
lxue
parents:
diff changeset
887 for(1..8){
a9c5e846dd76 Perl code
lxue
parents:
diff changeset
888 shift @lineArr;
a9c5e846dd76 Perl code
lxue
parents:
diff changeset
889 }
a9c5e846dd76 Perl code
lxue
parents:
diff changeset
890
a9c5e846dd76 Perl code
lxue
parents:
diff changeset
891 if(not exists $psl_hash{$read_target}{$query}){ ## all alignment
a9c5e846dd76 Perl code
lxue
parents:
diff changeset
892 #if(not exists $psl_hash{$read_target}){ ## only keep the best hit
a9c5e846dd76 Perl code
lxue
parents:
diff changeset
893 $psl_hash{$read_target}{$query} = \@lineArr;
a9c5e846dd76 Perl code
lxue
parents:
diff changeset
894 }
a9c5e846dd76 Perl code
lxue
parents:
diff changeset
895 }# while array
a9c5e846dd76 Perl code
lxue
parents:
diff changeset
896 #print $psl_line_num." total psl lines\n";
a9c5e846dd76 Perl code
lxue
parents:
diff changeset
897 return (\%psl_hash);
a9c5e846dd76 Perl code
lxue
parents:
diff changeset
898 }
a9c5e846dd76 Perl code
lxue
parents:
diff changeset
899
a9c5e846dd76 Perl code
lxue
parents:
diff changeset
900
a9c5e846dd76 Perl code
lxue
parents:
diff changeset
901
a9c5e846dd76 Perl code
lxue
parents:
diff changeset
902
a9c5e846dd76 Perl code
lxue
parents:
diff changeset
903 sub fasta2fasta {
a9c5e846dd76 Perl code
lxue
parents:
diff changeset
904 my $input_file = shift @_;
a9c5e846dd76 Perl code
lxue
parents:
diff changeset
905 my $out_file = shift @_;
a9c5e846dd76 Perl code
lxue
parents:
diff changeset
906
a9c5e846dd76 Perl code
lxue
parents:
diff changeset
907 open IN,"<$input_file " or die "File $input_file not found error here\n";
a9c5e846dd76 Perl code
lxue
parents:
diff changeset
908 open TGT,">$out_file" or die "File $out_file not found error here\n";
a9c5e846dd76 Perl code
lxue
parents:
diff changeset
909 my $c = 0;
a9c5e846dd76 Perl code
lxue
parents:
diff changeset
910 my @line = ();
a9c5e846dd76 Perl code
lxue
parents:
diff changeset
911 my $id ='';
a9c5e846dd76 Perl code
lxue
parents:
diff changeset
912 my $seq = '';
a9c5e846dd76 Perl code
lxue
parents:
diff changeset
913 my $processed = 0;
a9c5e846dd76 Perl code
lxue
parents:
diff changeset
914
a9c5e846dd76 Perl code
lxue
parents:
diff changeset
915 while(<IN>){
a9c5e846dd76 Perl code
lxue
parents:
diff changeset
916 chomp;
a9c5e846dd76 Perl code
lxue
parents:
diff changeset
917 if(m/^\>/){
a9c5e846dd76 Perl code
lxue
parents:
diff changeset
918 $processed ++;
a9c5e846dd76 Perl code
lxue
parents:
diff changeset
919 if($processed == 1){
a9c5e846dd76 Perl code
lxue
parents:
diff changeset
920 my @seq_id = split(/\s+/, $_);
a9c5e846dd76 Perl code
lxue
parents:
diff changeset
921 my $seq_id = $seq_id[0];
a9c5e846dd76 Perl code
lxue
parents:
diff changeset
922 $seq_id =~ s/\_//g;
a9c5e846dd76 Perl code
lxue
parents:
diff changeset
923 $seq_id = $seq_id.'_1';
a9c5e846dd76 Perl code
lxue
parents:
diff changeset
924 print TGT '>'.$seq_id,"\n";
a9c5e846dd76 Perl code
lxue
parents:
diff changeset
925 }
a9c5e846dd76 Perl code
lxue
parents:
diff changeset
926 else{
a9c5e846dd76 Perl code
lxue
parents:
diff changeset
927 $seq =~ s/\s|\n|\r//g;
a9c5e846dd76 Perl code
lxue
parents:
diff changeset
928 print TGT $seq,"\n";
a9c5e846dd76 Perl code
lxue
parents:
diff changeset
929 $seq = '';
a9c5e846dd76 Perl code
lxue
parents:
diff changeset
930 my @seq_id = split(/\s+/, $_);
a9c5e846dd76 Perl code
lxue
parents:
diff changeset
931 my $seq_id = $seq_id[0];
a9c5e846dd76 Perl code
lxue
parents:
diff changeset
932 $seq_id =~ s/\_//g;
a9c5e846dd76 Perl code
lxue
parents:
diff changeset
933 $seq_id = $seq_id.'_1';
a9c5e846dd76 Perl code
lxue
parents:
diff changeset
934 print TGT '>'.$seq_id,"\n";
a9c5e846dd76 Perl code
lxue
parents:
diff changeset
935 }
a9c5e846dd76 Perl code
lxue
parents:
diff changeset
936
a9c5e846dd76 Perl code
lxue
parents:
diff changeset
937 }
a9c5e846dd76 Perl code
lxue
parents:
diff changeset
938 else{
a9c5e846dd76 Perl code
lxue
parents:
diff changeset
939 $seq = $seq.$_;
a9c5e846dd76 Perl code
lxue
parents:
diff changeset
940 }
a9c5e846dd76 Perl code
lxue
parents:
diff changeset
941 }
a9c5e846dd76 Perl code
lxue
parents:
diff changeset
942 # last records
a9c5e846dd76 Perl code
lxue
parents:
diff changeset
943 $seq =~ s/\s|\n|\r//g;
a9c5e846dd76 Perl code
lxue
parents:
diff changeset
944 print TGT $seq,"\n";
a9c5e846dd76 Perl code
lxue
parents:
diff changeset
945
a9c5e846dd76 Perl code
lxue
parents:
diff changeset
946 close IN;
a9c5e846dd76 Perl code
lxue
parents:
diff changeset
947 close TGT;
a9c5e846dd76 Perl code
lxue
parents:
diff changeset
948 return $processed;
a9c5e846dd76 Perl code
lxue
parents:
diff changeset
949 }
a9c5e846dd76 Perl code
lxue
parents:
diff changeset
950
a9c5e846dd76 Perl code
lxue
parents:
diff changeset
951
a9c5e846dd76 Perl code
lxue
parents:
diff changeset
952
a9c5e846dd76 Perl code
lxue
parents:
diff changeset
953 sub fastq2fasta {
a9c5e846dd76 Perl code
lxue
parents:
diff changeset
954 my $input_file = shift @_;
a9c5e846dd76 Perl code
lxue
parents:
diff changeset
955 my $out_file = shift @_;
a9c5e846dd76 Perl code
lxue
parents:
diff changeset
956
a9c5e846dd76 Perl code
lxue
parents:
diff changeset
957 open IN,"<$input_file " or die "File $input_file not found error here\n";
a9c5e846dd76 Perl code
lxue
parents:
diff changeset
958 open TGT,">$out_file" or die "File $out_file not found error here\n";
a9c5e846dd76 Perl code
lxue
parents:
diff changeset
959 my $c = 0;
a9c5e846dd76 Perl code
lxue
parents:
diff changeset
960 my @line = ();
a9c5e846dd76 Perl code
lxue
parents:
diff changeset
961 my $id ='';
a9c5e846dd76 Perl code
lxue
parents:
diff changeset
962 my $seq = '';
a9c5e846dd76 Perl code
lxue
parents:
diff changeset
963 my $processed = 0;
a9c5e846dd76 Perl code
lxue
parents:
diff changeset
964 my %read_hash = ();
a9c5e846dd76 Perl code
lxue
parents:
diff changeset
965
a9c5e846dd76 Perl code
lxue
parents:
diff changeset
966 ##############################################################################################
a9c5e846dd76 Perl code
lxue
parents:
diff changeset
967
a9c5e846dd76 Perl code
lxue
parents:
diff changeset
968 while(<IN>){
a9c5e846dd76 Perl code
lxue
parents:
diff changeset
969 chomp;
a9c5e846dd76 Perl code
lxue
parents:
diff changeset
970 $c++;
a9c5e846dd76 Perl code
lxue
parents:
diff changeset
971 if($c == 1){
a9c5e846dd76 Perl code
lxue
parents:
diff changeset
972 $processed++;
a9c5e846dd76 Perl code
lxue
parents:
diff changeset
973 # print STDERR "$processed reads processed\r";
a9c5e846dd76 Perl code
lxue
parents:
diff changeset
974 @line = split();
a9c5e846dd76 Perl code
lxue
parents:
diff changeset
975 $id = $line[0];
a9c5e846dd76 Perl code
lxue
parents:
diff changeset
976 $id =~ s/\@//;
a9c5e846dd76 Perl code
lxue
parents:
diff changeset
977 }elsif($c == 2){
a9c5e846dd76 Perl code
lxue
parents:
diff changeset
978 $seq = $_;
a9c5e846dd76 Perl code
lxue
parents:
diff changeset
979 }elsif($c == 4){
a9c5e846dd76 Perl code
lxue
parents:
diff changeset
980 $c = 0;
a9c5e846dd76 Perl code
lxue
parents:
diff changeset
981 #print TGT ">$id\n$seq\n";
a9c5e846dd76 Perl code
lxue
parents:
diff changeset
982 # print TGT ">seq_$processed\n$seq\n";
a9c5e846dd76 Perl code
lxue
parents:
diff changeset
983 $read_hash{$seq}{$id} = 1;
a9c5e846dd76 Perl code
lxue
parents:
diff changeset
984 $id ='';
a9c5e846dd76 Perl code
lxue
parents:
diff changeset
985 @line =();
a9c5e846dd76 Perl code
lxue
parents:
diff changeset
986 $seq ='';
a9c5e846dd76 Perl code
lxue
parents:
diff changeset
987 }else{}
a9c5e846dd76 Perl code
lxue
parents:
diff changeset
988 }
a9c5e846dd76 Perl code
lxue
parents:
diff changeset
989
a9c5e846dd76 Perl code
lxue
parents:
diff changeset
990 ## write TGT
a9c5e846dd76 Perl code
lxue
parents:
diff changeset
991 my $count = 0;
a9c5e846dd76 Perl code
lxue
parents:
diff changeset
992 for my $seq_out (sort keys %read_hash){
a9c5e846dd76 Perl code
lxue
parents:
diff changeset
993 $count++;
a9c5e846dd76 Perl code
lxue
parents:
diff changeset
994 my @hits = keys %{$read_hash{$seq_out}};
a9c5e846dd76 Perl code
lxue
parents:
diff changeset
995 my $num = @hits+0;
a9c5e846dd76 Perl code
lxue
parents:
diff changeset
996 my $id_out = "R".$count.'_'.$num;
a9c5e846dd76 Perl code
lxue
parents:
diff changeset
997 print TGT ">$id_out\n", $seq_out ,"\n";
a9c5e846dd76 Perl code
lxue
parents:
diff changeset
998 }
a9c5e846dd76 Perl code
lxue
parents:
diff changeset
999
a9c5e846dd76 Perl code
lxue
parents:
diff changeset
1000 close IN;
a9c5e846dd76 Perl code
lxue
parents:
diff changeset
1001 close TGT;
a9c5e846dd76 Perl code
lxue
parents:
diff changeset
1002 return $processed;
a9c5e846dd76 Perl code
lxue
parents:
diff changeset
1003 }
a9c5e846dd76 Perl code
lxue
parents:
diff changeset
1004
a9c5e846dd76 Perl code
lxue
parents:
diff changeset
1005
a9c5e846dd76 Perl code
lxue
parents:
diff changeset
1006 sub check_read_num{
a9c5e846dd76 Perl code
lxue
parents:
diff changeset
1007 my $file_in = shift @_;
a9c5e846dd76 Perl code
lxue
parents:
diff changeset
1008 # check read number
a9c5e846dd76 Perl code
lxue
parents:
diff changeset
1009
a9c5e846dd76 Perl code
lxue
parents:
diff changeset
1010 open FAS_in,"$file_in" || die "can not read fasta file\n";
a9c5e846dd76 Perl code
lxue
parents:
diff changeset
1011 my $seq_num = 0;
a9c5e846dd76 Perl code
lxue
parents:
diff changeset
1012 while (<FAS_in>){
a9c5e846dd76 Perl code
lxue
parents:
diff changeset
1013 if( m/^\>/){
a9c5e846dd76 Perl code
lxue
parents:
diff changeset
1014 $seq_num ++;
a9c5e846dd76 Perl code
lxue
parents:
diff changeset
1015 }
a9c5e846dd76 Perl code
lxue
parents:
diff changeset
1016 if( $seq_num>1){
a9c5e846dd76 Perl code
lxue
parents:
diff changeset
1017 last;
a9c5e846dd76 Perl code
lxue
parents:
diff changeset
1018 }
a9c5e846dd76 Perl code
lxue
parents:
diff changeset
1019 }
a9c5e846dd76 Perl code
lxue
parents:
diff changeset
1020
a9c5e846dd76 Perl code
lxue
parents:
diff changeset
1021 return $seq_num;
a9c5e846dd76 Perl code
lxue
parents:
diff changeset
1022
a9c5e846dd76 Perl code
lxue
parents:
diff changeset
1023 }
a9c5e846dd76 Perl code
lxue
parents:
diff changeset
1024
a9c5e846dd76 Perl code
lxue
parents:
diff changeset
1025
a9c5e846dd76 Perl code
lxue
parents:
diff changeset
1026
a9c5e846dd76 Perl code
lxue
parents:
diff changeset
1027 sub check_file_type {
a9c5e846dd76 Perl code
lxue
parents:
diff changeset
1028 my $file_in = shift @_;
a9c5e846dd76 Perl code
lxue
parents:
diff changeset
1029 my $out = 'no';
a9c5e846dd76 Perl code
lxue
parents:
diff changeset
1030
a9c5e846dd76 Perl code
lxue
parents:
diff changeset
1031 open FAS_in, "$file_in" || die "can not read fasta file\n";
a9c5e846dd76 Perl code
lxue
parents:
diff changeset
1032
a9c5e846dd76 Perl code
lxue
parents:
diff changeset
1033 while (my $line = <FAS_in>){
a9c5e846dd76 Perl code
lxue
parents:
diff changeset
1034 if($line =~ m/\w/){
a9c5e846dd76 Perl code
lxue
parents:
diff changeset
1035 if($line =~ m/^\>/){
a9c5e846dd76 Perl code
lxue
parents:
diff changeset
1036 $out = 'fa';
a9c5e846dd76 Perl code
lxue
parents:
diff changeset
1037 last;
a9c5e846dd76 Perl code
lxue
parents:
diff changeset
1038 }
a9c5e846dd76 Perl code
lxue
parents:
diff changeset
1039 if($line =~ m/^\@/){
a9c5e846dd76 Perl code
lxue
parents:
diff changeset
1040 $out = 'fq';
a9c5e846dd76 Perl code
lxue
parents:
diff changeset
1041 last;
a9c5e846dd76 Perl code
lxue
parents:
diff changeset
1042 }
a9c5e846dd76 Perl code
lxue
parents:
diff changeset
1043 }
a9c5e846dd76 Perl code
lxue
parents:
diff changeset
1044 }
a9c5e846dd76 Perl code
lxue
parents:
diff changeset
1045
a9c5e846dd76 Perl code
lxue
parents:
diff changeset
1046 return $out;
a9c5e846dd76 Perl code
lxue
parents:
diff changeset
1047 }
a9c5e846dd76 Perl code
lxue
parents:
diff changeset
1048
a9c5e846dd76 Perl code
lxue
parents:
diff changeset
1049
a9c5e846dd76 Perl code
lxue
parents:
diff changeset
1050 sub rev_com {
a9c5e846dd76 Perl code
lxue
parents:
diff changeset
1051 my $seq = shift @_;
a9c5e846dd76 Perl code
lxue
parents:
diff changeset
1052 $seq = reverse($seq);
a9c5e846dd76 Perl code
lxue
parents:
diff changeset
1053 $seq =~ tr/ACGT/TGCA/;
a9c5e846dd76 Perl code
lxue
parents:
diff changeset
1054 return($seq);
a9c5e846dd76 Perl code
lxue
parents:
diff changeset
1055 }
a9c5e846dd76 Perl code
lxue
parents:
diff changeset
1056
a9c5e846dd76 Perl code
lxue
parents:
diff changeset
1057